author    Mike Pall <mike>  2011-02-02 02:29:37 +0100
committer Mike Pall <mike>  2011-02-02 02:29:37 +0100
commit    b613216efc7447dae645d8834e4d6f3185cd1bcc (patch)
tree      0859fed377f00ebeada70ba45d02496b7fb4a249 /src
parent    c539c0cac8f668e66a5ce9e5fd645cb45e3c5063 (diff)
Add SPLIT pass to split 64 bit IR instructions for 32 bit CPUs.
Add generic HIOP instruction for extra backend functionality.
Add support for HIOP to x86 backend.
Use POWI for 64 bit integer x^k, too.
POWI is lowered to a call by SPLIT or the x64 backend.
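In plain terms: the pass rewrites every 64 bit IR instruction into a 32 bit loword instruction followed by a HIOP instruction for the hiword, and the backend then emits the matching machine-instruction pair. The standalone C sketch below is not part of the patch; it only illustrates what the lowered 64 bit add computes. The actual IR-level transform is in lj_opt_split.c and the ADD/ADC emission in asm_hiop further down.

#include <stdint.h>
#include <stdio.h>

/* Illustration only (not LuaJIT source): the loword halves are added with
** a plain 32 bit ADD; the hiword op (HIOP) adds the hiword halves plus the
** carry out of the loword, which the x86 backend emits as ADD/ADC.
*/
static void add64_split(uint32_t alo, uint32_t ahi, uint32_t blo, uint32_t bhi,
                        uint32_t *rlo, uint32_t *rhi)
{
  uint32_t lo = alo + blo;        /* Loword op (IR_ADD). */
  uint32_t carry = (lo < alo);    /* Carry out of the loword. */
  *rlo = lo;
  *rhi = ahi + bhi + carry;       /* Hiword op (IR_HIOP -> ADC). */
}

int main(void)
{
  uint32_t lo, hi;
  add64_split(0xffffffffu, 0x00000001u, 0x00000001u, 0x00000002u, &lo, &hi);
  printf("0x%08x%08x\n", hi, lo);  /* 0x1ffffffff + 0x200000001 = 0x400000000 */
  return 0;
}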
Diffstat (limited to 'src')
-rw-r--r--  src/Makefile        |   2
-rw-r--r--  src/Makefile.dep    |  15
-rw-r--r--  src/lj_asm.c        | 444
-rw-r--r--  src/lj_carith.c     |   8
-rw-r--r--  src/lj_carith.h     |   3
-rw-r--r--  src/lj_crecord.c    |  20
-rw-r--r--  src/lj_ir.h         |  21
-rw-r--r--  src/lj_iropt.h      |   6
-rw-r--r--  src/lj_jit.h        |  12
-rw-r--r--  src/lj_opt_fold.c   |  25
-rw-r--r--  src/lj_opt_split.c  | 343
-rw-r--r--  src/lj_target_x86.h |   9
-rw-r--r--  src/lj_trace.c      |   2
-rw-r--r--  src/ljamalg.c       |   1
14 files changed, 795 insertions(+), 116 deletions(-)
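One detail worth calling out before the per-file diffs: asm_conv below gains a u32 to FP conversion for x86, which has no direct SSE instruction. The patch merges the u32 into the mantissa of the constant 2^52+2^51 and then subtracts that bias again. The following standalone C sketch (not LuaJIT code) shows why this produces the exact result:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Bias trick for u32 -> double without a native instruction (sketch only).
** 0x4338000000000000 is the double 2^52+2^51; OR-ing a u32 into its low
** 32 mantissa bits yields exactly (2^52+2^51)+u, so subtracting the bias
** recovers u. This mirrors the MOVD/XORPS/SUBSD sequence in asm_conv.
*/
static double u32_to_double_bias(uint32_t u)
{
  uint64_t bits = 0x4338000000000000ull | (uint64_t)u;
  double d;
  memcpy(&d, &bits, sizeof(d));
  return d - 6755399441055744.0;  /* Subtract 2^52+2^51. */
}

int main(void)
{
  uint32_t tests[] = { 0u, 1u, 0x7fffffffu, 0x80000000u, 0xffffffffu };
  int i;
  for (i = 0; i < 5; i++)
    printf("%u -> %.1f (cast: %.1f)\n",
           tests[i], u32_to_double_bias(tests[i]), (double)tests[i]);
  return 0;
}

The reverse direction in the patch (FP to u32) uses the analogous 2^31 offset, and the 64 bit integer to/from FP cases fall back to x87 code in asm_conv_fp_int64 and asm_conv_int64_fp.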
diff --git a/src/Makefile b/src/Makefile
index a2be1a18..0150b049 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -331,7 +331,7 @@ LJCORE_O= lj_gc.o lj_err.o lj_char.o lj_bc.o lj_obj.o \
331 lj_state.o lj_dispatch.o lj_vmevent.o lj_api.o \ 331 lj_state.o lj_dispatch.o lj_vmevent.o lj_api.o \
332 lj_lex.o lj_parse.o \ 332 lj_lex.o lj_parse.o \
333 lj_ir.o lj_opt_mem.o lj_opt_fold.o lj_opt_narrow.o \ 333 lj_ir.o lj_opt_mem.o lj_opt_fold.o lj_opt_narrow.o \
334 lj_opt_dce.o lj_opt_loop.o \ 334 lj_opt_dce.o lj_opt_loop.o lj_opt_split.o \
335 lj_mcode.o lj_snap.o lj_record.o lj_crecord.o lj_ffrecord.o \ 335 lj_mcode.o lj_snap.o lj_record.o lj_crecord.o lj_ffrecord.o \
336 lj_asm.o lj_trace.o lj_gdbjit.o \ 336 lj_asm.o lj_trace.o lj_gdbjit.o \
337 lj_ctype.o lj_cdata.o lj_cconv.o lj_ccall.o lj_carith.o lj_clib.o \ 337 lj_ctype.o lj_cdata.o lj_cconv.o lj_ccall.o lj_carith.o lj_clib.o \
diff --git a/src/Makefile.dep b/src/Makefile.dep
index 3d0c4239..1534ac27 100644
--- a/src/Makefile.dep
+++ b/src/Makefile.dep
@@ -128,6 +128,8 @@ lj_opt_mem.o: lj_opt_mem.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
128lj_opt_narrow.o: lj_opt_narrow.c lj_obj.h lua.h luaconf.h lj_def.h \ 128lj_opt_narrow.o: lj_opt_narrow.c lj_obj.h lua.h luaconf.h lj_def.h \
129 lj_arch.h lj_str.h lj_bc.h lj_ir.h lj_jit.h lj_iropt.h lj_trace.h \ 129 lj_arch.h lj_str.h lj_bc.h lj_ir.h lj_jit.h lj_iropt.h lj_trace.h \
130 lj_dispatch.h lj_traceerr.h 130 lj_dispatch.h lj_traceerr.h
131lj_opt_split.o: lj_opt_split.c lj_obj.h lua.h luaconf.h lj_def.h \
132 lj_arch.h
131lj_parse.o: lj_parse.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \ 133lj_parse.o: lj_parse.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \
132 lj_gc.h lj_err.h lj_errmsg.h lj_str.h lj_tab.h lj_func.h lj_state.h \ 134 lj_gc.h lj_err.h lj_errmsg.h lj_str.h lj_tab.h lj_func.h lj_state.h \
133 lj_bc.h lj_ctype.h lj_lex.h lj_parse.h lj_vm.h lj_vmevent.h 135 lj_bc.h lj_ctype.h lj_lex.h lj_parse.h lj_vm.h lj_vmevent.h
@@ -167,10 +169,11 @@ ljamalg.o: ljamalg.c lua.h luaconf.h lauxlib.h lj_gc.c lj_obj.h lj_def.h \
167 lj_cconv.h lj_cconv.c lj_ccall.c lj_ccall.h lj_carith.c lj_carith.h \ 169 lj_cconv.h lj_cconv.c lj_ccall.c lj_ccall.h lj_carith.c lj_carith.h \
168 lj_clib.c lj_clib.h lj_cparse.c lj_cparse.h lj_lib.c lj_lib.h lj_ir.c \ 170 lj_clib.c lj_clib.h lj_cparse.c lj_cparse.h lj_lib.c lj_lib.h lj_ir.c \
169 lj_iropt.h lj_opt_mem.c lj_opt_fold.c lj_folddef.h lj_opt_narrow.c \ 171 lj_iropt.h lj_opt_mem.c lj_opt_fold.c lj_folddef.h lj_opt_narrow.c \
170 lj_opt_dce.c lj_opt_loop.c lj_snap.h lj_mcode.c lj_mcode.h lj_snap.c \ 172 lj_opt_dce.c lj_opt_loop.c lj_snap.h lj_opt_split.c lj_mcode.c \
171 lj_target.h lj_target_*.h lj_record.c lj_record.h lj_ffrecord.h \ 173 lj_mcode.h lj_snap.c lj_target.h lj_target_*.h lj_record.c lj_record.h \
172 lj_crecord.c lj_crecord.h lj_ffrecord.c lj_recdef.h lj_asm.c lj_asm.h \ 174 lj_ffrecord.h lj_crecord.c lj_crecord.h lj_ffrecord.c lj_recdef.h \
173 lj_trace.c lj_gdbjit.h lj_gdbjit.c lj_alloc.c lib_aux.c lib_base.c \ 175 lj_asm.c lj_asm.h lj_trace.c lj_gdbjit.h lj_gdbjit.c lj_alloc.c \
174 lj_libdef.h lib_math.c lib_string.c lib_table.c lib_io.c lib_os.c \ 176 lib_aux.c lib_base.c lj_libdef.h lib_math.c lib_string.c lib_table.c \
175 lib_package.c lib_debug.c lib_bit.c lib_jit.c lib_ffi.c lib_init.c 177 lib_io.c lib_os.c lib_package.c lib_debug.c lib_bit.c lib_jit.c \
178 lib_ffi.c lib_init.c
176luajit.o: luajit.c lua.h luaconf.h lauxlib.h lualib.h luajit.h lj_arch.h 179luajit.o: luajit.c lua.h luaconf.h lauxlib.h lualib.h luajit.h lj_arch.h
diff --git a/src/lj_asm.c b/src/lj_asm.c
index cc2ae597..441700d4 100644
--- a/src/lj_asm.c
+++ b/src/lj_asm.c
@@ -347,6 +347,20 @@ static void emit_addptr(ASMState *as, Reg r, int32_t ofs)
347 } 347 }
348} 348}
349 349
350/* op rm/mrm, i */
351static void emit_gmrmi(ASMState *as, x86Group xg, Reg rb, int32_t i)
352{
353 x86Op xo;
354 if (checki8(i)) {
355 emit_i8(as, i);
356 xo = XG_TOXOi8(xg);
357 } else {
358 emit_i32(as, i);
359 xo = XG_TOXOi(xg);
360 }
361 emit_mrm(as, xo, (Reg)(xg & 7) | (rb & REX_64), (rb & ~REX_64));
362}
363
350/* -- Emit moves ---------------------------------------------------------- */ 364/* -- Emit moves ---------------------------------------------------------- */
351 365
352/* mov [base+ofs], i */ 366/* mov [base+ofs], i */
@@ -371,7 +385,10 @@ static void emit_movmroi(ASMState *as, Reg base, int32_t ofs, int32_t i)
371/* mov r, i / xor r, r */ 385/* mov r, i / xor r, r */
372static void emit_loadi(ASMState *as, Reg r, int32_t i) 386static void emit_loadi(ASMState *as, Reg r, int32_t i)
373{ 387{
374 if (i == 0) { 388 /* XOR r,r is shorter, but modifies the flags. This is bad for HIOP. */
389 if (i == 0 && !(LJ_32 && (IR(as->curins)->o == IR_HIOP ||
390 (as->curins+1 < as->T->nins &&
391 IR(as->curins+1)->o == IR_HIOP)))) {
375 emit_rr(as, XO_ARITH(XOg_XOR), r, r); 392 emit_rr(as, XO_ARITH(XOg_XOR), r, r);
376 } else { 393 } else {
377 MCode *p = as->mcp; 394 MCode *p = as->mcp;
@@ -422,6 +439,19 @@ static void emit_loadn(ASMState *as, Reg r, cTValue *tv)
422/* Label for short jumps. */ 439/* Label for short jumps. */
423typedef MCode *MCLabel; 440typedef MCode *MCLabel;
424 441
442#if LJ_32 && LJ_HASFFI
443/* jmp short target */
444static void emit_sjmp(ASMState *as, MCLabel target)
445{
446 MCode *p = as->mcp;
447 ptrdiff_t delta = target - p;
448 lua_assert(delta == (int8_t)delta);
449 p[-1] = (MCode)(int8_t)delta;
450 p[-2] = XI_JMPs;
451 as->mcp = p - 2;
452}
453#endif
454
425/* jcc short target */ 455/* jcc short target */
426static void emit_sjcc(ASMState *as, int cc, MCLabel target) 456static void emit_sjcc(ASMState *as, int cc, MCLabel target)
427{ 457{
@@ -630,7 +660,7 @@ static Reg ra_rematk(ASMState *as, IRIns *ir)
630 } else if (ir->o == IR_KPRI) { /* REF_NIL stores ASMREF_L register. */ 660 } else if (ir->o == IR_KPRI) { /* REF_NIL stores ASMREF_L register. */
631 lua_assert(irt_isnil(ir->t)); 661 lua_assert(irt_isnil(ir->t));
632 emit_getgl(as, r, jit_L); 662 emit_getgl(as, r, jit_L);
633#if LJ_64 /* NYI: 32 bit register pairs. */ 663#if LJ_64
634 } else if (ir->o == IR_KINT64) { 664 } else if (ir->o == IR_KINT64) {
635 emit_loadu64(as, r, ir_kint64(ir)->u64); 665 emit_loadu64(as, r, ir_kint64(ir)->u64);
636#endif 666#endif
@@ -681,8 +711,7 @@ static Reg ra_releasetmp(ASMState *as, IRRef ref)
681#if LJ_64 711#if LJ_64
682#define REX_64IR(ir, r) ((r) + (irt_is64((ir)->t) ? REX_64 : 0)) 712#define REX_64IR(ir, r) ((r) + (irt_is64((ir)->t) ? REX_64 : 0))
683#else 713#else
684/* NYI: 32 bit register pairs. */ 714#define REX_64IR(ir, r) (r)
685#define REX_64IR(ir, r) check_exp(!irt_is64((ir)->t), (r))
686#endif 715#endif
687 716
688/* Generic move between two regs. */ 717/* Generic move between two regs. */
@@ -939,7 +968,7 @@ static void ra_left(ASMState *as, Reg dest, IRRef lref)
939 emit_loadn(as, dest, tv); 968 emit_loadn(as, dest, tv);
940 return; 969 return;
941 } 970 }
942#if LJ_64 /* NYI: 32 bit register pairs. */ 971#if LJ_64
943 } else if (ir->o == IR_KINT64) { 972 } else if (ir->o == IR_KINT64) {
944 emit_loadu64(as, dest, ir_kint64(ir)->u64); 973 emit_loadu64(as, dest, ir_kint64(ir)->u64);
945 return; 974 return;
@@ -1463,7 +1492,7 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
1463#endif 1492#endif
1464 if (r) { /* Argument is in a register. */ 1493 if (r) { /* Argument is in a register. */
1465 if (r < RID_MAX_GPR && ref < ASMREF_TMP1) { 1494 if (r < RID_MAX_GPR && ref < ASMREF_TMP1) {
1466#if LJ_64 /* NYI: 32 bit register pairs. */ 1495#if LJ_64
1467 if (ir->o == IR_KINT64) 1496 if (ir->o == IR_KINT64)
1468 emit_loadu64(as, r, ir_kint64(ir)->u64); 1497 emit_loadu64(as, r, ir_kint64(ir)->u64);
1469 else 1498 else
@@ -1519,7 +1548,7 @@ static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci)
1519 ra_evictset(as, drop); /* Evictions must be performed first. */ 1548 ra_evictset(as, drop); /* Evictions must be performed first. */
1520 if (ra_used(ir)) { 1549 if (ra_used(ir)) {
1521 if (irt_isfp(ir->t)) { 1550 if (irt_isfp(ir->t)) {
1522 int32_t ofs = sps_scale(ir->s); /* Use spill slot or temp slots. */ 1551 int32_t ofs = sps_scale(ir->s); /* Use spill slot or temp slots. */
1523#if LJ_64 1552#if LJ_64
1524 if ((ci->flags & CCI_CASTU64)) { 1553 if ((ci->flags & CCI_CASTU64)) {
1525 Reg dest = ir->r; 1554 Reg dest = ir->r;
@@ -1632,19 +1661,24 @@ static void asm_conv(ASMState *as, IRIns *ir)
1632 int stfp = (st == IRT_NUM || st == IRT_FLOAT); 1661 int stfp = (st == IRT_NUM || st == IRT_FLOAT);
1633 IRRef lref = ir->op1; 1662 IRRef lref = ir->op1;
1634 lua_assert(irt_type(ir->t) != st); 1663 lua_assert(irt_type(ir->t) != st);
1664 lua_assert(!(LJ_32 && (irt_isint64(ir->t) || st64))); /* Handled by SPLIT. */
1635 if (irt_isfp(ir->t)) { 1665 if (irt_isfp(ir->t)) {
1636 Reg dest = ra_dest(as, ir, RSET_FPR); 1666 Reg dest = ra_dest(as, ir, RSET_FPR);
1637 if (stfp) { /* FP to FP conversion. */ 1667 if (stfp) { /* FP to FP conversion. */
1638 Reg left = asm_fuseload(as, lref, RSET_FPR); 1668 Reg left = asm_fuseload(as, lref, RSET_FPR);
1639 emit_mrm(as, st == IRT_NUM ? XO_CVTSD2SS : XO_CVTSS2SD, dest, left); 1669 emit_mrm(as, st == IRT_NUM ? XO_CVTSD2SS : XO_CVTSS2SD, dest, left);
1640 if (left == dest) return; /* Avoid the XO_XORPS. */ 1670 if (left == dest) return; /* Avoid the XO_XORPS. */
1641#if LJ_32 1671 } else if (LJ_32 && st == IRT_U32) { /* U32 to FP conversion on x86. */
1642 } else if (st >= IRT_U32) { 1672 /* number = (2^52+2^51 .. u32) - (2^52+2^51) */
1643 /* NYI: 64 bit integer or uint32_t to number conversion. */ 1673 cTValue *k = lj_ir_k64_find(as->J, U64x(43380000,00000000));
1644 setintV(&as->J->errinfo, ir->o); 1674 Reg bias = ra_scratch(as, rset_exclude(RSET_FPR, dest));
1645 lj_trace_err_info(as->J, LJ_TRERR_NYIIR); 1675 if (irt_isfloat(ir->t))
1676 emit_rr(as, XO_CVTSD2SS, dest, dest);
1677 emit_rr(as, XO_SUBSD, dest, bias); /* Subtract 2^52+2^51 bias. */
1678 emit_rr(as, XO_XORPS, dest, bias); /* Merge bias and integer. */
1679 emit_loadn(as, bias, k);
1680 emit_mrm(as, XO_MOVD, dest, asm_fuseload(as, lref, RSET_GPR));
1646 return; 1681 return;
1647#endif
1648 } else { /* Integer to FP conversion. */ 1682 } else { /* Integer to FP conversion. */
1649 Reg left = (LJ_64 && (st == IRT_U32 || st == IRT_U64)) ? 1683 Reg left = (LJ_64 && (st == IRT_U32 || st == IRT_U64)) ?
1650 ra_alloc1(as, lref, RSET_GPR) : 1684 ra_alloc1(as, lref, RSET_GPR) :
@@ -1663,41 +1697,47 @@ static void asm_conv(ASMState *as, IRIns *ir)
1663 emit_rr(as, XO_XORPS, dest, dest); /* Avoid partial register stall. */ 1697 emit_rr(as, XO_XORPS, dest, dest); /* Avoid partial register stall. */
1664 } else if (stfp) { /* FP to integer conversion. */ 1698 } else if (stfp) { /* FP to integer conversion. */
1665 if (irt_isguard(ir->t)) { 1699 if (irt_isguard(ir->t)) {
1666 lua_assert(!irt_is64(ir->t)); /* No support for checked 64 bit conv. */ 1700 /* Checked conversions are only supported from number to int. */
1701 lua_assert(irt_isint(ir->t) && st == IRT_NUM);
1667 asm_tointg(as, ir, ra_alloc1(as, lref, RSET_FPR)); 1702 asm_tointg(as, ir, ra_alloc1(as, lref, RSET_FPR));
1668#if LJ_32
1669 } else if (irt_isi64(ir->t) || irt_isu64(ir->t) || irt_isu32(ir->t)) {
1670 /* NYI: number to 64 bit integer or uint32_t conversion. */
1671 setintV(&as->J->errinfo, ir->o);
1672 lj_trace_err_info(as->J, LJ_TRERR_NYIIR);
1673#endif
1674 } else { 1703 } else {
1675 Reg dest = ra_dest(as, ir, RSET_GPR); 1704 Reg dest = ra_dest(as, ir, RSET_GPR);
1676 x86Op op = st == IRT_NUM ? 1705 x86Op op = st == IRT_NUM ?
1677 ((ir->op2 & IRCONV_TRUNC) ? XO_CVTTSD2SI : XO_CVTSD2SI) : 1706 ((ir->op2 & IRCONV_TRUNC) ? XO_CVTTSD2SI : XO_CVTSD2SI) :
1678 ((ir->op2 & IRCONV_TRUNC) ? XO_CVTTSS2SI : XO_CVTSS2SI); 1707 ((ir->op2 & IRCONV_TRUNC) ? XO_CVTTSS2SI : XO_CVTSS2SI);
1679 if (LJ_64 && irt_isu64(ir->t)) { 1708 if (LJ_32 && irt_isu32(ir->t)) { /* FP to U32 conversion on x86. */
1680 const void *k = lj_ir_k64_find(as->J, U64x(c3f00000,00000000)); 1709 /* u32 = (int32_t)(number - 2^31) + 2^31 */
1681 MCLabel l_end = emit_label(as); 1710 Reg tmp = ra_noreg(IR(lref)->r) ? ra_alloc1(as, lref, RSET_FPR) :
1682 Reg left = IR(lref)->r; 1711 ra_scratch(as, RSET_FPR);
1712 emit_gri(as, XG_ARITHi(XOg_ADD), dest, (int32_t)0x80000000);
1713 emit_rr(as, op, dest, tmp);
1714 if (st == IRT_NUM)
1715 emit_rma(as, XO_ADDSD, tmp,
1716 lj_ir_k64_find(as->J, U64x(c1e00000,00000000)));
1717 else
1718 emit_rma(as, XO_ADDSS, tmp,
1719 lj_ir_k64_find(as->J, U64x(00000000,cf000000)));
1720 ra_left(as, tmp, lref);
1721 } else if (LJ_64 && irt_isu64(ir->t)) {
1683 /* For inputs in [2^63,2^64-1] add -2^64 and convert again. */ 1722 /* For inputs in [2^63,2^64-1] add -2^64 and convert again. */
1684 if (ra_hasreg(left)) { 1723 Reg tmp = ra_noreg(IR(lref)->r) ? ra_alloc1(as, lref, RSET_FPR) :
1685 Reg tmpn = ra_scratch(as, rset_exclude(RSET_FPR, left)); 1724 ra_scratch(as, RSET_FPR);
1686 emit_rr(as, op, dest|REX_64, tmpn); 1725 MCLabel l_end = emit_label(as);
1687 emit_rr(as, st == IRT_NUM ? XO_ADDSD : XO_ADDSS, tmpn, left); 1726 emit_rr(as, op, dest|REX_64, tmp);
1688 emit_rma(as, st == IRT_NUM ? XMM_MOVRM(as) : XO_MOVSS, tmpn, k); 1727 if (st == IRT_NUM)
1689 } else { 1728 emit_rma(as, XO_ADDSD, tmp,
1690 left = ra_allocref(as, lref, RSET_FPR); 1729 lj_ir_k64_find(as->J, U64x(c3f00000,00000000)));
1691 emit_rr(as, op, dest|REX_64, left); 1730 else
1692 emit_rma(as, st == IRT_NUM ? XO_ADDSD : XO_ADDSS, left, k); 1731 emit_rma(as, XO_ADDSS, tmp,
1693 } 1732 lj_ir_k64_find(as->J, U64x(00000000,df800000)));
1694 emit_sjcc(as, CC_NS, l_end); 1733 emit_sjcc(as, CC_NS, l_end);
1695 emit_rr(as, XO_TEST, dest|REX_64, dest); /* Check if dest < 2^63. */ 1734 emit_rr(as, XO_TEST, dest|REX_64, dest); /* Check if dest < 2^63. */
1696 emit_rr(as, op, dest|REX_64, left); 1735 emit_rr(as, op, dest|REX_64, tmp);
1736 ra_left(as, tmp, lref);
1697 } else { 1737 } else {
1698 Reg left = asm_fuseload(as, lref, RSET_FPR); 1738 Reg left = asm_fuseload(as, lref, RSET_FPR);
1699 if (LJ_64 && irt_isu32(ir->t)) 1739 if (LJ_64 && irt_isu32(ir->t))
1700 emit_rr(as, XO_MOV, dest, dest); /* Zero upper 32 bits. */ 1740 emit_rr(as, XO_MOV, dest, dest); /* Zero hiword. */
1701 emit_mrm(as, op, 1741 emit_mrm(as, op,
1702 dest|((LJ_64 && 1742 dest|((LJ_64 &&
1703 (irt_is64(ir->t) || irt_isu32(ir->t))) ? REX_64 : 0), 1743 (irt_is64(ir->t) || irt_isu32(ir->t))) ? REX_64 : 0),
@@ -1728,12 +1768,10 @@ static void asm_conv(ASMState *as, IRIns *ir)
1728 emit_mrm(as, op, dest, left); 1768 emit_mrm(as, op, dest, left);
1729 } 1769 }
1730 } else { /* 32/64 bit integer conversions. */ 1770 } else { /* 32/64 bit integer conversions. */
1731 if (irt_is64(ir->t)) { 1771 if (LJ_32) { /* Only need to handle 32/32 bit no-op (cast) on x86. */
1732#if LJ_32 1772 Reg dest = ra_dest(as, ir, RSET_GPR);
1733 /* NYI: conversion to 64 bit integers. */ 1773 ra_left(as, dest, lref); /* Do nothing, but may need to move regs. */
1734 setintV(&as->J->errinfo, ir->o); 1774 } else if (irt_is64(ir->t)) {
1735 lj_trace_err_info(as->J, LJ_TRERR_NYIIR);
1736#else
1737 Reg dest = ra_dest(as, ir, RSET_GPR); 1775 Reg dest = ra_dest(as, ir, RSET_GPR);
1738 if (st64 || !(ir->op2 & IRCONV_SEXT)) { 1776 if (st64 || !(ir->op2 & IRCONV_SEXT)) {
1739 /* 64/64 bit no-op (cast) or 32 to 64 bit zero extension. */ 1777 /* 64/64 bit no-op (cast) or 32 to 64 bit zero extension. */
@@ -1742,21 +1780,14 @@ static void asm_conv(ASMState *as, IRIns *ir)
1742 Reg left = asm_fuseload(as, lref, RSET_GPR); 1780 Reg left = asm_fuseload(as, lref, RSET_GPR);
1743 emit_mrm(as, XO_MOVSXd, dest|REX_64, left); 1781 emit_mrm(as, XO_MOVSXd, dest|REX_64, left);
1744 } 1782 }
1745#endif
1746 } else { 1783 } else {
1747 Reg dest = ra_dest(as, ir, RSET_GPR); 1784 Reg dest = ra_dest(as, ir, RSET_GPR);
1748 if (st64) { 1785 if (st64) {
1749#if LJ_32
1750 /* NYI: conversion from 64 bit integers. */
1751 setintV(&as->J->errinfo, ir->o);
1752 lj_trace_err_info(as->J, LJ_TRERR_NYIIR);
1753#else
1754 Reg left = asm_fuseload(as, lref, RSET_GPR); 1786 Reg left = asm_fuseload(as, lref, RSET_GPR);
1755 /* This is either a 32 bit reg/reg mov which zeroes the hi-32 bits 1787 /* This is either a 32 bit reg/reg mov which zeroes the hiword
1756 ** or a load of the lower 32 bits from a 64 bit address. 1788 ** or a load of the loword from a 64 bit address.
1757 */ 1789 */
1758 emit_mrm(as, XO_MOV, dest, left); 1790 emit_mrm(as, XO_MOV, dest, left);
1759#endif
1760 } else { /* 32/32 bit no-op (cast). */ 1791 } else { /* 32/32 bit no-op (cast). */
1761 ra_left(as, dest, lref); /* Do nothing, but may need to move regs. */ 1792 ra_left(as, dest, lref); /* Do nothing, but may need to move regs. */
1762 } 1793 }
@@ -1764,6 +1795,93 @@ static void asm_conv(ASMState *as, IRIns *ir)
1764 } 1795 }
1765} 1796}
1766 1797
1798#if LJ_32 && LJ_HASFFI
1799/* No SSE conversions to/from 64 bit on x86, so resort to ugly x87 code. */
1800
1801/* 64 bit integer to FP conversion in 32 bit mode. */
1802static void asm_conv_fp_int64(ASMState *as, IRIns *ir)
1803{
1804 Reg hi = ra_alloc1(as, ir->op1, RSET_GPR);
1805 Reg lo = ra_alloc1(as, (ir-1)->op1, rset_exclude(RSET_GPR, hi));
1806 int32_t ofs = sps_scale(ir->s); /* Use spill slot or temp slots. */
1807 Reg dest = ir->r;
1808 if (ra_hasreg(dest)) {
1809 ra_free(as, dest);
1810 ra_modified(as, dest);
1811 emit_rmro(as, irt_isnum(ir->t) ? XMM_MOVRM(as) : XO_MOVSS,
1812 dest, RID_ESP, ofs);
1813 }
1814 emit_rmro(as, irt_isnum(ir->t) ? XO_FSTPq : XO_FSTPd,
1815 irt_isnum(ir->t) ? XOg_FSTPq : XOg_FSTPd, RID_ESP, ofs);
1816 if (((ir-1)->op2 & IRCONV_SRCMASK) == IRT_U64) {
1817 /* For inputs in [2^63,2^64-1] add 2^64 to compensate. */
1818 MCLabel l_end = emit_label(as);
1819 emit_rma(as, XO_FADDq, XOg_FADDq,
1820 lj_ir_k64_find(as->J, U64x(43f00000,00000000)));
1821 emit_sjcc(as, CC_NS, l_end);
1822 emit_rr(as, XO_TEST, hi, hi); /* Check if u64 >= 2^63. */
1823 } else {
1824 lua_assert(((ir-1)->op2 & IRCONV_SRCMASK) == IRT_I64);
1825 }
1826 emit_rmro(as, XO_FILDq, XOg_FILDq, RID_ESP, 0);
1827 /* NYI: Avoid narrow-to-wide store-to-load forwarding stall. */
1828 emit_rmro(as, XO_MOVto, hi, RID_ESP, 4);
1829 emit_rmro(as, XO_MOVto, lo, RID_ESP, 0);
1830}
1831
1832/* FP to 64 bit integer conversion in 32 bit mode. */
1833static void asm_conv_int64_fp(ASMState *as, IRIns *ir)
1834{
1835 IRType st = (IRType)((ir-1)->op2 & IRCONV_SRCMASK);
1836 IRType dt = (((ir-1)->op2 & IRCONV_DSTMASK) >> IRCONV_DSH);
1837 Reg lo, hi;
1838 lua_assert(st == IRT_NUM || st == IRT_FLOAT);
1839 lua_assert(dt == IRT_I64 || dt == IRT_U64);
1840 lua_assert(((ir-1)->op2 & IRCONV_TRUNC));
1841 hi = ra_dest(as, ir, RSET_GPR);
1842 lo = ra_dest(as, ir-1, rset_exclude(RSET_GPR, hi));
1843 if (ra_used(ir-1)) emit_rmro(as, XO_MOV, lo, RID_ESP, 0);
1844 /* NYI: Avoid wide-to-narrow store-to-load forwarding stall. */
1845 if (!(as->flags & JIT_F_SSE3)) { /* Set FPU rounding mode to default. */
1846 emit_rmro(as, XO_FLDCW, XOg_FLDCW, RID_ESP, 4);
1847 emit_rmro(as, XO_MOVto, lo, RID_ESP, 4);
1848 emit_gri(as, XG_ARITHi(XOg_AND), lo, 0xf3ff);
1849 }
1850 if (dt == IRT_U64) {
1851 /* For inputs in [2^63,2^64-1] add -2^64 and convert again. */
1852 MCLabel l_pop, l_end = emit_label(as);
1853 emit_x87op(as, XI_FPOP);
1854 l_pop = emit_label(as);
1855 emit_sjmp(as, l_end);
1856 emit_rmro(as, XO_MOV, hi, RID_ESP, 4);
1857 if ((as->flags & JIT_F_SSE3))
1858 emit_rmro(as, XO_FISTTPq, XOg_FISTTPq, RID_ESP, 0);
1859 else
1860 emit_rmro(as, XO_FISTPq, XOg_FISTPq, RID_ESP, 0);
1861 emit_rma(as, XO_FADDq, XOg_FADDq,
1862 lj_ir_k64_find(as->J, U64x(c3f00000,00000000)));
1863 emit_sjcc(as, CC_NS, l_pop);
1864 emit_rr(as, XO_TEST, hi, hi); /* Check if out-of-range (2^63). */
1865 }
1866 emit_rmro(as, XO_MOV, hi, RID_ESP, 4);
1867 if ((as->flags & JIT_F_SSE3)) { /* Truncation is easy with SSE3. */
1868 emit_rmro(as, XO_FISTTPq, XOg_FISTTPq, RID_ESP, 0);
1869 } else { /* Otherwise set FPU rounding mode to truncate before the store. */
1870 emit_rmro(as, XO_FISTPq, XOg_FISTPq, RID_ESP, 0);
1871 emit_rmro(as, XO_FLDCW, XOg_FLDCW, RID_ESP, 0);
1872 emit_rmro(as, XO_MOVtow, lo, RID_ESP, 0);
1873 emit_rmro(as, XO_ARITHw(XOg_OR), lo, RID_ESP, 0);
1874 emit_loadi(as, lo, 0xc00);
1875 emit_rmro(as, XO_FNSTCW, XOg_FNSTCW, RID_ESP, 0);
1876 }
1877 if (dt == IRT_U64)
1878 emit_x87op(as, XI_FDUP);
1879 emit_mrm(as, st == IRT_NUM ? XO_FLDq : XO_FLDd,
1880 st == IRT_NUM ? XOg_FLDq: XOg_FLDd,
1881 asm_fuseload(as, ir->op1, RSET_EMPTY));
1882}
1883#endif
1884
1767static void asm_strto(ASMState *as, IRIns *ir) 1885static void asm_strto(ASMState *as, IRIns *ir)
1768{ 1886{
1769 /* Force a spill slot for the destination register (if any). */ 1887 /* Force a spill slot for the destination register (if any). */
@@ -2644,6 +2762,18 @@ static void asm_powi(ASMState *as, IRIns *ir)
2644 ra_left(as, RID_EAX, ir->op2); 2762 ra_left(as, RID_EAX, ir->op2);
2645} 2763}
2646 2764
2765#if LJ_64 && LJ_HASFFI
2766static void asm_arith64(ASMState *as, IRIns *ir, IRCallID id)
2767{
2768 const CCallInfo *ci = &lj_ir_callinfo[id];
2769 IRRef args[2];
2770 args[0] = ir->op1;
2771 args[1] = ir->op2;
2772 asm_setupresult(as, ir, ci);
2773 asm_gencall(as, ci, args);
2774}
2775#endif
2776
2647/* Find out whether swapping operands might be beneficial. */ 2777/* Find out whether swapping operands might be beneficial. */
2648static int swapops(ASMState *as, IRIns *ir) 2778static int swapops(ASMState *as, IRIns *ir)
2649{ 2779{
@@ -2877,12 +3007,30 @@ static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs)
2877/* -- Comparisons --------------------------------------------------------- */ 3007/* -- Comparisons --------------------------------------------------------- */
2878 3008
2879/* Virtual flags for unordered FP comparisons. */ 3009/* Virtual flags for unordered FP comparisons. */
2880#define VCC_U 0x100 /* Unordered. */ 3010#define VCC_U 0x1000 /* Unordered. */
2881#define VCC_P 0x200 /* Needs extra CC_P branch. */ 3011#define VCC_P 0x2000 /* Needs extra CC_P branch. */
2882#define VCC_S 0x400 /* Swap avoids CC_P branch. */ 3012#define VCC_S 0x4000 /* Swap avoids CC_P branch. */
2883#define VCC_PS (VCC_P|VCC_S) 3013#define VCC_PS (VCC_P|VCC_S)
2884 3014
2885static void asm_comp_(ASMState *as, IRIns *ir, int cc) 3015/* Map of comparisons to flags. ORDER IR. */
3016#define COMPFLAGS(ci, cin, cu, cf) ((ci)+((cu)<<4)+((cin)<<8)+(cf))
3017static const uint16_t asm_compmap[IR_ABC+1] = {
3018 /* signed non-eq unsigned flags */
3019 /* LT */ COMPFLAGS(CC_GE, CC_G, CC_AE, VCC_PS),
3020 /* GE */ COMPFLAGS(CC_L, CC_L, CC_B, 0),
3021 /* LE */ COMPFLAGS(CC_G, CC_G, CC_A, VCC_PS),
3022 /* GT */ COMPFLAGS(CC_LE, CC_L, CC_BE, 0),
3023 /* ULT */ COMPFLAGS(CC_AE, CC_A, CC_AE, VCC_U),
3024 /* UGE */ COMPFLAGS(CC_B, CC_B, CC_B, VCC_U|VCC_PS),
3025 /* ULE */ COMPFLAGS(CC_A, CC_A, CC_A, VCC_U),
3026 /* UGT */ COMPFLAGS(CC_BE, CC_B, CC_BE, VCC_U|VCC_PS),
3027 /* EQ */ COMPFLAGS(CC_NE, CC_NE, CC_NE, VCC_P),
3028 /* NE */ COMPFLAGS(CC_E, CC_E, CC_E, VCC_U|VCC_P),
3029 /* ABC */ COMPFLAGS(CC_BE, CC_B, CC_BE, VCC_U|VCC_PS) /* Same as UGT. */
3030};
3031
3032/* FP and integer comparisons. */
3033static void asm_comp(ASMState *as, IRIns *ir, uint32_t cc)
2886{ 3034{
2887 if (irt_isnum(ir->t)) { 3035 if (irt_isnum(ir->t)) {
2888 IRRef lref = ir->op1; 3036 IRRef lref = ir->op1;
@@ -3008,15 +3156,7 @@ static void asm_comp_(ASMState *as, IRIns *ir, int cc)
3008 if (irl+1 == ir) /* Referencing previous ins? */ 3156 if (irl+1 == ir) /* Referencing previous ins? */
3009 as->testmcp = as->mcp; /* Set flag to drop test r,r if possible. */ 3157 as->testmcp = as->mcp; /* Set flag to drop test r,r if possible. */
3010 } else { 3158 } else {
3011 x86Op xo; 3159 emit_gmrmi(as, XG_ARITHi(XOg_CMP), r64 + left, imm);
3012 if (checki8(imm)) {
3013 emit_i8(as, imm);
3014 xo = XO_ARITHi8;
3015 } else {
3016 emit_i32(as, imm);
3017 xo = XO_ARITHi;
3018 }
3019 emit_mrm(as, xo, r64 + XOg_CMP, left);
3020 } 3160 }
3021 } 3161 }
3022 } else { 3162 } else {
@@ -3028,8 +3168,133 @@ static void asm_comp_(ASMState *as, IRIns *ir, int cc)
3028 } 3168 }
3029} 3169}
3030 3170
3031#define asm_comp(as, ir, ci, cf, cu) \ 3171#if LJ_32 && LJ_HASFFI
3032 asm_comp_(as, ir, (ci)+((cf)<<4)+(cu)) 3172/* 64 bit integer comparisons in 32 bit mode. */
3173static void asm_comp_int64(ASMState *as, IRIns *ir)
3174{
3175 uint32_t cc = asm_compmap[(ir-1)->o];
3176 RegSet allow = RSET_GPR;
3177 Reg lefthi = RID_NONE, leftlo = RID_NONE;
3178 Reg righthi = RID_NONE, rightlo = RID_NONE;
3179 MCLabel l_around;
3180 x86ModRM mrm;
3181
3182 as->curins--; /* Skip loword ins. Avoids failing in noconflict(), too. */
3183
3184 /* Allocate/fuse hiword operands. */
3185 if (irref_isk(ir->op2)) {
3186 lefthi = asm_fuseload(as, ir->op1, allow);
3187 } else {
3188 lefthi = ra_alloc1(as, ir->op1, allow);
3189 righthi = asm_fuseload(as, ir->op2, allow);
3190 if (righthi == RID_MRM) {
3191 if (as->mrm.base != RID_NONE) rset_clear(allow, as->mrm.base);
3192 if (as->mrm.idx != RID_NONE) rset_clear(allow, as->mrm.idx);
3193 } else {
3194 rset_clear(allow, righthi);
3195 }
3196 }
3197 mrm = as->mrm; /* Save state for hiword instruction. */
3198
3199 /* Allocate/fuse loword operands. */
3200 if (irref_isk((ir-1)->op2)) {
3201 leftlo = asm_fuseload(as, (ir-1)->op1, allow);
3202 } else {
3203 leftlo = ra_alloc1(as, (ir-1)->op1, allow);
3204 rightlo = asm_fuseload(as, (ir-1)->op2, allow);
3205 if (rightlo == RID_MRM) {
3206 if (as->mrm.base != RID_NONE) rset_clear(allow, as->mrm.base);
3207 if (as->mrm.idx != RID_NONE) rset_clear(allow, as->mrm.idx);
3208 } else {
3209 rset_clear(allow, rightlo);
3210 }
3211 }
3212
3213 /* All register allocations must be performed _before_ this point. */
3214 l_around = emit_label(as);
3215 as->invmcp = as->testmcp = NULL; /* Cannot use these optimizations. */
3216
3217 /* Loword comparison and branch. */
3218 asm_guardcc(as, cc >> 4); /* Always use unsigned compare for loword. */
3219 if (ra_noreg(rightlo)) {
3220 int32_t imm = IR((ir-1)->op2)->i;
3221 if (imm == 0 && ((cc >> 4) & 0xa) != 0x2 && leftlo != RID_MRM)
3222 emit_rr(as, XO_TEST, leftlo, leftlo);
3223 else
3224 emit_gmrmi(as, XG_ARITHi(XOg_CMP), leftlo, imm);
3225 } else {
3226 emit_mrm(as, XO_CMP, leftlo, rightlo);
3227 }
3228
3229 /* Hiword comparison and branches. */
3230 if ((cc & 15) != CC_NE)
3231 emit_sjcc(as, CC_NE, l_around); /* Hiword unequal: skip loword compare. */
3232 if ((cc & 15) != CC_E)
3233 asm_guardcc(as, cc >> 8); /* Hiword compare without equality check. */
3234 as->mrm = mrm; /* Restore state. */
3235 if (ra_noreg(righthi)) {
3236 int32_t imm = IR(ir->op2)->i;
3237 if (imm == 0 && (cc & 0xa) != 0x2 && lefthi != RID_MRM)
3238 emit_rr(as, XO_TEST, lefthi, lefthi);
3239 else
3240 emit_gmrmi(as, XG_ARITHi(XOg_CMP), lefthi, imm);
3241 } else {
3242 emit_mrm(as, XO_CMP, lefthi, righthi);
3243 }
3244}
3245#endif
3246
3247/* -- Support for 64 bit ops in 32 bit mode ------------------------------- */
3248
3249/* Hiword op of a split 64 bit op. Previous op must be the loword op. */
3250static void asm_hiop(ASMState *as, IRIns *ir)
3251{
3252#if LJ_32 && LJ_HASFFI
3253 /* HIOP is marked as a store because it needs its own DCE logic. */
3254 int uselo = ra_used(ir-1), usehi = ra_used(ir); /* Loword/hiword used? */
3255 if (LJ_UNLIKELY(!(as->flags & JIT_F_OPT_DCE))) uselo = usehi = 1;
3256 if ((ir-1)->o == IR_CONV) { /* Conversions to/from 64 bit. */
3257 if (usehi || uselo) {
3258 if (irt_isfp(ir->t))
3259 asm_conv_fp_int64(as, ir);
3260 else
3261 asm_conv_int64_fp(as, ir);
3262 }
3263 as->curins--; /* Always skip the CONV. */
3264 return;
3265 } else if ((ir-1)->o <= IR_NE) { /* 64 bit integer comparisons. ORDER IR. */
3266 asm_comp_int64(as, ir);
3267 return;
3268 }
3269 if (!usehi) return; /* Skip unused hiword op for all remaining ops. */
3270 switch ((ir-1)->o) {
3271 case IR_ADD:
3272 asm_intarith(as, ir, uselo ? XOg_ADC : XOg_ADD);
3273 break;
3274 case IR_SUB:
3275 asm_intarith(as, ir, uselo ? XOg_SBB : XOg_SUB);
3276 break;
3277 case IR_NEG: {
3278 Reg dest = ra_dest(as, ir, RSET_GPR);
3279 emit_rr(as, XO_GROUP3, XOg_NEG, dest);
3280 if (uselo) {
3281 emit_i8(as, 0);
3282 emit_rr(as, XO_ARITHi8, XOg_ADC, dest);
3283 }
3284 ra_left(as, dest, ir->op1);
3285 break;
3286 }
3287 case IR_CALLN:
3288 ra_destreg(as, ir, RID_RETHI);
3289 if (!uselo)
3290 ra_allocref(as, ir->op1, RID2RSET(RID_RET)); /* Mark call as used. */
3291 break;
3292 default: lua_assert(0); break;
3293 }
3294#else
3295 UNUSED(as); UNUSED(ir); lua_assert(0); /* Unused on x64 or without FFI. */
3296#endif
3297}
3033 3298
3034/* -- Stack handling ------------------------------------------------------ */ 3299/* -- Stack handling ------------------------------------------------------ */
3035 3300
@@ -3682,21 +3947,16 @@ static void asm_ir(ASMState *as, IRIns *ir)
3682 switch ((IROp)ir->o) { 3947 switch ((IROp)ir->o) {
3683 /* Miscellaneous ops. */ 3948 /* Miscellaneous ops. */
3684 case IR_LOOP: asm_loop(as); break; 3949 case IR_LOOP: asm_loop(as); break;
3685 case IR_NOP: break; 3950 case IR_NOP: lua_assert(!ra_used(ir)); break;
3686 case IR_PHI: asm_phi(as, ir); break; 3951 case IR_PHI: asm_phi(as, ir); break;
3952 case IR_HIOP: asm_hiop(as, ir); break;
3687 3953
3688 /* Guarded assertions. */ 3954 /* Guarded assertions. */
3689 case IR_LT: asm_comp(as, ir, CC_GE, CC_AE, VCC_PS); break; 3955 case IR_LT: case IR_GE: case IR_LE: case IR_GT:
3690 case IR_GE: asm_comp(as, ir, CC_L, CC_B, 0); break; 3956 case IR_ULT: case IR_UGE: case IR_ULE: case IR_UGT:
3691 case IR_LE: asm_comp(as, ir, CC_G, CC_A, VCC_PS); break; 3957 case IR_EQ: case IR_NE: case IR_ABC:
3692 case IR_GT: asm_comp(as, ir, CC_LE, CC_BE, 0); break; 3958 asm_comp(as, ir, asm_compmap[ir->o]);
3693 case IR_ULT: asm_comp(as, ir, CC_AE, CC_AE, VCC_U); break; 3959 break;
3694 case IR_UGE: asm_comp(as, ir, CC_B, CC_B, VCC_U|VCC_PS); break;
3695 case IR_ULE: asm_comp(as, ir, CC_A, CC_A, VCC_U); break;
3696 case IR_ABC:
3697 case IR_UGT: asm_comp(as, ir, CC_BE, CC_BE, VCC_U|VCC_PS); break;
3698 case IR_EQ: asm_comp(as, ir, CC_NE, CC_NE, VCC_P); break;
3699 case IR_NE: asm_comp(as, ir, CC_E, CC_E, VCC_U|VCC_P); break;
3700 3960
3701 case IR_RETF: asm_retf(as, ir); break; 3961 case IR_RETF: asm_retf(as, ir); break;
3702 3962
@@ -3744,7 +4004,15 @@ static void asm_ir(ASMState *as, IRIns *ir)
3744 case IR_FPMATH: case IR_ATAN2: case IR_LDEXP: 4004 case IR_FPMATH: case IR_ATAN2: case IR_LDEXP:
3745 asm_fpmath(as, ir); 4005 asm_fpmath(as, ir);
3746 break; 4006 break;
3747 case IR_POWI: asm_powi(as, ir); break; 4007 case IR_POWI:
4008#if LJ_64 && LJ_HASFFI
4009 if (!irt_isnum(ir->t))
4010 asm_arith64(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 :
4011 IRCALL_lj_carith_powu64);
4012 else
4013#endif
4014 asm_powi(as, ir);
4015 break;
3748 4016
3749 /* Overflow-checking arithmetic ops. Note: don't use LEA here! */ 4017 /* Overflow-checking arithmetic ops. Note: don't use LEA here! */
3750 case IR_ADDOV: asm_intarith(as, ir, XOg_ADD); break; 4018 case IR_ADDOV: asm_intarith(as, ir, XOg_ADD); break;
@@ -3801,6 +4069,7 @@ static void asm_trace(ASMState *as)
3801{ 4069{
3802 for (as->curins--; as->curins > as->stopins; as->curins--) { 4070 for (as->curins--; as->curins > as->stopins; as->curins--) {
3803 IRIns *ir = IR(as->curins); 4071 IRIns *ir = IR(as->curins);
4072 lua_assert(!(LJ_32 && irt_isint64(ir->t))); /* Handled by SPLIT. */
3804 if (!ra_used(ir) && !ir_sideeff(ir) && (as->flags & JIT_F_OPT_DCE)) 4073 if (!ra_used(ir) && !ir_sideeff(ir) && (as->flags & JIT_F_OPT_DCE))
3805 continue; /* Dead-code elimination can be soooo easy. */ 4074 continue; /* Dead-code elimination can be soooo easy. */
3806 if (irt_isguard(ir->t)) 4075 if (irt_isguard(ir->t))
@@ -3864,11 +4133,10 @@ static void asm_setup_regsp(ASMState *as, GCtrace *T)
3864 case IR_CALLN: case IR_CALLL: case IR_CALLS: { 4133 case IR_CALLN: case IR_CALLL: case IR_CALLS: {
3865 const CCallInfo *ci = &lj_ir_callinfo[ir->op2]; 4134 const CCallInfo *ci = &lj_ir_callinfo[ir->op2];
3866#if LJ_64 4135#if LJ_64
3867 /* NYI: add stack slots for x64 calls with many args. */
3868 lua_assert(CCI_NARGS(ci) <= (LJ_ABI_WIN ? 4 : 6)); 4136 lua_assert(CCI_NARGS(ci) <= (LJ_ABI_WIN ? 4 : 6));
3869 ir->prev = REGSP_HINT(irt_isnum(ir->t) ? RID_FPRET : RID_RET); 4137 ir->prev = REGSP_HINT(irt_isnum(ir->t) ? RID_FPRET : RID_RET);
3870#else 4138#else
3871 /* NYI: not fastcall-aware, but doesn't matter (yet). */ 4139 lua_assert(!(ci->flags & CCI_FASTCALL) || CCI_NARGS(ci) <= 2);
3872 if (CCI_NARGS(ci) > (uint32_t)as->evenspill) /* Leave room for args. */ 4140 if (CCI_NARGS(ci) > (uint32_t)as->evenspill) /* Leave room for args. */
3873 as->evenspill = (int32_t)CCI_NARGS(ci); 4141 as->evenspill = (int32_t)CCI_NARGS(ci);
3874 ir->prev = REGSP_HINT(RID_RET); 4142 ir->prev = REGSP_HINT(RID_RET);
@@ -3878,6 +4146,12 @@ static void asm_setup_regsp(ASMState *as, GCtrace *T)
3878 (RSET_SCRATCH & ~RSET_FPR) : RSET_SCRATCH; 4146 (RSET_SCRATCH & ~RSET_FPR) : RSET_SCRATCH;
3879 continue; 4147 continue;
3880 } 4148 }
4149#if LJ_32 && LJ_HASFFI
4150 case IR_HIOP:
4151 if ((ir-1)->o == IR_CALLN)
4152 ir->prev = REGSP_HINT(RID_RETHI);
4153 break;
4154#endif
3881 /* C calls evict all scratch regs and return results in RID_RET. */ 4155 /* C calls evict all scratch regs and return results in RID_RET. */
3882 case IR_SNEW: case IR_NEWREF: 4156 case IR_SNEW: case IR_NEWREF:
3883#if !LJ_64 4157#if !LJ_64
@@ -3894,6 +4168,14 @@ static void asm_setup_regsp(ASMState *as, GCtrace *T)
3894 as->modset = RSET_SCRATCH; 4168 as->modset = RSET_SCRATCH;
3895 break; 4169 break;
3896 case IR_POWI: 4170 case IR_POWI:
4171#if LJ_64 && LJ_HASFFI
4172 if (!irt_isnum(ir->t)) {
4173 ir->prev = REGSP_HINT(RID_RET);
4174 if (inloop)
4175 as->modset |= (RSET_SCRATCH & RSET_GPR);
4176 continue;
4177 }
4178#endif
3897 ir->prev = REGSP_HINT(RID_XMM0); 4179 ir->prev = REGSP_HINT(RID_XMM0);
3898 if (inloop) 4180 if (inloop)
3899 as->modset |= RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX); 4181 as->modset |= RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX);
diff --git a/src/lj_carith.c b/src/lj_carith.c
index 46f07be7..134a61fb 100644
--- a/src/lj_carith.c
+++ b/src/lj_carith.c
@@ -230,6 +230,14 @@ int lj_carith_op(lua_State *L, MMS mm)
230 230
231/* -- 64 bit integer arithmetic helpers ----------------------------------- */ 231/* -- 64 bit integer arithmetic helpers ----------------------------------- */
232 232
233#if LJ_32
234/* Signed/unsigned 64 bit multiply. */
235int64_t lj_carith_mul64(int64_t a, int64_t b)
236{
237 return a * b;
238}
239#endif
240
233/* Unsigned 64 bit x^k. */ 241/* Unsigned 64 bit x^k. */
234uint64_t lj_carith_powu64(uint64_t x, uint64_t k) 242uint64_t lj_carith_powu64(uint64_t x, uint64_t k)
235{ 243{
diff --git a/src/lj_carith.h b/src/lj_carith.h
index 6870172b..14073603 100644
--- a/src/lj_carith.h
+++ b/src/lj_carith.h
@@ -12,6 +12,9 @@
12 12
13LJ_FUNC int lj_carith_op(lua_State *L, MMS mm); 13LJ_FUNC int lj_carith_op(lua_State *L, MMS mm);
14 14
15#if LJ_32
16LJ_FUNC int64_t lj_carith_mul64(int64_t x, int64_t k);
17#endif
15LJ_FUNC uint64_t lj_carith_powu64(uint64_t x, uint64_t k); 18LJ_FUNC uint64_t lj_carith_powu64(uint64_t x, uint64_t k);
16LJ_FUNC int64_t lj_carith_powi64(int64_t x, int64_t k); 19LJ_FUNC int64_t lj_carith_powi64(int64_t x, int64_t k);
17 20
diff --git a/src/lj_crecord.c b/src/lj_crecord.c
index 61210907..5eafa3a7 100644
--- a/src/lj_crecord.c
+++ b/src/lj_crecord.c
@@ -189,6 +189,7 @@ static void crec_ct_ct(jit_State *J, CType *d, CType *s, TRef dp, TRef sp,
189 sp = emitconv(sp, dsize < 4 ? IRT_INT : dt, st, 0); 189 sp = emitconv(sp, dsize < 4 ? IRT_INT : dt, st, 0);
190#endif 190#endif
191 xstore: 191 xstore:
192 if (dt == IRT_I64 || dt == IRT_U64) lj_needsplit(J);
192 emitir(IRT(IR_XSTORE, dt), dp, sp); 193 emitir(IRT(IR_XSTORE, dt), dp, sp);
193 break; 194 break;
194 case CCX(I, C): 195 case CCX(I, C):
@@ -311,6 +312,7 @@ static TRef crec_tv_ct(jit_State *J, CType *s, CTypeID sid, TRef sp)
311 TRef ptr = emitir(IRT(IR_ADD, IRT_PTR), dp, 312 TRef ptr = emitir(IRT(IR_ADD, IRT_PTR), dp,
312 lj_ir_kintp(J, sizeof(GCcdata))); 313 lj_ir_kintp(J, sizeof(GCcdata)));
313 emitir(IRT(IR_XSTORE, t), ptr, tr); 314 emitir(IRT(IR_XSTORE, t), ptr, tr);
315 lj_needsplit(J);
314 return dp; 316 return dp;
315 } else if ((sinfo & CTF_BOOL)) { 317 } else if ((sinfo & CTF_BOOL)) {
316 /* Assume not equal to zero. Fixup and emit pending guard later. */ 318 /* Assume not equal to zero. Fixup and emit pending guard later. */
@@ -406,7 +408,10 @@ static void crec_ct_tv(jit_State *J, CType *d, TRef dp, TRef sp, TValue *sval)
406 if (ctype_isenum(s->info)) s = ctype_child(cts, s); 408 if (ctype_isenum(s->info)) s = ctype_child(cts, s);
407 if (ctype_isnum(s->info)) { /* Load number value. */ 409 if (ctype_isnum(s->info)) { /* Load number value. */
408 IRType t = crec_ct2irt(s); 410 IRType t = crec_ct2irt(s);
409 if (t != IRT_CDATA) sp = emitir(IRT(IR_XLOAD, t), sp, 0); 411 if (t != IRT_CDATA) {
412 sp = emitir(IRT(IR_XLOAD, t), sp, 0);
413 if (t == IRT_I64 || t == IRT_U64) lj_needsplit(J);
414 }
410 } 415 }
411 goto doconv; 416 goto doconv;
412 } 417 }
@@ -499,8 +504,10 @@ void LJ_FASTCALL recff_cdata_index(jit_State *J, RecordFFData *rd)
499 if (ctype_isinteger(ctk->info) && (t = crec_ct2irt(ctk)) != IRT_CDATA) { 504 if (ctype_isinteger(ctk->info) && (t = crec_ct2irt(ctk)) != IRT_CDATA) {
500 idx = emitir(IRT(IR_ADD, IRT_PTR), idx, lj_ir_kintp(J, sizeof(GCcdata))); 505 idx = emitir(IRT(IR_ADD, IRT_PTR), idx, lj_ir_kintp(J, sizeof(GCcdata)));
501 idx = emitir(IRT(IR_XLOAD, t), idx, 0); 506 idx = emitir(IRT(IR_XLOAD, t), idx, 0);
502 if (!LJ_64 && (t == IRT_I64 || t == IRT_U64)) 507 if (!LJ_64 && (t == IRT_I64 || t == IRT_U64)) {
503 idx = emitconv(idx, IRT_INT, t, 0); 508 idx = emitconv(idx, IRT_INT, t, 0);
509 lj_needsplit(J);
510 }
504 goto integer_key; 511 goto integer_key;
505 } 512 }
506 } else if (tref_isstr(idx)) { 513 } else if (tref_isstr(idx)) {
@@ -664,6 +671,7 @@ static TRef crec_arith_int64(jit_State *J, TRef *sp, CType **s, MMS mm)
664 CTypeID id; 671 CTypeID id;
665 TRef tr, dp, ptr; 672 TRef tr, dp, ptr;
666 MSize i; 673 MSize i;
674 lj_needsplit(J);
667 if (((s[0]->info & CTF_UNSIGNED) && s[0]->size == 8) || 675 if (((s[0]->info & CTF_UNSIGNED) && s[0]->size == 8) ||
668 ((s[1]->info & CTF_UNSIGNED) && s[1]->size == 8)) { 676 ((s[1]->info & CTF_UNSIGNED) && s[1]->size == 8)) {
669 dt = IRT_U64; id = CTID_UINT64; 677 dt = IRT_U64; id = CTID_UINT64;
@@ -691,9 +699,6 @@ static TRef crec_arith_int64(jit_State *J, TRef *sp, CType **s, MMS mm)
691 lj_ir_set(J, IRTG(op, dt), sp[0], sp[1]); 699 lj_ir_set(J, IRTG(op, dt), sp[0], sp[1]);
692 J->postproc = LJ_POST_FIXGUARD; 700 J->postproc = LJ_POST_FIXGUARD;
693 return TREF_TRUE; 701 return TREF_TRUE;
694 } else if (mm == MM_pow) {
695 tr = lj_ir_call(J, dt == IRT_I64 ? IRCALL_lj_carith_powi64 :
696 IRCALL_lj_carith_powu64, sp[0], sp[1]);
697 } else { 702 } else {
698 if (mm == MM_div || mm == MM_mod) 703 if (mm == MM_div || mm == MM_mod)
699 return 0; /* NYI: integer div, mod. */ 704 return 0; /* NYI: integer div, mod. */
@@ -754,10 +759,11 @@ static TRef crec_arith_ptr(jit_State *J, TRef *sp, CType **s, MMS mm)
754 tr = emitconv(tr, IRT_INTP, IRT_INT, 759 tr = emitconv(tr, IRT_INTP, IRT_INT,
755 ((t - IRT_I8) & 1) ? 0 : IRCONV_SEXT); 760 ((t - IRT_I8) & 1) ? 0 : IRCONV_SEXT);
756#else 761#else
757 if (!tref_typerange(sp[1], IRT_I8, IRT_U32)) 762 if (!tref_typerange(sp[1], IRT_I8, IRT_U32)) {
758 tr = emitconv(tr, IRT_INTP, t, 763 tr = emitconv(tr, IRT_INTP, t,
759 (t == IRT_NUM || t == IRT_FLOAT) ? 764 (t == IRT_NUM || t == IRT_FLOAT) ?
760 IRCONV_TRUNC|IRCONV_ANY : 0); 765 IRCONV_TRUNC|IRCONV_ANY : 0);
766 }
761#endif 767#endif
762 tr = emitir(IRT(IR_MUL, IRT_INTP), tr, lj_ir_kintp(J, sz)); 768 tr = emitir(IRT(IR_MUL, IRT_INTP), tr, lj_ir_kintp(J, sz));
763 tr = emitir(IRT(IR_ADD, IRT_PTR), sp[0], tr); 769 tr = emitir(IRT(IR_ADD, IRT_PTR), sp[0], tr);
@@ -790,6 +796,7 @@ void LJ_FASTCALL recff_cdata_arith(jit_State *J, RecordFFData *rd)
790 if (ctype_isnum(ct->info)) { 796 if (ctype_isnum(ct->info)) {
791 IRType t = crec_ct2irt(ct); 797 IRType t = crec_ct2irt(ct);
792 if (t == IRT_CDATA) goto err_type; 798 if (t == IRT_CDATA) goto err_type;
799 if (t == IRT_I64 || t == IRT_U64) lj_needsplit(J);
793 tr = emitir(IRT(IR_XLOAD, t), tr, 0); 800 tr = emitir(IRT(IR_XLOAD, t), tr, 0);
794 } else if (!(ctype_isptr(ct->info) || ctype_isrefarray(ct->info))) { 801 } else if (!(ctype_isptr(ct->info) || ctype_isrefarray(ct->info))) {
795 goto err_type; 802 goto err_type;
@@ -842,6 +849,7 @@ void LJ_FASTCALL lj_crecord_tonumber(jit_State *J, RecordFFData *rd)
842 IRType t = crec_ct2irt(s); 849 IRType t = crec_ct2irt(s);
843 if (t != IRT_CDATA) { 850 if (t != IRT_CDATA) {
844 TRef tr = emitir(IRT(IR_XLOAD, t), sp, 0); /* Load number value. */ 851 TRef tr = emitir(IRT(IR_XLOAD, t), sp, 0); /* Load number value. */
852 if (t == IRT_I64 || t == IRT_U64) lj_needsplit(J);
845 if (t == IRT_FLOAT || t == IRT_U32 || t == IRT_I64 || t == IRT_U64) 853 if (t == IRT_FLOAT || t == IRT_U32 || t == IRT_I64 || t == IRT_U64)
846 tr = emitconv(tr, IRT_NUM, t, 0); 854 tr = emitconv(tr, IRT_NUM, t, 0);
847 J->base[0] = tr; 855 J->base[0] = tr;
diff --git a/src/lj_ir.h b/src/lj_ir.h
index 1cb3566e..286eb219 100644
--- a/src/lj_ir.h
+++ b/src/lj_ir.h
@@ -33,6 +33,7 @@
33 /* Miscellaneous ops. */ \ 33 /* Miscellaneous ops. */ \
34 _(NOP, N , ___, ___) \ 34 _(NOP, N , ___, ___) \
35 _(BASE, N , lit, lit) \ 35 _(BASE, N , lit, lit) \
36 _(HIOP, S , ref, ref) \
36 _(LOOP, S , ___, ___) \ 37 _(LOOP, S , ___, ___) \
37 _(PHI, S , ref, ref) \ 38 _(PHI, S , ref, ref) \
38 _(RENAME, S , ref, lit) \ 39 _(RENAME, S , ref, lit) \
@@ -212,8 +213,9 @@ IRFLDEF(FLENUM)
212/* CONV mode, stored in op2. */ 213/* CONV mode, stored in op2. */
213#define IRCONV_SRCMASK 0x001f /* Source IRType. */ 214#define IRCONV_SRCMASK 0x001f /* Source IRType. */
214#define IRCONV_DSTMASK 0x03e0 /* Dest. IRType (also in ir->t). */ 215#define IRCONV_DSTMASK 0x03e0 /* Dest. IRType (also in ir->t). */
215#define IRCONV_NUM_INT ((IRT_NUM<<5)|IRT_INT) 216#define IRCONV_DSH 5
216#define IRCONV_INT_NUM ((IRT_INT<<5)|IRT_NUM) 217#define IRCONV_NUM_INT ((IRT_NUM<<IRCONV_DSH)|IRT_INT)
218#define IRCONV_INT_NUM ((IRT_INT<<IRCONV_DSH)|IRT_NUM)
217#define IRCONV_TRUNC 0x0400 /* Truncate number to integer. */ 219#define IRCONV_TRUNC 0x0400 /* Truncate number to integer. */
218#define IRCONV_SEXT 0x0800 /* Sign-extend integer to integer. */ 220#define IRCONV_SEXT 0x0800 /* Sign-extend integer to integer. */
219#define IRCONV_MODEMASK 0x0fff 221#define IRCONV_MODEMASK 0x0fff
@@ -251,13 +253,21 @@ typedef struct CCallInfo {
251#define CCI_CASTU64 0x0200 /* Cast u64 result to number. */ 253#define CCI_CASTU64 0x0200 /* Cast u64 result to number. */
252#define CCI_NOFPRCLOBBER 0x0400 /* Does not clobber any FPRs. */ 254#define CCI_NOFPRCLOBBER 0x0400 /* Does not clobber any FPRs. */
253#define CCI_FASTCALL 0x0800 /* Fastcall convention. */ 255#define CCI_FASTCALL 0x0800 /* Fastcall convention. */
254#define CCI_STACK64 0x1000 /* Needs 64 bits per argument. */
255 256
256/* Function definitions for CALL* instructions. */ 257/* Function definitions for CALL* instructions. */
257#if LJ_HASFFI 258#if LJ_HASFFI
259#if LJ_32
260#define ARG2_64 4 /* Treat as 4 32 bit arguments. */
261#define IRCALLDEF_FFI32(_) \
262 _(lj_carith_mul64, ARG2_64, N, I64, CCI_NOFPRCLOBBER)
263#else
264#define ARG2_64 2
265#define IRCALLDEF_FFI32(_)
266#endif
258#define IRCALLDEF_FFI(_) \ 267#define IRCALLDEF_FFI(_) \
259 _(lj_carith_powi64, 2, N, I64, CCI_STACK64|CCI_NOFPRCLOBBER) \ 268 IRCALLDEF_FFI32(_) \
260 _(lj_carith_powu64, 2, N, U64, CCI_STACK64|CCI_NOFPRCLOBBER) 269 _(lj_carith_powi64, ARG2_64, N, I64, CCI_NOFPRCLOBBER) \
270 _(lj_carith_powu64, ARG2_64, N, U64, CCI_NOFPRCLOBBER)
261#else 271#else
262#define IRCALLDEF_FFI(_) 272#define IRCALLDEF_FFI(_)
263#endif 273#endif
@@ -402,6 +412,7 @@ typedef struct IRType1 { uint8_t irt; } IRType1;
402#define irt_isinteger(t) (irt_typerange((t), IRT_I8, IRT_INT)) 412#define irt_isinteger(t) (irt_typerange((t), IRT_I8, IRT_INT))
403#define irt_isgcv(t) (irt_typerange((t), IRT_STR, IRT_UDATA)) 413#define irt_isgcv(t) (irt_typerange((t), IRT_STR, IRT_UDATA))
404#define irt_isaddr(t) (irt_typerange((t), IRT_LIGHTUD, IRT_UDATA)) 414#define irt_isaddr(t) (irt_typerange((t), IRT_LIGHTUD, IRT_UDATA))
415#define irt_isint64(t) (irt_typerange((t), IRT_I64, IRT_U64))
405 416
406#if LJ_64 417#if LJ_64
407#define IRT_IS64 \ 418#define IRT_IS64 \
diff --git a/src/lj_iropt.h b/src/lj_iropt.h
index 43c414c1..db99c118 100644
--- a/src/lj_iropt.h
+++ b/src/lj_iropt.h
@@ -141,6 +141,12 @@ LJ_FUNC IRType lj_opt_narrow_forl(cTValue *forbase);
141/* Optimization passes. */ 141/* Optimization passes. */
142LJ_FUNC void lj_opt_dce(jit_State *J); 142LJ_FUNC void lj_opt_dce(jit_State *J);
143LJ_FUNC int lj_opt_loop(jit_State *J); 143LJ_FUNC int lj_opt_loop(jit_State *J);
144#if LJ_HASFFI && LJ_32
145LJ_FUNC void lj_opt_split(jit_State *J);
146#else
147#define lj_opt_split(J) UNUSED(J)
148#endif
149
144#endif 150#endif
145 151
146#endif 152#endif
diff --git a/src/lj_jit.h b/src/lj_jit.h
index a8be1a97..38970fc7 100644
--- a/src/lj_jit.h
+++ b/src/lj_jit.h
@@ -240,6 +240,15 @@ enum {
240#define LJ_KSIMD(J, n) \ 240#define LJ_KSIMD(J, n) \
241 ((TValue *)(((intptr_t)&J->ksimd[2*(n)] + 15) & ~(intptr_t)15)) 241 ((TValue *)(((intptr_t)&J->ksimd[2*(n)] + 15) & ~(intptr_t)15))
242 242
243/* Set/reset flag to activate the SPLIT pass for the current trace. */
244#if LJ_32 && LJ_HASFFI
245#define lj_needsplit(J) (J->needsplit = 1)
246#define lj_resetsplit(J) (J->needsplit = 0)
247#else
248#define lj_needsplit(J) UNUSED(J)
249#define lj_resetsplit(J) UNUSED(J)
250#endif
251
243/* Fold state is used to fold instructions on-the-fly. */ 252/* Fold state is used to fold instructions on-the-fly. */
244typedef struct FoldState { 253typedef struct FoldState {
245 IRIns ins; /* Currently emitted instruction. */ 254 IRIns ins; /* Currently emitted instruction. */
@@ -293,6 +302,9 @@ typedef struct jit_State {
293 MSize sizesnapmap; /* Size of temp. snapshot map buffer. */ 302 MSize sizesnapmap; /* Size of temp. snapshot map buffer. */
294 303
295 PostProc postproc; /* Required post-processing after execution. */ 304 PostProc postproc; /* Required post-processing after execution. */
305#if LJ_32 && LJ_HASFFI
306 int needsplit; /* Need SPLIT pass. */
307#endif
296 308
297 GCRef *trace; /* Array of traces. */ 309 GCRef *trace; /* Array of traces. */
298 TraceNo freetrace; /* Start of scan for next free trace. */ 310 TraceNo freetrace; /* Start of scan for next free trace. */
diff --git a/src/lj_opt_fold.c b/src/lj_opt_fold.c
index 2d08e187..03caf80d 100644
--- a/src/lj_opt_fold.c
+++ b/src/lj_opt_fold.c
@@ -538,6 +538,13 @@ LJFOLDF(kfold_conv_knum_int_num)
538 } 538 }
539} 539}
540 540
541LJFOLD(CONV KNUM IRCONV_U32_NUM)
542LJFOLDF(kfold_conv_knum_u32_num)
543{
544 lua_assert((fins->op2 & IRCONV_TRUNC));
545 return INTFOLD((int32_t)(uint32_t)knumleft);
546}
547
541LJFOLD(CONV KNUM IRCONV_I64_NUM) 548LJFOLD(CONV KNUM IRCONV_I64_NUM)
542LJFOLDF(kfold_conv_knum_i64_num) 549LJFOLDF(kfold_conv_knum_i64_num)
543{ 550{
@@ -805,6 +812,7 @@ LJFOLDF(simplify_conv_u32_num)
805} 812}
806 813
807LJFOLD(CONV CONV IRCONV_I64_NUM) /* _INT or _U32*/ 814LJFOLD(CONV CONV IRCONV_I64_NUM) /* _INT or _U32*/
815LJFOLD(CONV CONV IRCONV_U64_NUM) /* _INT or _U32*/
808LJFOLDF(simplify_conv_i64_num) 816LJFOLDF(simplify_conv_i64_num)
809{ 817{
810 PHIBARRIER(fleft); 818 PHIBARRIER(fleft);
@@ -826,23 +834,6 @@ LJFOLDF(simplify_conv_i64_num)
826 return NEXTFOLD; 834 return NEXTFOLD;
827} 835}
828 836
829LJFOLD(CONV CONV IRCONV_U64_NUM) /* _U32*/
830LJFOLDF(simplify_conv_u64_num)
831{
832 PHIBARRIER(fleft);
833 if ((fleft->op2 & IRCONV_SRCMASK) == IRT_U32) {
834#if LJ_TARGET_X64
835 return fleft->op1;
836#else
837 /* Reduce to a zero-extension. */
838 fins->op1 = fleft->op1;
839 fins->op2 = (IRT_U64<<5)|IRT_U32;
840 return RETRYFOLD;
841#endif
842 }
843 return NEXTFOLD;
844}
845
846/* Shortcut TOBIT + IRT_NUM <- IRT_INT/IRT_U32 conversion. */ 837/* Shortcut TOBIT + IRT_NUM <- IRT_INT/IRT_U32 conversion. */
847LJFOLD(TOBIT CONV KNUM) 838LJFOLD(TOBIT CONV KNUM)
848LJFOLDF(simplify_tobit_conv) 839LJFOLDF(simplify_tobit_conv)
diff --git a/src/lj_opt_split.c b/src/lj_opt_split.c
new file mode 100644
index 00000000..3cb30514
--- /dev/null
+++ b/src/lj_opt_split.c
@@ -0,0 +1,343 @@
1/*
2** SPLIT: Split 64 bit IR instructions into 32 bit IR instructions.
3** Copyright (C) 2005-2011 Mike Pall. See Copyright Notice in luajit.h
4*/
5
6#define lj_opt_split_c
7#define LUA_CORE
8
9#include "lj_obj.h"
10
11#if LJ_HASJIT && LJ_HASFFI && LJ_32
12
13#include "lj_err.h"
14#include "lj_str.h"
15#include "lj_ir.h"
16#include "lj_jit.h"
17#include "lj_iropt.h"
18#include "lj_vm.h"
19
20/* SPLIT pass:
21**
22** This pass splits up 64 bit IR instructions into multiple 32 bit IR
23** instructions. It's only active for 32 bit CPUs which lack native 64 bit
24** operations. The FFI is currently the only emitter for 64 bit
25** instructions, so this pass is disabled if the FFI is disabled.
26**
27** Splitting the IR in a separate pass keeps each 32 bit IR assembler
28** backend simple. Only a small amount of extra functionality needs to be
29** implemented. This is much easier than adding support for allocating
30** register pairs to each backend (believe me, I tried). A few simple, but
31** important optimizations can be performed by the SPLIT pass, which would
32** be tedious to do in the backend.
33**
34** The basic idea is to replace each 64 bit IR instruction with its 32 bit
 35** equivalent plus an extra HIOP instruction. The split IR is not passed
36** through FOLD or any other optimizations, so each HIOP is guaranteed to
 37** immediately follow its counterpart. The actual functionality of HIOP is
38** inferred from the previous instruction.
39**
40** The operands of HIOP hold the hiword input references. The output of HIOP
41** is the hiword output reference, which is also used to hold the hiword
42** register or spill slot information. The register allocator treats this
43** instruction independent of any other instruction, which improves code
44** quality compared to using fixed register pairs.
45**
46** It's easier to split up some instructions into two regular 32 bit
47** instructions. E.g. XLOAD is split up into two XLOADs with two different
48** addresses. Obviously 64 bit constants need to be split up into two 32 bit
49** constants, too. Some hiword instructions can be entirely omitted, e.g.
50** when zero-extending a 32 bit value to 64 bits.
51**
52** Here's the IR and x64 machine code for 'x.b = x.a + 1' for a struct with
53** two int64_t fields:
54**
55** 0100 p32 ADD base +8
56** 0101 i64 XLOAD 0100
57** 0102 i64 ADD 0101 +1
58** 0103 p32 ADD base +16
59** 0104 i64 XSTORE 0103 0102
60**
61** mov rax, [esi+0x8]
62** add rax, +0x01
63** mov [esi+0x10], rax
64**
65** Here's the transformed IR and the x86 machine code after the SPLIT pass:
66**
67** 0100 p32 ADD base +8
68** 0101 int XLOAD 0100
69** 0102 p32 ADD base +12
70** 0103 int XLOAD 0102
71** 0104 int ADD 0101 +1
72** 0105 int HIOP 0103 +0
73** 0106 p32 ADD base +16
74** 0107 int XSTORE 0106 0104
75** 0108 p32 ADD base +20
76** 0109 int XSTORE 0108 0105
77**
78** mov eax, [esi+0x8]
79** mov ecx, [esi+0xc]
80** add eax, +0x01
81** adc ecx, +0x00
82** mov [esi+0x10], eax
83** mov [esi+0x14], ecx
84**
85** You may notice the reassociated hiword address computation, which is
86** later fused into the mov operands by the assembler.
87*/
88
89/* Some local macros to save typing. Undef'd at the end. */
90#define IR(ref) (&J->cur.ir[(ref)])
91
92/* Directly emit the transformed IR without updating chains etc. */
93static IRRef split_emit(jit_State *J, uint16_t ot, IRRef1 op1, IRRef1 op2)
94{
95 IRRef nref = lj_ir_nextins(J);
96 IRIns *ir = IR(nref);
97 ir->ot = ot;
98 ir->op1 = op1;
99 ir->op2 = op2;
100 return nref;
101}
102
103/* Emit a CALLN with two split 64 bit arguments. */
104static IRRef split_call64(jit_State *J, IRRef1 *hisubst, IRIns *oir,
105 IRIns *ir, IRCallID id)
106{
107 IRRef tmp, op1 = ir->op1, op2 = ir->op2;
108 J->cur.nins--;
109#if LJ_LE
110 tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), oir[op1].prev, hisubst[op1]);
111 tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, oir[op2].prev);
112 tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, hisubst[op2]);
113#else
114 tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), hisubst[op1], oir[op1].prev);
115 tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, hisubst[op2]);
116 tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, oir[op2].prev);
117#endif
118 ir->prev = tmp = split_emit(J, IRTI(IR_CALLN), tmp, id);
119 return split_emit(J, IRTI(IR_HIOP), tmp, tmp);
120}
121
122/* Get a pointer to the other 32 bit word (LE: hiword, BE: loword). */
123static IRRef split_ptr(jit_State *J, IRRef ref)
124{
125 IRIns *ir = IR(ref);
126 int32_t ofs = 4;
127 if (ir->o == IR_ADD && irref_isk(ir->op2)) { /* Reassociate address. */
128 ofs += IR(ir->op2)->i;
129 ref = ir->op1;
130 if (ofs == 0) return ref;
131 }
132 return split_emit(J, IRTI(IR_ADD), ref, lj_ir_kint(J, ofs));
133}
134
135/* Transform the old IR to the new IR. */
136static void split_ir(jit_State *J)
137{
138 IRRef nins = J->cur.nins, nk = J->cur.nk;
139 MSize irlen = nins - nk;
140 MSize need = (irlen+1)*(sizeof(IRIns) + sizeof(IRRef1));
141 IRIns *oir = (IRIns *)lj_str_needbuf(J->L, &G(J->L)->tmpbuf, need);
142 IRRef1 *hisubst;
143 IRRef ref;
144
145 /* Copy old IR to buffer. */
146 memcpy(oir, IR(nk), irlen*sizeof(IRIns));
147 /* Bias hiword substitution table and old IR. Loword kept in field prev. */
148 hisubst = (IRRef1 *)&oir[irlen] - nk;
149 oir -= nk;
150
151 /* Remove all IR instructions, but retain IR constants. */
152 J->cur.nins = REF_FIRST;
153
154 /* Process constants and fixed references. */
155 for (ref = nk; ref <= REF_BASE; ref++) {
156 IRIns *ir = &oir[ref];
157 if (ir->o == IR_KINT64) { /* Split up 64 bit constant. */
158 TValue tv = *ir_k64(ir);
159 ir->prev = lj_ir_kint(J, (int32_t)tv.u32.lo);
160 hisubst[ref] = lj_ir_kint(J, (int32_t)tv.u32.hi);
161 } else {
162 ir->prev = (IRRef1)ref; /* Identity substitution for loword. */
163 }
164 }
165
166 /* Process old IR instructions. */
167 for (ref = REF_FIRST; ref < nins; ref++) {
168 IRIns *ir = &oir[ref];
169 IRRef nref = lj_ir_nextins(J);
170 IRIns *nir = IR(nref);
171
172 /* Copy-substitute old instruction to new instruction. */
173 nir->op1 = ir->op1 < nk ? ir->op1 : oir[ir->op1].prev;
174 nir->op2 = ir->op2 < nk ? ir->op2 : oir[ir->op2].prev;
175 ir->prev = nref; /* Loword substitution. */
176 nir->o = ir->o;
177 nir->t.irt = ir->t.irt & ~(IRT_MARK|IRT_ISPHI);
178
179 /* Split 64 bit instructions. */
180 if (irt_isint64(ir->t)) {
181 IRRef hi = hisubst[ir->op1];
182 nir->t.irt = IRT_INT | (nir->t.irt & IRT_GUARD); /* Turn into INT op. */
183 switch (ir->o) {
184 case IR_ADD:
185 case IR_SUB:
186 /* Use plain op for hiword if loword cannot produce a carry/borrow. */
187 if (irref_isk(nir->op2) && IR(nir->op2)->i == 0) {
188 ir->prev = nir->op1; /* Pass through loword. */
189 nir->op1 = hi; nir->op2 = hisubst[ir->op2];
190 hi = nref;
191 break;
192 }
193 /* fallthrough */
194 case IR_NEG:
195 hi = split_emit(J, IRTI(IR_HIOP), hi, hisubst[ir->op2]);
196 break;
197 case IR_MUL:
198 hi = split_call64(J, hisubst, oir, ir, IRCALL_lj_carith_mul64);
199 break;
200 case IR_POWI:
201 hi = split_call64(J, hisubst, oir, ir,
202 irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 :
203 IRCALL_lj_carith_powu64);
204 break;
205 case IR_XLOAD:
206 hi = split_emit(J, IRTI(IR_XLOAD), split_ptr(J, nir->op1), ir->op2);
207#if LJ_BE
208 ir->prev = hi; hi = nref;
209#endif
210 break;
211 case IR_XSTORE:
212#if LJ_LE
213 hi = hisubst[ir->op2];
214#else
215 hi = nir->op2; nir->op2 = hisubst[ir->op2];
216#endif
217 split_emit(J, IRTI(IR_XSTORE), split_ptr(J, nir->op1), hi);
218 continue;
219 case IR_CONV: { /* Conversion to 64 bit integer. Others handled below. */
220 IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK);
221 if (st == IRT_NUM || st == IRT_FLOAT) { /* FP to 64 bit int conv. */
222 hi = split_emit(J, IRTI(IR_HIOP), nir->op1, nref);
223 } else if (st == IRT_I64 || st == IRT_U64) { /* 64/64 bit cast. */
224 /* Drop cast, since assembler doesn't care. */
225 hisubst[ref] = hi;
226 goto fwdlo;
227 } else if ((ir->op2 & IRCONV_SEXT)) { /* Sign-extend to 64 bit. */
228 IRRef k31 = lj_ir_kint(J, 31);
229 nir = IR(nref); /* May have been reallocated. */
230 ir->prev = nir->op1; /* Pass through loword. */
231 nir->o = IR_BSAR; /* hi = bsar(lo, 31). */
232 nir->op2 = k31;
233 hi = nref;
234 } else { /* Zero-extend to 64 bit. */
235 hisubst[ref] = lj_ir_kint(J, 0);
236 goto fwdlo;
237 }
238 break;
239 }
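    /* Worked example for the sign-extension branch above: widening the
    ** 32 bit value -5 (0xfffffffb) keeps 0xfffffffb as the loword and
    ** derives the hiword as BSAR(lo, 31) = 0xffffffff, i.e. an arithmetic
    ** shift replicating the sign bit.  Zero-extension uses KINT 0 instead.
    */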
240 case IR_PHI: {
241 IRRef hi2;
242 if ((irref_isk(nir->op1) && irref_isk(nir->op2)) ||
243 nir->op1 == nir->op2)
244 J->cur.nins--; /* Drop useless PHIs. */
245 hi2 = hisubst[ir->op2];
246 if (!((irref_isk(hi) && irref_isk(hi2)) || hi == hi2))
247 split_emit(J, IRTI(IR_PHI), hi, hi2);
248 continue;
249 }
250 default:
251 lua_assert(ir->o <= IR_NE);
252 split_emit(J, IRTGI(IR_HIOP), hi, hisubst[ir->op2]); /* Comparisons. */
253 continue;
254 }
255 hisubst[ref] = hi; /* Store hiword substitution. */
256 } else if (ir->o == IR_CONV) { /* See above, too. */
257 IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK);
258 if (st == IRT_I64 || st == IRT_U64) { /* Conversion from 64 bit int. */
259 if (irt_isfp(ir->t)) { /* 64 bit integer to FP conversion. */
260 ir->prev = split_emit(J, IRT(IR_HIOP, irt_type(ir->t)),
261 hisubst[ir->op1], nref);
262 } else { /* Truncate to lower 32 bits. */
263 fwdlo:
264 ir->prev = nir->op1; /* Forward loword. */
265 /* Replace with NOP to avoid messing up the snapshot logic. */
266 nir->ot = IRT(IR_NOP, IRT_NIL);
267 nir->op1 = nir->op2 = 0;
268 }
269 }
270 } else if (ir->o == IR_LOOP) {
271 J->loopref = nref; /* Needed by assembler. */
272 }
273 }
274
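  /* The copy-substitution above cleared IRT_MARK and IRT_ISPHI on every
  ** instruction, and the hiword halves are entirely new instructions, so
  ** the PHI marks must be re-derived from the PHIs now at the end of the
  ** new IR.
  */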
275 /* Add PHI marks. */
276 for (ref = J->cur.nins-1; ref >= REF_FIRST; ref--) {
277 IRIns *ir = IR(ref);
278 if (ir->o != IR_PHI) break;
279 if (!irref_isk(ir->op1)) irt_setphi(IR(ir->op1)->t);
280 if (ir->op2 > J->loopref) irt_setphi(IR(ir->op2)->t);
281 }
282
283 /* Substitute snapshot maps. */
284 oir[nins].prev = J->cur.nins; /* Substitution for last snapshot. */
285 {
286 SnapNo i, nsnap = J->cur.nsnap;
287 for (i = 0; i < nsnap; i++) {
288 SnapShot *snap = &J->cur.snap[i];
289 SnapEntry *map = &J->cur.snapmap[snap->mapofs];
290 MSize n, nent = snap->nent;
291 snap->ref = oir[snap->ref].prev;
292 for (n = 0; n < nent; n++) {
293 SnapEntry sn = map[n];
294 map[n] = ((sn & 0xffff0000) | oir[snap_ref(sn)].prev);
295 }
296 }
297 }
298}
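/* Minimal sketch of the snapshot remapping at the end of split_ir, assuming
** only what the code above already relies on: a SnapEntry keeps its IR
** reference in the low 16 bits (extracted by snap_ref()), while slot number
** and flags in the upper 16 bits pass through unchanged.  The extra
** oir[nins].prev entry covers a trailing snapshot whose ref points just past
** the last instruction.  The helper name is made up for the example:
*/
static SnapEntry sketch_snap_subst(SnapEntry sn, IRRef1 newref)
{
  return (sn & 0xffff0000) | newref;
}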
299
300/* Protected callback for split pass. */
301static TValue *cpsplit(lua_State *L, lua_CFunction dummy, void *ud)
302{
303 jit_State *J = (jit_State *)ud;
304 split_ir(J);
305 UNUSED(L); UNUSED(dummy);
306 return NULL;
307}
308
309#ifdef LUA_USE_ASSERT
310/* Slow, but sure way to check whether a SPLIT pass is needed. */
311static int split_needsplit(jit_State *J)
312{
313 IRIns *ir, *irend;
314 IRRef ref;
315 for (ir = IR(REF_FIRST), irend = IR(J->cur.nins); ir < irend; ir++)
316 if (irt_isint64(ir->t))
317 return 1;
318 for (ref = J->chain[IR_CONV]; ref; ref = IR(ref)->prev)
319 if ((IR(ref)->op2 & IRCONV_SRCMASK) == IRT_I64 ||
320 (IR(ref)->op2 & IRCONV_SRCMASK) == IRT_U64)
321 return 1;
322 return 0; /* Nope. */
323}
324#endif
325
326/* SPLIT pass. */
327void lj_opt_split(jit_State *J)
328{
329 lua_assert(J->needsplit >= split_needsplit(J)); /* Verify flag. */
330 if (J->needsplit) {
331 int errcode = lj_vm_cpcall(J->L, NULL, J, cpsplit);
332 if (errcode) {
333 /* Completely reset the trace to avoid inconsistent dump on abort. */
334 J->cur.nins = J->cur.nk = REF_BASE;
335 J->cur.nsnap = 0;
336 lj_err_throw(J->L, errcode); /* Propagate errors. */
337 }
338 }
339}
340
341#undef IR
342
343#endif
diff --git a/src/lj_target_x86.h b/src/lj_target_x86.h
index 94ab3c32..37c68f4b 100644
--- a/src/lj_target_x86.h
+++ b/src/lj_target_x86.h
@@ -193,6 +193,7 @@ typedef enum {
193 XI_FLD1 = 0xe8d9, 193 XI_FLD1 = 0xe8d9,
194 XI_FLDLG2 = 0xecd9, 194 XI_FLDLG2 = 0xecd9,
195 XI_FLDLN2 = 0xedd9, 195 XI_FLDLN2 = 0xedd9,
196 XI_FDUP = 0xc0d9, /* Really fld st0. */
196 XI_FPOP = 0xd8dd, /* Really fstp st0. */ 197 XI_FPOP = 0xd8dd, /* Really fstp st0. */
197 XI_FPOP1 = 0xd9dd, /* Really fstp st1. */ 198 XI_FPOP1 = 0xd9dd, /* Really fstp st1. */
198 XI_FRNDINT = 0xfcd9, 199 XI_FRNDINT = 0xfcd9,
@@ -263,10 +264,17 @@ typedef enum {
263 XO_MOVD = XO_660f(6e), 264 XO_MOVD = XO_660f(6e),
264 XO_MOVDto = XO_660f(7e), 265 XO_MOVDto = XO_660f(7e),
265 266
267 XO_FLDd = XO_(d9), XOg_FLDd = 0,
266 XO_FLDq = XO_(dd), XOg_FLDq = 0, 268 XO_FLDq = XO_(dd), XOg_FLDq = 0,
267 XO_FILDd = XO_(db), XOg_FILDd = 0, 269 XO_FILDd = XO_(db), XOg_FILDd = 0,
270 XO_FILDq = XO_(df), XOg_FILDq = 5,
271 XO_FSTPd = XO_(d9), XOg_FSTPd = 3,
268 XO_FSTPq = XO_(dd), XOg_FSTPq = 3, 272 XO_FSTPq = XO_(dd), XOg_FSTPq = 3,
269 XO_FISTPq = XO_(df), XOg_FISTPq = 7, 273 XO_FISTPq = XO_(df), XOg_FISTPq = 7,
274 XO_FISTTPq = XO_(dd), XOg_FISTTPq = 1,
275 XO_FADDq = XO_(dc), XOg_FADDq = 0,
276 XO_FLDCW = XO_(d9), XOg_FLDCW = 5,
277 XO_FNSTCW = XO_(d9), XOg_FNSTCW = 7
270} x86Op; 278} x86Op;
271 279
272/* x86 opcode groups. */ 280/* x86 opcode groups. */
@@ -278,6 +286,7 @@ typedef uint32_t x86Group;
278#define XG_TOXOi8(xg) ((x86Op)(0x000000fe + (((xg)<<8) & 0xff000000))) 286#define XG_TOXOi8(xg) ((x86Op)(0x000000fe + (((xg)<<8) & 0xff000000)))
279 287
280#define XO_ARITH(a) ((x86Op)(0x030000fe + ((a)<<27))) 288#define XO_ARITH(a) ((x86Op)(0x030000fe + ((a)<<27)))
289#define XO_ARITHw(a) ((x86Op)(0x036600fd + ((a)<<27)))
281 290
282typedef enum { 291typedef enum {
283 XOg_ADD, XOg_OR, XOg_ADC, XOg_SBB, XOg_AND, XOg_SUB, XOg_XOR, XOg_CMP, 292 XOg_ADD, XOg_OR, XOg_ADC, XOg_SBB, XOg_AND, XOg_SUB, XOg_XOR, XOg_CMP,
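A plausible reading of the x87 additions above: 32-bit x86 has no SSE2
instruction that converts a full 64-bit integer, so the backend presumably
relies on FILDq/FISTPq (and FISTTPq where SSE3 is available, with
FLDCW/FNSTCW to force round-to-zero otherwise) to lower the HIOP halves of
64-bit integer <-> FP conversions, while FADDq likely supplies the 2^64 bias
fixup for unsigned values.  In plain C the conversions being lowered
correspond to (helper names invented for this example):

  #include <stdint.h>
  static double sketch_i64tonum(int64_t i) { return (double)i; }   /* ~ fild qword */
  static int64_t sketch_numtoi64(double d) { return (int64_t)d; }  /* ~ fisttp qword, truncates toward zero */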
diff --git a/src/lj_trace.c b/src/lj_trace.c
index da20f991..b67e8f75 100644
--- a/src/lj_trace.c
+++ b/src/lj_trace.c
@@ -394,6 +394,7 @@ static void trace_start(jit_State *J)
394 J->bcskip = 0; 394 J->bcskip = 0;
395 J->guardemit.irt = 0; 395 J->guardemit.irt = 0;
396 J->postproc = LJ_POST_NONE; 396 J->postproc = LJ_POST_NONE;
397 lj_resetsplit(J);
397 setgcref(J->cur.startpt, obj2gco(J->pt)); 398 setgcref(J->cur.startpt, obj2gco(J->pt));
398 399
399 L = J->L; 400 L = J->L;
@@ -592,6 +593,7 @@ static TValue *trace_state(lua_State *L, lua_CFunction dummy, void *ud)
592 } 593 }
593 J->loopref = J->chain[IR_LOOP]; /* Needed by assembler. */ 594 J->loopref = J->chain[IR_LOOP]; /* Needed by assembler. */
594 } 595 }
596 lj_opt_split(J);
595 J->state = LJ_TRACE_ASM; 597 J->state = LJ_TRACE_ASM;
596 break; 598 break;
597 599
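The two hooks above wire the SPLIT pass into the trace compiler:
lj_resetsplit() clears the per-trace flag that records whether any 64-bit IR
was emitted, and lj_opt_split() runs as the last IR transformation before the
state machine switches to assembly, so none of the earlier optimization passes
need to understand HIOP.  A plausible shape for the macro, consistent with how
J->needsplit is used in lj_opt_split.c (the actual definition is added to
lj_jit.h in this commit):

  #define lj_resetsplit(J)  ((J)->needsplit = 0)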
diff --git a/src/ljamalg.c b/src/ljamalg.c
index 4d5f7600..5d90c002 100644
--- a/src/ljamalg.c
+++ b/src/ljamalg.c
@@ -58,6 +58,7 @@
58#include "lj_opt_narrow.c" 58#include "lj_opt_narrow.c"
59#include "lj_opt_dce.c" 59#include "lj_opt_dce.c"
60#include "lj_opt_loop.c" 60#include "lj_opt_loop.c"
61#include "lj_opt_split.c"
61#include "lj_mcode.c" 62#include "lj_mcode.c"
62#include "lj_snap.c" 63#include "lj_snap.c"
63#include "lj_record.c" 64#include "lj_record.c"