Diffstat (limited to 'src/lj_opt_split.c')
-rw-r--r-- | src/lj_opt_split.c | 343
1 file changed, 343 insertions, 0 deletions

diff --git a/src/lj_opt_split.c b/src/lj_opt_split.c
new file mode 100644
index 00000000..3cb30514
--- /dev/null
+++ b/src/lj_opt_split.c
@@ -0,0 +1,343 @@
/*
** SPLIT: Split 64 bit IR instructions into 32 bit IR instructions.
** Copyright (C) 2005-2011 Mike Pall. See Copyright Notice in luajit.h
*/

#define lj_opt_split_c
#define LUA_CORE

#include "lj_obj.h"

#if LJ_HASJIT && LJ_HASFFI && LJ_32

#include "lj_err.h"
#include "lj_str.h"
#include "lj_ir.h"
#include "lj_jit.h"
#include "lj_iropt.h"
#include "lj_vm.h"

/* SPLIT pass:
**
** This pass splits up 64 bit IR instructions into multiple 32 bit IR
** instructions. It's only active for 32 bit CPUs which lack native 64 bit
** operations. The FFI is currently the only emitter for 64 bit
** instructions, so this pass is disabled if the FFI is disabled.
**
** Splitting the IR in a separate pass keeps each 32 bit IR assembler
** backend simple. Only a small amount of extra functionality needs to be
** implemented. This is much easier than adding support for allocating
** register pairs to each backend (believe me, I tried). A few simple, but
** important optimizations can be performed by the SPLIT pass, which would
** be tedious to do in the backend.
**
** The basic idea is to replace each 64 bit IR instruction with its 32 bit
** equivalent plus an extra HIOP instruction. The split IR is not passed
** through FOLD or any other optimizations, so each HIOP is guaranteed to
** immediately follow its counterpart. The actual functionality of HIOP is
** inferred from the previous instruction.
**
** The operands of HIOP hold the hiword input references. The output of HIOP
** is the hiword output reference, which is also used to hold the hiword
** register or spill slot information. The register allocator treats this
** instruction independently of any other instruction, which improves code
** quality compared to using fixed register pairs.
**
** It's easier to split up some instructions into two regular 32 bit
** instructions. E.g. XLOAD is split up into two XLOADs with two different
** addresses. Obviously 64 bit constants need to be split up into two 32 bit
** constants, too. Some hiword instructions can be entirely omitted, e.g.
** when zero-extending a 32 bit value to 64 bits.
**
** Here's the IR and x64 machine code for 'x.b = x.a + 1' for a struct with
** two int64_t fields:
**
** 0100    p32 ADD    base  +8
** 0101    i64 XLOAD  0100
** 0102    i64 ADD    0101  +1
** 0103    p32 ADD    base  +16
** 0104    i64 XSTORE 0103  0102
**
**         mov rax, [esi+0x8]
**         add rax, +0x01
**         mov [esi+0x10], rax
**
** Here's the transformed IR and the x86 machine code after the SPLIT pass:
**
** 0100    p32 ADD    base  +8
** 0101    int XLOAD  0100
** 0102    p32 ADD    base  +12
** 0103    int XLOAD  0102
** 0104    int ADD    0101  +1
** 0105    int HIOP   0103  +0
** 0106    p32 ADD    base  +16
** 0107    int XSTORE 0106  0104
** 0108    p32 ADD    base  +20
** 0109    int XSTORE 0108  0105
**
**         mov eax, [esi+0x8]
**         mov ecx, [esi+0xc]
**         add eax, +0x01
**         adc ecx, +0x00
**         mov [esi+0x10], eax
**         mov [esi+0x14], ecx
**
** You may notice the reassociated hiword address computation, which is
** later fused into the mov operands by the assembler.
*/
88 | |||
89 | /* Some local macros to save typing. Undef'd at the end. */ | ||
90 | #define IR(ref) (&J->cur.ir[(ref)]) | ||
91 | |||
92 | /* Directly emit the transformed IR without updating chains etc. */ | ||
93 | static IRRef split_emit(jit_State *J, uint16_t ot, IRRef1 op1, IRRef1 op2) | ||
94 | { | ||
95 | IRRef nref = lj_ir_nextins(J); | ||
96 | IRIns *ir = IR(nref); | ||
97 | ir->ot = ot; | ||
98 | ir->op1 = op1; | ||
99 | ir->op2 = op2; | ||
100 | return nref; | ||
101 | } | ||
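
/* Usage note: e.g. split_emit(J, IRTI(IR_HIOP), lo, hi) appends an
** 'int HIOP lo hi' instruction at the next IR position, bypassing FOLD,
** CSE and the IR chains.
*/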
102 | |||
103 | /* Emit a CALLN with two split 64 bit arguments. */ | ||
104 | static IRRef split_call64(jit_State *J, IRRef1 *hisubst, IRIns *oir, | ||
105 | IRIns *ir, IRCallID id) | ||
106 | { | ||
107 | IRRef tmp, op1 = ir->op1, op2 = ir->op2; | ||
108 | J->cur.nins--; | ||
109 | #if LJ_LE | ||
110 | tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), oir[op1].prev, hisubst[op1]); | ||
111 | tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, oir[op2].prev); | ||
112 | tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, hisubst[op2]); | ||
113 | #else | ||
114 | tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), hisubst[op1], oir[op1].prev); | ||
115 | tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, hisubst[op2]); | ||
116 | tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, oir[op2].prev); | ||
117 | #endif | ||
118 | ir->prev = tmp = split_emit(J, IRTI(IR_CALLN), tmp, id); | ||
119 | return split_emit(J, IRTI(IR_HIOP), tmp, tmp); | ||
120 | } | ||
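
/* Illustrative sketch of the emitted chain for a 64 bit multiply 'a * b'
** on a little-endian target (operand order mirrors the LJ_LE branch above;
** a.lo/a.hi etc. stand for the loword/hiword substitution references):
**
**   t1  nil CARG   a.lo  a.hi
**   t2  nil CARG   t1    b.lo
**   t3  nil CARG   t2    b.hi
**   t4  int CALLN  t3    lj_carith_mul64
**   t5  int HIOP   t4    t4
**
** The CALLN result becomes the loword substitution and the HIOP result
** the hiword substitution of the 64 bit value.
*/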
121 | |||
122 | /* Get a pointer to the other 32 bit word (LE: hiword, BE: loword). */ | ||
123 | static IRRef split_ptr(jit_State *J, IRRef ref) | ||
124 | { | ||
125 | IRIns *ir = IR(ref); | ||
126 | int32_t ofs = 4; | ||
127 | if (ir->o == IR_ADD && irref_isk(ir->op2)) { /* Reassociate address. */ | ||
128 | ofs += IR(ir->op2)->i; | ||
129 | ref = ir->op1; | ||
130 | if (ofs == 0) return ref; | ||
131 | } | ||
132 | return split_emit(J, IRTI(IR_ADD), ref, lj_ir_kint(J, ofs)); | ||
133 | } | ||
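
/* E.g. if the loword address is '0100 p32 ADD base +8', split_ptr folds
** the +4 hiword offset into the constant and emits '0102 p32 ADD base +12'
** (cf. the example in the header comment), instead of stacking a second
** ADD on top of the first one.
*/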
134 | |||
135 | /* Transform the old IR to the new IR. */ | ||
136 | static void split_ir(jit_State *J) | ||
137 | { | ||
138 | IRRef nins = J->cur.nins, nk = J->cur.nk; | ||
139 | MSize irlen = nins - nk; | ||
140 | MSize need = (irlen+1)*(sizeof(IRIns) + sizeof(IRRef1)); | ||
141 | IRIns *oir = (IRIns *)lj_str_needbuf(J->L, &G(J->L)->tmpbuf, need); | ||
142 | IRRef1 *hisubst; | ||
143 | IRRef ref; | ||
144 | |||
145 | /* Copy old IR to buffer. */ | ||
146 | memcpy(oir, IR(nk), irlen*sizeof(IRIns)); | ||
147 | /* Bias hiword substitution table and old IR. Loword kept in field prev. */ | ||
148 | hisubst = (IRRef1 *)&oir[irlen] - nk; | ||
149 | oir -= nk; | ||
150 | |||
151 | /* Remove all IR instructions, but retain IR constants. */ | ||
152 | J->cur.nins = REF_FIRST; | ||
153 | |||
154 | /* Process constants and fixed references. */ | ||
155 | for (ref = nk; ref <= REF_BASE; ref++) { | ||
156 | IRIns *ir = &oir[ref]; | ||
157 | if (ir->o == IR_KINT64) { /* Split up 64 bit constant. */ | ||
158 | TValue tv = *ir_k64(ir); | ||
159 | ir->prev = lj_ir_kint(J, (int32_t)tv.u32.lo); | ||
160 | hisubst[ref] = lj_ir_kint(J, (int32_t)tv.u32.hi); | ||
161 | } else { | ||
162 | ir->prev = (IRRef1)ref; /* Identity substitution for loword. */ | ||
163 | } | ||
164 | } | ||
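
  /* E.g. the constant 'i64 KINT64 0x100000002' is split into 'int KINT +2'
  ** (loword, kept in the prev field of the old IR) and 'int KINT +1'
  ** (hiword, kept in the substitution table).
  */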
165 | |||
166 | /* Process old IR instructions. */ | ||
167 | for (ref = REF_FIRST; ref < nins; ref++) { | ||
168 | IRIns *ir = &oir[ref]; | ||
169 | IRRef nref = lj_ir_nextins(J); | ||
170 | IRIns *nir = IR(nref); | ||
171 | |||
172 | /* Copy-substitute old instruction to new instruction. */ | ||
173 | nir->op1 = ir->op1 < nk ? ir->op1 : oir[ir->op1].prev; | ||
174 | nir->op2 = ir->op2 < nk ? ir->op2 : oir[ir->op2].prev; | ||
175 | ir->prev = nref; /* Loword substitution. */ | ||
176 | nir->o = ir->o; | ||
177 | nir->t.irt = ir->t.irt & ~(IRT_MARK|IRT_ISPHI); | ||
178 | |||
179 | /* Split 64 bit instructions. */ | ||
180 | if (irt_isint64(ir->t)) { | ||
181 | IRRef hi = hisubst[ir->op1]; | ||
182 | nir->t.irt = IRT_INT | (nir->t.irt & IRT_GUARD); /* Turn into INT op. */ | ||
183 | switch (ir->o) { | ||
184 | case IR_ADD: | ||
185 | case IR_SUB: | ||
186 | /* Use plain op for hiword if loword cannot produce a carry/borrow. */ | ||
187 | if (irref_isk(nir->op2) && IR(nir->op2)->i == 0) { | ||
188 | ir->prev = nir->op1; /* Pass through loword. */ | ||
189 | nir->op1 = hi; nir->op2 = hisubst[ir->op2]; | ||
190 | hi = nref; | ||
191 | break; | ||
192 | } | ||
193 | /* fallthrough */ | ||
194 | case IR_NEG: | ||
195 | hi = split_emit(J, IRTI(IR_HIOP), hi, hisubst[ir->op2]); | ||
196 | break; | ||
197 | case IR_MUL: | ||
198 | hi = split_call64(J, hisubst, oir, ir, IRCALL_lj_carith_mul64); | ||
199 | break; | ||
200 | case IR_POWI: | ||
201 | hi = split_call64(J, hisubst, oir, ir, | ||
202 | irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 : | ||
203 | IRCALL_lj_carith_powu64); | ||
204 | break; | ||
205 | case IR_XLOAD: | ||
206 | hi = split_emit(J, IRTI(IR_XLOAD), split_ptr(J, nir->op1), ir->op2); | ||
207 | #if LJ_BE | ||
208 | ir->prev = hi; hi = nref; | ||
209 | #endif | ||
210 | break; | ||
211 | case IR_XSTORE: | ||
212 | #if LJ_LE | ||
213 | hi = hisubst[ir->op2]; | ||
214 | #else | ||
215 | hi = nir->op2; nir->op2 = hisubst[ir->op2]; | ||
216 | #endif | ||
217 | split_emit(J, IRTI(IR_XSTORE), split_ptr(J, nir->op1), hi); | ||
218 | continue; | ||
219 | case IR_CONV: { /* Conversion to 64 bit integer. Others handled below. */ | ||
220 | IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK); | ||
221 | if (st == IRT_NUM || st == IRT_FLOAT) { /* FP to 64 bit int conv. */ | ||
222 | hi = split_emit(J, IRTI(IR_HIOP), nir->op1, nref); | ||
223 | } else if (st == IRT_I64 || st == IRT_U64) { /* 64/64 bit cast. */ | ||
224 | /* Drop cast, since assembler doesn't care. */ | ||
225 | hisubst[ref] = hi; | ||
226 | goto fwdlo; | ||
227 | } else if ((ir->op2 & IRCONV_SEXT)) { /* Sign-extend to 64 bit. */ | ||
228 | IRRef k31 = lj_ir_kint(J, 31); | ||
229 | nir = IR(nref); /* May have been reallocated. */ | ||
230 | ir->prev = nir->op1; /* Pass through loword. */ | ||
231 | nir->o = IR_BSAR; /* hi = bsar(lo, 31). */ | ||
232 | nir->op2 = k31; | ||
233 | hi = nref; | ||
234 | } else { /* Zero-extend to 64 bit. */ | ||
235 | hisubst[ref] = lj_ir_kint(J, 0); | ||
236 | goto fwdlo; | ||
237 | } | ||
238 | break; | ||
239 | } | ||
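      /* E.g. a sign-extending 'i64 CONV int.i64 sext' becomes
      ** 'int BSAR lo 31' for the hiword, while the loword passes through
      ** unchanged; a zero-extending CONV just substitutes the constant +0
      ** as the hiword and forwards the loword.
      */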
      case IR_PHI: {
        IRRef hi2;
        if ((irref_isk(nir->op1) && irref_isk(nir->op2)) ||
            nir->op1 == nir->op2)
          J->cur.nins--;  /* Drop useless PHIs. */
        hi2 = hisubst[ir->op2];
        if (!((irref_isk(hi) && irref_isk(hi2)) || hi == hi2))
          split_emit(J, IRTI(IR_PHI), hi, hi2);
        continue;
      }
      default:
        lua_assert(ir->o <= IR_NE);
        split_emit(J, IRTGI(IR_HIOP), hi, hisubst[ir->op2]);  /* Comparisons. */
        continue;
      }
      hisubst[ref] = hi;  /* Store hiword substitution. */
    } else if (ir->o == IR_CONV) {  /* See above, too. */
      IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK);
      if (st == IRT_I64 || st == IRT_U64) {  /* Conversion from 64 bit int. */
        if (irt_isfp(ir->t)) {  /* 64 bit integer to FP conversion. */
          ir->prev = split_emit(J, IRT(IR_HIOP, irt_type(ir->t)),
                                hisubst[ir->op1], nref);
        } else {  /* Truncate to lower 32 bits. */
        fwdlo:
          ir->prev = nir->op1;  /* Forward loword. */
          /* Replace with NOP to avoid messing up the snapshot logic. */
          nir->ot = IRT(IR_NOP, IRT_NIL);
          nir->op1 = nir->op2 = 0;
        }
      }
    } else if (ir->o == IR_LOOP) {
      J->loopref = nref;  /* Needed by assembler. */
    }
  }

  /* Add PHI marks. */
  for (ref = J->cur.nins-1; ref >= REF_FIRST; ref--) {
    IRIns *ir = IR(ref);
    if (ir->o != IR_PHI) break;
    if (!irref_isk(ir->op1)) irt_setphi(IR(ir->op1)->t);
    if (ir->op2 > J->loopref) irt_setphi(IR(ir->op2)->t);
  }

  /* Substitute snapshot maps. */
  oir[nins].prev = J->cur.nins;  /* Substitution for last snapshot. */
  {
    SnapNo i, nsnap = J->cur.nsnap;
    for (i = 0; i < nsnap; i++) {
      SnapShot *snap = &J->cur.snap[i];
      SnapEntry *map = &J->cur.snapmap[snap->mapofs];
      MSize n, nent = snap->nent;
      snap->ref = oir[snap->ref].prev;
      for (n = 0; n < nent; n++) {
        SnapEntry sn = map[n];
        map[n] = ((sn & 0xffff0000) | oir[snap_ref(sn)].prev);
      }
    }
  }
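  /* Note: only loword references are patched in, e.g. a snapshot entry
  ** that referred to old instruction 0102 now refers to its loword
  ** substitution; no hiword entry is needed, presumably because the
  ** hiword can always be found via the immediately following HIOP.
  */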
}

/* Protected callback for split pass. */
static TValue *cpsplit(lua_State *L, lua_CFunction dummy, void *ud)
{
  jit_State *J = (jit_State *)ud;
  split_ir(J);
  UNUSED(L); UNUSED(dummy);
  return NULL;
}
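
/* Note: split_ir() is run via lj_vm_cpcall, since growing the IR or the
** temporary buffer may throw an out-of-memory error, which must reach
** lj_opt_split() in a controlled way (see below).
*/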
308 | |||
309 | #ifdef LUA_USE_ASSERT | ||
310 | /* Slow, but sure way to check whether a SPLIT pass is needed. */ | ||
311 | static int split_needsplit(jit_State *J) | ||
312 | { | ||
313 | IRIns *ir, *irend; | ||
314 | IRRef ref; | ||
315 | for (ir = IR(REF_FIRST), irend = IR(J->cur.nins); ir < irend; ir++) | ||
316 | if (irt_isint64(ir->t)) | ||
317 | return 1; | ||
318 | for (ref = J->chain[IR_CONV]; ref; ref = IR(ref)->prev) | ||
319 | if ((IR(ref)->op2 & IRCONV_SRCMASK) == IRT_I64 || | ||
320 | (IR(ref)->op2 & IRCONV_SRCMASK) == IRT_U64) | ||
321 | return 1; | ||
322 | return 0; /* Nope. */ | ||
323 | } | ||
324 | #endif | ||
325 | |||
326 | /* SPLIT pass. */ | ||
327 | void lj_opt_split(jit_State *J) | ||
328 | { | ||
329 | lua_assert(J->needsplit >= split_needsplit(J)); /* Verify flag. */ | ||
330 | if (J->needsplit) { | ||
331 | int errcode = lj_vm_cpcall(J->L, NULL, J, cpsplit); | ||
332 | if (errcode) { | ||
333 | /* Completely reset the trace to avoid inconsistent dump on abort. */ | ||
334 | J->cur.nins = J->cur.nk = REF_BASE; | ||
335 | J->cur.nsnap = 0; | ||
336 | lj_err_throw(J->L, errcode); /* Propagate errors. */ | ||
337 | } | ||
338 | } | ||
339 | } | ||
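
/* Usage sketch: the trace compiler is expected to invoke this pass once,
** after the other IR optimizations and before assembling the trace. It is
** a no-op unless recording set J->needsplit, i.e. unless any 64 bit IR
** instructions were actually emitted.
*/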
340 | |||
341 | #undef IR | ||
342 | |||
343 | #endif | ||