diff options
author | Mike Pall <mike> | 2011-02-02 02:29:37 +0100 |
---|---|---|
committer | Mike Pall <mike> | 2011-02-02 02:29:37 +0100 |
commit | b613216efc7447dae645d8834e4d6f3185cd1bcc (patch) | |
tree | 0859fed377f00ebeada70ba45d02496b7fb4a249 /src/lj_opt_split.c | |
parent | c539c0cac8f668e66a5ce9e5fd645cb45e3c5063 (diff) | |
download | luajit-b613216efc7447dae645d8834e4d6f3185cd1bcc.tar.gz luajit-b613216efc7447dae645d8834e4d6f3185cd1bcc.tar.bz2 luajit-b613216efc7447dae645d8834e4d6f3185cd1bcc.zip |
Add SPLIT pass to split 64 bit IR instructions for 32 bit CPUs.
Add generic HIOP instruction for extra backend functionality.
Add support for HIOP to x86 backend.
Use POWI for 64 bit integer x^k, too.
POWI is lowered to a call by SPLIT or the x64 backend.
Diffstat (limited to 'src/lj_opt_split.c')
-rw-r--r-- | src/lj_opt_split.c | 343 |
1 files changed, 343 insertions, 0 deletions
diff --git a/src/lj_opt_split.c b/src/lj_opt_split.c new file mode 100644 index 00000000..3cb30514 --- /dev/null +++ b/src/lj_opt_split.c | |||
@@ -0,0 +1,343 @@ | |||
1 | /* | ||
2 | ** SPLIT: Split 64 bit IR instructions into 32 bit IR instructions. | ||
3 | ** Copyright (C) 2005-2011 Mike Pall. See Copyright Notice in luajit.h | ||
4 | */ | ||
5 | |||
6 | #define lj_opt_split_c | ||
7 | #define LUA_CORE | ||
8 | |||
9 | #include "lj_obj.h" | ||
10 | |||
11 | #if LJ_HASJIT && LJ_HASFFI && LJ_32 | ||
12 | |||
13 | #include "lj_err.h" | ||
14 | #include "lj_str.h" | ||
15 | #include "lj_ir.h" | ||
16 | #include "lj_jit.h" | ||
17 | #include "lj_iropt.h" | ||
18 | #include "lj_vm.h" | ||
19 | |||
20 | /* SPLIT pass: | ||
21 | ** | ||
22 | ** This pass splits up 64 bit IR instructions into multiple 32 bit IR | ||
23 | ** instructions. It's only active for 32 bit CPUs which lack native 64 bit | ||
24 | ** operations. The FFI is currently the only emitter for 64 bit | ||
25 | ** instructions, so this pass is disabled if the FFI is disabled. | ||
26 | ** | ||
27 | ** Splitting the IR in a separate pass keeps each 32 bit IR assembler | ||
28 | ** backend simple. Only a small amount of extra functionality needs to be | ||
29 | ** implemented. This is much easier than adding support for allocating | ||
30 | ** register pairs to each backend (believe me, I tried). A few simple, but | ||
31 | ** important optimizations can be performed by the SPLIT pass, which would | ||
32 | ** be tedious to do in the backend. | ||
33 | ** | ||
34 | ** The basic idea is to replace each 64 bit IR instruction with its 32 bit | ||
35 | ** equivalent plus an extra HIOP instruction. The split IR is not passed | ||
36 | ** through FOLD or any other optimizations, so each HIOP is guaranteed to | ||
37 | ** immediately follow its counterpart. The actual functionality of HIOP is | ||
38 | ** inferred from the previous instruction. | ||
39 | ** | ||
40 | ** The operands of HIOP hold the hiword input references. The output of HIOP | ||
41 | ** is the hiword output reference, which is also used to hold the hiword | ||
42 | ** register or spill slot information. The register allocator treats this | ||
43 | ** instruction independently of any other instruction, which improves code | ||
44 | ** quality compared to using fixed register pairs. | ||
45 | ** | ||
46 | ** It's easier to split up some instructions into two regular 32 bit | ||
47 | ** instructions. E.g. XLOAD is split up into two XLOADs with two different | ||
48 | ** addresses. Obviously 64 bit constants need to be split up into two 32 bit | ||
49 | ** constants, too. Some hiword instructions can be entirely omitted, e.g. | ||
50 | ** when zero-extending a 32 bit value to 64 bits. | ||
51 | ** | ||
52 | ** Here's the IR and x64 machine code for 'x.b = x.a + 1' for a struct with | ||
53 | ** two int64_t fields: | ||
54 | ** | ||
55 | ** 0100 p32 ADD base +8 | ||
56 | ** 0101 i64 XLOAD 0100 | ||
57 | ** 0102 i64 ADD 0101 +1 | ||
58 | ** 0103 p32 ADD base +16 | ||
59 | ** 0104 i64 XSTORE 0103 0102 | ||
60 | ** | ||
61 | ** mov rax, [esi+0x8] | ||
62 | ** add rax, +0x01 | ||
63 | ** mov [esi+0x10], rax | ||
64 | ** | ||
65 | ** Here's the transformed IR and the x86 machine code after the SPLIT pass: | ||
66 | ** | ||
67 | ** 0100 p32 ADD base +8 | ||
68 | ** 0101 int XLOAD 0100 | ||
69 | ** 0102 p32 ADD base +12 | ||
70 | ** 0103 int XLOAD 0102 | ||
71 | ** 0104 int ADD 0101 +1 | ||
72 | ** 0105 int HIOP 0103 +0 | ||
73 | ** 0106 p32 ADD base +16 | ||
74 | ** 0107 int XSTORE 0106 0104 | ||
75 | ** 0108 p32 ADD base +20 | ||
76 | ** 0109 int XSTORE 0108 0105 | ||
77 | ** | ||
78 | ** mov eax, [esi+0x8] | ||
79 | ** mov ecx, [esi+0xc] | ||
80 | ** add eax, +0x01 | ||
81 | ** adc ecx, +0x00 | ||
82 | ** mov [esi+0x10], eax | ||
83 | ** mov [esi+0x14], ecx | ||
84 | ** | ||
85 | ** You may notice the reassociated hiword address computation, which is | ||
86 | ** later fused into the mov operands by the assembler. | ||
87 | */ | ||
88 | |||
89 | /* Some local macros to save typing. Undef'd at the end. */ | ||
90 | #define IR(ref) (&J->cur.ir[(ref)]) | ||
91 | |||
92 | /* Directly emit the transformed IR without updating chains etc. */ | ||
93 | static IRRef split_emit(jit_State *J, uint16_t ot, IRRef1 op1, IRRef1 op2) | ||
94 | { | ||
95 | IRRef nref = lj_ir_nextins(J); | ||
96 | IRIns *ir = IR(nref); | ||
97 | ir->ot = ot; | ||
98 | ir->op1 = op1; | ||
99 | ir->op2 = op2; | ||
100 | return nref; | ||
101 | } | ||
102 | |||
/* Emit a CALLN with two split 64 bit arguments.
** The caller has already reserved an instruction slot via lj_ir_nextins();
** it's discarded here (J->cur.nins--) and replaced by the CARG/CALLN chain.
** Each 64 bit operand is passed as two 32 bit halves: loword taken from the
** substitution in oir[].prev, hiword from hisubst[]. The CARG order differs
** by endianness so the halves land in the layout the callee expects.
** Returns the HIOP ref (hiword result); the loword CALLN ref is stored in
** ir->prev as the loword substitution.
*/
static IRRef split_call64(jit_State *J, IRRef1 *hisubst, IRIns *oir,
			  IRIns *ir, IRCallID id)
{
  IRRef tmp, op1 = ir->op1, op2 = ir->op2;
  J->cur.nins--;  /* Drop the instruction pre-emitted by the caller. */
#if LJ_LE
  /* Little-endian: loword first for each argument. */
  tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), oir[op1].prev, hisubst[op1]);
  tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, oir[op2].prev);
  tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, hisubst[op2]);
#else
  /* Big-endian: hiword first for each argument. */
  tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), hisubst[op1], oir[op1].prev);
  tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, hisubst[op2]);
  tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, oir[op2].prev);
#endif
  ir->prev = tmp = split_emit(J, IRTI(IR_CALLN), tmp, id);
  return split_emit(J, IRTI(IR_HIOP), tmp, tmp);
}
121 | |||
122 | /* Get a pointer to the other 32 bit word (LE: hiword, BE: loword). */ | ||
123 | static IRRef split_ptr(jit_State *J, IRRef ref) | ||
124 | { | ||
125 | IRIns *ir = IR(ref); | ||
126 | int32_t ofs = 4; | ||
127 | if (ir->o == IR_ADD && irref_isk(ir->op2)) { /* Reassociate address. */ | ||
128 | ofs += IR(ir->op2)->i; | ||
129 | ref = ir->op1; | ||
130 | if (ofs == 0) return ref; | ||
131 | } | ||
132 | return split_emit(J, IRTI(IR_ADD), ref, lj_ir_kint(J, ofs)); | ||
133 | } | ||
134 | |||
/* Transform the old IR to the new IR.
** Copies the whole IR into a scratch buffer, clears the trace back to
** REF_FIRST, then re-emits every instruction with 64 bit ops split into a
** 32 bit op plus HIOP (or a call). Two substitution tables map old refs to
** new refs: the loword substitution lives in oir[ref].prev, the hiword
** substitution in hisubst[ref]. Finally the snapshot maps are rewritten
** through the loword substitutions.
*/
static void split_ir(jit_State *J)
{
  IRRef nins = J->cur.nins, nk = J->cur.nk;
  MSize irlen = nins - nk;
  /* One IRIns copy plus one IRRef1 hiword-substitution slot per old ref. */
  MSize need = (irlen+1)*(sizeof(IRIns) + sizeof(IRRef1));
  IRIns *oir = (IRIns *)lj_str_needbuf(J->L, &G(J->L)->tmpbuf, need);
  IRRef1 *hisubst;
  IRRef ref;

  /* Copy old IR to buffer. */
  memcpy(oir, IR(nk), irlen*sizeof(IRIns));
  /* Bias hiword substitution table and old IR. Loword kept in field prev. */
  /* After biasing, oir[ref] and hisubst[ref] are indexed by old IRRef. */
  hisubst = (IRRef1 *)&oir[irlen] - nk;
  oir -= nk;

  /* Remove all IR instructions, but retain IR constants. */
  J->cur.nins = REF_FIRST;

  /* Process constants and fixed references. */
  for (ref = nk; ref <= REF_BASE; ref++) {
    IRIns *ir = &oir[ref];
    if (ir->o == IR_KINT64) {  /* Split up 64 bit constant. */
      TValue tv = *ir_k64(ir);
      ir->prev = lj_ir_kint(J, (int32_t)tv.u32.lo);
      hisubst[ref] = lj_ir_kint(J, (int32_t)tv.u32.hi);
    } else {
      ir->prev = (IRRef1)ref;  /* Identity substitution for loword. */
    }
  }

  /* Process old IR instructions. */
  for (ref = REF_FIRST; ref < nins; ref++) {
    IRIns *ir = &oir[ref];
    IRRef nref = lj_ir_nextins(J);
    IRIns *nir = IR(nref);

    /* Copy-substitute old instruction to new instruction. */
    /* Refs below nk are fixed (e.g. REF_BASE) and substitute to themselves. */
    nir->op1 = ir->op1 < nk ? ir->op1 : oir[ir->op1].prev;
    nir->op2 = ir->op2 < nk ? ir->op2 : oir[ir->op2].prev;
    ir->prev = nref;  /* Loword substitution. */
    nir->o = ir->o;
    nir->t.irt = ir->t.irt & ~(IRT_MARK|IRT_ISPHI);  /* PHI marks redone below. */

    /* Split 64 bit instructions. */
    if (irt_isint64(ir->t)) {
      IRRef hi = hisubst[ir->op1];
      nir->t.irt = IRT_INT | (nir->t.irt & IRT_GUARD);  /* Turn into INT op. */
      switch (ir->o) {
      case IR_ADD:
      case IR_SUB:
	/* Use plain op for hiword if loword cannot produce a carry/borrow. */
	if (irref_isk(nir->op2) && IR(nir->op2)->i == 0) {
	  ir->prev = nir->op1;  /* Pass through loword. */
	  nir->op1 = hi; nir->op2 = hisubst[ir->op2];
	  hi = nref;  /* The re-purposed instruction becomes the hiword op. */
	  break;
	}
	/* fallthrough */
      case IR_NEG:
	hi = split_emit(J, IRTI(IR_HIOP), hi, hisubst[ir->op2]);
	break;
      case IR_MUL:
	/* 64 bit multiply is lowered to a runtime call. */
	hi = split_call64(J, hisubst, oir, ir, IRCALL_lj_carith_mul64);
	break;
      case IR_POWI:
	hi = split_call64(J, hisubst, oir, ir,
			  irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 :
					     IRCALL_lj_carith_powu64);
	break;
      case IR_XLOAD:
	/* Second XLOAD from the other 32 bit half of the address. */
	hi = split_emit(J, IRTI(IR_XLOAD), split_ptr(J, nir->op1), ir->op2);
#if LJ_BE
	ir->prev = hi; hi = nref;  /* BE: swap lo/hi roles of the two loads. */
#endif
	break;
      case IR_XSTORE:
#if LJ_LE
	hi = hisubst[ir->op2];
#else
	hi = nir->op2; nir->op2 = hisubst[ir->op2];  /* BE: swap stored halves. */
#endif
	split_emit(J, IRTI(IR_XSTORE), split_ptr(J, nir->op1), hi);
	continue;  /* Stores have no result, so no hiword substitution. */
      case IR_CONV: {  /* Conversion to 64 bit integer. Others handled below. */
	IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK);
	if (st == IRT_NUM || st == IRT_FLOAT) {  /* FP to 64 bit int conv. */
	  hi = split_emit(J, IRTI(IR_HIOP), nir->op1, nref);
	} else if (st == IRT_I64 || st == IRT_U64) {  /* 64/64 bit cast. */
	  /* Drop cast, since assembler doesn't care. */
	  hisubst[ref] = hi;
	  goto fwdlo;
	} else if ((ir->op2 & IRCONV_SEXT)) {  /* Sign-extend to 64 bit. */
	  IRRef k31 = lj_ir_kint(J, 31);
	  nir = IR(nref);  /* May have been reallocated. */
	  ir->prev = nir->op1;  /* Pass through loword. */
	  nir->o = IR_BSAR;  /* hi = bsar(lo, 31). */
	  nir->op2 = k31;
	  hi = nref;
	} else {  /* Zero-extend to 64 bit. */
	  hisubst[ref] = lj_ir_kint(J, 0);
	  goto fwdlo;
	}
	break;
	}
      case IR_PHI: {
	IRRef hi2;
	if ((irref_isk(nir->op1) && irref_isk(nir->op2)) ||
	    nir->op1 == nir->op2)
	  J->cur.nins--;  /* Drop useless PHIs. */
	hi2 = hisubst[ir->op2];
	if (!((irref_isk(hi) && irref_isk(hi2)) || hi == hi2))
	  split_emit(J, IRTI(IR_PHI), hi, hi2);
	continue;  /* Lo/hi PHIs dropped independently; no substitution. */
	}
      default:
	lua_assert(ir->o <= IR_NE);
	split_emit(J, IRTGI(IR_HIOP), hi, hisubst[ir->op2]);  /* Comparisons. */
	continue;
      }
      hisubst[ref] = hi;  /* Store hiword substitution. */
    } else if (ir->o == IR_CONV) {  /* See above, too. */
      IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK);
      if (st == IRT_I64 || st == IRT_U64) {  /* Conversion from 64 bit int. */
	if (irt_isfp(ir->t)) {  /* 64 bit integer to FP conversion. */
	  ir->prev = split_emit(J, IRT(IR_HIOP, irt_type(ir->t)),
				hisubst[ir->op1], nref);
	} else {  /* Truncate to lower 32 bits. */
      fwdlo:
	  ir->prev = nir->op1;  /* Forward loword. */
	  /* Replace with NOP to avoid messing up the snapshot logic. */
	  nir->ot = IRT(IR_NOP, IRT_NIL);
	  nir->op1 = nir->op2 = 0;
	}
      }
    } else if (ir->o == IR_LOOP) {
      J->loopref = nref;  /* Needed by assembler. */
    }
  }

  /* Add PHI marks. */
  for (ref = J->cur.nins-1; ref >= REF_FIRST; ref--) {
    IRIns *ir = IR(ref);
    if (ir->o != IR_PHI) break;  /* PHIs are grouped at the end. */
    if (!irref_isk(ir->op1)) irt_setphi(IR(ir->op1)->t);
    if (ir->op2 > J->loopref) irt_setphi(IR(ir->op2)->t);
  }

  /* Substitute snapshot maps. */
  oir[nins].prev = J->cur.nins;  /* Substitution for last snapshot. */
  {
    SnapNo i, nsnap = J->cur.nsnap;
    for (i = 0; i < nsnap; i++) {
      SnapShot *snap = &J->cur.snap[i];
      SnapEntry *map = &J->cur.snapmap[snap->mapofs];
      MSize n, nent = snap->nent;
      snap->ref = oir[snap->ref].prev;
      for (n = 0; n < nent; n++) {
	SnapEntry sn = map[n];
	/* Keep the upper 16 bits (slot/flags), remap the reference. */
	map[n] = ((sn & 0xffff0000) | oir[snap_ref(sn)].prev);
      }
    }
  }
}
299 | |||
300 | /* Protected callback for split pass. */ | ||
301 | static TValue *cpsplit(lua_State *L, lua_CFunction dummy, void *ud) | ||
302 | { | ||
303 | jit_State *J = (jit_State *)ud; | ||
304 | split_ir(J); | ||
305 | UNUSED(L); UNUSED(dummy); | ||
306 | return NULL; | ||
307 | } | ||
308 | |||
#ifdef LUA_USE_ASSERT
/* Slow, but sure way to check whether a SPLIT pass is needed. */
static int split_needsplit(jit_State *J)
{
  IRRef ref;
  /* Any 64 bit integer instruction requires splitting. */
  for (ref = REF_FIRST; ref < J->cur.nins; ref++)
    if (irt_isint64(IR(ref)->t))
      return 1;
  /* So does any conversion whose source type is a 64 bit integer. */
  for (ref = J->chain[IR_CONV]; ref; ref = IR(ref)->prev) {
    IRType st = (IRType)(IR(ref)->op2 & IRCONV_SRCMASK);
    if (st == IRT_I64 || st == IRT_U64)
      return 1;
  }
  return 0;  /* Nope. */
}
#endif
325 | |||
326 | /* SPLIT pass. */ | ||
327 | void lj_opt_split(jit_State *J) | ||
328 | { | ||
329 | lua_assert(J->needsplit >= split_needsplit(J)); /* Verify flag. */ | ||
330 | if (J->needsplit) { | ||
331 | int errcode = lj_vm_cpcall(J->L, NULL, J, cpsplit); | ||
332 | if (errcode) { | ||
333 | /* Completely reset the trace to avoid inconsistent dump on abort. */ | ||
334 | J->cur.nins = J->cur.nk = REF_BASE; | ||
335 | J->cur.nsnap = 0; | ||
336 | lj_err_throw(J->L, errcode); /* Propagate errors. */ | ||
337 | } | ||
338 | } | ||
339 | } | ||
340 | |||
341 | #undef IR | ||
342 | |||
343 | #endif | ||