Diffstat (limited to 'src/lj_opt_loop.c')
 -rw-r--r--  src/lj_opt_loop.c | 358
 1 file changed, 358 insertions, 0 deletions
diff --git a/src/lj_opt_loop.c b/src/lj_opt_loop.c
new file mode 100644
index 00000000..adc0c476
--- /dev/null
+++ b/src/lj_opt_loop.c
@@ -0,0 +1,358 @@
/*
** LOOP: Loop Optimizations.
** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
*/

#define lj_opt_loop_c
#define LUA_CORE

#include "lj_obj.h"

#if LJ_HASJIT

#include "lj_gc.h"
#include "lj_err.h"
#include "lj_str.h"
#include "lj_ir.h"
#include "lj_jit.h"
#include "lj_iropt.h"
#include "lj_trace.h"
#include "lj_snap.h"
#include "lj_vm.h"

/* Loop optimization:
**
** Traditional Loop-Invariant Code Motion (LICM) splits the instructions
** of a loop into invariant and variant instructions. The invariant
** instructions are hoisted out of the loop and only the variant
** instructions remain inside the loop body.
**
** Unfortunately LICM is mostly useless for compiling dynamic languages.
** The IR has many guards and most of the subsequent instructions are
** control-dependent on them. The first non-hoistable guard would
** effectively prevent hoisting of all subsequent instructions.
**
** That's why we use a special form of unrolling using copy-substitution,
** combined with redundancy elimination:
**
** The recorded instruction stream is re-emitted to the compiler pipeline
** with substituted operands. The substitution table is filled with the
** refs returned by re-emitting each instruction. This can be done
** on-the-fly, because the IR is in strict SSA form, where every ref is
** defined before its use.
**
** This approach generates two code sections, separated by the LOOP
** instruction:
**
** 1. The recorded instructions form a kind of pre-roll for the loop. It
**    contains a mix of invariant and variant instructions and performs
**    exactly one loop iteration (but not necessarily the 1st iteration).
**
** 2. The loop body contains only the variant instructions and performs
**    all remaining loop iterations.
**
** At first sight this looks like a waste of space, because the variant
** instructions are present twice. But the key insight is that the
** pre-roll honors the control-dependencies for *both* the pre-roll itself
** *and* the loop body!
**
** It also means one doesn't have to explicitly model control-dependencies
** (which, BTW, wouldn't help LICM much). And it's much easier to
** integrate sparse snapshotting with this approach.
**
** One of the nicest aspects of this approach is that all of the
** optimizations of the compiler pipeline (FOLD, CSE, FWD, etc.) can be
** reused with only minor restrictions (e.g. one should not fold
** instructions across loop-carried dependencies).
**
** But in general all optimizations can be applied which only need to look
** backwards into the generated instruction stream. At any point in time
** during the copy-substitution process this contains both a static loop
** iteration (the pre-roll) and a dynamic one (from the to-be-copied
** instruction up to the end of the partial loop body).
**
** Since control-dependencies are implicitly kept, CSE also applies to all
** kinds of guards. The major advantage is that all invariant guards can
** be hoisted, too.
**
** Load/store forwarding works across loop iterations, too. This is
** important if loop-carried dependencies are kept in upvalues or tables.
** E.g. 'self.idx = self.idx + 1' deep down in some OO-style method may
** become a forwarded loop-recurrence after inlining.
**
** Since the IR is in SSA form, loop-carried dependencies have to be
** modeled with PHI instructions. The potential candidates for PHIs are
** collected on-the-fly during copy-substitution. After eliminating the
** redundant ones, PHI instructions are emitted *below* the loop body.
**
** Note that this departure from traditional SSA form doesn't change the
** semantics of the PHI instructions themselves. But it greatly simplifies
** on-the-fly generation of the IR and the machine code.
*/
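
/* To make the shape of the result concrete, here is a hand-drawn sketch
** for a simple counter 'i = i + 1' inside a numeric loop (illustrative
** only; refs are hypothetical and guards, loads and stores are omitted):
**
**   0001    i1 = SLOAD #1
**   0002  + i2 = ADD  0001 +1      <- pre-roll: one whole iteration
**   ----      LOOP                 <- separator
**   0003  + i3 = ADD  0002 +1      <- loop body: variant part only
**   0004         PHI  0002 0003    <- loop-carried value, below the body
**
** Inside the body, a use of the pre-roll ref 0002 means "the value from
** the previous iteration"; the PHI below the body makes that explicit.
*/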

/* Some local macros to save typing. Undef'd at the end. */
#define IR(ref)  (&J->cur.ir[(ref)])

/* Pass IR on to next optimization in chain (FOLD). */
#define emitir(ot, a, b)  (lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))

/* Emit raw IR without passing through optimizations. */
#define emitir_raw(ot, a, b)  (lj_ir_set(J, (ot), (a), (b)), lj_ir_emit(J))

/* -- PHI elimination ----------------------------------------------------- */

/* Emit or eliminate collected PHIs. */
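/* Overview of the passes below: a mark on a PHI candidate means "presumed
** redundant". Pass #1 marks invariant candidates and, conservatively, all
** candidates that are not simple recurrences. Pass #2 unmarks candidates
** that are still referenced from the variant part. Pass #3 forces PHIs
** for variant stack slots. Pass #4 emits the surviving PHIs and clears
** the flags of the eliminated ones.
*/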
static void loop_emit_phi(jit_State *J, IRRef1 *subst, IRRef1 *phi, IRRef nphi)
{
  int pass2 = 0;
  IRRef i, nslots;
  IRRef invar = J->chain[IR_LOOP];
  /* Pass #1: mark redundant and potentially redundant PHIs. */
  for (i = 0; i < nphi; i++) {
    IRRef lref = phi[i];
    IRRef rref = subst[lref];
    if (lref == rref || rref == REF_DROP) {  /* Invariants are redundant. */
      irt_setmark(IR(lref)->t);
    } else if (!(IR(rref)->op1 == lref || IR(rref)->op2 == lref)) {
      /* Quick check for simple recurrences failed, need pass #2. */
      irt_setmark(IR(lref)->t);
      pass2 = 1;
    }
  }
  /* Pass #2: traverse variant part and clear marks of non-redundant PHIs. */
  if (pass2) {
    for (i = J->cur.nins-1; i > invar; i--) {
      IRIns *ir = IR(i);
      if (!irref_isk(ir->op1)) irt_clearmark(IR(ir->op1)->t);
      if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
    }
  }
  /* Pass #3: add PHIs for variant slots without a corresponding SLOAD. */
  nslots = J->baseslot+J->maxslot;
  for (i = 1; i < nslots; i++) {
    IRRef ref = tref_ref(J->slot[i]);
    if (!irref_isk(ref) && ref != subst[ref]) {
      IRIns *ir = IR(ref);
      irt_clearmark(ir->t);  /* Unmark potential uses, too. */
      if (!irt_isphi(ir->t) && !irt_ispri(ir->t)) {
        irt_setphi(ir->t);
        if (nphi >= LJ_MAX_PHI)
          lj_trace_err(J, LJ_TRERR_PHIOV);
        phi[nphi++] = (IRRef1)ref;
      }
    }
  }
  /* Pass #4: emit PHI instructions or eliminate PHIs. */
  for (i = 0; i < nphi; i++) {
    IRRef lref = phi[i];
    IRIns *ir = IR(lref);
    if (!irt_ismarked(ir->t)) {  /* Emit PHI if not marked. */
      IRRef rref = subst[lref];
      if (rref > invar)
        irt_setphi(IR(rref)->t);
      emitir_raw(IRT(IR_PHI, irt_type(ir->t)), lref, rref);
    } else {  /* Otherwise eliminate PHI. */
      irt_clearmark(ir->t);
      irt_clearphi(ir->t);
    }
  }
}

/* -- Loop unrolling using copy-substitution ------------------------------ */
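
/* How the substitution table drives the copy: subst[] maps each pre-roll
** ref to the ref that replaces it in the copied code (refs below are
** hypothetical, continuing the sketch above). After re-emitting 0002 as
** 0003, subst[0002] = 0003, so copies of later instructions that used
** 0002 now use 0003 instead. Invariant instructions substitute to
** themselves and thus drop out of the loop body via CSE.
*/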

/* Unroll loop. */
static void loop_unroll(jit_State *J)
{
  IRRef1 phi[LJ_MAX_PHI];
  uint32_t nphi = 0;
  IRRef1 *subst;
  SnapShot *osnap, *snap;
  IRRef2 *loopmap;
  BCReg loopslots;
  MSize nsnap, nsnapmap;
  IRRef ins, invar, osnapref;

  /* Use temp buffer for substitution table.
  ** Only non-constant refs in [REF_BIAS,invar) are valid indexes.
  ** Note: don't call into the VM or run the GC or the buffer may be gone.
  */
  invar = J->cur.nins;
  subst = (IRRef1 *)lj_str_needbuf(J->L, &G(J->L)->tmpbuf,
                                   (invar-REF_BIAS)*sizeof(IRRef1)) - REF_BIAS;
  subst[REF_BASE] = REF_BASE;

  /* LOOP separates the pre-roll from the loop body. */
  emitir_raw(IRTG(IR_LOOP, IRT_NIL), 0, 0);

  /* Ensure size for copy-substituted snapshots (minus #0 and loop snapshot). */
  nsnap = J->cur.nsnap;
  if (LJ_UNLIKELY(2*nsnap-2 > J->sizesnap)) {
    MSize maxsnap = (MSize)J->param[JIT_P_maxsnap];
    if (2*nsnap-2 > maxsnap)
      lj_trace_err(J, LJ_TRERR_SNAPOV);
    lj_mem_growvec(J->L, J->snapbuf, J->sizesnap, maxsnap, SnapShot);
    J->cur.snap = J->snapbuf;
  }
  nsnapmap = J->cur.nsnapmap;  /* Use temp. copy to avoid undo. */
  if (LJ_UNLIKELY(nsnapmap*2 > J->sizesnapmap)) {
    J->snapmapbuf = (IRRef2 *)lj_mem_realloc(J->L, J->snapmapbuf,
                                             J->sizesnapmap*sizeof(IRRef2),
                                             2*J->sizesnapmap*sizeof(IRRef2));
    J->cur.snapmap = J->snapmapbuf;
    J->sizesnapmap *= 2;
  }

  /* The loop snapshot is used for fallback substitutions. */
  snap = &J->cur.snap[nsnap-1];
  loopmap = &J->cur.snapmap[snap->mapofs];
  loopslots = snap->nslots;
  /* The PC of snapshot #0 and the loop snapshot must match. */
  lua_assert(loopmap[loopslots] == J->cur.snapmap[J->cur.snap[0].nslots]);

  /* Start substitution with snapshot #1 (#0 is empty for root traces). */
  osnap = &J->cur.snap[1];
  osnapref = osnap->ref;

  /* Copy and substitute all recorded instructions and snapshots. */
  for (ins = REF_FIRST; ins < invar; ins++) {
    IRIns *ir;
    IRRef op1, op2;

    /* Copy-substitute snapshot. */
    if (ins >= osnapref) {
      IRRef2 *nmap, *omap = &J->cur.snapmap[osnap->mapofs];
      BCReg s, nslots;
      uint32_t nmapofs, nframelinks;
      if (irt_isguard(J->guardemit)) {  /* Guard in between? */
        nmapofs = nsnapmap;
        snap++;  /* Add new snapshot. */
      } else {
        nmapofs = snap->mapofs;  /* Overwrite previous snapshot. */
      }
      J->guardemit.irt = 0;
      nslots = osnap->nslots;
      nframelinks = osnap->nframelinks;
      snap->mapofs = (uint16_t)nmapofs;
      snap->ref = (IRRef1)J->cur.nins;
      snap->nslots = (uint8_t)nslots;
      snap->nframelinks = (uint8_t)nframelinks;
      snap->count = 0;
      osnap++;
      osnapref = osnap->ref;
      nsnapmap = nmapofs + nslots + nframelinks;
      nmap = &J->cur.snapmap[nmapofs];
      /* Substitute snapshot slots. */
      for (s = 0; s < nslots; s++) {
        IRRef ref = snap_ref(omap[s]);
        if (ref) {
          if (!irref_isk(ref))
            ref = subst[ref];
        } else if (s < loopslots) {
          ref = loopmap[s];
        }
        nmap[s] = ref;
      }
      /* Copy frame links. */
      nmap += nslots;
      omap += nslots;
      for (s = 0; s < nframelinks; s++)
        nmap[s] = omap[s];
    }

    /* Substitute instruction operands. */
    ir = IR(ins);
    op1 = ir->op1;
    if (!irref_isk(op1)) op1 = subst[op1];
    op2 = ir->op2;
    if (!irref_isk(op2)) op2 = subst[op2];
    if (irm_kind(lj_ir_mode[ir->o]) == IRM_N &&
        op1 == ir->op1 && op2 == ir->op2) {  /* Regular invariant ins? */
      subst[ins] = (IRRef1)ins;  /* Shortcut. */
    } else {
      /* Re-emit substituted instruction to the FOLD/CSE/etc. pipeline. */
      IRType1 t = ir->t;  /* Get this first, since emitir may invalidate ir. */
      IRRef ref = tref_ref(emitir(ir->ot & ~IRT_ISPHI, op1, op2));
      subst[ins] = (IRRef1)ref;
      if (ref != ins && ref < invar) {  /* Loop-carried dependency? */
        IRIns *irr = IR(ref);
        /* Potential PHI? */
        if (!irref_isk(ref) && !irt_isphi(irr->t) && !irt_ispri(irr->t)) {
          irt_setphi(irr->t);
          if (nphi >= LJ_MAX_PHI)
            lj_trace_err(J, LJ_TRERR_PHIOV);
          phi[nphi++] = (IRRef1)ref;
        }
        /* Check all loop-carried dependencies for type instability. */
        if (!irt_sametype(t, irr->t)) {
          if (irt_isnum(t) && irt_isinteger(irr->t))  /* Fix int->num case. */
            subst[ins] = tref_ref(emitir(IRTN(IR_TONUM), ref, 0));
          else
            lj_trace_err(J, LJ_TRERR_TYPEINS);
        }
      }
    }
  }
  if (irt_isguard(J->guardemit)) {  /* Guard in between? */
    J->cur.nsnapmap = (uint16_t)nsnapmap;
    snap++;
  } else {
    J->cur.nsnapmap = (uint16_t)snap->mapofs;  /* Last snapshot is redundant. */
  }
  J->cur.nsnap = (uint16_t)(snap - J->cur.snap);
  lua_assert(J->cur.nsnapmap <= J->sizesnapmap);

  loop_emit_phi(J, subst, phi, nphi);
}

/* Undo any partial changes made by the loop optimization. */
static void loop_undo(jit_State *J, IRRef ins)
{
  lj_ir_rollback(J, ins);
  for (ins--; ins >= REF_FIRST; ins--) {  /* Remove flags. */
    IRIns *ir = IR(ins);
    irt_clearphi(ir->t);
    irt_clearmark(ir->t);
  }
}

/* Protected callback for loop optimization. */
static TValue *cploop_opt(lua_State *L, lua_CFunction dummy, void *ud)
{
  UNUSED(L); UNUSED(dummy);
  loop_unroll((jit_State *)ud);
  return NULL;
}

/* Loop optimization. */
int lj_opt_loop(jit_State *J)
{
  IRRef nins = J->cur.nins;
  int errcode = lj_vm_cpcall(J->L, cploop_opt, NULL, J);
  if (LJ_UNLIKELY(errcode)) {
    lua_State *L = J->L;
    if (errcode == LUA_ERRRUN && tvisnum(L->top-1)) {  /* Trace error? */
      int32_t e = lj_num2int(numV(L->top-1));
      switch ((TraceError)e) {
      case LJ_TRERR_TYPEINS:  /* Type instability. */
      case LJ_TRERR_GFAIL:  /* Guard would always fail. */
        /* Unrolling via recording fixes many cases, e.g. a flipped boolean. */
        if (--J->instunroll < 0)  /* But do not unroll forever. */
          break;
        L->top--;  /* Remove error object. */
        J->guardemit.irt = 0;
        loop_undo(J, nins);
        return 1;  /* Loop optimization failed, continue recording. */
      default:
        break;
      }
    }
    lj_err_throw(L, errcode);  /* Propagate all other errors. */
  }
  return 0;  /* Loop optimization is ok. */
}

#undef IR
#undef emitir
#undef emitir_raw

#endif