path: root/src/lj_opt_narrow.c
author    Mike Pall <mike>    2009-12-08 19:46:35 +0100
committer Mike Pall <mike>    2009-12-08 19:46:35 +0100
commit    55b16959717084884fd4a0cbae6d19e3786c20c7 (patch)
tree      c8a07a43c13679751ed25a9d06796e9e7b2134a6 /src/lj_opt_narrow.c
download  luajit-2.0.0-beta1.tar.gz
          luajit-2.0.0-beta1.tar.bz2
          luajit-2.0.0-beta1.zip

RELEASE LuaJIT-2.0.0-beta1 (tag: v2.0.0-beta1)
Diffstat (limited to 'src/lj_opt_narrow.c')
-rw-r--r--  src/lj_opt_narrow.c  430
1 file changed, 430 insertions(+), 0 deletions(-)
diff --git a/src/lj_opt_narrow.c b/src/lj_opt_narrow.c
new file mode 100644
index 00000000..60a6afb8
--- /dev/null
+++ b/src/lj_opt_narrow.c
@@ -0,0 +1,430 @@
/*
** NARROW: Narrowing of numbers to integers (double to int32_t).
** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
*/

#define lj_opt_narrow_c
#define LUA_CORE

#include "lj_obj.h"

#if LJ_HASJIT

#include "lj_str.h"
#include "lj_bc.h"
#include "lj_ir.h"
#include "lj_jit.h"
#include "lj_iropt.h"
#include "lj_trace.h"

/* Rationale for narrowing optimizations:
**
** Lua has only a single number type and this is a FP double by default.
** Narrowing doubles to integers does not pay off for the interpreter on a
** current-generation x86/x64 machine. Most FP operations need the same
** amount of execution resources as their integer counterparts, except
** with slightly longer latencies. Longer latencies are a non-issue for
** the interpreter, since they are usually hidden by other overhead.
**
** The total CPU execution bandwidth is the sum of the bandwidth of the FP
** and the integer units, because they execute in parallel. The FP units
** have an equal or higher bandwidth than the integer units. Not using
** them means losing execution bandwidth. Moving work away from them to
** the already quite busy integer units is a losing proposition.
**
** The situation for JIT-compiled code is a bit different: the higher code
** density makes the extra latencies much more visible. Tight loops expose
** the latencies for updating the induction variables. Array indexing
** requires narrowing conversions with high latencies and additional
** guards (to check that the index is really an integer). And many common
** optimizations only work on integers.
**
** One solution would be speculative, eager narrowing of all number loads.
** This causes many problems, like losing -0 or the need to resolve type
** mismatches between traces. It also effectively forces the integer type
** to have overflow-checking semantics. This impedes many basic
** optimizations and requires adding overflow checks to all integer
** arithmetic operations (whereas FP arithmetic can do without).
**
** Always replacing an FP op with an integer op plus an overflow check is
** counter-productive on a current-generation super-scalar CPU. Although
** the overflow check branches are highly predictable, they will clog the
** execution port for the branch unit and tie up reorder buffers. This is
** turning a pure data-flow dependency into a different data-flow
** dependency (with slightly lower latency) *plus* a control dependency.
** In general, you don't want to do this since latencies due to data-flow
** dependencies can be well hidden by out-of-order execution.
**
** A better solution is to keep all numbers as FP values and only narrow
** when it's beneficial to do so. LuaJIT uses predictive narrowing for
** induction variables and demand-driven narrowing for index expressions
** and bit operations. Additionally it can eliminate or hoist most of the
** resulting overflow checks. Regular arithmetic computations are never
** narrowed to integers.
**
** The integer type in the IR has convenient wrap-around semantics and
** ignores overflow. Extra operations have been added for
** overflow-checking arithmetic (ADDOV/SUBOV) instead of an extra type.
** Apart from reducing overall complexity of the compiler, this also
** nicely solves the problem where you want to apply algebraic
** simplifications to ADD, but not to ADDOV. And the assembler can use lea
** instead of an add for integer ADD, but not for ADDOV (lea does not
** affect the flags, but it helps to avoid register moves).
**
** Note that all of the above has to be reconsidered if LuaJIT is to be
** ported to architectures with slow FP operations or with no hardware FPU
** at all. In the latter case an integer-only port may be the best overall
** solution (if this still meets user demands).
*/

/* Some local macros to save typing. Undef'd at the end. */
#define IR(ref)     (&J->cur.ir[(ref)])
#define fins        (&J->fold.ins)

/* Pass IR on to next optimization in chain (FOLD). */
#define emitir(ot, a, b)      (lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))

#define emitir_raw(ot, a, b)  (lj_ir_set(J, (ot), (a), (b)), lj_ir_emit(J))

/* -- Elimination of narrowing type conversions --------------------------- */

/* Narrowing of index expressions and bit operations is demand-driven. The
** trace recorder emits a narrowing type conversion (TOINT or TOBIT) in
** all of these cases (e.g. array indexing or string indexing). FOLD
** already takes care of eliminating simple redundant conversions like
** TOINT(TONUM(x)) ==> x.
**
** But the surrounding code is FP-heavy and all arithmetic operations are
** performed on FP numbers. Consider a common example such as 'x=t[i+1]',
** with 'i' already an integer (due to induction variable narrowing). The
** index expression would be recorded as TOINT(ADD(TONUM(i), 1)), which is
** clearly suboptimal.
**
** One can do better by recursively backpropagating the narrowing type
** conversion across FP arithmetic operations. This turns FP ops into
** their corresponding integer counterparts. Depending on the semantics of
** the conversion they also need to check for overflow. Currently only ADD
** and SUB are supported.
**
** The above example can be rewritten as ADDOV(TOINT(TONUM(i)), 1) and
** then into ADDOV(i, 1) after folding of the conversions. The original FP
** ops remain in the IR and are eliminated by DCE since all references to
** them are gone.
**
** Special care has to be taken to avoid narrowing across an operation
** which is potentially operating on non-integral operands. One obvious
** case is when an expression contains a non-integral constant, but ends
** up as an integer index at runtime (like t[x+1.5] with x=0.5).
**
** Operations with two non-constant operands illustrate a similar problem
** (like t[a+b] with a=1.5 and b=2.5). Backpropagation has to stop there,
** unless it can be proven that either operand is integral (e.g. by CSEing
** a previous conversion). As a not-so-obvious corollary this logic also
** applies for a whole expression tree (e.g. t[(a+1)+(b+1)]).
**
** Correctness of the transformation is guaranteed by never expanding the
** tree with more conversions than the single one we would need to emit
** without backpropagation. TOBIT employs a more optimistic rule, because
** the conversion has special semantics, designed to make the life of the
** compiler writer easier. ;-)
**
** Using on-the-fly backpropagation of an expression tree doesn't work
** because it's unknown whether the transform is correct until the end.
** This either requires IR rollback and cache invalidation for every
** subtree or a two-pass algorithm. The former didn't work out too well,
** so the code now combines a recursive collector with a stack-based
** emitter.
**
** [A recursive backpropagation algorithm with backtracking, employing
** skip-list lookup and round-robin caching, emitting stack operations
** on-the-fly for a stack-based interpreter -- and all of that in a meager
** kilobyte? Yep, compilers are a great treasure chest. Throw away your
** textbooks and read the codebase of a compiler today!]
**
** There's another optimization opportunity for array indexing: it's
** always accompanied by an array bounds-check. The outermost overflow
** check may be delegated to the ABC operation. This works because ABC is
** an unsigned comparison and wrap-around due to overflow creates negative
** numbers.
**
** But this optimization is only valid for constants that cannot overflow
** an int32_t into the range of valid array indexes [0..2^27+1). A check
** for +-2^30 is safe since -2^31 - 2^30 wraps to 2^30 and 2^31-1 + 2^30
** wraps to -2^30-1.
**
** It's also good enough in practice, since e.g. t[i+1] or t[i-10] are
** quite common. So the above example finally ends up as ADD(i, 1)!
**
** Later on, the assembler is able to fuse the whole array reference and
** the ADD into the memory operands of loads and other instructions. This
** is why LuaJIT is able to generate very pretty (and fast) machine code
** for array indexing. And that, my dear, concludes another story about
** one of the hidden secrets of LuaJIT ...
*/
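
/* Illustrative sketch (not part of the original pass): the wrap-around
** claim above can be double-checked with plain two's-complement
** arithmetic. The helper below is a hypothetical addition; it assumes
** int32_t/uint32_t are in scope via lj_obj.h and is unused by the
** optimizer itself.
*/
static int32_t narrow_wrap_add_sketch(int32_t a, int32_t b)
{
  return (int32_t)((uint32_t)a + (uint32_t)b);  /* Wrapping int32_t add. */
}
/* Examples matching the comment above:
**   narrow_wrap_add_sketch(-2147483647-1, -(1<<30)) ==  (1<<30)     (+2^30)
**   narrow_wrap_add_sketch( 2147483647,    (1<<30)) == -(1<<30)-1   (-2^30-1)
** Both results lie far outside [0..2^27+1), so the unsigned ABC compare
** still rejects them and the separate overflow check can be omitted.
*/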

/* Maximum backpropagation depth and maximum stack size. */
#define NARROW_MAX_BACKPROP  100
#define NARROW_MAX_STACK     256

/* Context used for narrowing of type conversions. */
typedef struct NarrowConv {
  jit_State *J;    /* JIT compiler state. */
  IRRef2 *sp;      /* Current stack pointer. */
  IRRef2 *maxsp;   /* Maximum stack pointer minus redzone. */
  int lim;         /* Limit on the number of emitted conversions. */
  IRRef mode;      /* Conversion mode (IRTOINT_*). */
  IRRef2 stack[NARROW_MAX_STACK];  /* Stack holding the stack-machine code. */
} NarrowConv;

/* The stack machine has a 32 bit instruction format: [IROpT | IRRef1]
** The lower 16 bits hold a reference (or 0). The upper 16 bits hold
** the IR opcode + type or one of the following special opcodes:
*/
enum {
  NARROW_REF,   /* Push ref. */
  NARROW_CONV,  /* Push conversion of ref. */
  NARROW_INT    /* Push KINT ref. The next code holds an int32_t. */
};
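
/* Illustrative sketch (not part of the original code): unpack one stack-
** machine code into its two halves, mirroring the [IROpT | IRRef1] layout
** described above. For the running example 'i+1' the collector would push:
**   IRREF2(i, NARROW_REF), IRREF2(0, NARROW_INT), (IRRef2)1,
**   IRREF2(ref_of_ADD, IRTI(IR_ADD))
** The helper name is hypothetical and unused by the optimizer.
*/
static void narrow_stk_decode_sketch(IRRef2 code, IROpT *op, IRRef1 *ref)
{
  *op = (IROpT)(code >> 16);       /* Upper 16 bits: opcode+type or NARROW_*. */
  *ref = (IRRef1)(code & 0xffff);  /* Lower 16 bits: IR reference (or 0). */
}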

/* Lookup a reference in the backpropagation cache. */
static IRRef narrow_bpc_get(jit_State *J, IRRef1 key, IRRef mode)
{
  ptrdiff_t i;
  for (i = 0; i < BPROP_SLOTS; i++) {
    BPropEntry *bp = &J->bpropcache[i];
    if (bp->key == key && bp->mode <= mode)  /* Stronger checks are ok, too. */
      return bp->val;
  }
  return 0;
}

/* Add an entry to the backpropagation cache. */
static void narrow_bpc_set(jit_State *J, IRRef1 key, IRRef1 val, IRRef mode)
{
  uint32_t slot = J->bpropslot;
  BPropEntry *bp = &J->bpropcache[slot];
  J->bpropslot = (slot + 1) & (BPROP_SLOTS-1);
  bp->key = key;
  bp->val = val;
  bp->mode = mode;
}

/* Backpropagate narrowing conversion. Return number of needed conversions. */
static int narrow_conv_backprop(NarrowConv *nc, IRRef ref, int depth)
{
  jit_State *J = nc->J;
  IRIns *ir = IR(ref);
  IRRef cref;

  /* Check the easy cases first. */
  if (ir->o == IR_TONUM) {  /* Undo inverse conversion. */
    *nc->sp++ = IRREF2(ir->op1, NARROW_REF);
    return 0;
  } else if (ir->o == IR_KNUM) {  /* Narrow FP constant. */
    lua_Number n = ir_knum(ir)->n;
    if (nc->mode == IRTOINT_TOBIT) {  /* Allows a wider range of constants. */
      int64_t k64 = (int64_t)n;
      if (n == cast_num(k64)) {  /* Only if constant doesn't lose precision. */
        *nc->sp++ = IRREF2(0, NARROW_INT);
        *nc->sp++ = (IRRef2)k64;  /* But always truncate to 32 bits. */
        return 0;
      }
    } else {
      int32_t k = lj_num2int(n);
      if (n == cast_num(k)) {  /* Only if constant is really an integer. */
        *nc->sp++ = IRREF2(0, NARROW_INT);
        *nc->sp++ = (IRRef2)k;
        return 0;
      }
    }
    return 10;  /* Never narrow other FP constants (this is rare). */
  }

  /* Try to CSE the conversion. Stronger checks are ok, too. */
  for (cref = J->chain[fins->o]; cref > ref; cref = IR(cref)->prev)
    if (IR(cref)->op1 == ref &&
        irt_isguard(IR(cref)->t) >= irt_isguard(fins->t)) {
      *nc->sp++ = IRREF2(cref, NARROW_REF);
      return 0;  /* Already there, no additional conversion needed. */
    }

  /* Backpropagate across ADD/SUB. */
  if (ir->o == IR_ADD || ir->o == IR_SUB) {
    /* Try cache lookup first. */
    IRRef bpref, mode = nc->mode;
    if (mode == IRTOINT_INDEX && depth > 0)
      mode = IRTOINT_CHECK;  /* Inner conversions need a stronger check. */
    bpref = narrow_bpc_get(nc->J, (IRRef1)ref, mode);
    if (bpref) {
      *nc->sp++ = IRREF2(bpref, NARROW_REF);
      return 0;
    }
    if (++depth < NARROW_MAX_BACKPROP && nc->sp < nc->maxsp) {
      IRRef2 *savesp = nc->sp;
      int count = narrow_conv_backprop(nc, ir->op1, depth);
      count += narrow_conv_backprop(nc, ir->op2, depth);
      if (count <= nc->lim) {  /* Limit total number of conversions. */
        *nc->sp++ = IRREF2(ref, IRTI(ir->o));
        return count;
      }
      nc->sp = savesp;  /* Too many conversions, need to backtrack. */
    }
  }

  /* Otherwise add a conversion. */
  *nc->sp++ = IRREF2(ref, NARROW_CONV);
  return 1;
}

/* Emit the conversions collected during backpropagation. */
static IRRef narrow_conv_emit(jit_State *J, NarrowConv *nc)
{
  /* The fins fields must be saved now -- emitir() overwrites them. */
  IROpT guardot = irt_isguard(fins->t) ? IRTG(IR_ADDOV-IR_ADD, 0) : 0;
  IROpT convot = fins->ot;
  IRRef1 convop2 = fins->op2;
  IRRef2 *next = nc->stack;  /* List of instructions from backpropagation. */
  IRRef2 *last = nc->sp;
  IRRef2 *sp = nc->stack;  /* Recycle the stack to store operands. */
  while (next < last) {  /* Simple stack machine to process the ins. list. */
    IRRef2 ref = *next++;
    IROpT op = ref >> 16;
    if (op == NARROW_REF) {
      *sp++ = ref;
    } else if (op == NARROW_CONV) {
      *sp++ = emitir_raw(convot, ref, convop2);  /* Raw emit avoids a loop. */
    } else if (op == NARROW_INT) {
      lua_assert(next < last);
      *sp++ = lj_ir_kint(J, *next++);
    } else {  /* Regular IROpT. Pops two operands and pushes one result. */
      IRRef mode = nc->mode;
      lua_assert(sp >= nc->stack+2);
      sp--;
      /* Omit some overflow checks for array indexing. See comments above. */
      if (mode == IRTOINT_INDEX) {
        if (next == last && irref_isk((IRRef1)sp[0]) &&
            (uint32_t)IR((IRRef1)sp[0])->i + 0x40000000 < 0x80000000)
          guardot = 0;
        else
          mode = IRTOINT_CHECK;  /* Otherwise cache a stronger check. */
      }
      sp[-1] = emitir(op+guardot, sp[-1], sp[0]);
      narrow_bpc_set(J, (IRRef1)ref, (IRRef1)sp[-1], mode);  /* Add to cache. */
    }
  }
  lua_assert(sp == nc->stack+1);
  return nc->stack[0];
}

/* Narrow a type conversion of an arithmetic operation. */
TRef LJ_FASTCALL lj_opt_narrow_convert(jit_State *J)
{
  if ((J->flags & JIT_F_OPT_NARROW)) {
    NarrowConv nc;
    nc.J = J;
    nc.sp = nc.stack;
    nc.maxsp = &nc.stack[NARROW_MAX_STACK-4];
    if (fins->o == IR_TOBIT) {
      nc.mode = IRTOINT_TOBIT;  /* Used only in the backpropagation cache. */
      nc.lim = 2;  /* TOBIT can use a more optimistic rule. */
    } else {
      nc.mode = fins->op2;
      nc.lim = 1;
    }
    if (narrow_conv_backprop(&nc, fins->op1, 0) <= nc.lim)
      return narrow_conv_emit(J, &nc);
  }
  return NEXTFOLD;
}

/* -- Narrowing of arithmetic operators ----------------------------------- */

/* Check whether a number fits into an int32_t (-0 is ok, too). */
static int numisint(lua_Number n)
{
  return (n == cast_num(lj_num2int(n)));
}

/* Narrowing of modulo operator. */
TRef lj_opt_narrow_mod(jit_State *J, TRef rb, TRef rc)
{
  TRef tmp;
  if ((J->flags & JIT_F_OPT_NARROW) &&
      tref_isk(rc) && tref_isint(rc)) {  /* Optimize x % k. */
    int32_t k = IR(tref_ref(rc))->i;
    if (k > 0 && (k & (k-1)) == 0) {  /* i % 2^k ==> band(i, 2^k-1) */
      if (tref_isint(rb))
        return emitir(IRTI(IR_BAND), rb, lj_ir_kint(J, k-1));
    }
  }
  /* b % c ==> b - floor(b/c)*c */
  rb = lj_ir_tonum(J, rb);
  rc = lj_ir_tonum(J, rc);
  tmp = emitir(IRTN(IR_DIV), rb, rc);
  tmp = emitir(IRTN(IR_FPMATH), tmp, IRFPM_FLOOR);
  tmp = emitir(IRTN(IR_MUL), tmp, rc);
  return emitir(IRTN(IR_SUB), rb, tmp);
}
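
/* Illustrative sketch (not part of the original code): the scalar
** equivalent of the power-of-two rewrite above. For a non-negative integer
** dividend i and k == 2^n, masking with k-1 gives the same result as i % k,
** e.g. 13 & (8-1) == 13 % 8 == 5. The general case keeps Lua's FP
** definition b - floor(b/c)*c, which the IR sequence above emits literally.
** The helper name is hypothetical and unused by the optimizer.
*/
static int32_t narrow_mod_pow2_sketch(int32_t i, int32_t k)
{
  return i & (k-1);  /* Valid only for i >= 0 and k a power of two. */
}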

/* Narrowing of power operator or math.pow. */
TRef lj_opt_narrow_pow(jit_State *J, TRef rb, TRef rc, TValue *vc)
{
  lua_Number n;
  if (tvisstr(vc) && !lj_str_numconv(strVdata(vc), vc))
    lj_trace_err(J, LJ_TRERR_BADTYPE);
  n = numV(vc);
  /* Limit narrowing for pow to small exponents (or for two constants). */
  if ((tref_isint(rc) && tref_isk(rc) && tref_isk(rb)) ||
      ((J->flags & JIT_F_OPT_NARROW) &&
       (numisint(n) && n >= -65536.0 && n <= 65536.0))) {
    TRef tmp;
    if (!tref_isinteger(rc)) {
      if (tref_isstr(rc))
        rc = emitir(IRTG(IR_STRTO, IRT_NUM), rc, 0);
      rc = emitir(IRTGI(IR_TOINT), rc, IRTOINT_CHECK);  /* Guarded TOINT! */
    }
    if (!tref_isk(rc)) {  /* Range guard: -65536 <= i <= 65536 */
      tmp = emitir(IRTI(IR_ADD), rc, lj_ir_kint(J, 65536-2147483647-1));
      emitir(IRTGI(IR_LE), tmp, lj_ir_kint(J, 2*65536-2147483647-1));
    }
    return emitir(IRTN(IR_POWI), rb, rc);
  }
  /* FOLD covers most cases, but some are easier to do here. */
  if (tref_isk(rb) && tvispone(ir_knum(IR(tref_ref(rb)))))
    return rb;  /* 1 ^ x ==> 1 */
  rc = lj_ir_tonum(J, rc);
  if (tref_isk(rc) && ir_knum(IR(tref_ref(rc)))->n == 0.5)
    return emitir(IRTN(IR_FPMATH), rb, IRFPM_SQRT);  /* x ^ 0.5 ==> sqrt(x) */
  /* Split up b^c into exp2(c*log2(b)). Assembler may rejoin later. */
  rb = emitir(IRTN(IR_FPMATH), rb, IRFPM_LOG2);
  rc = emitir(IRTN(IR_MUL), rb, rc);
  return emitir(IRTN(IR_FPMATH), rc, IRFPM_EXP2);
}
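
/* Illustrative sketch (not part of the original code): the single-comparison
** range guard emitted above. Biasing the exponent by 65536-2^31 maps the
** legal range [-65536..65536] onto [INT32_MIN..INT32_MIN+131072]; any other
** value (even after wrap-around) lands above that window, so one signed
** comparison checks both bounds. The helper name is hypothetical and unused
** by the optimizer.
*/
static int narrow_pow_range_sketch(int32_t i)
{
  int32_t tmp = (int32_t)((uint32_t)i + (uint32_t)(65536-2147483647-1));
  return tmp <= 2*65536-2147483647-1;  /* True iff -65536 <= i <= 65536. */
}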

/* -- Predictive narrowing of induction variables ------------------------- */

/* Narrow the FORL index type by looking at the runtime values. */
IRType lj_opt_narrow_forl(cTValue *forbase)
{
  lua_assert(tvisnum(&forbase[FORL_IDX]) &&
             tvisnum(&forbase[FORL_STOP]) &&
             tvisnum(&forbase[FORL_STEP]));
  /* Narrow only if the runtime values of start/stop/step are all integers. */
  if (numisint(numV(&forbase[FORL_IDX])) &&
      numisint(numV(&forbase[FORL_STOP])) &&
      numisint(numV(&forbase[FORL_STEP]))) {
    /* And if the loop index can't possibly overflow. */
    lua_Number step = numV(&forbase[FORL_STEP]);
    lua_Number sum = numV(&forbase[FORL_STOP]) + step;
    if (0 <= step ? sum <= 2147483647.0 : sum >= -2147483648.0)
      return IRT_INT;
  }
  return IRT_NUM;
}

#undef IR
#undef fins
#undef emitir
#undef emitir_raw

#endif