Diffstat (limited to 'src/lj_opt_loop.c')
-rw-r--r--  src/lj_opt_loop.c  |  358
1 file changed, 358 insertions(+), 0 deletions(-)
diff --git a/src/lj_opt_loop.c b/src/lj_opt_loop.c
new file mode 100644
index 00000000..adc0c476
--- /dev/null
+++ b/src/lj_opt_loop.c
@@ -0,0 +1,358 @@
1/*
2** LOOP: Loop Optimizations.
3** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
4*/
5
6#define lj_opt_loop_c
7#define LUA_CORE
8
9#include "lj_obj.h"
10
11#if LJ_HASJIT
12
13#include "lj_gc.h"
14#include "lj_err.h"
15#include "lj_str.h"
16#include "lj_ir.h"
17#include "lj_jit.h"
18#include "lj_iropt.h"
19#include "lj_trace.h"
20#include "lj_snap.h"
21#include "lj_vm.h"
22
23/* Loop optimization:
24**
25** Traditional Loop-Invariant Code Motion (LICM) splits the instructions
26** of a loop into invariant and variant instructions. The invariant
27** instructions are hoisted out of the loop and only the variant
28** instructions remain inside the loop body.
29**
30** Unfortunately LICM is mostly useless for compiling dynamic languages.
31** The IR has many guards and most of the subsequent instructions are
32** control-dependent on them. The first non-hoistable guard would
33** effectively prevent hoisting of all subsequent instructions.
34**
35** That's why we use a special form of unrolling using copy-substitution,
36** combined with redundancy elimination:
37**
38** The recorded instruction stream is re-emitted to the compiler pipeline
39** with substituted operands. The substitution table is filled with the
40** refs returned by re-emitting each instruction. This can be done
41** on-the-fly, because the IR is in strict SSA form, where every ref is
42** defined before its use.
43**
44** This approach generates two code sections, separated by the LOOP
45** instruction:
46**
47** 1. The recorded instructions form a kind of pre-roll for the loop. It
48** contains a mix of invariant and variant instructions and performs
49** exactly one loop iteration (but not necessarily the 1st iteration).
50**
51** 2. The loop body contains only the variant instructions and performs
52** all remaining loop iterations.
53**
54** At first sight that looks like a waste of space, because the variant
55** instructions are present twice. But the key insight is that the
56** pre-roll honors the control-dependencies for *both* the pre-roll itself
57** *and* the loop body!
58**
59** It also means one doesn't have to explicitly model control-dependencies
60** (which, BTW, wouldn't help LICM much). And it's much easier to
61** integrate sparse snapshotting with this approach.
62**
63** One of the nicest aspects of this approach is that all of the
64** optimizations of the compiler pipeline (FOLD, CSE, FWD, etc.) can be
65** reused with only minor restrictions (e.g. one should not fold
66** instructions across loop-carried dependencies).
67**
68** But in general any optimization can be applied that only needs to look
69** backwards into the generated instruction stream. At any point in time
70** during the copy-substitution process this contains both a static loop
71** iteration (the pre-roll) and a dynamic one (from the to-be-copied
72** instruction up to the end of the partial loop body).
73**
74** Since control-dependencies are implicitly kept, CSE also applies to all
75** kinds of guards. The major advantage is that all invariant guards can
76** be hoisted, too.
77**
78** Load/store forwarding works across loop iterations, too. This is
79** important if loop-carried dependencies are kept in upvalues or tables.
80** E.g. 'self.idx = self.idx + 1' deep down in some OO-style method may
81** become a forwarded loop-recurrence after inlining.
82**
83** Since the IR is in SSA form, loop-carried dependencies have to be
84** modeled with PHI instructions. The potential candidates for PHIs are
85** collected on-the-fly during copy-substitution. After eliminating the
86** redundant ones, PHI instructions are emitted *below* the loop body.
87**
88** Note that this departure from traditional SSA form doesn't change the
89** semantics of the PHI instructions themselves. But it greatly simplifies
90** on-the-fly generation of the IR and the machine code.
91*/
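/*
** Schematic example (editor's sketch; the Lua fragment and IR refs below are
** hypothetical and not the output of an actual trace):
**
**   for i = 1, n do x = x + 2*k end   -- k is invariant, x is loop-carried
**
**   Pre-roll (one full iteration, invariant + variant mix):
**     0001  MUL  k     +2
**     0002  ADD  x.in  0001
**     ----  LOOP
**   Loop body (variant copies only):
**     0003  ADD  0002  0001
**   PHIs emitted below the loop body:
**     0004  PHI  0002  0003    -- left: pre-roll value, right: loop-body value
**
** Re-emitting 0001 with unchanged operands just yields 0001 again, so
** subst[0001] = 0001 and the multiplication exists only in the pre-roll:
** it has effectively been hoisted. Re-emitting 0002 with the substituted
** operand produces the new instruction 0003, so subst[0002] = 0003. Because
** the stack slot holding x maps to a different ref inside the loop body,
** 0002/0003 become a PHI candidate and PHI(0002, 0003) is emitted below the
** loop body.
*/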
92
93/* Some local macros to save typing. Undef'd at the end. */
94#define IR(ref) (&J->cur.ir[(ref)])
95
96/* Pass IR on to next optimization in chain (FOLD). */
97#define emitir(ot, a, b) (lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))
98
99/* Emit raw IR without passing through optimizations. */
100#define emitir_raw(ot, a, b) (lj_ir_set(J, (ot), (a), (b)), lj_ir_emit(J))
101
102/* -- PHI elimination ----------------------------------------------------- */
103
104/* Emit or eliminate collected PHIs. */
105static void loop_emit_phi(jit_State *J, IRRef1 *subst, IRRef1 *phi, IRRef nphi)
106{
107 int pass2 = 0;
108 IRRef i, nslots;
109 IRRef invar = J->chain[IR_LOOP];
110 /* Pass #1: mark redundant and potentially redundant PHIs. */
111 for (i = 0; i < nphi; i++) {
112 IRRef lref = phi[i];
113 IRRef rref = subst[lref];
114 if (lref == rref || rref == REF_DROP) { /* Invariants are redundant. */
115 irt_setmark(IR(lref)->t);
116 } else if (!(IR(rref)->op1 == lref || IR(rref)->op2 == lref)) {
117 /* Quick check for simple recurrences failed, need pass2. */
118 irt_setmark(IR(lref)->t);
119 pass2 = 1;
120 }
121 }
122 /* Pass #2: traverse variant part and clear marks of non-redundant PHIs. */
123 if (pass2) {
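/* A mark left over from pass #1 flags a PHI candidate as (potentially)
** redundant. Any use of that ref as an operand inside the variant part
** shows the value is still needed across iterations, so its mark is
** cleared again here.
*/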
124 for (i = J->cur.nins-1; i > invar; i--) {
125 IRIns *ir = IR(i);
126 if (!irref_isk(ir->op1)) irt_clearmark(IR(ir->op1)->t);
127 if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
128 }
129 }
130 /* Pass #3: add PHIs for variant slots without a corresponding SLOAD. */
131 nslots = J->baseslot+J->maxslot;
132 for (i = 1; i < nslots; i++) {
133 IRRef ref = tref_ref(J->slot[i]);
134 if (!irref_isk(ref) && ref != subst[ref]) {
135 IRIns *ir = IR(ref);
136 irt_clearmark(ir->t); /* Unmark potential uses, too. */
137 if (!irt_isphi(ir->t) && !irt_ispri(ir->t)) {
138 irt_setphi(ir->t);
139 if (nphi >= LJ_MAX_PHI)
140 lj_trace_err(J, LJ_TRERR_PHIOV);
141 phi[nphi++] = (IRRef1)ref;
142 }
143 }
144 }
145 /* Pass #4: emit PHI instructions or eliminate PHIs. */
146 for (i = 0; i < nphi; i++) {
147 IRRef lref = phi[i];
148 IRIns *ir = IR(lref);
149 if (!irt_ismarked(ir->t)) { /* Emit PHI if not marked. */
150 IRRef rref = subst[lref];
151 if (rref > invar)
152 irt_setphi(IR(rref)->t);
153 emitir_raw(IRT(IR_PHI, irt_type(ir->t)), lref, rref);
154 } else { /* Otherwise eliminate PHI. */
155 irt_clearmark(ir->t);
156 irt_clearphi(ir->t);
157 }
158 }
159}
160
161/* -- Loop unrolling using copy-substitution ------------------------------ */
162
163/* Unroll loop. */
164static void loop_unroll(jit_State *J)
165{
166 IRRef1 phi[LJ_MAX_PHI];
167 uint32_t nphi = 0;
168 IRRef1 *subst;
169 SnapShot *osnap, *snap;
170 IRRef2 *loopmap;
171 BCReg loopslots;
172 MSize nsnap, nsnapmap;
173 IRRef ins, invar, osnapref;
174
175 /* Use temp buffer for substitution table.
176 ** Only non-constant refs in [REF_BIAS,invar) are valid indexes.
177 ** Note: don't call into the VM or run the GC or the buffer may be gone.
178 */
179 invar = J->cur.nins;
180 subst = (IRRef1 *)lj_str_needbuf(J->L, &G(J->L)->tmpbuf,
181 (invar-REF_BIAS)*sizeof(IRRef1)) - REF_BIAS;
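/* The returned pointer is pre-biased by REF_BIAS, so non-constant refs can
** index subst[] directly; constant refs (< REF_BIAS) are never looked up,
** since every lookup is guarded by irref_isk().
*/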
182 subst[REF_BASE] = REF_BASE;
183
184 /* LOOP separates the pre-roll from the loop body. */
185 emitir_raw(IRTG(IR_LOOP, IRT_NIL), 0, 0);
186
187 /* Ensure size for copy-substituted snapshots (minus #0 and loop snapshot). */
188 nsnap = J->cur.nsnap;
189 if (LJ_UNLIKELY(2*nsnap-2 > J->sizesnap)) {
190 MSize maxsnap = (MSize)J->param[JIT_P_maxsnap];
191 if (2*nsnap-2 > maxsnap)
192 lj_trace_err(J, LJ_TRERR_SNAPOV);
193 lj_mem_growvec(J->L, J->snapbuf, J->sizesnap, maxsnap, SnapShot);
194 J->cur.snap = J->snapbuf;
195 }
196 nsnapmap = J->cur.nsnapmap; /* Use temp. copy to avoid undo. */
197 if (LJ_UNLIKELY(nsnapmap*2 > J->sizesnapmap)) {
198 J->snapmapbuf = (IRRef2 *)lj_mem_realloc(J->L, J->snapmapbuf,
199 J->sizesnapmap*sizeof(IRRef2),
200 2*J->sizesnapmap*sizeof(IRRef2));
201 J->cur.snapmap = J->snapmapbuf;
202 J->sizesnapmap *= 2;
203 }
204
205 /* The loop snapshot is used for fallback substitutions. */
206 snap = &J->cur.snap[nsnap-1];
207 loopmap = &J->cur.snapmap[snap->mapofs];
208 loopslots = snap->nslots;
209 /* The PC of snapshot #0 and the loop snapshot must match. */
210 lua_assert(loopmap[loopslots] == J->cur.snapmap[J->cur.snap[0].nslots]);
211
212 /* Start substitution with snapshot #1 (#0 is empty for root traces). */
213 osnap = &J->cur.snap[1];
214 osnapref = osnap->ref;
215
216 /* Copy and substitute all recorded instructions and snapshots. */
217 for (ins = REF_FIRST; ins < invar; ins++) {
218 IRIns *ir;
219 IRRef op1, op2;
220
221 /* Copy-substitute snapshot. */
222 if (ins >= osnapref) {
223 IRRef2 *nmap, *omap = &J->cur.snapmap[osnap->mapofs];
224 BCReg s, nslots;
225 uint32_t nmapofs, nframelinks;
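/* A fresh snapshot is only needed if a guard was emitted since the previous
** one was copied; otherwise no exit can refer to the previous snapshot and
** it is simply overwritten in place.
*/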
226 if (irt_isguard(J->guardemit)) { /* Guard in between? */
227 nmapofs = nsnapmap;
228 snap++; /* Add new snapshot. */
229 } else {
230 nmapofs = snap->mapofs; /* Overwrite previous snapshot. */
231 }
232 J->guardemit.irt = 0;
233 nslots = osnap->nslots;
234 nframelinks = osnap->nframelinks;
235 snap->mapofs = (uint16_t)nmapofs;
236 snap->ref = (IRRef1)J->cur.nins;
237 snap->nslots = (uint8_t)nslots;
238 snap->nframelinks = (uint8_t)nframelinks;
239 snap->count = 0;
240 osnap++;
241 osnapref = osnap->ref;
242 nsnapmap = nmapofs + nslots + nframelinks;
243 nmap = &J->cur.snapmap[nmapofs];
244 /* Substitute snapshot slots; slots missing from the old snapshot fall back to the loop snapshot. */
245 for (s = 0; s < nslots; s++) {
246 IRRef ref = snap_ref(omap[s]);
247 if (ref) {
248 if (!irref_isk(ref))
249 ref = subst[ref];
250 } else if (s < loopslots) {
251 ref = loopmap[s];
252 }
253 nmap[s] = ref;
254 }
255 /* Copy frame links. */
256 nmap += nslots;
257 omap += nslots;
258 for (s = 0; s < nframelinks; s++)
259 nmap[s] = omap[s];
260 }
261
262 /* Substitute instruction operands. */
263 ir = IR(ins);
264 op1 = ir->op1;
265 if (!irref_isk(op1)) op1 = subst[op1];
266 op2 = ir->op2;
267 if (!irref_isk(op2)) op2 = subst[op2];
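/* Only side-effect-free instructions (mode IRM_N, i.e. not a load, store or
** allocation) with unchanged operands are guaranteed invariant and can be
** reused as-is. Everything else is re-emitted through the pipeline so that
** FOLD, CSE and load/store forwarding get a chance to look at it. Invariant
** comparisons and other IRM_N guards take the shortcut as well, so they stay
** in the pre-roll only, i.e. they are hoisted.
*/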
268 if (irm_kind(lj_ir_mode[ir->o]) == IRM_N &&
269 op1 == ir->op1 && op2 == ir->op2) { /* Regular invariant ins? */
270 subst[ins] = (IRRef1)ins; /* Shortcut. */
271 } else {
272 /* Re-emit substituted instruction to the FOLD/CSE/etc. pipeline. */
273 IRType1 t = ir->t; /* Get this first, since emitir may invalidate ir. */
274 IRRef ref = tref_ref(emitir(ir->ot & ~IRT_ISPHI, op1, op2));
275 subst[ins] = (IRRef1)ref;
276 if (ref != ins && ref < invar) { /* Loop-carried dependency? */
277 IRIns *irr = IR(ref);
278 /* Potential PHI? */
279 if (!irref_isk(ref) && !irt_isphi(irr->t) && !irt_ispri(irr->t)) {
280 irt_setphi(irr->t);
281 if (nphi >= LJ_MAX_PHI)
282 lj_trace_err(J, LJ_TRERR_PHIOV);
283 phi[nphi++] = (IRRef1)ref;
284 }
285 /* Check all loop-carried dependencies for type instability. */
286 if (!irt_sametype(t, irr->t)) {
287 if (irt_isnum(t) && irt_isinteger(irr->t)) /* Fix int->num case. */
288 subst[ins] = tref_ref(emitir(IRTN(IR_TONUM), ref, 0));
289 else
290 lj_trace_err(J, LJ_TRERR_TYPEINS);
291 }
292 }
293 }
294 }
295 if (irt_isguard(J->guardemit)) { /* Guard in between? */
296 J->cur.nsnapmap = (uint16_t)nsnapmap;
297 snap++;
298 } else {
299 J->cur.nsnapmap = (uint16_t)snap->mapofs; /* Last snapshot is redundant. */
300 }
301 J->cur.nsnap = (uint16_t)(snap - J->cur.snap);
302 lua_assert(J->cur.nsnapmap <= J->sizesnapmap);
303
304 loop_emit_phi(J, subst, phi, nphi);
305}
306
307/* Undo any partial changes made by the loop optimization. */
308static void loop_undo(jit_State *J, IRRef ins)
309{
310 lj_ir_rollback(J, ins);
311 for (ins--; ins >= REF_FIRST; ins--) { /* Remove flags. */
312 IRIns *ir = IR(ins);
313 irt_clearphi(ir->t);
314 irt_clearmark(ir->t);
315 }
316}
317
318/* Protected callback for loop optimization. */
319static TValue *cploop_opt(lua_State *L, lua_CFunction dummy, void *ud)
320{
321 UNUSED(L); UNUSED(dummy);
322 loop_unroll((jit_State *)ud);
323 return NULL;
324}
325
326/* Loop optimization. Returns 0 on success, 1 if it failed and recording should simply continue. */
327int lj_opt_loop(jit_State *J)
328{
329 IRRef nins = J->cur.nins;
330 int errcode = lj_vm_cpcall(J->L, cploop_opt, NULL, J);
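/* The unroll runs in protected mode: trace errors thrown inside it are
** caught here, and selected ones are handled by rolling the IR back
** (loop_undo above) and letting the recorder continue.
*/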
331 if (LJ_UNLIKELY(errcode)) {
332 lua_State *L = J->L;
333 if (errcode == LUA_ERRRUN && tvisnum(L->top-1)) { /* Trace error? */
334 int32_t e = lj_num2int(numV(L->top-1));
335 switch ((TraceError)e) {
336 case LJ_TRERR_TYPEINS: /* Type instability. */
337 case LJ_TRERR_GFAIL: /* Guard would always fail. */
338 /* Unrolling via recording fixes many cases, e.g. a flipped boolean. */
339 if (--J->instunroll < 0) /* But do not unroll forever. */
340 break;
341 L->top--; /* Remove error object. */
342 J->guardemit.irt = 0;
343 loop_undo(J, nins);
344 return 1; /* Loop optimization failed, continue recording. */
345 default:
346 break;
347 }
348 }
349 lj_err_throw(L, errcode); /* Propagate all other errors. */
350 }
351 return 0; /* Loop optimization is ok. */
352}
353
354#undef IR
355#undef emitir
356#undef emitir_raw
357
358#endif