Diffstat (limited to 'src/lj_opt_loop.c')
 -rw-r--r--  src/lj_opt_loop.c | 358
 1 file changed, 358 insertions, 0 deletions
diff --git a/src/lj_opt_loop.c b/src/lj_opt_loop.c
new file mode 100644
index 00000000..adc0c476
--- /dev/null
+++ b/src/lj_opt_loop.c
@@ -0,0 +1,358 @@
/*
** LOOP: Loop Optimizations.
** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
*/

#define lj_opt_loop_c
#define LUA_CORE

#include "lj_obj.h"

#if LJ_HASJIT

#include "lj_gc.h"
#include "lj_err.h"
#include "lj_str.h"
#include "lj_ir.h"
#include "lj_jit.h"
#include "lj_iropt.h"
#include "lj_trace.h"
#include "lj_snap.h"
#include "lj_vm.h"

/* Loop optimization:
**
** Traditional Loop-Invariant Code Motion (LICM) splits the instructions
** of a loop into invariant and variant instructions. The invariant
** instructions are hoisted out of the loop and only the variant
** instructions remain inside the loop body.
**
** Unfortunately LICM is mostly useless for compiling dynamic languages.
** The IR has many guards and most of the subsequent instructions are
** control-dependent on them. The first non-hoistable guard would
** effectively prevent hoisting of all subsequent instructions.
**
** That's why we use a special form of unrolling using copy-substitution,
** combined with redundancy elimination:
**
** The recorded instruction stream is re-emitted to the compiler pipeline
** with substituted operands. The substitution table is filled with the
** refs returned by re-emitting each instruction. This can be done
** on-the-fly, because the IR is in strict SSA form, where every ref is
** defined before its use.
**
** This approach generates two code sections, separated by the LOOP
** instruction:
**
** 1. The recorded instructions form a kind of pre-roll for the loop. It
**    contains a mix of invariant and variant instructions and performs
**    exactly one loop iteration (but not necessarily the 1st iteration).
**
** 2. The loop body contains only the variant instructions and performs
**    all remaining loop iterations.
**
** At first sight this looks like a waste of space, because the variant
** instructions are present twice. But the key insight is that the
** pre-roll honors the control-dependencies for *both* the pre-roll itself
** *and* the loop body!
**
** It also means one doesn't have to explicitly model control-dependencies
** (which, BTW, wouldn't help LICM much). And it's much easier to
** integrate sparse snapshotting with this approach.
**
** One of the nicest aspects of this approach is that all of the
** optimizations of the compiler pipeline (FOLD, CSE, FWD, etc.) can be
** reused with only minor restrictions (e.g. one should not fold
** instructions across loop-carried dependencies).
**
** But in general all optimizations can be applied which only need to look
** backwards into the generated instruction stream. At any point in time
** during the copy-substitution process this contains both a static loop
** iteration (the pre-roll) and a dynamic one (from the to-be-copied
** instruction up to the end of the partial loop body).
**
** Since control-dependencies are implicitly kept, CSE also applies to all
** kinds of guards. The major advantage is that all invariant guards can
** be hoisted, too.
**
** Load/store forwarding works across loop iterations, too. This is
** important if loop-carried dependencies are kept in upvalues or tables.
** E.g. 'self.idx = self.idx + 1' deep down in some OO-style method may
** become a forwarded loop-recurrence after inlining.
**
** Since the IR is in SSA form, loop-carried dependencies have to be
** modeled with PHI instructions. The potential candidates for PHIs are
** collected on-the-fly during copy-substitution. After eliminating the
** redundant ones, PHI instructions are emitted *below* the loop body.
**
** Note that this departure from traditional SSA form doesn't change the
** semantics of the PHI instructions themselves. But it greatly simplifies
** on-the-fly generation of the IR and the machine code.
*/
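
/* To make the shape of the result concrete, here is a hand-drawn sketch
** for a simple counter 'i = i + 1' inside a numeric loop (illustrative
** only; refs are hypothetical and guards, loads and stores are omitted):
**
**   0001    i1 = SLOAD #1
**   0002  + i2 = ADD  0001 +1      <- pre-roll: one whole iteration
**   ----      LOOP                 <- separator
**   0003  + i3 = ADD  0002 +1      <- loop body: variant part only
**   0004         PHI  0002 0003    <- loop-carried value, below the body
**
** Inside the body, a use of the pre-roll ref 0002 means "the value from
** the previous iteration"; the PHI below the body makes that explicit.
*/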

/* Some local macros to save typing. Undef'd at the end. */
#define IR(ref)  (&J->cur.ir[(ref)])

/* Pass IR on to next optimization in chain (FOLD). */
#define emitir(ot, a, b)  (lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))

/* Emit raw IR without passing through optimizations. */
#define emitir_raw(ot, a, b)  (lj_ir_set(J, (ot), (a), (b)), lj_ir_emit(J))

/* -- PHI elimination ----------------------------------------------------- */

/* Emit or eliminate collected PHIs. */
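/* Overview of the passes below: a mark on a PHI candidate means "presumed
** redundant". Pass #1 marks invariant candidates and, conservatively, all
** candidates that are not simple recurrences. Pass #2 unmarks candidates
** that are still referenced from the variant part. Pass #3 forces PHIs
** for variant stack slots. Pass #4 emits the surviving PHIs and clears
** the flags of the eliminated ones.
*/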
static void loop_emit_phi(jit_State *J, IRRef1 *subst, IRRef1 *phi, IRRef nphi)
{
  int pass2 = 0;
  IRRef i, nslots;
  IRRef invar = J->chain[IR_LOOP];
  /* Pass #1: mark redundant and potentially redundant PHIs. */
  for (i = 0; i < nphi; i++) {
    IRRef lref = phi[i];
    IRRef rref = subst[lref];
    if (lref == rref || rref == REF_DROP) {  /* Invariants are redundant. */
      irt_setmark(IR(lref)->t);
    } else if (!(IR(rref)->op1 == lref || IR(rref)->op2 == lref)) {
      /* Quick check for simple recurrences failed, need pass #2. */
      irt_setmark(IR(lref)->t);
      pass2 = 1;
    }
  }
  /* Pass #2: traverse variant part and clear marks of non-redundant PHIs. */
  if (pass2) {
    for (i = J->cur.nins-1; i > invar; i--) {
      IRIns *ir = IR(i);
      if (!irref_isk(ir->op1)) irt_clearmark(IR(ir->op1)->t);
      if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
    }
  }
  /* Pass #3: add PHIs for variant slots without a corresponding SLOAD. */
  nslots = J->baseslot+J->maxslot;
  for (i = 1; i < nslots; i++) {
    IRRef ref = tref_ref(J->slot[i]);
    if (!irref_isk(ref) && ref != subst[ref]) {
      IRIns *ir = IR(ref);
      irt_clearmark(ir->t);  /* Unmark potential uses, too. */
      if (!irt_isphi(ir->t) && !irt_ispri(ir->t)) {
        irt_setphi(ir->t);
        if (nphi >= LJ_MAX_PHI)
          lj_trace_err(J, LJ_TRERR_PHIOV);
        phi[nphi++] = (IRRef1)ref;
      }
    }
  }
  /* Pass #4: emit PHI instructions or eliminate PHIs. */
  for (i = 0; i < nphi; i++) {
    IRRef lref = phi[i];
    IRIns *ir = IR(lref);
    if (!irt_ismarked(ir->t)) {  /* Emit PHI if not marked. */
      IRRef rref = subst[lref];
      if (rref > invar)
        irt_setphi(IR(rref)->t);
      emitir_raw(IRT(IR_PHI, irt_type(ir->t)), lref, rref);
    } else {  /* Otherwise eliminate PHI. */
      irt_clearmark(ir->t);
      irt_clearphi(ir->t);
    }
  }
}

/* -- Loop unrolling using copy-substitution ------------------------------ */
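
/* How the substitution table drives the copy: subst[] maps each pre-roll
** ref to the ref that replaces it in the copied code (refs below are
** hypothetical, continuing the sketch above). After re-emitting 0002 as
** 0003, subst[0002] = 0003, so copies of later instructions that used
** 0002 now use 0003 instead. Invariant instructions substitute to
** themselves and thus drop out of the loop body via CSE.
*/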

/* Unroll loop. */
static void loop_unroll(jit_State *J)
{
  IRRef1 phi[LJ_MAX_PHI];
  uint32_t nphi = 0;
  IRRef1 *subst;
  SnapShot *osnap, *snap;
  IRRef2 *loopmap;
  BCReg loopslots;
  MSize nsnap, nsnapmap;
  IRRef ins, invar, osnapref;

  /* Use temp buffer for substitution table.
  ** Only non-constant refs in [REF_BIAS,invar) are valid indexes.
  ** Note: don't call into the VM or run the GC or the buffer may be gone.
  */
  invar = J->cur.nins;
  subst = (IRRef1 *)lj_str_needbuf(J->L, &G(J->L)->tmpbuf,
                                   (invar-REF_BIAS)*sizeof(IRRef1)) - REF_BIAS;
  subst[REF_BASE] = REF_BASE;

  /* LOOP separates the pre-roll from the loop body. */
  emitir_raw(IRTG(IR_LOOP, IRT_NIL), 0, 0);

  /* Ensure size for copy-substituted snapshots (minus #0 and loop snapshot). */
  nsnap = J->cur.nsnap;
  if (LJ_UNLIKELY(2*nsnap-2 > J->sizesnap)) {
    MSize maxsnap = (MSize)J->param[JIT_P_maxsnap];
    if (2*nsnap-2 > maxsnap)
      lj_trace_err(J, LJ_TRERR_SNAPOV);
    lj_mem_growvec(J->L, J->snapbuf, J->sizesnap, maxsnap, SnapShot);
    J->cur.snap = J->snapbuf;
  }
  nsnapmap = J->cur.nsnapmap;  /* Use temp. copy to avoid undo. */
  if (LJ_UNLIKELY(nsnapmap*2 > J->sizesnapmap)) {
    J->snapmapbuf = (IRRef2 *)lj_mem_realloc(J->L, J->snapmapbuf,
                                             J->sizesnapmap*sizeof(IRRef2),
                                             2*J->sizesnapmap*sizeof(IRRef2));
    J->cur.snapmap = J->snapmapbuf;
    J->sizesnapmap *= 2;
  }

  /* The loop snapshot is used for fallback substitutions. */
  snap = &J->cur.snap[nsnap-1];
  loopmap = &J->cur.snapmap[snap->mapofs];
  loopslots = snap->nslots;
  /* The PC of snapshot #0 and the loop snapshot must match. */
  lua_assert(loopmap[loopslots] == J->cur.snapmap[J->cur.snap[0].nslots]);

  /* Start substitution with snapshot #1 (#0 is empty for root traces). */
  osnap = &J->cur.snap[1];
  osnapref = osnap->ref;

  /* Copy and substitute all recorded instructions and snapshots. */
  for (ins = REF_FIRST; ins < invar; ins++) {
    IRIns *ir;
    IRRef op1, op2;

    /* Copy-substitute snapshot. */
    if (ins >= osnapref) {
      IRRef2 *nmap, *omap = &J->cur.snapmap[osnap->mapofs];
      BCReg s, nslots;
      uint32_t nmapofs, nframelinks;
      if (irt_isguard(J->guardemit)) {  /* Guard in between? */
        nmapofs = nsnapmap;
        snap++;  /* Add new snapshot. */
      } else {
        nmapofs = snap->mapofs;  /* Overwrite previous snapshot. */
      }
      J->guardemit.irt = 0;
      nslots = osnap->nslots;
      nframelinks = osnap->nframelinks;
      snap->mapofs = (uint16_t)nmapofs;
      snap->ref = (IRRef1)J->cur.nins;
      snap->nslots = (uint8_t)nslots;
      snap->nframelinks = (uint8_t)nframelinks;
      snap->count = 0;
      osnap++;
      osnapref = osnap->ref;
      nsnapmap = nmapofs + nslots + nframelinks;
      nmap = &J->cur.snapmap[nmapofs];
      /* Substitute snapshot slots. */
      for (s = 0; s < nslots; s++) {
        IRRef ref = snap_ref(omap[s]);
        if (ref) {
          if (!irref_isk(ref))
            ref = subst[ref];
        } else if (s < loopslots) {
          ref = loopmap[s];
        }
        nmap[s] = ref;
      }
      /* Copy frame links. */
      nmap += nslots;
      omap += nslots;
      for (s = 0; s < nframelinks; s++)
        nmap[s] = omap[s];
    }

    /* Substitute instruction operands. */
    ir = IR(ins);
    op1 = ir->op1;
    if (!irref_isk(op1)) op1 = subst[op1];
    op2 = ir->op2;
    if (!irref_isk(op2)) op2 = subst[op2];
    if (irm_kind(lj_ir_mode[ir->o]) == IRM_N &&
        op1 == ir->op1 && op2 == ir->op2) {  /* Regular invariant ins? */
      subst[ins] = (IRRef1)ins;  /* Shortcut. */
    } else {
      /* Re-emit substituted instruction to the FOLD/CSE/etc. pipeline. */
      IRType1 t = ir->t;  /* Get this first, since emitir may invalidate ir. */
      IRRef ref = tref_ref(emitir(ir->ot & ~IRT_ISPHI, op1, op2));
      subst[ins] = (IRRef1)ref;
      if (ref != ins && ref < invar) {  /* Loop-carried dependency? */
        IRIns *irr = IR(ref);
        /* Potential PHI? */
        if (!irref_isk(ref) && !irt_isphi(irr->t) && !irt_ispri(irr->t)) {
          irt_setphi(irr->t);
          if (nphi >= LJ_MAX_PHI)
            lj_trace_err(J, LJ_TRERR_PHIOV);
          phi[nphi++] = (IRRef1)ref;
        }
        /* Check all loop-carried dependencies for type instability. */
        if (!irt_sametype(t, irr->t)) {
          if (irt_isnum(t) && irt_isinteger(irr->t))  /* Fix int->num case. */
            subst[ins] = tref_ref(emitir(IRTN(IR_TONUM), ref, 0));
          else
            lj_trace_err(J, LJ_TRERR_TYPEINS);
        }
      }
    }
  }
  if (irt_isguard(J->guardemit)) {  /* Guard in between? */
    J->cur.nsnapmap = (uint16_t)nsnapmap;
    snap++;
  } else {
    J->cur.nsnapmap = (uint16_t)snap->mapofs;  /* Last snapshot is redundant. */
  }
  J->cur.nsnap = (uint16_t)(snap - J->cur.snap);
  lua_assert(J->cur.nsnapmap <= J->sizesnapmap);

  loop_emit_phi(J, subst, phi, nphi);
}

/* Undo any partial changes made by the loop optimization. */
static void loop_undo(jit_State *J, IRRef ins)
{
  lj_ir_rollback(J, ins);
  for (ins--; ins >= REF_FIRST; ins--) {  /* Remove flags. */
    IRIns *ir = IR(ins);
    irt_clearphi(ir->t);
    irt_clearmark(ir->t);
  }
}

/* Protected callback for loop optimization. */
static TValue *cploop_opt(lua_State *L, lua_CFunction dummy, void *ud)
{
  UNUSED(L); UNUSED(dummy);
  loop_unroll((jit_State *)ud);
  return NULL;
}

/* Loop optimization. */
int lj_opt_loop(jit_State *J)
{
  IRRef nins = J->cur.nins;
  int errcode = lj_vm_cpcall(J->L, cploop_opt, NULL, J);
  if (LJ_UNLIKELY(errcode)) {
    lua_State *L = J->L;
    if (errcode == LUA_ERRRUN && tvisnum(L->top-1)) {  /* Trace error? */
      int32_t e = lj_num2int(numV(L->top-1));
      switch ((TraceError)e) {
      case LJ_TRERR_TYPEINS:  /* Type instability. */
      case LJ_TRERR_GFAIL:  /* Guard would always fail. */
        /* Unrolling via recording fixes many cases, e.g. a flipped boolean. */
        if (--J->instunroll < 0)  /* But do not unroll forever. */
          break;
        L->top--;  /* Remove error object. */
        J->guardemit.irt = 0;
        loop_undo(J, nins);
        return 1;  /* Loop optimization failed, continue recording. */
      default:
        break;
      }
    }
    lj_err_throw(L, errcode);  /* Propagate all other errors. */
  }
  return 0;  /* Loop optimization is ok. */
}

#undef IR
#undef emitir
#undef emitir_raw

#endif