Diffstat (limited to 'src/lj_opt_split.c')
-rw-r--r-- | src/lj_opt_split.c | 343
1 file changed, 343 insertions, 0 deletions

diff --git a/src/lj_opt_split.c b/src/lj_opt_split.c
new file mode 100644
index 00000000..3cb30514
--- /dev/null
+++ b/src/lj_opt_split.c
@@ -0,0 +1,343 @@
/*
** SPLIT: Split 64 bit IR instructions into 32 bit IR instructions.
** Copyright (C) 2005-2011 Mike Pall. See Copyright Notice in luajit.h
*/

#define lj_opt_split_c
#define LUA_CORE

#include "lj_obj.h"

#if LJ_HASJIT && LJ_HASFFI && LJ_32

#include "lj_err.h"
#include "lj_str.h"
#include "lj_ir.h"
#include "lj_jit.h"
#include "lj_iropt.h"
#include "lj_vm.h"

/* SPLIT pass:
**
** This pass splits up 64 bit IR instructions into multiple 32 bit IR
** instructions. It's only active for 32 bit CPUs which lack native 64 bit
** operations. The FFI is currently the only emitter for 64 bit
** instructions, so this pass is disabled if the FFI is disabled.
**
** Splitting the IR in a separate pass keeps each 32 bit IR assembler
** backend simple. Only a small amount of extra functionality needs to be
** implemented. This is much easier than adding support for allocating
** register pairs to each backend (believe me, I tried). A few simple, but
** important optimizations can be performed by the SPLIT pass, which would
** be tedious to do in the backend.
**
** The basic idea is to replace each 64 bit IR instruction with its 32 bit
** equivalent plus an extra HIOP instruction. The split IR is not passed
** through FOLD or any other optimizations, so each HIOP is guaranteed to
** immediately follow its counterpart. The actual functionality of HIOP is
** inferred from the previous instruction.
**
** The operands of HIOP hold the hiword input references. The output of HIOP
** is the hiword output reference, which is also used to hold the hiword
** register or spill slot information. The register allocator treats this
** instruction independently of any other instruction, which improves code
** quality compared to using fixed register pairs.
**
** It's easier to split up some instructions into two regular 32 bit
** instructions. E.g. XLOAD is split up into two XLOADs with two different
** addresses. Obviously 64 bit constants need to be split up into two 32 bit
** constants, too. Some hiword instructions can be entirely omitted, e.g.
** when zero-extending a 32 bit value to 64 bits.
**
** Here's the IR and x64 machine code for 'x.b = x.a + 1' for a struct with
** two int64_t fields:
**
** 0100    p32 ADD    base  +8
** 0101    i64 XLOAD  0100
** 0102    i64 ADD    0101  +1
** 0103    p32 ADD    base  +16
** 0104    i64 XSTORE 0103  0102
**
**         mov rax, [esi+0x8]
**         add rax, +0x01
**         mov [esi+0x10], rax
**
** Here's the transformed IR and the x86 machine code after the SPLIT pass:
**
** 0100    p32 ADD    base  +8
** 0101    int XLOAD  0100
** 0102    p32 ADD    base  +12
** 0103    int XLOAD  0102
** 0104    int ADD    0101  +1
** 0105    int HIOP   0103  +0
** 0106    p32 ADD    base  +16
** 0107    int XSTORE 0106  0104
** 0108    p32 ADD    base  +20
** 0109    int XSTORE 0108  0105
**
**         mov eax, [esi+0x8]
**         mov ecx, [esi+0xc]
**         add eax, +0x01
**         adc ecx, +0x00
**         mov [esi+0x10], eax
**         mov [esi+0x14], ecx
**
** You may notice the reassociated hiword address computation, which is
** later fused into the mov operands by the assembler.
*/
88 | |||
89 | /* Some local macros to save typing. Undef'd at the end. */ | ||
90 | #define IR(ref) (&J->cur.ir[(ref)]) | ||
91 | |||
92 | /* Directly emit the transformed IR without updating chains etc. */ | ||
93 | static IRRef split_emit(jit_State *J, uint16_t ot, IRRef1 op1, IRRef1 op2) | ||
94 | { | ||
95 | IRRef nref = lj_ir_nextins(J); | ||
96 | IRIns *ir = IR(nref); | ||
97 | ir->ot = ot; | ||
98 | ir->op1 = op1; | ||
99 | ir->op2 = op2; | ||
100 | return nref; | ||
101 | } | ||
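
/* Usage note: e.g. split_emit(J, IRTI(IR_HIOP), lo, hi) appends an
** 'int HIOP lo hi' instruction at the next IR position, bypassing FOLD,
** CSE and the IR chains.
*/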
102 | |||
103 | /* Emit a CALLN with two split 64 bit arguments. */ | ||
104 | static IRRef split_call64(jit_State *J, IRRef1 *hisubst, IRIns *oir, | ||
105 | IRIns *ir, IRCallID id) | ||
106 | { | ||
107 | IRRef tmp, op1 = ir->op1, op2 = ir->op2; | ||
108 | J->cur.nins--; | ||
109 | #if LJ_LE | ||
110 | tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), oir[op1].prev, hisubst[op1]); | ||
111 | tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, oir[op2].prev); | ||
112 | tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, hisubst[op2]); | ||
113 | #else | ||
114 | tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), hisubst[op1], oir[op1].prev); | ||
115 | tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, hisubst[op2]); | ||
116 | tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, oir[op2].prev); | ||
117 | #endif | ||
118 | ir->prev = tmp = split_emit(J, IRTI(IR_CALLN), tmp, id); | ||
119 | return split_emit(J, IRTI(IR_HIOP), tmp, tmp); | ||
120 | } | ||
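
/* Illustrative sketch of the emitted chain for a 64 bit multiply 'a * b'
** on a little-endian target (operand order mirrors the LJ_LE branch above;
** a.lo/a.hi etc. stand for the loword/hiword substitution references):
**
**   t1  nil CARG   a.lo  a.hi
**   t2  nil CARG   t1    b.lo
**   t3  nil CARG   t2    b.hi
**   t4  int CALLN  t3    lj_carith_mul64
**   t5  int HIOP   t4    t4
**
** The CALLN result becomes the loword substitution and the HIOP result
** the hiword substitution of the 64 bit value.
*/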
121 | |||
122 | /* Get a pointer to the other 32 bit word (LE: hiword, BE: loword). */ | ||
123 | static IRRef split_ptr(jit_State *J, IRRef ref) | ||
124 | { | ||
125 | IRIns *ir = IR(ref); | ||
126 | int32_t ofs = 4; | ||
127 | if (ir->o == IR_ADD && irref_isk(ir->op2)) { /* Reassociate address. */ | ||
128 | ofs += IR(ir->op2)->i; | ||
129 | ref = ir->op1; | ||
130 | if (ofs == 0) return ref; | ||
131 | } | ||
132 | return split_emit(J, IRTI(IR_ADD), ref, lj_ir_kint(J, ofs)); | ||
133 | } | ||
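
/* E.g. if the loword address is '0100 p32 ADD base +8', split_ptr folds
** the +4 hiword offset into the constant and emits '0102 p32 ADD base +12'
** (cf. the example in the header comment), instead of stacking a second
** ADD on top of the first one.
*/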
134 | |||
135 | /* Transform the old IR to the new IR. */ | ||
136 | static void split_ir(jit_State *J) | ||
137 | { | ||
138 | IRRef nins = J->cur.nins, nk = J->cur.nk; | ||
139 | MSize irlen = nins - nk; | ||
140 | MSize need = (irlen+1)*(sizeof(IRIns) + sizeof(IRRef1)); | ||
141 | IRIns *oir = (IRIns *)lj_str_needbuf(J->L, &G(J->L)->tmpbuf, need); | ||
142 | IRRef1 *hisubst; | ||
143 | IRRef ref; | ||
144 | |||
145 | /* Copy old IR to buffer. */ | ||
146 | memcpy(oir, IR(nk), irlen*sizeof(IRIns)); | ||
147 | /* Bias hiword substitution table and old IR. Loword kept in field prev. */ | ||
148 | hisubst = (IRRef1 *)&oir[irlen] - nk; | ||
149 | oir -= nk; | ||
150 | |||
151 | /* Remove all IR instructions, but retain IR constants. */ | ||
152 | J->cur.nins = REF_FIRST; | ||
153 | |||
154 | /* Process constants and fixed references. */ | ||
155 | for (ref = nk; ref <= REF_BASE; ref++) { | ||
156 | IRIns *ir = &oir[ref]; | ||
157 | if (ir->o == IR_KINT64) { /* Split up 64 bit constant. */ | ||
158 | TValue tv = *ir_k64(ir); | ||
159 | ir->prev = lj_ir_kint(J, (int32_t)tv.u32.lo); | ||
160 | hisubst[ref] = lj_ir_kint(J, (int32_t)tv.u32.hi); | ||
161 | } else { | ||
162 | ir->prev = (IRRef1)ref; /* Identity substitution for loword. */ | ||
163 | } | ||
164 | } | ||
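
  /* E.g. the constant 'i64 KINT64 0x100000002' is split into 'int KINT +2'
  ** (loword, kept in the prev field of the old IR) and 'int KINT +1'
  ** (hiword, kept in the substitution table).
  */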
165 | |||
166 | /* Process old IR instructions. */ | ||
167 | for (ref = REF_FIRST; ref < nins; ref++) { | ||
168 | IRIns *ir = &oir[ref]; | ||
169 | IRRef nref = lj_ir_nextins(J); | ||
170 | IRIns *nir = IR(nref); | ||
171 | |||
172 | /* Copy-substitute old instruction to new instruction. */ | ||
173 | nir->op1 = ir->op1 < nk ? ir->op1 : oir[ir->op1].prev; | ||
174 | nir->op2 = ir->op2 < nk ? ir->op2 : oir[ir->op2].prev; | ||
175 | ir->prev = nref; /* Loword substitution. */ | ||
176 | nir->o = ir->o; | ||
177 | nir->t.irt = ir->t.irt & ~(IRT_MARK|IRT_ISPHI); | ||
178 | |||
179 | /* Split 64 bit instructions. */ | ||
180 | if (irt_isint64(ir->t)) { | ||
181 | IRRef hi = hisubst[ir->op1]; | ||
182 | nir->t.irt = IRT_INT | (nir->t.irt & IRT_GUARD); /* Turn into INT op. */ | ||
183 | switch (ir->o) { | ||
184 | case IR_ADD: | ||
185 | case IR_SUB: | ||
186 | /* Use plain op for hiword if loword cannot produce a carry/borrow. */ | ||
187 | if (irref_isk(nir->op2) && IR(nir->op2)->i == 0) { | ||
188 | ir->prev = nir->op1; /* Pass through loword. */ | ||
189 | nir->op1 = hi; nir->op2 = hisubst[ir->op2]; | ||
190 | hi = nref; | ||
191 | break; | ||
192 | } | ||
193 | /* fallthrough */ | ||
194 | case IR_NEG: | ||
195 | hi = split_emit(J, IRTI(IR_HIOP), hi, hisubst[ir->op2]); | ||
196 | break; | ||
197 | case IR_MUL: | ||
198 | hi = split_call64(J, hisubst, oir, ir, IRCALL_lj_carith_mul64); | ||
199 | break; | ||
200 | case IR_POWI: | ||
201 | hi = split_call64(J, hisubst, oir, ir, | ||
202 | irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 : | ||
203 | IRCALL_lj_carith_powu64); | ||
204 | break; | ||
205 | case IR_XLOAD: | ||
206 | hi = split_emit(J, IRTI(IR_XLOAD), split_ptr(J, nir->op1), ir->op2); | ||
207 | #if LJ_BE | ||
208 | ir->prev = hi; hi = nref; | ||
209 | #endif | ||
210 | break; | ||
211 | case IR_XSTORE: | ||
212 | #if LJ_LE | ||
213 | hi = hisubst[ir->op2]; | ||
214 | #else | ||
215 | hi = nir->op2; nir->op2 = hisubst[ir->op2]; | ||
216 | #endif | ||
217 | split_emit(J, IRTI(IR_XSTORE), split_ptr(J, nir->op1), hi); | ||
218 | continue; | ||
219 | case IR_CONV: { /* Conversion to 64 bit integer. Others handled below. */ | ||
220 | IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK); | ||
221 | if (st == IRT_NUM || st == IRT_FLOAT) { /* FP to 64 bit int conv. */ | ||
222 | hi = split_emit(J, IRTI(IR_HIOP), nir->op1, nref); | ||
223 | } else if (st == IRT_I64 || st == IRT_U64) { /* 64/64 bit cast. */ | ||
224 | /* Drop cast, since assembler doesn't care. */ | ||
225 | hisubst[ref] = hi; | ||
226 | goto fwdlo; | ||
227 | } else if ((ir->op2 & IRCONV_SEXT)) { /* Sign-extend to 64 bit. */ | ||
228 | IRRef k31 = lj_ir_kint(J, 31); | ||
229 | nir = IR(nref); /* May have been reallocated. */ | ||
230 | ir->prev = nir->op1; /* Pass through loword. */ | ||
231 | nir->o = IR_BSAR; /* hi = bsar(lo, 31). */ | ||
232 | nir->op2 = k31; | ||
233 | hi = nref; | ||
234 | } else { /* Zero-extend to 64 bit. */ | ||
235 | hisubst[ref] = lj_ir_kint(J, 0); | ||
236 | goto fwdlo; | ||
237 | } | ||
238 | break; | ||
239 | } | ||
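      /* E.g. a sign-extending 'i64 CONV int.i64 sext' becomes
      ** 'int BSAR lo 31' for the hiword, while the loword passes through
      ** unchanged; a zero-extending CONV just substitutes the constant +0
      ** as the hiword and forwards the loword.
      */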
      case IR_PHI: {
        IRRef hi2;
        if ((irref_isk(nir->op1) && irref_isk(nir->op2)) ||
            nir->op1 == nir->op2)
          J->cur.nins--;  /* Drop useless PHIs. */
        hi2 = hisubst[ir->op2];
        if (!((irref_isk(hi) && irref_isk(hi2)) || hi == hi2))
          split_emit(J, IRTI(IR_PHI), hi, hi2);
        continue;
      }
      default:
        lua_assert(ir->o <= IR_NE);
        split_emit(J, IRTGI(IR_HIOP), hi, hisubst[ir->op2]);  /* Comparisons. */
        continue;
      }
      hisubst[ref] = hi;  /* Store hiword substitution. */
    } else if (ir->o == IR_CONV) {  /* See above, too. */
      IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK);
      if (st == IRT_I64 || st == IRT_U64) {  /* Conversion from 64 bit int. */
        if (irt_isfp(ir->t)) {  /* 64 bit integer to FP conversion. */
          ir->prev = split_emit(J, IRT(IR_HIOP, irt_type(ir->t)),
                                hisubst[ir->op1], nref);
        } else {  /* Truncate to lower 32 bits. */
        fwdlo:
          ir->prev = nir->op1;  /* Forward loword. */
          /* Replace with NOP to avoid messing up the snapshot logic. */
          nir->ot = IRT(IR_NOP, IRT_NIL);
          nir->op1 = nir->op2 = 0;
        }
      }
    } else if (ir->o == IR_LOOP) {
      J->loopref = nref;  /* Needed by assembler. */
    }
  }

  /* Add PHI marks. */
  for (ref = J->cur.nins-1; ref >= REF_FIRST; ref--) {
    IRIns *ir = IR(ref);
    if (ir->o != IR_PHI) break;
    if (!irref_isk(ir->op1)) irt_setphi(IR(ir->op1)->t);
    if (ir->op2 > J->loopref) irt_setphi(IR(ir->op2)->t);
  }

  /* Substitute snapshot maps. */
  oir[nins].prev = J->cur.nins;  /* Substitution for last snapshot. */
  {
    SnapNo i, nsnap = J->cur.nsnap;
    for (i = 0; i < nsnap; i++) {
      SnapShot *snap = &J->cur.snap[i];
      SnapEntry *map = &J->cur.snapmap[snap->mapofs];
      MSize n, nent = snap->nent;
      snap->ref = oir[snap->ref].prev;
      for (n = 0; n < nent; n++) {
        SnapEntry sn = map[n];
        map[n] = ((sn & 0xffff0000) | oir[snap_ref(sn)].prev);
      }
    }
  }
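  /* Note: only loword references are patched in, e.g. a snapshot entry
  ** that referred to old instruction 0102 now refers to its loword
  ** substitution; no hiword entry is needed, presumably because the
  ** hiword can always be found via the immediately following HIOP.
  */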
}

/* Protected callback for split pass. */
static TValue *cpsplit(lua_State *L, lua_CFunction dummy, void *ud)
{
  jit_State *J = (jit_State *)ud;
  split_ir(J);
  UNUSED(L); UNUSED(dummy);
  return NULL;
}
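
/* Note: split_ir() is run via lj_vm_cpcall, since growing the IR or the
** temporary buffer may throw an out-of-memory error, which must reach
** lj_opt_split() in a controlled way (see below).
*/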
308 | |||
309 | #ifdef LUA_USE_ASSERT | ||
310 | /* Slow, but sure way to check whether a SPLIT pass is needed. */ | ||
311 | static int split_needsplit(jit_State *J) | ||
312 | { | ||
313 | IRIns *ir, *irend; | ||
314 | IRRef ref; | ||
315 | for (ir = IR(REF_FIRST), irend = IR(J->cur.nins); ir < irend; ir++) | ||
316 | if (irt_isint64(ir->t)) | ||
317 | return 1; | ||
318 | for (ref = J->chain[IR_CONV]; ref; ref = IR(ref)->prev) | ||
319 | if ((IR(ref)->op2 & IRCONV_SRCMASK) == IRT_I64 || | ||
320 | (IR(ref)->op2 & IRCONV_SRCMASK) == IRT_U64) | ||
321 | return 1; | ||
322 | return 0; /* Nope. */ | ||
323 | } | ||
324 | #endif | ||
325 | |||
326 | /* SPLIT pass. */ | ||
327 | void lj_opt_split(jit_State *J) | ||
328 | { | ||
329 | lua_assert(J->needsplit >= split_needsplit(J)); /* Verify flag. */ | ||
330 | if (J->needsplit) { | ||
331 | int errcode = lj_vm_cpcall(J->L, NULL, J, cpsplit); | ||
332 | if (errcode) { | ||
333 | /* Completely reset the trace to avoid inconsistent dump on abort. */ | ||
334 | J->cur.nins = J->cur.nk = REF_BASE; | ||
335 | J->cur.nsnap = 0; | ||
336 | lj_err_throw(J->L, errcode); /* Propagate errors. */ | ||
337 | } | ||
338 | } | ||
339 | } | ||
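
/* Usage sketch: the trace compiler is expected to invoke this pass once,
** after the other IR optimizations and before assembling the trace. It is
** a no-op unless recording set J->needsplit, i.e. unless any 64 bit IR
** instructions were actually emitted.
*/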
340 | |||
341 | #undef IR | ||
342 | |||
343 | #endif | ||