diff options
author | Mike Pall <mike> | 2011-02-02 02:29:37 +0100 |
---|---|---|
committer | Mike Pall <mike> | 2011-02-02 02:29:37 +0100 |
commit | b613216efc7447dae645d8834e4d6f3185cd1bcc (patch) | |
tree | 0859fed377f00ebeada70ba45d02496b7fb4a249 /src/lj_opt_split.c | |
parent | c539c0cac8f668e66a5ce9e5fd645cb45e3c5063 (diff) | |
download | luajit-b613216efc7447dae645d8834e4d6f3185cd1bcc.tar.gz luajit-b613216efc7447dae645d8834e4d6f3185cd1bcc.tar.bz2 luajit-b613216efc7447dae645d8834e4d6f3185cd1bcc.zip |
Add SPLIT pass to split 64 bit IR instructions for 32 bit CPUs.
Add generic HIOP instruction for extra backend functionality.
Add support for HIOP to x86 backend.
Use POWI for 64 bit integer x^k, too.
POWI is lowered to a call by SPLIT or the x64 backend.
Diffstat (limited to 'src/lj_opt_split.c')
-rw-r--r-- | src/lj_opt_split.c | 343 |
1 files changed, 343 insertions, 0 deletions
diff --git a/src/lj_opt_split.c b/src/lj_opt_split.c new file mode 100644 index 00000000..3cb30514 --- /dev/null +++ b/src/lj_opt_split.c | |||
@@ -0,0 +1,343 @@ | |||
1 | /* | ||
2 | ** SPLIT: Split 64 bit IR instructions into 32 bit IR instructions. | ||
3 | ** Copyright (C) 2005-2011 Mike Pall. See Copyright Notice in luajit.h | ||
4 | */ | ||
5 | |||
6 | #define lj_opt_split_c | ||
7 | #define LUA_CORE | ||
8 | |||
9 | #include "lj_obj.h" | ||
10 | |||
11 | #if LJ_HASJIT && LJ_HASFFI && LJ_32 | ||
12 | |||
13 | #include "lj_err.h" | ||
14 | #include "lj_str.h" | ||
15 | #include "lj_ir.h" | ||
16 | #include "lj_jit.h" | ||
17 | #include "lj_iropt.h" | ||
18 | #include "lj_vm.h" | ||
19 | |||
20 | /* SPLIT pass: | ||
21 | ** | ||
22 | ** This pass splits up 64 bit IR instructions into multiple 32 bit IR | ||
23 | ** instructions. It's only active for 32 bit CPUs which lack native 64 bit | ||
24 | ** operations. The FFI is currently the only emitter for 64 bit | ||
25 | ** instructions, so this pass is disabled if the FFI is disabled. | ||
26 | ** | ||
27 | ** Splitting the IR in a separate pass keeps each 32 bit IR assembler | ||
28 | ** backend simple. Only a small amount of extra functionality needs to be | ||
29 | ** implemented. This is much easier than adding support for allocating | ||
30 | ** register pairs to each backend (believe me, I tried). A few simple, but | ||
31 | ** important optimizations can be performed by the SPLIT pass, which would | ||
32 | ** be tedious to do in the backend. | ||
33 | ** | ||
34 | ** The basic idea is to replace each 64 bit IR instruction with its 32 bit | ||
35 | ** equivalent plus an extra HIOP instruction. The split IR is not passed | ||
36 | ** through FOLD or any other optimizations, so each HIOP is guaranteed to | ||
37 | ** immediately follow its counterpart. The actual functionality of HIOP is | ||
38 | ** inferred from the previous instruction. | ||
39 | ** | ||
40 | ** The operands of HIOP hold the hiword input references. The output of HIOP | ||
41 | ** is the hiword output reference, which is also used to hold the hiword | ||
42 | ** register or spill slot information. The register allocator treats this | ||
43 | ** instruction independently of any other instruction, which improves code | ||
44 | ** quality compared to using fixed register pairs. | ||
45 | ** | ||
46 | ** It's easier to split up some instructions into two regular 32 bit | ||
47 | ** instructions. E.g. XLOAD is split up into two XLOADs with two different | ||
48 | ** addresses. Obviously 64 bit constants need to be split up into two 32 bit | ||
49 | ** constants, too. Some hiword instructions can be entirely omitted, e.g. | ||
50 | ** when zero-extending a 32 bit value to 64 bits. | ||
51 | ** | ||
52 | ** Here's the IR and x64 machine code for 'x.b = x.a + 1' for a struct with | ||
53 | ** two int64_t fields: | ||
54 | ** | ||
55 | ** 0100 p32 ADD base +8 | ||
56 | ** 0101 i64 XLOAD 0100 | ||
57 | ** 0102 i64 ADD 0101 +1 | ||
58 | ** 0103 p32 ADD base +16 | ||
59 | ** 0104 i64 XSTORE 0103 0102 | ||
60 | ** | ||
61 | ** mov rax, [esi+0x8] | ||
62 | ** add rax, +0x01 | ||
63 | ** mov [esi+0x10], rax | ||
64 | ** | ||
65 | ** Here's the transformed IR and the x86 machine code after the SPLIT pass: | ||
66 | ** | ||
67 | ** 0100 p32 ADD base +8 | ||
68 | ** 0101 int XLOAD 0100 | ||
69 | ** 0102 p32 ADD base +12 | ||
70 | ** 0103 int XLOAD 0102 | ||
71 | ** 0104 int ADD 0101 +1 | ||
72 | ** 0105 int HIOP 0103 +0 | ||
73 | ** 0106 p32 ADD base +16 | ||
74 | ** 0107 int XSTORE 0106 0104 | ||
75 | ** 0108 p32 ADD base +20 | ||
76 | ** 0109 int XSTORE 0108 0105 | ||
77 | ** | ||
78 | ** mov eax, [esi+0x8] | ||
79 | ** mov ecx, [esi+0xc] | ||
80 | ** add eax, +0x01 | ||
81 | ** adc ecx, +0x00 | ||
82 | ** mov [esi+0x10], eax | ||
83 | ** mov [esi+0x14], ecx | ||
84 | ** | ||
85 | ** You may notice the reassociated hiword address computation, which is | ||
86 | ** later fused into the mov operands by the assembler. | ||
87 | */ | ||
88 | |||
89 | /* Some local macros to save typing. Undef'd at the end. */ | ||
90 | #define IR(ref) (&J->cur.ir[(ref)]) | ||
91 | |||
92 | /* Directly emit the transformed IR without updating chains etc. */ | ||
93 | static IRRef split_emit(jit_State *J, uint16_t ot, IRRef1 op1, IRRef1 op2) | ||
94 | { | ||
95 | IRRef nref = lj_ir_nextins(J); | ||
96 | IRIns *ir = IR(nref); | ||
97 | ir->ot = ot; | ||
98 | ir->op1 = op1; | ||
99 | ir->op2 = op2; | ||
100 | return nref; | ||
101 | } | ||
102 | |||
/* Emit a CALLN with two split 64 bit arguments.
** The caller has already reserved an instruction slot via lj_ir_nextins();
** it's discarded here (J->cur.nins--) and replaced by the CARG/CALLN chain.
** Each 64 bit operand is passed as two 32 bit halves: loword taken from the
** substitution in oir[].prev, hiword from hisubst[]. The CARG order differs
** by endianness so the halves land in the layout the callee expects.
** Returns the HIOP ref (hiword result); the loword CALLN ref is stored in
** ir->prev as the loword substitution.
*/
static IRRef split_call64(jit_State *J, IRRef1 *hisubst, IRIns *oir,
			  IRIns *ir, IRCallID id)
{
  IRRef tmp, op1 = ir->op1, op2 = ir->op2;
  J->cur.nins--;  /* Drop the instruction pre-emitted by the caller. */
#if LJ_LE
  /* Little-endian: loword first for each argument. */
  tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), oir[op1].prev, hisubst[op1]);
  tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, oir[op2].prev);
  tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, hisubst[op2]);
#else
  /* Big-endian: hiword first for each argument. */
  tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), hisubst[op1], oir[op1].prev);
  tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, hisubst[op2]);
  tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, oir[op2].prev);
#endif
  ir->prev = tmp = split_emit(J, IRTI(IR_CALLN), tmp, id);
  return split_emit(J, IRTI(IR_HIOP), tmp, tmp);
}
121 | |||
122 | /* Get a pointer to the other 32 bit word (LE: hiword, BE: loword). */ | ||
123 | static IRRef split_ptr(jit_State *J, IRRef ref) | ||
124 | { | ||
125 | IRIns *ir = IR(ref); | ||
126 | int32_t ofs = 4; | ||
127 | if (ir->o == IR_ADD && irref_isk(ir->op2)) { /* Reassociate address. */ | ||
128 | ofs += IR(ir->op2)->i; | ||
129 | ref = ir->op1; | ||
130 | if (ofs == 0) return ref; | ||
131 | } | ||
132 | return split_emit(J, IRTI(IR_ADD), ref, lj_ir_kint(J, ofs)); | ||
133 | } | ||
134 | |||
/* Transform the old IR to the new IR.
** Copies the whole IR into a scratch buffer, clears the trace back to
** REF_FIRST, then re-emits every instruction with 64 bit ops split into a
** 32 bit op plus HIOP (or a call). Two substitution tables map old refs to
** new refs: the loword substitution lives in oir[ref].prev, the hiword
** substitution in hisubst[ref]. Finally the snapshot maps are rewritten
** through the loword substitutions.
*/
static void split_ir(jit_State *J)
{
  IRRef nins = J->cur.nins, nk = J->cur.nk;
  MSize irlen = nins - nk;
  /* One IRIns copy plus one IRRef1 hiword-substitution slot per old ref. */
  MSize need = (irlen+1)*(sizeof(IRIns) + sizeof(IRRef1));
  IRIns *oir = (IRIns *)lj_str_needbuf(J->L, &G(J->L)->tmpbuf, need);
  IRRef1 *hisubst;
  IRRef ref;

  /* Copy old IR to buffer. */
  memcpy(oir, IR(nk), irlen*sizeof(IRIns));
  /* Bias hiword substitution table and old IR. Loword kept in field prev. */
  /* After biasing, oir[ref] and hisubst[ref] are indexed by old IRRef. */
  hisubst = (IRRef1 *)&oir[irlen] - nk;
  oir -= nk;

  /* Remove all IR instructions, but retain IR constants. */
  J->cur.nins = REF_FIRST;

  /* Process constants and fixed references. */
  for (ref = nk; ref <= REF_BASE; ref++) {
    IRIns *ir = &oir[ref];
    if (ir->o == IR_KINT64) {  /* Split up 64 bit constant. */
      TValue tv = *ir_k64(ir);
      ir->prev = lj_ir_kint(J, (int32_t)tv.u32.lo);
      hisubst[ref] = lj_ir_kint(J, (int32_t)tv.u32.hi);
    } else {
      ir->prev = (IRRef1)ref;  /* Identity substitution for loword. */
    }
  }

  /* Process old IR instructions. */
  for (ref = REF_FIRST; ref < nins; ref++) {
    IRIns *ir = &oir[ref];
    IRRef nref = lj_ir_nextins(J);
    IRIns *nir = IR(nref);

    /* Copy-substitute old instruction to new instruction. */
    /* Refs below nk are fixed (e.g. REF_BASE) and substitute to themselves. */
    nir->op1 = ir->op1 < nk ? ir->op1 : oir[ir->op1].prev;
    nir->op2 = ir->op2 < nk ? ir->op2 : oir[ir->op2].prev;
    ir->prev = nref;  /* Loword substitution. */
    nir->o = ir->o;
    nir->t.irt = ir->t.irt & ~(IRT_MARK|IRT_ISPHI);  /* PHI marks redone below. */

    /* Split 64 bit instructions. */
    if (irt_isint64(ir->t)) {
      IRRef hi = hisubst[ir->op1];
      nir->t.irt = IRT_INT | (nir->t.irt & IRT_GUARD);  /* Turn into INT op. */
      switch (ir->o) {
      case IR_ADD:
      case IR_SUB:
	/* Use plain op for hiword if loword cannot produce a carry/borrow. */
	if (irref_isk(nir->op2) && IR(nir->op2)->i == 0) {
	  ir->prev = nir->op1;  /* Pass through loword. */
	  nir->op1 = hi; nir->op2 = hisubst[ir->op2];
	  hi = nref;  /* The re-purposed instruction becomes the hiword op. */
	  break;
	}
	/* fallthrough */
      case IR_NEG:
	hi = split_emit(J, IRTI(IR_HIOP), hi, hisubst[ir->op2]);
	break;
      case IR_MUL:
	/* 64 bit multiply is lowered to a runtime call. */
	hi = split_call64(J, hisubst, oir, ir, IRCALL_lj_carith_mul64);
	break;
      case IR_POWI:
	hi = split_call64(J, hisubst, oir, ir,
			  irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 :
					     IRCALL_lj_carith_powu64);
	break;
      case IR_XLOAD:
	/* Second XLOAD from the other 32 bit half of the address. */
	hi = split_emit(J, IRTI(IR_XLOAD), split_ptr(J, nir->op1), ir->op2);
#if LJ_BE
	ir->prev = hi; hi = nref;  /* BE: swap lo/hi roles of the two loads. */
#endif
	break;
      case IR_XSTORE:
#if LJ_LE
	hi = hisubst[ir->op2];
#else
	hi = nir->op2; nir->op2 = hisubst[ir->op2];  /* BE: swap stored halves. */
#endif
	split_emit(J, IRTI(IR_XSTORE), split_ptr(J, nir->op1), hi);
	continue;  /* Stores have no result, so no hiword substitution. */
      case IR_CONV: {  /* Conversion to 64 bit integer. Others handled below. */
	IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK);
	if (st == IRT_NUM || st == IRT_FLOAT) {  /* FP to 64 bit int conv. */
	  hi = split_emit(J, IRTI(IR_HIOP), nir->op1, nref);
	} else if (st == IRT_I64 || st == IRT_U64) {  /* 64/64 bit cast. */
	  /* Drop cast, since assembler doesn't care. */
	  hisubst[ref] = hi;
	  goto fwdlo;
	} else if ((ir->op2 & IRCONV_SEXT)) {  /* Sign-extend to 64 bit. */
	  IRRef k31 = lj_ir_kint(J, 31);
	  nir = IR(nref);  /* May have been reallocated. */
	  ir->prev = nir->op1;  /* Pass through loword. */
	  nir->o = IR_BSAR;  /* hi = bsar(lo, 31). */
	  nir->op2 = k31;
	  hi = nref;
	} else {  /* Zero-extend to 64 bit. */
	  hisubst[ref] = lj_ir_kint(J, 0);
	  goto fwdlo;
	}
	break;
	}
      case IR_PHI: {
	IRRef hi2;
	if ((irref_isk(nir->op1) && irref_isk(nir->op2)) ||
	    nir->op1 == nir->op2)
	  J->cur.nins--;  /* Drop useless PHIs. */
	hi2 = hisubst[ir->op2];
	if (!((irref_isk(hi) && irref_isk(hi2)) || hi == hi2))
	  split_emit(J, IRTI(IR_PHI), hi, hi2);
	continue;  /* Lo/hi PHIs dropped independently; no substitution. */
	}
      default:
	lua_assert(ir->o <= IR_NE);
	split_emit(J, IRTGI(IR_HIOP), hi, hisubst[ir->op2]);  /* Comparisons. */
	continue;
      }
      hisubst[ref] = hi;  /* Store hiword substitution. */
    } else if (ir->o == IR_CONV) {  /* See above, too. */
      IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK);
      if (st == IRT_I64 || st == IRT_U64) {  /* Conversion from 64 bit int. */
	if (irt_isfp(ir->t)) {  /* 64 bit integer to FP conversion. */
	  ir->prev = split_emit(J, IRT(IR_HIOP, irt_type(ir->t)),
				hisubst[ir->op1], nref);
	} else {  /* Truncate to lower 32 bits. */
      fwdlo:
	  ir->prev = nir->op1;  /* Forward loword. */
	  /* Replace with NOP to avoid messing up the snapshot logic. */
	  nir->ot = IRT(IR_NOP, IRT_NIL);
	  nir->op1 = nir->op2 = 0;
	}
      }
    } else if (ir->o == IR_LOOP) {
      J->loopref = nref;  /* Needed by assembler. */
    }
  }

  /* Add PHI marks. */
  for (ref = J->cur.nins-1; ref >= REF_FIRST; ref--) {
    IRIns *ir = IR(ref);
    if (ir->o != IR_PHI) break;  /* PHIs are grouped at the end. */
    if (!irref_isk(ir->op1)) irt_setphi(IR(ir->op1)->t);
    if (ir->op2 > J->loopref) irt_setphi(IR(ir->op2)->t);
  }

  /* Substitute snapshot maps. */
  oir[nins].prev = J->cur.nins;  /* Substitution for last snapshot. */
  {
    SnapNo i, nsnap = J->cur.nsnap;
    for (i = 0; i < nsnap; i++) {
      SnapShot *snap = &J->cur.snap[i];
      SnapEntry *map = &J->cur.snapmap[snap->mapofs];
      MSize n, nent = snap->nent;
      snap->ref = oir[snap->ref].prev;
      for (n = 0; n < nent; n++) {
	SnapEntry sn = map[n];
	/* Keep the upper 16 bits (slot/flags), remap the reference. */
	map[n] = ((sn & 0xffff0000) | oir[snap_ref(sn)].prev);
      }
    }
  }
}
299 | |||
300 | /* Protected callback for split pass. */ | ||
301 | static TValue *cpsplit(lua_State *L, lua_CFunction dummy, void *ud) | ||
302 | { | ||
303 | jit_State *J = (jit_State *)ud; | ||
304 | split_ir(J); | ||
305 | UNUSED(L); UNUSED(dummy); | ||
306 | return NULL; | ||
307 | } | ||
308 | |||
#ifdef LUA_USE_ASSERT
/* Slow, but sure way to check whether a SPLIT pass is needed. */
static int split_needsplit(jit_State *J)
{
  IRRef ref;
  /* Any 64 bit integer instruction requires splitting. */
  for (ref = REF_FIRST; ref < J->cur.nins; ref++)
    if (irt_isint64(IR(ref)->t))
      return 1;
  /* So does any conversion whose source type is a 64 bit integer. */
  for (ref = J->chain[IR_CONV]; ref; ref = IR(ref)->prev) {
    IRType st = (IRType)(IR(ref)->op2 & IRCONV_SRCMASK);
    if (st == IRT_I64 || st == IRT_U64)
      return 1;
  }
  return 0;  /* Nope. */
}
#endif
325 | |||
326 | /* SPLIT pass. */ | ||
327 | void lj_opt_split(jit_State *J) | ||
328 | { | ||
329 | lua_assert(J->needsplit >= split_needsplit(J)); /* Verify flag. */ | ||
330 | if (J->needsplit) { | ||
331 | int errcode = lj_vm_cpcall(J->L, NULL, J, cpsplit); | ||
332 | if (errcode) { | ||
333 | /* Completely reset the trace to avoid inconsistent dump on abort. */ | ||
334 | J->cur.nins = J->cur.nk = REF_BASE; | ||
335 | J->cur.nsnap = 0; | ||
336 | lj_err_throw(J->L, errcode); /* Propagate errors. */ | ||
337 | } | ||
338 | } | ||
339 | } | ||
340 | |||
341 | #undef IR | ||
342 | |||
343 | #endif | ||