path: root/src/lj_opt_split.c
author    Mike Pall <mike>    2011-02-02 02:29:37 +0100
committer Mike Pall <mike>    2011-02-02 02:29:37 +0100
commit    b613216efc7447dae645d8834e4d6f3185cd1bcc (patch)
tree      0859fed377f00ebeada70ba45d02496b7fb4a249 /src/lj_opt_split.c
parent    c539c0cac8f668e66a5ce9e5fd645cb45e3c5063 (diff)
Add SPLIT pass to split 64 bit IR instructions for 32 bit CPUs.
Add generic HIOP instruction for extra backend functionality.
Add support for HIOP to x86 backend.
Use POWI for 64 bit integer x^k, too.
POWI is lowered to a call by SPLIT or the x64 backend.
Diffstat (limited to 'src/lj_opt_split.c')
-rw-r--r--  src/lj_opt_split.c  | 343
1 file changed, 343 insertions, 0 deletions
diff --git a/src/lj_opt_split.c b/src/lj_opt_split.c
new file mode 100644
index 00000000..3cb30514
--- /dev/null
+++ b/src/lj_opt_split.c
@@ -0,0 +1,343 @@
/*
** SPLIT: Split 64 bit IR instructions into 32 bit IR instructions.
** Copyright (C) 2005-2011 Mike Pall. See Copyright Notice in luajit.h
*/

#define lj_opt_split_c
#define LUA_CORE

#include "lj_obj.h"

#if LJ_HASJIT && LJ_HASFFI && LJ_32

#include "lj_err.h"
#include "lj_str.h"
#include "lj_ir.h"
#include "lj_jit.h"
#include "lj_iropt.h"
#include "lj_vm.h"

/* SPLIT pass:
**
** This pass splits up 64 bit IR instructions into multiple 32 bit IR
** instructions. It's only active for 32 bit CPUs which lack native 64 bit
** operations. The FFI is currently the only emitter for 64 bit
** instructions, so this pass is disabled if the FFI is disabled.
**
** Splitting the IR in a separate pass keeps each 32 bit IR assembler
** backend simple. Only a small amount of extra functionality needs to be
** implemented. This is much easier than adding support for allocating
** register pairs to each backend (believe me, I tried). A few simple, but
** important optimizations can be performed by the SPLIT pass, which would
** be tedious to do in the backend.
**
** The basic idea is to replace each 64 bit IR instruction with its 32 bit
** equivalent plus an extra HIOP instruction. The split IR is not passed
** through FOLD or any other optimizations, so each HIOP is guaranteed to
** immediately follow its counterpart. The actual functionality of HIOP is
** inferred from the previous instruction.
**
** The operands of HIOP hold the hiword input references. The output of HIOP
** is the hiword output reference, which is also used to hold the hiword
** register or spill slot information. The register allocator treats this
** instruction independently of any other instruction, which improves code
** quality compared to using fixed register pairs.
**
** It's easier to split up some instructions into two regular 32 bit
** instructions. E.g. XLOAD is split up into two XLOADs with two different
** addresses. Obviously 64 bit constants need to be split up into two 32 bit
** constants, too. Some hiword instructions can be entirely omitted, e.g.
** when zero-extending a 32 bit value to 64 bits.
**
** Here's the IR and x64 machine code for 'x.b = x.a + 1' for a struct with
** two int64_t fields:
**
** 0100    p32 ADD    base  +8
** 0101    i64 XLOAD  0100
** 0102    i64 ADD    0101  +1
** 0103    p32 ADD    base  +16
** 0104    i64 XSTORE 0103  0102
**
**         mov rax, [esi+0x8]
**         add rax, +0x01
**         mov [esi+0x10], rax
**
** Here's the transformed IR and the x86 machine code after the SPLIT pass:
**
** 0100    p32 ADD    base  +8
** 0101    int XLOAD  0100
** 0102    p32 ADD    base  +12
** 0103    int XLOAD  0102
** 0104    int ADD    0101  +1
** 0105    int HIOP   0103  +0
** 0106    p32 ADD    base  +16
** 0107    int XSTORE 0106  0104
** 0108    p32 ADD    base  +20
** 0109    int XSTORE 0108  0105
**
**         mov eax, [esi+0x8]
**         mov ecx, [esi+0xc]
**         add eax, +0x01
**         adc ecx, +0x00
**         mov [esi+0x10], eax
**         mov [esi+0x14], ecx
**
** You may notice the reassociated hiword address computation, which is
** later fused into the mov operands by the assembler.
*/
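
/* Illustrative sketch: because SPLIT guarantees that a HIOP directly
** follows its counterpart, a 32 bit backend can recover its semantics by
** peeking at the previous instruction. The outline below is hypothetical
** (asm_hiop_sketch is not a real function) and is kept inside #if 0; the
** actual dispatch in the x86 backend differs in detail.
*/
#if 0
static void asm_hiop_sketch(ASMState *as, IRIns *ir)
{
  IRIns *lo = ir-1;  /* The counterpart always directly precedes the HIOP. */
  switch (lo->o) {  /* Dispatch on the counterpart's opcode. */
  case IR_ADD:  /* Emit ADC: add hiwords plus the carry from the loword. */
    break;
  case IR_SUB:  /* Emit SBB: subtract hiwords minus the loword borrow. */
    break;
  default:  /* Comparisons, conversions and calls need their own handling. */
    break;
  }
}
#endif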

/* Some local macros to save typing. Undef'd at the end. */
#define IR(ref)  (&J->cur.ir[(ref)])

/* Directly emit the transformed IR without updating chains etc. */
static IRRef split_emit(jit_State *J, uint16_t ot, IRRef1 op1, IRRef1 op2)
{
  IRRef nref = lj_ir_nextins(J);
  IRIns *ir = IR(nref);
  ir->ot = ot;
  ir->op1 = op1;
  ir->op2 = op2;
  return nref;
}
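
/* Note: the ot argument packs opcode and type into one word, as defined by
** the IRT() macro in lj_ir.h (roughly ((o)<<8)|(t)). So IRTI(IR_HIOP)
** requests an int-typed HIOP and IRT(IR_CARG, IRT_NIL) a typeless call
** argument. Chains and FOLD are deliberately bypassed, since the split IR
** must not be reordered (see the comments above).
*/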

/* Emit a CALLN with two split 64 bit arguments. */
static IRRef split_call64(jit_State *J, IRRef1 *hisubst, IRIns *oir,
                          IRIns *ir, IRCallID id)
{
  IRRef tmp, op1 = ir->op1, op2 = ir->op2;
  J->cur.nins--;
#if LJ_LE
  tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), oir[op1].prev, hisubst[op1]);
  tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, oir[op2].prev);
  tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, hisubst[op2]);
#else
  tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), hisubst[op1], oir[op1].prev);
  tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, hisubst[op2]);
  tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, oir[op2].prev);
#endif
  ir->prev = tmp = split_emit(J, IRTI(IR_CALLN), tmp, id);
  return split_emit(J, IRTI(IR_HIOP), tmp, tmp);
}
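
/* For example (little-endian case), a 64 bit multiply a*b becomes a
** chained call with four 32 bit argument halves, roughly:
**
**   nnnn nil CARG  a.lo a.hi
**   nnnn nil CARG  ^    b.lo
**   nnnn nil CARG  ^    b.hi
**   nnnn int CALLN ^    lj_carith_mul64
**   nnnn int HIOP  ^    ^
**
** The CALLN result is the loword of the product; the trailing HIOP stands
** for the hiword returned by the C call.
*/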

/* Get a pointer to the other 32 bit word (LE: hiword, BE: loword). */
static IRRef split_ptr(jit_State *J, IRRef ref)
{
  IRIns *ir = IR(ref);
  int32_t ofs = 4;
  if (ir->o == IR_ADD && irref_isk(ir->op2)) {  /* Reassociate address. */
    ofs += IR(ir->op2)->i;
    ref = ir->op1;
    if (ofs == 0) return ref;
  }
  return split_emit(J, IRTI(IR_ADD), ref, lj_ir_kint(J, ofs));
}
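
/* Example: if ref is '0100 p32 ADD base +8', the hiword address is emitted
** as a single 'ADD base +12' instead of stacking another ADD on top of the
** first one. This reassociation is what produces instruction 0102 in the
** header example above.
*/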

/* Transform the old IR to the new IR. */
static void split_ir(jit_State *J)
{
  IRRef nins = J->cur.nins, nk = J->cur.nk;
  MSize irlen = nins - nk;
  MSize need = (irlen+1)*(sizeof(IRIns) + sizeof(IRRef1));
  IRIns *oir = (IRIns *)lj_str_needbuf(J->L, &G(J->L)->tmpbuf, need);
  IRRef1 *hisubst;
  IRRef ref;

  /* Copy old IR to buffer. */
  memcpy(oir, IR(nk), irlen*sizeof(IRIns));
  /* Bias hiword substitution table and old IR. Loword kept in field prev. */
  hisubst = (IRRef1 *)&oir[irlen] - nk;
  oir -= nk;
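  /* After this biasing, oir[] and hisubst[] are indexed directly with
  ** unbiased IR references in [nk, nins): oir[ref].prev holds the loword
  ** substitute for ref and hisubst[ref] its hiword substitute, with no
  ** per-access offset arithmetic.
  */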

  /* Remove all IR instructions, but retain IR constants. */
  J->cur.nins = REF_FIRST;

  /* Process constants and fixed references. */
  for (ref = nk; ref <= REF_BASE; ref++) {
    IRIns *ir = &oir[ref];
    if (ir->o == IR_KINT64) {  /* Split up 64 bit constant. */
      TValue tv = *ir_k64(ir);
      ir->prev = lj_ir_kint(J, (int32_t)tv.u32.lo);
      hisubst[ref] = lj_ir_kint(J, (int32_t)tv.u32.hi);
    } else {
      ir->prev = (IRRef1)ref;  /* Identity substitution for loword. */
    }
  }
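
  /* Example: a KINT64 constant like 0x0000000100000002 yields a loword
  ** substitute KINT 2 (kept in ir->prev) and a hiword substitute KINT 1
  ** (kept in hisubst[ref]); all other constants substitute to themselves.
  */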

  /* Process old IR instructions. */
  for (ref = REF_FIRST; ref < nins; ref++) {
    IRIns *ir = &oir[ref];
    IRRef nref = lj_ir_nextins(J);
    IRIns *nir = IR(nref);

    /* Copy-substitute old instruction to new instruction. */
    nir->op1 = ir->op1 < nk ? ir->op1 : oir[ir->op1].prev;
    nir->op2 = ir->op2 < nk ? ir->op2 : oir[ir->op2].prev;
    ir->prev = nref;  /* Loword substitution. */
    nir->o = ir->o;
    nir->t.irt = ir->t.irt & ~(IRT_MARK|IRT_ISPHI);

    /* Split 64 bit instructions. */
    if (irt_isint64(ir->t)) {
      IRRef hi = hisubst[ir->op1];
      nir->t.irt = IRT_INT | (nir->t.irt & IRT_GUARD);  /* Turn into INT op. */
      switch (ir->o) {
      case IR_ADD:
      case IR_SUB:
        /* Use plain op for hiword if loword cannot produce a carry/borrow. */
        if (irref_isk(nir->op2) && IR(nir->op2)->i == 0) {
          ir->prev = nir->op1;  /* Pass through loword. */
          nir->op1 = hi; nir->op2 = hisubst[ir->op2];
          hi = nref;
          break;
        }
        /* fallthrough */
      case IR_NEG:
        hi = split_emit(J, IRTI(IR_HIOP), hi, hisubst[ir->op2]);
        break;
      case IR_MUL:
        hi = split_call64(J, hisubst, oir, ir, IRCALL_lj_carith_mul64);
        break;
      case IR_POWI:
        hi = split_call64(J, hisubst, oir, ir,
                          irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 :
                                             IRCALL_lj_carith_powu64);
        break;
      case IR_XLOAD:
        hi = split_emit(J, IRTI(IR_XLOAD), split_ptr(J, nir->op1), ir->op2);
#if LJ_BE
        ir->prev = hi; hi = nref;
#endif
        break;
      case IR_XSTORE:
#if LJ_LE
        hi = hisubst[ir->op2];
#else
        hi = nir->op2; nir->op2 = hisubst[ir->op2];
#endif
        split_emit(J, IRTI(IR_XSTORE), split_ptr(J, nir->op1), hi);
        continue;
      case IR_CONV: {  /* Conversion to 64 bit integer. Others handled below. */
        IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK);
        if (st == IRT_NUM || st == IRT_FLOAT) {  /* FP to 64 bit int conv. */
          hi = split_emit(J, IRTI(IR_HIOP), nir->op1, nref);
        } else if (st == IRT_I64 || st == IRT_U64) {  /* 64/64 bit cast. */
          /* Drop cast, since assembler doesn't care. */
          hisubst[ref] = hi;
          goto fwdlo;
        } else if ((ir->op2 & IRCONV_SEXT)) {  /* Sign-extend to 64 bit. */
          IRRef k31 = lj_ir_kint(J, 31);
          nir = IR(nref);  /* May have been reallocated. */
          ir->prev = nir->op1;  /* Pass through loword. */
          nir->o = IR_BSAR;  /* hi = bsar(lo, 31). */
          nir->op2 = k31;
          hi = nref;
        } else {  /* Zero-extend to 64 bit. */
          hisubst[ref] = lj_ir_kint(J, 0);
          goto fwdlo;
        }
        break;
        }
      case IR_PHI: {
        IRRef hi2;
        if ((irref_isk(nir->op1) && irref_isk(nir->op2)) ||
            nir->op1 == nir->op2)
          J->cur.nins--;  /* Drop useless PHIs. */
        hi2 = hisubst[ir->op2];
        if (!((irref_isk(hi) && irref_isk(hi2)) || hi == hi2))
          split_emit(J, IRTI(IR_PHI), hi, hi2);
        continue;
        }
      default:
        lua_assert(ir->o <= IR_NE);
        split_emit(J, IRTGI(IR_HIOP), hi, hisubst[ir->op2]);  /* Comparisons. */
        continue;
      }
      hisubst[ref] = hi;  /* Store hiword substitution. */
    } else if (ir->o == IR_CONV) {  /* See above, too. */
      IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK);
      if (st == IRT_I64 || st == IRT_U64) {  /* Conversion from 64 bit int. */
        if (irt_isfp(ir->t)) {  /* 64 bit integer to FP conversion. */
          ir->prev = split_emit(J, IRT(IR_HIOP, irt_type(ir->t)),
                                hisubst[ir->op1], nref);
        } else {  /* Truncate to lower 32 bits. */
        fwdlo:
          ir->prev = nir->op1;  /* Forward loword. */
          /* Replace with NOP to avoid messing up the snapshot logic. */
          nir->ot = IRT(IR_NOP, IRT_NIL);
          nir->op1 = nir->op2 = 0;
        }
      }
    } else if (ir->o == IR_LOOP) {
      J->loopref = nref;  /* Needed by assembler. */
    }
  }

  /* Add PHI marks. */
  for (ref = J->cur.nins-1; ref >= REF_FIRST; ref--) {
    IRIns *ir = IR(ref);
    if (ir->o != IR_PHI) break;
    if (!irref_isk(ir->op1)) irt_setphi(IR(ir->op1)->t);
    if (ir->op2 > J->loopref) irt_setphi(IR(ir->op2)->t);
  }

  /* Substitute snapshot maps. */
  oir[nins].prev = J->cur.nins;  /* Substitution for last snapshot. */
  {
    SnapNo i, nsnap = J->cur.nsnap;
    for (i = 0; i < nsnap; i++) {
      SnapShot *snap = &J->cur.snap[i];
      SnapEntry *map = &J->cur.snapmap[snap->mapofs];
      MSize n, nent = snap->nent;
      snap->ref = oir[snap->ref].prev;
      for (n = 0; n < nent; n++) {
        SnapEntry sn = map[n];
        map[n] = ((sn & 0xffff0000) | oir[snap_ref(sn)].prev);
      }
    }
  }
}

/* Protected callback for split pass. */
static TValue *cpsplit(lua_State *L, lua_CFunction dummy, void *ud)
{
  jit_State *J = (jit_State *)ud;
  split_ir(J);
  UNUSED(L); UNUSED(dummy);
  return NULL;
}

#ifdef LUA_USE_ASSERT
/* Slow, but sure way to check whether a SPLIT pass is needed. */
static int split_needsplit(jit_State *J)
{
  IRIns *ir, *irend;
  IRRef ref;
  for (ir = IR(REF_FIRST), irend = IR(J->cur.nins); ir < irend; ir++)
    if (irt_isint64(ir->t))
      return 1;
  for (ref = J->chain[IR_CONV]; ref; ref = IR(ref)->prev)
    if ((IR(ref)->op2 & IRCONV_SRCMASK) == IRT_I64 ||
        (IR(ref)->op2 & IRCONV_SRCMASK) == IRT_U64)
      return 1;
  return 0;  /* Nope. */
}
#endif

/* SPLIT pass. */
void lj_opt_split(jit_State *J)
{
  lua_assert(J->needsplit >= split_needsplit(J));  /* Verify flag. */
  if (J->needsplit) {
    int errcode = lj_vm_cpcall(J->L, NULL, J, cpsplit);
    if (errcode) {
      /* Completely reset the trace to avoid inconsistent dump on abort. */
      J->cur.nins = J->cur.nk = REF_BASE;
      J->cur.nsnap = 0;
      lj_err_throw(J->L, errcode);  /* Propagate errors. */
    }
  }
}
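
/* Context (an assumption about the surrounding codebase, for orientation):
** J->needsplit is expected to be set by the IR emitters whenever they
** create 64 bit ops, and the trace compiler is expected to run this pass
** once per trace, after the other optimizations, so the backend only ever
** sees split IR.
*/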

#undef IR

#endif