Diffstat (limited to 'src/lj_asm.c')
-rw-r--r-- | src/lj_asm.c | 3324
1 file changed, 3324 insertions, 0 deletions
diff --git a/src/lj_asm.c b/src/lj_asm.c
new file mode 100644
index 00000000..b89b8543
--- /dev/null
+++ b/src/lj_asm.c
@@ -0,0 +1,3324 @@
1 | /* | ||
2 | ** IR assembler (SSA IR -> machine code). | ||
3 | ** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h | ||
4 | */ | ||
5 | |||
6 | #define lj_asm_c | ||
7 | #define LUA_CORE | ||
8 | |||
9 | #include "lj_obj.h" | ||
10 | |||
11 | #if LJ_HASJIT | ||
12 | |||
13 | #include "lj_gc.h" | ||
14 | #include "lj_str.h" | ||
15 | #include "lj_tab.h" | ||
16 | #include "lj_ir.h" | ||
17 | #include "lj_jit.h" | ||
18 | #include "lj_iropt.h" | ||
19 | #include "lj_mcode.h" | ||
20 | #include "lj_iropt.h" | ||
21 | #include "lj_trace.h" | ||
22 | #include "lj_snap.h" | ||
23 | #include "lj_asm.h" | ||
24 | #include "lj_dispatch.h" | ||
25 | #include "lj_vm.h" | ||
26 | #include "lj_target.h" | ||
27 | |||
28 | /* -- Assembler state and common macros ----------------------------------- */ | ||
29 | |||
30 | /* Assembler state. */ | ||
31 | typedef struct ASMState { | ||
32 | RegCost cost[RID_MAX]; /* Reference and blended allocation cost for regs. */ | ||
33 | |||
34 | MCode *mcp; /* Current MCode pointer (grows down). */ | ||
35 | MCode *mclim; /* Lower limit for MCode memory + red zone. */ | ||
36 | |||
37 | IRIns *ir; /* Copy of pointer to IR instructions/constants. */ | ||
38 | jit_State *J; /* JIT compiler state. */ | ||
39 | |||
40 | x86ModRM mrm; /* Fused x86 address operand. */ | ||
41 | |||
42 | RegSet freeset; /* Set of free registers. */ | ||
43 | RegSet modset; /* Set of registers modified inside the loop. */ | ||
44 | RegSet phiset; /* Set of PHI registers. */ | ||
45 | |||
46 | uint32_t flags; /* Copy of JIT compiler flags. */ | ||
47 | int loopinv; /* Loop branch inversion (0:no, 1:yes, 2:yes+CC_P). */ | ||
48 | |||
49 | int32_t evenspill; /* Next even spill slot. */ | ||
50 | int32_t oddspill; /* Next odd spill slot (or 0). */ | ||
51 | |||
52 | IRRef curins; /* Reference of current instruction. */ | ||
53 | IRRef stopins; /* Stop assembly before hitting this instruction. */ | ||
54 | IRRef orignins; /* Original T->nins. */ | ||
55 | |||
56 | IRRef snapref; /* Current snapshot is active after this reference. */ | ||
57 | IRRef snaprename; /* Rename highwater mark for snapshot check. */ | ||
58 | SnapNo snapno; /* Current snapshot number. */ | ||
59 | SnapNo loopsnapno; /* Loop snapshot number. */ | ||
60 | |||
61 | Trace *T; /* Trace to assemble. */ | ||
62 | Trace *parent; /* Parent trace (or NULL). */ | ||
63 | |||
64 | IRRef fuseref; /* Fusion limit (loopref, 0 or FUSE_DISABLED). */ | ||
65 | IRRef sectref; /* Section base reference (loopref or 0). */ | ||
66 | IRRef loopref; /* Reference of LOOP instruction (or 0). */ | ||
67 | |||
68 | BCReg topslot; /* Number of slots for stack check (unless 0). */ | ||
69 | MSize gcsteps; /* Accumulated number of GC steps (per section). */ | ||
70 | |||
71 | MCode *mcbot; /* Bottom of reserved MCode. */ | ||
72 | MCode *mctop; /* Top of generated MCode. */ | ||
73 | MCode *mcloop; /* Pointer to loop MCode (or NULL). */ | ||
74 | MCode *invmcp; /* Points to invertible loop branch (or NULL). */ | ||
75 | MCode *testmcp; /* Pending opportunity to remove test r,r. */ | ||
76 | MCode *realign; /* Realign loop if not NULL. */ | ||
77 | |||
78 | IRRef1 phireg[RID_MAX]; /* PHI register references. */ | ||
79 | uint16_t parentmap[LJ_MAX_JSLOTS]; /* Parent slot to RegSP map. */ | ||
80 | } ASMState; | ||
81 | |||
82 | #define IR(ref) (&as->ir[(ref)]) | ||
83 | |||
84 | /* Check for variant to invariant references. */ | ||
85 | #define iscrossref(as, ref) ((ref) < as->sectref) | ||
86 | |||
87 | /* Inhibit memory op fusion from variant to invariant references. */ | ||
88 | #define FUSE_DISABLED (~(IRRef)0) | ||
89 | #define mayfuse(as, ref) ((ref) > as->fuseref) | ||
90 | #define neverfuse(as) (as->fuseref == FUSE_DISABLED) | ||
91 | #define opisfusableload(o) \ | ||
92 | ((o) == IR_ALOAD || (o) == IR_HLOAD || (o) == IR_ULOAD || \ | ||
93 | (o) == IR_FLOAD || (o) == IR_SLOAD || (o) == IR_XLOAD) | ||
94 | |||
95 | /* Instruction selection for XMM moves. */ | ||
96 | #define XMM_MOVRR(as) ((as->flags & JIT_F_SPLIT_XMM) ? XO_MOVSD : XO_MOVAPS) | ||
97 | #define XMM_MOVRM(as) ((as->flags & JIT_F_SPLIT_XMM) ? XO_MOVLPD : XO_MOVSD) | ||
98 | |||
99 | /* Sparse limit checks using a red zone before the actual limit. */ | ||
100 | #define MCLIM_REDZONE 64 | ||
101 | #define checkmclim(as) \ | ||
102 | if (LJ_UNLIKELY(as->mcp < as->mclim)) asm_mclimit(as) | ||
103 | |||
104 | static LJ_NORET LJ_NOINLINE void asm_mclimit(ASMState *as) | ||
105 | { | ||
106 | lj_mcode_limiterr(as->J, (size_t)(as->mctop - as->mcp + 4*MCLIM_REDZONE)); | ||
107 | } | ||
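Editor's note (not part of the patch), summarizing how the sparse limit check above is meant to be used:

/* Machine code is emitted bottom-up: as->mcp starts at as->mctop and moves
** toward as->mcbot. The individual emit_* helpers never bounds-check.
** Instead, checkmclim() is placed at points where only a bounded amount of
** code can have been emitted since the last check, and the MCLIM_REDZONE
** bytes of slack absorb that bound. Only when mcp dips below mclim does
** asm_mclimit() report the overflow through lj_mcode_limiterr().
*/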
108 | |||
109 | /* -- Emit x86 instructions ----------------------------------------------- */ | ||
110 | |||
111 | #define MODRM(mode, r1, r2) ((MCode)((mode)+(((r1)&7)<<3)+((r2)&7))) | ||
112 | |||
113 | #if LJ_64 | ||
114 | #define REXRB(p, rr, rb) \ | ||
115 | { MCode rex = 0x40 + (((rr)>>1)&4) + (((rb)>>3)&1); \ | ||
116 | if (rex != 0x40) *--(p) = rex; } | ||
117 | #define FORCE_REX 0x200 | ||
118 | #else | ||
119 | #define REXRB(p, rr, rb) ((void)0) | ||
120 | #define FORCE_REX 0 | ||
121 | #endif | ||
122 | |||
123 | #define emit_i8(as, i) (*--as->mcp = (MCode)(i)) | ||
124 | #define emit_i32(as, i) (*(int32_t *)(as->mcp-4) = (i), as->mcp -= 4) | ||
125 | |||
126 | #define emit_x87op(as, xo) \ | ||
127 | (*(uint16_t *)(as->mcp-2) = (uint16_t)(xo), as->mcp -= 2) | ||
128 | |||
129 | /* op */ | ||
130 | static LJ_AINLINE MCode *emit_op(x86Op xo, Reg rr, Reg rb, Reg rx, | ||
131 | MCode *p, int delta) | ||
132 | { | ||
133 | int n = (int8_t)xo; | ||
134 | #if defined(__GNUC__) | ||
135 | if (__builtin_constant_p(xo) && n == -2) | ||
136 | p[delta-2] = (MCode)(xo >> 24); | ||
137 | else if (__builtin_constant_p(xo) && n == -3) | ||
138 | *(uint16_t *)(p+delta-3) = (uint16_t)(xo >> 16); | ||
139 | else | ||
140 | #endif | ||
141 | *(uint32_t *)(p+delta-5) = (uint32_t)xo; | ||
142 | p += n + delta; | ||
143 | #if LJ_64 | ||
144 | { | ||
145 | uint32_t rex = 0x40 + ((rr>>1)&(4+(FORCE_REX>>1)))+((rx>>2)&2)+((rb>>3)&1); | ||
146 | if (rex != 0x40) { | ||
147 | if (n == -4) { *p = (MCode)rex; rex = (MCode)(xo >> 8); } | ||
148 | *--p = (MCode)rex; | ||
149 | } | ||
150 | } | ||
151 | #else | ||
152 | UNUSED(rr); UNUSED(rb); UNUSED(rx); | ||
153 | #endif | ||
154 | return p; | ||
155 | } | ||
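As a reading aid (editor's sketch, not part of the patch): the MODRM() macro above packs the three ModRM fields into one byte, and emit_op() prepends an optional REX prefix on x64. Assuming XM_REG expands to the usual register-direct mode value 0xC0, a standalone equivalent of the packing looks like this:

#include <stdio.h>
#include <stdint.h>

/* Standalone mirror of MODRM(): mode in bits 7-6, reg in bits 5-3, r/m in 2-0. */
static uint8_t modrm(uint8_t mode, uint8_t reg, uint8_t rm)
{
  return (uint8_t)(mode + ((reg & 7) << 3) + (rm & 7));
}

int main(void)
{
  /* With opcode 0x8B (mov r32, r/m32) this byte selects ECX as the
  ** destination (reg=1) and EDX as the source (r/m=2), i.e. "mov ecx, edx".
  */
  printf("ModRM = %02x\n", (unsigned)modrm(0xC0 /* assumed XM_REG */, 1, 2));  /* 0xca */
  return 0;
}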
156 | |||
157 | /* op + modrm */ | ||
158 | #define emit_opm(xo, mode, rr, rb, p, delta) \ | ||
159 | (p[(delta)-1] = MODRM((mode), (rr), (rb)), \ | ||
160 | emit_op((xo), (rr), (rb), 0, (p), (delta))) | ||
161 | |||
162 | /* op + modrm + sib */ | ||
163 | #define emit_opmx(xo, mode, scale, rr, rb, rx, p) \ | ||
164 | (p[-1] = MODRM((scale), (rx), (rb)), \ | ||
165 | p[-2] = MODRM((mode), (rr), RID_ESP), \ | ||
166 | emit_op((xo), (rr), (rb), (rx), (p), -1)) | ||
167 | |||
168 | /* op r1, r2 */ | ||
169 | static void emit_rr(ASMState *as, x86Op xo, Reg r1, Reg r2) | ||
170 | { | ||
171 | MCode *p = as->mcp; | ||
172 | as->mcp = emit_opm(xo, XM_REG, r1, r2, p, 0); | ||
173 | } | ||
174 | |||
175 | #if LJ_64 && defined(LUA_USE_ASSERT) | ||
176 | /* [addr] is sign-extended in x64 and must be in lower 2G (not 4G). */ | ||
177 | static int32_t ptr2addr(void *p) | ||
178 | { | ||
179 | lua_assert((uintptr_t)p < (uintptr_t)0x80000000); | ||
180 | return i32ptr(p); | ||
181 | } | ||
182 | #else | ||
183 | #define ptr2addr(p) (i32ptr((p))) | ||
184 | #endif | ||
185 | |||
186 | /* op r, [addr] */ | ||
187 | static void emit_rma(ASMState *as, x86Op xo, Reg rr, const void *addr) | ||
188 | { | ||
189 | MCode *p = as->mcp; | ||
190 | *(int32_t *)(p-4) = ptr2addr(addr); | ||
191 | #if LJ_64 | ||
192 | p[-5] = MODRM(XM_SCALE1, RID_ESP, RID_EBP); | ||
193 | as->mcp = emit_opm(xo, XM_OFS0, rr, RID_ESP, p, -5); | ||
194 | #else | ||
195 | as->mcp = emit_opm(xo, XM_OFS0, rr, RID_EBP, p, -4); | ||
196 | #endif | ||
197 | } | ||
198 | |||
199 | /* op r, [base+ofs] */ | ||
200 | static void emit_rmro(ASMState *as, x86Op xo, Reg rr, Reg rb, int32_t ofs) | ||
201 | { | ||
202 | MCode *p = as->mcp; | ||
203 | x86Mode mode; | ||
204 | if (ra_hasreg(rb)) { | ||
205 | if (ofs == 0 && (rb&7) != RID_EBP) { | ||
206 | mode = XM_OFS0; | ||
207 | } else if (checki8(ofs)) { | ||
208 | *--p = (MCode)ofs; | ||
209 | mode = XM_OFS8; | ||
210 | } else { | ||
211 | p -= 4; | ||
212 | *(int32_t *)p = ofs; | ||
213 | mode = XM_OFS32; | ||
214 | } | ||
215 | if ((rb&7) == RID_ESP) | ||
216 | *--p = MODRM(XM_SCALE1, RID_ESP, RID_ESP); | ||
217 | } else { | ||
218 | *(int32_t *)(p-4) = ofs; | ||
219 | #if LJ_64 | ||
220 | p[-5] = MODRM(XM_SCALE1, RID_ESP, RID_EBP); | ||
221 | p -= 5; | ||
222 | rb = RID_ESP; | ||
223 | #else | ||
224 | p -= 4; | ||
225 | rb = RID_EBP; | ||
226 | #endif | ||
227 | mode = XM_OFS0; | ||
228 | } | ||
229 | as->mcp = emit_opm(xo, mode, rr, rb, p, 0); | ||
230 | } | ||
231 | |||
232 | /* op r, [base+idx*scale+ofs] */ | ||
233 | static void emit_rmrxo(ASMState *as, x86Op xo, Reg rr, Reg rb, Reg rx, | ||
234 | x86Mode scale, int32_t ofs) | ||
235 | { | ||
236 | MCode *p = as->mcp; | ||
237 | x86Mode mode; | ||
238 | if (ofs == 0 && (rb&7) != RID_EBP) { | ||
239 | mode = XM_OFS0; | ||
240 | } else if (checki8(ofs)) { | ||
241 | mode = XM_OFS8; | ||
242 | *--p = (MCode)ofs; | ||
243 | } else { | ||
244 | mode = XM_OFS32; | ||
245 | p -= 4; | ||
246 | *(int32_t *)p = ofs; | ||
247 | } | ||
248 | as->mcp = emit_opmx(xo, mode, scale, rr, rb, rx, p); | ||
249 | } | ||
250 | |||
251 | /* op r, i */ | ||
252 | static void emit_gri(ASMState *as, x86Group xg, Reg rb, int32_t i) | ||
253 | { | ||
254 | MCode *p = as->mcp; | ||
255 | if (checki8(i)) { | ||
256 | p -= 3; | ||
257 | p[2] = (MCode)i; | ||
258 | p[0] = (MCode)(xg >> 16); | ||
259 | } else { | ||
260 | p -= 6; | ||
261 | *(int32_t *)(p+2) = i; | ||
262 | p[0] = (MCode)(xg >> 8); | ||
263 | } | ||
264 | p[1] = MODRM(XM_REG, xg, rb); | ||
265 | REXRB(p, 0, rb); | ||
266 | as->mcp = p; | ||
267 | } | ||
268 | |||
269 | /* op [base+ofs], i */ | ||
270 | static void emit_gmroi(ASMState *as, x86Group xg, Reg rb, int32_t ofs, | ||
271 | int32_t i) | ||
272 | { | ||
273 | x86Op xo; | ||
274 | if (checki8(i)) { | ||
275 | emit_i8(as, i); | ||
276 | xo = (x86Op)(((xg >> 16) << 24)+0xfe); | ||
277 | } else { | ||
278 | emit_i32(as, i); | ||
279 | xo = (x86Op)(((xg >> 8) << 24)+0xfe); | ||
280 | } | ||
281 | emit_rmro(as, xo, (Reg)xg, rb, ofs); | ||
282 | } | ||
283 | |||
284 | #define emit_shifti(as, xg, r, i) \ | ||
285 | (emit_i8(as, (i)), emit_rr(as, XO_SHIFTi, (Reg)(xg), (r))) | ||
286 | |||
287 | /* op r, rm/mrm */ | ||
288 | static void emit_mrm(ASMState *as, x86Op xo, Reg rr, Reg rb) | ||
289 | { | ||
290 | MCode *p = as->mcp; | ||
291 | x86Mode mode = XM_REG; | ||
292 | if (rb == RID_MRM) { | ||
293 | rb = as->mrm.base; | ||
294 | if (rb == RID_NONE) { | ||
295 | rb = RID_EBP; | ||
296 | mode = XM_OFS0; | ||
297 | p -= 4; | ||
298 | *(int32_t *)p = as->mrm.ofs; | ||
299 | if (as->mrm.idx != RID_NONE) | ||
300 | goto mrmidx; | ||
301 | #if LJ_64 | ||
302 | *--p = MODRM(XM_SCALE1, RID_ESP, RID_EBP); | ||
303 | rb = RID_ESP; | ||
304 | #endif | ||
305 | } else { | ||
306 | if (as->mrm.ofs == 0 && (rb&7) != RID_EBP) { | ||
307 | mode = XM_OFS0; | ||
308 | } else if (checki8(as->mrm.ofs)) { | ||
309 | *--p = (MCode)as->mrm.ofs; | ||
310 | mode = XM_OFS8; | ||
311 | } else { | ||
312 | p -= 4; | ||
313 | *(int32_t *)p = as->mrm.ofs; | ||
314 | mode = XM_OFS32; | ||
315 | } | ||
316 | if (as->mrm.idx != RID_NONE) { | ||
317 | mrmidx: | ||
318 | as->mcp = emit_opmx(xo, mode, as->mrm.scale, rr, rb, as->mrm.idx, p); | ||
319 | return; | ||
320 | } | ||
321 | if ((rb&7) == RID_ESP) | ||
322 | *--p = MODRM(XM_SCALE1, RID_ESP, RID_ESP); | ||
323 | } | ||
324 | } | ||
325 | as->mcp = emit_opm(xo, mode, rr, rb, p, 0); | ||
326 | } | ||
327 | |||
328 | static void emit_addptr(ASMState *as, Reg r, int32_t ofs) | ||
329 | { | ||
330 | if (ofs) { | ||
331 | if ((as->flags & JIT_F_LEA_AGU)) | ||
332 | emit_rmro(as, XO_LEA, r, r, ofs); | ||
333 | else | ||
334 | emit_gri(as, XG_ARITHi(XOg_ADD), r, ofs); | ||
335 | } | ||
336 | } | ||
337 | |||
338 | /* -- Emit moves ---------------------------------------------------------- */ | ||
339 | |||
340 | /* Generic move between two regs. */ | ||
341 | static void emit_movrr(ASMState *as, Reg r1, Reg r2) | ||
342 | { | ||
343 | emit_rr(as, r1 < RID_MAX_GPR ? XO_MOV : XMM_MOVRR(as), r1, r2); | ||
344 | } | ||
345 | |||
346 | /* Generic move from [base+ofs]. */ | ||
347 | static void emit_movrmro(ASMState *as, Reg rr, Reg rb, int32_t ofs) | ||
348 | { | ||
349 | emit_rmro(as, rr < RID_MAX_GPR ? XO_MOV : XMM_MOVRM(as), rr, rb, ofs); | ||
350 | } | ||
351 | |||
352 | /* mov [base+ofs], i */ | ||
353 | static void emit_movmroi(ASMState *as, Reg base, int32_t ofs, int32_t i) | ||
354 | { | ||
355 | emit_i32(as, i); | ||
356 | emit_rmro(as, XO_MOVmi, 0, base, ofs); | ||
357 | } | ||
358 | |||
359 | /* mov [base+ofs], r */ | ||
360 | #define emit_movtomro(as, r, base, ofs) \ | ||
361 | emit_rmro(as, XO_MOVto, (r), (base), (ofs)) | ||
362 | |||
363 | /* Get/set global_State fields. */ | ||
364 | #define emit_opgl(as, xo, r, field) \ | ||
365 | emit_rma(as, (xo), (r), (void *)&J2G(as->J)->field) | ||
366 | #define emit_getgl(as, r, field) emit_opgl(as, XO_MOV, (r), field) | ||
367 | #define emit_setgl(as, r, field) emit_opgl(as, XO_MOVto, (r), field) | ||
368 | #define emit_setgli(as, field, i) \ | ||
369 | (emit_i32(as, i), emit_opgl(as, XO_MOVmi, 0, field)) | ||
370 | |||
371 | /* mov r, i / xor r, r */ | ||
372 | static void emit_loadi(ASMState *as, Reg r, int32_t i) | ||
373 | { | ||
374 | if (i == 0) { | ||
375 | emit_rr(as, XO_ARITH(XOg_XOR), r, r); | ||
376 | } else { | ||
377 | MCode *p = as->mcp; | ||
378 | *(int32_t *)(p-4) = i; | ||
379 | p[-5] = (MCode)(XI_MOVri+(r&7)); | ||
380 | p -= 5; | ||
381 | REXRB(p, 0, r); | ||
382 | as->mcp = p; | ||
383 | } | ||
384 | } | ||
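Editor's note on the i == 0 special case above (encodings assumed, worth checking against the x86 manual):

/* "xor r32, r32" encodes in 2 bytes (31 /r or 33 /r) versus 5 bytes for
** "mov r32, imm32" (B8+r id), and modern x86 cores recognize the xor form
** as a dependency-breaking zeroing idiom, so it is preferred whenever the
** immediate is zero.
*/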
385 | |||
386 | /* mov r, addr */ | ||
387 | #define emit_loada(as, r, addr) \ | ||
388 | emit_loadi(as, (r), ptr2addr((addr))) | ||
389 | |||
390 | /* movsd r, [&tv->n] / xorps r, r */ | ||
391 | static void emit_loadn(ASMState *as, Reg r, cTValue *tv) | ||
392 | { | ||
393 | if (tvispzero(tv)) /* Use xor only for +0. */ | ||
394 | emit_rr(as, XO_XORPS, r, r); | ||
395 | else | ||
396 | emit_rma(as, XMM_MOVRM(as), r, &tv->n); | ||
397 | } | ||
398 | |||
399 | /* -- Emit branches ------------------------------------------------------- */ | ||
400 | |||
401 | /* Label for short jumps. */ | ||
402 | typedef MCode *MCLabel; | ||
403 | |||
404 | /* jcc short target */ | ||
405 | static void emit_sjcc(ASMState *as, int cc, MCLabel target) | ||
406 | { | ||
407 | MCode *p = as->mcp; | ||
408 | p[-1] = (MCode)(int8_t)(target-p); | ||
409 | p[-2] = (MCode)(XI_JCCs+(cc&15)); | ||
410 | as->mcp = p - 2; | ||
411 | } | ||
412 | |||
413 | /* jcc short (pending target) */ | ||
414 | static MCLabel emit_sjcc_label(ASMState *as, int cc) | ||
415 | { | ||
416 | MCode *p = as->mcp; | ||
417 | p[-1] = 0; | ||
418 | p[-2] = (MCode)(XI_JCCs+(cc&15)); | ||
419 | as->mcp = p - 2; | ||
420 | return p; | ||
421 | } | ||
422 | |||
423 | /* Fixup jcc short target. */ | ||
424 | static void emit_sfixup(ASMState *as, MCLabel source) | ||
425 | { | ||
426 | source[-1] = (MCode)(as->mcp-source); | ||
427 | } | ||
428 | |||
429 | /* Return label pointing to current PC. */ | ||
430 | #define emit_label(as) ((as)->mcp) | ||
431 | |||
432 | /* jcc target */ | ||
433 | static void emit_jcc(ASMState *as, int cc, MCode *target) | ||
434 | { | ||
435 | MCode *p = as->mcp; | ||
436 | int32_t addr = (int32_t)(target - p); | ||
437 | *(int32_t *)(p-4) = addr; | ||
438 | p[-5] = (MCode)(XI_JCCn+(cc&15)); | ||
439 | p[-6] = 0x0f; | ||
440 | as->mcp = p - 6; | ||
441 | } | ||
442 | |||
443 | /* call target */ | ||
444 | static void emit_call_(ASMState *as, MCode *target) | ||
445 | { | ||
446 | MCode *p = as->mcp; | ||
447 | *(int32_t *)(p-4) = (int32_t)(target - p); | ||
448 | p[-5] = XI_CALL; | ||
449 | as->mcp = p - 5; | ||
450 | } | ||
451 | |||
452 | #define emit_call(as, f) emit_call_(as, (MCode *)(void *)(f)) | ||
453 | |||
454 | /* Argument setup for C calls. Up to 3 args need no stack adjustment. */ | ||
455 | #define emit_setargr(as, narg, r) \ | ||
456 | emit_movtomro(as, (r), RID_ESP, ((narg)-1)*4); | ||
457 | #define emit_setargi(as, narg, imm) \ | ||
458 | emit_movmroi(as, RID_ESP, ((narg)-1)*4, (imm)) | ||
459 | #define emit_setargp(as, narg, ptr) \ | ||
460 | emit_setargi(as, (narg), ptr2addr((ptr))) | ||
461 | |||
462 | /* -- Register allocator debugging ---------------------------------------- */ | ||
463 | |||
464 | /* #define LUAJIT_DEBUG_RA */ | ||
465 | |||
466 | #ifdef LUAJIT_DEBUG_RA | ||
467 | |||
468 | #include <stdio.h> | ||
469 | #include <stdarg.h> | ||
470 | |||
471 | #define RIDNAME(name) #name, | ||
472 | static const char *const ra_regname[] = { | ||
473 | GPRDEF(RIDNAME) | ||
474 | FPRDEF(RIDNAME) | ||
475 | "mrm", | ||
476 | NULL | ||
477 | }; | ||
478 | #undef RIDNAME | ||
479 | |||
480 | static char ra_dbg_buf[65536]; | ||
481 | static char *ra_dbg_p; | ||
482 | static char *ra_dbg_merge; | ||
483 | static MCode *ra_dbg_mcp; | ||
484 | |||
485 | static void ra_dstart(void) | ||
486 | { | ||
487 | ra_dbg_p = ra_dbg_buf; | ||
488 | ra_dbg_merge = NULL; | ||
489 | ra_dbg_mcp = NULL; | ||
490 | } | ||
491 | |||
492 | static void ra_dflush(void) | ||
493 | { | ||
494 | fwrite(ra_dbg_buf, 1, (size_t)(ra_dbg_p-ra_dbg_buf), stdout); | ||
495 | ra_dstart(); | ||
496 | } | ||
497 | |||
498 | static void ra_dprintf(ASMState *as, const char *fmt, ...) | ||
499 | { | ||
500 | char *p; | ||
501 | va_list argp; | ||
502 | va_start(argp, fmt); | ||
503 | p = ra_dbg_mcp == as->mcp ? ra_dbg_merge : ra_dbg_p; | ||
504 | ra_dbg_mcp = NULL; | ||
505 | p += sprintf(p, "%08x \e[36m%04d ", (uintptr_t)as->mcp, as->curins-REF_BIAS); | ||
506 | for (;;) { | ||
507 | const char *e = strchr(fmt, '$'); | ||
508 | if (e == NULL) break; | ||
509 | memcpy(p, fmt, (size_t)(e-fmt)); | ||
510 | p += e-fmt; | ||
511 | if (e[1] == 'r') { | ||
512 | Reg r = va_arg(argp, Reg) & RID_MASK; | ||
513 | if (r <= RID_MAX) { | ||
514 | const char *q; | ||
515 | for (q = ra_regname[r]; *q; q++) | ||
516 | *p++ = *q >= 'A' && *q <= 'Z' ? *q + 0x20 : *q; | ||
517 | } else { | ||
518 | *p++ = '?'; | ||
519 | lua_assert(0); | ||
520 | } | ||
521 | } else if (e[1] == 'f' || e[1] == 'i') { | ||
522 | IRRef ref; | ||
523 | if (e[1] == 'f') | ||
524 | ref = va_arg(argp, IRRef); | ||
525 | else | ||
526 | ref = va_arg(argp, IRIns *) - as->ir; | ||
527 | if (ref >= REF_BIAS) | ||
528 | p += sprintf(p, "%04d", ref - REF_BIAS); | ||
529 | else | ||
530 | p += sprintf(p, "K%03d", REF_BIAS - ref); | ||
531 | } else if (e[1] == 's') { | ||
532 | uint32_t slot = va_arg(argp, uint32_t); | ||
533 | p += sprintf(p, "[esp+0x%x]", sps_scale(slot)); | ||
534 | } else { | ||
535 | lua_assert(0); | ||
536 | } | ||
537 | fmt = e+2; | ||
538 | } | ||
539 | va_end(argp); | ||
540 | while (*fmt) | ||
541 | *p++ = *fmt++; | ||
542 | *p++ = '\e'; *p++ = '['; *p++ = 'm'; *p++ = '\n'; | ||
543 | if (p > ra_dbg_buf+sizeof(ra_dbg_buf)-256) { | ||
544 | fwrite(ra_dbg_buf, 1, (size_t)(p-ra_dbg_buf), stdout); | ||
545 | p = ra_dbg_buf; | ||
546 | } | ||
547 | ra_dbg_p = p; | ||
548 | } | ||
549 | |||
550 | #define RA_DBG_START() ra_dstart() | ||
551 | #define RA_DBG_FLUSH() ra_dflush() | ||
552 | #define RA_DBG_REF() \ | ||
553 | do { char *_p = ra_dbg_p; ra_dprintf(as, ""); \ | ||
554 | ra_dbg_merge = _p; ra_dbg_mcp = as->mcp; } while (0) | ||
555 | #define RA_DBGX(x) ra_dprintf x | ||
556 | |||
557 | #else | ||
558 | #define RA_DBG_START() ((void)0) | ||
559 | #define RA_DBG_FLUSH() ((void)0) | ||
560 | #define RA_DBG_REF() ((void)0) | ||
561 | #define RA_DBGX(x) ((void)0) | ||
562 | #endif | ||
563 | |||
564 | /* -- Register allocator -------------------------------------------------- */ | ||
565 | |||
566 | #define ra_free(as, r) rset_set(as->freeset, (r)) | ||
567 | #define ra_modified(as, r) rset_set(as->modset, (r)) | ||
568 | |||
569 | #define ra_used(ir) (ra_hasreg((ir)->r) || ra_hasspill((ir)->s)) | ||
570 | |||
571 | /* Setup register allocator. */ | ||
572 | static void ra_setup(ASMState *as) | ||
573 | { | ||
574 | /* Initially all regs (except the stack pointer) are free for use. */ | ||
575 | as->freeset = RSET_ALL; | ||
576 | as->modset = RSET_EMPTY; | ||
577 | as->phiset = RSET_EMPTY; | ||
578 | memset(as->phireg, 0, sizeof(as->phireg)); | ||
579 | memset(as->cost, 0, sizeof(as->cost)); | ||
580 | as->cost[RID_ESP] = REGCOST(~0u, 0u); | ||
581 | |||
582 | /* Start slots for spill slot allocation. */ | ||
583 | as->evenspill = (SPS_FIRST+1)&~1; | ||
584 | as->oddspill = (SPS_FIRST&1) ? SPS_FIRST : 0; | ||
585 | } | ||
586 | |||
587 | /* Rematerialize constants. */ | ||
588 | static Reg ra_rematk(ASMState *as, IRIns *ir) | ||
589 | { | ||
590 | Reg r = ir->r; | ||
591 | lua_assert(ra_hasreg(r) && !ra_hasspill(ir->s)); | ||
592 | ra_free(as, r); | ||
593 | ra_modified(as, r); | ||
594 | ir->r = RID_INIT; /* Do not keep any hint. */ | ||
595 | RA_DBGX((as, "remat $i $r", ir, r)); | ||
596 | if (ir->o == IR_KNUM) { | ||
597 | emit_loadn(as, r, ir_knum(ir)); | ||
598 | } else if (ir->o == IR_BASE) { | ||
599 | ra_sethint(ir->r, RID_BASE); /* Restore BASE register hint. */ | ||
600 | emit_getgl(as, r, jit_base); | ||
601 | } else { | ||
602 | lua_assert(ir->o == IR_KINT || ir->o == IR_KGC || | ||
603 | ir->o == IR_KPTR || ir->o == IR_KNULL); | ||
604 | emit_loadi(as, r, ir->i); | ||
605 | } | ||
606 | return r; | ||
607 | } | ||
608 | |||
609 | /* Force a spill. Allocate a new spill slot if needed. */ | ||
610 | static int32_t ra_spill(ASMState *as, IRIns *ir) | ||
611 | { | ||
612 | int32_t slot = ir->s; | ||
613 | if (!ra_hasspill(slot)) { | ||
614 | if (irt_isnum(ir->t)) { | ||
615 | slot = as->evenspill; | ||
616 | as->evenspill += 2; | ||
617 | } else if (as->oddspill) { | ||
618 | slot = as->oddspill; | ||
619 | as->oddspill = 0; | ||
620 | } else { | ||
621 | slot = as->evenspill; | ||
622 | as->oddspill = slot+1; | ||
623 | as->evenspill += 2; | ||
624 | } | ||
625 | if (as->evenspill > 256) | ||
626 | lj_trace_err(as->J, LJ_TRERR_SPILLOV); | ||
627 | ir->s = (uint8_t)slot; | ||
628 | } | ||
629 | return sps_scale(slot); | ||
630 | } | ||
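To make the even/odd bookkeeping above concrete, here is a small standalone simulation of the same policy (editor's sketch; slot numbers are raw 4-byte slot indices before sps_scale(), and the starting values only approximate the SPS_FIRST setup in ra_setup()):

#include <stdio.h>

/* Mirrors ra_spill()'s slot policy: doubles take an aligned pair of 4-byte
** slots starting at an even index; 4-byte values first reuse the odd slot
** left over from the last pair, otherwise they open a new pair.
*/
typedef struct { int evenspill, oddspill; } SpillSim;

static int sim_spill(SpillSim *s, int isnum)
{
  int slot;
  if (isnum) {                /* 8-byte value: take a full aligned pair. */
    slot = s->evenspill; s->evenspill += 2;
  } else if (s->oddspill) {   /* Reuse the odd slot left by a previous value. */
    slot = s->oddspill; s->oddspill = 0;
  } else {                    /* Open a new pair, remember its odd half. */
    slot = s->evenspill; s->oddspill = slot + 1; s->evenspill += 2;
  }
  return slot;
}

int main(void)
{
  SpillSim s = { 2, 0 };  /* Assumed SPS_FIRST-like starting point. */
  printf("%d %d %d %d\n",
         sim_spill(&s, 1),   /* double -> slot 2 (uses 2+3)      */
         sim_spill(&s, 0),   /* int    -> slot 4, odd slot 5 kept */
         sim_spill(&s, 0),   /* int    -> slot 5 (reuses odd)     */
         sim_spill(&s, 1));  /* double -> slot 6 (uses 6+7)       */
  return 0;
}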
631 | |||
632 | /* Restore a register (marked as free). Rematerialize or force a spill. */ | ||
633 | static Reg ra_restore(ASMState *as, IRRef ref) | ||
634 | { | ||
635 | IRIns *ir = IR(ref); | ||
636 | if (irref_isk(ref) || ref == REF_BASE) { | ||
637 | return ra_rematk(as, ir); | ||
638 | } else { | ||
639 | Reg r = ir->r; | ||
640 | lua_assert(ra_hasreg(r)); | ||
641 | ra_free(as, r); | ||
642 | ra_modified(as, r); | ||
643 | ra_sethint(ir->r, r); /* Keep hint. */ | ||
644 | RA_DBGX((as, "restore $i $r", ir, r)); | ||
645 | emit_movrmro(as, r, RID_ESP, ra_spill(as, ir)); /* Force a spill. */ | ||
646 | return r; | ||
647 | } | ||
648 | } | ||
649 | |||
650 | /* Save a register to a spill slot. */ | ||
651 | static LJ_AINLINE void ra_save(ASMState *as, IRIns *ir, Reg r) | ||
652 | { | ||
653 | RA_DBGX((as, "save $i $r", ir, r)); | ||
654 | emit_rmro(as, r < RID_MAX_GPR ? XO_MOVto : XO_MOVSDto, | ||
655 | r, RID_ESP, sps_scale(ir->s)); | ||
656 | } | ||
657 | |||
658 | #define MINCOST(r) \ | ||
659 | if (LJ_LIKELY(allow&RID2RSET(r)) && as->cost[r] < cost) \ | ||
660 | cost = as->cost[r] | ||
661 | |||
662 | /* Evict the register with the lowest cost, forcing a restore. */ | ||
663 | static Reg ra_evict(ASMState *as, RegSet allow) | ||
664 | { | ||
665 | RegCost cost = ~(RegCost)0; | ||
666 | if (allow < RID2RSET(RID_MAX_GPR)) { | ||
667 | MINCOST(RID_EAX);MINCOST(RID_ECX);MINCOST(RID_EDX);MINCOST(RID_EBX); | ||
668 | MINCOST(RID_EBP);MINCOST(RID_ESI);MINCOST(RID_EDI); | ||
669 | #if LJ_64 | ||
670 | MINCOST(RID_R8D);MINCOST(RID_R9D);MINCOST(RID_R10D);MINCOST(RID_R11D); | ||
671 | MINCOST(RID_R12D);MINCOST(RID_R13D);MINCOST(RID_R14D);MINCOST(RID_R15D); | ||
672 | #endif | ||
673 | } else { | ||
674 | MINCOST(RID_XMM0);MINCOST(RID_XMM1);MINCOST(RID_XMM2);MINCOST(RID_XMM3); | ||
675 | MINCOST(RID_XMM4);MINCOST(RID_XMM5);MINCOST(RID_XMM6);MINCOST(RID_XMM7); | ||
676 | #if LJ_64 | ||
677 | MINCOST(RID_XMM8);MINCOST(RID_XMM9);MINCOST(RID_XMM10);MINCOST(RID_XMM11); | ||
678 | MINCOST(RID_XMM12);MINCOST(RID_XMM13);MINCOST(RID_XMM14);MINCOST(RID_XMM15); | ||
679 | #endif | ||
680 | } | ||
681 | lua_assert(allow != RSET_EMPTY); | ||
682 | lua_assert(regcost_ref(cost) >= as->T->nk && regcost_ref(cost) < as->T->nins); | ||
683 | return ra_restore(as, regcost_ref(cost)); | ||
684 | } | ||
685 | |||
686 | /* Pick any register (marked as free). Evict on-demand. */ | ||
687 | static LJ_AINLINE Reg ra_pick(ASMState *as, RegSet allow) | ||
688 | { | ||
689 | RegSet pick = as->freeset & allow; | ||
690 | if (!pick) | ||
691 | return ra_evict(as, allow); | ||
692 | else | ||
693 | return rset_picktop(pick); | ||
694 | } | ||
695 | |||
696 | /* Get a scratch register (marked as free). */ | ||
697 | static LJ_AINLINE Reg ra_scratch(ASMState *as, RegSet allow) | ||
698 | { | ||
699 | Reg r = ra_pick(as, allow); | ||
700 | ra_modified(as, r); | ||
701 | RA_DBGX((as, "scratch $r", r)); | ||
702 | return r; | ||
703 | } | ||
704 | |||
705 | /* Evict all registers from a set (if not free). */ | ||
706 | static void ra_evictset(ASMState *as, RegSet drop) | ||
707 | { | ||
708 | as->modset |= drop; | ||
709 | drop &= ~as->freeset; | ||
710 | while (drop) { | ||
711 | Reg r = rset_picktop(drop); | ||
712 | ra_restore(as, regcost_ref(as->cost[r])); | ||
713 | rset_clear(drop, r); | ||
714 | checkmclim(as); | ||
715 | } | ||
716 | } | ||
717 | |||
718 | /* Allocate a register for ref from the allowed set of registers. | ||
719 | ** Note: this function assumes the ref does NOT have a register yet! | ||
720 | ** Picks an optimal register, sets the cost and marks the register as non-free. | ||
721 | */ | ||
722 | static Reg ra_allocref(ASMState *as, IRRef ref, RegSet allow) | ||
723 | { | ||
724 | IRIns *ir = IR(ref); | ||
725 | RegSet pick = as->freeset & allow; | ||
726 | Reg r; | ||
727 | lua_assert(ra_noreg(ir->r)); | ||
728 | if (pick) { | ||
729 | /* First check register hint from propagation or PHI. */ | ||
730 | if (ra_hashint(ir->r)) { | ||
731 | r = ra_gethint(ir->r); | ||
732 | if (rset_test(pick, r)) /* Use hint register if possible. */ | ||
733 | goto found; | ||
734 | /* Rematerialization is cheaper than missing a hint. */ | ||
735 | if (rset_test(allow, r) && irref_isk(regcost_ref(as->cost[r]))) { | ||
736 | ra_rematk(as, IR(regcost_ref(as->cost[r]))); | ||
737 | goto found; | ||
738 | } | ||
739 | RA_DBGX((as, "hintmiss $f $r", ref, r)); | ||
740 | } | ||
741 | /* Invariants should preferably get unused registers. */ | ||
742 | if (ref < as->loopref && !irt_isphi(ir->t)) | ||
743 | r = rset_pickbot(pick); | ||
744 | else | ||
745 | r = rset_picktop(pick); | ||
746 | } else { | ||
747 | r = ra_evict(as, allow); | ||
748 | } | ||
749 | found: | ||
750 | RA_DBGX((as, "alloc $f $r", ref, r)); | ||
751 | ir->r = (uint8_t)r; | ||
752 | rset_clear(as->freeset, r); | ||
753 | as->cost[r] = REGCOST_REF_T(ref, irt_t(ir->t)); | ||
754 | return r; | ||
755 | } | ||
756 | |||
757 | /* Allocate a register on-demand. */ | ||
758 | static LJ_INLINE Reg ra_alloc1(ASMState *as, IRRef ref, RegSet allow) | ||
759 | { | ||
760 | Reg r = IR(ref)->r; | ||
761 | /* Note: allow is ignored if the register is already allocated. */ | ||
762 | if (ra_noreg(r)) r = ra_allocref(as, ref, allow); | ||
763 | return r; | ||
764 | } | ||
765 | |||
766 | /* Rename register allocation and emit move. */ | ||
767 | static void ra_rename(ASMState *as, Reg down, Reg up) | ||
768 | { | ||
769 | IRRef ren, ref = regcost_ref(as->cost[up] = as->cost[down]); | ||
770 | IR(ref)->r = (uint8_t)up; | ||
771 | as->cost[down] = 0; | ||
772 | lua_assert((down < RID_MAX_GPR) == (up < RID_MAX_GPR)); | ||
773 | lua_assert(!rset_test(as->freeset, down) && rset_test(as->freeset, up)); | ||
774 | rset_set(as->freeset, down); /* 'down' is free ... */ | ||
775 | rset_clear(as->freeset, up); /* ... and 'up' is now allocated. */ | ||
776 | RA_DBGX((as, "rename $f $r $r", regcost_ref(as->cost[up]), down, up)); | ||
777 | emit_movrr(as, down, up); /* Backwards code generation needs inverse move. */ | ||
778 | if (!ra_hasspill(IR(ref)->s)) { /* Add the rename to the IR. */ | ||
779 | lj_ir_set(as->J, IRT(IR_RENAME, IRT_NIL), ref, as->snapno); | ||
780 | ren = tref_ref(lj_ir_emit(as->J)); | ||
781 | as->ir = as->T->ir; /* The IR may have been reallocated. */ | ||
782 | IR(ren)->r = (uint8_t)down; | ||
783 | IR(ren)->s = SPS_NONE; | ||
784 | } | ||
785 | } | ||
786 | |||
787 | /* Pick a destination register (marked as free). | ||
788 | ** Caveat: allow is ignored if there's already a destination register. | ||
789 | ** Use ra_destreg() to get a specific register. | ||
790 | */ | ||
791 | static Reg ra_dest(ASMState *as, IRIns *ir, RegSet allow) | ||
792 | { | ||
793 | Reg dest = ir->r; | ||
794 | if (ra_hasreg(dest)) { | ||
795 | ra_free(as, dest); | ||
796 | ra_modified(as, dest); | ||
797 | } else { | ||
798 | dest = ra_scratch(as, allow); | ||
799 | } | ||
800 | if (LJ_UNLIKELY(ra_hasspill(ir->s))) ra_save(as, ir, dest); | ||
801 | return dest; | ||
802 | } | ||
803 | |||
804 | /* Force a specific destination register (marked as free). */ | ||
805 | static void ra_destreg(ASMState *as, IRIns *ir, Reg r) | ||
806 | { | ||
807 | Reg dest = ra_dest(as, ir, RID2RSET(r)); | ||
808 | if (dest != r) { | ||
809 | ra_scratch(as, RID2RSET(r)); | ||
810 | emit_movrr(as, dest, r); | ||
811 | } | ||
812 | } | ||
813 | |||
814 | /* Propagate dest register to left reference. Emit moves as needed. | ||
815 | ** This is a required fixup step for all 2-operand machine instructions. | ||
816 | */ | ||
817 | static void ra_left(ASMState *as, Reg dest, IRRef lref) | ||
818 | { | ||
819 | IRIns *ir = IR(lref); | ||
820 | Reg left = ir->r; | ||
821 | if (ra_noreg(left)) { | ||
822 | if (irref_isk(lref)) { | ||
823 | if (ir->o == IR_KNUM) { | ||
824 | cTValue *tv = ir_knum(ir); | ||
825 | /* FP remat needs a load except for +0. Still better than eviction. */ | ||
826 | if (tvispzero(tv) || !(as->freeset & RSET_FPR)) { | ||
827 | emit_loadn(as, dest, tv); | ||
828 | return; | ||
829 | } | ||
830 | } else { | ||
831 | lua_assert(ir->o == IR_KINT || ir->o == IR_KGC || | ||
832 | ir->o == IR_KPTR || ir->o == IR_KNULL); | ||
833 | emit_loadi(as, dest, ir->i); | ||
834 | return; | ||
835 | } | ||
836 | } | ||
837 | if (!ra_hashint(left) && !iscrossref(as, lref)) | ||
838 | ra_sethint(ir->r, dest); /* Propagate register hint. */ | ||
839 | left = ra_allocref(as, lref, dest < RID_MAX_GPR ? RSET_GPR : RSET_FPR); | ||
840 | } | ||
841 | /* Move needed for true 3-operand instruction: y=a+b ==> y=a; y+=b. */ | ||
842 | if (dest != left) { | ||
843 | /* Use register renaming if dest is the PHI reg. */ | ||
844 | if (irt_isphi(ir->t) && as->phireg[dest] == lref) { | ||
845 | ra_modified(as, left); | ||
846 | ra_rename(as, left, dest); | ||
847 | } else { | ||
848 | emit_movrr(as, dest, left); | ||
849 | } | ||
850 | } | ||
851 | } | ||
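One way to read the fixup above (editor's note): x86 arithmetic is two-operand and code is generated bottom-up, so the consumer's opcode is already in place when ra_left() runs, and the move it emits lands in front of that opcode in the final instruction stream.

/* Editor's sketch of the final code for IR "y = a + b" when the destination
** and the left operand end up in different registers (register names are
** purely illustrative):
**   mov eax, ecx        ; emitted last by ra_left(), so it executes first
**   add eax, edx        ; emitted first by the 2-operand consumer
*/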
852 | |||
853 | /* -- Exit stubs ---------------------------------------------------------- */ | ||
854 | |||
855 | /* Generate an exit stub group at the bottom of the reserved MCode memory. */ | ||
856 | static MCode *asm_exitstub_gen(ASMState *as, ExitNo group) | ||
857 | { | ||
858 | ExitNo i, groupofs = (group*EXITSTUBS_PER_GROUP) & 0xff; | ||
859 | MCode *mxp = as->mcbot; | ||
860 | MCode *mxpstart = mxp; | ||
861 | if (mxp + (2+2)*EXITSTUBS_PER_GROUP+8+5 >= as->mctop) | ||
862 | asm_mclimit(as); | ||
863 | /* Push low byte of exitno for each exit stub. */ | ||
864 | *mxp++ = XI_PUSHi8; *mxp++ = (MCode)groupofs; | ||
865 | for (i = 1; i < EXITSTUBS_PER_GROUP; i++) { | ||
866 | *mxp++ = XI_JMPs; *mxp++ = (MCode)((2+2)*(EXITSTUBS_PER_GROUP - i) - 2); | ||
867 | *mxp++ = XI_PUSHi8; *mxp++ = (MCode)(groupofs + i); | ||
868 | } | ||
869 | /* Push the high byte of the exitno for each exit stub group. */ | ||
870 | *mxp++ = XI_PUSHi8; *mxp++ = (MCode)((group*EXITSTUBS_PER_GROUP)>>8); | ||
871 | /* Store DISPATCH in ExitInfo->dispatch. Account for the two push ops. */ | ||
872 | *mxp++ = XI_MOVmi; | ||
873 | *mxp++ = MODRM(XM_OFS8, 0, RID_ESP); | ||
874 | *mxp++ = MODRM(XM_SCALE1, RID_ESP, RID_ESP); | ||
875 | *mxp++ = 2*sizeof(void *); | ||
876 | *(int32_t *)mxp = ptr2addr(J2GG(as->J)->dispatch); mxp += 4; | ||
877 | /* Jump to exit handler which fills in the ExitState. */ | ||
878 | *mxp++ = XI_JMP; mxp += 4; | ||
879 | *((int32_t *)(mxp-4)) = (int32_t)((MCode *)lj_vm_exit_handler - mxp); | ||
880 | /* Commit the code for this group (even if assembly fails later on). */ | ||
881 | lj_mcode_commitbot(as->J, mxp); | ||
882 | as->mcbot = mxp; | ||
883 | as->mclim = as->mcbot + MCLIM_REDZONE; | ||
884 | return mxpstart; | ||
885 | } | ||
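For orientation, the bytes produced by the loop above look roughly like this (editor's sketch; sizes assume the 2-byte "push imm8" and "jmp rel8" encodings used here):

/* One exit stub group (EXITSTUBS_PER_GROUP entries), sketched:
**   push  groupofs+0      ; entry for exit 0, 2 bytes
**   jmp   tail            ; 2 bytes, skips the remaining push/jmp pairs
**   push  groupofs+1      ; entry for exit 1
**   jmp   tail
**   ...
**   push  groupofs+N-1    ; last entry falls through into the tail
** tail:
**   push  high byte of the group's exit number   ; shared by the whole group
**   mov   dword [esp + 2*sizeof(void *)], DISPATCH
**   jmp   lj_vm_exit_handler
*/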
886 | |||
887 | /* Setup all needed exit stubs. */ | ||
888 | static void asm_exitstub_setup(ASMState *as, ExitNo nexits) | ||
889 | { | ||
890 | ExitNo i; | ||
891 | if (nexits >= EXITSTUBS_PER_GROUP*LJ_MAX_EXITSTUBGR) | ||
892 | lj_trace_err(as->J, LJ_TRERR_SNAPOV); | ||
893 | for (i = 0; i < (nexits+EXITSTUBS_PER_GROUP-1)/EXITSTUBS_PER_GROUP; i++) | ||
894 | if (as->J->exitstubgroup[i] == NULL) | ||
895 | as->J->exitstubgroup[i] = asm_exitstub_gen(as, i); | ||
896 | } | ||
897 | |||
898 | /* -- Snapshot and guard handling ----------------------------------------- */ | ||
899 | |||
900 | /* Can we rematerialize a KNUM instead of forcing a spill? */ | ||
901 | static int asm_snap_canremat(ASMState *as) | ||
902 | { | ||
903 | Reg r; | ||
904 | for (r = RID_MIN_FPR; r < RID_MAX_FPR; r++) | ||
905 | if (irref_isk(regcost_ref(as->cost[r]))) | ||
906 | return 1; | ||
907 | return 0; | ||
908 | } | ||
909 | |||
910 | /* Allocate registers or spill slots for refs escaping to a snapshot. */ | ||
911 | static void asm_snap_alloc(ASMState *as) | ||
912 | { | ||
913 | SnapShot *snap = &as->T->snap[as->snapno]; | ||
914 | IRRef2 *map = &as->T->snapmap[snap->mapofs]; | ||
915 | BCReg s, nslots = snap->nslots; | ||
916 | for (s = 0; s < nslots; s++) { | ||
917 | IRRef ref = snap_ref(map[s]); | ||
918 | if (!irref_isk(ref)) { | ||
919 | IRIns *ir = IR(ref); | ||
920 | if (!ra_used(ir) && ir->o != IR_FRAME) { | ||
921 | RegSet allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR; | ||
922 | /* Not a var-to-invar ref and got a free register (or a remat)? */ | ||
923 | if ((!iscrossref(as, ref) || irt_isphi(ir->t)) && | ||
924 | ((as->freeset & allow) || | ||
925 | (allow == RSET_FPR && asm_snap_canremat(as)))) { | ||
926 | ra_allocref(as, ref, allow); /* Allocate a register. */ | ||
927 | checkmclim(as); | ||
928 | RA_DBGX((as, "snapreg $f $r", ref, ir->r)); | ||
929 | } else { | ||
930 | ra_spill(as, ir); /* Otherwise force a spill slot. */ | ||
931 | RA_DBGX((as, "snapspill $f $s", ref, ir->s)); | ||
932 | } | ||
933 | } | ||
934 | } | ||
935 | } | ||
936 | } | ||
937 | |||
938 | /* All guards for a snapshot use the same exitno. This is currently the | ||
939 | ** same as the snapshot number. Since the exact origin of the exit cannot | ||
940 | ** be determined, all guards for the same snapshot must exit with the same | ||
941 | ** RegSP mapping. | ||
942 | ** A renamed ref which has been used in a prior guard for the same snapshot | ||
943 | ** would cause an inconsistency. The easy way out is to force a spill slot. | ||
944 | */ | ||
945 | static int asm_snap_checkrename(ASMState *as, IRRef ren) | ||
946 | { | ||
947 | SnapShot *snap = &as->T->snap[as->snapno]; | ||
948 | IRRef2 *map = &as->T->snapmap[snap->mapofs]; | ||
949 | BCReg s, nslots = snap->nslots; | ||
950 | for (s = 0; s < nslots; s++) { | ||
951 | IRRef ref = snap_ref(map[s]); | ||
952 | if (ref == ren) { | ||
953 | IRIns *ir = IR(ref); | ||
954 | ra_spill(as, ir); /* Register renamed, so force a spill slot. */ | ||
955 | RA_DBGX((as, "snaprensp $f $s", ref, ir->s)); | ||
956 | return 1; /* Found. */ | ||
957 | } | ||
958 | } | ||
959 | return 0; /* Not found. */ | ||
960 | } | ||
961 | |||
962 | /* Prepare snapshot for next guard instruction. */ | ||
963 | static void asm_snap_prep(ASMState *as) | ||
964 | { | ||
965 | if (as->curins < as->snapref) { | ||
966 | do { | ||
967 | lua_assert(as->snapno != 0); | ||
968 | as->snapno--; | ||
969 | as->snapref = as->T->snap[as->snapno].ref; | ||
970 | } while (as->curins < as->snapref); | ||
971 | asm_snap_alloc(as); | ||
972 | as->snaprename = as->T->nins; | ||
973 | } else { | ||
974 | /* Process any renames above the highwater mark. */ | ||
975 | for (; as->snaprename < as->T->nins; as->snaprename++) { | ||
976 | IRIns *ir = IR(as->snaprename); | ||
977 | if (asm_snap_checkrename(as, ir->op1)) | ||
978 | ir->op2 = REF_BIAS-1; /* Kill rename. */ | ||
979 | } | ||
980 | } | ||
981 | } | ||
982 | |||
983 | /* Emit conditional branch to exit for guard. | ||
984 | ** It's important to emit this *after* all registers have been allocated, | ||
985 | ** because rematerializations may invalidate the flags. | ||
986 | */ | ||
987 | static void asm_guardcc(ASMState *as, int cc) | ||
988 | { | ||
989 | MCode *target = exitstub_addr(as->J, as->snapno); | ||
990 | MCode *p = as->mcp; | ||
991 | if (LJ_UNLIKELY(p == as->invmcp)) { | ||
992 | as->loopinv = 1; | ||
993 | *(int32_t *)(p+1) = target - (p+5); | ||
994 | target = p; | ||
995 | cc ^= 1; | ||
996 | if (as->realign) { | ||
997 | emit_sjcc(as, cc, target); | ||
998 | return; | ||
999 | } | ||
1000 | } | ||
1001 | emit_jcc(as, cc, target); | ||
1002 | } | ||
1003 | |||
1004 | /* -- Memory operand fusion ----------------------------------------------- */ | ||
1005 | |||
1006 | /* Arch-specific field offsets. */ | ||
1007 | static const uint8_t field_ofs[IRFL__MAX+1] = { | ||
1008 | #define FLOFS(name, type, field) (uint8_t)offsetof(type, field), | ||
1009 | IRFLDEF(FLOFS) | ||
1010 | #undef FLOFS | ||
1011 | 0 | ||
1012 | }; | ||
1013 | |||
1014 | /* Limit linear search to this distance. Avoids O(n^2) behavior. */ | ||
1015 | #define CONFLICT_SEARCH_LIM 15 | ||
1016 | |||
1017 | /* Check if there's no conflicting instruction between curins and ref. */ | ||
1018 | static int noconflict(ASMState *as, IRRef ref, IROp conflict) | ||
1019 | { | ||
1020 | IRIns *ir = as->ir; | ||
1021 | IRRef i = as->curins; | ||
1022 | if (i > ref + CONFLICT_SEARCH_LIM) | ||
1023 | return 0; /* Give up, ref is too far away. */ | ||
1024 | while (--i > ref) | ||
1025 | if (ir[i].o == conflict) | ||
1026 | return 0; /* Conflict found. */ | ||
1027 | return 1; /* Ok, no conflict. */ | ||
1028 | } | ||
1029 | |||
1030 | /* Fuse array reference into memory operand. */ | ||
1031 | static void asm_fusearef(ASMState *as, IRIns *ir, RegSet allow) | ||
1032 | { | ||
1033 | IRIns *irb = IR(ir->op1); | ||
1034 | IRIns *ira, *irx; | ||
1035 | lua_assert(ir->o == IR_AREF); | ||
1036 | lua_assert(irb->o == IR_FLOAD && irb->op2 == IRFL_TAB_ARRAY); | ||
1037 | ira = IR(irb->op1); | ||
1038 | if (ira->o == IR_TNEW && ira->op1 <= LJ_MAX_COLOSIZE && | ||
1039 | noconflict(as, irb->op1, IR_NEWREF)) { | ||
1040 | /* We can avoid the FLOAD of t->array for colocated arrays. */ | ||
1041 | as->mrm.base = (uint8_t)ra_alloc1(as, irb->op1, allow); /* Table obj. */ | ||
1042 | as->mrm.ofs = -(int32_t)(ira->op1*sizeof(TValue)); /* Ofs to colo array. */ | ||
1043 | } else { | ||
1044 | as->mrm.base = (uint8_t)ra_alloc1(as, ir->op1, allow); /* Array base. */ | ||
1045 | as->mrm.ofs = 0; | ||
1046 | } | ||
1047 | irx = IR(ir->op2); | ||
1048 | if (irref_isk(ir->op2)) { | ||
1049 | as->mrm.ofs += 8*irx->i; | ||
1050 | as->mrm.idx = RID_NONE; | ||
1051 | } else { | ||
1052 | rset_clear(allow, as->mrm.base); | ||
1053 | as->mrm.scale = XM_SCALE8; | ||
1054 | /* Fuse a constant ADD (e.g. t[i+1]) into the offset. | ||
1055 | ** Doesn't help much without ABCelim, but reduces register pressure. | ||
1056 | */ | ||
1057 | if (mayfuse(as, ir->op2) && ra_noreg(irx->r) && | ||
1058 | irx->o == IR_ADD && irref_isk(irx->op2)) { | ||
1059 | as->mrm.ofs += 8*IR(irx->op2)->i; | ||
1060 | as->mrm.idx = (uint8_t)ra_alloc1(as, irx->op1, allow); | ||
1061 | } else { | ||
1062 | as->mrm.idx = (uint8_t)ra_alloc1(as, ir->op2, allow); | ||
1063 | } | ||
1064 | } | ||
1065 | } | ||
1066 | |||
1067 | /* Fuse array/hash/upvalue reference into memory operand. | ||
1068 | ** Caveat: this may allocate GPRs for the base/idx registers. Be sure to | ||
1069 | ** pass the final allow mask, excluding any GPRs used for other inputs. | ||
1070 | ** In particular: 2-operand GPR instructions need to call ra_dest() first! | ||
1071 | */ | ||
1072 | static void asm_fuseahuref(ASMState *as, IRRef ref, RegSet allow) | ||
1073 | { | ||
1074 | IRIns *ir = IR(ref); | ||
1075 | if (ra_noreg(ir->r)) { | ||
1076 | switch ((IROp)ir->o) { | ||
1077 | case IR_AREF: | ||
1078 | if (mayfuse(as, ref)) { | ||
1079 | asm_fusearef(as, ir, allow); | ||
1080 | return; | ||
1081 | } | ||
1082 | break; | ||
1083 | case IR_HREFK: | ||
1084 | if (mayfuse(as, ref)) { | ||
1085 | as->mrm.base = (uint8_t)ra_alloc1(as, ir->op1, allow); | ||
1086 | as->mrm.ofs = (int32_t)(IR(ir->op2)->op2 * sizeof(Node)); | ||
1087 | as->mrm.idx = RID_NONE; | ||
1088 | return; | ||
1089 | } | ||
1090 | break; | ||
1091 | case IR_UREFC: | ||
1092 | if (irref_isk(ir->op1)) { | ||
1093 | GCfunc *fn = ir_kfunc(IR(ir->op1)); | ||
1094 | GCupval *uv = &gcref(fn->l.uvptr[ir->op2])->uv; | ||
1095 | as->mrm.ofs = ptr2addr(&uv->tv); | ||
1096 | as->mrm.base = as->mrm.idx = RID_NONE; | ||
1097 | return; | ||
1098 | } | ||
1099 | break; | ||
1100 | default: | ||
1101 | lua_assert(ir->o == IR_HREF || ir->o == IR_NEWREF || ir->o == IR_UREFO); | ||
1102 | break; | ||
1103 | } | ||
1104 | } | ||
1105 | as->mrm.base = (uint8_t)ra_alloc1(as, ref, allow); | ||
1106 | as->mrm.ofs = 0; | ||
1107 | as->mrm.idx = RID_NONE; | ||
1108 | } | ||
1109 | |||
1110 | /* Fuse FLOAD/FREF reference into memory operand. */ | ||
1111 | static void asm_fusefref(ASMState *as, IRIns *ir, RegSet allow) | ||
1112 | { | ||
1113 | lua_assert(ir->o == IR_FLOAD || ir->o == IR_FREF); | ||
1114 | as->mrm.ofs = field_ofs[ir->op2]; | ||
1115 | as->mrm.idx = RID_NONE; | ||
1116 | if (irref_isk(ir->op1)) { | ||
1117 | as->mrm.ofs += IR(ir->op1)->i; | ||
1118 | as->mrm.base = RID_NONE; | ||
1119 | } else { | ||
1120 | as->mrm.base = (uint8_t)ra_alloc1(as, ir->op1, allow); | ||
1121 | } | ||
1122 | } | ||
1123 | |||
1124 | /* Fuse string reference into memory operand. */ | ||
1125 | static void asm_fusestrref(ASMState *as, IRIns *ir, RegSet allow) | ||
1126 | { | ||
1127 | IRIns *irr; | ||
1128 | lua_assert(ir->o == IR_STRREF); | ||
1129 | as->mrm.idx = as->mrm.base = RID_NONE; | ||
1130 | as->mrm.scale = XM_SCALE1; | ||
1131 | as->mrm.ofs = sizeof(GCstr); | ||
1132 | if (irref_isk(ir->op1)) { | ||
1133 | as->mrm.ofs += IR(ir->op1)->i; | ||
1134 | } else { | ||
1135 | Reg r = ra_alloc1(as, ir->op1, allow); | ||
1136 | rset_clear(allow, r); | ||
1137 | as->mrm.base = (uint8_t)r; | ||
1138 | } | ||
1139 | irr = IR(ir->op2); | ||
1140 | if (irref_isk(ir->op2)) { | ||
1141 | as->mrm.ofs += irr->i; | ||
1142 | } else { | ||
1143 | Reg r; | ||
1144 | /* Fuse a constant add into the offset, e.g. string.sub(s, i+10). */ | ||
1145 | if (mayfuse(as, ir->op2) && irr->o == IR_ADD && irref_isk(irr->op2)) { | ||
1146 | as->mrm.ofs += IR(irr->op2)->i; | ||
1147 | r = ra_alloc1(as, irr->op1, allow); | ||
1148 | } else { | ||
1149 | r = ra_alloc1(as, ir->op2, allow); | ||
1150 | } | ||
1151 | if (as->mrm.base == RID_NONE) | ||
1152 | as->mrm.base = (uint8_t)r; | ||
1153 | else | ||
1154 | as->mrm.idx = (uint8_t)r; | ||
1155 | } | ||
1156 | } | ||
1157 | |||
1158 | /* Fuse load into memory operand. */ | ||
1159 | static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow) | ||
1160 | { | ||
1161 | IRIns *ir = IR(ref); | ||
1162 | if (ra_hasreg(ir->r)) { | ||
1163 | if (allow != RSET_EMPTY) return ir->r; /* Fast path. */ | ||
1164 | fusespill: | ||
1165 | /* Force a spill if only memory operands are allowed (asm_x87load). */ | ||
1166 | as->mrm.base = RID_ESP; | ||
1167 | as->mrm.ofs = ra_spill(as, ir); | ||
1168 | as->mrm.idx = RID_NONE; | ||
1169 | return RID_MRM; | ||
1170 | } | ||
1171 | if (ir->o == IR_KNUM) { | ||
1172 | lua_assert(allow != RSET_EMPTY); | ||
1173 | if (!(as->freeset & ~as->modset & RSET_FPR)) { | ||
1174 | as->mrm.ofs = ptr2addr(ir_knum(ir)); | ||
1175 | as->mrm.base = as->mrm.idx = RID_NONE; | ||
1176 | return RID_MRM; | ||
1177 | } | ||
1178 | } else if (mayfuse(as, ref)) { | ||
1179 | RegSet xallow = (allow & RSET_GPR) ? allow : RSET_GPR; | ||
1180 | if (ir->o == IR_SLOAD) { | ||
1181 | if (!irt_isint(ir->t) && !(ir->op2 & IRSLOAD_PARENT)) { | ||
1182 | as->mrm.base = (uint8_t)ra_alloc1(as, REF_BASE, xallow); | ||
1183 | as->mrm.ofs = 8*((int32_t)ir->op1-1); | ||
1184 | as->mrm.idx = RID_NONE; | ||
1185 | return RID_MRM; | ||
1186 | } | ||
1187 | } else if (ir->o == IR_FLOAD) { | ||
1188 | /* Generic fusion is only ok for IRT_INT operand (but see asm_comp). */ | ||
1189 | if (irt_isint(ir->t) && noconflict(as, ref, IR_FSTORE)) { | ||
1190 | asm_fusefref(as, ir, xallow); | ||
1191 | return RID_MRM; | ||
1192 | } | ||
1193 | } else if (ir->o == IR_ALOAD || ir->o == IR_HLOAD || ir->o == IR_ULOAD) { | ||
1194 | if (noconflict(as, ref, ir->o + IRDELTA_L2S)) { | ||
1195 | asm_fuseahuref(as, ir->op1, xallow); | ||
1196 | return RID_MRM; | ||
1197 | } | ||
1198 | } else if (ir->o == IR_XLOAD) { | ||
1199 | /* Generic fusion is only ok for IRT_INT operand (but see asm_comp). | ||
1200 | ** Fusing unaligned memory operands is ok on x86 (except for SIMD types). | ||
1201 | */ | ||
1202 | if (irt_isint(ir->t)) { | ||
1203 | asm_fusestrref(as, IR(ir->op1), xallow); | ||
1204 | return RID_MRM; | ||
1205 | } | ||
1206 | } | ||
1207 | } | ||
1208 | if (!(as->freeset & allow) && | ||
1209 | (allow == RSET_EMPTY || ra_hasspill(ir->s) || ref < as->loopref)) | ||
1210 | goto fusespill; | ||
1211 | return ra_allocref(as, ref, allow); | ||
1212 | } | ||
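For context (editor's addition): successful fusion means the consuming instruction addresses memory directly instead of going through a separate load.

/* Editor's sketch, registers and offsets purely illustrative:
**   addsd xmm0, [eax+0x10]     ; fused ALOAD: one instruction, no temp reg
** instead of
**   movsd xmm1, [eax+0x10]
**   addsd xmm0, xmm1
** The mayfuse()/noconflict() checks above refuse to fuse when a store of the
** corresponding kind occurs between the load and its consumer, since that
** store might alias the loaded slot; the fast path at the top simply reuses
** an already allocated register.
*/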
1213 | |||
1214 | /* -- Type conversions ---------------------------------------------------- */ | ||
1215 | |||
1216 | static void asm_tonum(ASMState *as, IRIns *ir) | ||
1217 | { | ||
1218 | Reg dest = ra_dest(as, ir, RSET_FPR); | ||
1219 | Reg left = asm_fuseload(as, ir->op1, RSET_GPR); | ||
1220 | emit_mrm(as, XO_CVTSI2SD, dest, left); | ||
1221 | if (!(as->flags & JIT_F_SPLIT_XMM)) | ||
1222 | emit_rr(as, XO_XORPS, dest, dest); /* Avoid partial register stall. */ | ||
1223 | } | ||
1224 | |||
1225 | static void asm_tointg(ASMState *as, IRIns *ir, Reg left) | ||
1226 | { | ||
1227 | Reg tmp = ra_scratch(as, rset_exclude(RSET_FPR, left)); | ||
1228 | Reg dest = ra_dest(as, ir, RSET_GPR); | ||
1229 | asm_guardcc(as, CC_P); | ||
1230 | asm_guardcc(as, CC_NE); | ||
1231 | emit_rr(as, XO_UCOMISD, left, tmp); | ||
1232 | emit_rr(as, XO_CVTSI2SD, tmp, dest); | ||
1233 | if (!(as->flags & JIT_F_SPLIT_XMM)) | ||
1234 | emit_rr(as, XO_XORPS, tmp, tmp); /* Avoid partial register stall. */ | ||
1235 | emit_rr(as, XO_CVTTSD2SI, dest, left); | ||
1236 | /* Can't fuse since left is needed twice. */ | ||
1237 | } | ||
1238 | |||
1239 | static void asm_toint(ASMState *as, IRIns *ir) | ||
1240 | { | ||
1241 | Reg dest = ra_dest(as, ir, RSET_GPR); | ||
1242 | Reg left = asm_fuseload(as, ir->op1, RSET_FPR); | ||
1243 | emit_mrm(as, XO_CVTSD2SI, dest, left); | ||
1244 | } | ||
1245 | |||
1246 | static void asm_tobit(ASMState *as, IRIns *ir) | ||
1247 | { | ||
1248 | Reg dest = ra_dest(as, ir, RSET_GPR); | ||
1249 | Reg tmp = ra_noreg(IR(ir->op1)->r) ? | ||
1250 | ra_alloc1(as, ir->op1, RSET_FPR) : | ||
1251 | ra_scratch(as, RSET_FPR); | ||
1252 | Reg right = asm_fuseload(as, ir->op2, rset_exclude(RSET_FPR, tmp)); | ||
1253 | emit_rr(as, XO_MOVDto, tmp, dest); | ||
1254 | emit_mrm(as, XO_ADDSD, tmp, right); | ||
1255 | ra_left(as, tmp, ir->op1); | ||
1256 | } | ||
1257 | |||
1258 | static void asm_strto(ASMState *as, IRIns *ir) | ||
1259 | { | ||
1260 | Reg str; | ||
1261 | int32_t ofs; | ||
1262 | RegSet drop = RSET_SCRATCH; | ||
1263 | /* Force a spill slot for the destination register (if any). */ | ||
1264 | if ((drop & RSET_FPR) != RSET_FPR && ra_hasreg(ir->r)) | ||
1265 | rset_set(drop, ir->r); /* WIN64 doesn't spill all FPRs. */ | ||
1266 | ra_evictset(as, drop); | ||
1267 | asm_guardcc(as, CC_E); | ||
1268 | emit_rr(as, XO_TEST, RID_RET, RID_RET); | ||
1269 | /* int lj_str_numconv(const char *s, TValue *n) */ | ||
1270 | emit_call(as, lj_str_numconv); | ||
1271 | ofs = sps_scale(ir->s); /* Use spill slot or slots SPS_TEMP1/2. */ | ||
1272 | if (ofs == 0) { | ||
1273 | emit_setargr(as, 2, RID_ESP); | ||
1274 | } else { | ||
1275 | emit_setargr(as, 2, RID_RET); | ||
1276 | emit_rmro(as, XO_LEA, RID_RET, RID_ESP, ofs); | ||
1277 | } | ||
1278 | emit_setargr(as, 1, RID_RET); | ||
1279 | str = ra_alloc1(as, ir->op1, RSET_GPR); | ||
1280 | emit_rmro(as, XO_LEA, RID_RET, str, sizeof(GCstr)); | ||
1281 | } | ||
1282 | |||
1283 | static void asm_tostr(ASMState *as, IRIns *ir) | ||
1284 | { | ||
1285 | IRIns *irl = IR(ir->op1); | ||
1286 | ra_destreg(as, ir, RID_RET); | ||
1287 | ra_evictset(as, rset_exclude(RSET_SCRATCH, RID_RET)); | ||
1288 | as->gcsteps++; | ||
1289 | if (irt_isnum(irl->t)) { | ||
1290 | /* GCstr *lj_str_fromnum(lua_State *L, const lua_Number *np) */ | ||
1291 | emit_call(as, lj_str_fromnum); | ||
1292 | emit_setargr(as, 1, RID_RET); | ||
1293 | emit_getgl(as, RID_RET, jit_L); | ||
1294 | emit_setargr(as, 2, RID_RET); | ||
1295 | emit_rmro(as, XO_LEA, RID_RET, RID_ESP, ra_spill(as, irl)); | ||
1296 | } else { | ||
1297 | /* GCstr *lj_str_fromint(lua_State *L, int32_t k) */ | ||
1298 | emit_call(as, lj_str_fromint); | ||
1299 | emit_setargr(as, 1, RID_RET); | ||
1300 | emit_getgl(as, RID_RET, jit_L); | ||
1301 | emit_setargr(as, 2, ra_alloc1(as, ir->op1, RSET_GPR)); | ||
1302 | } | ||
1303 | } | ||
1304 | |||
1305 | /* -- Memory references --------------------------------------------------- */ | ||
1306 | |||
1307 | static void asm_aref(ASMState *as, IRIns *ir) | ||
1308 | { | ||
1309 | Reg dest = ra_dest(as, ir, RSET_GPR); | ||
1310 | asm_fusearef(as, ir, RSET_GPR); | ||
1311 | if (!(as->mrm.idx == RID_NONE && as->mrm.ofs == 0)) | ||
1312 | emit_mrm(as, XO_LEA, dest, RID_MRM); | ||
1313 | else if (as->mrm.base != dest) | ||
1314 | emit_rr(as, XO_MOV, dest, as->mrm.base); | ||
1315 | } | ||
1316 | |||
1317 | /* Must match with hashkey() and hashrot() in lj_tab.c. */ | ||
1318 | static uint32_t ir_khash(IRIns *ir) | ||
1319 | { | ||
1320 | uint32_t lo, hi; | ||
1321 | if (irt_isstr(ir->t)) { | ||
1322 | return ir_kstr(ir)->hash; | ||
1323 | } else if (irt_isnum(ir->t)) { | ||
1324 | lo = ir_knum(ir)->u32.lo; | ||
1325 | hi = ir_knum(ir)->u32.hi & 0x7fffffff; | ||
1326 | } else if (irt_ispri(ir->t)) { | ||
1327 | lua_assert(!irt_isnil(ir->t)); | ||
1328 | return irt_type(ir->t)-IRT_FALSE; | ||
1329 | } else { | ||
1330 | lua_assert(irt_isaddr(ir->t)); | ||
1331 | lo = u32ptr(ir_kgc(ir)); | ||
1332 | hi = lo - 0x04c11db7; | ||
1333 | } | ||
1334 | lo ^= hi; hi = lj_rol(hi, 14); | ||
1335 | lo -= hi; hi = lj_rol(hi, 5); | ||
1336 | hi ^= lo; hi -= lj_rol(lo, 27); | ||
1337 | return hi; | ||
1338 | } | ||
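Since the comment above insists this must match hashkey()/hashrot() in lj_tab.c, here is the mixing step pulled out as a standalone function (editor's sketch) so it can be compared or unit-tested against the interpreter's version:

#include <stdint.h>

static uint32_t rol32(uint32_t x, int n)
{
  return (x << n) | (x >> (32 - n));
}

/* Same rotate/mix sequence as the tail of ir_khash() above. If this drifts
** out of sync with hashrot() in lj_tab.c, a trace specialized on a constant
** key would compute the wrong main position and search the wrong hash chain.
*/
static uint32_t hashrot_sketch(uint32_t lo, uint32_t hi)
{
  lo ^= hi; hi = rol32(hi, 14);
  lo -= hi; hi = rol32(hi, 5);
  hi ^= lo; hi -= rol32(lo, 27);
  return hi;
}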
1339 | |||
1340 | /* Merge NE(HREF, niltv) check. */ | ||
1341 | static MCode *merge_href_niltv(ASMState *as, IRIns *ir) | ||
1342 | { | ||
1343 | /* Assumes nothing else generates NE of HREF. */ | ||
1344 | if (ir[1].o == IR_NE && ir[1].op1 == as->curins) { | ||
1345 | if (LJ_64 && *as->mcp != XI_ARITHi) | ||
1346 | as->mcp += 7+6; | ||
1347 | else | ||
1348 | as->mcp += 6+6; /* Kill cmp reg, imm32 + jz exit. */ | ||
1349 | return as->mcp + *(int32_t *)(as->mcp-4); /* Return exit address. */ | ||
1350 | } | ||
1351 | return NULL; | ||
1352 | } | ||
1353 | |||
1354 | /* Inlined hash lookup. Specialized for key type and for const keys. | ||
1355 | ** The equivalent C code is: | ||
1356 | ** Node *n = hashkey(t, key); | ||
1357 | ** do { | ||
1358 | ** if (lj_obj_equal(&n->key, key)) return &n->val; | ||
1359 | ** } while ((n = nextnode(n))); | ||
1360 | ** return niltv(L); | ||
1361 | */ | ||
1362 | static void asm_href(ASMState *as, IRIns *ir) | ||
1363 | { | ||
1364 | MCode *nilexit = merge_href_niltv(as, ir); /* Do this before any restores. */ | ||
1365 | RegSet allow = RSET_GPR; | ||
1366 | Reg dest = ra_dest(as, ir, allow); | ||
1367 | Reg tab = ra_alloc1(as, ir->op1, rset_clear(allow, dest)); | ||
1368 | Reg key = RID_NONE, tmp = RID_NONE; | ||
1369 | IRIns *irkey = IR(ir->op2); | ||
1370 | int isk = irref_isk(ir->op2); | ||
1371 | IRType1 kt = irkey->t; | ||
1372 | uint32_t khash; | ||
1373 | MCLabel l_end, l_loop, l_next; | ||
1374 | |||
1375 | if (!isk) { | ||
1376 | rset_clear(allow, tab); | ||
1377 | key = ra_alloc1(as, ir->op2, irt_isnum(kt) ? RSET_FPR : allow); | ||
1378 | if (!irt_isstr(kt)) | ||
1379 | tmp = ra_scratch(as, rset_exclude(allow, key)); | ||
1380 | } | ||
1381 | |||
1382 | /* Key not found in chain: jump to exit (if merged with NE) or load niltv. */ | ||
1383 | l_end = emit_label(as); | ||
1384 | if (nilexit) | ||
1385 | emit_jcc(as, CC_E, nilexit); /* XI_JMP is not found by lj_asm_patchexit. */ | ||
1386 | else | ||
1387 | emit_loada(as, dest, niltvg(J2G(as->J))); | ||
1388 | |||
1389 | /* Follow hash chain until the end. */ | ||
1390 | l_loop = emit_sjcc_label(as, CC_NZ); | ||
1391 | emit_rr(as, XO_TEST, dest, dest); | ||
1392 | emit_rmro(as, XO_MOV, dest, dest, offsetof(Node, next)); | ||
1393 | l_next = emit_label(as); | ||
1394 | |||
1395 | /* Type and value comparison. */ | ||
1396 | emit_sjcc(as, CC_E, l_end); | ||
1397 | if (irt_isnum(kt)) { | ||
1398 | if (isk) { | ||
1399 | /* Assumes -0.0 is already canonicalized to +0.0. */ | ||
1400 | emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.u32.lo), | ||
1401 | (int32_t)ir_knum(irkey)->u32.lo); | ||
1402 | emit_sjcc(as, CC_NE, l_next); | ||
1403 | emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.u32.hi), | ||
1404 | (int32_t)ir_knum(irkey)->u32.hi); | ||
1405 | } else { | ||
1406 | emit_sjcc(as, CC_P, l_next); | ||
1407 | emit_rmro(as, XO_UCOMISD, key, dest, offsetof(Node, key.n)); | ||
1408 | emit_sjcc(as, CC_A, l_next); | ||
1409 | /* The type check avoids NaN penalties and complaints from Valgrind. */ | ||
1410 | emit_i8(as, ~IRT_NUM); | ||
1411 | emit_rmro(as, XO_ARITHi8, XOg_CMP, dest, offsetof(Node, key.it)); | ||
1412 | } | ||
1413 | } else { | ||
1414 | if (!irt_ispri(kt)) { | ||
1415 | lua_assert(irt_isaddr(kt)); | ||
1416 | if (isk) | ||
1417 | emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.gcr), | ||
1418 | ptr2addr(ir_kgc(irkey))); | ||
1419 | else | ||
1420 | emit_rmro(as, XO_CMP, key, dest, offsetof(Node, key.gcr)); | ||
1421 | emit_sjcc(as, CC_NE, l_next); | ||
1422 | } | ||
1423 | lua_assert(!irt_isnil(kt)); | ||
1424 | emit_i8(as, ~irt_type(kt)); | ||
1425 | emit_rmro(as, XO_ARITHi8, XOg_CMP, dest, offsetof(Node, key.it)); | ||
1426 | } | ||
1427 | emit_sfixup(as, l_loop); | ||
1428 | checkmclim(as); | ||
1429 | |||
1430 | /* Load main position relative to tab->node into dest. */ | ||
1431 | khash = isk ? ir_khash(irkey) : 1; | ||
1432 | if (khash == 0) { | ||
1433 | emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, node)); | ||
1434 | } else { | ||
1435 | emit_rmro(as, XO_ARITH(XOg_ADD), dest, tab, offsetof(GCtab, node)); | ||
1436 | if ((as->flags & JIT_F_PREFER_IMUL)) { | ||
1437 | emit_i8(as, sizeof(Node)); | ||
1438 | emit_rr(as, XO_IMULi8, dest, dest); | ||
1439 | } else { | ||
1440 | emit_shifti(as, XOg_SHL, dest, 3); | ||
1441 | emit_rmrxo(as, XO_LEA, dest, dest, dest, XM_SCALE2, 0); | ||
1442 | } | ||
1443 | if (isk) { | ||
1444 | emit_gri(as, XG_ARITHi(XOg_AND), dest, (int32_t)khash); | ||
1445 | emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, hmask)); | ||
1446 | } else if (irt_isstr(kt)) { | ||
1447 | emit_rmro(as, XO_ARITH(XOg_AND), dest, key, offsetof(GCstr, hash)); | ||
1448 | emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, hmask)); | ||
1449 | } else { /* Must match with hashrot() in lj_tab.c. */ | ||
1450 | emit_rmro(as, XO_ARITH(XOg_AND), dest, tab, offsetof(GCtab, hmask)); | ||
1451 | emit_rr(as, XO_ARITH(XOg_SUB), dest, tmp); | ||
1452 | emit_shifti(as, XOg_ROL, tmp, 27); | ||
1453 | emit_rr(as, XO_ARITH(XOg_XOR), dest, tmp); | ||
1454 | emit_shifti(as, XOg_ROL, dest, 5); | ||
1455 | emit_rr(as, XO_ARITH(XOg_SUB), tmp, dest); | ||
1456 | emit_shifti(as, XOg_ROL, dest, 14); | ||
1457 | emit_rr(as, XO_ARITH(XOg_XOR), tmp, dest); | ||
1458 | if (irt_isnum(kt)) { | ||
1459 | emit_rmro(as, XO_ARITH(XOg_AND), dest, RID_ESP, ra_spill(as, irkey)+4); | ||
1460 | emit_loadi(as, dest, 0x7fffffff); | ||
1461 | emit_rr(as, XO_MOVDto, key, tmp); | ||
1462 | } else { | ||
1463 | emit_rr(as, XO_MOV, tmp, key); | ||
1464 | emit_rmro(as, XO_LEA, dest, key, -0x04c11db7); | ||
1465 | } | ||
1466 | } | ||
1467 | } | ||
1468 | } | ||
1469 | |||
1470 | static void asm_hrefk(ASMState *as, IRIns *ir) | ||
1471 | { | ||
1472 | IRIns *kslot = IR(ir->op2); | ||
1473 | IRIns *irkey = IR(kslot->op1); | ||
1474 | int32_t ofs = (int32_t)(kslot->op2 * sizeof(Node)); | ||
1475 | Reg dest = ra_used(ir) ? ra_dest(as, ir, RSET_GPR) : RID_NONE; | ||
1476 | Reg node = ra_alloc1(as, ir->op1, RSET_GPR); | ||
1477 | MCLabel l_exit; | ||
1478 | lua_assert(ofs % sizeof(Node) == 0); | ||
1479 | if (ra_hasreg(dest)) { | ||
1480 | if (ofs != 0) { | ||
1481 | if (dest == node && !(as->flags & JIT_F_LEA_AGU)) | ||
1482 | emit_gri(as, XG_ARITHi(XOg_ADD), dest, ofs); | ||
1483 | else | ||
1484 | emit_rmro(as, XO_LEA, dest, node, ofs); | ||
1485 | } else if (dest != node) { | ||
1486 | emit_rr(as, XO_MOV, dest, node); | ||
1487 | } | ||
1488 | } | ||
1489 | asm_guardcc(as, CC_NE); | ||
1490 | l_exit = emit_label(as); | ||
1491 | if (irt_isnum(irkey->t)) { | ||
1492 | /* Assumes -0.0 is already canonicalized to +0.0. */ | ||
1493 | emit_gmroi(as, XG_ARITHi(XOg_CMP), node, | ||
1494 | ofs + (int32_t)offsetof(Node, key.u32.lo), | ||
1495 | (int32_t)ir_knum(irkey)->u32.lo); | ||
1496 | emit_sjcc(as, CC_NE, l_exit); | ||
1497 | emit_gmroi(as, XG_ARITHi(XOg_CMP), node, | ||
1498 | ofs + (int32_t)offsetof(Node, key.u32.hi), | ||
1499 | (int32_t)ir_knum(irkey)->u32.hi); | ||
1500 | } else { | ||
1501 | if (!irt_ispri(irkey->t)) { | ||
1502 | lua_assert(irt_isgcv(irkey->t)); | ||
1503 | emit_gmroi(as, XG_ARITHi(XOg_CMP), node, | ||
1504 | ofs + (int32_t)offsetof(Node, key.gcr), | ||
1505 | ptr2addr(ir_kgc(irkey))); | ||
1506 | emit_sjcc(as, CC_NE, l_exit); | ||
1507 | } | ||
1508 | lua_assert(!irt_isnil(irkey->t)); | ||
1509 | emit_i8(as, ~irt_type(irkey->t)); | ||
1510 | emit_rmro(as, XO_ARITHi8, XOg_CMP, node, | ||
1511 | ofs + (int32_t)offsetof(Node, key.it)); | ||
1512 | } | ||
1513 | } | ||
1514 | |||
1515 | static void asm_newref(ASMState *as, IRIns *ir) | ||
1516 | { | ||
1517 | IRRef keyref = ir->op2; | ||
1518 | IRIns *irkey = IR(keyref); | ||
1519 | RegSet allow = RSET_GPR; | ||
1520 | Reg tab, tmp; | ||
1521 | ra_destreg(as, ir, RID_RET); | ||
1522 | ra_evictset(as, rset_exclude(RSET_SCRATCH, RID_RET)); | ||
1523 | tab = ra_alloc1(as, ir->op1, allow); | ||
1524 | tmp = ra_scratch(as, rset_clear(allow, tab)); | ||
1525 | /* TValue *lj_tab_newkey(lua_State *L, GCtab *t, cTValue *key) */ | ||
1526 | emit_call(as, lj_tab_newkey); | ||
1527 | emit_setargr(as, 1, tmp); | ||
1528 | emit_setargr(as, 2, tab); | ||
1529 | emit_getgl(as, tmp, jit_L); | ||
1530 | if (irt_isnum(irkey->t)) { | ||
1531 | /* For numbers use the constant itself or a spill slot as a TValue. */ | ||
1532 | if (irref_isk(keyref)) { | ||
1533 | emit_setargp(as, 3, ir_knum(irkey)); | ||
1534 | } else { | ||
1535 | emit_setargr(as, 3, tmp); | ||
1536 | emit_rmro(as, XO_LEA, tmp, RID_ESP, ra_spill(as, irkey)); | ||
1537 | } | ||
1538 | } else { | ||
1539 | /* Otherwise use g->tmptv to hold the TValue. */ | ||
1540 | lua_assert(irt_ispri(irkey->t) || irt_isaddr(irkey->t)); | ||
1541 | emit_setargr(as, 3, tmp); | ||
1542 | if (!irref_isk(keyref)) { | ||
1543 | Reg src = ra_alloc1(as, keyref, rset_exclude(allow, tmp)); | ||
1544 | emit_movtomro(as, src, tmp, 0); | ||
1545 | } else if (!irt_ispri(irkey->t)) { | ||
1546 | emit_movmroi(as, tmp, 0, irkey->i); | ||
1547 | } | ||
1548 | emit_movmroi(as, tmp, 4, irt_toitype(irkey->t)); | ||
1549 | emit_loada(as, tmp, &J2G(as->J)->tmptv); | ||
1550 | } | ||
1551 | } | ||
1552 | |||
1553 | static void asm_uref(ASMState *as, IRIns *ir) | ||
1554 | { | ||
1555 | /* NYI: Check that UREFO is still open and not aliasing a slot. */ | ||
1556 | if (ra_used(ir)) { | ||
1557 | Reg dest = ra_dest(as, ir, RSET_GPR); | ||
1558 | if (irref_isk(ir->op1)) { | ||
1559 | GCfunc *fn = ir_kfunc(IR(ir->op1)); | ||
1560 | TValue **v = &gcref(fn->l.uvptr[ir->op2])->uv.v; | ||
1561 | emit_rma(as, XO_MOV, dest, v); | ||
1562 | } else { | ||
1563 | Reg uv = ra_scratch(as, RSET_GPR); | ||
1564 | Reg func = ra_alloc1(as, ir->op1, RSET_GPR); | ||
1565 | if (ir->o == IR_UREFC) { | ||
1566 | emit_rmro(as, XO_LEA, dest, uv, offsetof(GCupval, tv)); | ||
1567 | asm_guardcc(as, CC_NE); | ||
1568 | emit_i8(as, 1); | ||
1569 | emit_rmro(as, XO_ARITHib, XOg_CMP, uv, offsetof(GCupval, closed)); | ||
1570 | } else { | ||
1571 | emit_rmro(as, XO_MOV, dest, uv, offsetof(GCupval, v)); | ||
1572 | } | ||
1573 | emit_rmro(as, XO_MOV, uv, func, | ||
1574 | (int32_t)offsetof(GCfuncL, uvptr) + 4*(int32_t)ir->op2); | ||
1575 | } | ||
1576 | } | ||
1577 | } | ||
1578 | |||
1579 | static void asm_fref(ASMState *as, IRIns *ir) | ||
1580 | { | ||
1581 | Reg dest = ra_dest(as, ir, RSET_GPR); | ||
1582 | asm_fusefref(as, ir, RSET_GPR); | ||
1583 | emit_mrm(as, XO_LEA, dest, RID_MRM); | ||
1584 | } | ||
1585 | |||
1586 | static void asm_strref(ASMState *as, IRIns *ir) | ||
1587 | { | ||
1588 | Reg dest = ra_dest(as, ir, RSET_GPR); | ||
1589 | asm_fusestrref(as, ir, RSET_GPR); | ||
1590 | if (as->mrm.base == RID_NONE) | ||
1591 | emit_loadi(as, dest, as->mrm.ofs); | ||
1592 | else if (as->mrm.base == dest && as->mrm.idx == RID_NONE) | ||
1593 | emit_gri(as, XG_ARITHi(XOg_ADD), dest, as->mrm.ofs); | ||
1594 | else | ||
1595 | emit_mrm(as, XO_LEA, dest, RID_MRM); | ||
1596 | } | ||
1597 | |||
1598 | /* -- Loads and stores ---------------------------------------------------- */ | ||
1599 | |||
1600 | static void asm_fload(ASMState *as, IRIns *ir) | ||
1601 | { | ||
1602 | Reg dest = ra_dest(as, ir, RSET_GPR); | ||
1603 | x86Op xo; | ||
1604 | asm_fusefref(as, ir, RSET_GPR); | ||
1605 | switch (irt_type(ir->t)) { | ||
1606 | case IRT_I8: xo = XO_MOVSXb; break; | ||
1607 | case IRT_U8: xo = XO_MOVZXb; break; | ||
1608 | case IRT_I16: xo = XO_MOVSXw; break; | ||
1609 | case IRT_U16: xo = XO_MOVZXw; break; | ||
1610 | default: | ||
1611 | lua_assert(irt_isint(ir->t) || irt_isaddr(ir->t)); | ||
1612 | xo = XO_MOV; | ||
1613 | break; | ||
1614 | } | ||
1615 | emit_mrm(as, xo, dest, RID_MRM); | ||
1616 | } | ||
1617 | |||
1618 | static void asm_fstore(ASMState *as, IRIns *ir) | ||
1619 | { | ||
1620 | RegSet allow = RSET_GPR; | ||
1621 | Reg src = RID_NONE; | ||
1622 | /* The IRT_I16/IRT_U16 stores should never be simplified for constant | ||
1623 | ** values since mov word [mem], imm16 has a length-changing prefix. | ||
1624 | */ | ||
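/* (A 66h operand-size prefix together with a 16-bit immediate is a
** length-changing prefix, which stalls the decoders on many x86 cores.)
*/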
1625 | if (!irref_isk(ir->op2) || irt_isi16(ir->t) || irt_isu16(ir->t)) { | ||
1626 | RegSet allow8 = (irt_isi8(ir->t) || irt_isu8(ir->t)) ? RSET_GPR8 : RSET_GPR; | ||
1627 | src = ra_alloc1(as, ir->op2, allow8); | ||
1628 | rset_clear(allow, src); | ||
1629 | } | ||
1630 | asm_fusefref(as, IR(ir->op1), allow); | ||
1631 | if (ra_hasreg(src)) { | ||
1632 | x86Op xo; | ||
1633 | switch (irt_type(ir->t)) { | ||
1634 | case IRT_I8: case IRT_U8: xo = XO_MOVtob; src |= FORCE_REX; break; | ||
1635 | case IRT_I16: case IRT_U16: xo = XO_MOVtow; break; | ||
1636 | default: | ||
1637 | lua_assert(irt_isint(ir->t) || irt_isaddr(ir->t)); | ||
1638 | xo = XO_MOVto; | ||
1639 | break; | ||
1640 | } | ||
1641 | emit_mrm(as, xo, src, RID_MRM); | ||
1642 | } else { | ||
1643 | if (irt_isi8(ir->t) || irt_isu8(ir->t)) { | ||
1644 | emit_i8(as, IR(ir->op2)->i); | ||
1645 | emit_mrm(as, XO_MOVmib, 0, RID_MRM); | ||
1646 | } else { | ||
1647 | lua_assert(irt_isint(ir->t) || irt_isaddr(ir->t)); | ||
1648 | emit_i32(as, IR(ir->op2)->i); | ||
1649 | emit_mrm(as, XO_MOVmi, 0, RID_MRM); | ||
1650 | } | ||
1651 | } | ||
1652 | } | ||
1653 | |||
1654 | static void asm_ahuload(ASMState *as, IRIns *ir) | ||
1655 | { | ||
1656 | RegSet allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR; | ||
1657 | lua_assert(irt_isnum(ir->t) || irt_ispri(ir->t) || irt_isaddr(ir->t)); | ||
1658 | if (ra_used(ir)) { | ||
1659 | Reg dest = ra_dest(as, ir, allow); | ||
1660 | asm_fuseahuref(as, ir->op1, RSET_GPR); | ||
1661 | emit_mrm(as, dest < RID_MAX_GPR ? XO_MOV : XMM_MOVRM(as), dest, RID_MRM); | ||
1662 | } else { | ||
1663 | asm_fuseahuref(as, ir->op1, RSET_GPR); | ||
1664 | } | ||
1665 | /* Always do the type check, even if the load result is unused. */ | ||
1666 | asm_guardcc(as, irt_isnum(ir->t) ? CC_A : CC_NE); | ||
1667 | emit_i8(as, ~irt_type(ir->t)); | ||
1668 | as->mrm.ofs += 4; | ||
1669 | emit_mrm(as, XO_ARITHi8, XOg_CMP, RID_MRM); | ||
1670 | } | ||
1671 | |||
1672 | static void asm_ahustore(ASMState *as, IRIns *ir) | ||
1673 | { | ||
1674 | if (irt_isnum(ir->t)) { | ||
1675 | Reg src = ra_alloc1(as, ir->op2, RSET_FPR); | ||
1676 | asm_fuseahuref(as, ir->op1, RSET_GPR); | ||
1677 | emit_mrm(as, XO_MOVSDto, src, RID_MRM); | ||
1678 | } else { | ||
1679 | IRIns *irr = IR(ir->op2); | ||
1680 | RegSet allow = RSET_GPR; | ||
1681 | Reg src = RID_NONE; | ||
1682 | if (!irref_isk(ir->op2)) { | ||
1683 | src = ra_alloc1(as, ir->op2, allow); | ||
1684 | rset_clear(allow, src); | ||
1685 | } | ||
1686 | asm_fuseahuref(as, ir->op1, allow); | ||
1687 | if (ra_hasreg(src)) { | ||
1688 | emit_mrm(as, XO_MOVto, src, RID_MRM); | ||
1689 | } else if (!irt_ispri(irr->t)) { | ||
1690 | lua_assert(irt_isaddr(ir->t)); | ||
1691 | emit_i32(as, irr->i); | ||
1692 | emit_mrm(as, XO_MOVmi, 0, RID_MRM); | ||
1693 | } | ||
1694 | as->mrm.ofs += 4; | ||
1695 | emit_i32(as, (int32_t)~irt_type(ir->t)); | ||
1696 | emit_mrm(as, XO_MOVmi, 0, RID_MRM); | ||
1697 | } | ||
1698 | } | ||
1699 | |||
1700 | static void asm_sload(ASMState *as, IRIns *ir) | ||
1701 | { | ||
1702 | int32_t ofs = 8*((int32_t)ir->op1-1); | ||
1703 | IRType1 t = ir->t; | ||
1704 | Reg base; | ||
1705 | lua_assert(!(ir->op2 & IRSLOAD_PARENT)); /* Handled by asm_head_side(). */ | ||
1706 | if (irt_isint(t)) { | ||
1707 | Reg left = ra_scratch(as, RSET_FPR); | ||
1708 | asm_tointg(as, ir, left); /* Frees dest reg. Do this before base alloc. */ | ||
1709 | base = ra_alloc1(as, REF_BASE, RSET_GPR); | ||
1710 | emit_rmro(as, XMM_MOVRM(as), left, base, ofs); | ||
1711 | t.irt = IRT_NUM; /* Continue with a regular number type check. */ | ||
1712 | } else if (ra_used(ir)) { | ||
1713 | RegSet allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR; | ||
1714 | Reg dest = ra_dest(as, ir, allow); | ||
1715 | lua_assert(irt_isnum(ir->t) || irt_isaddr(ir->t)); | ||
1716 | base = ra_alloc1(as, REF_BASE, RSET_GPR); | ||
1717 | emit_movrmro(as, dest, base, ofs); | ||
1718 | } else { | ||
1719 | if (!irt_isguard(ir->t)) | ||
1720 | return; /* No type check: avoid base alloc. */ | ||
1721 | base = ra_alloc1(as, REF_BASE, RSET_GPR); | ||
1722 | } | ||
1723 | if (irt_isguard(ir->t)) { | ||
1724 | /* Need type check, even if the load result is unused. */ | ||
1725 | asm_guardcc(as, irt_isnum(t) ? CC_A : CC_NE); | ||
1726 | emit_i8(as, ~irt_type(t)); | ||
1727 | emit_rmro(as, XO_ARITHi8, XOg_CMP, base, ofs+4); | ||
1728 | } | ||
1729 | } | ||
1730 | |||
1731 | static void asm_xload(ASMState *as, IRIns *ir) | ||
1732 | { | ||
1733 | Reg dest = ra_dest(as, ir, RSET_GPR); | ||
1734 | x86Op xo; | ||
1735 | asm_fusestrref(as, IR(ir->op1), RSET_GPR); /* For now only support STRREF. */ | ||
1736 | /* ir->op2 is ignored -- unaligned loads are ok on x86. */ | ||
1737 | switch (irt_type(ir->t)) { | ||
1738 | case IRT_I8: xo = XO_MOVSXb; break; | ||
1739 | case IRT_U8: xo = XO_MOVZXb; break; | ||
1740 | case IRT_I16: xo = XO_MOVSXw; break; | ||
1741 | case IRT_U16: xo = XO_MOVZXw; break; | ||
1742 | default: lua_assert(irt_isint(ir->t)); xo = XO_MOV; break; | ||
1743 | } | ||
1744 | emit_mrm(as, xo, dest, RID_MRM); | ||
1745 | } | ||
1746 | |||
1747 | /* -- String ops ---------------------------------------------------------- */ | ||
1748 | |||
1749 | static void asm_snew(ASMState *as, IRIns *ir) | ||
1750 | { | ||
1751 | RegSet allow = RSET_GPR; | ||
1752 | Reg left, right; | ||
1753 | IRIns *irl; | ||
1754 | ra_destreg(as, ir, RID_RET); | ||
1755 | ra_evictset(as, rset_exclude(RSET_SCRATCH, RID_RET)); | ||
1756 | irl = IR(ir->op1); | ||
1757 | left = irl->r; | ||
1758 | right = IR(ir->op2)->r; | ||
1759 | if (ra_noreg(left)) { | ||
1760 | lua_assert(irl->o == IR_STRREF); | ||
1761 | /* Get register only for non-const STRREF. */ | ||
1762 | if (!(irref_isk(irl->op1) && irref_isk(irl->op2))) { | ||
1763 | if (ra_hasreg(right)) rset_clear(allow, right); | ||
1764 | left = ra_allocref(as, ir->op1, allow); | ||
1765 | } | ||
1766 | } | ||
1767 | if (ra_noreg(right) && !irref_isk(ir->op2)) { | ||
1768 | if (ra_hasreg(left)) rset_clear(allow, left); | ||
1769 | right = ra_allocref(as, ir->op2, allow); | ||
1770 | } | ||
1771 | /* GCstr *lj_str_new(lua_State *L, const char *str, size_t len) */ | ||
1772 | emit_call(as, lj_str_new); | ||
1773 | emit_setargr(as, 1, RID_RET); | ||
1774 | emit_getgl(as, RID_RET, jit_L); | ||
1775 | if (ra_noreg(left)) /* Use immediate for const STRREF. */ | ||
1776 | emit_setargi(as, 2, IR(irl->op1)->i + IR(irl->op2)->i + | ||
1777 | (int32_t)sizeof(GCstr)); | ||
1778 | else | ||
1779 | emit_setargr(as, 2, left); | ||
1780 | if (ra_noreg(right)) | ||
1781 | emit_setargi(as, 3, IR(ir->op2)->i); | ||
1782 | else | ||
1783 | emit_setargr(as, 3, right); | ||
1784 | as->gcsteps++; | ||
1785 | } | ||
1786 | |||
1787 | /* -- Table ops ----------------------------------------------------------- */ | ||
1788 | |||
1789 | static void asm_tnew(ASMState *as, IRIns *ir) | ||
1790 | { | ||
1791 | ra_destreg(as, ir, RID_RET); | ||
1792 | ra_evictset(as, rset_exclude(RSET_SCRATCH, RID_RET)); | ||
1793 | /* GCtab *lj_tab_new(lua_State *L, int32_t asize, uint32_t hbits) */ | ||
1794 | emit_call(as, lj_tab_new); | ||
1795 | emit_setargr(as, 1, RID_RET); | ||
1796 | emit_setargi(as, 2, ir->op1); | ||
1797 | emit_setargi(as, 3, ir->op2); | ||
1798 | emit_getgl(as, RID_RET, jit_L); | ||
1799 | as->gcsteps++; | ||
1800 | } | ||
1801 | |||
1802 | static void asm_tdup(ASMState *as, IRIns *ir) | ||
1803 | { | ||
1804 | ra_destreg(as, ir, RID_RET); | ||
1805 | ra_evictset(as, rset_exclude(RSET_SCRATCH, RID_RET)); | ||
1806 | /* GCtab *lj_tab_dup(lua_State *L, const GCtab *kt) */ | ||
1807 | emit_call(as, lj_tab_dup); | ||
1808 | emit_setargr(as, 1, RID_RET); | ||
1809 | emit_setargp(as, 2, ir_kgc(IR(ir->op1))); | ||
1810 | emit_getgl(as, RID_RET, jit_L); | ||
1811 | as->gcsteps++; | ||
1812 | } | ||
1813 | |||
1814 | static void asm_tlen(ASMState *as, IRIns *ir) | ||
1815 | { | ||
1816 | ra_destreg(as, ir, RID_RET); | ||
1817 | ra_evictset(as, rset_exclude(RSET_SCRATCH, RID_RET)); | ||
1818 | emit_call(as, lj_tab_len); /* MSize lj_tab_len(GCtab *t) */ | ||
1819 | emit_setargr(as, 1, ra_alloc1(as, ir->op1, RSET_GPR)); | ||
1820 | } | ||
1821 | |||
1822 | static void asm_tbar(ASMState *as, IRIns *ir) | ||
1823 | { | ||
1824 | Reg tab = ra_alloc1(as, ir->op1, RSET_GPR); | ||
1825 | Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, tab)); | ||
1826 | MCLabel l_end = emit_label(as); | ||
1827 | emit_movtomro(as, tmp, tab, offsetof(GCtab, gclist)); | ||
1828 | emit_setgl(as, tab, gc.grayagain); | ||
1829 | emit_getgl(as, tmp, gc.grayagain); | ||
1830 | emit_i8(as, ~LJ_GC_BLACK); | ||
1831 | emit_rmro(as, XO_ARITHib, XOg_AND, tab, offsetof(GCtab, marked)); | ||
1832 | emit_sjcc(as, CC_Z, l_end); | ||
1833 | emit_i8(as, LJ_GC_BLACK); | ||
1834 | emit_rmro(as, XO_GROUP3b, XOg_TEST, tab, offsetof(GCtab, marked)); | ||
1835 | } | ||
1836 | |||
1837 | static void asm_obar(ASMState *as, IRIns *ir) | ||
1838 | { | ||
1839 | RegSet allow = RSET_GPR; | ||
1840 | Reg obj, val; | ||
1841 | GCobj *valp; | ||
1842 | MCLabel l_end; | ||
1843 | int32_t ofs; | ||
1844 | ra_evictset(as, RSET_SCRATCH); | ||
1845 | if (irref_isk(ir->op2)) { | ||
1846 | valp = ir_kgc(IR(ir->op2)); | ||
1847 | val = RID_NONE; | ||
1848 | } else { | ||
1849 | valp = NULL; | ||
1850 | val = ra_alloc1(as, ir->op2, allow); | ||
1851 | rset_clear(allow, val); | ||
1852 | } | ||
1853 | obj = ra_alloc1(as, ir->op1, allow); | ||
1854 | l_end = emit_label(as); | ||
1855 | /* No need for other object barriers (yet). */ | ||
1856 | lua_assert(IR(ir->op1)->o == IR_UREFC); | ||
1857 | ofs = -(int32_t)offsetof(GCupval, tv); | ||
1858 | /* void lj_gc_barrieruv(global_State *g, GCobj *o, GCobj *v) */ | ||
1859 | emit_call(as, lj_gc_barrieruv); | ||
1860 | if (ofs == 0) { | ||
1861 | emit_setargr(as, 2, obj); | ||
1862 | } else if (rset_test(RSET_SCRATCH, obj) && !(as->flags & JIT_F_LEA_AGU)) { | ||
1863 | emit_setargr(as, 2, obj); | ||
1864 | emit_gri(as, XG_ARITHi(XOg_ADD), obj, ofs); | ||
1865 | } else { | ||
1866 | emit_setargr(as, 2, RID_RET); | ||
1867 | emit_rmro(as, XO_LEA, RID_RET, obj, ofs); | ||
1868 | } | ||
1869 | emit_setargp(as, 1, J2G(as->J)); | ||
1870 | if (valp) | ||
1871 | emit_setargp(as, 3, valp); | ||
1872 | else | ||
1873 | emit_setargr(as, 3, val); | ||
1874 | emit_sjcc(as, CC_Z, l_end); | ||
1875 | emit_i8(as, LJ_GC_WHITES); | ||
1876 | if (valp) | ||
1877 | emit_rma(as, XO_GROUP3b, XOg_TEST, &valp->gch.marked); | ||
1878 | else | ||
1879 | emit_rmro(as, XO_GROUP3b, XOg_TEST, val, (int32_t)offsetof(GChead, marked)); | ||
1880 | emit_sjcc(as, CC_Z, l_end); | ||
1881 | emit_i8(as, LJ_GC_BLACK); | ||
1882 | emit_rmro(as, XO_GROUP3b, XOg_TEST, obj, | ||
1883 | ofs + (int32_t)offsetof(GChead, marked)); | ||
1884 | } | ||
1885 | |||
1886 | /* -- FP/int arithmetic and logic operations ------------------------------ */ | ||
1887 | |||
1888 | /* Load reference onto x87 stack. Force a spill to memory if needed. */ | ||
1889 | static void asm_x87load(ASMState *as, IRRef ref) | ||
1890 | { | ||
1891 | IRIns *ir = IR(ref); | ||
1892 | if (ir->o == IR_KNUM) { | ||
1893 | cTValue *tv = ir_knum(ir); | ||
1894 | if (tvispzero(tv)) /* Use fldz only for +0. */ | ||
1895 | emit_x87op(as, XI_FLDZ); | ||
1896 | else if (tvispone(tv)) | ||
1897 | emit_x87op(as, XI_FLD1); | ||
1898 | else | ||
1899 | emit_rma(as, XO_FLDq, XOg_FLDq, tv); | ||
1900 | } else if (ir->o == IR_TONUM && !ra_used(ir) && | ||
1901 | !irref_isk(ir->op1) && mayfuse(as, ir->op1)) { | ||
1902 | IRIns *iri = IR(ir->op1); | ||
1903 | emit_rmro(as, XO_FILDd, XOg_FILDd, RID_ESP, ra_spill(as, iri)); | ||
1904 | } else { | ||
1905 | emit_mrm(as, XO_FLDq, XOg_FLDq, asm_fuseload(as, ref, RSET_EMPTY)); | ||
1906 | } | ||
1907 | } | ||
1908 | |||
1909 | /* Try to rejoin pow from EXP2, MUL and LOG2 (if still unsplit). */ | ||
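/* I.e. x^y, split by FOLD into exp2(log2(x)*y), is turned back into a
** single lj_vm_pow() call if the intermediate MUL and LOG2 results are
** otherwise unused.
*/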
1910 | static int fpmjoin_pow(ASMState *as, IRIns *ir) | ||
1911 | { | ||
1912 | IRIns *irp = IR(ir->op1); | ||
1913 | if (irp == ir-1 && irp->o == IR_MUL && !ra_used(irp)) { | ||
1914 | IRIns *irpp = IR(irp->op1); | ||
1915 | if (irpp == ir-2 && irpp->o == IR_FPMATH && | ||
1916 | irpp->op2 == IRFPM_LOG2 && !ra_used(irpp)) { | ||
1917 | emit_call(as, lj_vm_pow); /* st0 = lj_vm_pow(st1, st0) */ | ||
1918 | asm_x87load(as, irp->op2); | ||
1919 | asm_x87load(as, irpp->op1); | ||
1920 | return 1; | ||
1921 | } | ||
1922 | } | ||
1923 | return 0; | ||
1924 | } | ||
1925 | |||
1926 | static void asm_fpmath(ASMState *as, IRIns *ir) | ||
1927 | { | ||
1928 | IRFPMathOp fpm = ir->o == IR_FPMATH ? (IRFPMathOp)ir->op2 : IRFPM_OTHER; | ||
1929 | if (fpm == IRFPM_SQRT) { | ||
1930 | Reg dest = ra_dest(as, ir, RSET_FPR); | ||
1931 | Reg left = asm_fuseload(as, ir->op1, RSET_FPR); | ||
1932 | emit_mrm(as, XO_SQRTSD, dest, left); | ||
1933 | } else if ((as->flags & JIT_F_SSE4_1) && fpm <= IRFPM_TRUNC) { | ||
1934 | Reg dest = ra_dest(as, ir, RSET_FPR); | ||
1935 | Reg left = asm_fuseload(as, ir->op1, RSET_FPR); | ||
1936 | /* Round down/up/trunc == 1001/1010/1011. */ | ||
1937 | emit_i8(as, 0x09 + fpm); | ||
1938 | /* ROUNDSD has a 4-byte opcode which doesn't fit in x86Op. */ | ||
1939 | emit_mrm(as, XO_ROUNDSD, dest, left); | ||
1940 | /* Let's pretend it's a 3-byte opcode, and compensate afterwards. */ | ||
1941 | /* This is atrocious, but the alternatives are much worse. */ | ||
1942 | if (LJ_64 && as->mcp[1] != (MCode)(XO_ROUNDSD >> 16)) { | ||
1943 | as->mcp[0] = as->mcp[1]; as->mcp[1] = 0x0f; /* Swap 0F and REX. */ | ||
1944 | } | ||
1945 | *--as->mcp = 0x66; /* 1st byte of ROUNDSD opcode. */ | ||
1946 | } else { | ||
1947 | int32_t ofs = sps_scale(ir->s); /* Use spill slot or slots SPS_TEMP1/2. */ | ||
1948 | Reg dest = ir->r; | ||
1949 | if (ra_hasreg(dest)) { | ||
1950 | ra_free(as, dest); | ||
1951 | ra_modified(as, dest); | ||
1952 | emit_rmro(as, XMM_MOVRM(as), dest, RID_ESP, ofs); | ||
1953 | } | ||
1954 | emit_rmro(as, XO_FSTPq, XOg_FSTPq, RID_ESP, ofs); | ||
1955 | switch (fpm) { /* st0 = lj_vm_*(st0) */ | ||
1956 | case IRFPM_FLOOR: emit_call(as, lj_vm_floor); break; | ||
1957 | case IRFPM_CEIL: emit_call(as, lj_vm_ceil); break; | ||
1958 | case IRFPM_TRUNC: emit_call(as, lj_vm_trunc); break; | ||
1959 | case IRFPM_EXP: emit_call(as, lj_vm_exp); break; | ||
1960 | case IRFPM_EXP2: | ||
1961 | if (fpmjoin_pow(as, ir)) return; | ||
1962 | emit_call(as, lj_vm_exp2); /* st0 = lj_vm_exp2(st0) */ | ||
1963 | break; | ||
1964 | case IRFPM_SIN: emit_x87op(as, XI_FSIN); break; | ||
1965 | case IRFPM_COS: emit_x87op(as, XI_FCOS); break; | ||
1966 | case IRFPM_TAN: emit_x87op(as, XI_FPOP); emit_x87op(as, XI_FPTAN); break; | ||
1967 | case IRFPM_LOG: case IRFPM_LOG2: case IRFPM_LOG10: | ||
1968 | /* Note: the use of fyl2xp1 would be pointless here. When computing | ||
1969 | ** log(1.0+eps) the precision is already lost after 1.0 is added. | ||
1970 | ** Subtracting 1.0 won't recover it. OTOH math.log1p would make sense. | ||
1971 | */ | ||
1972 | emit_x87op(as, XI_FYL2X); break; | ||
1973 | case IRFPM_OTHER: | ||
1974 | switch (ir->o) { | ||
1975 | case IR_ATAN2: | ||
1976 | emit_x87op(as, XI_FPATAN); asm_x87load(as, ir->op2); break; | ||
1977 | case IR_LDEXP: | ||
1978 | emit_x87op(as, XI_FPOP1); emit_x87op(as, XI_FSCALE); break; | ||
1979 | case IR_POWI: | ||
1980 | emit_call(as, lj_vm_powi); /* st0 = lj_vm_powi(st0, [esp]) */ | ||
1981 | emit_rmro(as, XO_MOVto, ra_alloc1(as, ir->op2, RSET_GPR), RID_ESP, 0); | ||
1982 | break; | ||
1983 | default: lua_assert(0); break; | ||
1984 | } | ||
1985 | break; | ||
1986 | default: lua_assert(0); break; | ||
1987 | } | ||
1988 | asm_x87load(as, ir->op1); | ||
1989 | switch (fpm) { | ||
1990 | case IRFPM_LOG: emit_x87op(as, XI_FLDLN2); break; | ||
1991 | case IRFPM_LOG2: emit_x87op(as, XI_FLD1); break; | ||
1992 | case IRFPM_LOG10: emit_x87op(as, XI_FLDLG2); break; | ||
1993 | case IRFPM_OTHER: | ||
1994 | if (ir->o == IR_LDEXP) asm_x87load(as, ir->op2); | ||
1995 | break; | ||
1996 | default: break; | ||
1997 | } | ||
1998 | } | ||
1999 | } | ||
2000 | |||
2001 | /* Find out whether swapping operands might be beneficial. */ | ||
2002 | static int swapops(ASMState *as, IRIns *ir) | ||
2003 | { | ||
2004 | IRIns *irl = IR(ir->op1); | ||
2005 | IRIns *irr = IR(ir->op2); | ||
2006 | lua_assert(ra_noreg(irr->r)); | ||
2007 | if (!irm_iscomm(lj_ir_mode[ir->o])) | ||
2008 | return 0; /* Can't swap non-commutative operations. */ | ||
2009 | if (irref_isk(ir->op2)) | ||
2010 | return 0; /* Don't swap constants to the left. */ | ||
2011 | if (ra_hasreg(irl->r)) | ||
2012 | return 1; /* Swap if left already has a register. */ | ||
2013 | if (ra_samehint(ir->r, irr->r)) | ||
2014 | return 1; /* Swap if dest and right have matching hints. */ | ||
2015 | if (ir->op1 < as->loopref && !irt_isphi(irl->t) && | ||
2016 | !(ir->op2 < as->loopref && !irt_isphi(irr->t))) | ||
2017 | return 1; /* Swap invariants to the right. */ | ||
2018 | if (opisfusableload(irl->o)) | ||
2019 | return 1; /* Swap fusable loads to the right. */ | ||
2020 | return 0; /* Otherwise don't swap. */ | ||
2021 | } | ||
2022 | |||
2023 | static void asm_fparith(ASMState *as, IRIns *ir, x86Op xo) | ||
2024 | { | ||
2025 | IRRef lref = ir->op1; | ||
2026 | IRRef rref = ir->op2; | ||
2027 | RegSet allow = RSET_FPR; | ||
2028 | Reg dest; | ||
2029 | Reg right = IR(rref)->r; | ||
2030 | if (ra_hasreg(right)) | ||
2031 | rset_clear(allow, right); | ||
2032 | dest = ra_dest(as, ir, allow); | ||
2033 | if (lref == rref) { | ||
2034 | right = dest; | ||
2035 | } else if (ra_noreg(right)) { | ||
2036 | if (swapops(as, ir)) { | ||
2037 | IRRef tmp = lref; lref = rref; rref = tmp; | ||
2038 | } | ||
2039 | right = asm_fuseload(as, rref, rset_clear(allow, dest)); | ||
2040 | } | ||
2041 | emit_mrm(as, xo, dest, right); | ||
2042 | ra_left(as, dest, lref); | ||
2043 | } | ||
2044 | |||
2045 | static void asm_intarith(ASMState *as, IRIns *ir, x86Arith xa) | ||
2046 | { | ||
2047 | IRRef lref = ir->op1; | ||
2048 | IRRef rref = ir->op2; | ||
2049 | RegSet allow = RSET_GPR; | ||
2050 | Reg dest, right; | ||
2051 | if (as->testmcp == as->mcp) { /* Drop test r,r instruction. */ | ||
2052 | as->testmcp = NULL; | ||
2053 | as->mcp += (LJ_64 && *as->mcp != XI_TEST) ? 3 : 2; | ||
2054 | } | ||
2055 | right = IR(rref)->r; | ||
2056 | if (ra_hasreg(right)) | ||
2057 | rset_clear(allow, right); | ||
2058 | dest = ra_dest(as, ir, allow); | ||
2059 | if (lref == rref) { | ||
2060 | right = dest; | ||
2061 | } else if (ra_noreg(right) && !irref_isk(rref)) { | ||
2062 | if (swapops(as, ir)) { | ||
2063 | IRRef tmp = lref; lref = rref; rref = tmp; | ||
2064 | } | ||
2065 | right = asm_fuseload(as, rref, rset_clear(allow, dest)); | ||
2066 | /* Note: fuses only with IR_FLOAD for now. */ | ||
2067 | } | ||
2068 | if (irt_isguard(ir->t)) /* For IR_ADDOV etc. */ | ||
2069 | asm_guardcc(as, CC_O); | ||
2070 | if (ra_hasreg(right)) | ||
2071 | emit_mrm(as, XO_ARITH(xa), dest, right); | ||
2072 | else | ||
2073 | emit_gri(as, XG_ARITHi(xa), dest, IR(ir->op2)->i); | ||
2074 | ra_left(as, dest, lref); | ||
2075 | } | ||
2076 | |||
2077 | /* LEA is really a 4-operand ADD with an independent destination register, | ||
2078 | ** up to two source registers and an immediate. One register can be scaled | ||
2079 | ** by 1, 2, 4 or 8. This can be used to avoid moves or to fuse several | ||
2080 | ** instructions. | ||
2081 | ** | ||
2082 | ** Currently only a few common cases are supported: | ||
2083 | ** - 3-operand ADD: y = a+b; y = a+k with a and b already allocated | ||
2084 | ** - Left ADD fusion: y = (a+b)+k; y = (a+k)+b | ||
2085 | ** - Right ADD fusion: y = a+(b+k) | ||
2086 | ** The omitted variants have already been reduced by FOLD. | ||
2087 | ** | ||
2088 | ** There are more fusion opportunities, like gathering shifts or joining | ||
2089 | ** common references. But these are probably not worth the trouble, since | ||
2090 | ** array indexing is not decomposed and already makes use of all fields | ||
2091 | ** of the ModRM operand. | ||
2092 | */ | ||
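/* E.g. y = a+b with both operands in registers may become lea y, [a+b],
** and y = (a+k)+b may become lea y, [a+b+k] with the constant k folded
** into the displacement.
*/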
2093 | static int asm_lea(ASMState *as, IRIns *ir) | ||
2094 | { | ||
2095 | IRIns *irl = IR(ir->op1); | ||
2096 | IRIns *irr = IR(ir->op2); | ||
2097 | RegSet allow = RSET_GPR; | ||
2098 | Reg dest; | ||
2099 | as->mrm.base = as->mrm.idx = RID_NONE; | ||
2100 | as->mrm.scale = XM_SCALE1; | ||
2101 | as->mrm.ofs = 0; | ||
2102 | if (ra_hasreg(irl->r)) { | ||
2103 | rset_clear(allow, irl->r); | ||
2104 | as->mrm.base = irl->r; | ||
2105 | if (irref_isk(ir->op2) || ra_hasreg(irr->r)) { | ||
2106 | /* The PHI renaming logic does a better job in some cases. */ | ||
2107 | if (ra_hasreg(ir->r) && | ||
2108 | ((irt_isphi(irl->t) && as->phireg[ir->r] == ir->op1) || | ||
2109 | (irt_isphi(irr->t) && as->phireg[ir->r] == ir->op2))) | ||
2110 | return 0; | ||
2111 | if (irref_isk(ir->op2)) { | ||
2112 | as->mrm.ofs = irr->i; | ||
2113 | } else { | ||
2114 | rset_clear(allow, irr->r); | ||
2115 | as->mrm.idx = irr->r; | ||
2116 | } | ||
2117 | } else if (irr->o == IR_ADD && mayfuse(as, ir->op2) && | ||
2118 | irref_isk(irr->op2)) { | ||
2119 | Reg idx = ra_alloc1(as, irr->op1, allow); | ||
2120 | rset_clear(allow, idx); | ||
2121 | as->mrm.idx = (uint8_t)idx; | ||
2122 | as->mrm.ofs = IR(irr->op2)->i; | ||
2123 | } else { | ||
2124 | return 0; | ||
2125 | } | ||
2126 | } else if (ir->op1 != ir->op2 && irl->o == IR_ADD && mayfuse(as, ir->op1) && | ||
2127 | (irref_isk(ir->op2) || irref_isk(irl->op2))) { | ||
2128 | Reg idx, base = ra_alloc1(as, irl->op1, allow); | ||
2129 | rset_clear(allow, base); | ||
2130 | as->mrm.base = (uint8_t)base; | ||
2131 | if (irref_isk(ir->op2)) { | ||
2132 | as->mrm.ofs = irr->i; | ||
2133 | idx = ra_alloc1(as, irl->op2, allow); | ||
2134 | } else { | ||
2135 | as->mrm.ofs = IR(irl->op2)->i; | ||
2136 | idx = ra_alloc1(as, ir->op2, allow); | ||
2137 | } | ||
2138 | rset_clear(allow, idx); | ||
2139 | as->mrm.idx = (uint8_t)idx; | ||
2140 | } else { | ||
2141 | return 0; | ||
2142 | } | ||
2143 | dest = ra_dest(as, ir, allow); | ||
2144 | emit_mrm(as, XO_LEA, dest, RID_MRM); | ||
2145 | return 1; /* Success. */ | ||
2146 | } | ||
2147 | |||
2148 | static void asm_add(ASMState *as, IRIns *ir) | ||
2149 | { | ||
2150 | if (irt_isnum(ir->t)) | ||
2151 | asm_fparith(as, ir, XO_ADDSD); | ||
2152 | else if ((as->flags & JIT_F_LEA_AGU) || as->testmcp == as->mcp || | ||
2153 | !asm_lea(as, ir)) | ||
2154 | asm_intarith(as, ir, XOg_ADD); | ||
2155 | } | ||
2156 | |||
2157 | static void asm_bitnot(ASMState *as, IRIns *ir) | ||
2158 | { | ||
2159 | Reg dest = ra_dest(as, ir, RSET_GPR); | ||
2160 | emit_rr(as, XO_GROUP3, XOg_NOT, dest); | ||
2161 | ra_left(as, dest, ir->op1); | ||
2162 | } | ||
2163 | |||
2164 | static void asm_bitswap(ASMState *as, IRIns *ir) | ||
2165 | { | ||
2166 | Reg dest = ra_dest(as, ir, RSET_GPR); | ||
2167 | MCode *p = as->mcp; | ||
2168 | p[-1] = (MCode)(XI_BSWAP+(dest&7)); | ||
2169 | p[-2] = 0x0f; | ||
2170 | p -= 2; | ||
2171 | REXRB(p, 0, dest); | ||
2172 | as->mcp = p; | ||
2173 | ra_left(as, dest, ir->op1); | ||
2174 | } | ||
2175 | |||
2176 | static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs) | ||
2177 | { | ||
2178 | IRRef rref = ir->op2; | ||
2179 | IRIns *irr = IR(rref); | ||
2180 | Reg dest; | ||
2181 | if (irref_isk(rref)) { /* Constant shifts. */ | ||
2182 | int shift; | ||
2183 | dest = ra_dest(as, ir, RSET_GPR); | ||
2184 | shift = irr->i & 31; /* Handle shifts of 0..31 bits. */ | ||
2185 | switch (shift) { | ||
2186 | case 0: return; | ||
2187 | case 1: emit_rr(as, XO_SHIFT1, (Reg)xs, dest); break; | ||
2188 | default: emit_shifti(as, xs, dest, shift); break; | ||
2189 | } | ||
2190 | } else { /* Variable shifts implicitly use register cl (i.e. ecx). */ | ||
2191 | RegSet allow = rset_exclude(RSET_GPR, RID_ECX); | ||
2192 | Reg right = irr->r; | ||
2193 | if (ra_noreg(right)) { | ||
2194 | right = ra_allocref(as, rref, RID2RSET(RID_ECX)); | ||
2195 | } else if (right != RID_ECX) { | ||
2196 | rset_clear(allow, right); | ||
2197 | ra_scratch(as, RID2RSET(RID_ECX)); | ||
2198 | } | ||
2199 | dest = ra_dest(as, ir, allow); | ||
2200 | emit_rr(as, XO_SHIFTcl, (Reg)xs, dest); | ||
2201 | if (right != RID_ECX) | ||
2202 | emit_rr(as, XO_MOV, RID_ECX, right); | ||
2203 | } | ||
2204 | ra_left(as, dest, ir->op1); | ||
2205 | /* | ||
2206 | ** Note: avoid using the flags resulting from a shift or rotate! | ||
2207 | ** All of them cause a partial flag stall, except for r,1 shifts | ||
2208 | ** (but not rotates). And a shift count of 0 leaves the flags unmodified. | ||
2209 | */ | ||
2210 | } | ||
2211 | |||
2212 | /* -- Comparisons --------------------------------------------------------- */ | ||
2213 | |||
2214 | /* Virtual flags for unordered FP comparisons. */ | ||
2215 | #define VCC_U 0x100 /* Unordered. */ | ||
2216 | #define VCC_P 0x200 /* Needs extra CC_P branch. */ | ||
2217 | #define VCC_S 0x400 /* Swap avoids CC_P branch. */ | ||
2218 | #define VCC_PS (VCC_P|VCC_S) | ||
2219 | |||
2220 | static void asm_comp_(ASMState *as, IRIns *ir, int cc) | ||
2221 | { | ||
2222 | if (irt_isnum(ir->t)) { | ||
2223 | IRRef lref = ir->op1; | ||
2224 | IRRef rref = ir->op2; | ||
2225 | Reg left, right; | ||
2226 | MCLabel l_around; | ||
2227 | /* | ||
2228 | ** An extra CC_P branch is required to preserve ordered/unordered | ||
2229 | ** semantics for FP comparisons. This can be avoided by swapping | ||
2230 | ** the operands and inverting the condition (except for EQ and UNE). | ||
2231 | ** So always try to swap if possible. | ||
2232 | ** | ||
2233 | ** Another option would be to swap operands to achieve better memory | ||
2234 | ** operand fusion. But it's unlikely that this outweighs the cost | ||
2235 | ** of the extra branches. | ||
2236 | */ | ||
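/* E.g. for x < y the direct form ucomisd x, y sets CF for both x < y and
** unordered operands, so a jb needs an extra jp to tell NaN apart. The
** swapped form ucomisd y, x with ja is simply false for unordered operands.
*/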
2237 | if (cc & VCC_S) { /* Swap? */ | ||
2238 | IRRef tmp = lref; lref = rref; rref = tmp; | ||
2239 | cc ^= (VCC_PS|(5<<4)); /* A <-> B, AE <-> BE, PS <-> none */ | ||
2240 | } | ||
2241 | left = ra_alloc1(as, lref, RSET_FPR); | ||
2242 | right = asm_fuseload(as, rref, rset_exclude(RSET_FPR, left)); | ||
2243 | l_around = emit_label(as); | ||
2244 | asm_guardcc(as, cc >> 4); | ||
2245 | if (cc & VCC_P) { /* Extra CC_P branch required? */ | ||
2246 | if (!(cc & VCC_U)) { | ||
2247 | asm_guardcc(as, CC_P); /* Branch to exit for ordered comparisons. */ | ||
2248 | } else if (l_around != as->invmcp) { | ||
2249 | emit_sjcc(as, CC_P, l_around); /* Branch around for unordered. */ | ||
2250 | } else { | ||
2251 | /* Patched to mcloop by asm_loop_fixup. */ | ||
2252 | as->loopinv = 2; | ||
2253 | if (as->realign) | ||
2254 | emit_sjcc(as, CC_P, as->mcp); | ||
2255 | else | ||
2256 | emit_jcc(as, CC_P, as->mcp); | ||
2257 | } | ||
2258 | } | ||
2259 | emit_mrm(as, XO_UCOMISD, left, right); | ||
2260 | } else if (!(irt_isstr(ir->t) && (cc & 0xe) != CC_E)) { | ||
2261 | IRRef lref = ir->op1, rref = ir->op2; | ||
2262 | IROp leftop = (IROp)(IR(lref)->o); | ||
2263 | lua_assert(irt_isint(ir->t) || irt_isaddr(ir->t)); | ||
2264 | /* Swap constants (only for ABC) and fusable loads to the right. */ | ||
2265 | if (irref_isk(lref) || (!irref_isk(rref) && opisfusableload(leftop))) { | ||
2266 | if ((cc & 0xc) == 0xc) cc ^= 3; /* L <-> G, LE <-> GE */ | ||
2267 | else if ((cc & 0xa) == 0x2) cc ^= 5; /* A <-> B, AE <-> BE */ | ||
2268 | lref = ir->op2; rref = ir->op1; | ||
2269 | } | ||
2270 | if (irref_isk(rref)) { | ||
2271 | IRIns *irl = IR(lref); | ||
2272 | int32_t imm = IR(rref)->i; | ||
2273 | /* Check whether we can use test ins. Not for unsigned, since CF=0. */ | ||
2274 | int usetest = (imm == 0 && (cc & 0xa) != 0x2); | ||
2275 | if (usetest && irl->o == IR_BAND && irl+1 == ir && !ra_used(irl)) { | ||
2276 | /* Combine comp(BAND(ref, r/imm), 0) into test mrm, r/imm. */ | ||
2277 | Reg right, left = RID_NONE; | ||
2278 | RegSet allow = RSET_GPR; | ||
2279 | if (!irref_isk(irl->op2)) { | ||
2280 | left = ra_alloc1(as, irl->op2, allow); | ||
2281 | rset_clear(allow, left); | ||
2282 | } | ||
2283 | right = asm_fuseload(as, irl->op1, allow); | ||
2284 | asm_guardcc(as, cc); | ||
2285 | if (irref_isk(irl->op2)) { | ||
2286 | emit_i32(as, IR(irl->op2)->i); | ||
2287 | emit_mrm(as, XO_GROUP3, XOg_TEST, right); | ||
2288 | } else { | ||
2289 | emit_mrm(as, XO_TEST, left, right); | ||
2290 | } | ||
2291 | } else { | ||
2292 | Reg left; | ||
2293 | if (opisfusableload((IROp)irl->o) && | ||
2294 | ((irt_isi8(irl->t) && checki8(imm)) || | ||
2295 | (irt_isu8(irl->t) && checku8(imm)))) { | ||
2296 | /* Only the IRT_INT case is fused by asm_fuseload. The IRT_I8/IRT_U8 | ||
2297 | ** loads are handled here. The IRT_I16/IRT_U16 loads should never be | ||
2298 | ** fused, since cmp word [mem], imm16 has a length-changing prefix. | ||
2299 | */ | ||
2300 | IRType1 origt = irl->t; /* Temporarily flip types. */ | ||
2301 | irl->t.irt = (irl->t.irt & ~IRT_TYPE) | IRT_INT; | ||
2302 | left = asm_fuseload(as, lref, RSET_GPR); | ||
2303 | irl->t = origt; | ||
2304 | if (left == RID_MRM) { /* Fusion succeeded? */ | ||
2305 | asm_guardcc(as, cc); | ||
2306 | emit_i8(as, imm); | ||
2307 | emit_mrm(as, XO_ARITHib, XOg_CMP, RID_MRM); | ||
2308 | return; | ||
2309 | } /* Otherwise handle register case as usual. */ | ||
2310 | } else { | ||
2311 | left = asm_fuseload(as, lref, RSET_GPR); | ||
2312 | } | ||
2313 | asm_guardcc(as, cc); | ||
2314 | if (usetest && left != RID_MRM) { | ||
2315 | /* Use test r,r instead of cmp r,0. */ | ||
2316 | if (irl+1 == ir) /* Referencing previous ins? */ | ||
2317 | as->testmcp = as->mcp; /* Set flag to drop test r,r if possible. */ | ||
2318 | emit_rr(as, XO_TEST, left, left); | ||
2319 | } else { | ||
2320 | x86Op xo; | ||
2321 | if (checki8(imm)) { | ||
2322 | emit_i8(as, imm); | ||
2323 | xo = XO_ARITHi8; | ||
2324 | } else { | ||
2325 | emit_i32(as, imm); | ||
2326 | xo = XO_ARITHi; | ||
2327 | } | ||
2328 | emit_mrm(as, xo, XOg_CMP, left); | ||
2329 | } | ||
2330 | } | ||
2331 | } else { | ||
2332 | Reg left = ra_alloc1(as, lref, RSET_GPR); | ||
2333 | Reg right = asm_fuseload(as, rref, rset_exclude(RSET_GPR, left)); | ||
2334 | asm_guardcc(as, cc); | ||
2335 | emit_mrm(as, XO_CMP, left, right); | ||
2336 | } | ||
2337 | } else { /* Handle ordered string compares. */ | ||
2338 | RegSet allow = RSET_GPR; | ||
2339 | /* This assumes lj_str_cmp never uses any SSE registers. */ | ||
2340 | ra_evictset(as, (RSET_SCRATCH & RSET_GPR)); | ||
2341 | asm_guardcc(as, cc); | ||
2342 | emit_rr(as, XO_TEST, RID_RET, RID_RET); | ||
2343 | emit_call(as, lj_str_cmp); /* int32_t lj_str_cmp(GCstr *a, GCstr *b) */ | ||
2344 | if (irref_isk(ir->op1)) { | ||
2345 | emit_setargi(as, 1, IR(ir->op1)->i); | ||
2346 | } else { | ||
2347 | Reg left = ra_alloc1(as, ir->op1, allow); | ||
2348 | rset_clear(allow, left); | ||
2349 | emit_setargr(as, 1, left); | ||
2350 | } | ||
2351 | if (irref_isk(ir->op2)) { | ||
2352 | emit_setargi(as, 2, IR(ir->op2)->i); | ||
2353 | } else { | ||
2354 | Reg right = ra_alloc1(as, ir->op2, allow); | ||
2355 | emit_setargr(as, 2, right); | ||
2356 | } | ||
2357 | } | ||
2358 | } | ||
2359 | |||
2360 | #define asm_comp(as, ir, ci, cf, cu) \ | ||
2361 | asm_comp_(as, ir, (ci)+((cf)<<4)+(cu)) | ||
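/* The packed cc thus holds the integer condition in the low nibble, the
** FP condition in the next nibble and the VCC_* flags above that.
*/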
2362 | |||
2363 | /* -- GC handling --------------------------------------------------------- */ | ||
2364 | |||
2365 | /* Sync all live GC values to Lua stack slots. */ | ||
2366 | static void asm_gc_sync(ASMState *as, SnapShot *snap, Reg base, RegSet allow) | ||
2367 | { | ||
2368 | IRRef2 *map = &as->T->snapmap[snap->mapofs]; | ||
2369 | BCReg s, nslots = snap->nslots; | ||
2370 | for (s = 0; s < nslots; s++) { | ||
2371 | IRRef ref = snap_ref(map[s]); | ||
2372 | if (!irref_isk(ref)) { | ||
2373 | IRIns *ir = IR(ref); | ||
2374 | if (ir->o == IR_FRAME) { | ||
2375 | /* NYI: sync the frame, bump base, set topslot, clear new slots. */ | ||
2376 | lj_trace_err(as->J, LJ_TRERR_NYIGCF); | ||
2377 | } else if (irt_isgcv(ir->t) && | ||
2378 | !(ir->o == IR_SLOAD && ir->op1 < nslots && map[ir->op1] == 0)) { | ||
2379 | Reg src = ra_alloc1(as, ref, allow); | ||
2380 | int32_t ofs = 8*(int32_t)(s-1); | ||
2381 | emit_movtomro(as, src, base, ofs); | ||
2382 | emit_movmroi(as, base, ofs+4, irt_toitype(ir->t)); | ||
2383 | checkmclim(as); | ||
2384 | } | ||
2385 | } | ||
2386 | } | ||
2387 | } | ||
2388 | |||
2389 | /* Check GC threshold and do one or more GC steps. */ | ||
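/* In execution order this roughly emits: load g->gc.total, compare it with
** g->gc.threshold and jump around the step if below; otherwise sync live
** GC values to the stack, store L->base and call lj_gc_step_jit().
*/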
2390 | static void asm_gc_check(ASMState *as, SnapShot *snap) | ||
2391 | { | ||
2392 | MCLabel l_end; | ||
2393 | const BCIns *pc; | ||
2394 | Reg tmp, base; | ||
2395 | RegSet drop = RSET_SCRATCH; | ||
2396 | /* Must evict BASE because the stack may be reallocated by the GC. */ | ||
2397 | if (ra_hasreg(IR(REF_BASE)->r)) | ||
2398 | drop |= RID2RSET(IR(REF_BASE)->r); | ||
2399 | ra_evictset(as, drop); | ||
2400 | base = ra_alloc1(as, REF_BASE, rset_exclude(RSET_GPR, RID_RET)); | ||
2401 | l_end = emit_label(as); | ||
2402 | /* void lj_gc_step_jit(lua_State *L, const BCIns *pc, MSize steps) */ | ||
2403 | emit_call(as, lj_gc_step_jit); | ||
2404 | emit_movtomro(as, base, RID_RET, offsetof(lua_State, base)); | ||
2405 | emit_setargr(as, 1, RID_RET); | ||
2406 | emit_setargi(as, 3, (int32_t)as->gcsteps); | ||
2407 | emit_getgl(as, RID_RET, jit_L); | ||
2408 | pc = (const BCIns *)(uintptr_t)as->T->snapmap[snap->mapofs+snap->nslots]; | ||
2409 | emit_setargp(as, 2, pc); | ||
2410 | asm_gc_sync(as, snap, base, rset_exclude(RSET_SCRATCH & RSET_GPR, base)); | ||
2411 | if (as->curins == as->loopref) /* BASE gets restored by LOOP anyway. */ | ||
2412 | ra_restore(as, REF_BASE); /* Better do it inside the slow path. */ | ||
2413 | /* Jump around GC step if GC total < GC threshold. */ | ||
2414 | tmp = ra_scratch(as, RSET_SCRATCH & RSET_GPR); | ||
2415 | emit_sjcc(as, CC_B, l_end); | ||
2416 | emit_opgl(as, XO_ARITH(XOg_CMP), tmp, gc.threshold); | ||
2417 | emit_getgl(as, tmp, gc.total); | ||
2418 | as->gcsteps = 0; | ||
2419 | checkmclim(as); | ||
2420 | } | ||
2421 | |||
2422 | /* -- PHI and loop handling ----------------------------------------------- */ | ||
2423 | |||
2424 | /* Break a PHI cycle by renaming to a free register (evict if needed). */ | ||
2425 | static void asm_phi_break(ASMState *as, RegSet blocked, RegSet blockedby, | ||
2426 | RegSet allow) | ||
2427 | { | ||
2428 | RegSet candidates = blocked & allow; | ||
2429 | if (candidates) { /* If this register file has candidates. */ | ||
2430 | /* Note: the set for ra_pick cannot be empty, since each register file | ||
2431 | ** has some registers never allocated to PHIs. | ||
2432 | */ | ||
2433 | Reg down, up = ra_pick(as, ~blocked & allow); /* Get a free register. */ | ||
2434 | if (candidates & ~blockedby) /* Optimize shifts, else it's a cycle. */ | ||
2435 | candidates = candidates & ~blockedby; | ||
2436 | down = rset_picktop(candidates); /* Pick candidate PHI register. */ | ||
2437 | ra_rename(as, down, up); /* And rename it to the free register. */ | ||
2438 | } | ||
2439 | } | ||
2440 | |||
2441 | /* PHI register shuffling. | ||
2442 | ** | ||
2443 | ** The allocator tries hard to preserve PHI register assignments across | ||
2444 | ** the loop body. Most of the time this loop does nothing, since there | ||
2445 | ** are no register mismatches. | ||
2446 | ** | ||
2447 | ** If a register mismatch is detected and ... | ||
2448 | ** - the register is currently free: rename it. | ||
2449 | ** - the register is blocked by an invariant: restore/remat and rename it. | ||
2450 | ** - Otherwise the register is used by another PHI, so mark it as blocked. | ||
2451 | ** | ||
2452 | ** The renames are order-sensitive, so just retry the loop if a register | ||
2453 | ** is marked as blocked, but has been freed in the meantime. A cycle is | ||
2454 | ** detected if all of the blocked registers are allocated. To break the | ||
2455 | ** cycle rename one of them to a free register and retry. | ||
2456 | ** | ||
2457 | ** Note that PHI spill slots are kept in sync and don't need to be shuffled. | ||
2458 | */ | ||
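/* For example, if a left PHI value currently sits in edx but its PHI
** register is eax, and eax is held by an invariant, the invariant is
** restored first and edx is then renamed to eax for the loop body.
*/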
2459 | static void asm_phi_shuffle(ASMState *as) | ||
2460 | { | ||
2461 | RegSet work; | ||
2462 | |||
2463 | /* Find and resolve PHI register mismatches. */ | ||
2464 | for (;;) { | ||
2465 | RegSet blocked = RSET_EMPTY; | ||
2466 | RegSet blockedby = RSET_EMPTY; | ||
2467 | RegSet phiset = as->phiset; | ||
2468 | while (phiset) { /* Check all left PHI operand registers. */ | ||
2469 | Reg r = rset_picktop(phiset); | ||
2470 | IRIns *irl = IR(as->phireg[r]); | ||
2471 | Reg left = irl->r; | ||
2472 | if (r != left) { /* Mismatch? */ | ||
2473 | if (!rset_test(as->freeset, r)) { /* PHI register blocked? */ | ||
2474 | IRRef ref = regcost_ref(as->cost[r]); | ||
2475 | if (irt_ismarked(IR(ref)->t)) { /* Blocked by other PHI (w/reg)? */ | ||
2476 | rset_set(blocked, r); | ||
2477 | if (ra_hasreg(left)) | ||
2478 | rset_set(blockedby, left); | ||
2479 | left = RID_NONE; | ||
2480 | } else { /* Otherwise grab register from invariant. */ | ||
2481 | ra_restore(as, ref); | ||
2482 | checkmclim(as); | ||
2483 | } | ||
2484 | } | ||
2485 | if (ra_hasreg(left)) { | ||
2486 | ra_rename(as, left, r); | ||
2487 | checkmclim(as); | ||
2488 | } | ||
2489 | } | ||
2490 | rset_clear(phiset, r); | ||
2491 | } | ||
2492 | if (!blocked) break; /* Finished. */ | ||
2493 | if (!(as->freeset & blocked)) { /* Break cycles if none are free. */ | ||
2494 | asm_phi_break(as, blocked, blockedby, RSET_GPR); | ||
2495 | asm_phi_break(as, blocked, blockedby, RSET_FPR); | ||
2496 | checkmclim(as); | ||
2497 | } /* Else retry some more renames. */ | ||
2498 | } | ||
2499 | |||
2500 | /* Restore/remat invariants whose registers are modified inside the loop. */ | ||
2501 | work = as->modset & ~(as->freeset | as->phiset); | ||
2502 | while (work) { | ||
2503 | Reg r = rset_picktop(work); | ||
2504 | ra_restore(as, regcost_ref(as->cost[r])); | ||
2505 | rset_clear(work, r); | ||
2506 | checkmclim(as); | ||
2507 | } | ||
2508 | |||
2509 | /* Allocate and save all unsaved PHI regs and clear marks. */ | ||
2510 | work = as->phiset; | ||
2511 | while (work) { | ||
2512 | Reg r = rset_picktop(work); | ||
2513 | IRRef lref = as->phireg[r]; | ||
2514 | IRIns *ir = IR(lref); | ||
2515 | if (ra_hasspill(ir->s)) { /* Left PHI gained a spill slot? */ | ||
2516 | irt_clearmark(ir->t); /* Handled here, so clear marker now. */ | ||
2517 | ra_alloc1(as, lref, RID2RSET(r)); | ||
2518 | ra_save(as, ir, r); /* Save to spill slot inside the loop. */ | ||
2519 | checkmclim(as); | ||
2520 | } | ||
2521 | rset_clear(work, r); | ||
2522 | } | ||
2523 | } | ||
2524 | |||
2525 | /* Emit renames for left PHIs which are only spilled outside the loop. */ | ||
2526 | static void asm_phi_fixup(ASMState *as) | ||
2527 | { | ||
2528 | RegSet work = as->phiset; | ||
2529 | while (work) { | ||
2530 | Reg r = rset_picktop(work); | ||
2531 | IRRef lref = as->phireg[r]; | ||
2532 | IRIns *ir = IR(lref); | ||
2533 | /* Left PHI gained a spill slot before the loop? */ | ||
2534 | if (irt_ismarked(ir->t) && ra_hasspill(ir->s)) { | ||
2535 | IRRef ren; | ||
2536 | lj_ir_set(as->J, IRT(IR_RENAME, IRT_NIL), lref, as->loopsnapno); | ||
2537 | ren = tref_ref(lj_ir_emit(as->J)); | ||
2538 | as->ir = as->T->ir; /* The IR may have been reallocated. */ | ||
2539 | IR(ren)->r = (uint8_t)r; | ||
2540 | IR(ren)->s = SPS_NONE; | ||
2541 | } | ||
2542 | irt_clearmark(ir->t); /* Always clear marker. */ | ||
2543 | rset_clear(work, r); | ||
2544 | } | ||
2545 | } | ||
2546 | |||
2547 | /* Set up the right PHI reference. */ | ||
2548 | static void asm_phi(ASMState *as, IRIns *ir) | ||
2549 | { | ||
2550 | RegSet allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR; | ||
2551 | RegSet afree = (as->freeset & allow); | ||
2552 | IRIns *irl = IR(ir->op1); | ||
2553 | IRIns *irr = IR(ir->op2); | ||
2554 | /* Spill slot shuffling is not implemented yet (but rarely needed). */ | ||
2555 | if (ra_hasspill(irl->s) || ra_hasspill(irr->s)) | ||
2556 | lj_trace_err(as->J, LJ_TRERR_NYIPHI); | ||
2557 | /* Leave at least one register free for non-PHIs (and PHI cycle breaking). */ | ||
2558 | if ((afree & (afree-1))) { /* Two or more free registers? */ | ||
2559 | Reg r; | ||
2560 | if (ra_noreg(irr->r)) { /* Get a register for the right PHI. */ | ||
2561 | r = ra_allocref(as, ir->op2, allow); | ||
2562 | } else { /* Duplicate right PHI, need a copy (rare). */ | ||
2563 | r = ra_scratch(as, allow); | ||
2564 | emit_movrr(as, r, irr->r); | ||
2565 | } | ||
2566 | ir->r = (uint8_t)r; | ||
2567 | rset_set(as->phiset, r); | ||
2568 | as->phireg[r] = (IRRef1)ir->op1; | ||
2569 | irt_setmark(irl->t); /* Marks left PHIs _with_ register. */ | ||
2570 | if (ra_noreg(irl->r)) | ||
2571 | ra_sethint(irl->r, r); /* Set register hint for left PHI. */ | ||
2572 | } else { /* Otherwise allocate a spill slot. */ | ||
2573 | /* This is overly restrictive, but it triggers only on synthetic code. */ | ||
2574 | if (ra_hasreg(irl->r) || ra_hasreg(irr->r)) | ||
2575 | lj_trace_err(as->J, LJ_TRERR_NYIPHI); | ||
2576 | ra_spill(as, ir); | ||
2577 | irl->s = irr->s = ir->s; /* Sync left/right PHI spill slots. */ | ||
2578 | } | ||
2579 | } | ||
2580 | |||
2581 | /* Fixup the loop branch. */ | ||
2582 | static void asm_loop_fixup(ASMState *as) | ||
2583 | { | ||
2584 | MCode *p = as->mctop; | ||
2585 | MCode *target = as->mcp; | ||
2586 | if (as->realign) { /* Realigned loops use short jumps. */ | ||
2587 | as->realign = NULL; /* Stop another retry. */ | ||
2588 | lua_assert(((intptr_t)target & 15) == 0); | ||
2589 | if (as->loopinv) { /* Inverted loop branch? */ | ||
2590 | p -= 5; | ||
2591 | p[0] = XI_JMP; | ||
2592 | lua_assert(target - p >= -128); | ||
2593 | p[-1] = (MCode)(target - p); /* Patch sjcc. */ | ||
2594 | if (as->loopinv == 2) | ||
2595 | p[-3] = (MCode)(target - p + 2); /* Patch opt. short jp. */ | ||
2596 | } else { | ||
2597 | lua_assert(target - p >= -128); | ||
2598 | p[-1] = (MCode)(int8_t)(target - p); /* Patch short jmp. */ | ||
2599 | p[-2] = XI_JMPs; | ||
2600 | } | ||
2601 | } else { | ||
2602 | MCode *newloop; | ||
2603 | p[-5] = XI_JMP; | ||
2604 | if (as->loopinv) { /* Inverted loop branch? */ | ||
2605 | /* asm_guardcc already inverted the jcc and patched the jmp. */ | ||
2606 | p -= 5; | ||
2607 | newloop = target+4; | ||
2608 | *(int32_t *)(p-4) = (int32_t)(target - p); /* Patch jcc. */ | ||
2609 | if (as->loopinv == 2) { | ||
2610 | *(int32_t *)(p-10) = (int32_t)(target - p + 6); /* Patch opt. jp. */ | ||
2611 | newloop = target+8; | ||
2612 | } | ||
2613 | } else { /* Otherwise just patch jmp. */ | ||
2614 | *(int32_t *)(p-4) = (int32_t)(target - p); | ||
2615 | newloop = target+3; | ||
2616 | } | ||
2617 | /* Realign small loops and shorten the loop branch. */ | ||
2618 | if (newloop >= p - 128) { | ||
2619 | as->realign = newloop; /* Force a retry and remember alignment. */ | ||
2620 | as->curins = as->stopins; /* Abort asm_trace now. */ | ||
2621 | as->T->nins = as->orignins; /* Remove any added renames. */ | ||
2622 | } | ||
2623 | } | ||
2624 | } | ||
2625 | |||
2626 | /* Middle part of a loop. */ | ||
2627 | static void asm_loop(ASMState *as) | ||
2628 | { | ||
2629 | /* LOOP is a guard, so the snapno is up to date. */ | ||
2630 | as->loopsnapno = as->snapno; | ||
2631 | if (as->gcsteps) | ||
2632 | asm_gc_check(as, &as->T->snap[as->loopsnapno]); | ||
2633 | /* LOOP marks the transition from the variant to the invariant part. */ | ||
2634 | as->testmcp = as->invmcp = NULL; | ||
2635 | as->sectref = 0; | ||
2636 | if (!neverfuse(as)) as->fuseref = 0; | ||
2637 | asm_phi_shuffle(as); | ||
2638 | asm_loop_fixup(as); | ||
2639 | as->mcloop = as->mcp; | ||
2640 | RA_DBGX((as, "===== LOOP =====")); | ||
2641 | if (!as->realign) RA_DBG_FLUSH(); | ||
2642 | } | ||
2643 | |||
2644 | /* -- Head of trace ------------------------------------------------------- */ | ||
2645 | |||
2646 | /* Rematerialize all remaining constants in registers. */ | ||
2647 | static void asm_const_remat(ASMState *as) | ||
2648 | { | ||
2649 | RegSet work = ~as->freeset & RSET_ALL; | ||
2650 | while (work) { | ||
2651 | Reg r = rset_pickbot(work); | ||
2652 | IRRef ref = regcost_ref(as->cost[r]); | ||
2653 | if (irref_isk(ref) || ref == REF_BASE) { | ||
2654 | ra_rematk(as, IR(ref)); | ||
2655 | checkmclim(as); | ||
2656 | } | ||
2657 | rset_clear(work, r); | ||
2658 | } | ||
2659 | } | ||
2660 | |||
2661 | /* Head of a root trace. */ | ||
2662 | static void asm_head_root(ASMState *as) | ||
2663 | { | ||
2664 | int32_t spadj; | ||
2665 | emit_setgli(as, vmstate, (int32_t)as->J->curtrace); | ||
2666 | spadj = sps_adjust(as); | ||
2667 | as->T->spadjust = (uint16_t)spadj; | ||
2668 | emit_addptr(as, RID_ESP, -spadj); | ||
2669 | } | ||
2670 | |||
2671 | /* Handle BASE coalescing for a root trace. */ | ||
2672 | static void asm_head_base(ASMState *as) | ||
2673 | { | ||
2674 | IRIns *ir = IR(REF_BASE); | ||
2675 | Reg r = ir->r; | ||
2676 | lua_assert(ra_hasreg(r) && !ra_hasspill(ir->s)); | ||
2677 | ra_free(as, r); | ||
2678 | if (r != RID_BASE) { | ||
2679 | ra_scratch(as, RID2RSET(RID_BASE)); | ||
2680 | emit_rr(as, XO_MOV, r, RID_BASE); | ||
2681 | } | ||
2682 | } | ||
2683 | |||
2684 | /* Check Lua stack size for overflow at the start of a side trace. | ||
2685 | ** Stack overflow is rare, so let the regular exit handling fix this up. | ||
2686 | ** This is done in the context of the *parent* trace and parent exitno! | ||
2687 | */ | ||
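/* In execution order this roughly checks G->jit_L->maxstack minus
** G->jit_base against 8*topslot and takes the parent exit stub if the
** remaining stack space is too small.
*/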
2688 | static void asm_checkstack(ASMState *as, RegSet allow) | ||
2689 | { | ||
2690 | /* Try to get an unused temp. register, otherwise spill/restore eax. */ | ||
2691 | Reg r = allow ? rset_pickbot(allow) : RID_EAX; | ||
2692 | emit_jcc(as, CC_B, exitstub_addr(as->J, as->J->exitno)); | ||
2693 | if (allow == RSET_EMPTY) /* Restore temp. register. */ | ||
2694 | emit_rmro(as, XO_MOV, r, RID_ESP, sps_scale(SPS_TEMP1)); | ||
2695 | emit_gri(as, XG_ARITHi(XOg_CMP), r, (int32_t)(8*as->topslot)); | ||
2696 | emit_rmro(as, XO_ARITH(XOg_SUB), r, RID_NONE, ptr2addr(&J2G(as->J)->jit_base)); | ||
2697 | emit_rmro(as, XO_MOV, r, r, offsetof(lua_State, maxstack)); | ||
2698 | emit_getgl(as, r, jit_L); | ||
2699 | if (allow == RSET_EMPTY) /* Spill temp. register. */ | ||
2700 | emit_rmro(as, XO_MOVto, r, RID_ESP, sps_scale(SPS_TEMP1)); | ||
2701 | } | ||
2702 | |||
2703 | /* Head of a side trace. | ||
2704 | ** | ||
2705 | ** The current simplistic algorithm requires that all slots inherited | ||
2706 | ** from the parent are live in a register between pass 2 and pass 3. This | ||
2707 | ** avoids the complexity of stack slot shuffling. But of course this may | ||
2708 | ** overflow the register set in some cases and cause the dreaded error: | ||
2709 | ** "NYI: register coalescing too complex". A refined algorithm is needed. | ||
2710 | */ | ||
2711 | static void asm_head_side(ASMState *as) | ||
2712 | { | ||
2713 | IRRef1 sloadins[RID_MAX]; | ||
2714 | RegSet allow = RSET_ALL; /* Inverse of all coalesced registers. */ | ||
2715 | RegSet live = RSET_EMPTY; /* Live parent registers. */ | ||
2716 | int32_t spadj, spdelta; | ||
2717 | int pass2 = 0; | ||
2718 | int pass3 = 0; | ||
2719 | IRRef i; | ||
2720 | |||
2721 | /* Scan all parent SLOADs and collect register dependencies. */ | ||
2722 | for (i = as->curins; i > REF_BASE; i--) { | ||
2723 | IRIns *ir = IR(i); | ||
2724 | lua_assert((ir->o == IR_SLOAD && (ir->op2 & IRSLOAD_PARENT)) || | ||
2725 | ir->o == IR_FRAME); | ||
2726 | if (ir->o == IR_SLOAD) { | ||
2727 | RegSP rs = as->parentmap[ir->op1]; | ||
2728 | if (ra_hasreg(ir->r)) { | ||
2729 | rset_clear(allow, ir->r); | ||
2730 | if (ra_hasspill(ir->s)) | ||
2731 | ra_save(as, ir, ir->r); | ||
2732 | } else if (ra_hasspill(ir->s)) { | ||
2733 | irt_setmark(ir->t); | ||
2734 | pass2 = 1; | ||
2735 | } | ||
2736 | if (ir->r == rs) { /* Coalesce matching registers right now. */ | ||
2737 | ra_free(as, ir->r); | ||
2738 | } else if (ra_hasspill(regsp_spill(rs))) { | ||
2739 | if (ra_hasreg(ir->r)) | ||
2740 | pass3 = 1; | ||
2741 | } else if (ra_used(ir)) { | ||
2742 | sloadins[rs] = (IRRef1)i; | ||
2743 | rset_set(live, rs); /* Block live parent register. */ | ||
2744 | } | ||
2745 | } | ||
2746 | } | ||
2747 | |||
2748 | /* Calculate stack frame adjustment. */ | ||
2749 | spadj = sps_adjust(as); | ||
2750 | spdelta = spadj - (int32_t)as->parent->spadjust; | ||
2751 | if (spdelta < 0) { /* Don't shrink the stack frame. */ | ||
2752 | spadj = (int32_t)as->parent->spadjust; | ||
2753 | spdelta = 0; | ||
2754 | } | ||
2755 | as->T->spadjust = (uint16_t)spadj; | ||
2756 | |||
2757 | /* Reload spilled target registers. */ | ||
2758 | if (pass2) { | ||
2759 | for (i = as->curins; i > REF_BASE; i--) { | ||
2760 | IRIns *ir = IR(i); | ||
2761 | if (irt_ismarked(ir->t)) { | ||
2762 | RegSet mask; | ||
2763 | Reg r; | ||
2764 | RegSP rs; | ||
2765 | irt_clearmark(ir->t); | ||
2766 | rs = as->parentmap[ir->op1]; | ||
2767 | if (!ra_hasspill(regsp_spill(rs))) | ||
2768 | ra_sethint(ir->r, rs); /* Hint may be gone, set it again. */ | ||
2769 | else if (sps_scale(regsp_spill(rs))+spdelta == sps_scale(ir->s)) | ||
2770 | continue; /* Same spill slot, do nothing. */ | ||
2771 | mask = (irt_isnum(ir->t) ? RSET_FPR : RSET_GPR) & allow; | ||
2772 | if (mask == RSET_EMPTY) | ||
2773 | lj_trace_err(as->J, LJ_TRERR_NYICOAL); | ||
2774 | r = ra_allocref(as, i, mask); | ||
2775 | ra_save(as, ir, r); | ||
2776 | rset_clear(allow, r); | ||
2777 | if (r == rs) { /* Coalesce matching registers right now. */ | ||
2778 | ra_free(as, r); | ||
2779 | rset_clear(live, r); | ||
2780 | } else if (ra_hasspill(regsp_spill(rs))) { | ||
2781 | pass3 = 1; | ||
2782 | } | ||
2783 | checkmclim(as); | ||
2784 | } | ||
2785 | } | ||
2786 | } | ||
2787 | |||
2788 | /* Store trace number and adjust stack frame relative to the parent. */ | ||
2789 | emit_setgli(as, vmstate, (int32_t)as->J->curtrace); | ||
2790 | emit_addptr(as, RID_ESP, -spdelta); | ||
2791 | |||
2792 | /* Restore target registers from parent spill slots. */ | ||
2793 | if (pass3) { | ||
2794 | RegSet work = ~as->freeset & RSET_ALL; | ||
2795 | while (work) { | ||
2796 | Reg r = rset_pickbot(work); | ||
2797 | IRIns *ir = IR(regcost_ref(as->cost[r])); | ||
2798 | RegSP rs = as->parentmap[ir->op1]; | ||
2799 | rset_clear(work, r); | ||
2800 | if (ra_hasspill(regsp_spill(rs))) { | ||
2801 | int32_t ofs = sps_scale(regsp_spill(rs)); | ||
2802 | ra_free(as, r); | ||
2803 | emit_movrmro(as, r, RID_ESP, ofs); | ||
2804 | checkmclim(as); | ||
2805 | } | ||
2806 | } | ||
2807 | } | ||
2808 | |||
2809 | /* Shuffle registers to match up target regs with parent regs. */ | ||
2810 | for (;;) { | ||
2811 | RegSet work; | ||
2812 | |||
2813 | /* Repeatedly coalesce free live registers by moving to their target. */ | ||
2814 | while ((work = as->freeset & live) != RSET_EMPTY) { | ||
2815 | Reg rp = rset_pickbot(work); | ||
2816 | IRIns *ir = IR(sloadins[rp]); | ||
2817 | rset_clear(live, rp); | ||
2818 | rset_clear(allow, rp); | ||
2819 | ra_free(as, ir->r); | ||
2820 | emit_movrr(as, ir->r, rp); | ||
2821 | checkmclim(as); | ||
2822 | } | ||
2823 | |||
2824 | /* We're done if no live registers remain. */ | ||
2825 | if (live == RSET_EMPTY) | ||
2826 | break; | ||
2827 | |||
2828 | /* Break cycles by renaming one target to a temporary register. */ | ||
2829 | if (live & RSET_GPR) { | ||
2830 | RegSet tmpset = as->freeset & ~live & allow & RSET_GPR; | ||
2831 | if (tmpset == RSET_EMPTY) | ||
2832 | lj_trace_err(as->J, LJ_TRERR_NYICOAL); | ||
2833 | ra_rename(as, rset_pickbot(live & RSET_GPR), rset_pickbot(tmpset)); | ||
2834 | } | ||
2835 | if (live & RSET_FPR) { | ||
2836 | RegSet tmpset = as->freeset & ~live & allow & RSET_FPR; | ||
2837 | if (tmpset == RSET_EMPTY) | ||
2838 | lj_trace_err(as->J, LJ_TRERR_NYICOAL); | ||
2839 | ra_rename(as, rset_pickbot(live & RSET_FPR), rset_pickbot(tmpset)); | ||
2840 | } | ||
2841 | checkmclim(as); | ||
2842 | /* Continue with coalescing to fix up the broken cycle(s). */ | ||
2843 | } | ||
2844 | |||
2845 | /* Check Lua stack size if frames have been added. */ | ||
2846 | if (as->topslot) | ||
2847 | asm_checkstack(as, allow & RSET_GPR); | ||
2848 | } | ||
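/* A minimal, self-contained sketch of the register-shuffling strategy used in
** asm_head_side() above, written in forward order and with plain arrays
** instead of RegSet bitmasks. All names here are illustrative and not part of
** LuaJIT's API: each pending move wants register 'src' copied into register
** 'dst'; a move is safe once no other pending move still reads its
** destination, and a remaining cycle is broken by first saving one source
** into a spare temporary register (the role ra_rename() plays above).
*/
#include <stdio.h>

typedef struct { int dst, src; } Move;  /* Hypothetical pending move. */

static void parallel_move(Move *mv, int n, int rtemp)
{
  int pending = n;
  while (pending > 0) {
    int i, j, progress = 0;
    for (i = 0; i < n; i++) {
      if (mv[i].dst < 0) continue;  /* Already emitted. */
      for (j = 0; j < n; j++)  /* Does any pending move still read mv[i].dst? */
        if (j != i && mv[j].dst >= 0 && mv[j].src == mv[i].dst) break;
      if (j == n) {  /* Safe: destination is no longer needed as a source. */
        printf("mov r%d, r%d\n", mv[i].dst, mv[i].src);
        mv[i].dst = -1; pending--; progress = 1;
      }
    }
    if (!progress) {  /* Only cycles remain: break one via the spare reg. */
      for (i = 0; mv[i].dst < 0; i++) ;
      printf("mov r%d, r%d\n", rtemp, mv[i].src);
      for (j = 0; j < n; j++)  /* Redirect pending readers of that source. */
        if (mv[j].dst >= 0 && mv[j].src == mv[i].src) mv[j].src = rtemp;
    }
  }
}

int main(void)
{
  Move mv[3] = { {1, 2}, {2, 1}, {3, 1} };  /* r1<->r2 cycle plus r1->r3. */
  parallel_move(mv, 3, 7);  /* r7 assumed free, like a register from tmpset. */
  return 0;
}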
2849 | |||
2850 | /* -- Tail of trace ------------------------------------------------------- */ | ||
2851 | |||
2852 | /* Sync Lua stack slots to match the last snapshot. | ||
2853 | ** Note: code generation is backwards, so this is best read bottom-up. | ||
2854 | */ | ||
2855 | static void asm_tail_sync(ASMState *as) | ||
2856 | { | ||
2857 | SnapShot *snap = &as->T->snap[as->T->nsnap-1]; /* Last snapshot. */ | ||
2858 | BCReg s, nslots = snap->nslots; | ||
2859 | IRRef2 *map = &as->T->snapmap[snap->mapofs]; | ||
2860 | IRRef2 *flinks = map + nslots + snap->nframelinks; | ||
2861 | BCReg newbase = 0; | ||
2862 | BCReg secondbase = ~(BCReg)0; | ||
2863 | BCReg topslot = 0; | ||
2864 | |||
2865 | checkmclim(as); | ||
2866 | ra_allocref(as, REF_BASE, RID2RSET(RID_BASE)); | ||
2867 | |||
2868 | /* Must check all frames to find topslot (outer can be larger than inner). */ | ||
2869 | for (s = 0; s < nslots; s++) { | ||
2870 | IRRef ref = snap_ref(map[s]); | ||
2871 | if (!irref_isk(ref)) { | ||
2872 | IRIns *ir = IR(ref); | ||
2873 | if (ir->o == IR_FRAME && irt_isfunc(ir->t)) { | ||
2874 | GCfunc *fn = ir_kfunc(IR(ir->op2)); | ||
2875 | if (isluafunc(fn)) { | ||
2876 | BCReg fs = s + funcproto(fn)->framesize; | ||
2877 | newbase = s; | ||
2878 | if (secondbase == ~(BCReg)0) secondbase = s; | ||
2879 | if (fs > topslot) topslot = fs; | ||
2880 | } | ||
2881 | } | ||
2882 | } | ||
2883 | } | ||
2884 | as->topslot = topslot; /* Used in asm_head_side(). */ | ||
2885 | |||
2886 | if (as->T->link == TRACE_INTERP) { | ||
2887 | /* Setup fixed registers for exit to interpreter. */ | ||
2888 | emit_loada(as, RID_DISPATCH, J2GG(as->J)->dispatch); | ||
2889 | emit_loadi(as, RID_PC, (int32_t)map[nslots]); | ||
2890 | } else if (newbase) { | ||
2891 | /* Save modified BASE for linking to trace with higher start frame. */ | ||
2892 | emit_setgl(as, RID_BASE, jit_base); | ||
2893 | } | ||
2894 | |||
2895 | emit_addptr(as, RID_BASE, 8*(int32_t)newbase); | ||
2896 | |||
2897 | /* Clear stack slots of newly added frames. */ | ||
2898 | if (nslots <= topslot) { | ||
2899 | if (nslots < topslot) { | ||
2900 | for (s = nslots; s <= topslot; s++) { | ||
2901 | emit_movtomro(as, RID_EAX, RID_BASE, 8*(int32_t)s-4); | ||
2902 | checkmclim(as); | ||
2903 | } | ||
2904 | emit_loadi(as, RID_EAX, LJ_TNIL); | ||
2905 | } else { | ||
2906 | emit_movmroi(as, RID_BASE, 8*(int32_t)nslots-4, LJ_TNIL); | ||
2907 | } | ||
2908 | } | ||
2909 | |||
2910 | /* Store the value of all modified slots to the Lua stack. */ | ||
2911 | for (s = 0; s < nslots; s++) { | ||
2912 | int32_t ofs = 8*((int32_t)s-1); | ||
2913 | IRRef ref = snap_ref(map[s]); | ||
2914 | if (ref) { | ||
2915 | IRIns *ir = IR(ref); | ||
2916 | /* No need to restore readonly slots and unmodified non-parent slots. */ | ||
2917 | if (ir->o == IR_SLOAD && ir->op1 == s && | ||
2918 | (ir->op2 & (IRSLOAD_READONLY|IRSLOAD_PARENT)) != IRSLOAD_PARENT) | ||
2919 | continue; | ||
2920 | if (irt_isnum(ir->t)) { | ||
2921 | Reg src = ra_alloc1(as, ref, RSET_FPR); | ||
2922 | emit_rmro(as, XO_MOVSDto, src, RID_BASE, ofs); | ||
2923 | } else if (ir->o == IR_FRAME) { | ||
2924 | emit_movmroi(as, RID_BASE, ofs, ptr2addr(ir_kgc(IR(ir->op2)))); | ||
2925 | if (s != 0) /* Do not overwrite link to previous frame. */ | ||
2926 | emit_movmroi(as, RID_BASE, ofs+4, (int32_t)(*--flinks)); | ||
2927 | } else { | ||
2928 | lua_assert(irt_ispri(ir->t) || irt_isaddr(ir->t)); | ||
2929 | if (!irref_isk(ref)) { | ||
2930 | Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, RID_BASE)); | ||
2931 | emit_movtomro(as, src, RID_BASE, ofs); | ||
2932 | } else if (!irt_ispri(ir->t)) { | ||
2933 | emit_movmroi(as, RID_BASE, ofs, ir->i); | ||
2934 | } | ||
2935 | emit_movmroi(as, RID_BASE, ofs+4, irt_toitype(ir->t)); | ||
2936 | } | ||
2937 | } else if (s > secondbase) { | ||
2938 | emit_movmroi(as, RID_BASE, ofs+4, LJ_TNIL); | ||
2939 | } | ||
2940 | checkmclim(as); | ||
2941 | } | ||
2942 | lua_assert(map + nslots == flinks-1); | ||
2943 | } | ||
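/* The stores above write each synced slot as 8 bytes relative to BASE: the
** value word at ofs = 8*(s-1) and the type tag at ofs+4, while numbers fill
** the whole slot via XO_MOVSDto. A standalone sketch of that addressing with
** invented names; the real TValue layout is defined in lj_obj.h.
*/
#include <stdint.h>
#include <string.h>

/* Write snapshot slot s of a hypothetical stack; BASE addresses slot 1, so
** slot s lives at ofs = 8*((int32_t)s-1), exactly as in the loop above.
*/
static void sync_slot_num(uint8_t *base, uint32_t s, double n)
{
  memcpy(base + 8*((int32_t)s-1), &n, 8);   /* Full 8-byte slot, as MOVSD. */
}

static void sync_slot_tagged(uint8_t *base, uint32_t s, int32_t v, int32_t tag)
{
  int32_t ofs = 8*((int32_t)s-1);
  memcpy(base + ofs, &v, 4);        /* Value word at ofs. */
  memcpy(base + ofs + 4, &tag, 4);  /* Type tag at ofs+4. */
}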
2944 | |||
2945 | /* Fixup the tail code. */ | ||
2946 | static void asm_tail_fixup(ASMState *as, TraceNo lnk) | ||
2947 | { | ||
2948 | /* Note: don't use as->mcp swap + emit_*: emit_op overwrites more bytes. */ | ||
2949 | MCode *p = as->mctop; | ||
2950 | MCode *target, *q; | ||
2951 | int32_t spadj = as->T->spadjust; | ||
2952 | if (spadj == 0) { | ||
2953 | p -= (as->flags & JIT_F_LEA_AGU) ? 7 : 6; | ||
2954 | } else { | ||
2955 | MCode *p1; | ||
2956 | /* Patch stack adjustment. */ | ||
2957 | if (checki8(spadj)) { | ||
2958 | p -= 3; | ||
2959 | p1 = p-6; | ||
2960 | *p1 = (MCode)spadj; | ||
2961 | } else { | ||
2962 | p1 = p-9; | ||
2963 | *(int32_t *)p1 = spadj; | ||
2964 | } | ||
2965 | if ((as->flags & JIT_F_LEA_AGU)) { | ||
2966 | p1[-3] = (MCode)XI_LEA; | ||
2967 | p1[-2] = MODRM(checki8(spadj) ? XM_OFS8 : XM_OFS32, RID_ESP, RID_ESP); | ||
2968 | p1[-1] = MODRM(XM_SCALE1, RID_ESP, RID_ESP); | ||
2969 | } else { | ||
2970 | p1[-2] = (MCode)(checki8(spadj) ? XI_ARITHi8 : XI_ARITHi); | ||
2971 | p1[-1] = MODRM(XM_REG, XOg_ADD, RID_ESP); | ||
2972 | } | ||
2973 | } | ||
2974 | /* Patch exit branch. */ | ||
2975 | target = lnk == TRACE_INTERP ? (MCode *)lj_vm_exit_interp : | ||
2976 | as->J->trace[lnk]->mcode; | ||
2977 | *(int32_t *)(p-4) = (int32_t)(target - p); | ||
2978 | p[-5] = XI_JMP; | ||
2979 | /* Drop unused mcode tail. Fill with NOPs to make the prefetcher happy. */ | ||
2980 | for (q = as->mctop-1; q >= p; q--) | ||
2981 | *q = XI_NOP; | ||
2982 | as->mctop = p; | ||
2983 | } | ||
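/* A minimal sketch of the displacement math used when patching the closing
** jump above: a 5-byte near jmp is the opcode XI_JMP (0xe9) followed by a
** rel32 that is relative to the end of the instruction, which is why the
** code stores target - p with p already pointing past the jump. The helper
** name below is hypothetical and not part of this file.
*/
#include <stdint.h>
#include <string.h>

static void patch_jmp_rel32(uint8_t *jmp, const uint8_t *target)
{
  int32_t disp = (int32_t)(target - (jmp + 5));  /* End of the 5-byte jmp. */
  jmp[0] = 0xe9;                                 /* XI_JMP */
  memcpy(jmp + 1, &disp, 4);                     /* rel32 operand. */
}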
2984 | |||
2985 | /* -- Instruction dispatch ------------------------------------------------ */ | ||
2986 | |||
2987 | /* Assemble a single instruction. */ | ||
2988 | static void asm_ir(ASMState *as, IRIns *ir) | ||
2989 | { | ||
2990 | switch ((IROp)ir->o) { | ||
2991 | /* Miscellaneous ops. */ | ||
2992 | case IR_LOOP: asm_loop(as); break; | ||
2993 | case IR_NOP: break; | ||
2994 | case IR_PHI: asm_phi(as, ir); break; | ||
2995 | |||
2996 | /* Guarded assertions. */ | ||
2997 | case IR_LT: asm_comp(as, ir, CC_GE, CC_AE, VCC_PS); break; | ||
2998 | case IR_GE: asm_comp(as, ir, CC_L, CC_B, 0); break; | ||
2999 | case IR_LE: asm_comp(as, ir, CC_G, CC_A, VCC_PS); break; | ||
3000 | case IR_GT: asm_comp(as, ir, CC_LE, CC_BE, 0); break; | ||
3001 | case IR_ULT: asm_comp(as, ir, CC_AE, CC_AE, VCC_U); break; | ||
3002 | case IR_UGE: asm_comp(as, ir, CC_B, CC_B, VCC_U|VCC_PS); break; | ||
3003 | case IR_ULE: asm_comp(as, ir, CC_A, CC_A, VCC_U); break; | ||
3004 | case IR_ABC: | ||
3005 | case IR_UGT: asm_comp(as, ir, CC_BE, CC_BE, VCC_U|VCC_PS); break; | ||
3006 | |||
3007 | case IR_FRAME: | ||
3008 | if (ir->op1 == ir->op2) break; /* No check needed for placeholder. */ | ||
3009 | /* fallthrough */ | ||
3010 | case IR_EQ: asm_comp(as, ir, CC_NE, CC_NE, VCC_P); break; | ||
3011 | case IR_NE: asm_comp(as, ir, CC_E, CC_E, VCC_U|VCC_P); break; | ||
3012 | |||
3013 | /* Bit ops. */ | ||
3014 | case IR_BNOT: asm_bitnot(as, ir); break; | ||
3015 | case IR_BSWAP: asm_bitswap(as, ir); break; | ||
3016 | |||
3017 | case IR_BAND: asm_intarith(as, ir, XOg_AND); break; | ||
3018 | case IR_BOR: asm_intarith(as, ir, XOg_OR); break; | ||
3019 | case IR_BXOR: asm_intarith(as, ir, XOg_XOR); break; | ||
3020 | |||
3021 | case IR_BSHL: asm_bitshift(as, ir, XOg_SHL); break; | ||
3022 | case IR_BSHR: asm_bitshift(as, ir, XOg_SHR); break; | ||
3023 | case IR_BSAR: asm_bitshift(as, ir, XOg_SAR); break; | ||
3024 | case IR_BROL: asm_bitshift(as, ir, XOg_ROL); break; | ||
3025 | case IR_BROR: asm_bitshift(as, ir, XOg_ROR); break; | ||
3026 | |||
3027 | /* Arithmetic ops. */ | ||
3028 | case IR_ADD: asm_add(as, ir); break; | ||
3029 | case IR_SUB: | ||
3030 | if (irt_isnum(ir->t)) | ||
3031 | asm_fparith(as, ir, XO_SUBSD); | ||
3032 | else /* Note: no need for LEA trick here. i-k is encoded as i+(-k). */ | ||
3033 | asm_intarith(as, ir, XOg_SUB); | ||
3034 | break; | ||
3035 | case IR_MUL: asm_fparith(as, ir, XO_MULSD); break; | ||
3036 | case IR_DIV: asm_fparith(as, ir, XO_DIVSD); break; | ||
3037 | |||
3038 | case IR_NEG: asm_fparith(as, ir, XO_XORPS); break; | ||
3039 | case IR_ABS: asm_fparith(as, ir, XO_ANDPS); break; | ||
3040 | |||
3041 | case IR_MIN: asm_fparith(as, ir, XO_MINSD); break; | ||
3042 | case IR_MAX: asm_fparith(as, ir, XO_MAXSD); break; | ||
3043 | |||
3044 | case IR_FPMATH: case IR_ATAN2: case IR_LDEXP: case IR_POWI: | ||
3045 | asm_fpmath(as, ir); | ||
3046 | break; | ||
3047 | |||
3048 | /* Overflow-checking arithmetic ops. Note: don't use LEA here! */ | ||
3049 | case IR_ADDOV: asm_intarith(as, ir, XOg_ADD); break; | ||
3050 | case IR_SUBOV: asm_intarith(as, ir, XOg_SUB); break; | ||
3051 | |||
3052 | /* Memory references. */ | ||
3053 | case IR_AREF: asm_aref(as, ir); break; | ||
3054 | case IR_HREF: asm_href(as, ir); break; | ||
3055 | case IR_HREFK: asm_hrefk(as, ir); break; | ||
3056 | case IR_NEWREF: asm_newref(as, ir); break; | ||
3057 | case IR_UREFO: case IR_UREFC: asm_uref(as, ir); break; | ||
3058 | case IR_FREF: asm_fref(as, ir); break; | ||
3059 | case IR_STRREF: asm_strref(as, ir); break; | ||
3060 | |||
3061 | /* Loads and stores. */ | ||
3062 | case IR_ALOAD: case IR_HLOAD: case IR_ULOAD: asm_ahuload(as, ir); break; | ||
3063 | case IR_FLOAD: asm_fload(as, ir); break; | ||
3064 | case IR_SLOAD: asm_sload(as, ir); break; | ||
3065 | case IR_XLOAD: asm_xload(as, ir); break; | ||
3066 | |||
3067 | case IR_ASTORE: case IR_HSTORE: case IR_USTORE: asm_ahustore(as, ir); break; | ||
3068 | case IR_FSTORE: asm_fstore(as, ir); break; | ||
3069 | |||
3070 | /* String ops. */ | ||
3071 | case IR_SNEW: asm_snew(as, ir); break; | ||
3072 | |||
3073 | /* Table ops. */ | ||
3074 | case IR_TNEW: asm_tnew(as, ir); break; | ||
3075 | case IR_TDUP: asm_tdup(as, ir); break; | ||
3076 | case IR_TLEN: asm_tlen(as, ir); break; | ||
3077 | case IR_TBAR: asm_tbar(as, ir); break; | ||
3078 | case IR_OBAR: asm_obar(as, ir); break; | ||
3079 | |||
3080 | /* Type conversions. */ | ||
3081 | case IR_TONUM: asm_tonum(as, ir); break; | ||
3082 | case IR_TOINT: | ||
3083 | if (irt_isguard(ir->t)) | ||
3084 | asm_tointg(as, ir, ra_alloc1(as, ir->op1, RSET_FPR)); | ||
3085 | else | ||
3086 | asm_toint(as, ir); | ||
3087 | break; | ||
3088 | case IR_TOBIT: asm_tobit(as, ir); break; | ||
3089 | case IR_TOSTR: asm_tostr(as, ir); break; | ||
3090 | case IR_STRTO: asm_strto(as, ir); break; | ||
3091 | |||
3092 | default: | ||
3093 | setintV(&as->J->errinfo, ir->o); | ||
3094 | lj_trace_err_info(as->J, LJ_TRERR_NYIIR); | ||
3095 | break; | ||
3096 | } | ||
3097 | } | ||
3098 | |||
3099 | /* Assemble a trace in linear backwards order. */ | ||
3100 | static void asm_trace(ASMState *as) | ||
3101 | { | ||
3102 | for (as->curins--; as->curins > as->stopins; as->curins--) { | ||
3103 | IRIns *ir = IR(as->curins); | ||
3104 | if (irt_isguard(ir->t)) | ||
3105 | asm_snap_prep(as); | ||
3106 | else if (!ra_used(ir) && !irm_sideeff(lj_ir_mode[ir->o]) && | ||
3107 | (as->flags & JIT_F_OPT_DCE)) | ||
3108 | continue; /* Dead-code elimination can be soooo easy. */ | ||
3109 | RA_DBG_REF(); | ||
3110 | checkmclim(as); | ||
3111 | asm_ir(as, ir); | ||
3112 | } | ||
3113 | } | ||
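/* Note on the loop above: since the trace is assembled back-to-front, every
** use of an instruction has already been processed (and has claimed a
** register or spill slot) before its definition is reached. An instruction
** that is still unreferenced at that point and has no side effects can
** simply be skipped, which is all the DCE test above needs to do.
*/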
3114 | |||
3115 | /* -- Trace setup --------------------------------------------------------- */ | ||
3116 | |||
3117 | /* Clear reg/sp for all instructions and add register hints. */ | ||
3118 | static void asm_setup_regsp(ASMState *as, Trace *T) | ||
3119 | { | ||
3120 | IRRef i, nins; | ||
3121 | int inloop; | ||
3122 | |||
3123 | /* Clear reg/sp for constants. */ | ||
3124 | for (i = T->nk; i < REF_BIAS; i++) | ||
3125 | IR(i)->prev = REGSP_INIT; | ||
3126 | |||
3127 | /* REF_BASE is used for implicit references to the BASE register. */ | ||
3128 | IR(REF_BASE)->prev = REGSP_HINT(RID_BASE); | ||
3129 | |||
3130 | nins = T->nins; | ||
3131 | if (IR(nins-1)->o == IR_RENAME) { | ||
3132 | do { nins--; } while (IR(nins-1)->o == IR_RENAME); | ||
3133 | T->nins = nins; /* Remove any renames left over from ASM restart. */ | ||
3134 | } | ||
3135 | as->snaprename = nins; | ||
3136 | as->snapref = nins; | ||
3137 | as->snapno = T->nsnap; | ||
3138 | |||
3139 | as->stopins = REF_BASE; | ||
3140 | as->orignins = nins; | ||
3141 | as->curins = nins; | ||
3142 | |||
3143 | inloop = 0; | ||
3144 | for (i = REF_FIRST; i < nins; i++) { | ||
3145 | IRIns *ir = IR(i); | ||
3146 | switch (ir->o) { | ||
3147 | case IR_LOOP: | ||
3148 | inloop = 1; | ||
3149 | break; | ||
3150 | /* Set hints for slot loads from a parent trace. */ | ||
3151 | case IR_SLOAD: | ||
3152 | if ((ir->op2 & IRSLOAD_PARENT)) { | ||
3153 | RegSP rs = as->parentmap[ir->op1]; | ||
3154 | lua_assert(regsp_used(rs)); | ||
3155 | as->stopins = i; | ||
3156 | if (!ra_hasspill(regsp_spill(rs)) && ra_hasreg(regsp_reg(rs))) { | ||
3157 | ir->prev = (uint16_t)REGSP_HINT(regsp_reg(rs)); | ||
3158 | continue; | ||
3159 | } | ||
3160 | } | ||
3161 | break; | ||
3162 | case IR_FRAME: | ||
3163 | if (i == as->stopins+1 && ir->op1 == ir->op2) | ||
3164 | as->stopins++; | ||
3165 | break; | ||
3166 | /* C calls evict all scratch regs and return results in RID_RET. */ | ||
3167 | case IR_SNEW: case IR_TNEW: case IR_TDUP: case IR_TLEN: case IR_TOSTR: | ||
3168 | case IR_NEWREF: | ||
3169 | ir->prev = REGSP_HINT(RID_RET); | ||
3170 | if (inloop) | ||
3171 | as->modset = RSET_SCRATCH; | ||
3172 | continue; | ||
3173 | case IR_STRTO: case IR_OBAR: | ||
3174 | if (inloop) | ||
3175 | as->modset = RSET_SCRATCH; | ||
3176 | break; | ||
3177 | /* Ordered string compares evict all integer scratch registers. */ | ||
3178 | case IR_LT: case IR_GE: case IR_LE: case IR_GT: | ||
3179 | if (irt_isstr(ir->t) && inloop) | ||
3180 | as->modset |= (RSET_SCRATCH & RSET_GPR); | ||
3181 | break; | ||
3182 | /* Non-constant shift counts need to be in RID_ECX. */ | ||
3183 | case IR_BSHL: case IR_BSHR: case IR_BSAR: case IR_BROL: case IR_BROR: | ||
3184 | if (!irref_isk(ir->op2) && !ra_hashint(IR(ir->op2)->r)) | ||
3185 | IR(ir->op2)->r = REGSP_HINT(RID_ECX); | ||
3186 | break; | ||
3187 | /* Do not propagate hints across type conversions. */ | ||
3188 | case IR_TONUM: case IR_TOINT: case IR_TOBIT: | ||
3189 | break; | ||
3190 | default: | ||
3191 | /* Propagate hints across likely 'op reg, imm' or 'op reg'. */ | ||
3192 | if (irref_isk(ir->op2) && !irref_isk(ir->op1)) { | ||
3193 | ir->prev = IR(ir->op1)->prev; | ||
3194 | continue; | ||
3195 | } | ||
3196 | break; | ||
3197 | } | ||
3198 | ir->prev = REGSP_INIT; | ||
3199 | } | ||
3200 | } | ||
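/* Note on the default case above: most two-operand x86 instructions
** overwrite their left operand, so for a likely 'op reg, imm' or 'op reg'
** form the result is given the same register hint as op1. If the allocator
** honors that hint, the operation can work in place and no extra move is
** needed.
*/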
3201 | |||
3202 | /* -- Assembler core ------------------------------------------------------ */ | ||
3203 | |||
3204 | /* Define this if you want to run LuaJIT with Valgrind. */ | ||
3205 | #ifdef LUAJIT_USE_VALGRIND | ||
3206 | #include <valgrind/valgrind.h> | ||
3207 | #define VG_INVALIDATE(p, sz) VALGRIND_DISCARD_TRANSLATIONS(p, sz) | ||
3208 | #else | ||
3209 | #define VG_INVALIDATE(p, sz) ((void)0) | ||
3210 | #endif | ||
3211 | |||
3212 | /* Assemble a trace. */ | ||
3213 | void lj_asm_trace(jit_State *J, Trace *T) | ||
3214 | { | ||
3215 | ASMState as_; | ||
3216 | ASMState *as = &as_; | ||
3217 | |||
3218 | /* Setup initial state. Copy some fields to reduce indirections. */ | ||
3219 | as->J = J; | ||
3220 | as->T = T; | ||
3221 | as->ir = T->ir; | ||
3222 | as->flags = J->flags; | ||
3223 | as->loopref = J->loopref; | ||
3224 | as->realign = NULL; | ||
3225 | as->loopinv = 0; | ||
3226 | if (J->parent) { | ||
3227 | as->parent = J->trace[J->parent]; | ||
3228 | lj_snap_regspmap(as->parentmap, as->parent, J->exitno); | ||
3229 | } else { | ||
3230 | as->parent = NULL; | ||
3231 | } | ||
3232 | as->mctop = lj_mcode_reserve(J, &as->mcbot); /* Reserve MCode memory. */ | ||
3233 | as->mcp = as->mctop; | ||
3234 | as->mclim = as->mcbot + MCLIM_REDZONE; | ||
3235 | asm_exitstub_setup(as, T->nsnap); | ||
3236 | |||
3237 | do { | ||
3238 | as->mcp = as->mctop; | ||
3239 | as->curins = T->nins; | ||
3240 | RA_DBG_START(); | ||
3241 | RA_DBGX((as, "===== STOP =====")); | ||
3242 | /* Realign and leave room for backwards loop branch or exit branch. */ | ||
3243 | if (as->realign) { | ||
3244 | int i = ((int)(intptr_t)as->realign) & 15; | ||
3245 | MCode *p = as->mctop; | ||
3246 | /* Fill unused mcode tail with NOPs to make the prefetcher happy. */ | ||
3247 | while (i-- > 0) | ||
3248 | *--p = XI_NOP; | ||
3249 | as->mctop = p; | ||
3250 | as->mcp = p - (as->loopinv ? 5 : 2); /* Space for short/near jmp. */ | ||
3251 | } else { | ||
3252 | as->mcp = as->mctop - 5; /* Space for exit branch (near jmp). */ | ||
3253 | } | ||
3254 | as->invmcp = as->mcp; | ||
3255 | as->mcloop = NULL; | ||
3256 | as->testmcp = NULL; | ||
3257 | as->topslot = 0; | ||
3258 | as->gcsteps = 0; | ||
3259 | as->sectref = as->loopref; | ||
3260 | as->fuseref = (as->flags & JIT_F_OPT_FUSE) ? as->loopref : FUSE_DISABLED; | ||
3261 | |||
3262 | /* Setup register allocation. */ | ||
3263 | ra_setup(as); | ||
3264 | asm_setup_regsp(as, T); | ||
3265 | |||
3266 | if (!as->loopref) { | ||
3267 | /* Leave room for ESP adjustment: add esp, imm or lea esp, [esp+imm] */ | ||
3268 | as->mcp -= (as->flags & JIT_F_LEA_AGU) ? 7 : 6; | ||
3269 | as->invmcp = NULL; | ||
3270 | asm_tail_sync(as); | ||
3271 | } | ||
3272 | asm_trace(as); | ||
3273 | } while (as->realign); /* Retry in case the MCode needs to be realigned. */ | ||
3274 | |||
3275 | RA_DBG_REF(); | ||
3276 | checkmclim(as); | ||
3277 | if (as->gcsteps) | ||
3278 | asm_gc_check(as, &as->T->snap[0]); | ||
3279 | if (!J->parent) | ||
3280 | asm_head_base(as); | ||
3281 | asm_const_remat(as); | ||
3282 | if (J->parent) | ||
3283 | asm_head_side(as); | ||
3284 | else | ||
3285 | asm_head_root(as); | ||
3286 | asm_phi_fixup(as); | ||
3287 | |||
3288 | RA_DBGX((as, "===== START ====")); | ||
3289 | RA_DBG_FLUSH(); | ||
3290 | if (as->freeset != RSET_ALL) | ||
3291 | lj_trace_err(as->J, LJ_TRERR_BADRA); /* Ouch! Should never happen. */ | ||
3292 | |||
3293 | /* Set trace entry point before fixing up tail to allow link to self. */ | ||
3294 | T->mcode = as->mcp; | ||
3295 | T->mcloop = as->mcloop ? (MSize)(as->mcloop - as->mcp) : 0; | ||
3296 | if (!as->loopref) | ||
3297 | asm_tail_fixup(as, T->link); /* Note: this may change as->mctop! */ | ||
3298 | T->szmcode = (MSize)(as->mctop - as->mcp); | ||
3299 | VG_INVALIDATE(T->mcode, T->szmcode); | ||
3300 | } | ||
3301 | |||
3302 | /* Patch exit jumps of existing machine code to a new target. */ | ||
3303 | void lj_asm_patchexit(jit_State *J, Trace *T, ExitNo exitno, MCode *target) | ||
3304 | { | ||
3305 | MCode *p = T->mcode; | ||
3306 | MCode *mcarea = lj_mcode_patch(J, p, 0); | ||
3307 | MSize len = T->szmcode; | ||
3308 | MCode *px = exitstub_addr(J, exitno) - 6; | ||
3309 | MCode *pe = p+len-6; | ||
3310 | if (len > 5 && p[len-5] == XI_JMP && p+len-6 + *(int32_t *)(p+len-4) == px) | ||
3311 | *(int32_t *)(p+len-4) = (int32_t)(target - (p+len)); | ||
3312 | for (; p < pe; p++) { | ||
3313 | if ((*(uint16_t *)p & 0xf0ff) == 0x800f && p + *(int32_t *)(p+2) == px) { | ||
3314 | *(int32_t *)(p+2) = (int32_t)(target - (p+6)); | ||
3315 | p += 5; | ||
3316 | } | ||
3317 | } | ||
3318 | lj_mcode_patch(J, mcarea, 1); | ||
3319 | VG_INVALIDATE(T->mcode, T->szmcode); | ||
3320 | } | ||
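/* The scan above recognizes a near conditional jump to the exit stub: on x86
** these are encoded as the two opcode bytes 0f 80..8f followed by a rel32.
** Read as a little-endian 16-bit value, masking with 0xf0ff and comparing
** against 0x800f accepts any condition code. A byte-wise sketch of the same
** test (hypothetical helper, not part of this file):
*/
#include <stdint.h>

static int is_jcc_rel32(const uint8_t *p)
{
  return p[0] == 0x0f && (p[1] & 0xf0) == 0x80;
}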
3321 | |||
3322 | #undef IR | ||
3323 | |||
3324 | #endif | ||