path: root/src/lj_asm_x86.h
Diffstat (limited to 'src/lj_asm_x86.h')
-rw-r--r--  src/lj_asm_x86.h  1354
1 file changed, 793 insertions(+), 561 deletions(-)
diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h
index 8b529086..1ef7c38f 100644
--- a/src/lj_asm_x86.h
+++ b/src/lj_asm_x86.h
@@ -21,15 +21,17 @@ static MCode *asm_exitstub_gen(ASMState *as, ExitNo group)
21 } 21 }
22 /* Push the high byte of the exitno for each exit stub group. */ 22 /* Push the high byte of the exitno for each exit stub group. */
23 *mxp++ = XI_PUSHi8; *mxp++ = (MCode)((group*EXITSTUBS_PER_GROUP)>>8); 23 *mxp++ = XI_PUSHi8; *mxp++ = (MCode)((group*EXITSTUBS_PER_GROUP)>>8);
24#if !LJ_GC64
24 /* Store DISPATCH at original stack slot 0. Account for the two push ops. */ 25 /* Store DISPATCH at original stack slot 0. Account for the two push ops. */
25 *mxp++ = XI_MOVmi; 26 *mxp++ = XI_MOVmi;
26 *mxp++ = MODRM(XM_OFS8, 0, RID_ESP); 27 *mxp++ = MODRM(XM_OFS8, 0, RID_ESP);
27 *mxp++ = MODRM(XM_SCALE1, RID_ESP, RID_ESP); 28 *mxp++ = MODRM(XM_SCALE1, RID_ESP, RID_ESP);
28 *mxp++ = 2*sizeof(void *); 29 *mxp++ = 2*sizeof(void *);
29 *(int32_t *)mxp = ptr2addr(J2GG(as->J)->dispatch); mxp += 4; 30 *(int32_t *)mxp = ptr2addr(J2GG(as->J)->dispatch); mxp += 4;
31#endif
30 /* Jump to exit handler which fills in the ExitState. */ 32 /* Jump to exit handler which fills in the ExitState. */
31 *mxp++ = XI_JMP; mxp += 4; 33 *mxp++ = XI_JMP; mxp += 4;
32 *((int32_t *)(mxp-4)) = jmprel(mxp, (MCode *)(void *)lj_vm_exit_handler); 34 *((int32_t *)(mxp-4)) = jmprel(as->J, mxp, (MCode *)(void *)lj_vm_exit_handler);
33 /* Commit the code for this group (even if assembly fails later on). */ 35 /* Commit the code for this group (even if assembly fails later on). */
34 lj_mcode_commitbot(as->J, mxp); 36 lj_mcode_commitbot(as->J, mxp);
35 as->mcbot = mxp; 37 as->mcbot = mxp;
@@ -58,14 +60,18 @@ static void asm_guardcc(ASMState *as, int cc)
58 MCode *p = as->mcp; 60 MCode *p = as->mcp;
59 if (LJ_UNLIKELY(p == as->invmcp)) { 61 if (LJ_UNLIKELY(p == as->invmcp)) {
60 as->loopinv = 1; 62 as->loopinv = 1;
61 *(int32_t *)(p+1) = jmprel(p+5, target); 63 *(int32_t *)(p+1) = jmprel(as->J, p+5, target);
62 target = p; 64 target = p;
63 cc ^= 1; 65 cc ^= 1;
64 if (as->realign) { 66 if (as->realign) {
67 if (LJ_GC64 && LJ_UNLIKELY(as->mrm.base == RID_RIP))
68 as->mrm.ofs += 2; /* Fixup RIP offset for pending fused load. */
65 emit_sjcc(as, cc, target); 69 emit_sjcc(as, cc, target);
66 return; 70 return;
67 } 71 }
68 } 72 }
73 if (LJ_GC64 && LJ_UNLIKELY(as->mrm.base == RID_RIP))
74 as->mrm.ofs += 6; /* Fixup RIP offset for pending fused load. */
69 emit_jcc(as, cc, target); 75 emit_jcc(as, cc, target);
70} 76}
71 77
@@ -79,6 +85,15 @@ static int asm_isk32(ASMState *as, IRRef ref, int32_t *k)
79{ 85{
80 if (irref_isk(ref)) { 86 if (irref_isk(ref)) {
81 IRIns *ir = IR(ref); 87 IRIns *ir = IR(ref);
88#if LJ_GC64
89 if (ir->o == IR_KNULL || !irt_is64(ir->t)) {
90 *k = ir->i;
91 return 1;
92 } else if (checki32((int64_t)ir_k64(ir)->u64)) {
93 *k = (int32_t)ir_k64(ir)->u64;
94 return 1;
95 }
96#else
82 if (ir->o != IR_KINT64) { 97 if (ir->o != IR_KINT64) {
83 *k = ir->i; 98 *k = ir->i;
84 return 1; 99 return 1;
@@ -86,6 +101,7 @@ static int asm_isk32(ASMState *as, IRRef ref, int32_t *k)
86 *k = (int32_t)ir_kint64(ir)->u64; 101 *k = (int32_t)ir_kint64(ir)->u64;
87 return 1; 102 return 1;
88 } 103 }
104#endif
89 } 105 }
90 return 0; 106 return 0;
91} 107}
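
A minimal sketch of the 32 bit range check the new GC64 branch relies on (mirroring checki32(); the helper name is invented for illustration and is not part of the patch): a 64 bit constant can be emitted as a sign-extended 32 bit immediate exactly when narrowing and re-widening it is lossless.

#include <stdint.h>

/* Illustrative helper: the value fits a sign-extended 32 bit immediate iff
** the narrow-and-widen round trip preserves it. */
static int fits_sext_imm32(int64_t v)
{
  return v == (int64_t)(int32_t)v;
}
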
@@ -115,7 +131,7 @@ static IRRef asm_fuseabase(ASMState *as, IRRef ref)
115 as->mrm.ofs = 0; 131 as->mrm.ofs = 0;
116 if (irb->o == IR_FLOAD) { 132 if (irb->o == IR_FLOAD) {
117 IRIns *ira = IR(irb->op1); 133 IRIns *ira = IR(irb->op1);
118 lua_assert(irb->op2 == IRFL_TAB_ARRAY); 134 lj_assertA(irb->op2 == IRFL_TAB_ARRAY, "expected FLOAD TAB_ARRAY");
119 /* We can avoid the FLOAD of t->array for colocated arrays. */ 135 /* We can avoid the FLOAD of t->array for colocated arrays. */
120 if (ira->o == IR_TNEW && ira->op1 <= LJ_MAX_COLOSIZE && 136 if (ira->o == IR_TNEW && ira->op1 <= LJ_MAX_COLOSIZE &&
121 !neverfuse(as) && noconflict(as, irb->op1, IR_NEWREF, 1)) { 137 !neverfuse(as) && noconflict(as, irb->op1, IR_NEWREF, 1)) {
@@ -134,7 +150,7 @@ static IRRef asm_fuseabase(ASMState *as, IRRef ref)
134static void asm_fusearef(ASMState *as, IRIns *ir, RegSet allow) 150static void asm_fusearef(ASMState *as, IRIns *ir, RegSet allow)
135{ 151{
136 IRIns *irx; 152 IRIns *irx;
137 lua_assert(ir->o == IR_AREF); 153 lj_assertA(ir->o == IR_AREF, "expected AREF");
138 as->mrm.base = (uint8_t)ra_alloc1(as, asm_fuseabase(as, ir->op1), allow); 154 as->mrm.base = (uint8_t)ra_alloc1(as, asm_fuseabase(as, ir->op1), allow);
139 irx = IR(ir->op2); 155 irx = IR(ir->op2);
140 if (irref_isk(ir->op2)) { 156 if (irref_isk(ir->op2)) {
@@ -185,14 +201,32 @@ static void asm_fuseahuref(ASMState *as, IRRef ref, RegSet allow)
185 if (irref_isk(ir->op1)) { 201 if (irref_isk(ir->op1)) {
186 GCfunc *fn = ir_kfunc(IR(ir->op1)); 202 GCfunc *fn = ir_kfunc(IR(ir->op1));
187 GCupval *uv = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv; 203 GCupval *uv = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv;
204#if LJ_GC64
205 int64_t ofs = dispofs(as, &uv->tv);
206 if (checki32(ofs) && checki32(ofs+4)) {
207 as->mrm.ofs = (int32_t)ofs;
208 as->mrm.base = RID_DISPATCH;
209 as->mrm.idx = RID_NONE;
210 return;
211 }
212#else
188 as->mrm.ofs = ptr2addr(&uv->tv); 213 as->mrm.ofs = ptr2addr(&uv->tv);
189 as->mrm.base = as->mrm.idx = RID_NONE; 214 as->mrm.base = as->mrm.idx = RID_NONE;
190 return; 215 return;
216#endif
191 } 217 }
192 break; 218 break;
219 case IR_TMPREF:
220#if LJ_GC64
221 as->mrm.ofs = (int32_t)dispofs(as, &J2G(as->J)->tmptv);
222 as->mrm.base = RID_DISPATCH;
223 as->mrm.idx = RID_NONE;
224#else
225 as->mrm.ofs = igcptr(&J2G(as->J)->tmptv);
226 as->mrm.base = as->mrm.idx = RID_NONE;
227#endif
228 return;
193 default: 229 default:
194 lua_assert(ir->o == IR_HREF || ir->o == IR_NEWREF || ir->o == IR_UREFO ||
195 ir->o == IR_KKPTR);
196 break; 230 break;
197 } 231 }
198 } 232 }
@@ -204,26 +238,53 @@ static void asm_fuseahuref(ASMState *as, IRRef ref, RegSet allow)
204/* Fuse FLOAD/FREF reference into memory operand. */ 238/* Fuse FLOAD/FREF reference into memory operand. */
205static void asm_fusefref(ASMState *as, IRIns *ir, RegSet allow) 239static void asm_fusefref(ASMState *as, IRIns *ir, RegSet allow)
206{ 240{
207 lua_assert(ir->o == IR_FLOAD || ir->o == IR_FREF); 241 lj_assertA(ir->o == IR_FLOAD || ir->o == IR_FREF,
208 as->mrm.ofs = field_ofs[ir->op2]; 242 "bad IR op %d", ir->o);
209 as->mrm.idx = RID_NONE; 243 as->mrm.idx = RID_NONE;
244 if (ir->op1 == REF_NIL) { /* FLOAD from GG_State with offset. */
245#if LJ_GC64
246 as->mrm.ofs = (int32_t)(ir->op2 << 2) - GG_OFS(dispatch);
247 as->mrm.base = RID_DISPATCH;
248#else
249 as->mrm.ofs = (int32_t)(ir->op2 << 2) + ptr2addr(J2GG(as->J));
250 as->mrm.base = RID_NONE;
251#endif
252 return;
253 }
254 as->mrm.ofs = field_ofs[ir->op2];
210 if (irref_isk(ir->op1)) { 255 if (irref_isk(ir->op1)) {
211 as->mrm.ofs += IR(ir->op1)->i; 256 IRIns *op1 = IR(ir->op1);
257#if LJ_GC64
258 if (ir->op1 == REF_NIL) {
259 as->mrm.ofs -= GG_OFS(dispatch);
260 as->mrm.base = RID_DISPATCH;
261 return;
262 } else if (op1->o == IR_KPTR || op1->o == IR_KKPTR) {
263 intptr_t ofs = dispofs(as, ir_kptr(op1));
264 if (checki32(as->mrm.ofs + ofs)) {
265 as->mrm.ofs += (int32_t)ofs;
266 as->mrm.base = RID_DISPATCH;
267 return;
268 }
269 }
270#else
271 as->mrm.ofs += op1->i;
212 as->mrm.base = RID_NONE; 272 as->mrm.base = RID_NONE;
213 } else { 273 return;
214 as->mrm.base = (uint8_t)ra_alloc1(as, ir->op1, allow); 274#endif
215 } 275 }
276 as->mrm.base = (uint8_t)ra_alloc1(as, ir->op1, allow);
216} 277}
217 278
218/* Fuse string reference into memory operand. */ 279/* Fuse string reference into memory operand. */
219static void asm_fusestrref(ASMState *as, IRIns *ir, RegSet allow) 280static void asm_fusestrref(ASMState *as, IRIns *ir, RegSet allow)
220{ 281{
221 IRIns *irr; 282 IRIns *irr;
222 lua_assert(ir->o == IR_STRREF); 283 lj_assertA(ir->o == IR_STRREF, "bad IR op %d", ir->o);
223 as->mrm.base = as->mrm.idx = RID_NONE; 284 as->mrm.base = as->mrm.idx = RID_NONE;
224 as->mrm.scale = XM_SCALE1; 285 as->mrm.scale = XM_SCALE1;
225 as->mrm.ofs = sizeof(GCstr); 286 as->mrm.ofs = sizeof(GCstr);
226 if (irref_isk(ir->op1)) { 287 if (!LJ_GC64 && irref_isk(ir->op1)) {
227 as->mrm.ofs += IR(ir->op1)->i; 288 as->mrm.ofs += IR(ir->op1)->i;
228 } else { 289 } else {
229 Reg r = ra_alloc1(as, ir->op1, allow); 290 Reg r = ra_alloc1(as, ir->op1, allow);
@@ -255,10 +316,20 @@ static void asm_fusexref(ASMState *as, IRRef ref, RegSet allow)
255 IRIns *ir = IR(ref); 316 IRIns *ir = IR(ref);
256 as->mrm.idx = RID_NONE; 317 as->mrm.idx = RID_NONE;
257 if (ir->o == IR_KPTR || ir->o == IR_KKPTR) { 318 if (ir->o == IR_KPTR || ir->o == IR_KKPTR) {
319#if LJ_GC64
320 intptr_t ofs = dispofs(as, ir_kptr(ir));
321 if (checki32(ofs)) {
322 as->mrm.ofs = (int32_t)ofs;
323 as->mrm.base = RID_DISPATCH;
324 return;
325 }
326 } if (0) {
327#else
258 as->mrm.ofs = ir->i; 328 as->mrm.ofs = ir->i;
259 as->mrm.base = RID_NONE; 329 as->mrm.base = RID_NONE;
260 } else if (ir->o == IR_STRREF) { 330 } else if (ir->o == IR_STRREF) {
261 asm_fusestrref(as, ir, allow); 331 asm_fusestrref(as, ir, allow);
332#endif
262 } else { 333 } else {
263 as->mrm.ofs = 0; 334 as->mrm.ofs = 0;
264 if (canfuse(as, ir) && ir->o == IR_ADD && ra_noreg(ir->r)) { 335 if (canfuse(as, ir) && ir->o == IR_ADD && ra_noreg(ir->r)) {
@@ -301,7 +372,47 @@ static void asm_fusexref(ASMState *as, IRRef ref, RegSet allow)
301 } 372 }
302} 373}
303 374
304/* Fuse load into memory operand. */ 375/* Fuse load of 64 bit IR constant into memory operand. */
376static Reg asm_fuseloadk64(ASMState *as, IRIns *ir)
377{
378 const uint64_t *k = &ir_k64(ir)->u64;
379 if (!LJ_GC64 || checki32((intptr_t)k)) {
380 as->mrm.ofs = ptr2addr(k);
381 as->mrm.base = RID_NONE;
382#if LJ_GC64
383 } else if (checki32(dispofs(as, k))) {
384 as->mrm.ofs = (int32_t)dispofs(as, k);
385 as->mrm.base = RID_DISPATCH;
386 } else if (checki32(mcpofs(as, k)) && checki32(mcpofs(as, k+1)) &&
387 checki32(mctopofs(as, k)) && checki32(mctopofs(as, k+1))) {
388 as->mrm.ofs = (int32_t)mcpofs(as, k);
389 as->mrm.base = RID_RIP;
390 } else { /* Intern 64 bit constant at bottom of mcode. */
391 if (ir->i) {
392 lj_assertA(*k == *(uint64_t*)(as->mctop - ir->i),
393 "bad interned 64 bit constant");
394 } else {
395 while ((uintptr_t)as->mcbot & 7) *as->mcbot++ = XI_INT3;
396 *(uint64_t*)as->mcbot = *k;
397 ir->i = (int32_t)(as->mctop - as->mcbot);
398 as->mcbot += 8;
399 as->mclim = as->mcbot + MCLIM_REDZONE;
400 lj_mcode_commitbot(as->J, as->mcbot);
401 }
402 as->mrm.ofs = (int32_t)mcpofs(as, as->mctop - ir->i);
403 as->mrm.base = RID_RIP;
404#endif
405 }
406 as->mrm.idx = RID_NONE;
407 return RID_MRM;
408}
409
410/* Fuse load into memory operand.
411**
412** Important caveat: this may emit RIP-relative loads! So don't place any
413** code emitters between this function and the use of its result.
414** The only permitted exception is asm_guardcc().
415*/
305static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow) 416static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
306{ 417{
307 IRIns *ir = IR(ref); 418 IRIns *ir = IR(ref);
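
A hedged sketch of the RIP-relative addressing constraint behind asm_fuseloadk64() above (helper invented for illustration, not part of the patch): x86-64 encodes a RIP-relative operand as a signed 32 bit displacement from the end of the current instruction, which is why the constant must lie within roughly +/-2 GB of the machine code, or else be interned at the bottom of the mcode area.

#include <assert.h>
#include <stdint.h>

/* Illustrative helper: displacement of a constant relative to the address
** following the current instruction; it must fit a signed 32 bit field. */
static int32_t rip_rel_disp(const void *konst, const void *next_insn)
{
  intptr_t d = (intptr_t)konst - (intptr_t)next_insn;
  assert(d == (int32_t)d);  /* range was already validated with checki32() */
  return (int32_t)d;
}
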
@@ -319,27 +430,36 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
319 } 430 }
320 if (ir->o == IR_KNUM) { 431 if (ir->o == IR_KNUM) {
321 RegSet avail = as->freeset & ~as->modset & RSET_FPR; 432 RegSet avail = as->freeset & ~as->modset & RSET_FPR;
322 lua_assert(allow != RSET_EMPTY); 433 lj_assertA(allow != RSET_EMPTY, "no register allowed");
323 if (!(avail & (avail-1))) { /* Fuse if less than two regs available. */ 434 if (!(avail & (avail-1))) /* Fuse if less than two regs available. */
324 as->mrm.ofs = ptr2addr(ir_knum(ir)); 435 return asm_fuseloadk64(as, ir);
325 as->mrm.base = as->mrm.idx = RID_NONE;
326 return RID_MRM;
327 }
328 } else if (ref == REF_BASE || ir->o == IR_KINT64) { 436 } else if (ref == REF_BASE || ir->o == IR_KINT64) {
329 RegSet avail = as->freeset & ~as->modset & RSET_GPR; 437 RegSet avail = as->freeset & ~as->modset & RSET_GPR;
330 lua_assert(allow != RSET_EMPTY); 438 lj_assertA(allow != RSET_EMPTY, "no register allowed");
331 if (!(avail & (avail-1))) { /* Fuse if less than two regs available. */ 439 if (!(avail & (avail-1))) { /* Fuse if less than two regs available. */
332 as->mrm.ofs = ptr2addr(ref == REF_BASE ? (void *)&J2G(as->J)->jit_base : (void *)ir_kint64(ir)); 440 if (ref == REF_BASE) {
333 as->mrm.base = as->mrm.idx = RID_NONE; 441#if LJ_GC64
334 return RID_MRM; 442 as->mrm.ofs = (int32_t)dispofs(as, &J2G(as->J)->jit_base);
443 as->mrm.base = RID_DISPATCH;
444#else
445 as->mrm.ofs = ptr2addr(&J2G(as->J)->jit_base);
446 as->mrm.base = RID_NONE;
447#endif
448 as->mrm.idx = RID_NONE;
449 return RID_MRM;
450 } else {
451 return asm_fuseloadk64(as, ir);
452 }
335 } 453 }
336 } else if (mayfuse(as, ref)) { 454 } else if (mayfuse(as, ref)) {
337 RegSet xallow = (allow & RSET_GPR) ? allow : RSET_GPR; 455 RegSet xallow = (allow & RSET_GPR) ? allow : RSET_GPR;
338 if (ir->o == IR_SLOAD) { 456 if (ir->o == IR_SLOAD) {
339 if (!(ir->op2 & (IRSLOAD_PARENT|IRSLOAD_CONVERT)) && 457 if (!(ir->op2 & (IRSLOAD_PARENT|IRSLOAD_CONVERT)) &&
340 noconflict(as, ref, IR_RETF, 0)) { 458 noconflict(as, ref, IR_RETF, 0) &&
459 !(LJ_GC64 && irt_isaddr(ir->t))) {
341 as->mrm.base = (uint8_t)ra_alloc1(as, REF_BASE, xallow); 460 as->mrm.base = (uint8_t)ra_alloc1(as, REF_BASE, xallow);
342 as->mrm.ofs = 8*((int32_t)ir->op1-1) + ((ir->op2&IRSLOAD_FRAME)?4:0); 461 as->mrm.ofs = 8*((int32_t)ir->op1-1-LJ_FR2) +
462 (!LJ_FR2 && (ir->op2 & IRSLOAD_FRAME) ? 4 : 0);
343 as->mrm.idx = RID_NONE; 463 as->mrm.idx = RID_NONE;
344 return RID_MRM; 464 return RID_MRM;
345 } 465 }
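
My reading of the revised SLOAD offset in the hunk above, as a sketch (helper name invented; the extra +4 for IRSLOAD_FRAME on non-FR2 builds is left out): with LJ_FR2 the frame link occupies a slot of its own, so Lua stack slot n sits at byte offset 8*(n-1-LJ_FR2) from BASE.

#include <stdint.h>

/* Illustrative only: byte offset of Lua stack slot 'slot' relative to the
** BASE register, as used for fused stack loads (LJ_FR2 is 0 or 1). */
static int32_t sload_slot_ofs(int32_t slot, int lj_fr2)
{
  return 8 * (slot - 1 - lj_fr2);
}
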
@@ -351,7 +471,8 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
351 return RID_MRM; 471 return RID_MRM;
352 } 472 }
353 } else if (ir->o == IR_ALOAD || ir->o == IR_HLOAD || ir->o == IR_ULOAD) { 473 } else if (ir->o == IR_ALOAD || ir->o == IR_HLOAD || ir->o == IR_ULOAD) {
354 if (noconflict(as, ref, ir->o + IRDELTA_L2S, 0)) { 474 if (noconflict(as, ref, ir->o + IRDELTA_L2S, 0) &&
475 !(LJ_GC64 && irt_isaddr(ir->t))) {
355 asm_fuseahuref(as, ir->op1, xallow); 476 asm_fuseahuref(as, ir->op1, xallow);
356 return RID_MRM; 477 return RID_MRM;
357 } 478 }
@@ -364,11 +485,16 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
364 asm_fusexref(as, ir->op1, xallow); 485 asm_fusexref(as, ir->op1, xallow);
365 return RID_MRM; 486 return RID_MRM;
366 } 487 }
367 } else if (ir->o == IR_VLOAD) { 488 } else if (ir->o == IR_VLOAD && !(LJ_GC64 && irt_isaddr(ir->t))) {
368 asm_fuseahuref(as, ir->op1, xallow); 489 asm_fuseahuref(as, ir->op1, xallow);
490 as->mrm.ofs += 8 * ir->op2;
369 return RID_MRM; 491 return RID_MRM;
370 } 492 }
371 } 493 }
494 if (ir->o == IR_FLOAD && ir->op1 == REF_NIL) {
495 asm_fusefref(as, ir, RSET_EMPTY);
496 return RID_MRM;
497 }
372 if (!(as->freeset & allow) && !emit_canremat(ref) && 498 if (!(as->freeset & allow) && !emit_canremat(ref) &&
373 (allow == RSET_EMPTY || ra_hasspill(ir->s) || iscrossref(as, ref))) 499 (allow == RSET_EMPTY || ra_hasspill(ir->s) || iscrossref(as, ref)))
374 goto fusespill; 500 goto fusespill;
@@ -392,7 +518,7 @@ static Reg asm_fuseloadm(ASMState *as, IRRef ref, RegSet allow, int is64)
392/* Count the required number of stack slots for a call. */ 518/* Count the required number of stack slots for a call. */
393static int asm_count_call_slots(ASMState *as, const CCallInfo *ci, IRRef *args) 519static int asm_count_call_slots(ASMState *as, const CCallInfo *ci, IRRef *args)
394{ 520{
395 uint32_t i, nargs = CCI_NARGS(ci); 521 uint32_t i, nargs = CCI_XNARGS(ci);
396 int nslots = 0; 522 int nslots = 0;
397#if LJ_64 523#if LJ_64
398 if (LJ_ABI_WIN) { 524 if (LJ_ABI_WIN) {
@@ -425,7 +551,7 @@ static int asm_count_call_slots(ASMState *as, const CCallInfo *ci, IRRef *args)
425/* Generate a call to a C function. */ 551/* Generate a call to a C function. */
426static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args) 552static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
427{ 553{
428 uint32_t n, nargs = CCI_NARGS(ci); 554 uint32_t n, nargs = CCI_XNARGS(ci);
429 int32_t ofs = STACKARG_OFS; 555 int32_t ofs = STACKARG_OFS;
430#if LJ_64 556#if LJ_64
431 uint32_t gprs = REGARG_GPRS; 557 uint32_t gprs = REGARG_GPRS;
@@ -485,13 +611,14 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
485 if (r) { /* Argument is in a register. */ 611 if (r) { /* Argument is in a register. */
486 if (r < RID_MAX_GPR && ref < ASMREF_TMP1) { 612 if (r < RID_MAX_GPR && ref < ASMREF_TMP1) {
487#if LJ_64 613#if LJ_64
488 if (ir->o == IR_KINT64) 614 if (LJ_GC64 ? !(ir->o == IR_KINT || ir->o == IR_KNULL) : ir->o == IR_KINT64)
489 emit_loadu64(as, r, ir_kint64(ir)->u64); 615 emit_loadu64(as, r, ir_k64(ir)->u64);
490 else 616 else
491#endif 617#endif
492 emit_loadi(as, r, ir->i); 618 emit_loadi(as, r, ir->i);
493 } else { 619 } else {
494 lua_assert(rset_test(as->freeset, r)); /* Must have been evicted. */ 620 /* Must have been evicted. */
621 lj_assertA(rset_test(as->freeset, r), "reg %d not free", r);
495 if (ra_hasreg(ir->r)) { 622 if (ra_hasreg(ir->r)) {
496 ra_noweak(as, ir->r); 623 ra_noweak(as, ir->r);
497 emit_movrr(as, ir, r, ir->r); 624 emit_movrr(as, ir, r, ir->r);
@@ -500,7 +627,8 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
500 } 627 }
501 } 628 }
502 } else if (irt_isfp(ir->t)) { /* FP argument is on stack. */ 629 } else if (irt_isfp(ir->t)) { /* FP argument is on stack. */
503 lua_assert(!(irt_isfloat(ir->t) && irref_isk(ref))); /* No float k. */ 630 lj_assertA(!(irt_isfloat(ir->t) && irref_isk(ref)),
631 "unexpected float constant");
504 if (LJ_32 && (ofs & 4) && irref_isk(ref)) { 632 if (LJ_32 && (ofs & 4) && irref_isk(ref)) {
505 /* Split stores for unaligned FP consts. */ 633 /* Split stores for unaligned FP consts. */
506 emit_movmroi(as, RID_ESP, ofs, (int32_t)ir_knum(ir)->u32.lo); 634 emit_movmroi(as, RID_ESP, ofs, (int32_t)ir_knum(ir)->u32.lo);
@@ -531,7 +659,7 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
531static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci) 659static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci)
532{ 660{
533 RegSet drop = RSET_SCRATCH; 661 RegSet drop = RSET_SCRATCH;
534 int hiop = (LJ_32 && (ir+1)->o == IR_HIOP && !irt_isnil((ir+1)->t)); 662 int hiop = ((ir+1)->o == IR_HIOP && !irt_isnil((ir+1)->t));
535 if ((ci->flags & CCI_NOFPRCLOBBER)) 663 if ((ci->flags & CCI_NOFPRCLOBBER))
536 drop &= ~RSET_FPR; 664 drop &= ~RSET_FPR;
537 if (ra_hasreg(ir->r)) 665 if (ra_hasreg(ir->r))
@@ -560,7 +688,7 @@ static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci)
560 if (ra_hasreg(dest)) { 688 if (ra_hasreg(dest)) {
561 ra_free(as, dest); 689 ra_free(as, dest);
562 ra_modified(as, dest); 690 ra_modified(as, dest);
563 emit_rmro(as, irt_isnum(ir->t) ? XMM_MOVRM(as) : XO_MOVSS, 691 emit_rmro(as, irt_isnum(ir->t) ? XO_MOVSD : XO_MOVSS,
564 dest, RID_ESP, ofs); 692 dest, RID_ESP, ofs);
565 } 693 }
566 if ((ci->flags & CCI_CASTU64)) { 694 if ((ci->flags & CCI_CASTU64)) {
@@ -571,12 +699,10 @@ static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci)
571 irt_isnum(ir->t) ? XOg_FSTPq : XOg_FSTPd, RID_ESP, ofs); 699 irt_isnum(ir->t) ? XOg_FSTPq : XOg_FSTPd, RID_ESP, ofs);
572 } 700 }
573#endif 701#endif
574#if LJ_32
575 } else if (hiop) { 702 } else if (hiop) {
576 ra_destpair(as, ir); 703 ra_destpair(as, ir);
577#endif
578 } else { 704 } else {
579 lua_assert(!irt_ispri(ir->t)); 705 lj_assertA(!irt_ispri(ir->t), "PRI dest");
580 ra_destreg(as, ir, RID_RET); 706 ra_destreg(as, ir, RID_RET);
581 } 707 }
582 } else if (LJ_32 && irt_isfp(ir->t) && !(ci->flags & CCI_CASTU64)) { 708 } else if (LJ_32 && irt_isfp(ir->t) && !(ci->flags & CCI_CASTU64)) {
@@ -584,15 +710,6 @@ static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci)
584 } 710 }
585} 711}
586 712
587static void asm_call(ASMState *as, IRIns *ir)
588{
589 IRRef args[CCI_NARGS_MAX];
590 const CCallInfo *ci = &lj_ir_callinfo[ir->op2];
591 asm_collectargs(as, ir, ci, args);
592 asm_setupresult(as, ir, ci);
593 asm_gencall(as, ci, args);
594}
595
596/* Return a constant function pointer or NULL for indirect calls. */ 713/* Return a constant function pointer or NULL for indirect calls. */
597static void *asm_callx_func(ASMState *as, IRIns *irf, IRRef func) 714static void *asm_callx_func(ASMState *as, IRIns *irf, IRRef func)
598{ 715{
@@ -651,16 +768,39 @@ static void asm_callx(ASMState *as, IRIns *ir)
651static void asm_retf(ASMState *as, IRIns *ir) 768static void asm_retf(ASMState *as, IRIns *ir)
652{ 769{
653 Reg base = ra_alloc1(as, REF_BASE, RSET_GPR); 770 Reg base = ra_alloc1(as, REF_BASE, RSET_GPR);
771#if LJ_FR2
772 Reg rpc = ra_scratch(as, rset_exclude(RSET_GPR, base));
773#endif
654 void *pc = ir_kptr(IR(ir->op2)); 774 void *pc = ir_kptr(IR(ir->op2));
655 int32_t delta = 1+bc_a(*((const BCIns *)pc - 1)); 775 int32_t delta = 1+LJ_FR2+bc_a(*((const BCIns *)pc - 1));
656 as->topslot -= (BCReg)delta; 776 as->topslot -= (BCReg)delta;
657 if ((int32_t)as->topslot < 0) as->topslot = 0; 777 if ((int32_t)as->topslot < 0) as->topslot = 0;
658 irt_setmark(IR(REF_BASE)->t); /* Children must not coalesce with BASE reg. */ 778 irt_setmark(IR(REF_BASE)->t); /* Children must not coalesce with BASE reg. */
659 emit_setgl(as, base, jit_base); 779 emit_setgl(as, base, jit_base);
660 emit_addptr(as, base, -8*delta); 780 emit_addptr(as, base, -8*delta);
661 asm_guardcc(as, CC_NE); 781 asm_guardcc(as, CC_NE);
782#if LJ_FR2
783 emit_rmro(as, XO_CMP, rpc|REX_GC64, base, -8);
784 emit_loadu64(as, rpc, u64ptr(pc));
785#else
662 emit_gmroi(as, XG_ARITHi(XOg_CMP), base, -4, ptr2addr(pc)); 786 emit_gmroi(as, XG_ARITHi(XOg_CMP), base, -4, ptr2addr(pc));
787#endif
788}
789
790/* -- Buffer operations --------------------------------------------------- */
791
792#if LJ_HASBUFFER
793static void asm_bufhdr_write(ASMState *as, Reg sb)
794{
795 Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, sb));
796 IRIns irgc;
797 irgc.ot = IRT(0, IRT_PGC); /* GC type. */
798 emit_storeofs(as, &irgc, tmp, sb, offsetof(SBuf, L));
799 emit_opgl(as, XO_ARITH(XOg_OR), tmp|REX_GC64, cur_L);
800 emit_gri(as, XG_ARITHi(XOg_AND), tmp, SBUF_MASK_FLAG);
801 emit_loadofs(as, &irgc, tmp, sb, offsetof(SBuf, L));
663} 802}
803#endif
664 804
665/* -- Type conversions ---------------------------------------------------- */ 805/* -- Type conversions ---------------------------------------------------- */
666 806
@@ -672,8 +812,7 @@ static void asm_tointg(ASMState *as, IRIns *ir, Reg left)
672 asm_guardcc(as, CC_NE); 812 asm_guardcc(as, CC_NE);
673 emit_rr(as, XO_UCOMISD, left, tmp); 813 emit_rr(as, XO_UCOMISD, left, tmp);
674 emit_rr(as, XO_CVTSI2SD, tmp, dest); 814 emit_rr(as, XO_CVTSI2SD, tmp, dest);
675 if (!(as->flags & JIT_F_SPLIT_XMM)) 815 emit_rr(as, XO_XORPS, tmp, tmp); /* Avoid partial register stall. */
676 emit_rr(as, XO_XORPS, tmp, tmp); /* Avoid partial register stall. */
677 emit_rr(as, XO_CVTTSD2SI, dest, left); 816 emit_rr(as, XO_CVTTSD2SI, dest, left);
678 /* Can't fuse since left is needed twice. */ 817 /* Can't fuse since left is needed twice. */
679} 818}
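
The guarded number-to-integer conversion emitted by asm_tointg() boils down to the following C-level pattern (a simplified sketch that ignores the guard/exit plumbing and NaN corner cases; not part of the patch):

#include <stdint.h>

/* Truncate, convert back, and fail unless the round trip is exact --
** the check the CVTTSD2SI / CVTSI2SD / UCOMISD sequence above performs. */
static int checked_num2int(double n, int32_t *out)
{
  int32_t i = (int32_t)n;   /* CVTTSD2SI */
  if ((double)i != n)       /* CVTSI2SD + UCOMISD, guard taken on CC_NE */
    return 0;
  *out = i;
  return 1;
}
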
@@ -684,8 +823,9 @@ static void asm_tobit(ASMState *as, IRIns *ir)
684 Reg tmp = ra_noreg(IR(ir->op1)->r) ? 823 Reg tmp = ra_noreg(IR(ir->op1)->r) ?
685 ra_alloc1(as, ir->op1, RSET_FPR) : 824 ra_alloc1(as, ir->op1, RSET_FPR) :
686 ra_scratch(as, RSET_FPR); 825 ra_scratch(as, RSET_FPR);
687 Reg right = asm_fuseload(as, ir->op2, rset_exclude(RSET_FPR, tmp)); 826 Reg right;
688 emit_rr(as, XO_MOVDto, tmp, dest); 827 emit_rr(as, XO_MOVDto, tmp, dest);
828 right = asm_fuseload(as, ir->op2, rset_exclude(RSET_FPR, tmp));
689 emit_mrm(as, XO_ADDSD, tmp, right); 829 emit_mrm(as, XO_ADDSD, tmp, right);
690 ra_left(as, tmp, ir->op1); 830 ra_left(as, tmp, ir->op1);
691} 831}
@@ -696,8 +836,10 @@ static void asm_conv(ASMState *as, IRIns *ir)
696 int st64 = (st == IRT_I64 || st == IRT_U64 || (LJ_64 && st == IRT_P64)); 836 int st64 = (st == IRT_I64 || st == IRT_U64 || (LJ_64 && st == IRT_P64));
697 int stfp = (st == IRT_NUM || st == IRT_FLOAT); 837 int stfp = (st == IRT_NUM || st == IRT_FLOAT);
698 IRRef lref = ir->op1; 838 IRRef lref = ir->op1;
699 lua_assert(irt_type(ir->t) != st); 839 lj_assertA(irt_type(ir->t) != st, "inconsistent types for CONV");
700 lua_assert(!(LJ_32 && (irt_isint64(ir->t) || st64))); /* Handled by SPLIT. */ 840 lj_assertA(!(LJ_32 && (irt_isint64(ir->t) || st64)),
841 "IR %04d has unsplit 64 bit type",
842 (int)(ir - as->ir) - REF_BIAS);
701 if (irt_isfp(ir->t)) { 843 if (irt_isfp(ir->t)) {
702 Reg dest = ra_dest(as, ir, RSET_FPR); 844 Reg dest = ra_dest(as, ir, RSET_FPR);
703 if (stfp) { /* FP to FP conversion. */ 845 if (stfp) { /* FP to FP conversion. */
@@ -706,13 +848,13 @@ static void asm_conv(ASMState *as, IRIns *ir)
706 if (left == dest) return; /* Avoid the XO_XORPS. */ 848 if (left == dest) return; /* Avoid the XO_XORPS. */
707 } else if (LJ_32 && st == IRT_U32) { /* U32 to FP conversion on x86. */ 849 } else if (LJ_32 && st == IRT_U32) { /* U32 to FP conversion on x86. */
708 /* number = (2^52+2^51 .. u32) - (2^52+2^51) */ 850 /* number = (2^52+2^51 .. u32) - (2^52+2^51) */
709 cTValue *k = lj_ir_k64_find(as->J, U64x(43380000,00000000)); 851 cTValue *k = &as->J->k64[LJ_K64_TOBIT];
710 Reg bias = ra_scratch(as, rset_exclude(RSET_FPR, dest)); 852 Reg bias = ra_scratch(as, rset_exclude(RSET_FPR, dest));
711 if (irt_isfloat(ir->t)) 853 if (irt_isfloat(ir->t))
712 emit_rr(as, XO_CVTSD2SS, dest, dest); 854 emit_rr(as, XO_CVTSD2SS, dest, dest);
713 emit_rr(as, XO_SUBSD, dest, bias); /* Subtract 2^52+2^51 bias. */ 855 emit_rr(as, XO_SUBSD, dest, bias); /* Subtract 2^52+2^51 bias. */
714 emit_rr(as, XO_XORPS, dest, bias); /* Merge bias and integer. */ 856 emit_rr(as, XO_XORPS, dest, bias); /* Merge bias and integer. */
715 emit_loadn(as, bias, k); 857 emit_rma(as, XO_MOVSD, bias, k);
716 emit_mrm(as, XO_MOVD, dest, asm_fuseload(as, lref, RSET_GPR)); 858 emit_mrm(as, XO_MOVD, dest, asm_fuseload(as, lref, RSET_GPR));
717 return; 859 return;
718 } else { /* Integer to FP conversion. */ 860 } else { /* Integer to FP conversion. */
@@ -721,7 +863,7 @@ static void asm_conv(ASMState *as, IRIns *ir)
721 asm_fuseloadm(as, lref, RSET_GPR, st64); 863 asm_fuseloadm(as, lref, RSET_GPR, st64);
722 if (LJ_64 && st == IRT_U64) { 864 if (LJ_64 && st == IRT_U64) {
723 MCLabel l_end = emit_label(as); 865 MCLabel l_end = emit_label(as);
724 const void *k = lj_ir_k64_find(as->J, U64x(43f00000,00000000)); 866 cTValue *k = &as->J->k64[LJ_K64_2P64];
725 emit_rma(as, XO_ADDSD, dest, k); /* Add 2^64 to compensate. */ 867 emit_rma(as, XO_ADDSD, dest, k); /* Add 2^64 to compensate. */
726 emit_sjcc(as, CC_NS, l_end); 868 emit_sjcc(as, CC_NS, l_end);
727 emit_rr(as, XO_TEST, left|REX_64, left); /* Check if u64 >= 2^63. */ 869 emit_rr(as, XO_TEST, left|REX_64, left); /* Check if u64 >= 2^63. */
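
For reference, the unsigned 64 bit to floating-point path above (a signed convert plus a conditional 2^64 correction keyed off the sign bit) corresponds to this C-level sketch (illustrative only, not part of the patch):

#include <stdint.h>

/* Convert as signed (CVTSI2SD), then add 2^64 when the top bit was set --
** the correction the emitted TEST/JNS skips for inputs below 2^63. */
static double u64_to_double(uint64_t v)
{
  double d = (double)(int64_t)v;
  if ((int64_t)v < 0)
    d += 18446744073709551616.0;  /* 2^64, the LJ_K64_2P64 constant */
  return d;
}
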
@@ -729,18 +871,16 @@ static void asm_conv(ASMState *as, IRIns *ir)
729 emit_mrm(as, irt_isnum(ir->t) ? XO_CVTSI2SD : XO_CVTSI2SS, 871 emit_mrm(as, irt_isnum(ir->t) ? XO_CVTSI2SD : XO_CVTSI2SS,
730 dest|((LJ_64 && (st64 || st == IRT_U32)) ? REX_64 : 0), left); 872 dest|((LJ_64 && (st64 || st == IRT_U32)) ? REX_64 : 0), left);
731 } 873 }
732 if (!(as->flags & JIT_F_SPLIT_XMM)) 874 emit_rr(as, XO_XORPS, dest, dest); /* Avoid partial register stall. */
733 emit_rr(as, XO_XORPS, dest, dest); /* Avoid partial register stall. */
734 } else if (stfp) { /* FP to integer conversion. */ 875 } else if (stfp) { /* FP to integer conversion. */
735 if (irt_isguard(ir->t)) { 876 if (irt_isguard(ir->t)) {
736 /* Checked conversions are only supported from number to int. */ 877 /* Checked conversions are only supported from number to int. */
737 lua_assert(irt_isint(ir->t) && st == IRT_NUM); 878 lj_assertA(irt_isint(ir->t) && st == IRT_NUM,
879 "bad type for checked CONV");
738 asm_tointg(as, ir, ra_alloc1(as, lref, RSET_FPR)); 880 asm_tointg(as, ir, ra_alloc1(as, lref, RSET_FPR));
739 } else { 881 } else {
740 Reg dest = ra_dest(as, ir, RSET_GPR); 882 Reg dest = ra_dest(as, ir, RSET_GPR);
741 x86Op op = st == IRT_NUM ? 883 x86Op op = st == IRT_NUM ? XO_CVTTSD2SI : XO_CVTTSS2SI;
742 ((ir->op2 & IRCONV_TRUNC) ? XO_CVTTSD2SI : XO_CVTSD2SI) :
743 ((ir->op2 & IRCONV_TRUNC) ? XO_CVTTSS2SI : XO_CVTSS2SI);
744 if (LJ_64 ? irt_isu64(ir->t) : irt_isu32(ir->t)) { 884 if (LJ_64 ? irt_isu64(ir->t) : irt_isu32(ir->t)) {
745 /* LJ_64: For inputs >= 2^63 add -2^64, convert again. */ 885 /* LJ_64: For inputs >= 2^63 add -2^64, convert again. */
746 /* LJ_32: For inputs >= 2^31 add -2^31, convert again and add 2^31. */ 886 /* LJ_32: For inputs >= 2^31 add -2^31, convert again and add 2^31. */
@@ -751,30 +891,27 @@ static void asm_conv(ASMState *as, IRIns *ir)
751 emit_gri(as, XG_ARITHi(XOg_ADD), dest, (int32_t)0x80000000); 891 emit_gri(as, XG_ARITHi(XOg_ADD), dest, (int32_t)0x80000000);
752 emit_rr(as, op, dest|REX_64, tmp); 892 emit_rr(as, op, dest|REX_64, tmp);
753 if (st == IRT_NUM) 893 if (st == IRT_NUM)
754 emit_rma(as, XO_ADDSD, tmp, lj_ir_k64_find(as->J, 894 emit_rma(as, XO_ADDSD, tmp, &as->J->k64[LJ_K64_M2P64_31]);
755 LJ_64 ? U64x(c3f00000,00000000) : U64x(c1e00000,00000000)));
756 else 895 else
757 emit_rma(as, XO_ADDSS, tmp, lj_ir_k64_find(as->J, 896 emit_rma(as, XO_ADDSS, tmp, &as->J->k32[LJ_K32_M2P64_31]);
758 LJ_64 ? U64x(00000000,df800000) : U64x(00000000,cf000000)));
759 emit_sjcc(as, CC_NS, l_end); 897 emit_sjcc(as, CC_NS, l_end);
760 emit_rr(as, XO_TEST, dest|REX_64, dest); /* Check if dest negative. */ 898 emit_rr(as, XO_TEST, dest|REX_64, dest); /* Check if dest negative. */
761 emit_rr(as, op, dest|REX_64, tmp); 899 emit_rr(as, op, dest|REX_64, tmp);
762 ra_left(as, tmp, lref); 900 ra_left(as, tmp, lref);
763 } else { 901 } else {
764 Reg left = asm_fuseload(as, lref, RSET_FPR);
765 if (LJ_64 && irt_isu32(ir->t)) 902 if (LJ_64 && irt_isu32(ir->t))
766 emit_rr(as, XO_MOV, dest, dest); /* Zero hiword. */ 903 emit_rr(as, XO_MOV, dest, dest); /* Zero hiword. */
767 emit_mrm(as, op, 904 emit_mrm(as, op,
768 dest|((LJ_64 && 905 dest|((LJ_64 &&
769 (irt_is64(ir->t) || irt_isu32(ir->t))) ? REX_64 : 0), 906 (irt_is64(ir->t) || irt_isu32(ir->t))) ? REX_64 : 0),
770 left); 907 asm_fuseload(as, lref, RSET_FPR));
771 } 908 }
772 } 909 }
773 } else if (st >= IRT_I8 && st <= IRT_U16) { /* Extend to 32 bit integer. */ 910 } else if (st >= IRT_I8 && st <= IRT_U16) { /* Extend to 32 bit integer. */
774 Reg left, dest = ra_dest(as, ir, RSET_GPR); 911 Reg left, dest = ra_dest(as, ir, RSET_GPR);
775 RegSet allow = RSET_GPR; 912 RegSet allow = RSET_GPR;
776 x86Op op; 913 x86Op op;
777 lua_assert(irt_isint(ir->t) || irt_isu32(ir->t)); 914 lj_assertA(irt_isint(ir->t) || irt_isu32(ir->t), "bad type for CONV EXT");
778 if (st == IRT_I8) { 915 if (st == IRT_I8) {
779 op = XO_MOVSXb; allow = RSET_GPR8; dest |= FORCE_REX; 916 op = XO_MOVSXb; allow = RSET_GPR8; dest |= FORCE_REX;
780 } else if (st == IRT_U8) { 917 } else if (st == IRT_U8) {
@@ -808,7 +945,7 @@ static void asm_conv(ASMState *as, IRIns *ir)
808 } 945 }
809 } else { 946 } else {
810 Reg dest = ra_dest(as, ir, RSET_GPR); 947 Reg dest = ra_dest(as, ir, RSET_GPR);
811 if (st64) { 948 if (st64 && !(ir->op2 & IRCONV_NONE)) {
812 Reg left = asm_fuseload(as, lref, RSET_GPR); 949 Reg left = asm_fuseload(as, lref, RSET_GPR);
813 /* This is either a 32 bit reg/reg mov which zeroes the hiword 950 /* This is either a 32 bit reg/reg mov which zeroes the hiword
814 ** or a load of the loword from a 64 bit address. 951 ** or a load of the loword from a 64 bit address.
@@ -834,20 +971,18 @@ static void asm_conv_fp_int64(ASMState *as, IRIns *ir)
834 if (ra_hasreg(dest)) { 971 if (ra_hasreg(dest)) {
835 ra_free(as, dest); 972 ra_free(as, dest);
836 ra_modified(as, dest); 973 ra_modified(as, dest);
837 emit_rmro(as, irt_isnum(ir->t) ? XMM_MOVRM(as) : XO_MOVSS, 974 emit_rmro(as, irt_isnum(ir->t) ? XO_MOVSD : XO_MOVSS, dest, RID_ESP, ofs);
838 dest, RID_ESP, ofs);
839 } 975 }
840 emit_rmro(as, irt_isnum(ir->t) ? XO_FSTPq : XO_FSTPd, 976 emit_rmro(as, irt_isnum(ir->t) ? XO_FSTPq : XO_FSTPd,
841 irt_isnum(ir->t) ? XOg_FSTPq : XOg_FSTPd, RID_ESP, ofs); 977 irt_isnum(ir->t) ? XOg_FSTPq : XOg_FSTPd, RID_ESP, ofs);
842 if (((ir-1)->op2 & IRCONV_SRCMASK) == IRT_U64) { 978 if (((ir-1)->op2 & IRCONV_SRCMASK) == IRT_U64) {
843 /* For inputs in [2^63,2^64-1] add 2^64 to compensate. */ 979 /* For inputs in [2^63,2^64-1] add 2^64 to compensate. */
844 MCLabel l_end = emit_label(as); 980 MCLabel l_end = emit_label(as);
845 emit_rma(as, XO_FADDq, XOg_FADDq, 981 emit_rma(as, XO_FADDq, XOg_FADDq, &as->J->k64[LJ_K64_2P64]);
846 lj_ir_k64_find(as->J, U64x(43f00000,00000000)));
847 emit_sjcc(as, CC_NS, l_end); 982 emit_sjcc(as, CC_NS, l_end);
848 emit_rr(as, XO_TEST, hi, hi); /* Check if u64 >= 2^63. */ 983 emit_rr(as, XO_TEST, hi, hi); /* Check if u64 >= 2^63. */
849 } else { 984 } else {
850 lua_assert(((ir-1)->op2 & IRCONV_SRCMASK) == IRT_I64); 985 lj_assertA(((ir-1)->op2 & IRCONV_SRCMASK) == IRT_I64, "bad type for CONV");
851 } 986 }
852 emit_rmro(as, XO_FILDq, XOg_FILDq, RID_ESP, 0); 987 emit_rmro(as, XO_FILDq, XOg_FILDq, RID_ESP, 0);
853 /* NYI: Avoid narrow-to-wide store-to-load forwarding stall. */ 988 /* NYI: Avoid narrow-to-wide store-to-load forwarding stall. */
@@ -861,9 +996,8 @@ static void asm_conv_int64_fp(ASMState *as, IRIns *ir)
861 IRType st = (IRType)((ir-1)->op2 & IRCONV_SRCMASK); 996 IRType st = (IRType)((ir-1)->op2 & IRCONV_SRCMASK);
862 IRType dt = (((ir-1)->op2 & IRCONV_DSTMASK) >> IRCONV_DSH); 997 IRType dt = (((ir-1)->op2 & IRCONV_DSTMASK) >> IRCONV_DSH);
863 Reg lo, hi; 998 Reg lo, hi;
864 lua_assert(st == IRT_NUM || st == IRT_FLOAT); 999 lj_assertA(st == IRT_NUM || st == IRT_FLOAT, "bad type for CONV");
865 lua_assert(dt == IRT_I64 || dt == IRT_U64); 1000 lj_assertA(dt == IRT_I64 || dt == IRT_U64, "bad type for CONV");
866 lua_assert(((ir-1)->op2 & IRCONV_TRUNC));
867 hi = ra_dest(as, ir, RSET_GPR); 1001 hi = ra_dest(as, ir, RSET_GPR);
868 lo = ra_dest(as, ir-1, rset_exclude(RSET_GPR, hi)); 1002 lo = ra_dest(as, ir-1, rset_exclude(RSET_GPR, hi));
869 if (ra_used(ir-1)) emit_rmro(as, XO_MOV, lo, RID_ESP, 0); 1003 if (ra_used(ir-1)) emit_rmro(as, XO_MOV, lo, RID_ESP, 0);
@@ -884,8 +1018,7 @@ static void asm_conv_int64_fp(ASMState *as, IRIns *ir)
884 emit_rmro(as, XO_FISTTPq, XOg_FISTTPq, RID_ESP, 0); 1018 emit_rmro(as, XO_FISTTPq, XOg_FISTTPq, RID_ESP, 0);
885 else 1019 else
886 emit_rmro(as, XO_FISTPq, XOg_FISTPq, RID_ESP, 0); 1020 emit_rmro(as, XO_FISTPq, XOg_FISTPq, RID_ESP, 0);
887 emit_rma(as, XO_FADDq, XOg_FADDq, 1021 emit_rma(as, XO_FADDq, XOg_FADDq, &as->J->k64[LJ_K64_M2P64]);
888 lj_ir_k64_find(as->J, U64x(c3f00000,00000000)));
889 emit_sjcc(as, CC_NS, l_pop); 1022 emit_sjcc(as, CC_NS, l_pop);
890 emit_rr(as, XO_TEST, hi, hi); /* Check if out-of-range (2^63). */ 1023 emit_rr(as, XO_TEST, hi, hi); /* Check if out-of-range (2^63). */
891 } 1024 }
@@ -906,6 +1039,14 @@ static void asm_conv_int64_fp(ASMState *as, IRIns *ir)
906 st == IRT_NUM ? XOg_FLDq: XOg_FLDd, 1039 st == IRT_NUM ? XOg_FLDq: XOg_FLDd,
907 asm_fuseload(as, ir->op1, RSET_EMPTY)); 1040 asm_fuseload(as, ir->op1, RSET_EMPTY));
908} 1041}
1042
1043static void asm_conv64(ASMState *as, IRIns *ir)
1044{
1045 if (irt_isfp(ir->t))
1046 asm_conv_fp_int64(as, ir);
1047 else
1048 asm_conv_int64_fp(as, ir);
1049}
909#endif 1050#endif
910 1051
911static void asm_strto(ASMState *as, IRIns *ir) 1052static void asm_strto(ASMState *as, IRIns *ir)
@@ -927,54 +1068,61 @@ static void asm_strto(ASMState *as, IRIns *ir)
927 RID_ESP, sps_scale(ir->s)); 1068 RID_ESP, sps_scale(ir->s));
928} 1069}
929 1070
930static void asm_tostr(ASMState *as, IRIns *ir) 1071/* -- Memory references --------------------------------------------------- */
1072
1073/* Get pointer to TValue. */
1074static void asm_tvptr(ASMState *as, Reg dest, IRRef ref, MSize mode)
931{ 1075{
932 IRIns *irl = IR(ir->op1); 1076 if ((mode & IRTMPREF_IN1)) {
933 IRRef args[2]; 1077 IRIns *ir = IR(ref);
934 args[0] = ASMREF_L; 1078 if (irt_isnum(ir->t)) {
935 as->gcsteps++; 1079 if (irref_isk(ref) && !(mode & IRTMPREF_OUT1)) {
936 if (irt_isnum(irl->t)) { 1080 /* Use the number constant itself as a TValue. */
937 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_str_fromnum]; 1081 emit_loada(as, dest, ir_knum(ir));
938 args[1] = ASMREF_TMP1; /* const lua_Number * */ 1082 return;
939 asm_setupresult(as, ir, ci); /* GCstr * */ 1083 }
940 asm_gencall(as, ci, args); 1084 emit_rmro(as, XO_MOVSDto, ra_alloc1(as, ref, RSET_FPR), dest, 0);
941 emit_rmro(as, XO_LEA, ra_releasetmp(as, ASMREF_TMP1)|REX_64, 1085 } else {
942 RID_ESP, ra_spill(as, irl)); 1086#if LJ_GC64
943 } else { 1087 if (irref_isk(ref)) {
944 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_str_fromint]; 1088 TValue k;
945 args[1] = ir->op1; /* int32_t k */ 1089 lj_ir_kvalue(as->J->L, &k, ir);
946 asm_setupresult(as, ir, ci); /* GCstr * */ 1090 emit_movmroi(as, dest, 4, k.u32.hi);
947 asm_gencall(as, ci, args); 1091 emit_movmroi(as, dest, 0, k.u32.lo);
1092 } else {
1093 /* TODO: 64 bit store + 32 bit load-modify-store is suboptimal. */
1094 Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, dest));
1095 if (irt_is64(ir->t)) {
1096 emit_u32(as, irt_toitype(ir->t) << 15);
1097 emit_rmro(as, XO_ARITHi, XOg_OR, dest, 4);
1098 } else {
1099 emit_movmroi(as, dest, 4, (irt_toitype(ir->t) << 15));
1100 }
1101 emit_movtomro(as, REX_64IR(ir, src), dest, 0);
1102 }
1103#else
1104 if (!irref_isk(ref)) {
1105 Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, dest));
1106 emit_movtomro(as, REX_64IR(ir, src), dest, 0);
1107 } else if (!irt_ispri(ir->t)) {
1108 emit_movmroi(as, dest, 0, ir->i);
1109 }
1110 if (!(LJ_64 && irt_islightud(ir->t)))
1111 emit_movmroi(as, dest, 4, irt_toitype(ir->t));
1112#endif
1113 }
948 } 1114 }
1115 emit_loada(as, dest, &J2G(as->J)->tmptv); /* g->tmptv holds the TValue(s). */
949} 1116}
950 1117
951/* -- Memory references --------------------------------------------------- */
952
953static void asm_aref(ASMState *as, IRIns *ir) 1118static void asm_aref(ASMState *as, IRIns *ir)
954{ 1119{
955 Reg dest = ra_dest(as, ir, RSET_GPR); 1120 Reg dest = ra_dest(as, ir, RSET_GPR);
956 asm_fusearef(as, ir, RSET_GPR); 1121 asm_fusearef(as, ir, RSET_GPR);
957 if (!(as->mrm.idx == RID_NONE && as->mrm.ofs == 0)) 1122 if (!(as->mrm.idx == RID_NONE && as->mrm.ofs == 0))
958 emit_mrm(as, XO_LEA, dest, RID_MRM); 1123 emit_mrm(as, XO_LEA, dest|REX_GC64, RID_MRM);
959 else if (as->mrm.base != dest) 1124 else if (as->mrm.base != dest)
960 emit_rr(as, XO_MOV, dest, as->mrm.base); 1125 emit_rr(as, XO_MOV, dest|REX_GC64, as->mrm.base);
961}
962
963/* Merge NE(HREF, niltv) check. */
964static MCode *merge_href_niltv(ASMState *as, IRIns *ir)
965{
966 /* Assumes nothing else generates NE of HREF. */
967 if ((ir[1].o == IR_NE || ir[1].o == IR_EQ) && ir[1].op1 == as->curins &&
968 ra_hasreg(ir->r)) {
969 MCode *p = as->mcp;
970 p += (LJ_64 && *p != XI_ARITHi) ? 7+6 : 6+6;
971 /* Ensure no loop branch inversion happened. */
972 if (p[-6] == 0x0f && p[-5] == XI_JCCn+(CC_NE^(ir[1].o & 1))) {
973 as->mcp = p; /* Kill cmp reg, imm32 + jz exit. */
974 return p + *(int32_t *)(p-4); /* Return exit address. */
975 }
976 }
977 return NULL;
978} 1126}
979 1127
980/* Inlined hash lookup. Specialized for key type and for const keys. 1128/* Inlined hash lookup. Specialized for key type and for const keys.
@@ -985,10 +1133,10 @@ static MCode *merge_href_niltv(ASMState *as, IRIns *ir)
985** } while ((n = nextnode(n))); 1133** } while ((n = nextnode(n)));
986** return niltv(L); 1134** return niltv(L);
987*/ 1135*/
988static void asm_href(ASMState *as, IRIns *ir) 1136static void asm_href(ASMState *as, IRIns *ir, IROp merge)
989{ 1137{
990 MCode *nilexit = merge_href_niltv(as, ir); /* Do this before any restores. */
991 RegSet allow = RSET_GPR; 1138 RegSet allow = RSET_GPR;
1139 int destused = ra_used(ir);
992 Reg dest = ra_dest(as, ir, allow); 1140 Reg dest = ra_dest(as, ir, allow);
993 Reg tab = ra_alloc1(as, ir->op1, rset_clear(allow, dest)); 1141 Reg tab = ra_alloc1(as, ir->op1, rset_clear(allow, dest));
994 Reg key = RID_NONE, tmp = RID_NONE; 1142 Reg key = RID_NONE, tmp = RID_NONE;
@@ -1001,28 +1149,26 @@ static void asm_href(ASMState *as, IRIns *ir)
1001 if (!isk) { 1149 if (!isk) {
1002 rset_clear(allow, tab); 1150 rset_clear(allow, tab);
1003 key = ra_alloc1(as, ir->op2, irt_isnum(kt) ? RSET_FPR : allow); 1151 key = ra_alloc1(as, ir->op2, irt_isnum(kt) ? RSET_FPR : allow);
1004 if (!irt_isstr(kt)) 1152 if (LJ_GC64 || !irt_isstr(kt))
1005 tmp = ra_scratch(as, rset_exclude(allow, key)); 1153 tmp = ra_scratch(as, rset_exclude(allow, key));
1006 } 1154 }
1007 1155
1008 /* Key not found in chain: jump to exit (if merged with NE) or load niltv. */ 1156 /* Key not found in chain: jump to exit (if merged) or load niltv. */
1009 l_end = emit_label(as); 1157 l_end = emit_label(as);
1010 if (nilexit && ir[1].o == IR_NE) { 1158 if (merge == IR_NE)
1011 emit_jcc(as, CC_E, nilexit); /* XI_JMP is not found by lj_asm_patchexit. */ 1159 asm_guardcc(as, CC_E); /* XI_JMP is not found by lj_asm_patchexit. */
1012 nilexit = NULL; 1160 else if (destused)
1013 } else {
1014 emit_loada(as, dest, niltvg(J2G(as->J))); 1161 emit_loada(as, dest, niltvg(J2G(as->J)));
1015 }
1016 1162
1017 /* Follow hash chain until the end. */ 1163 /* Follow hash chain until the end. */
1018 l_loop = emit_sjcc_label(as, CC_NZ); 1164 l_loop = emit_sjcc_label(as, CC_NZ);
1019 emit_rr(as, XO_TEST, dest, dest); 1165 emit_rr(as, XO_TEST, dest|REX_GC64, dest);
1020 emit_rmro(as, XO_MOV, dest, dest, offsetof(Node, next)); 1166 emit_rmro(as, XO_MOV, dest|REX_GC64, dest, offsetof(Node, next));
1021 l_next = emit_label(as); 1167 l_next = emit_label(as);
1022 1168
1023 /* Type and value comparison. */ 1169 /* Type and value comparison. */
1024 if (nilexit) 1170 if (merge == IR_EQ)
1025 emit_jcc(as, CC_E, nilexit); 1171 asm_guardcc(as, CC_E);
1026 else 1172 else
1027 emit_sjcc(as, CC_E, l_end); 1173 emit_sjcc(as, CC_E, l_end);
1028 if (irt_isnum(kt)) { 1174 if (irt_isnum(kt)) {
@@ -1038,7 +1184,7 @@ static void asm_href(ASMState *as, IRIns *ir)
1038 emit_rmro(as, XO_UCOMISD, key, dest, offsetof(Node, key.n)); 1184 emit_rmro(as, XO_UCOMISD, key, dest, offsetof(Node, key.n));
1039 emit_sjcc(as, CC_AE, l_next); 1185 emit_sjcc(as, CC_AE, l_next);
1040 /* The type check avoids NaN penalties and complaints from Valgrind. */ 1186 /* The type check avoids NaN penalties and complaints from Valgrind. */
1041#if LJ_64 1187#if LJ_64 && !LJ_GC64
1042 emit_u32(as, LJ_TISNUM); 1188 emit_u32(as, LJ_TISNUM);
1043 emit_rmro(as, XO_ARITHi, XOg_CMP, dest, offsetof(Node, key.it)); 1189 emit_rmro(as, XO_ARITHi, XOg_CMP, dest, offsetof(Node, key.it));
1044#else 1190#else
@@ -1046,13 +1192,31 @@ static void asm_href(ASMState *as, IRIns *ir)
1046 emit_rmro(as, XO_ARITHi8, XOg_CMP, dest, offsetof(Node, key.it)); 1192 emit_rmro(as, XO_ARITHi8, XOg_CMP, dest, offsetof(Node, key.it));
1047#endif 1193#endif
1048 } 1194 }
1049#if LJ_64 1195#if LJ_64 && !LJ_GC64
1050 } else if (irt_islightud(kt)) { 1196 } else if (irt_islightud(kt)) {
1051 emit_rmro(as, XO_CMP, key|REX_64, dest, offsetof(Node, key.u64)); 1197 emit_rmro(as, XO_CMP, key|REX_64, dest, offsetof(Node, key.u64));
1052#endif 1198#endif
1199#if LJ_GC64
1200 } else if (irt_isaddr(kt)) {
1201 if (isk) {
1202 TValue k;
1203 k.u64 = ((uint64_t)irt_toitype(irkey->t) << 47) | irkey[1].tv.u64;
1204 emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.u32.lo),
1205 k.u32.lo);
1206 emit_sjcc(as, CC_NE, l_next);
1207 emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.u32.hi),
1208 k.u32.hi);
1209 } else {
1210 emit_rmro(as, XO_CMP, tmp|REX_64, dest, offsetof(Node, key.u64));
1211 }
1212 } else {
1213 lj_assertA(irt_ispri(kt) && !irt_isnil(kt), "bad HREF key type");
1214 emit_u32(as, (irt_toitype(kt)<<15)|0x7fff);
1215 emit_rmro(as, XO_ARITHi, XOg_CMP, dest, offsetof(Node, key.it));
1216#else
1053 } else { 1217 } else {
1054 if (!irt_ispri(kt)) { 1218 if (!irt_ispri(kt)) {
1055 lua_assert(irt_isaddr(kt)); 1219 lj_assertA(irt_isaddr(kt), "bad HREF key type");
1056 if (isk) 1220 if (isk)
1057 emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.gcr), 1221 emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.gcr),
1058 ptr2addr(ir_kgc(irkey))); 1222 ptr2addr(ir_kgc(irkey)));
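
The 64 bit constants compared above assume the LJ_GC64 value layout: the item type tag sits above bit 47 of the word, with a 47 bit GC pointer below it, so tag and pointer can be checked in a single 64 bit compare. A hedged sketch of that packing (helper name invented for illustration):

#include <stdint.h>

/* Pack an item type tag and a 47 bit pointer into one 64 bit word, mirroring
** the ((uint64_t)irt_toitype(t) << 47) | ptr pattern used above. */
static uint64_t gc64_box(uint32_t itype, uint64_t ptr47)
{
  return ((uint64_t)itype << 47) | (ptr47 & (((uint64_t)1 << 47) - 1));
}
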
@@ -1060,31 +1224,33 @@ static void asm_href(ASMState *as, IRIns *ir)
1060 emit_rmro(as, XO_CMP, key, dest, offsetof(Node, key.gcr)); 1224 emit_rmro(as, XO_CMP, key, dest, offsetof(Node, key.gcr));
1061 emit_sjcc(as, CC_NE, l_next); 1225 emit_sjcc(as, CC_NE, l_next);
1062 } 1226 }
1063 lua_assert(!irt_isnil(kt)); 1227 lj_assertA(!irt_isnil(kt), "bad HREF key type");
1064 emit_i8(as, irt_toitype(kt)); 1228 emit_i8(as, irt_toitype(kt));
1065 emit_rmro(as, XO_ARITHi8, XOg_CMP, dest, offsetof(Node, key.it)); 1229 emit_rmro(as, XO_ARITHi8, XOg_CMP, dest, offsetof(Node, key.it));
1230#endif
1066 } 1231 }
1067 emit_sfixup(as, l_loop); 1232 emit_sfixup(as, l_loop);
1068 checkmclim(as); 1233 checkmclim(as);
1234#if LJ_GC64
1235 if (!isk && irt_isaddr(kt)) {
1236 emit_rr(as, XO_OR, tmp|REX_64, key);
1237 emit_loadu64(as, tmp, (uint64_t)irt_toitype(kt) << 47);
1238 }
1239#endif
1069 1240
1070 /* Load main position relative to tab->node into dest. */ 1241 /* Load main position relative to tab->node into dest. */
1071 khash = isk ? ir_khash(irkey) : 1; 1242 khash = isk ? ir_khash(as, irkey) : 1;
1072 if (khash == 0) { 1243 if (khash == 0) {
1073 emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, node)); 1244 emit_rmro(as, XO_MOV, dest|REX_GC64, tab, offsetof(GCtab, node));
1074 } else { 1245 } else {
1075 emit_rmro(as, XO_ARITH(XOg_ADD), dest, tab, offsetof(GCtab, node)); 1246 emit_rmro(as, XO_ARITH(XOg_ADD), dest|REX_GC64, tab, offsetof(GCtab,node));
1076 if ((as->flags & JIT_F_PREFER_IMUL)) { 1247 emit_shifti(as, XOg_SHL, dest, 3);
1077 emit_i8(as, sizeof(Node)); 1248 emit_rmrxo(as, XO_LEA, dest, dest, dest, XM_SCALE2, 0);
1078 emit_rr(as, XO_IMULi8, dest, dest);
1079 } else {
1080 emit_shifti(as, XOg_SHL, dest, 3);
1081 emit_rmrxo(as, XO_LEA, dest, dest, dest, XM_SCALE2, 0);
1082 }
1083 if (isk) { 1249 if (isk) {
1084 emit_gri(as, XG_ARITHi(XOg_AND), dest, (int32_t)khash); 1250 emit_gri(as, XG_ARITHi(XOg_AND), dest, (int32_t)khash);
1085 emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, hmask)); 1251 emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, hmask));
1086 } else if (irt_isstr(kt)) { 1252 } else if (irt_isstr(kt)) {
1087 emit_rmro(as, XO_ARITH(XOg_AND), dest, key, offsetof(GCstr, hash)); 1253 emit_rmro(as, XO_ARITH(XOg_AND), dest, key, offsetof(GCstr, sid));
1088 emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, hmask)); 1254 emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, hmask));
1089 } else { /* Must match with hashrot() in lj_tab.c. */ 1255 } else { /* Must match with hashrot() in lj_tab.c. */
1090 emit_rmro(as, XO_ARITH(XOg_AND), dest, tab, offsetof(GCtab, hmask)); 1256 emit_rmro(as, XO_ARITH(XOg_AND), dest, tab, offsetof(GCtab, hmask));
@@ -1107,7 +1273,19 @@ static void asm_href(ASMState *as, IRIns *ir)
1107#endif 1273#endif
1108 } else { 1274 } else {
1109 emit_rr(as, XO_MOV, tmp, key); 1275 emit_rr(as, XO_MOV, tmp, key);
1276#if LJ_GC64
1277 checkmclim(as);
1278 emit_gri(as, XG_ARITHi(XOg_XOR), dest, irt_toitype(kt) << 15);
1279 if ((as->flags & JIT_F_BMI2)) {
1280 emit_i8(as, 32);
1281 emit_mrm(as, XV_RORX|VEX_64, dest, key);
1282 } else {
1283 emit_shifti(as, XOg_SHR|REX_64, dest, 32);
1284 emit_rr(as, XO_MOV, dest|REX_64, key|REX_64);
1285 }
1286#else
1110 emit_rmro(as, XO_LEA, dest, key, HASH_BIAS); 1287 emit_rmro(as, XO_LEA, dest, key, HASH_BIAS);
1288#endif
1111 } 1289 }
1112 } 1290 }
1113 } 1291 }
@@ -1123,15 +1301,15 @@ static void asm_hrefk(ASMState *as, IRIns *ir)
1123#if !LJ_64 1301#if !LJ_64
1124 MCLabel l_exit; 1302 MCLabel l_exit;
1125#endif 1303#endif
1126 lua_assert(ofs % sizeof(Node) == 0); 1304 lj_assertA(ofs % sizeof(Node) == 0, "unaligned HREFK slot");
1127 if (ra_hasreg(dest)) { 1305 if (ra_hasreg(dest)) {
1128 if (ofs != 0) { 1306 if (ofs != 0) {
1129 if (dest == node && !(as->flags & JIT_F_LEA_AGU)) 1307 if (dest == node)
1130 emit_gri(as, XG_ARITHi(XOg_ADD), dest, ofs); 1308 emit_gri(as, XG_ARITHi(XOg_ADD), dest|REX_GC64, ofs);
1131 else 1309 else
1132 emit_rmro(as, XO_LEA, dest, node, ofs); 1310 emit_rmro(as, XO_LEA, dest|REX_GC64, node, ofs);
1133 } else if (dest != node) { 1311 } else if (dest != node) {
1134 emit_rr(as, XO_MOV, dest, node); 1312 emit_rr(as, XO_MOV, dest|REX_GC64, node);
1135 } 1313 }
1136 } 1314 }
1137 asm_guardcc(as, CC_NE); 1315 asm_guardcc(as, CC_NE);
@@ -1140,16 +1318,28 @@ static void asm_hrefk(ASMState *as, IRIns *ir)
1140 Reg key = ra_scratch(as, rset_exclude(RSET_GPR, node)); 1318 Reg key = ra_scratch(as, rset_exclude(RSET_GPR, node));
1141 emit_rmro(as, XO_CMP, key|REX_64, node, 1319 emit_rmro(as, XO_CMP, key|REX_64, node,
1142 ofs + (int32_t)offsetof(Node, key.u64)); 1320 ofs + (int32_t)offsetof(Node, key.u64));
1143 lua_assert(irt_isnum(irkey->t) || irt_isgcv(irkey->t)); 1321 lj_assertA(irt_isnum(irkey->t) || irt_isgcv(irkey->t),
1322 "bad HREFK key type");
1144 /* Assumes -0.0 is already canonicalized to +0.0. */ 1323 /* Assumes -0.0 is already canonicalized to +0.0. */
1145 emit_loadu64(as, key, irt_isnum(irkey->t) ? ir_knum(irkey)->u64 : 1324 emit_loadu64(as, key, irt_isnum(irkey->t) ? ir_knum(irkey)->u64 :
1325#if LJ_GC64
1326 ((uint64_t)irt_toitype(irkey->t) << 47) |
1327 (uint64_t)ir_kgc(irkey));
1328#else
1146 ((uint64_t)irt_toitype(irkey->t) << 32) | 1329 ((uint64_t)irt_toitype(irkey->t) << 32) |
1147 (uint64_t)(uint32_t)ptr2addr(ir_kgc(irkey))); 1330 (uint64_t)(uint32_t)ptr2addr(ir_kgc(irkey)));
1331#endif
1148 } else { 1332 } else {
1149 lua_assert(!irt_isnil(irkey->t)); 1333 lj_assertA(!irt_isnil(irkey->t), "bad HREFK key type");
1334#if LJ_GC64
1335 emit_i32(as, (irt_toitype(irkey->t)<<15)|0x7fff);
1336 emit_rmro(as, XO_ARITHi, XOg_CMP, node,
1337 ofs + (int32_t)offsetof(Node, key.it));
1338#else
1150 emit_i8(as, irt_toitype(irkey->t)); 1339 emit_i8(as, irt_toitype(irkey->t));
1151 emit_rmro(as, XO_ARITHi8, XOg_CMP, node, 1340 emit_rmro(as, XO_ARITHi8, XOg_CMP, node,
1152 ofs + (int32_t)offsetof(Node, key.it)); 1341 ofs + (int32_t)offsetof(Node, key.it));
1342#endif
1153 } 1343 }
1154#else 1344#else
1155 l_exit = emit_label(as); 1345 l_exit = emit_label(as);
@@ -1164,13 +1354,13 @@ static void asm_hrefk(ASMState *as, IRIns *ir)
1164 (int32_t)ir_knum(irkey)->u32.hi); 1354 (int32_t)ir_knum(irkey)->u32.hi);
1165 } else { 1355 } else {
1166 if (!irt_ispri(irkey->t)) { 1356 if (!irt_ispri(irkey->t)) {
1167 lua_assert(irt_isgcv(irkey->t)); 1357 lj_assertA(irt_isgcv(irkey->t), "bad HREFK key type");
1168 emit_gmroi(as, XG_ARITHi(XOg_CMP), node, 1358 emit_gmroi(as, XG_ARITHi(XOg_CMP), node,
1169 ofs + (int32_t)offsetof(Node, key.gcr), 1359 ofs + (int32_t)offsetof(Node, key.gcr),
1170 ptr2addr(ir_kgc(irkey))); 1360 ptr2addr(ir_kgc(irkey)));
1171 emit_sjcc(as, CC_NE, l_exit); 1361 emit_sjcc(as, CC_NE, l_exit);
1172 } 1362 }
1173 lua_assert(!irt_isnil(irkey->t)); 1363 lj_assertA(!irt_isnil(irkey->t), "bad HREFK key type");
1174 emit_i8(as, irt_toitype(irkey->t)); 1364 emit_i8(as, irt_toitype(irkey->t));
1175 emit_rmro(as, XO_ARITHi8, XOg_CMP, node, 1365 emit_rmro(as, XO_ARITHi8, XOg_CMP, node,
1176 ofs + (int32_t)offsetof(Node, key.it)); 1366 ofs + (int32_t)offsetof(Node, key.it));
@@ -1178,61 +1368,27 @@ static void asm_hrefk(ASMState *as, IRIns *ir)
1178#endif 1368#endif
1179} 1369}
1180 1370
1181static void asm_newref(ASMState *as, IRIns *ir)
1182{
1183 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_tab_newkey];
1184 IRRef args[3];
1185 IRIns *irkey;
1186 Reg tmp;
1187 if (ir->r == RID_SINK)
1188 return;
1189 args[0] = ASMREF_L; /* lua_State *L */
1190 args[1] = ir->op1; /* GCtab *t */
1191 args[2] = ASMREF_TMP1; /* cTValue *key */
1192 asm_setupresult(as, ir, ci); /* TValue * */
1193 asm_gencall(as, ci, args);
1194 tmp = ra_releasetmp(as, ASMREF_TMP1);
1195 irkey = IR(ir->op2);
1196 if (irt_isnum(irkey->t)) {
1197 /* For numbers use the constant itself or a spill slot as a TValue. */
1198 if (irref_isk(ir->op2))
1199 emit_loada(as, tmp, ir_knum(irkey));
1200 else
1201 emit_rmro(as, XO_LEA, tmp|REX_64, RID_ESP, ra_spill(as, irkey));
1202 } else {
1203 /* Otherwise use g->tmptv to hold the TValue. */
1204 if (!irref_isk(ir->op2)) {
1205 Reg src = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, tmp));
1206 emit_movtomro(as, REX_64IR(irkey, src), tmp, 0);
1207 } else if (!irt_ispri(irkey->t)) {
1208 emit_movmroi(as, tmp, 0, irkey->i);
1209 }
1210 if (!(LJ_64 && irt_islightud(irkey->t)))
1211 emit_movmroi(as, tmp, 4, irt_toitype(irkey->t));
1212 emit_loada(as, tmp, &J2G(as->J)->tmptv);
1213 }
1214}
1215
1216static void asm_uref(ASMState *as, IRIns *ir) 1371static void asm_uref(ASMState *as, IRIns *ir)
1217{ 1372{
1218 Reg dest = ra_dest(as, ir, RSET_GPR); 1373 Reg dest = ra_dest(as, ir, RSET_GPR);
1219 if (irref_isk(ir->op1)) { 1374 if (irref_isk(ir->op1)) {
1220 GCfunc *fn = ir_kfunc(IR(ir->op1)); 1375 GCfunc *fn = ir_kfunc(IR(ir->op1));
1221 MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v; 1376 MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v;
1222 emit_rma(as, XO_MOV, dest, v); 1377 emit_rma(as, XO_MOV, dest|REX_GC64, v);
1223 } else { 1378 } else {
1224 Reg uv = ra_scratch(as, RSET_GPR); 1379 Reg uv = ra_scratch(as, RSET_GPR);
1225 Reg func = ra_alloc1(as, ir->op1, RSET_GPR); 1380 Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
1226 if (ir->o == IR_UREFC) { 1381 if (ir->o == IR_UREFC) {
1227 emit_rmro(as, XO_LEA, dest, uv, offsetof(GCupval, tv)); 1382 emit_rmro(as, XO_LEA, dest|REX_GC64, uv, offsetof(GCupval, tv));
1228 asm_guardcc(as, CC_NE); 1383 asm_guardcc(as, CC_NE);
1229 emit_i8(as, 1); 1384 emit_i8(as, 1);
1230 emit_rmro(as, XO_ARITHib, XOg_CMP, uv, offsetof(GCupval, closed)); 1385 emit_rmro(as, XO_ARITHib, XOg_CMP, uv, offsetof(GCupval, closed));
1231 } else { 1386 } else {
1232 emit_rmro(as, XO_MOV, dest, uv, offsetof(GCupval, v)); 1387 emit_rmro(as, XO_MOV, dest|REX_GC64, uv, offsetof(GCupval, v));
1233 } 1388 }
1234 emit_rmro(as, XO_MOV, uv, func, 1389 emit_rmro(as, XO_MOV, uv|REX_GC64, func,
1235 (int32_t)offsetof(GCfuncL, uvptr) + 4*(int32_t)(ir->op2 >> 8)); 1390 (int32_t)offsetof(GCfuncL, uvptr) +
1391 (int32_t)sizeof(MRef) * (int32_t)(ir->op2 >> 8));
1236 } 1392 }
1237} 1393}
1238 1394
@@ -1250,9 +1406,9 @@ static void asm_strref(ASMState *as, IRIns *ir)
1250 if (as->mrm.base == RID_NONE) 1406 if (as->mrm.base == RID_NONE)
1251 emit_loadi(as, dest, as->mrm.ofs); 1407 emit_loadi(as, dest, as->mrm.ofs);
1252 else if (as->mrm.base == dest && as->mrm.idx == RID_NONE) 1408 else if (as->mrm.base == dest && as->mrm.idx == RID_NONE)
1253 emit_gri(as, XG_ARITHi(XOg_ADD), dest, as->mrm.ofs); 1409 emit_gri(as, XG_ARITHi(XOg_ADD), dest|REX_GC64, as->mrm.ofs);
1254 else 1410 else
1255 emit_mrm(as, XO_LEA, dest, RID_MRM); 1411 emit_mrm(as, XO_LEA, dest|REX_GC64, RID_MRM);
1256} 1412}
1257 1413
1258/* -- Loads and stores ---------------------------------------------------- */ 1414/* -- Loads and stores ---------------------------------------------------- */
@@ -1271,19 +1427,23 @@ static void asm_fxload(ASMState *as, IRIns *ir)
1271 case IRT_U8: xo = XO_MOVZXb; break; 1427 case IRT_U8: xo = XO_MOVZXb; break;
1272 case IRT_I16: xo = XO_MOVSXw; break; 1428 case IRT_I16: xo = XO_MOVSXw; break;
1273 case IRT_U16: xo = XO_MOVZXw; break; 1429 case IRT_U16: xo = XO_MOVZXw; break;
1274 case IRT_NUM: xo = XMM_MOVRM(as); break; 1430 case IRT_NUM: xo = XO_MOVSD; break;
1275 case IRT_FLOAT: xo = XO_MOVSS; break; 1431 case IRT_FLOAT: xo = XO_MOVSS; break;
1276 default: 1432 default:
1277 if (LJ_64 && irt_is64(ir->t)) 1433 if (LJ_64 && irt_is64(ir->t))
1278 dest |= REX_64; 1434 dest |= REX_64;
1279 else 1435 else
1280 lua_assert(irt_isint(ir->t) || irt_isu32(ir->t) || irt_isaddr(ir->t)); 1436 lj_assertA(irt_isint(ir->t) || irt_isu32(ir->t) || irt_isaddr(ir->t),
1437 "unsplit 64 bit load");
1281 xo = XO_MOV; 1438 xo = XO_MOV;
1282 break; 1439 break;
1283 } 1440 }
1284 emit_mrm(as, xo, dest, RID_MRM); 1441 emit_mrm(as, xo, dest, RID_MRM);
1285} 1442}
1286 1443
1444#define asm_fload(as, ir) asm_fxload(as, ir)
1445#define asm_xload(as, ir) asm_fxload(as, ir)
1446
1287static void asm_fxstore(ASMState *as, IRIns *ir) 1447static void asm_fxstore(ASMState *as, IRIns *ir)
1288{ 1448{
1289 RegSet allow = RSET_GPR; 1449 RegSet allow = RSET_GPR;
@@ -1318,14 +1478,17 @@ static void asm_fxstore(ASMState *as, IRIns *ir)
1318 case IRT_I16: case IRT_U16: xo = XO_MOVtow; break; 1478 case IRT_I16: case IRT_U16: xo = XO_MOVtow; break;
1319 case IRT_NUM: xo = XO_MOVSDto; break; 1479 case IRT_NUM: xo = XO_MOVSDto; break;
1320 case IRT_FLOAT: xo = XO_MOVSSto; break; 1480 case IRT_FLOAT: xo = XO_MOVSSto; break;
1321#if LJ_64 1481#if LJ_64 && !LJ_GC64
1322 case IRT_LIGHTUD: lua_assert(0); /* NYI: mask 64 bit lightuserdata. */ 1482 case IRT_LIGHTUD:
1483 /* NYI: mask 64 bit lightuserdata. */
1484 lj_assertA(0, "store of lightuserdata");
1323#endif 1485#endif
1324 default: 1486 default:
1325 if (LJ_64 && irt_is64(ir->t)) 1487 if (LJ_64 && irt_is64(ir->t))
1326 src |= REX_64; 1488 src |= REX_64;
1327 else 1489 else
1328 lua_assert(irt_isint(ir->t) || irt_isu32(ir->t) || irt_isaddr(ir->t)); 1490 lj_assertA(irt_isint(ir->t) || irt_isu32(ir->t) || irt_isaddr(ir->t),
1491 "unsplit 64 bit store");
1329 xo = XO_MOVto; 1492 xo = XO_MOVto;
1330 break; 1493 break;
1331 } 1494 }
@@ -1339,15 +1502,18 @@ static void asm_fxstore(ASMState *as, IRIns *ir)
1339 emit_i8(as, k); 1502 emit_i8(as, k);
1340 emit_mrm(as, XO_MOVmib, 0, RID_MRM); 1503 emit_mrm(as, XO_MOVmib, 0, RID_MRM);
1341 } else { 1504 } else {
1342 lua_assert(irt_is64(ir->t) || irt_isint(ir->t) || irt_isu32(ir->t) || 1505 lj_assertA(irt_is64(ir->t) || irt_isint(ir->t) || irt_isu32(ir->t) ||
1343 irt_isaddr(ir->t)); 1506 irt_isaddr(ir->t), "bad store type");
1344 emit_i32(as, k); 1507 emit_i32(as, k);
1345 emit_mrm(as, XO_MOVmi, REX_64IR(ir, 0), RID_MRM); 1508 emit_mrm(as, XO_MOVmi, REX_64IR(ir, 0), RID_MRM);
1346 } 1509 }
1347 } 1510 }
1348} 1511}
1349 1512
1350#if LJ_64 1513#define asm_fstore(as, ir) asm_fxstore(as, ir)
1514#define asm_xstore(as, ir) asm_fxstore(as, ir)
1515
1516#if LJ_64 && !LJ_GC64
1351static Reg asm_load_lightud64(ASMState *as, IRIns *ir, int typecheck) 1517static Reg asm_load_lightud64(ASMState *as, IRIns *ir, int typecheck)
1352{ 1518{
1353 if (ra_used(ir) || typecheck) { 1519 if (ra_used(ir) || typecheck) {
@@ -1369,13 +1535,18 @@ static Reg asm_load_lightud64(ASMState *as, IRIns *ir, int typecheck)
1369 1535
1370static void asm_ahuvload(ASMState *as, IRIns *ir) 1536static void asm_ahuvload(ASMState *as, IRIns *ir)
1371{ 1537{
1372 lua_assert(irt_isnum(ir->t) || irt_ispri(ir->t) || irt_isaddr(ir->t) || 1538#if LJ_GC64
1373 (LJ_DUALNUM && irt_isint(ir->t))); 1539 Reg tmp = RID_NONE;
1374#if LJ_64 1540#endif
1541 lj_assertA(irt_isnum(ir->t) || irt_ispri(ir->t) || irt_isaddr(ir->t) ||
1542 (LJ_DUALNUM && irt_isint(ir->t)),
1543 "bad load type %d", irt_type(ir->t));
1544#if LJ_64 && !LJ_GC64
1375 if (irt_islightud(ir->t)) { 1545 if (irt_islightud(ir->t)) {
1376 Reg dest = asm_load_lightud64(as, ir, 1); 1546 Reg dest = asm_load_lightud64(as, ir, 1);
1377 if (ra_hasreg(dest)) { 1547 if (ra_hasreg(dest)) {
1378 asm_fuseahuref(as, ir->op1, RSET_GPR); 1548 asm_fuseahuref(as, ir->op1, RSET_GPR);
1549 if (ir->o == IR_VLOAD) as->mrm.ofs += 8 * ir->op2;
1379 emit_mrm(as, XO_MOV, dest|REX_64, RID_MRM); 1550 emit_mrm(as, XO_MOV, dest|REX_64, RID_MRM);
1380 } 1551 }
1381 return; 1552 return;
@@ -1385,20 +1556,67 @@ static void asm_ahuvload(ASMState *as, IRIns *ir)
1385 RegSet allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR; 1556 RegSet allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR;
1386 Reg dest = ra_dest(as, ir, allow); 1557 Reg dest = ra_dest(as, ir, allow);
1387 asm_fuseahuref(as, ir->op1, RSET_GPR); 1558 asm_fuseahuref(as, ir->op1, RSET_GPR);
1388 emit_mrm(as, dest < RID_MAX_GPR ? XO_MOV : XMM_MOVRM(as), dest, RID_MRM); 1559 if (ir->o == IR_VLOAD) as->mrm.ofs += 8 * ir->op2;
1560#if LJ_GC64
1561 if (irt_isaddr(ir->t)) {
1562 emit_shifti(as, XOg_SHR|REX_64, dest, 17);
1563 asm_guardcc(as, CC_NE);
1564 emit_i8(as, irt_toitype(ir->t));
1565 emit_rr(as, XO_ARITHi8, XOg_CMP, dest);
1566 emit_i8(as, XI_O16);
1567 if ((as->flags & JIT_F_BMI2)) {
1568 emit_i8(as, 47);
1569 emit_mrm(as, XV_RORX|VEX_64, dest, RID_MRM);
1570 } else {
1571 emit_shifti(as, XOg_ROR|REX_64, dest, 47);
1572 emit_mrm(as, XO_MOV, dest|REX_64, RID_MRM);
1573 }
1574 return;
1575 } else
1576#endif
1577 emit_mrm(as, dest < RID_MAX_GPR ? XO_MOV : XO_MOVSD, dest, RID_MRM);
1389 } else { 1578 } else {
1390 asm_fuseahuref(as, ir->op1, RSET_GPR); 1579 RegSet gpr = RSET_GPR;
1580#if LJ_GC64
1581 if (irt_isaddr(ir->t)) {
1582 tmp = ra_scratch(as, RSET_GPR);
1583 gpr = rset_exclude(gpr, tmp);
1584 }
1585#endif
1586 asm_fuseahuref(as, ir->op1, gpr);
1587 if (ir->o == IR_VLOAD) as->mrm.ofs += 8 * ir->op2;
1391 } 1588 }
1392 /* Always do the type check, even if the load result is unused. */ 1589 /* Always do the type check, even if the load result is unused. */
1393 as->mrm.ofs += 4; 1590 as->mrm.ofs += 4;
1394 asm_guardcc(as, irt_isnum(ir->t) ? CC_AE : CC_NE); 1591 asm_guardcc(as, irt_isnum(ir->t) ? CC_AE : CC_NE);
1395 if (LJ_64 && irt_type(ir->t) >= IRT_NUM) { 1592 if (LJ_64 && irt_type(ir->t) >= IRT_NUM) {
1396 lua_assert(irt_isinteger(ir->t) || irt_isnum(ir->t)); 1593 lj_assertA(irt_isinteger(ir->t) || irt_isnum(ir->t),
1594 "bad load type %d", irt_type(ir->t));
1595#if LJ_GC64
1596 emit_u32(as, LJ_TISNUM << 15);
1597#else
1397 emit_u32(as, LJ_TISNUM); 1598 emit_u32(as, LJ_TISNUM);
1599#endif
1600 emit_mrm(as, XO_ARITHi, XOg_CMP, RID_MRM);
1601#if LJ_GC64
1602 } else if (irt_isaddr(ir->t)) {
1603 as->mrm.ofs -= 4;
1604 emit_i8(as, irt_toitype(ir->t));
1605 emit_mrm(as, XO_ARITHi8, XOg_CMP, tmp);
1606 emit_shifti(as, XOg_SAR|REX_64, tmp, 47);
1607 emit_mrm(as, XO_MOV, tmp|REX_64, RID_MRM);
1608 } else if (irt_isnil(ir->t)) {
1609 as->mrm.ofs -= 4;
1610 emit_i8(as, -1);
1611 emit_mrm(as, XO_ARITHi8, XOg_CMP|REX_64, RID_MRM);
1612 } else {
1613 emit_u32(as, (irt_toitype(ir->t) << 15) | 0x7fff);
1398 emit_mrm(as, XO_ARITHi, XOg_CMP, RID_MRM); 1614 emit_mrm(as, XO_ARITHi, XOg_CMP, RID_MRM);
1615#else
1399 } else { 1616 } else {
1400 emit_i8(as, irt_toitype(ir->t)); 1617 emit_i8(as, irt_toitype(ir->t));
1401 emit_mrm(as, XO_ARITHi8, XOg_CMP, RID_MRM); 1618 emit_mrm(as, XO_ARITHi8, XOg_CMP, RID_MRM);
1619#endif
1402 } 1620 }
1403} 1621}
1404 1622
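The new LJ_GC64 branch above decodes a single 64-bit tagged value: per this change the type tag occupies the upper 17 bits and the payload the lower 47, so ror r64, 47 rotates the tag into the low bits for the 16-bit compare against itype, and shr r64, 17 afterwards leaves the untagged pointer. A hedged C equivalent of that decode; the helper names and the TAG_SHIFT constant are illustrative, not LuaJIT API:

#include <stdint.h>

#define TAG_SHIFT 47   /* payload in bits 0..46, type tag in bits 47..63 */

static inline uint32_t tv_itype(uint64_t tv)
{
  /* What "ror r64, 47; cmp r16, itype" inspects: the tag in the low bits. */
  return (uint32_t)(tv >> TAG_SHIFT);
}

static inline uint64_t tv_payload(uint64_t tv)
{
  /* "ror r64, 47; shr r64, 17" is equivalent to masking off the tag bits. */
  return tv & (((uint64_t)1 << TAG_SHIFT) - 1);
}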
@@ -1410,12 +1628,28 @@ static void asm_ahustore(ASMState *as, IRIns *ir)
1410 Reg src = ra_alloc1(as, ir->op2, RSET_FPR); 1628 Reg src = ra_alloc1(as, ir->op2, RSET_FPR);
1411 asm_fuseahuref(as, ir->op1, RSET_GPR); 1629 asm_fuseahuref(as, ir->op1, RSET_GPR);
1412 emit_mrm(as, XO_MOVSDto, src, RID_MRM); 1630 emit_mrm(as, XO_MOVSDto, src, RID_MRM);
1413#if LJ_64 1631#if LJ_64 && !LJ_GC64
1414 } else if (irt_islightud(ir->t)) { 1632 } else if (irt_islightud(ir->t)) {
1415 Reg src = ra_alloc1(as, ir->op2, RSET_GPR); 1633 Reg src = ra_alloc1(as, ir->op2, RSET_GPR);
1416 asm_fuseahuref(as, ir->op1, rset_exclude(RSET_GPR, src)); 1634 asm_fuseahuref(as, ir->op1, rset_exclude(RSET_GPR, src));
1417 emit_mrm(as, XO_MOVto, src|REX_64, RID_MRM); 1635 emit_mrm(as, XO_MOVto, src|REX_64, RID_MRM);
1418#endif 1636#endif
1637#if LJ_GC64
1638 } else if (irref_isk(ir->op2)) {
1639 TValue k;
1640 lj_ir_kvalue(as->J->L, &k, IR(ir->op2));
1641 asm_fuseahuref(as, ir->op1, RSET_GPR);
1642 if (tvisnil(&k)) {
1643 emit_i32(as, -1);
1644 emit_mrm(as, XO_MOVmi, REX_64, RID_MRM);
1645 } else {
1646 emit_u32(as, k.u32.lo);
1647 emit_mrm(as, XO_MOVmi, 0, RID_MRM);
1648 as->mrm.ofs += 4;
1649 emit_u32(as, k.u32.hi);
1650 emit_mrm(as, XO_MOVmi, 0, RID_MRM);
1651 }
1652#endif
1419 } else { 1653 } else {
1420 IRIns *irr = IR(ir->op2); 1654 IRIns *irr = IR(ir->op2);
1421 RegSet allow = RSET_GPR; 1655 RegSet allow = RSET_GPR;
@@ -1426,34 +1660,56 @@ static void asm_ahustore(ASMState *as, IRIns *ir)
1426 } 1660 }
1427 asm_fuseahuref(as, ir->op1, allow); 1661 asm_fuseahuref(as, ir->op1, allow);
1428 if (ra_hasreg(src)) { 1662 if (ra_hasreg(src)) {
1663#if LJ_GC64
1664 if (!(LJ_DUALNUM && irt_isinteger(ir->t))) {
1665 /* TODO: 64 bit store + 32 bit load-modify-store is suboptimal. */
1666 as->mrm.ofs += 4;
1667 emit_u32(as, irt_toitype(ir->t) << 15);
1668 emit_mrm(as, XO_ARITHi, XOg_OR, RID_MRM);
1669 as->mrm.ofs -= 4;
1670 emit_mrm(as, XO_MOVto, src|REX_64, RID_MRM);
1671 return;
1672 }
1673#endif
1429 emit_mrm(as, XO_MOVto, src, RID_MRM); 1674 emit_mrm(as, XO_MOVto, src, RID_MRM);
1430 } else if (!irt_ispri(irr->t)) { 1675 } else if (!irt_ispri(irr->t)) {
1431 lua_assert(irt_isaddr(ir->t) || (LJ_DUALNUM && irt_isinteger(ir->t))); 1676 lj_assertA(irt_isaddr(ir->t) || (LJ_DUALNUM && irt_isinteger(ir->t)),
1677 "bad store type");
1432 emit_i32(as, irr->i); 1678 emit_i32(as, irr->i);
1433 emit_mrm(as, XO_MOVmi, 0, RID_MRM); 1679 emit_mrm(as, XO_MOVmi, 0, RID_MRM);
1434 } 1680 }
1435 as->mrm.ofs += 4; 1681 as->mrm.ofs += 4;
1682#if LJ_GC64
1683 lj_assertA(LJ_DUALNUM && irt_isinteger(ir->t), "bad store type");
1684 emit_i32(as, LJ_TNUMX << 15);
1685#else
1436 emit_i32(as, (int32_t)irt_toitype(ir->t)); 1686 emit_i32(as, (int32_t)irt_toitype(ir->t));
1687#endif
1437 emit_mrm(as, XO_MOVmi, 0, RID_MRM); 1688 emit_mrm(as, XO_MOVmi, 0, RID_MRM);
1438 } 1689 }
1439} 1690}
1440 1691
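The GC64 store path above builds the same layout in reverse: it stores the 64-bit payload, then ORs the type tag into the upper 32-bit word as itype << 15, which lands on the same bits as itype << 47 in the full 64-bit value (15 + 32 = 47). A small sketch of that composition; the function and its arguments are illustrative only:

#include <stdint.h>

/* Compose a tagged 64-bit slot the way the ahustore path does:
** payload first, then OR the tag into the high dword. */
static uint64_t tv_compose(uint64_t payload, uint32_t itype)
{
  uint32_t lo = (uint32_t)payload;
  uint32_t hi = (uint32_t)(payload >> 32);
  hi |= itype << 15;              /* same bits as |= (uint64_t)itype << 47 */
  return ((uint64_t)hi << 32) | lo;
}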
1441static void asm_sload(ASMState *as, IRIns *ir) 1692static void asm_sload(ASMState *as, IRIns *ir)
1442{ 1693{
1443 int32_t ofs = 8*((int32_t)ir->op1-1) + ((ir->op2 & IRSLOAD_FRAME) ? 4 : 0); 1694 int32_t ofs = 8*((int32_t)ir->op1-1-LJ_FR2) +
1695 (!LJ_FR2 && (ir->op2 & IRSLOAD_FRAME) ? 4 : 0);
1444 IRType1 t = ir->t; 1696 IRType1 t = ir->t;
1445 Reg base; 1697 Reg base;
1446 lua_assert(!(ir->op2 & IRSLOAD_PARENT)); /* Handled by asm_head_side(). */ 1698 lj_assertA(!(ir->op2 & IRSLOAD_PARENT),
1447 lua_assert(irt_isguard(t) || !(ir->op2 & IRSLOAD_TYPECHECK)); 1699 "bad parent SLOAD"); /* Handled by asm_head_side(). */
1448 lua_assert(LJ_DUALNUM || 1700 lj_assertA(irt_isguard(t) || !(ir->op2 & IRSLOAD_TYPECHECK),
1449 !irt_isint(t) || (ir->op2 & (IRSLOAD_CONVERT|IRSLOAD_FRAME))); 1701 "inconsistent SLOAD variant");
1702 lj_assertA(LJ_DUALNUM ||
1703 !irt_isint(t) ||
1704 (ir->op2 & (IRSLOAD_CONVERT|IRSLOAD_FRAME|IRSLOAD_KEYINDEX)),
1705 "bad SLOAD type");
1450 if ((ir->op2 & IRSLOAD_CONVERT) && irt_isguard(t) && irt_isint(t)) { 1706 if ((ir->op2 & IRSLOAD_CONVERT) && irt_isguard(t) && irt_isint(t)) {
1451 Reg left = ra_scratch(as, RSET_FPR); 1707 Reg left = ra_scratch(as, RSET_FPR);
1452 asm_tointg(as, ir, left); /* Frees dest reg. Do this before base alloc. */ 1708 asm_tointg(as, ir, left); /* Frees dest reg. Do this before base alloc. */
1453 base = ra_alloc1(as, REF_BASE, RSET_GPR); 1709 base = ra_alloc1(as, REF_BASE, RSET_GPR);
1454 emit_rmro(as, XMM_MOVRM(as), left, base, ofs); 1710 emit_rmro(as, XO_MOVSD, left, base, ofs);
1455 t.irt = IRT_NUM; /* Continue with a regular number type check. */ 1711 t.irt = IRT_NUM; /* Continue with a regular number type check. */
1456#if LJ_64 1712#if LJ_64 && !LJ_GC64
1457 } else if (irt_islightud(t)) { 1713 } else if (irt_islightud(t)) {
1458 Reg dest = asm_load_lightud64(as, ir, (ir->op2 & IRSLOAD_TYPECHECK)); 1714 Reg dest = asm_load_lightud64(as, ir, (ir->op2 & IRSLOAD_TYPECHECK));
1459 if (ra_hasreg(dest)) { 1715 if (ra_hasreg(dest)) {
@@ -1466,14 +1722,43 @@ static void asm_sload(ASMState *as, IRIns *ir)
1466 RegSet allow = irt_isnum(t) ? RSET_FPR : RSET_GPR; 1722 RegSet allow = irt_isnum(t) ? RSET_FPR : RSET_GPR;
1467 Reg dest = ra_dest(as, ir, allow); 1723 Reg dest = ra_dest(as, ir, allow);
1468 base = ra_alloc1(as, REF_BASE, RSET_GPR); 1724 base = ra_alloc1(as, REF_BASE, RSET_GPR);
1469 lua_assert(irt_isnum(t) || irt_isint(t) || irt_isaddr(t)); 1725 lj_assertA(irt_isnum(t) || irt_isint(t) || irt_isaddr(t),
1726 "bad SLOAD type %d", irt_type(t));
1470 if ((ir->op2 & IRSLOAD_CONVERT)) { 1727 if ((ir->op2 & IRSLOAD_CONVERT)) {
1471 t.irt = irt_isint(t) ? IRT_NUM : IRT_INT; /* Check for original type. */ 1728 t.irt = irt_isint(t) ? IRT_NUM : IRT_INT; /* Check for original type. */
1472 emit_rmro(as, irt_isint(t) ? XO_CVTSI2SD : XO_CVTSD2SI, dest, base, ofs); 1729 emit_rmro(as, irt_isint(t) ? XO_CVTSI2SD : XO_CVTTSD2SI, dest, base, ofs);
1473 } else if (irt_isnum(t)) {
1474 emit_rmro(as, XMM_MOVRM(as), dest, base, ofs);
1475 } else { 1730 } else {
1476 emit_rmro(as, XO_MOV, dest, base, ofs); 1731#if LJ_GC64
1732 if (irt_isaddr(t)) {
1733 /* LJ_GC64 type check + tag removal without BMI2 and with BMI2:
1734 **
1735 ** mov r64, [addr] rorx r64, [addr], 47
1736 ** ror r64, 47
1737 ** cmp r16, itype cmp r16, itype
1738 ** jne ->exit jne ->exit
1739 ** shr r64, 16 shr r64, 16
1740 */
1741 emit_shifti(as, XOg_SHR|REX_64, dest, 17);
1742 if ((ir->op2 & IRSLOAD_TYPECHECK)) {
1743 asm_guardcc(as, CC_NE);
1744 emit_i8(as, irt_toitype(t));
1745 emit_rr(as, XO_ARITHi8, XOg_CMP, dest);
1746 emit_i8(as, XI_O16);
1747 }
1748 if ((as->flags & JIT_F_BMI2)) {
1749 emit_i8(as, 47);
1750 emit_rmro(as, XV_RORX|VEX_64, dest, base, ofs);
1751 } else {
1752 if ((ir->op2 & IRSLOAD_TYPECHECK))
1753 emit_shifti(as, XOg_ROR|REX_64, dest, 47);
1754 else
1755 emit_shifti(as, XOg_SHL|REX_64, dest, 17);
1756 emit_rmro(as, XO_MOV, dest|REX_64, base, ofs);
1757 }
1758 return;
1759 } else
1760#endif
1761 emit_rmro(as, irt_isnum(t) ? XO_MOVSD : XO_MOV, dest, base, ofs);
1477 } 1762 }
1478 } else { 1763 } else {
1479 if (!(ir->op2 & IRSLOAD_TYPECHECK)) 1764 if (!(ir->op2 & IRSLOAD_TYPECHECK))
@@ -1484,12 +1769,44 @@ static void asm_sload(ASMState *as, IRIns *ir)
1484 /* Need type check, even if the load result is unused. */ 1769 /* Need type check, even if the load result is unused. */
1485 asm_guardcc(as, irt_isnum(t) ? CC_AE : CC_NE); 1770 asm_guardcc(as, irt_isnum(t) ? CC_AE : CC_NE);
1486 if (LJ_64 && irt_type(t) >= IRT_NUM) { 1771 if (LJ_64 && irt_type(t) >= IRT_NUM) {
1487 lua_assert(irt_isinteger(t) || irt_isnum(t)); 1772 lj_assertA(irt_isinteger(t) || irt_isnum(t),
1773 "bad SLOAD type %d", irt_type(t));
1774#if LJ_GC64
1775 emit_u32(as, LJ_TISNUM << 15);
1776#else
1488 emit_u32(as, LJ_TISNUM); 1777 emit_u32(as, LJ_TISNUM);
1778#endif
1489 emit_rmro(as, XO_ARITHi, XOg_CMP, base, ofs+4); 1779 emit_rmro(as, XO_ARITHi, XOg_CMP, base, ofs+4);
1780#if LJ_GC64
1781 } else if (irt_isnil(t)) {
1782 /* LJ_GC64 type check for nil:
1783 **
1784 ** cmp qword [addr], -1
1785 ** jne ->exit
1786 */
1787 emit_i8(as, -1);
1788 emit_rmro(as, XO_ARITHi8, XOg_CMP|REX_64, base, ofs);
1789 } else if (irt_ispri(t)) {
1790 emit_u32(as, (irt_toitype(t) << 15) | 0x7fff);
1791 emit_rmro(as, XO_ARITHi, XOg_CMP, base, ofs+4);
1792 } else {
1793 /* LJ_GC64 type check only:
1794 **
1795 ** mov r64, [addr]
1796 ** sar r64, 47
1797 ** cmp r32, itype
1798 ** jne ->exit
1799 */
1800 Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, base));
1801 emit_i8(as, irt_toitype(t));
1802 emit_rr(as, XO_ARITHi8, XOg_CMP, tmp);
1803 emit_shifti(as, XOg_SAR|REX_64, tmp, 47);
1804 emit_rmro(as, XO_MOV, tmp|REX_64, base, ofs);
1805#else
1490 } else { 1806 } else {
1491 emit_i8(as, irt_toitype(t)); 1807 emit_i8(as, irt_toitype(t));
1492 emit_rmro(as, XO_ARITHi8, XOg_CMP, base, ofs+4); 1808 emit_rmro(as, XO_ARITHi8, XOg_CMP, base, ofs+4);
1809#endif
1493 } 1810 }
1494 } 1811 }
1495} 1812}
@@ -1500,15 +1817,14 @@ static void asm_sload(ASMState *as, IRIns *ir)
1500static void asm_cnew(ASMState *as, IRIns *ir) 1817static void asm_cnew(ASMState *as, IRIns *ir)
1501{ 1818{
1502 CTState *cts = ctype_ctsG(J2G(as->J)); 1819 CTState *cts = ctype_ctsG(J2G(as->J));
1503 CTypeID ctypeid = (CTypeID)IR(ir->op1)->i; 1820 CTypeID id = (CTypeID)IR(ir->op1)->i;
1504 CTSize sz = (ir->o == IR_CNEWI || ir->op2 == REF_NIL) ? 1821 CTSize sz;
1505 lj_ctype_size(cts, ctypeid) : (CTSize)IR(ir->op2)->i; 1822 CTInfo info = lj_ctype_info(cts, id, &sz);
1506 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_mem_newgco]; 1823 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_mem_newgco];
1507 IRRef args[2]; 1824 IRRef args[4];
1508 lua_assert(sz != CTSIZE_INVALID); 1825 lj_assertA(sz != CTSIZE_INVALID || (ir->o == IR_CNEW && ir->op2 != REF_NIL),
1826 "bad CNEW/CNEWI operands");
1509 1827
1510 args[0] = ASMREF_L; /* lua_State *L */
1511 args[1] = ASMREF_TMP1; /* MSize size */
1512 as->gcsteps++; 1828 as->gcsteps++;
1513 asm_setupresult(as, ir, ci); /* GCcdata * */ 1829 asm_setupresult(as, ir, ci); /* GCcdata * */
1514 1830
@@ -1519,8 +1835,9 @@ static void asm_cnew(ASMState *as, IRIns *ir)
1519 Reg r64 = sz == 8 ? REX_64 : 0; 1835 Reg r64 = sz == 8 ? REX_64 : 0;
1520 if (irref_isk(ir->op2)) { 1836 if (irref_isk(ir->op2)) {
1521 IRIns *irk = IR(ir->op2); 1837 IRIns *irk = IR(ir->op2);
1522 uint64_t k = irk->o == IR_KINT64 ? ir_k64(irk)->u64 : 1838 uint64_t k = (irk->o == IR_KINT64 ||
1523 (uint64_t)(uint32_t)irk->i; 1839 (LJ_GC64 && (irk->o == IR_KPTR || irk->o == IR_KKPTR))) ?
1840 ir_k64(irk)->u64 : (uint64_t)(uint32_t)irk->i;
1524 if (sz == 4 || checki32((int64_t)k)) { 1841 if (sz == 4 || checki32((int64_t)k)) {
1525 emit_i32(as, (int32_t)k); 1842 emit_i32(as, (int32_t)k);
1526 emit_rmro(as, XO_MOVmi, r64, RID_RET, sizeof(GCcdata)); 1843 emit_rmro(as, XO_MOVmi, r64, RID_RET, sizeof(GCcdata));
@@ -1536,7 +1853,7 @@ static void asm_cnew(ASMState *as, IRIns *ir)
1536 int32_t ofs = sizeof(GCcdata); 1853 int32_t ofs = sizeof(GCcdata);
1537 if (sz == 8) { 1854 if (sz == 8) {
1538 ofs += 4; ir++; 1855 ofs += 4; ir++;
1539 lua_assert(ir->o == IR_HIOP); 1856 lj_assertA(ir->o == IR_HIOP, "missing CNEWI HIOP");
1540 } 1857 }
1541 do { 1858 do {
1542 if (irref_isk(ir->op2)) { 1859 if (irref_isk(ir->op2)) {
@@ -1550,21 +1867,30 @@ static void asm_cnew(ASMState *as, IRIns *ir)
1550 ofs -= 4; ir--; 1867 ofs -= 4; ir--;
1551 } while (1); 1868 } while (1);
1552#endif 1869#endif
1553 lua_assert(sz == 4 || sz == 8); 1870 lj_assertA(sz == 4 || sz == 8, "bad CNEWI size %d", sz);
1871 } else if (ir->op2 != REF_NIL) { /* Create VLA/VLS/aligned cdata. */
1872 ci = &lj_ir_callinfo[IRCALL_lj_cdata_newv];
1873 args[0] = ASMREF_L; /* lua_State *L */
1874 args[1] = ir->op1; /* CTypeID id */
1875 args[2] = ir->op2; /* CTSize sz */
1876 args[3] = ASMREF_TMP1; /* CTSize align */
1877 asm_gencall(as, ci, args);
1878 emit_loadi(as, ra_releasetmp(as, ASMREF_TMP1), (int32_t)ctype_align(info));
1879 return;
1554 } 1880 }
1555 1881
1556 /* Combine initialization of marked, gct and ctypeid. */ 1882 /* Combine initialization of marked, gct and ctypeid. */
1557 emit_movtomro(as, RID_ECX, RID_RET, offsetof(GCcdata, marked)); 1883 emit_movtomro(as, RID_ECX, RID_RET, offsetof(GCcdata, marked));
1558 emit_gri(as, XG_ARITHi(XOg_OR), RID_ECX, 1884 emit_gri(as, XG_ARITHi(XOg_OR), RID_ECX,
1559 (int32_t)((~LJ_TCDATA<<8)+(ctypeid<<16))); 1885 (int32_t)((~LJ_TCDATA<<8)+(id<<16)));
1560 emit_gri(as, XG_ARITHi(XOg_AND), RID_ECX, LJ_GC_WHITES); 1886 emit_gri(as, XG_ARITHi(XOg_AND), RID_ECX, LJ_GC_WHITES);
1561 emit_opgl(as, XO_MOVZXb, RID_ECX, gc.currentwhite); 1887 emit_opgl(as, XO_MOVZXb, RID_ECX, gc.currentwhite);
1562 1888
1889 args[0] = ASMREF_L; /* lua_State *L */
1890 args[1] = ASMREF_TMP1; /* MSize size */
1563 asm_gencall(as, ci, args); 1891 asm_gencall(as, ci, args);
1564 emit_loadi(as, ra_releasetmp(as, ASMREF_TMP1), (int32_t)(sz+sizeof(GCcdata))); 1892 emit_loadi(as, ra_releasetmp(as, ASMREF_TMP1), (int32_t)(sz+sizeof(GCcdata)));
1565} 1893}
1566#else
1567#define asm_cnew(as, ir) ((void)0)
1568#endif 1894#endif
1569 1895
1570/* -- Write barriers ------------------------------------------------------ */ 1896/* -- Write barriers ------------------------------------------------------ */
@@ -1574,7 +1900,7 @@ static void asm_tbar(ASMState *as, IRIns *ir)
1574 Reg tab = ra_alloc1(as, ir->op1, RSET_GPR); 1900 Reg tab = ra_alloc1(as, ir->op1, RSET_GPR);
1575 Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, tab)); 1901 Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, tab));
1576 MCLabel l_end = emit_label(as); 1902 MCLabel l_end = emit_label(as);
1577 emit_movtomro(as, tmp, tab, offsetof(GCtab, gclist)); 1903 emit_movtomro(as, tmp|REX_GC64, tab, offsetof(GCtab, gclist));
1578 emit_setgl(as, tab, gc.grayagain); 1904 emit_setgl(as, tab, gc.grayagain);
1579 emit_getgl(as, tmp, gc.grayagain); 1905 emit_getgl(as, tmp, gc.grayagain);
1580 emit_i8(as, ~LJ_GC_BLACK); 1906 emit_i8(as, ~LJ_GC_BLACK);
@@ -1591,7 +1917,7 @@ static void asm_obar(ASMState *as, IRIns *ir)
1591 MCLabel l_end; 1917 MCLabel l_end;
1592 Reg obj; 1918 Reg obj;
1593 /* No need for other object barriers (yet). */ 1919 /* No need for other object barriers (yet). */
1594 lua_assert(IR(ir->op1)->o == IR_UREFC); 1920 lj_assertA(IR(ir->op1)->o == IR_UREFC, "bad OBAR type");
1595 ra_evictset(as, RSET_SCRATCH); 1921 ra_evictset(as, RSET_SCRATCH);
1596 l_end = emit_label(as); 1922 l_end = emit_label(as);
1597 args[0] = ASMREF_TMP1; /* global_State *g */ 1923 args[0] = ASMREF_TMP1; /* global_State *g */
@@ -1637,36 +1963,9 @@ static void asm_x87load(ASMState *as, IRRef ref)
1637 } 1963 }
1638} 1964}
1639 1965
1640/* Try to rejoin pow from EXP2, MUL and LOG2 (if still unsplit). */
1641static int fpmjoin_pow(ASMState *as, IRIns *ir)
1642{
1643 IRIns *irp = IR(ir->op1);
1644 if (irp == ir-1 && irp->o == IR_MUL && !ra_used(irp)) {
1645 IRIns *irpp = IR(irp->op1);
1646 if (irpp == ir-2 && irpp->o == IR_FPMATH &&
1647 irpp->op2 == IRFPM_LOG2 && !ra_used(irpp)) {
1648 /* The modified regs must match with the *.dasc implementation. */
1649 RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM2+1)|RID2RSET(RID_EAX);
1650 IRIns *irx;
1651 if (ra_hasreg(ir->r))
1652 rset_clear(drop, ir->r); /* Dest reg handled below. */
1653 ra_evictset(as, drop);
1654 ra_destreg(as, ir, RID_XMM0);
1655 emit_call(as, lj_vm_pow_sse);
1656 irx = IR(irpp->op1);
1657 if (ra_noreg(irx->r) && ra_gethint(irx->r) == RID_XMM1)
1658 irx->r = RID_INIT; /* Avoid allocating xmm1 for x. */
1659 ra_left(as, RID_XMM0, irpp->op1);
1660 ra_left(as, RID_XMM1, irp->op2);
1661 return 1;
1662 }
1663 }
1664 return 0;
1665}
1666
1667static void asm_fpmath(ASMState *as, IRIns *ir) 1966static void asm_fpmath(ASMState *as, IRIns *ir)
1668{ 1967{
1669 IRFPMathOp fpm = ir->o == IR_FPMATH ? (IRFPMathOp)ir->op2 : IRFPM_OTHER; 1968 IRFPMathOp fpm = (IRFPMathOp)ir->op2;
1670 if (fpm == IRFPM_SQRT) { 1969 if (fpm == IRFPM_SQRT) {
1671 Reg dest = ra_dest(as, ir, RSET_FPR); 1970 Reg dest = ra_dest(as, ir, RSET_FPR);
1672 Reg left = asm_fuseload(as, ir->op1, RSET_FPR); 1971 Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
@@ -1697,51 +1996,25 @@ static void asm_fpmath(ASMState *as, IRIns *ir)
1697 fpm == IRFPM_CEIL ? lj_vm_ceil_sse : lj_vm_trunc_sse); 1996 fpm == IRFPM_CEIL ? lj_vm_ceil_sse : lj_vm_trunc_sse);
1698 ra_left(as, RID_XMM0, ir->op1); 1997 ra_left(as, RID_XMM0, ir->op1);
1699 } 1998 }
1700 } else if (fpm == IRFPM_EXP2 && fpmjoin_pow(as, ir)) { 1999 } else {
1701 /* Rejoined to pow(). */ 2000 asm_callid(as, ir, IRCALL_lj_vm_floor + fpm);
1702 } else { /* Handle x87 ops. */ 2001 }
1703 int32_t ofs = sps_scale(ir->s); /* Use spill slot or temp slots. */ 2002}
1704 Reg dest = ir->r; 2003
1705 if (ra_hasreg(dest)) { 2004static void asm_ldexp(ASMState *as, IRIns *ir)
1706 ra_free(as, dest); 2005{
1707 ra_modified(as, dest); 2006 int32_t ofs = sps_scale(ir->s); /* Use spill slot or temp slots. */
1708 emit_rmro(as, XMM_MOVRM(as), dest, RID_ESP, ofs); 2007 Reg dest = ir->r;
1709 } 2008 if (ra_hasreg(dest)) {
1710 emit_rmro(as, XO_FSTPq, XOg_FSTPq, RID_ESP, ofs); 2009 ra_free(as, dest);
1711 switch (fpm) { /* st0 = lj_vm_*(st0) */ 2010 ra_modified(as, dest);
1712 case IRFPM_EXP: emit_call(as, lj_vm_exp_x87); break; 2011 emit_rmro(as, XO_MOVSD, dest, RID_ESP, ofs);
1713 case IRFPM_EXP2: emit_call(as, lj_vm_exp2_x87); break;
1714 case IRFPM_SIN: emit_x87op(as, XI_FSIN); break;
1715 case IRFPM_COS: emit_x87op(as, XI_FCOS); break;
1716 case IRFPM_TAN: emit_x87op(as, XI_FPOP); emit_x87op(as, XI_FPTAN); break;
1717 case IRFPM_LOG: case IRFPM_LOG2: case IRFPM_LOG10:
1718 /* Note: the use of fyl2xp1 would be pointless here. When computing
1719 ** log(1.0+eps) the precision is already lost after 1.0 is added.
1720 ** Subtracting 1.0 won't recover it. OTOH math.log1p would make sense.
1721 */
1722 emit_x87op(as, XI_FYL2X); break;
1723 case IRFPM_OTHER:
1724 switch (ir->o) {
1725 case IR_ATAN2:
1726 emit_x87op(as, XI_FPATAN); asm_x87load(as, ir->op2); break;
1727 case IR_LDEXP:
1728 emit_x87op(as, XI_FPOP1); emit_x87op(as, XI_FSCALE); break;
1729 default: lua_assert(0); break;
1730 }
1731 break;
1732 default: lua_assert(0); break;
1733 }
1734 asm_x87load(as, ir->op1);
1735 switch (fpm) {
1736 case IRFPM_LOG: emit_x87op(as, XI_FLDLN2); break;
1737 case IRFPM_LOG2: emit_x87op(as, XI_FLD1); break;
1738 case IRFPM_LOG10: emit_x87op(as, XI_FLDLG2); break;
1739 case IRFPM_OTHER:
1740 if (ir->o == IR_LDEXP) asm_x87load(as, ir->op2);
1741 break;
1742 default: break;
1743 }
1744 } 2012 }
2013 emit_rmro(as, XO_FSTPq, XOg_FSTPq, RID_ESP, ofs);
2014 emit_x87op(as, XI_FPOP1);
2015 emit_x87op(as, XI_FSCALE);
2016 asm_x87load(as, ir->op1);
2017 asm_x87load(as, ir->op2);
1745} 2018}
1746 2019
1747static void asm_fppowi(ASMState *as, IRIns *ir) 2020static void asm_fppowi(ASMState *as, IRIns *ir)
@@ -1757,33 +2030,11 @@ static void asm_fppowi(ASMState *as, IRIns *ir)
1757 ra_left(as, RID_EAX, ir->op2); 2030 ra_left(as, RID_EAX, ir->op2);
1758} 2031}
1759 2032
1760#if LJ_64 && LJ_HASFFI
1761static void asm_arith64(ASMState *as, IRIns *ir, IRCallID id)
1762{
1763 const CCallInfo *ci = &lj_ir_callinfo[id];
1764 IRRef args[2];
1765 args[0] = ir->op1;
1766 args[1] = ir->op2;
1767 asm_setupresult(as, ir, ci);
1768 asm_gencall(as, ci, args);
1769}
1770#endif
1771
1772static void asm_intmod(ASMState *as, IRIns *ir)
1773{
1774 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_vm_modi];
1775 IRRef args[2];
1776 args[0] = ir->op1;
1777 args[1] = ir->op2;
1778 asm_setupresult(as, ir, ci);
1779 asm_gencall(as, ci, args);
1780}
1781
1782static int asm_swapops(ASMState *as, IRIns *ir) 2033static int asm_swapops(ASMState *as, IRIns *ir)
1783{ 2034{
1784 IRIns *irl = IR(ir->op1); 2035 IRIns *irl = IR(ir->op1);
1785 IRIns *irr = IR(ir->op2); 2036 IRIns *irr = IR(ir->op2);
1786 lua_assert(ra_noreg(irr->r)); 2037 lj_assertA(ra_noreg(irr->r), "bad usage");
1787 if (!irm_iscomm(lj_ir_mode[ir->o])) 2038 if (!irm_iscomm(lj_ir_mode[ir->o]))
1788 return 0; /* Can't swap non-commutative operations. */ 2039 return 0; /* Can't swap non-commutative operations. */
1789 if (irref_isk(ir->op2)) 2040 if (irref_isk(ir->op2))
@@ -1955,11 +2206,28 @@ static void asm_add(ASMState *as, IRIns *ir)
1955{ 2206{
1956 if (irt_isnum(ir->t)) 2207 if (irt_isnum(ir->t))
1957 asm_fparith(as, ir, XO_ADDSD); 2208 asm_fparith(as, ir, XO_ADDSD);
1958 else if ((as->flags & JIT_F_LEA_AGU) || as->flagmcp == as->mcp || 2209 else if (as->flagmcp == as->mcp || irt_is64(ir->t) || !asm_lea(as, ir))
1959 irt_is64(ir->t) || !asm_lea(as, ir))
1960 asm_intarith(as, ir, XOg_ADD); 2210 asm_intarith(as, ir, XOg_ADD);
1961} 2211}
1962 2212
2213static void asm_sub(ASMState *as, IRIns *ir)
2214{
2215 if (irt_isnum(ir->t))
2216 asm_fparith(as, ir, XO_SUBSD);
2217 else /* Note: no need for LEA trick here. i-k is encoded as i+(-k). */
2218 asm_intarith(as, ir, XOg_SUB);
2219}
2220
2221static void asm_mul(ASMState *as, IRIns *ir)
2222{
2223 if (irt_isnum(ir->t))
2224 asm_fparith(as, ir, XO_MULSD);
2225 else
2226 asm_intarith(as, ir, XOg_X_IMUL);
2227}
2228
2229#define asm_fpdiv(as, ir) asm_fparith(as, ir, XO_DIVSD)
2230
1963static void asm_neg_not(ASMState *as, IRIns *ir, x86Group3 xg) 2231static void asm_neg_not(ASMState *as, IRIns *ir, x86Group3 xg)
1964{ 2232{
1965 Reg dest = ra_dest(as, ir, RSET_GPR); 2233 Reg dest = ra_dest(as, ir, RSET_GPR);
@@ -1967,7 +2235,17 @@ static void asm_neg_not(ASMState *as, IRIns *ir, x86Group3 xg)
1967 ra_left(as, dest, ir->op1); 2235 ra_left(as, dest, ir->op1);
1968} 2236}
1969 2237
1970static void asm_min_max(ASMState *as, IRIns *ir, int cc) 2238static void asm_neg(ASMState *as, IRIns *ir)
2239{
2240 if (irt_isnum(ir->t))
2241 asm_fparith(as, ir, XO_XORPS);
2242 else
2243 asm_neg_not(as, ir, XOg_NEG);
2244}
2245
2246#define asm_abs(as, ir) asm_fparith(as, ir, XO_ANDPS)
2247
2248static void asm_intmin_max(ASMState *as, IRIns *ir, int cc)
1971{ 2249{
1972 Reg right, dest = ra_dest(as, ir, RSET_GPR); 2250 Reg right, dest = ra_dest(as, ir, RSET_GPR);
1973 IRRef lref = ir->op1, rref = ir->op2; 2251 IRRef lref = ir->op1, rref = ir->op2;
@@ -1978,7 +2256,30 @@ static void asm_min_max(ASMState *as, IRIns *ir, int cc)
1978 ra_left(as, dest, lref); 2256 ra_left(as, dest, lref);
1979} 2257}
1980 2258
1981static void asm_bitswap(ASMState *as, IRIns *ir) 2259static void asm_min(ASMState *as, IRIns *ir)
2260{
2261 if (irt_isnum(ir->t))
2262 asm_fparith(as, ir, XO_MINSD);
2263 else
2264 asm_intmin_max(as, ir, CC_G);
2265}
2266
2267static void asm_max(ASMState *as, IRIns *ir)
2268{
2269 if (irt_isnum(ir->t))
2270 asm_fparith(as, ir, XO_MAXSD);
2271 else
2272 asm_intmin_max(as, ir, CC_L);
2273}
2274
2275/* Note: don't use LEA for overflow-checking arithmetic! */
2276#define asm_addov(as, ir) asm_intarith(as, ir, XOg_ADD)
2277#define asm_subov(as, ir) asm_intarith(as, ir, XOg_SUB)
2278#define asm_mulov(as, ir) asm_intarith(as, ir, XOg_X_IMUL)
2279
2280#define asm_bnot(as, ir) asm_neg_not(as, ir, XOg_NOT)
2281
2282static void asm_bswap(ASMState *as, IRIns *ir)
1982{ 2283{
1983 Reg dest = ra_dest(as, ir, RSET_GPR); 2284 Reg dest = ra_dest(as, ir, RSET_GPR);
1984 as->mcp = emit_op(XO_BSWAP + ((dest&7) << 24), 2285 as->mcp = emit_op(XO_BSWAP + ((dest&7) << 24),
@@ -1986,7 +2287,11 @@ static void asm_bitswap(ASMState *as, IRIns *ir)
1986 ra_left(as, dest, ir->op1); 2287 ra_left(as, dest, ir->op1);
1987} 2288}
1988 2289
1989static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs) 2290#define asm_band(as, ir) asm_intarith(as, ir, XOg_AND)
2291#define asm_bor(as, ir) asm_intarith(as, ir, XOg_OR)
2292#define asm_bxor(as, ir) asm_intarith(as, ir, XOg_XOR)
2293
2294static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs, x86Op xv)
1990{ 2295{
1991 IRRef rref = ir->op2; 2296 IRRef rref = ir->op2;
1992 IRIns *irr = IR(rref); 2297 IRIns *irr = IR(rref);
@@ -1995,17 +2300,33 @@ static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs)
1995 int shift; 2300 int shift;
1996 dest = ra_dest(as, ir, RSET_GPR); 2301 dest = ra_dest(as, ir, RSET_GPR);
1997 shift = irr->i & (irt_is64(ir->t) ? 63 : 31); 2302 shift = irr->i & (irt_is64(ir->t) ? 63 : 31);
2303 if (!xv && shift && (as->flags & JIT_F_BMI2)) {
2304 Reg left = asm_fuseloadm(as, ir->op1, RSET_GPR, irt_is64(ir->t));
2305 if (left != dest) { /* BMI2 rotate right by constant. */
2306 emit_i8(as, xs == XOg_ROL ? -shift : shift);
2307 emit_mrm(as, VEX_64IR(ir, XV_RORX), dest, left);
2308 return;
2309 }
2310 }
1998 switch (shift) { 2311 switch (shift) {
1999 case 0: break; 2312 case 0: break;
2000 case 1: emit_rr(as, XO_SHIFT1, REX_64IR(ir, xs), dest); break; 2313 case 1: emit_rr(as, XO_SHIFT1, REX_64IR(ir, xs), dest); break;
2001 default: emit_shifti(as, REX_64IR(ir, xs), dest, shift); break; 2314 default: emit_shifti(as, REX_64IR(ir, xs), dest, shift); break;
2002 } 2315 }
2316 } else if ((as->flags & JIT_F_BMI2) && xv) { /* BMI2 variable shifts. */
2317 Reg left, right;
2318 dest = ra_dest(as, ir, RSET_GPR);
2319 right = ra_alloc1(as, rref, RSET_GPR);
2320 left = asm_fuseloadm(as, ir->op1, rset_exclude(RSET_GPR, right),
2321 irt_is64(ir->t));
2322 emit_mrm(as, VEX_64IR(ir, xv) ^ (right << 19), dest, left);
2323 return;
2003 } else { /* Variable shifts implicitly use register cl (i.e. ecx). */ 2324 } else { /* Variable shifts implicitly use register cl (i.e. ecx). */
2004 Reg right; 2325 Reg right;
2005 dest = ra_dest(as, ir, rset_exclude(RSET_GPR, RID_ECX)); 2326 dest = ra_dest(as, ir, rset_exclude(RSET_GPR, RID_ECX));
2006 if (dest == RID_ECX) { 2327 if (dest == RID_ECX) {
2007 dest = ra_scratch(as, rset_exclude(RSET_GPR, RID_ECX)); 2328 dest = ra_scratch(as, rset_exclude(RSET_GPR, RID_ECX));
2008 emit_rr(as, XO_MOV, RID_ECX, dest); 2329 emit_rr(as, XO_MOV, REX_64IR(ir, RID_ECX), dest);
2009 } 2330 }
2010 right = irr->r; 2331 right = irr->r;
2011 if (ra_noreg(right)) 2332 if (ra_noreg(right))
@@ -2025,6 +2346,12 @@ static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs)
2025 */ 2346 */
2026} 2347}
2027 2348
2349#define asm_bshl(as, ir) asm_bitshift(as, ir, XOg_SHL, XV_SHLX)
2350#define asm_bshr(as, ir) asm_bitshift(as, ir, XOg_SHR, XV_SHRX)
2351#define asm_bsar(as, ir) asm_bitshift(as, ir, XOg_SAR, XV_SARX)
2352#define asm_brol(as, ir) asm_bitshift(as, ir, XOg_ROL, 0)
2353#define asm_bror(as, ir) asm_bitshift(as, ir, XOg_ROR, 0)
2354
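The BMI2 path above emits a constant left rotate as RORX with a negated immediate: the rotate count is taken modulo the operand width, so rotating right by width-k equals rotating left by k, and RORX writes a separate destination so no extra MOV is needed. A quick check of that identity in plain C, no intrinsics assumed:

#include <assert.h>
#include <stdint.h>

static uint64_t rol64(uint64_t x, unsigned k)
{
  k &= 63;
  return (x << k) | (x >> ((64 - k) & 63));
}

static uint64_t ror64(uint64_t x, unsigned k)
{
  k &= 63;
  return (x >> k) | (x << ((64 - k) & 63));
}

int main(void)
{
  uint64_t x = 0x123456789abcdef0ull;
  unsigned k = 13;
  assert(rol64(x, k) == ror64(x, (0u - k) & 63));  /* rol k == ror -k */
  return 0;
}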
2028/* -- Comparisons --------------------------------------------------------- */ 2355/* -- Comparisons --------------------------------------------------------- */
2029 2356
2030/* Virtual flags for unordered FP comparisons. */ 2357/* Virtual flags for unordered FP comparisons. */
@@ -2051,8 +2378,9 @@ static const uint16_t asm_compmap[IR_ABC+1] = {
2051}; 2378};
2052 2379
2053/* FP and integer comparisons. */ 2380/* FP and integer comparisons. */
2054static void asm_comp(ASMState *as, IRIns *ir, uint32_t cc) 2381static void asm_comp(ASMState *as, IRIns *ir)
2055{ 2382{
2383 uint32_t cc = asm_compmap[ir->o];
2056 if (irt_isnum(ir->t)) { 2384 if (irt_isnum(ir->t)) {
2057 IRRef lref = ir->op1; 2385 IRRef lref = ir->op1;
2058 IRRef rref = ir->op2; 2386 IRRef rref = ir->op2;
@@ -2073,7 +2401,6 @@ static void asm_comp(ASMState *as, IRIns *ir, uint32_t cc)
2073 cc ^= (VCC_PS|(5<<4)); /* A <-> B, AE <-> BE, PS <-> none */ 2401 cc ^= (VCC_PS|(5<<4)); /* A <-> B, AE <-> BE, PS <-> none */
2074 } 2402 }
2075 left = ra_alloc1(as, lref, RSET_FPR); 2403 left = ra_alloc1(as, lref, RSET_FPR);
2076 right = asm_fuseload(as, rref, rset_exclude(RSET_FPR, left));
2077 l_around = emit_label(as); 2404 l_around = emit_label(as);
2078 asm_guardcc(as, cc >> 4); 2405 asm_guardcc(as, cc >> 4);
2079 if (cc & VCC_P) { /* Extra CC_P branch required? */ 2406 if (cc & VCC_P) { /* Extra CC_P branch required? */
@@ -2090,14 +2417,16 @@ static void asm_comp(ASMState *as, IRIns *ir, uint32_t cc)
2090 emit_jcc(as, CC_P, as->mcp); 2417 emit_jcc(as, CC_P, as->mcp);
2091 } 2418 }
2092 } 2419 }
2420 right = asm_fuseload(as, rref, rset_exclude(RSET_FPR, left));
2093 emit_mrm(as, XO_UCOMISD, left, right); 2421 emit_mrm(as, XO_UCOMISD, left, right);
2094 } else { 2422 } else {
2095 IRRef lref = ir->op1, rref = ir->op2; 2423 IRRef lref = ir->op1, rref = ir->op2;
2096 IROp leftop = (IROp)(IR(lref)->o); 2424 IROp leftop = (IROp)(IR(lref)->o);
2097 Reg r64 = REX_64IR(ir, 0); 2425 Reg r64 = REX_64IR(ir, 0);
2098 int32_t imm = 0; 2426 int32_t imm = 0;
2099 lua_assert(irt_is64(ir->t) || irt_isint(ir->t) || 2427 lj_assertA(irt_is64(ir->t) || irt_isint(ir->t) ||
2100 irt_isu32(ir->t) || irt_isaddr(ir->t) || irt_isu8(ir->t)); 2428 irt_isu32(ir->t) || irt_isaddr(ir->t) || irt_isu8(ir->t),
2429 "bad comparison data type %d", irt_type(ir->t));
2101 /* Swap constants (only for ABC) and fusable loads to the right. */ 2430 /* Swap constants (only for ABC) and fusable loads to the right. */
2102 if (irref_isk(lref) || (!irref_isk(rref) && opisfusableload(leftop))) { 2431 if (irref_isk(lref) || (!irref_isk(rref) && opisfusableload(leftop))) {
2103 if ((cc & 0xc) == 0xc) cc ^= 0x53; /* L <-> G, LE <-> GE */ 2432 if ((cc & 0xc) == 0xc) cc ^= 0x53; /* L <-> G, LE <-> GE */
@@ -2179,7 +2508,7 @@ static void asm_comp(ASMState *as, IRIns *ir, uint32_t cc)
2179 /* Use test r,r instead of cmp r,0. */ 2508 /* Use test r,r instead of cmp r,0. */
2180 x86Op xo = XO_TEST; 2509 x86Op xo = XO_TEST;
2181 if (irt_isu8(ir->t)) { 2510 if (irt_isu8(ir->t)) {
2182 lua_assert(ir->o == IR_EQ || ir->o == IR_NE); 2511 lj_assertA(ir->o == IR_EQ || ir->o == IR_NE, "bad usage");
2183 xo = XO_TESTb; 2512 xo = XO_TESTb;
2184 if (!rset_test(RSET_RANGE(RID_EAX, RID_EBX+1), left)) { 2513 if (!rset_test(RSET_RANGE(RID_EAX, RID_EBX+1), left)) {
2185 if (LJ_64) { 2514 if (LJ_64) {
@@ -2207,6 +2536,8 @@ static void asm_comp(ASMState *as, IRIns *ir, uint32_t cc)
2207 } 2536 }
2208} 2537}
2209 2538
2539#define asm_equal(as, ir) asm_comp(as, ir)
2540
2210#if LJ_32 && LJ_HASFFI 2541#if LJ_32 && LJ_HASFFI
2211/* 64 bit integer comparisons in 32 bit mode. */ 2542/* 64 bit integer comparisons in 32 bit mode. */
2212static void asm_comp_int64(ASMState *as, IRIns *ir) 2543static void asm_comp_int64(ASMState *as, IRIns *ir)
@@ -2279,23 +2610,19 @@ static void asm_comp_int64(ASMState *as, IRIns *ir)
2279} 2610}
2280#endif 2611#endif
2281 2612
2282/* -- Support for 64 bit ops in 32 bit mode ------------------------------- */ 2613/* -- Split register ops -------------------------------------------------- */
2283 2614
2284/* Hiword op of a split 64 bit op. Previous op must be the loword op. */ 2615/* Hiword op of a split 32/32 or 64/64 bit op. Previous op is the loword op. */
2285static void asm_hiop(ASMState *as, IRIns *ir) 2616static void asm_hiop(ASMState *as, IRIns *ir)
2286{ 2617{
2287#if LJ_32 && LJ_HASFFI
2288 /* HIOP is marked as a store because it needs its own DCE logic. */ 2618 /* HIOP is marked as a store because it needs its own DCE logic. */
2289 int uselo = ra_used(ir-1), usehi = ra_used(ir); /* Loword/hiword used? */ 2619 int uselo = ra_used(ir-1), usehi = ra_used(ir); /* Loword/hiword used? */
2290 if (LJ_UNLIKELY(!(as->flags & JIT_F_OPT_DCE))) uselo = usehi = 1; 2620 if (LJ_UNLIKELY(!(as->flags & JIT_F_OPT_DCE))) uselo = usehi = 1;
2621#if LJ_32 && LJ_HASFFI
2291 if ((ir-1)->o == IR_CONV) { /* Conversions to/from 64 bit. */ 2622 if ((ir-1)->o == IR_CONV) { /* Conversions to/from 64 bit. */
2292 if (usehi || uselo) {
2293 if (irt_isfp(ir->t))
2294 asm_conv_fp_int64(as, ir);
2295 else
2296 asm_conv_int64_fp(as, ir);
2297 }
2298 as->curins--; /* Always skip the CONV. */ 2623 as->curins--; /* Always skip the CONV. */
2624 if (usehi || uselo)
2625 asm_conv64(as, ir);
2299 return; 2626 return;
2300 } else if ((ir-1)->o <= IR_NE) { /* 64 bit integer comparisons. ORDER IR. */ 2627 } else if ((ir-1)->o <= IR_NE) { /* 64 bit integer comparisons. ORDER IR. */
2301 asm_comp_int64(as, ir); 2628 asm_comp_int64(as, ir);
@@ -2305,8 +2632,10 @@ static void asm_hiop(ASMState *as, IRIns *ir)
2305 asm_fxstore(as, ir); 2632 asm_fxstore(as, ir);
2306 return; 2633 return;
2307 } 2634 }
2635#endif
2308 if (!usehi) return; /* Skip unused hiword op for all remaining ops. */ 2636 if (!usehi) return; /* Skip unused hiword op for all remaining ops. */
2309 switch ((ir-1)->o) { 2637 switch ((ir-1)->o) {
2638#if LJ_32 && LJ_HASFFI
2310 case IR_ADD: 2639 case IR_ADD:
2311 as->flagmcp = NULL; 2640 as->flagmcp = NULL;
2312 as->curins--; 2641 as->curins--;
@@ -2329,19 +2658,26 @@ static void asm_hiop(ASMState *as, IRIns *ir)
2329 asm_neg_not(as, ir-1, XOg_NEG); 2658 asm_neg_not(as, ir-1, XOg_NEG);
2330 break; 2659 break;
2331 } 2660 }
2332 case IR_CALLN:
2333 case IR_CALLXS:
2334 if (!uselo)
2335 ra_allocref(as, ir->op1, RID2RSET(RID_RETLO)); /* Mark lo op as used. */
2336 break;
2337 case IR_CNEWI: 2661 case IR_CNEWI:
2338 /* Nothing to do here. Handled by CNEWI itself. */ 2662 /* Nothing to do here. Handled by CNEWI itself. */
2339 break; 2663 break;
2340 default: lua_assert(0); break;
2341 }
2342#else
2343 UNUSED(as); UNUSED(ir); lua_assert(0); /* Unused on x64 or without FFI. */
2344#endif 2664#endif
2665 case IR_CALLN: case IR_CALLL: case IR_CALLS: case IR_CALLXS:
2666 if (!uselo)
2667 ra_allocref(as, ir->op1, RID2RSET(RID_RETLO)); /* Mark lo op as used. */
2668 break;
2669 default: lj_assertA(0, "bad HIOP for op %d", (ir-1)->o); break;
2670 }
2671}
2672
2673/* -- Profiling ----------------------------------------------------------- */
2674
2675static void asm_prof(ASMState *as, IRIns *ir)
2676{
2677 UNUSED(ir);
2678 asm_guardcc(as, CC_NE);
2679 emit_i8(as, HOOK_PROFILE);
2680 emit_rma(as, XO_GROUP3b, XOg_TEST, &J2G(as->J)->hookmask);
2345} 2681}
2346 2682
2347/* -- Stack handling ------------------------------------------------------ */ 2683/* -- Stack handling ------------------------------------------------------ */
@@ -2358,14 +2694,19 @@ static void asm_stack_check(ASMState *as, BCReg topslot,
2358 emit_rmro(as, XO_MOV, r|REX_64, RID_ESP, 0); 2694 emit_rmro(as, XO_MOV, r|REX_64, RID_ESP, 0);
2359 else 2695 else
2360 ra_modified(as, r); 2696 ra_modified(as, r);
2361 emit_gri(as, XG_ARITHi(XOg_CMP), r, (int32_t)(8*topslot)); 2697 emit_gri(as, XG_ARITHi(XOg_CMP), r|REX_GC64, (int32_t)(8*topslot));
2362 if (ra_hasreg(pbase) && pbase != r) 2698 if (ra_hasreg(pbase) && pbase != r)
2363 emit_rr(as, XO_ARITH(XOg_SUB), r, pbase); 2699 emit_rr(as, XO_ARITH(XOg_SUB), r|REX_GC64, pbase);
2364 else 2700 else
2701#if LJ_GC64
2702 emit_rmro(as, XO_ARITH(XOg_SUB), r|REX_64, RID_DISPATCH,
2703 (int32_t)dispofs(as, &J2G(as->J)->jit_base));
2704#else
2365 emit_rmro(as, XO_ARITH(XOg_SUB), r, RID_NONE, 2705 emit_rmro(as, XO_ARITH(XOg_SUB), r, RID_NONE,
2366 ptr2addr(&J2G(as->J)->jit_base)); 2706 ptr2addr(&J2G(as->J)->jit_base));
2367 emit_rmro(as, XO_MOV, r, r, offsetof(lua_State, maxstack)); 2707#endif
2368 emit_getgl(as, r, jit_L); 2708 emit_rmro(as, XO_MOV, r|REX_GC64, r, offsetof(lua_State, maxstack));
2709 emit_getgl(as, r, cur_L);
2369 if (allow == RSET_EMPTY) /* Spill temp. register. */ 2710 if (allow == RSET_EMPTY) /* Spill temp. register. */
2370 emit_rmro(as, XO_MOVto, r|REX_64, RID_ESP, 0); 2711 emit_rmro(as, XO_MOVto, r|REX_64, RID_ESP, 0);
2371} 2712}
@@ -2374,40 +2715,79 @@ static void asm_stack_check(ASMState *as, BCReg topslot,
2374static void asm_stack_restore(ASMState *as, SnapShot *snap) 2715static void asm_stack_restore(ASMState *as, SnapShot *snap)
2375{ 2716{
2376 SnapEntry *map = &as->T->snapmap[snap->mapofs]; 2717 SnapEntry *map = &as->T->snapmap[snap->mapofs];
2377 SnapEntry *flinks = &as->T->snapmap[snap_nextofs(as->T, snap)-1]; 2718#if !LJ_FR2 || defined(LUA_USE_ASSERT)
2719 SnapEntry *flinks = &as->T->snapmap[snap_nextofs(as->T, snap)-1-LJ_FR2];
2720#endif
2378 MSize n, nent = snap->nent; 2721 MSize n, nent = snap->nent;
2379 /* Store the value of all modified slots to the Lua stack. */ 2722 /* Store the value of all modified slots to the Lua stack. */
2380 for (n = 0; n < nent; n++) { 2723 for (n = 0; n < nent; n++) {
2381 SnapEntry sn = map[n]; 2724 SnapEntry sn = map[n];
2382 BCReg s = snap_slot(sn); 2725 BCReg s = snap_slot(sn);
2383 int32_t ofs = 8*((int32_t)s-1); 2726 int32_t ofs = 8*((int32_t)s-1-LJ_FR2);
2384 IRRef ref = snap_ref(sn); 2727 IRRef ref = snap_ref(sn);
2385 IRIns *ir = IR(ref); 2728 IRIns *ir = IR(ref);
2386 if ((sn & SNAP_NORESTORE)) 2729 if ((sn & SNAP_NORESTORE))
2387 continue; 2730 continue;
2388 if (irt_isnum(ir->t)) { 2731 if ((sn & SNAP_KEYINDEX)) {
2732 emit_movmroi(as, RID_BASE, ofs+4, LJ_KEYINDEX);
2733 if (irref_isk(ref)) {
2734 emit_movmroi(as, RID_BASE, ofs, ir->i);
2735 } else {
2736 Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, RID_BASE));
2737 emit_movtomro(as, src, RID_BASE, ofs);
2738 }
2739 } else if (irt_isnum(ir->t)) {
2389 Reg src = ra_alloc1(as, ref, RSET_FPR); 2740 Reg src = ra_alloc1(as, ref, RSET_FPR);
2390 emit_rmro(as, XO_MOVSDto, src, RID_BASE, ofs); 2741 emit_rmro(as, XO_MOVSDto, src, RID_BASE, ofs);
2391 } else { 2742 } else {
2392 lua_assert(irt_ispri(ir->t) || irt_isaddr(ir->t) || 2743 lj_assertA(irt_ispri(ir->t) || irt_isaddr(ir->t) ||
2393 (LJ_DUALNUM && irt_isinteger(ir->t))); 2744 (LJ_DUALNUM && irt_isinteger(ir->t)),
2745 "restore of IR type %d", irt_type(ir->t));
2394 if (!irref_isk(ref)) { 2746 if (!irref_isk(ref)) {
2395 Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, RID_BASE)); 2747 Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, RID_BASE));
2748#if LJ_GC64
2749 if (irt_is64(ir->t)) {
2750 /* TODO: 64 bit store + 32 bit load-modify-store is suboptimal. */
2751 emit_u32(as, irt_toitype(ir->t) << 15);
2752 emit_rmro(as, XO_ARITHi, XOg_OR, RID_BASE, ofs+4);
2753 } else if (LJ_DUALNUM && irt_isinteger(ir->t)) {
2754 emit_movmroi(as, RID_BASE, ofs+4, LJ_TISNUM << 15);
2755 } else {
2756 emit_movmroi(as, RID_BASE, ofs+4, (irt_toitype(ir->t)<<15)|0x7fff);
2757 }
2758#endif
2396 emit_movtomro(as, REX_64IR(ir, src), RID_BASE, ofs); 2759 emit_movtomro(as, REX_64IR(ir, src), RID_BASE, ofs);
2760#if LJ_GC64
2761 } else {
2762 TValue k;
2763 lj_ir_kvalue(as->J->L, &k, ir);
2764 if (tvisnil(&k)) {
2765 emit_i32(as, -1);
2766 emit_rmro(as, XO_MOVmi, REX_64, RID_BASE, ofs);
2767 } else {
2768 emit_movmroi(as, RID_BASE, ofs+4, k.u32.hi);
2769 emit_movmroi(as, RID_BASE, ofs, k.u32.lo);
2770 }
2771#else
2397 } else if (!irt_ispri(ir->t)) { 2772 } else if (!irt_ispri(ir->t)) {
2398 emit_movmroi(as, RID_BASE, ofs, ir->i); 2773 emit_movmroi(as, RID_BASE, ofs, ir->i);
2774#endif
2399 } 2775 }
2400 if ((sn & (SNAP_CONT|SNAP_FRAME))) { 2776 if ((sn & (SNAP_CONT|SNAP_FRAME))) {
2777#if !LJ_FR2
2401 if (s != 0) /* Do not overwrite link to previous frame. */ 2778 if (s != 0) /* Do not overwrite link to previous frame. */
2402 emit_movmroi(as, RID_BASE, ofs+4, (int32_t)(*flinks--)); 2779 emit_movmroi(as, RID_BASE, ofs+4, (int32_t)(*flinks--));
2780#endif
2781#if !LJ_GC64
2403 } else { 2782 } else {
2404 if (!(LJ_64 && irt_islightud(ir->t))) 2783 if (!(LJ_64 && irt_islightud(ir->t)))
2405 emit_movmroi(as, RID_BASE, ofs+4, irt_toitype(ir->t)); 2784 emit_movmroi(as, RID_BASE, ofs+4, irt_toitype(ir->t));
2785#endif
2406 } 2786 }
2407 } 2787 }
2408 checkmclim(as); 2788 checkmclim(as);
2409 } 2789 }
2410 lua_assert(map + nent == flinks); 2790 lj_assertA(map + nent == flinks, "inconsistent frames in snapshot");
2411} 2791}
2412 2792
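The LJ_GC64 constant path in asm_stack_restore reflects an ISA limit: x86-64 has no mov m64, imm64, only a sign-extended 32-bit immediate, so nil (stored as -1, i.e. all ones) fits a single qword store while any other 64-bit constant is written as two 32-bit halves. A small sketch with a stand-in TValue union; the little-endian lo/hi split matches x86:

#include <stdint.h>

typedef union TValue_sketch {
  uint64_t u64;
  struct { uint32_t lo, hi; } u32;   /* little-endian split, as on x86 */
} TValue_sketch;

static void restore_slot(TValue_sketch *slot, TValue_sketch k)
{
  if (k.u64 == ~(uint64_t)0) {       /* nil: fits a sign-extended imm32 */
    slot->u64 = k.u64;               /* mov qword [base+ofs], -1 */
  } else {
    slot->u32.lo = k.u32.lo;         /* mov dword [base+ofs], k.lo */
    slot->u32.hi = k.u32.hi;         /* mov dword [base+ofs+4], k.hi */
  }
}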
2413/* -- GC handling --------------------------------------------------------- */ 2793/* -- GC handling --------------------------------------------------------- */
@@ -2428,11 +2808,15 @@ static void asm_gc_check(ASMState *as)
2428 args[1] = ASMREF_TMP2; /* MSize steps */ 2808 args[1] = ASMREF_TMP2; /* MSize steps */
2429 asm_gencall(as, ci, args); 2809 asm_gencall(as, ci, args);
2430 tmp = ra_releasetmp(as, ASMREF_TMP1); 2810 tmp = ra_releasetmp(as, ASMREF_TMP1);
2811#if LJ_GC64
2812 emit_rmro(as, XO_LEA, tmp|REX_64, RID_DISPATCH, GG_DISP2G);
2813#else
2431 emit_loada(as, tmp, J2G(as->J)); 2814 emit_loada(as, tmp, J2G(as->J));
2815#endif
2432 emit_loadi(as, ra_releasetmp(as, ASMREF_TMP2), as->gcsteps); 2816 emit_loadi(as, ra_releasetmp(as, ASMREF_TMP2), as->gcsteps);
2433 /* Jump around GC step if GC total < GC threshold. */ 2817 /* Jump around GC step if GC total < GC threshold. */
2434 emit_sjcc(as, CC_B, l_end); 2818 emit_sjcc(as, CC_B, l_end);
2435 emit_opgl(as, XO_ARITH(XOg_CMP), tmp, gc.threshold); 2819 emit_opgl(as, XO_ARITH(XOg_CMP), tmp|REX_GC64, gc.threshold);
2436 emit_getgl(as, tmp, gc.total); 2820 emit_getgl(as, tmp, gc.total);
2437 as->gcsteps = 0; 2821 as->gcsteps = 0;
2438 checkmclim(as); 2822 checkmclim(as);
@@ -2447,16 +2831,16 @@ static void asm_loop_fixup(ASMState *as)
2447 MCode *target = as->mcp; 2831 MCode *target = as->mcp;
2448 if (as->realign) { /* Realigned loops use short jumps. */ 2832 if (as->realign) { /* Realigned loops use short jumps. */
2449 as->realign = NULL; /* Stop another retry. */ 2833 as->realign = NULL; /* Stop another retry. */
2450 lua_assert(((intptr_t)target & 15) == 0); 2834 lj_assertA(((intptr_t)target & 15) == 0, "loop realign failed");
2451 if (as->loopinv) { /* Inverted loop branch? */ 2835 if (as->loopinv) { /* Inverted loop branch? */
2452 p -= 5; 2836 p -= 5;
2453 p[0] = XI_JMP; 2837 p[0] = XI_JMP;
2454 lua_assert(target - p >= -128); 2838 lj_assertA(target - p >= -128, "loop realign failed");
2455 p[-1] = (MCode)(target - p); /* Patch sjcc. */ 2839 p[-1] = (MCode)(target - p); /* Patch sjcc. */
2456 if (as->loopinv == 2) 2840 if (as->loopinv == 2)
2457 p[-3] = (MCode)(target - p + 2); /* Patch opt. short jp. */ 2841 p[-3] = (MCode)(target - p + 2); /* Patch opt. short jp. */
2458 } else { 2842 } else {
2459 lua_assert(target - p >= -128); 2843 lj_assertA(target - p >= -128, "loop realign failed");
2460 p[-1] = (MCode)(int8_t)(target - p); /* Patch short jmp. */ 2844 p[-1] = (MCode)(int8_t)(target - p); /* Patch short jmp. */
2461 p[-2] = XI_JMPs; 2845 p[-2] = XI_JMPs;
2462 } 2846 }
@@ -2485,6 +2869,12 @@ static void asm_loop_fixup(ASMState *as)
2485 } 2869 }
2486} 2870}
2487 2871
2872/* Fixup the tail of the loop. */
2873static void asm_loop_tail_fixup(ASMState *as)
2874{
2875 UNUSED(as); /* Nothing to do. */
2876}
2877
2488/* -- Head of trace ------------------------------------------------------- */ 2878/* -- Head of trace ------------------------------------------------------- */
2489 2879
2490/* Coalesce BASE register for a root trace. */ 2880/* Coalesce BASE register for a root trace. */
@@ -2497,7 +2887,7 @@ static void asm_head_root_base(ASMState *as)
2497 if (rset_test(as->modset, r) || irt_ismarked(ir->t)) 2887 if (rset_test(as->modset, r) || irt_ismarked(ir->t))
2498 ir->r = RID_INIT; /* No inheritance for modified BASE register. */ 2888 ir->r = RID_INIT; /* No inheritance for modified BASE register. */
2499 if (r != RID_BASE) 2889 if (r != RID_BASE)
2500 emit_rr(as, XO_MOV, r, RID_BASE); 2890 emit_rr(as, XO_MOV, r|REX_GC64, RID_BASE);
2501 } 2891 }
2502} 2892}
2503 2893
@@ -2513,8 +2903,9 @@ static RegSet asm_head_side_base(ASMState *as, IRIns *irp, RegSet allow)
2513 if (irp->r == r) { 2903 if (irp->r == r) {
2514 rset_clear(allow, r); /* Mark same BASE register as coalesced. */ 2904 rset_clear(allow, r); /* Mark same BASE register as coalesced. */
2515 } else if (ra_hasreg(irp->r) && rset_test(as->freeset, irp->r)) { 2905 } else if (ra_hasreg(irp->r) && rset_test(as->freeset, irp->r)) {
2906 /* Move from coalesced parent reg. */
2516 rset_clear(allow, irp->r); 2907 rset_clear(allow, irp->r);
2517 emit_rr(as, XO_MOV, r, irp->r); /* Move from coalesced parent reg. */ 2908 emit_rr(as, XO_MOV, r|REX_GC64, irp->r);
2518 } else { 2909 } else {
2519 emit_getgl(as, r, jit_base); /* Otherwise reload BASE. */ 2910 emit_getgl(as, r, jit_base); /* Otherwise reload BASE. */
2520 } 2911 }
@@ -2532,7 +2923,7 @@ static void asm_tail_fixup(ASMState *as, TraceNo lnk)
2532 MCode *target, *q; 2923 MCode *target, *q;
2533 int32_t spadj = as->T->spadjust; 2924 int32_t spadj = as->T->spadjust;
2534 if (spadj == 0) { 2925 if (spadj == 0) {
2535 p -= ((as->flags & JIT_F_LEA_AGU) ? 7 : 6) + (LJ_64 ? 1 : 0); 2926 p -= LJ_64 ? 7 : 6;
2536 } else { 2927 } else {
2537 MCode *p1; 2928 MCode *p1;
2538 /* Patch stack adjustment. */ 2929 /* Patch stack adjustment. */
@@ -2544,24 +2935,15 @@ static void asm_tail_fixup(ASMState *as, TraceNo lnk)
2544 p1 = p-9; 2935 p1 = p-9;
2545 *(int32_t *)p1 = spadj; 2936 *(int32_t *)p1 = spadj;
2546 } 2937 }
2547 if ((as->flags & JIT_F_LEA_AGU)) {
2548#if LJ_64
2549 p1[-4] = 0x48;
2550#endif
2551 p1[-3] = (MCode)XI_LEA;
2552 p1[-2] = MODRM(checki8(spadj) ? XM_OFS8 : XM_OFS32, RID_ESP, RID_ESP);
2553 p1[-1] = MODRM(XM_SCALE1, RID_ESP, RID_ESP);
2554 } else {
2555#if LJ_64 2938#if LJ_64
2556 p1[-3] = 0x48; 2939 p1[-3] = 0x48;
2557#endif 2940#endif
2558 p1[-2] = (MCode)(checki8(spadj) ? XI_ARITHi8 : XI_ARITHi); 2941 p1[-2] = (MCode)(checki8(spadj) ? XI_ARITHi8 : XI_ARITHi);
2559 p1[-1] = MODRM(XM_REG, XOg_ADD, RID_ESP); 2942 p1[-1] = MODRM(XM_REG, XOg_ADD, RID_ESP);
2560 }
2561 } 2943 }
2562 /* Patch exit branch. */ 2944 /* Patch exit branch. */
2563 target = lnk ? traceref(as->J, lnk)->mcode : (MCode *)lj_vm_exit_interp; 2945 target = lnk ? traceref(as->J, lnk)->mcode : (MCode *)lj_vm_exit_interp;
2564 *(int32_t *)(p-4) = jmprel(p, target); 2946 *(int32_t *)(p-4) = jmprel(as->J, p, target);
2565 p[-5] = XI_JMP; 2947 p[-5] = XI_JMP;
2566 /* Drop unused mcode tail. Fill with NOPs to make the prefetcher happy. */ 2948 /* Drop unused mcode tail. Fill with NOPs to make the prefetcher happy. */
2567 for (q = as->mctop-1; q >= p; q--) 2949 for (q = as->mctop-1; q >= p; q--)
@@ -2588,168 +2970,11 @@ static void asm_tail_prep(ASMState *as)
2588 as->invmcp = as->mcp = p; 2970 as->invmcp = as->mcp = p;
2589 } else { 2971 } else {
2590 /* Leave room for ESP adjustment: add esp, imm or lea esp, [esp+imm] */ 2972 /* Leave room for ESP adjustment: add esp, imm or lea esp, [esp+imm] */
2591 as->mcp = p - (((as->flags & JIT_F_LEA_AGU) ? 7 : 6) + (LJ_64 ? 1 : 0)); 2973 as->mcp = p - (LJ_64 ? 7 : 6);
2592 as->invmcp = NULL; 2974 as->invmcp = NULL;
2593 } 2975 }
2594} 2976}
2595 2977
2596/* -- Instruction dispatch ------------------------------------------------ */
2597
2598/* Assemble a single instruction. */
2599static void asm_ir(ASMState *as, IRIns *ir)
2600{
2601 switch ((IROp)ir->o) {
2602 /* Miscellaneous ops. */
2603 case IR_LOOP: asm_loop(as); break;
2604 case IR_NOP: case IR_XBAR: lua_assert(!ra_used(ir)); break;
2605 case IR_USE:
2606 ra_alloc1(as, ir->op1, irt_isfp(ir->t) ? RSET_FPR : RSET_GPR); break;
2607 case IR_PHI: asm_phi(as, ir); break;
2608 case IR_HIOP: asm_hiop(as, ir); break;
2609 case IR_GCSTEP: asm_gcstep(as, ir); break;
2610
2611 /* Guarded assertions. */
2612 case IR_LT: case IR_GE: case IR_LE: case IR_GT:
2613 case IR_ULT: case IR_UGE: case IR_ULE: case IR_UGT:
2614 case IR_EQ: case IR_NE: case IR_ABC:
2615 asm_comp(as, ir, asm_compmap[ir->o]);
2616 break;
2617
2618 case IR_RETF: asm_retf(as, ir); break;
2619
2620 /* Bit ops. */
2621 case IR_BNOT: asm_neg_not(as, ir, XOg_NOT); break;
2622 case IR_BSWAP: asm_bitswap(as, ir); break;
2623
2624 case IR_BAND: asm_intarith(as, ir, XOg_AND); break;
2625 case IR_BOR: asm_intarith(as, ir, XOg_OR); break;
2626 case IR_BXOR: asm_intarith(as, ir, XOg_XOR); break;
2627
2628 case IR_BSHL: asm_bitshift(as, ir, XOg_SHL); break;
2629 case IR_BSHR: asm_bitshift(as, ir, XOg_SHR); break;
2630 case IR_BSAR: asm_bitshift(as, ir, XOg_SAR); break;
2631 case IR_BROL: asm_bitshift(as, ir, XOg_ROL); break;
2632 case IR_BROR: asm_bitshift(as, ir, XOg_ROR); break;
2633
2634 /* Arithmetic ops. */
2635 case IR_ADD: asm_add(as, ir); break;
2636 case IR_SUB:
2637 if (irt_isnum(ir->t))
2638 asm_fparith(as, ir, XO_SUBSD);
2639 else /* Note: no need for LEA trick here. i-k is encoded as i+(-k). */
2640 asm_intarith(as, ir, XOg_SUB);
2641 break;
2642 case IR_MUL:
2643 if (irt_isnum(ir->t))
2644 asm_fparith(as, ir, XO_MULSD);
2645 else
2646 asm_intarith(as, ir, XOg_X_IMUL);
2647 break;
2648 case IR_DIV:
2649#if LJ_64 && LJ_HASFFI
2650 if (!irt_isnum(ir->t))
2651 asm_arith64(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_divi64 :
2652 IRCALL_lj_carith_divu64);
2653 else
2654#endif
2655 asm_fparith(as, ir, XO_DIVSD);
2656 break;
2657 case IR_MOD:
2658#if LJ_64 && LJ_HASFFI
2659 if (!irt_isint(ir->t))
2660 asm_arith64(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_modi64 :
2661 IRCALL_lj_carith_modu64);
2662 else
2663#endif
2664 asm_intmod(as, ir);
2665 break;
2666
2667 case IR_NEG:
2668 if (irt_isnum(ir->t))
2669 asm_fparith(as, ir, XO_XORPS);
2670 else
2671 asm_neg_not(as, ir, XOg_NEG);
2672 break;
2673 case IR_ABS: asm_fparith(as, ir, XO_ANDPS); break;
2674
2675 case IR_MIN:
2676 if (irt_isnum(ir->t))
2677 asm_fparith(as, ir, XO_MINSD);
2678 else
2679 asm_min_max(as, ir, CC_G);
2680 break;
2681 case IR_MAX:
2682 if (irt_isnum(ir->t))
2683 asm_fparith(as, ir, XO_MAXSD);
2684 else
2685 asm_min_max(as, ir, CC_L);
2686 break;
2687
2688 case IR_FPMATH: case IR_ATAN2: case IR_LDEXP:
2689 asm_fpmath(as, ir);
2690 break;
2691 case IR_POW:
2692#if LJ_64 && LJ_HASFFI
2693 if (!irt_isnum(ir->t))
2694 asm_arith64(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 :
2695 IRCALL_lj_carith_powu64);
2696 else
2697#endif
2698 asm_fppowi(as, ir);
2699 break;
2700
2701 /* Overflow-checking arithmetic ops. Note: don't use LEA here! */
2702 case IR_ADDOV: asm_intarith(as, ir, XOg_ADD); break;
2703 case IR_SUBOV: asm_intarith(as, ir, XOg_SUB); break;
2704 case IR_MULOV: asm_intarith(as, ir, XOg_X_IMUL); break;
2705
2706 /* Memory references. */
2707 case IR_AREF: asm_aref(as, ir); break;
2708 case IR_HREF: asm_href(as, ir); break;
2709 case IR_HREFK: asm_hrefk(as, ir); break;
2710 case IR_NEWREF: asm_newref(as, ir); break;
2711 case IR_UREFO: case IR_UREFC: asm_uref(as, ir); break;
2712 case IR_FREF: asm_fref(as, ir); break;
2713 case IR_STRREF: asm_strref(as, ir); break;
2714
2715 /* Loads and stores. */
2716 case IR_ALOAD: case IR_HLOAD: case IR_ULOAD: case IR_VLOAD:
2717 asm_ahuvload(as, ir);
2718 break;
2719 case IR_FLOAD: case IR_XLOAD: asm_fxload(as, ir); break;
2720 case IR_SLOAD: asm_sload(as, ir); break;
2721
2722 case IR_ASTORE: case IR_HSTORE: case IR_USTORE: asm_ahustore(as, ir); break;
2723 case IR_FSTORE: case IR_XSTORE: asm_fxstore(as, ir); break;
2724
2725 /* Allocations. */
2726 case IR_SNEW: case IR_XSNEW: asm_snew(as, ir); break;
2727 case IR_TNEW: asm_tnew(as, ir); break;
2728 case IR_TDUP: asm_tdup(as, ir); break;
2729 case IR_CNEW: case IR_CNEWI: asm_cnew(as, ir); break;
2730
2731 /* Write barriers. */
2732 case IR_TBAR: asm_tbar(as, ir); break;
2733 case IR_OBAR: asm_obar(as, ir); break;
2734
2735 /* Type conversions. */
2736 case IR_TOBIT: asm_tobit(as, ir); break;
2737 case IR_CONV: asm_conv(as, ir); break;
2738 case IR_TOSTR: asm_tostr(as, ir); break;
2739 case IR_STRTO: asm_strto(as, ir); break;
2740
2741 /* Calls. */
2742 case IR_CALLN: case IR_CALLL: case IR_CALLS: asm_call(as, ir); break;
2743 case IR_CALLXS: asm_callx(as, ir); break;
2744 case IR_CARG: break;
2745
2746 default:
2747 setintV(&as->J->errinfo, ir->o);
2748 lj_trace_err_info(as->J, LJ_TRERR_NYIIR);
2749 break;
2750 }
2751}
2752
2753/* -- Trace setup --------------------------------------------------------- */ 2978/* -- Trace setup --------------------------------------------------------- */
2754 2979
2755/* Ensure there are enough stack slots for call arguments. */ 2980/* Ensure there are enough stack slots for call arguments. */
@@ -2772,6 +2997,7 @@ static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci)
2772static void asm_setup_target(ASMState *as) 2997static void asm_setup_target(ASMState *as)
2773{ 2998{
2774 asm_exitstub_setup(as, as->T->nsnap); 2999 asm_exitstub_setup(as, as->T->nsnap);
3000 as->mrm.base = 0;
2775} 3001}
2776 3002
2777/* -- Trace patching ------------------------------------------------------ */ 3003/* -- Trace patching ------------------------------------------------------ */
@@ -2885,18 +3111,24 @@ void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target)
2885 MCode *px = exitstub_addr(J, exitno) - 6; 3111 MCode *px = exitstub_addr(J, exitno) - 6;
2886 MCode *pe = p+len-6; 3112 MCode *pe = p+len-6;
2887 MCode *pgc = NULL; 3113 MCode *pgc = NULL;
2888 uint32_t stateaddr = u32ptr(&J2G(J)->vmstate); 3114#if LJ_GC64
3115 uint32_t statei = (uint32_t)(GG_OFS(g.vmstate) - GG_OFS(dispatch));
3116#else
3117 uint32_t statei = u32ptr(&J2G(J)->vmstate);
3118#endif
2889 if (len > 5 && p[len-5] == XI_JMP && p+len-6 + *(int32_t *)(p+len-4) == px) 3119 if (len > 5 && p[len-5] == XI_JMP && p+len-6 + *(int32_t *)(p+len-4) == px)
2890 *(int32_t *)(p+len-4) = jmprel(p+len, target); 3120 *(int32_t *)(p+len-4) = jmprel(J, p+len, target);
2891 /* Do not patch parent exit for a stack check. Skip beyond vmstate update. */ 3121 /* Do not patch parent exit for a stack check. Skip beyond vmstate update. */
2892 for (; p < pe; p += asm_x86_inslen(p)) 3122 for (; p < pe; p += asm_x86_inslen(p)) {
2893 if (*(uint32_t *)(p+(LJ_64 ? 3 : 2)) == stateaddr && p[0] == XI_MOVmi) 3123 intptr_t ofs = LJ_GC64 ? (p[0] & 0xf0) == 0x40 : LJ_64;
3124 if (*(uint32_t *)(p+2+ofs) == statei && p[ofs+LJ_GC64-LJ_64] == XI_MOVmi)
2894 break; 3125 break;
2895 lua_assert(p < pe); 3126 }
3127 lj_assertJ(p < pe, "instruction length decoder failed");
2896 for (; p < pe; p += asm_x86_inslen(p)) { 3128 for (; p < pe; p += asm_x86_inslen(p)) {
2897 if ((*(uint16_t *)p & 0xf0ff) == 0x800f && p + *(int32_t *)(p+2) == px && 3129 if ((*(uint16_t *)p & 0xf0ff) == 0x800f && p + *(int32_t *)(p+2) == px &&
2898 p != pgc) { 3130 p != pgc) {
2899 *(int32_t *)(p+2) = jmprel(p+6, target); 3131 *(int32_t *)(p+2) = jmprel(J, p+6, target);
2900 } else if (*p == XI_CALL && 3132 } else if (*p == XI_CALL &&
2901 (void *)(p+5+*(int32_t *)(p+1)) == (void *)lj_gc_step_jit) { 3133 (void *)(p+5+*(int32_t *)(p+1)) == (void *)lj_gc_step_jit) {
2902 pgc = p+7; /* Do not patch GC check exit. */ 3134 pgc = p+7; /* Do not patch GC check exit. */