Diffstat (limited to 'src/lj_asm_x86.h')
-rw-r--r--  src/lj_asm_x86.h | 1121
1 file changed, 671 insertions(+), 450 deletions(-)
diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h
index 2c38d1ec..21b510ca 100644
--- a/src/lj_asm_x86.h
+++ b/src/lj_asm_x86.h
@@ -21,12 +21,14 @@ static MCode *asm_exitstub_gen(ASMState *as, ExitNo group)
   }
   /* Push the high byte of the exitno for each exit stub group. */
   *mxp++ = XI_PUSHi8; *mxp++ = (MCode)((group*EXITSTUBS_PER_GROUP)>>8);
+#if !LJ_GC64
   /* Store DISPATCH at original stack slot 0. Account for the two push ops. */
   *mxp++ = XI_MOVmi;
   *mxp++ = MODRM(XM_OFS8, 0, RID_ESP);
   *mxp++ = MODRM(XM_SCALE1, RID_ESP, RID_ESP);
   *mxp++ = 2*sizeof(void *);
   *(int32_t *)mxp = ptr2addr(J2GG(as->J)->dispatch); mxp += 4;
+#endif
   /* Jump to exit handler which fills in the ExitState. */
   *mxp++ = XI_JMP; mxp += 4;
   *((int32_t *)(mxp-4)) = jmprel(mxp, (MCode *)(void *)lj_vm_exit_handler);
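Note on the new !LJ_GC64 guard: under LJ_GC64 the x64 port keeps the dispatch table pointer permanently in a reserved GPR (RID_DISPATCH; that this is r14 is an assumption from lj_target_x86.h, not shown in this diff), so exit stubs no longer need to store DISPATCH into stack slot 0 for the exit handler. For orientation, a sketch of the per-group stub the emitter above produces (offsets depend on LJ_64):

    /* push imm8                              ; low byte of exitno, one per exit
    ** push imm8                              ; high byte of exitno for the group
    ** mov  [esp+2*sizeof(void*)], DISPATCH   ; dropped under LJ_GC64
    ** jmp  lj_vm_exit_handler
    */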
@@ -62,10 +64,14 @@ static void asm_guardcc(ASMState *as, int cc)
     target = p;
     cc ^= 1;
     if (as->realign) {
+      if (LJ_GC64 && LJ_UNLIKELY(as->mrm.base == RID_RIP))
+        as->mrm.ofs += 2;  /* Fixup RIP offset for pending fused load. */
       emit_sjcc(as, cc, target);
       return;
     }
   }
+  if (LJ_GC64 && LJ_UNLIKELY(as->mrm.base == RID_RIP))
+    as->mrm.ofs += 6;  /* Fixup RIP offset for pending fused load. */
   emit_jcc(as, cc, target);
 }
 
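The +2/+6 fixups compensate for the conditional branch that asm_guardcc() emits between a pending RID_RIP fused load and the instruction that will consume it: a short jcc is 2 bytes, a near jcc (0x0F 0x8x rel32) is 6 bytes. Code is emitted backwards, so the branch pushes the eventual load that many bytes lower in memory, and a RIP-relative displacement is measured from the end of the loading instruction. A minimal model of the adjustment (illustration only, not LuaJIT API):

    static int32_t fixup_rip_disp(int32_t disp, int branch_bytes)
    {
      /* The load ends 'branch_bytes' earlier once the branch is emitted
      ** in front of it, so the displacement to the fixed target grows. */
      return disp + branch_bytes;  /* 2 for short jcc, 6 for near jcc */
    }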
@@ -79,6 +85,15 @@ static int asm_isk32(ASMState *as, IRRef ref, int32_t *k)
 {
   if (irref_isk(ref)) {
     IRIns *ir = IR(ref);
+#if LJ_GC64
+    if (ir->o == IR_KNULL || !irt_is64(ir->t)) {
+      *k = ir->i;
+      return 1;
+    } else if (checki32((int64_t)ir_k64(ir)->u64)) {
+      *k = (int32_t)ir_k64(ir)->u64;
+      return 1;
+    }
+#else
     if (ir->o != IR_KINT64) {
       *k = ir->i;
       return 1;
@@ -86,6 +101,7 @@ static int asm_isk32(ASMState *as, IRRef ref, int32_t *k)
       *k = (int32_t)ir_kint64(ir)->u64;
       return 1;
     }
+#endif
   }
   return 0;
 }
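Under LJ_GC64 even GC pointers are 64-bit IR constants, so asm_isk32() must now decide whether the full value survives x86-64's sign-extending 32-bit immediates. A standalone model of the checki32() test it relies on (assumed semantics):

    #include <stdint.h>

    /* An x86-64 imm32 is sign-extended to 64 bits, so a 64-bit constant
    ** is usable as an immediate only if the round trip is lossless. */
    static int fits_imm32(uint64_t u64)
    {
      return (int64_t)u64 == (int64_t)(int32_t)u64;
    }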
@@ -185,9 +201,19 @@ static void asm_fuseahuref(ASMState *as, IRRef ref, RegSet allow)
     if (irref_isk(ir->op1)) {
       GCfunc *fn = ir_kfunc(IR(ir->op1));
       GCupval *uv = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv;
+#if LJ_GC64
+      int64_t ofs = dispofs(as, &uv->tv);
+      if (checki32(ofs) && checki32(ofs+4)) {
+        as->mrm.ofs = (int32_t)ofs;
+        as->mrm.base = RID_DISPATCH;
+        as->mrm.idx = RID_NONE;
+        return;
+      }
+#else
       as->mrm.ofs = ptr2addr(&uv->tv);
       as->mrm.base = as->mrm.idx = RID_NONE;
       return;
+#endif
     }
     break;
   default:
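dispofs() (introduced elsewhere in this patch; its exact definition is assumed here) measures the distance from the dispatch table that LJ_GC64 keeps in a reserved register, turning nearby objects into [RID_DISPATCH+disp32] operands. The upvalue case checks both ofs and ofs+4 because the fused load may touch the TValue as two 4-byte halves. Sketch of the idea:

    /* Hypothetical helper shape: */
    static int disp_reachable(intptr_t p, intptr_t dispatch, int32_t *ofs)
    {
      intptr_t d = p - dispatch;
      if (d != (int32_t)d) return 0;   /* outside +-2GB of DISPATCH */
      *ofs = (int32_t)d;               /* use [RID_DISPATCH + *ofs] */
      return 1;
    }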
@@ -205,14 +231,40 @@ static void asm_fuseahuref(ASMState *as, IRRef ref, RegSet allow)
 static void asm_fusefref(ASMState *as, IRIns *ir, RegSet allow)
 {
   lua_assert(ir->o == IR_FLOAD || ir->o == IR_FREF);
-  as->mrm.ofs = field_ofs[ir->op2];
   as->mrm.idx = RID_NONE;
+  if (ir->op1 == REF_NIL) {
+#if LJ_GC64
+    as->mrm.ofs = (int32_t)(ir->op2 << 2) - GG_OFS(dispatch);
+    as->mrm.base = RID_DISPATCH;
+#else
+    as->mrm.ofs = (int32_t)(ir->op2 << 2) + ptr2addr(J2GG(as->J));
+    as->mrm.base = RID_NONE;
+#endif
+    return;
+  }
+  as->mrm.ofs = field_ofs[ir->op2];
   if (irref_isk(ir->op1)) {
-    as->mrm.ofs += IR(ir->op1)->i;
+    IRIns *op1 = IR(ir->op1);
+#if LJ_GC64
+    if (ir->op1 == REF_NIL) {
+      as->mrm.ofs -= GG_OFS(dispatch);
+      as->mrm.base = RID_DISPATCH;
+      return;
+    } else if (op1->o == IR_KPTR || op1->o == IR_KKPTR) {
+      intptr_t ofs = dispofs(as, ir_kptr(op1));
+      if (checki32(as->mrm.ofs + ofs)) {
+        as->mrm.ofs += (int32_t)ofs;
+        as->mrm.base = RID_DISPATCH;
+        return;
+      }
+    }
+#else
+    as->mrm.ofs += op1->i;
     as->mrm.base = RID_NONE;
-  } else {
-    as->mrm.base = (uint8_t)ra_alloc1(as, ir->op1, allow);
+    return;
+#endif
   }
+  as->mrm.base = (uint8_t)ra_alloc1(as, ir->op1, allow);
 }
 
 /* Fuse string reference into memory operand. */
@@ -223,7 +275,7 @@ static void asm_fusestrref(ASMState *as, IRIns *ir, RegSet allow)
   as->mrm.base = as->mrm.idx = RID_NONE;
   as->mrm.scale = XM_SCALE1;
   as->mrm.ofs = sizeof(GCstr);
-  if (irref_isk(ir->op1)) {
+  if (!LJ_GC64 && irref_isk(ir->op1)) {
     as->mrm.ofs += IR(ir->op1)->i;
   } else {
     Reg r = ra_alloc1(as, ir->op1, allow);
@@ -255,10 +307,20 @@ static void asm_fusexref(ASMState *as, IRRef ref, RegSet allow)
   IRIns *ir = IR(ref);
   as->mrm.idx = RID_NONE;
   if (ir->o == IR_KPTR || ir->o == IR_KKPTR) {
+#if LJ_GC64
+    intptr_t ofs = dispofs(as, ir_kptr(ir));
+    if (checki32(ofs)) {
+      as->mrm.ofs = (int32_t)ofs;
+      as->mrm.base = RID_DISPATCH;
+      return;
+    }
+  } if (0) {
+#else
     as->mrm.ofs = ir->i;
     as->mrm.base = RID_NONE;
   } else if (ir->o == IR_STRREF) {
     asm_fusestrref(as, ir, allow);
+#endif
   } else {
     as->mrm.ofs = 0;
     if (canfuse(as, ir) && ir->o == IR_ADD && ra_noreg(ir->r)) {
@@ -301,7 +363,46 @@ static void asm_fusexref(ASMState *as, IRRef ref, RegSet allow)
   }
 }
 
-/* Fuse load into memory operand. */
+/* Fuse load of 64 bit IR constant into memory operand. */
+static Reg asm_fuseloadk64(ASMState *as, IRIns *ir)
+{
+  const uint64_t *k = &ir_k64(ir)->u64;
+  if (!LJ_GC64 || checki32((intptr_t)k)) {
+    as->mrm.ofs = ptr2addr(k);
+    as->mrm.base = RID_NONE;
+#if LJ_GC64
+  } else if (checki32(dispofs(as, k))) {
+    as->mrm.ofs = (int32_t)dispofs(as, k);
+    as->mrm.base = RID_DISPATCH;
+  } else if (checki32(mcpofs(as, k)) && checki32(mcpofs(as, k+1)) &&
+             checki32(mctopofs(as, k)) && checki32(mctopofs(as, k+1))) {
+    as->mrm.ofs = (int32_t)mcpofs(as, k);
+    as->mrm.base = RID_RIP;
+  } else {
+    if (ir->i) {
+      lua_assert(*k == *(uint64_t*)(as->mctop - ir->i));
+    } else {
+      while ((uintptr_t)as->mcbot & 7) *as->mcbot++ = XI_INT3;
+      *(uint64_t*)as->mcbot = *k;
+      ir->i = (int32_t)(as->mctop - as->mcbot);
+      as->mcbot += 8;
+      as->mclim = as->mcbot + MCLIM_REDZONE;
+      lj_mcode_commitbot(as->J, as->mcbot);
+    }
+    as->mrm.ofs = (int32_t)mcpofs(as, as->mctop - ir->i);
+    as->mrm.base = RID_RIP;
+#endif
+  }
+  as->mrm.idx = RID_NONE;
+  return RID_MRM;
+}
+
+/* Fuse load into memory operand.
+**
+** Important caveat: this may emit RIP-relative loads! So don't place any
+** code emitters between this function and the use of its result.
+** The only permitted exception is asm_guardcc().
+*/
 static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
 {
   IRIns *ir = IR(ref);
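asm_fuseloadk64() tries progressively weaker addressing modes for a 64-bit constant: an absolute 32-bit address (always true without LJ_GC64), a dispatch-relative offset, a RIP-relative offset to the constant where it already lives, and finally an 8-byte copy embedded in the trace's own machine-code area (INT3-padded to 8-byte alignment and cached via ir->i). A condensed model of the ladder (helper names assumed, addresses as plain integers):

    /* 0: [disp32]  1: [DISPATCH+ofs]  2: [RIP+ofs]  3: embed, then [RIP+ofs] */
    static int k64_addr_mode(int64_t k, int64_t dispatch, int64_t rip)
    {
      if (k == (int32_t)k) return 0;
      if (k - dispatch == (int32_t)(k - dispatch)) return 1;
      if (k - rip == (int32_t)(k - rip)) return 2;
      return 3;
    }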
@@ -320,26 +421,35 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
     if (ir->o == IR_KNUM) {
       RegSet avail = as->freeset & ~as->modset & RSET_FPR;
       lua_assert(allow != RSET_EMPTY);
-      if (!(avail & (avail-1))) {  /* Fuse if less than two regs available. */
-        as->mrm.ofs = ptr2addr(ir_knum(ir));
-        as->mrm.base = as->mrm.idx = RID_NONE;
-        return RID_MRM;
-      }
+      if (!(avail & (avail-1)))  /* Fuse if less than two regs available. */
+        return asm_fuseloadk64(as, ir);
     } else if (ref == REF_BASE || ir->o == IR_KINT64) {
       RegSet avail = as->freeset & ~as->modset & RSET_GPR;
       lua_assert(allow != RSET_EMPTY);
       if (!(avail & (avail-1))) {  /* Fuse if less than two regs available. */
-        as->mrm.ofs = ptr2addr(ref == REF_BASE ? (void *)&J2G(as->J)->jit_base : (void *)ir_kint64(ir));
-        as->mrm.base = as->mrm.idx = RID_NONE;
-        return RID_MRM;
+        if (ref == REF_BASE) {
+#if LJ_GC64
+          as->mrm.ofs = (int32_t)dispofs(as, &J2G(as->J)->jit_base);
+          as->mrm.base = RID_DISPATCH;
+#else
+          as->mrm.ofs = ptr2addr(&J2G(as->J)->jit_base);
+          as->mrm.base = RID_NONE;
+#endif
+          as->mrm.idx = RID_NONE;
+          return RID_MRM;
+        } else {
+          return asm_fuseloadk64(as, ir);
+        }
       }
     } else if (mayfuse(as, ref)) {
       RegSet xallow = (allow & RSET_GPR) ? allow : RSET_GPR;
       if (ir->o == IR_SLOAD) {
         if (!(ir->op2 & (IRSLOAD_PARENT|IRSLOAD_CONVERT)) &&
-            noconflict(as, ref, IR_RETF, 0)) {
+            noconflict(as, ref, IR_RETF, 0) &&
+            !(LJ_GC64 && irt_isaddr(ir->t))) {
           as->mrm.base = (uint8_t)ra_alloc1(as, REF_BASE, xallow);
-          as->mrm.ofs = 8*((int32_t)ir->op1-1) + ((ir->op2&IRSLOAD_FRAME)?4:0);
+          as->mrm.ofs = 8*((int32_t)ir->op1-1-LJ_FR2) +
+                        (!LJ_FR2 && (ir->op2 & IRSLOAD_FRAME) ? 4 : 0);
           as->mrm.idx = RID_NONE;
           return RID_MRM;
         }
@@ -351,7 +461,8 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
         return RID_MRM;
       }
     } else if (ir->o == IR_ALOAD || ir->o == IR_HLOAD || ir->o == IR_ULOAD) {
-      if (noconflict(as, ref, ir->o + IRDELTA_L2S, 0)) {
+      if (noconflict(as, ref, ir->o + IRDELTA_L2S, 0) &&
+          !(LJ_GC64 && irt_isaddr(ir->t))) {
         asm_fuseahuref(as, ir->op1, xallow);
         return RID_MRM;
       }
@@ -364,11 +475,15 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
         asm_fusexref(as, ir->op1, xallow);
         return RID_MRM;
       }
-    } else if (ir->o == IR_VLOAD) {
+    } else if (ir->o == IR_VLOAD && !(LJ_GC64 && irt_isaddr(ir->t))) {
       asm_fuseahuref(as, ir->op1, xallow);
       return RID_MRM;
     }
   }
+  if (ir->o == IR_FLOAD && ir->op1 == REF_NIL) {
+    asm_fusefref(as, ir, RSET_EMPTY);
+    return RID_MRM;
+  }
   if (!(as->freeset & allow) && !emit_canremat(ref) &&
       (allow == RSET_EMPTY || ra_hasspill(ir->s) || iscrossref(as, ref)))
     goto fusespill;
@@ -392,7 +507,7 @@ static Reg asm_fuseloadm(ASMState *as, IRRef ref, RegSet allow, int is64)
 /* Count the required number of stack slots for a call. */
 static int asm_count_call_slots(ASMState *as, const CCallInfo *ci, IRRef *args)
 {
-  uint32_t i, nargs = CCI_NARGS(ci);
+  uint32_t i, nargs = CCI_XNARGS(ci);
   int nslots = 0;
 #if LJ_64
   if (LJ_ABI_WIN) {
@@ -425,7 +540,7 @@ static int asm_count_call_slots(ASMState *as, const CCallInfo *ci, IRRef *args)
 /* Generate a call to a C function. */
 static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
 {
-  uint32_t n, nargs = CCI_NARGS(ci);
+  uint32_t n, nargs = CCI_XNARGS(ci);
   int32_t ofs = STACKARG_OFS;
 #if LJ_64
   uint32_t gprs = REGARG_GPRS;
@@ -485,8 +600,8 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
     if (r) {  /* Argument is in a register. */
       if (r < RID_MAX_GPR && ref < ASMREF_TMP1) {
 #if LJ_64
-        if (ir->o == IR_KINT64)
-          emit_loadu64(as, r, ir_kint64(ir)->u64);
+        if (LJ_GC64 ? !(ir->o == IR_KINT || ir->o == IR_KNULL) : ir->o == IR_KINT64)
+          emit_loadu64(as, r, ir_k64(ir)->u64);
         else
 #endif
           emit_loadi(as, r, ir->i);
@@ -560,7 +675,7 @@ static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci)
     if (ra_hasreg(dest)) {
       ra_free(as, dest);
       ra_modified(as, dest);
-      emit_rmro(as, irt_isnum(ir->t) ? XMM_MOVRM(as) : XO_MOVSS,
+      emit_rmro(as, irt_isnum(ir->t) ? XO_MOVSD : XO_MOVSS,
                 dest, RID_ESP, ofs);
     }
     if ((ci->flags & CCI_CASTU64)) {
@@ -584,15 +699,6 @@ static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci)
   }
 }
 
-static void asm_call(ASMState *as, IRIns *ir)
-{
-  IRRef args[CCI_NARGS_MAX];
-  const CCallInfo *ci = &lj_ir_callinfo[ir->op2];
-  asm_collectargs(as, ir, ci, args);
-  asm_setupresult(as, ir, ci);
-  asm_gencall(as, ci, args);
-}
-
 /* Return a constant function pointer or NULL for indirect calls. */
 static void *asm_callx_func(ASMState *as, IRIns *irf, IRRef func)
 {
@@ -651,15 +757,23 @@ static void asm_callx(ASMState *as, IRIns *ir)
 static void asm_retf(ASMState *as, IRIns *ir)
 {
   Reg base = ra_alloc1(as, REF_BASE, RSET_GPR);
+#if LJ_FR2
+  Reg rpc = ra_scratch(as, rset_exclude(RSET_GPR, base));
+#endif
   void *pc = ir_kptr(IR(ir->op2));
-  int32_t delta = 1+bc_a(*((const BCIns *)pc - 1));
+  int32_t delta = 1+LJ_FR2+bc_a(*((const BCIns *)pc - 1));
   as->topslot -= (BCReg)delta;
   if ((int32_t)as->topslot < 0) as->topslot = 0;
   irt_setmark(IR(REF_BASE)->t);  /* Children must not coalesce with BASE reg. */
   emit_setgl(as, base, jit_base);
   emit_addptr(as, base, -8*delta);
   asm_guardcc(as, CC_NE);
+#if LJ_FR2
+  emit_rmro(as, XO_CMP, rpc|REX_GC64, base, -8);
+  emit_loadu64(as, rpc, u64ptr(pc));
+#else
   emit_gmroi(as, XG_ARITHi(XOg_CMP), base, -4, ptr2addr(pc));
+#endif
 }
 
 /* -- Type conversions ---------------------------------------------------- */
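LJ_FR2 (implied by LJ_GC64) gives the frame link its own 8-byte stack slot, so the return PC is a full 64-bit value one slot below the base rather than a 32-bit tag packed at byte offset -4. That is why delta gains +LJ_FR2 and the guard becomes a 64-bit compare against a scratch register holding the PC. Layout sketch (assumed, in TValue slots below the base):

    /* !LJ_FR2: base[-1] packs the frame link/PC into one TValue slot.
    ** LJ_FR2:  base[-1] holds the 64-bit PC itself,
    **          base[-2] holds the function object.
    ** Hence: delta = 1 + LJ_FR2 + bc_a(*((const BCIns *)pc - 1)). */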
@@ -672,8 +786,7 @@ static void asm_tointg(ASMState *as, IRIns *ir, Reg left)
   asm_guardcc(as, CC_NE);
   emit_rr(as, XO_UCOMISD, left, tmp);
   emit_rr(as, XO_CVTSI2SD, tmp, dest);
-  if (!(as->flags & JIT_F_SPLIT_XMM))
-    emit_rr(as, XO_XORPS, tmp, tmp);  /* Avoid partial register stall. */
+  emit_rr(as, XO_XORPS, tmp, tmp);  /* Avoid partial register stall. */
   emit_rr(as, XO_CVTTSD2SI, dest, left);
   /* Can't fuse since left is needed twice. */
 }
@@ -684,8 +797,9 @@ static void asm_tobit(ASMState *as, IRIns *ir)
   Reg tmp = ra_noreg(IR(ir->op1)->r) ?
               ra_alloc1(as, ir->op1, RSET_FPR) :
               ra_scratch(as, RSET_FPR);
-  Reg right = asm_fuseload(as, ir->op2, rset_exclude(RSET_FPR, tmp));
+  Reg right;
   emit_rr(as, XO_MOVDto, tmp, dest);
+  right = asm_fuseload(as, ir->op2, rset_exclude(RSET_FPR, tmp));
   emit_mrm(as, XO_ADDSD, tmp, right);
   ra_left(as, tmp, ir->op1);
 }
@@ -706,13 +820,13 @@ static void asm_conv(ASMState *as, IRIns *ir)
     if (left == dest) return;  /* Avoid the XO_XORPS. */
   } else if (LJ_32 && st == IRT_U32) {  /* U32 to FP conversion on x86. */
     /* number = (2^52+2^51 .. u32) - (2^52+2^51) */
-    cTValue *k = lj_ir_k64_find(as->J, U64x(43380000,00000000));
+    cTValue *k = &as->J->k64[LJ_K64_TOBIT];
     Reg bias = ra_scratch(as, rset_exclude(RSET_FPR, dest));
     if (irt_isfloat(ir->t))
       emit_rr(as, XO_CVTSD2SS, dest, dest);
     emit_rr(as, XO_SUBSD, dest, bias);  /* Subtract 2^52+2^51 bias. */
     emit_rr(as, XO_XORPS, dest, bias);  /* Merge bias and integer. */
-    emit_loadn(as, bias, k);
+    emit_rma(as, XO_MOVSD, bias, k);
     emit_mrm(as, XO_MOVD, dest, asm_fuseload(as, lref, RSET_GPR));
     return;
   } else {  /* Integer to FP conversion. */
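A worked example of why the LJ_K64_TOBIT bias (2^52+2^51, bit pattern 0x4338000000000000) converts a u32 exactly: OR-ing the integer into the low mantissa bits of that double produces exactly the value 2^52+2^51+u32, and subtracting the bias back out leaves (double)u32. Standalone C model (illustration, not LuaJIT code):

    #include <stdint.h>
    #include <string.h>

    static double u32_to_double_via_bias(uint32_t u)
    {
      uint64_t bits = 0x4338000000000000ull | u;  /* merge bias and int */
      double d;
      memcpy(&d, &bits, sizeof(d));
      return d - 6755399441055744.0;  /* subtract 2^52 + 2^51 */
    }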
@@ -721,7 +835,7 @@ static void asm_conv(ASMState *as, IRIns *ir)
                      asm_fuseloadm(as, lref, RSET_GPR, st64);
     if (LJ_64 && st == IRT_U64) {
       MCLabel l_end = emit_label(as);
-      const void *k = lj_ir_k64_find(as->J, U64x(43f00000,00000000));
+      cTValue *k = &as->J->k64[LJ_K64_2P64];
       emit_rma(as, XO_ADDSD, dest, k);  /* Add 2^64 to compensate. */
       emit_sjcc(as, CC_NS, l_end);
       emit_rr(as, XO_TEST, left|REX_64, left);  /* Check if u64 >= 2^63. */
@@ -729,8 +843,7 @@ static void asm_conv(ASMState *as, IRIns *ir)
       emit_mrm(as, irt_isnum(ir->t) ? XO_CVTSI2SD : XO_CVTSI2SS,
                dest|((LJ_64 && (st64 || st == IRT_U32)) ? REX_64 : 0), left);
     }
-    if (!(as->flags & JIT_F_SPLIT_XMM))
-      emit_rr(as, XO_XORPS, dest, dest);  /* Avoid partial register stall. */
+    emit_rr(as, XO_XORPS, dest, dest);  /* Avoid partial register stall. */
   } else if (stfp) {  /* FP to integer conversion. */
     if (irt_isguard(ir->t)) {
       /* Checked conversions are only supported from number to int. */
@@ -738,9 +851,7 @@ static void asm_conv(ASMState *as, IRIns *ir)
       asm_tointg(as, ir, ra_alloc1(as, lref, RSET_FPR));
     } else {
       Reg dest = ra_dest(as, ir, RSET_GPR);
-      x86Op op = st == IRT_NUM ?
-        ((ir->op2 & IRCONV_TRUNC) ? XO_CVTTSD2SI : XO_CVTSD2SI) :
-        ((ir->op2 & IRCONV_TRUNC) ? XO_CVTTSS2SI : XO_CVTSS2SI);
+      x86Op op = st == IRT_NUM ? XO_CVTTSD2SI : XO_CVTTSS2SI;
       if (LJ_64 ? irt_isu64(ir->t) : irt_isu32(ir->t)) {
         /* LJ_64: For inputs >= 2^63 add -2^64, convert again. */
         /* LJ_32: For inputs >= 2^31 add -2^31, convert again and add 2^31. */
@@ -751,23 +862,20 @@ static void asm_conv(ASMState *as, IRIns *ir)
         emit_gri(as, XG_ARITHi(XOg_ADD), dest, (int32_t)0x80000000);
         emit_rr(as, op, dest|REX_64, tmp);
         if (st == IRT_NUM)
-          emit_rma(as, XO_ADDSD, tmp, lj_ir_k64_find(as->J,
-            LJ_64 ? U64x(c3f00000,00000000) : U64x(c1e00000,00000000)));
+          emit_rma(as, XO_ADDSD, tmp, &as->J->k64[LJ_K64_M2P64_31]);
         else
-          emit_rma(as, XO_ADDSS, tmp, lj_ir_k64_find(as->J,
-            LJ_64 ? U64x(00000000,df800000) : U64x(00000000,cf000000)));
+          emit_rma(as, XO_ADDSS, tmp, &as->J->k32[LJ_K32_M2P64_31]);
         emit_sjcc(as, CC_NS, l_end);
         emit_rr(as, XO_TEST, dest|REX_64, dest);  /* Check if dest negative. */
         emit_rr(as, op, dest|REX_64, tmp);
         ra_left(as, tmp, lref);
       } else {
-        Reg left = asm_fuseload(as, lref, RSET_FPR);
         if (LJ_64 && irt_isu32(ir->t))
           emit_rr(as, XO_MOV, dest, dest);  /* Zero hiword. */
         emit_mrm(as, op,
                  dest|((LJ_64 &&
                         (irt_is64(ir->t) || irt_isu32(ir->t))) ? REX_64 : 0),
-                 left);
+                 asm_fuseload(as, lref, RSET_FPR));
       }
     }
   } else if (st >= IRT_I8 && st <= IRT_U16) {  /* Extend to 32 bit integer. */
@@ -834,16 +942,14 @@ static void asm_conv_fp_int64(ASMState *as, IRIns *ir)
   if (ra_hasreg(dest)) {
     ra_free(as, dest);
     ra_modified(as, dest);
-    emit_rmro(as, irt_isnum(ir->t) ? XMM_MOVRM(as) : XO_MOVSS,
-              dest, RID_ESP, ofs);
+    emit_rmro(as, irt_isnum(ir->t) ? XO_MOVSD : XO_MOVSS, dest, RID_ESP, ofs);
   }
   emit_rmro(as, irt_isnum(ir->t) ? XO_FSTPq : XO_FSTPd,
             irt_isnum(ir->t) ? XOg_FSTPq : XOg_FSTPd, RID_ESP, ofs);
   if (((ir-1)->op2 & IRCONV_SRCMASK) == IRT_U64) {
     /* For inputs in [2^63,2^64-1] add 2^64 to compensate. */
     MCLabel l_end = emit_label(as);
-    emit_rma(as, XO_FADDq, XOg_FADDq,
-             lj_ir_k64_find(as->J, U64x(43f00000,00000000)));
+    emit_rma(as, XO_FADDq, XOg_FADDq, &as->J->k64[LJ_K64_2P64]);
     emit_sjcc(as, CC_NS, l_end);
     emit_rr(as, XO_TEST, hi, hi);  /* Check if u64 >= 2^63. */
   } else {
@@ -863,7 +969,6 @@ static void asm_conv_int64_fp(ASMState *as, IRIns *ir)
   Reg lo, hi;
   lua_assert(st == IRT_NUM || st == IRT_FLOAT);
   lua_assert(dt == IRT_I64 || dt == IRT_U64);
-  lua_assert(((ir-1)->op2 & IRCONV_TRUNC));
   hi = ra_dest(as, ir, RSET_GPR);
   lo = ra_dest(as, ir-1, rset_exclude(RSET_GPR, hi));
   if (ra_used(ir-1)) emit_rmro(as, XO_MOV, lo, RID_ESP, 0);
@@ -884,8 +989,7 @@ static void asm_conv_int64_fp(ASMState *as, IRIns *ir)
     emit_rmro(as, XO_FISTTPq, XOg_FISTTPq, RID_ESP, 0);
   else
     emit_rmro(as, XO_FISTPq, XOg_FISTPq, RID_ESP, 0);
-  emit_rma(as, XO_FADDq, XOg_FADDq,
-           lj_ir_k64_find(as->J, U64x(c3f00000,00000000)));
+  emit_rma(as, XO_FADDq, XOg_FADDq, &as->J->k64[LJ_K64_M2P64]);
   emit_sjcc(as, CC_NS, l_pop);
   emit_rr(as, XO_TEST, hi, hi);  /* Check if out-of-range (2^63). */
   }
@@ -906,6 +1010,14 @@ static void asm_conv_int64_fp(ASMState *as, IRIns *ir)
             st == IRT_NUM ? XOg_FLDq: XOg_FLDd,
             asm_fuseload(as, ir->op1, RSET_EMPTY));
 }
+
+static void asm_conv64(ASMState *as, IRIns *ir)
+{
+  if (irt_isfp(ir->t))
+    asm_conv_fp_int64(as, ir);
+  else
+    asm_conv_int64_fp(as, ir);
+}
 #endif
 
 static void asm_strto(ASMState *as, IRIns *ir)
@@ -927,54 +1039,60 @@ static void asm_strto(ASMState *as, IRIns *ir)
               RID_ESP, sps_scale(ir->s));
 }
 
-static void asm_tostr(ASMState *as, IRIns *ir)
+/* -- Memory references --------------------------------------------------- */
+
+/* Get pointer to TValue. */
+static void asm_tvptr(ASMState *as, Reg dest, IRRef ref)
 {
-  IRIns *irl = IR(ir->op1);
-  IRRef args[2];
-  args[0] = ASMREF_L;
-  as->gcsteps++;
-  if (irt_isnum(irl->t)) {
-    const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_str_fromnum];
-    args[1] = ASMREF_TMP1;  /* const lua_Number * */
-    asm_setupresult(as, ir, ci);  /* GCstr * */
-    asm_gencall(as, ci, args);
-    emit_rmro(as, XO_LEA, ra_releasetmp(as, ASMREF_TMP1)|REX_64,
-              RID_ESP, ra_spill(as, irl));
+  IRIns *ir = IR(ref);
+  if (irt_isnum(ir->t)) {
+    /* For numbers use the constant itself or a spill slot as a TValue. */
+    if (irref_isk(ref))
+      emit_loada(as, dest, ir_knum(ir));
+    else
+      emit_rmro(as, XO_LEA, dest|REX_64, RID_ESP, ra_spill(as, ir));
   } else {
-    const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_str_fromint];
-    args[1] = ir->op1;  /* int32_t k */
-    asm_setupresult(as, ir, ci);  /* GCstr * */
-    asm_gencall(as, ci, args);
+    /* Otherwise use g->tmptv to hold the TValue. */
+#if LJ_GC64
+    if (irref_isk(ref)) {
+      TValue k;
+      lj_ir_kvalue(as->J->L, &k, ir);
+      emit_movmroi(as, dest, 4, k.u32.hi);
+      emit_movmroi(as, dest, 0, k.u32.lo);
+    } else {
+      /* TODO: 64 bit store + 32 bit load-modify-store is suboptimal. */
+      Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, dest));
+      if (irt_is64(ir->t)) {
+        emit_u32(as, irt_toitype(ir->t) << 15);
+        emit_rmro(as, XO_ARITHi, XOg_OR, dest, 4);
+      } else {
+        /* Currently, no caller passes integers that might end up here. */
+        emit_movmroi(as, dest, 4, (irt_toitype(ir->t) << 15));
+      }
+      emit_movtomro(as, REX_64IR(ir, src), dest, 0);
+    }
+#else
+    if (!irref_isk(ref)) {
+      Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, dest));
+      emit_movtomro(as, REX_64IR(ir, src), dest, 0);
+    } else if (!irt_ispri(ir->t)) {
+      emit_movmroi(as, dest, 0, ir->i);
+    }
+    if (!(LJ_64 && irt_islightud(ir->t)))
+      emit_movmroi(as, dest, 4, irt_toitype(ir->t));
+#endif
+    emit_loada(as, dest, &J2G(as->J)->tmptv);
   }
 }
 
-/* -- Memory references --------------------------------------------------- */
-
 static void asm_aref(ASMState *as, IRIns *ir)
 {
   Reg dest = ra_dest(as, ir, RSET_GPR);
   asm_fusearef(as, ir, RSET_GPR);
   if (!(as->mrm.idx == RID_NONE && as->mrm.ofs == 0))
-    emit_mrm(as, XO_LEA, dest, RID_MRM);
+    emit_mrm(as, XO_LEA, dest|REX_GC64, RID_MRM);
   else if (as->mrm.base != dest)
-    emit_rr(as, XO_MOV, dest, as->mrm.base);
-}
-
-/* Merge NE(HREF, niltv) check. */
-static MCode *merge_href_niltv(ASMState *as, IRIns *ir)
-{
-  /* Assumes nothing else generates NE of HREF. */
-  if ((ir[1].o == IR_NE || ir[1].o == IR_EQ) && ir[1].op1 == as->curins &&
-      ra_hasreg(ir->r)) {
-    MCode *p = as->mcp;
-    p += (LJ_64 && *p != XI_ARITHi) ? 7+6 : 6+6;
-    /* Ensure no loop branch inversion happened. */
-    if (p[-6] == 0x0f && p[-5] == XI_JCCn+(CC_NE^(ir[1].o & 1))) {
-      as->mcp = p;  /* Kill cmp reg, imm32 + jz exit. */
-      return p + *(int32_t *)(p-4);  /* Return exit address. */
-    }
-  }
-  return NULL;
+    emit_rr(as, XO_MOV, dest|REX_GC64, as->mrm.base);
 }
 
 /* Inlined hash lookup. Specialized for key type and for const keys.
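asm_tvptr() stores GC64 TValues as two 32-bit halves, which is where the recurring (itype << 15) comes from: the 17-bit type tag occupies bits 47..63 of the 64-bit TValue, i.e. bits 15..31 of its high word. Sketch of the correspondence (layout as used throughout this patch):

    #include <stdint.h>

    static uint32_t tv_hi_word(int32_t itype, uint64_t payload47)
    {
      uint64_t tv = ((uint64_t)itype << 47) | payload47;
      return (uint32_t)(tv >> 32);  /* tag bits == (uint32_t)(itype << 15) */
    }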
@@ -985,10 +1103,10 @@ static MCode *merge_href_niltv(ASMState *as, IRIns *ir)
 **   } while ((n = nextnode(n)));
 **   return niltv(L);
 */
-static void asm_href(ASMState *as, IRIns *ir)
+static void asm_href(ASMState *as, IRIns *ir, IROp merge)
 {
-  MCode *nilexit = merge_href_niltv(as, ir);  /* Do this before any restores. */
   RegSet allow = RSET_GPR;
+  int destused = ra_used(ir);
   Reg dest = ra_dest(as, ir, allow);
   Reg tab = ra_alloc1(as, ir->op1, rset_clear(allow, dest));
   Reg key = RID_NONE, tmp = RID_NONE;
@@ -1001,28 +1119,26 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge)
   if (!isk) {
     rset_clear(allow, tab);
     key = ra_alloc1(as, ir->op2, irt_isnum(kt) ? RSET_FPR : allow);
-    if (!irt_isstr(kt))
+    if (LJ_GC64 || !irt_isstr(kt))
       tmp = ra_scratch(as, rset_exclude(allow, key));
   }
 
-  /* Key not found in chain: jump to exit (if merged with NE) or load niltv. */
+  /* Key not found in chain: jump to exit (if merged) or load niltv. */
   l_end = emit_label(as);
-  if (nilexit && ir[1].o == IR_NE) {
-    emit_jcc(as, CC_E, nilexit);  /* XI_JMP is not found by lj_asm_patchexit. */
-    nilexit = NULL;
-  } else {
+  if (merge == IR_NE)
+    asm_guardcc(as, CC_E);  /* XI_JMP is not found by lj_asm_patchexit. */
+  else if (destused)
     emit_loada(as, dest, niltvg(J2G(as->J)));
-  }
 
   /* Follow hash chain until the end. */
   l_loop = emit_sjcc_label(as, CC_NZ);
-  emit_rr(as, XO_TEST, dest, dest);
-  emit_rmro(as, XO_MOV, dest, dest, offsetof(Node, next));
+  emit_rr(as, XO_TEST, dest|REX_GC64, dest);
+  emit_rmro(as, XO_MOV, dest|REX_GC64, dest, offsetof(Node, next));
   l_next = emit_label(as);
 
   /* Type and value comparison. */
-  if (nilexit)
-    emit_jcc(as, CC_E, nilexit);
+  if (merge == IR_EQ)
+    asm_guardcc(as, CC_E);
   else
     emit_sjcc(as, CC_E, l_end);
   if (irt_isnum(kt)) {
@@ -1038,7 +1154,7 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge)
       emit_rmro(as, XO_UCOMISD, key, dest, offsetof(Node, key.n));
       emit_sjcc(as, CC_AE, l_next);
       /* The type check avoids NaN penalties and complaints from Valgrind. */
-#if LJ_64
+#if LJ_64 && !LJ_GC64
       emit_u32(as, LJ_TISNUM);
       emit_rmro(as, XO_ARITHi, XOg_CMP, dest, offsetof(Node, key.it));
 #else
@@ -1046,10 +1162,28 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge)
       emit_rmro(as, XO_ARITHi8, XOg_CMP, dest, offsetof(Node, key.it));
 #endif
     }
-#if LJ_64
+#if LJ_64 && !LJ_GC64
   } else if (irt_islightud(kt)) {
     emit_rmro(as, XO_CMP, key|REX_64, dest, offsetof(Node, key.u64));
 #endif
+#if LJ_GC64
+  } else if (irt_isaddr(kt)) {
+    if (isk) {
+      TValue k;
+      k.u64 = ((uint64_t)irt_toitype(irkey->t) << 47) | irkey[1].tv.u64;
+      emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.u32.lo),
+                 k.u32.lo);
+      emit_sjcc(as, CC_NE, l_next);
+      emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.u32.hi),
+                 k.u32.hi);
+    } else {
+      emit_rmro(as, XO_CMP, tmp|REX_64, dest, offsetof(Node, key.u64));
+    }
+  } else {
+    lua_assert(irt_ispri(kt) && !irt_isnil(kt));
+    emit_u32(as, (irt_toitype(kt)<<15)|0x7fff);
+    emit_rmro(as, XO_ARITHi, XOg_CMP, dest, offsetof(Node, key.it));
+#else
   } else {
     if (!irt_ispri(kt)) {
       lua_assert(irt_isaddr(kt));
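x86-64 has no cmp m64,imm64, so the constant-key path above compares the tagged 64-bit key as two 32-bit halves, bailing out to the next hash node as soon as one half differs. Rough C equivalent (the emitter works backwards, so the hi-word compare executes first):

    if (n->key.u32.hi != k.u32.hi) goto l_next;
    if (n->key.u32.lo != k.u32.lo) goto l_next;
    /* key matches */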
@@ -1063,16 +1197,23 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge)
     lua_assert(!irt_isnil(kt));
     emit_i8(as, irt_toitype(kt));
     emit_rmro(as, XO_ARITHi8, XOg_CMP, dest, offsetof(Node, key.it));
+#endif
   }
   emit_sfixup(as, l_loop);
   checkmclim(as);
+#if LJ_GC64
+  if (!isk && irt_isaddr(kt)) {
+    emit_rr(as, XO_OR, tmp|REX_64, key);
+    emit_loadu64(as, tmp, (uint64_t)irt_toitype(kt) << 47);
+  }
+#endif
 
   /* Load main position relative to tab->node into dest. */
   khash = isk ? ir_khash(irkey) : 1;
   if (khash == 0) {
-    emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, node));
+    emit_rmro(as, XO_MOV, dest|REX_GC64, tab, offsetof(GCtab, node));
   } else {
-    emit_rmro(as, XO_ARITH(XOg_ADD), dest, tab, offsetof(GCtab, node));
+    emit_rmro(as, XO_ARITH(XOg_ADD), dest|REX_GC64, tab, offsetof(GCtab,node));
     if ((as->flags & JIT_F_PREFER_IMUL)) {
       emit_i8(as, sizeof(Node));
       emit_rr(as, XO_IMULi8, dest, dest);
@@ -1107,7 +1248,19 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge)
 #endif
     } else {
       emit_rr(as, XO_MOV, tmp, key);
+#if LJ_GC64
+      checkmclim(as);
+      emit_gri(as, XG_ARITHi(XOg_XOR), dest, irt_toitype(kt) << 15);
+      if ((as->flags & JIT_F_BMI2)) {
+        emit_i8(as, 32);
+        emit_mrm(as, XV_RORX|VEX_64, dest, key);
+      } else {
+        emit_shifti(as, XOg_SHR|REX_64, dest, 32);
+        emit_rr(as, XO_MOV, dest|REX_64, key|REX_64);
+      }
+#else
       emit_rmro(as, XO_LEA, dest, key, HASH_BIAS);
+#endif
     }
   }
 }
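For a non-constant GC64 key the hash inputs are formed by splitting the tagged 64-bit value and XOR-ing the type tag back out of the high word; RORX (BMI2) does the 32-bit split in one instruction without a separate MOV. Rough C equivalent of the emitted sequence, in execution order (the hashrot()-style mixing that follows is unchanged; name assumed from lj_tab.c):

    uint32_t lo = (uint32_t)key;                                    /* tmp  */
    uint32_t hi = (uint32_t)(key >> 32) ^ ((uint32_t)itype << 15);  /* dest */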
@@ -1127,11 +1280,11 @@ static void asm_hrefk(ASMState *as, IRIns *ir)
   if (ra_hasreg(dest)) {
     if (ofs != 0) {
       if (dest == node && !(as->flags & JIT_F_LEA_AGU))
-        emit_gri(as, XG_ARITHi(XOg_ADD), dest, ofs);
+        emit_gri(as, XG_ARITHi(XOg_ADD), dest|REX_GC64, ofs);
       else
-        emit_rmro(as, XO_LEA, dest, node, ofs);
+        emit_rmro(as, XO_LEA, dest|REX_GC64, node, ofs);
     } else if (dest != node) {
-      emit_rr(as, XO_MOV, dest, node);
+      emit_rr(as, XO_MOV, dest|REX_GC64, node);
     }
   }
   asm_guardcc(as, CC_NE);
@@ -1143,13 +1296,24 @@ static void asm_hrefk(ASMState *as, IRIns *ir)
     lua_assert(irt_isnum(irkey->t) || irt_isgcv(irkey->t));
     /* Assumes -0.0 is already canonicalized to +0.0. */
     emit_loadu64(as, key, irt_isnum(irkey->t) ? ir_knum(irkey)->u64 :
+#if LJ_GC64
+                 ((uint64_t)irt_toitype(irkey->t) << 47) |
+                 (uint64_t)ir_kgc(irkey));
+#else
                  ((uint64_t)irt_toitype(irkey->t) << 32) |
                  (uint64_t)(uint32_t)ptr2addr(ir_kgc(irkey)));
+#endif
   } else {
     lua_assert(!irt_isnil(irkey->t));
+#if LJ_GC64
+    emit_i32(as, (irt_toitype(irkey->t)<<15)|0x7fff);
+    emit_rmro(as, XO_ARITHi, XOg_CMP, node,
+              ofs + (int32_t)offsetof(Node, key.it));
+#else
     emit_i8(as, irt_toitype(irkey->t));
     emit_rmro(as, XO_ARITHi8, XOg_CMP, node,
               ofs + (int32_t)offsetof(Node, key.it));
+#endif
   }
 #else
   l_exit = emit_label(as);
@@ -1178,61 +1342,27 @@ static void asm_hrefk(ASMState *as, IRIns *ir)
 #endif
 }
 
-static void asm_newref(ASMState *as, IRIns *ir)
-{
-  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_tab_newkey];
-  IRRef args[3];
-  IRIns *irkey;
-  Reg tmp;
-  if (ir->r == RID_SINK)
-    return;
-  args[0] = ASMREF_L;     /* lua_State *L */
-  args[1] = ir->op1;      /* GCtab *t     */
-  args[2] = ASMREF_TMP1;  /* cTValue *key */
-  asm_setupresult(as, ir, ci);  /* TValue * */
-  asm_gencall(as, ci, args);
-  tmp = ra_releasetmp(as, ASMREF_TMP1);
-  irkey = IR(ir->op2);
-  if (irt_isnum(irkey->t)) {
-    /* For numbers use the constant itself or a spill slot as a TValue. */
-    if (irref_isk(ir->op2))
-      emit_loada(as, tmp, ir_knum(irkey));
-    else
-      emit_rmro(as, XO_LEA, tmp|REX_64, RID_ESP, ra_spill(as, irkey));
-  } else {
-    /* Otherwise use g->tmptv to hold the TValue. */
-    if (!irref_isk(ir->op2)) {
-      Reg src = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, tmp));
-      emit_movtomro(as, REX_64IR(irkey, src), tmp, 0);
-    } else if (!irt_ispri(irkey->t)) {
-      emit_movmroi(as, tmp, 0, irkey->i);
-    }
-    if (!(LJ_64 && irt_islightud(irkey->t)))
-      emit_movmroi(as, tmp, 4, irt_toitype(irkey->t));
-    emit_loada(as, tmp, &J2G(as->J)->tmptv);
-  }
-}
-
 static void asm_uref(ASMState *as, IRIns *ir)
 {
   Reg dest = ra_dest(as, ir, RSET_GPR);
   if (irref_isk(ir->op1)) {
     GCfunc *fn = ir_kfunc(IR(ir->op1));
     MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v;
-    emit_rma(as, XO_MOV, dest, v);
+    emit_rma(as, XO_MOV, dest|REX_GC64, v);
   } else {
     Reg uv = ra_scratch(as, RSET_GPR);
     Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
     if (ir->o == IR_UREFC) {
-      emit_rmro(as, XO_LEA, dest, uv, offsetof(GCupval, tv));
+      emit_rmro(as, XO_LEA, dest|REX_GC64, uv, offsetof(GCupval, tv));
       asm_guardcc(as, CC_NE);
       emit_i8(as, 1);
       emit_rmro(as, XO_ARITHib, XOg_CMP, uv, offsetof(GCupval, closed));
     } else {
-      emit_rmro(as, XO_MOV, dest, uv, offsetof(GCupval, v));
+      emit_rmro(as, XO_MOV, dest|REX_GC64, uv, offsetof(GCupval, v));
     }
-    emit_rmro(as, XO_MOV, uv, func,
-              (int32_t)offsetof(GCfuncL, uvptr) + 4*(int32_t)(ir->op2 >> 8));
+    emit_rmro(as, XO_MOV, uv|REX_GC64, func,
+              (int32_t)offsetof(GCfuncL, uvptr) +
+              (int32_t)sizeof(MRef) * (int32_t)(ir->op2 >> 8));
   }
 }
 
@@ -1250,9 +1380,9 @@ static void asm_strref(ASMState *as, IRIns *ir)
   if (as->mrm.base == RID_NONE)
     emit_loadi(as, dest, as->mrm.ofs);
   else if (as->mrm.base == dest && as->mrm.idx == RID_NONE)
-    emit_gri(as, XG_ARITHi(XOg_ADD), dest, as->mrm.ofs);
+    emit_gri(as, XG_ARITHi(XOg_ADD), dest|REX_GC64, as->mrm.ofs);
   else
-    emit_mrm(as, XO_LEA, dest, RID_MRM);
+    emit_mrm(as, XO_LEA, dest|REX_GC64, RID_MRM);
 }
 
 /* -- Loads and stores ---------------------------------------------------- */
@@ -1271,7 +1401,7 @@ static void asm_fxload(ASMState *as, IRIns *ir)
   case IRT_U8: xo = XO_MOVZXb; break;
   case IRT_I16: xo = XO_MOVSXw; break;
   case IRT_U16: xo = XO_MOVZXw; break;
-  case IRT_NUM: xo = XMM_MOVRM(as); break;
+  case IRT_NUM: xo = XO_MOVSD; break;
   case IRT_FLOAT: xo = XO_MOVSS; break;
   default:
     if (LJ_64 && irt_is64(ir->t))
@@ -1284,6 +1414,9 @@ static void asm_fxload(ASMState *as, IRIns *ir)
   emit_mrm(as, xo, dest, RID_MRM);
 }
 
+#define asm_fload(as, ir)	asm_fxload(as, ir)
+#define asm_xload(as, ir)	asm_fxload(as, ir)
+
 static void asm_fxstore(ASMState *as, IRIns *ir)
 {
   RegSet allow = RSET_GPR;
@@ -1318,7 +1451,7 @@ static void asm_fxstore(ASMState *as, IRIns *ir)
   case IRT_I16: case IRT_U16: xo = XO_MOVtow; break;
   case IRT_NUM: xo = XO_MOVSDto; break;
   case IRT_FLOAT: xo = XO_MOVSSto; break;
-#if LJ_64
+#if LJ_64 && !LJ_GC64
   case IRT_LIGHTUD: lua_assert(0);  /* NYI: mask 64 bit lightuserdata. */
 #endif
   default:
@@ -1347,7 +1480,10 @@ static void asm_fxstore(ASMState *as, IRIns *ir)
   }
 }
 
-#if LJ_64
+#define asm_fstore(as, ir)	asm_fxstore(as, ir)
+#define asm_xstore(as, ir)	asm_fxstore(as, ir)
+
+#if LJ_64 && !LJ_GC64
 static Reg asm_load_lightud64(ASMState *as, IRIns *ir, int typecheck)
 {
   if (ra_used(ir) || typecheck) {
@@ -1369,9 +1505,12 @@ static Reg asm_load_lightud64(ASMState *as, IRIns *ir, int typecheck)
 
 static void asm_ahuvload(ASMState *as, IRIns *ir)
 {
+#if LJ_GC64
+  Reg tmp = RID_NONE;
+#endif
   lua_assert(irt_isnum(ir->t) || irt_ispri(ir->t) || irt_isaddr(ir->t) ||
              (LJ_DUALNUM && irt_isint(ir->t)));
-#if LJ_64
+#if LJ_64 && !LJ_GC64
   if (irt_islightud(ir->t)) {
     Reg dest = asm_load_lightud64(as, ir, 1);
     if (ra_hasreg(dest)) {
@@ -1385,20 +1524,64 @@ static void asm_ahuvload(ASMState *as, IRIns *ir)
     RegSet allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR;
     Reg dest = ra_dest(as, ir, allow);
     asm_fuseahuref(as, ir->op1, RSET_GPR);
-    emit_mrm(as, dest < RID_MAX_GPR ? XO_MOV : XMM_MOVRM(as), dest, RID_MRM);
+#if LJ_GC64
+    if (irt_isaddr(ir->t)) {
+      emit_shifti(as, XOg_SHR|REX_64, dest, 17);
+      asm_guardcc(as, CC_NE);
+      emit_i8(as, irt_toitype(ir->t));
+      emit_rr(as, XO_ARITHi8, XOg_CMP, dest);
+      emit_i8(as, XI_O16);
+      if ((as->flags & JIT_F_BMI2)) {
+        emit_i8(as, 47);
+        emit_mrm(as, XV_RORX|VEX_64, dest, RID_MRM);
+      } else {
+        emit_shifti(as, XOg_ROR|REX_64, dest, 47);
+        emit_mrm(as, XO_MOV, dest|REX_64, RID_MRM);
+      }
+      return;
+    } else
+#endif
+    emit_mrm(as, dest < RID_MAX_GPR ? XO_MOV : XO_MOVSD, dest, RID_MRM);
   } else {
-    asm_fuseahuref(as, ir->op1, RSET_GPR);
+    RegSet gpr = RSET_GPR;
+#if LJ_GC64
+    if (irt_isaddr(ir->t)) {
+      tmp = ra_scratch(as, RSET_GPR);
+      gpr = rset_exclude(gpr, tmp);
+    }
+#endif
+    asm_fuseahuref(as, ir->op1, gpr);
   }
   /* Always do the type check, even if the load result is unused. */
   as->mrm.ofs += 4;
   asm_guardcc(as, irt_isnum(ir->t) ? CC_AE : CC_NE);
   if (LJ_64 && irt_type(ir->t) >= IRT_NUM) {
     lua_assert(irt_isinteger(ir->t) || irt_isnum(ir->t));
+#if LJ_GC64
+    emit_u32(as, LJ_TISNUM << 15);
+#else
     emit_u32(as, LJ_TISNUM);
+#endif
     emit_mrm(as, XO_ARITHi, XOg_CMP, RID_MRM);
+#if LJ_GC64
+  } else if (irt_isaddr(ir->t)) {
+    as->mrm.ofs -= 4;
+    emit_i8(as, irt_toitype(ir->t));
+    emit_mrm(as, XO_ARITHi8, XOg_CMP, tmp);
+    emit_shifti(as, XOg_SAR|REX_64, tmp, 47);
+    emit_mrm(as, XO_MOV, tmp|REX_64, RID_MRM);
+  } else if (irt_isnil(ir->t)) {
+    as->mrm.ofs -= 4;
+    emit_i8(as, -1);
+    emit_mrm(as, XO_ARITHi8, XOg_CMP|REX_64, RID_MRM);
+  } else {
+    emit_u32(as, (irt_toitype(ir->t) << 15) | 0x7fff);
+    emit_mrm(as, XO_ARITHi, XOg_CMP, RID_MRM);
+#else
   } else {
     emit_i8(as, irt_toitype(ir->t));
     emit_mrm(as, XO_ARITHi8, XOg_CMP, RID_MRM);
+#endif
   }
 }
 
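The GC64 register path in asm_ahuvload() uses the same ror/cmp/shr idiom that asm_sload() documents below. A standalone C model of what the emitted sequence computes (assumed semantics, not LuaJIT code):

    #include <stdint.h>

    /* Returns the 47-bit payload if the 17-bit tag matches itype;
    ** otherwise the guard branch would leave the trace. */
    static uint64_t check_and_untag(uint64_t tv, uint32_t itype, int *ok)
    {
      uint64_t r = (tv >> 47) | (tv << 17);     /* ror r64, 47    */
      *ok = ((uint16_t)r == (uint16_t)itype);   /* cmp r16, itype */
      return r >> 17;                           /* shr r64, 17    */
    }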
@@ -1410,12 +1593,28 @@ static void asm_ahustore(ASMState *as, IRIns *ir)
     Reg src = ra_alloc1(as, ir->op2, RSET_FPR);
     asm_fuseahuref(as, ir->op1, RSET_GPR);
     emit_mrm(as, XO_MOVSDto, src, RID_MRM);
-#if LJ_64
+#if LJ_64 && !LJ_GC64
   } else if (irt_islightud(ir->t)) {
     Reg src = ra_alloc1(as, ir->op2, RSET_GPR);
     asm_fuseahuref(as, ir->op1, rset_exclude(RSET_GPR, src));
     emit_mrm(as, XO_MOVto, src|REX_64, RID_MRM);
 #endif
+#if LJ_GC64
+  } else if (irref_isk(ir->op2)) {
+    TValue k;
+    lj_ir_kvalue(as->J->L, &k, IR(ir->op2));
+    asm_fuseahuref(as, ir->op1, RSET_GPR);
+    if (tvisnil(&k)) {
+      emit_i32(as, -1);
+      emit_mrm(as, XO_MOVmi, REX_64, RID_MRM);
+    } else {
+      emit_u32(as, k.u32.lo);
+      emit_mrm(as, XO_MOVmi, 0, RID_MRM);
+      as->mrm.ofs += 4;
+      emit_u32(as, k.u32.hi);
+      emit_mrm(as, XO_MOVmi, 0, RID_MRM);
+    }
+#endif
   } else {
     IRIns *irr = IR(ir->op2);
     RegSet allow = RSET_GPR;
@@ -1426,6 +1625,17 @@ static void asm_ahustore(ASMState *as, IRIns *ir)
     }
     asm_fuseahuref(as, ir->op1, allow);
     if (ra_hasreg(src)) {
+#if LJ_GC64
+      if (!(LJ_DUALNUM && irt_isinteger(ir->t))) {
+        /* TODO: 64 bit store + 32 bit load-modify-store is suboptimal. */
+        as->mrm.ofs += 4;
+        emit_u32(as, irt_toitype(ir->t) << 15);
+        emit_mrm(as, XO_ARITHi, XOg_OR, RID_MRM);
+        as->mrm.ofs -= 4;
+        emit_mrm(as, XO_MOVto, src|REX_64, RID_MRM);
+        return;
+      }
+#endif
       emit_mrm(as, XO_MOVto, src, RID_MRM);
     } else if (!irt_ispri(irr->t)) {
       lua_assert(irt_isaddr(ir->t) || (LJ_DUALNUM && irt_isinteger(ir->t)));
@@ -1433,14 +1643,20 @@ static void asm_ahustore(ASMState *as, IRIns *ir)
       emit_mrm(as, XO_MOVmi, 0, RID_MRM);
     }
     as->mrm.ofs += 4;
+#if LJ_GC64
+    lua_assert(LJ_DUALNUM && irt_isinteger(ir->t));
+    emit_i32(as, LJ_TNUMX << 15);
+#else
     emit_i32(as, (int32_t)irt_toitype(ir->t));
+#endif
     emit_mrm(as, XO_MOVmi, 0, RID_MRM);
   }
 }
 
 static void asm_sload(ASMState *as, IRIns *ir)
 {
-  int32_t ofs = 8*((int32_t)ir->op1-1) + ((ir->op2 & IRSLOAD_FRAME) ? 4 : 0);
+  int32_t ofs = 8*((int32_t)ir->op1-1-LJ_FR2) +
+                (!LJ_FR2 && (ir->op2 & IRSLOAD_FRAME) ? 4 : 0);
   IRType1 t = ir->t;
   Reg base;
   lua_assert(!(ir->op2 & IRSLOAD_PARENT));  /* Handled by asm_head_side(). */
@@ -1451,9 +1667,9 @@ static void asm_sload(ASMState *as, IRIns *ir)
     Reg left = ra_scratch(as, RSET_FPR);
     asm_tointg(as, ir, left);  /* Frees dest reg. Do this before base alloc. */
     base = ra_alloc1(as, REF_BASE, RSET_GPR);
-    emit_rmro(as, XMM_MOVRM(as), left, base, ofs);
+    emit_rmro(as, XO_MOVSD, left, base, ofs);
     t.irt = IRT_NUM;  /* Continue with a regular number type check. */
-#if LJ_64
+#if LJ_64 && !LJ_GC64
   } else if (irt_islightud(t)) {
     Reg dest = asm_load_lightud64(as, ir, (ir->op2 & IRSLOAD_TYPECHECK));
     if (ra_hasreg(dest)) {
@@ -1469,11 +1685,39 @@ static void asm_sload(ASMState *as, IRIns *ir)
     lua_assert(irt_isnum(t) || irt_isint(t) || irt_isaddr(t));
     if ((ir->op2 & IRSLOAD_CONVERT)) {
       t.irt = irt_isint(t) ? IRT_NUM : IRT_INT;  /* Check for original type. */
-      emit_rmro(as, irt_isint(t) ? XO_CVTSI2SD : XO_CVTSD2SI, dest, base, ofs);
-    } else if (irt_isnum(t)) {
-      emit_rmro(as, XMM_MOVRM(as), dest, base, ofs);
+      emit_rmro(as, irt_isint(t) ? XO_CVTSI2SD : XO_CVTTSD2SI, dest, base, ofs);
     } else {
-      emit_rmro(as, XO_MOV, dest, base, ofs);
+#if LJ_GC64
+      if (irt_isaddr(t)) {
+        /* LJ_GC64 type check + tag removal without BMI2 and with BMI2:
+        **
+        **  mov r64, [addr]    rorx r64, [addr], 47
+        **  ror r64, 47
+        **  cmp r16, itype     cmp r16, itype
+        **  jne ->exit         jne ->exit
+        **  shr r64, 17        shr r64, 17
+        */
+        emit_shifti(as, XOg_SHR|REX_64, dest, 17);
+        if ((ir->op2 & IRSLOAD_TYPECHECK)) {
+          asm_guardcc(as, CC_NE);
+          emit_i8(as, irt_toitype(t));
+          emit_rr(as, XO_ARITHi8, XOg_CMP, dest);
+          emit_i8(as, XI_O16);
+        }
+        if ((as->flags & JIT_F_BMI2)) {
+          emit_i8(as, 47);
+          emit_rmro(as, XV_RORX|VEX_64, dest, base, ofs);
+        } else {
+          if ((ir->op2 & IRSLOAD_TYPECHECK))
+            emit_shifti(as, XOg_ROR|REX_64, dest, 47);
+          else
+            emit_shifti(as, XOg_SHL|REX_64, dest, 17);
+          emit_rmro(as, XO_MOV, dest|REX_64, base, ofs);
+        }
+        return;
+      } else
+#endif
+      emit_rmro(as, irt_isnum(t) ? XO_MOVSD : XO_MOV, dest, base, ofs);
     }
   } else {
     if (!(ir->op2 & IRSLOAD_TYPECHECK))
@@ -1485,11 +1729,42 @@ static void asm_sload(ASMState *as, IRIns *ir)
1485 asm_guardcc(as, irt_isnum(t) ? CC_AE : CC_NE); 1729 asm_guardcc(as, irt_isnum(t) ? CC_AE : CC_NE);
1486 if (LJ_64 && irt_type(t) >= IRT_NUM) { 1730 if (LJ_64 && irt_type(t) >= IRT_NUM) {
1487 lua_assert(irt_isinteger(t) || irt_isnum(t)); 1731 lua_assert(irt_isinteger(t) || irt_isnum(t));
1732#if LJ_GC64
1733 emit_u32(as, LJ_TISNUM << 15);
1734#else
1488 emit_u32(as, LJ_TISNUM); 1735 emit_u32(as, LJ_TISNUM);
1736#endif
1737 emit_rmro(as, XO_ARITHi, XOg_CMP, base, ofs+4);
1738#if LJ_GC64
1739 } else if (irt_isnil(t)) {
1740 /* LJ_GC64 type check for nil:
1741 **
1742 ** cmp qword [addr], -1
1743 ** jne ->exit
1744 */
1745 emit_i8(as, -1);
1746 emit_rmro(as, XO_ARITHi8, XOg_CMP|REX_64, base, ofs);
1747 } else if (irt_ispri(t)) {
1748 emit_u32(as, (irt_toitype(t) << 15) | 0x7fff);
1489 emit_rmro(as, XO_ARITHi, XOg_CMP, base, ofs+4); 1749 emit_rmro(as, XO_ARITHi, XOg_CMP, base, ofs+4);
1490 } else { 1750 } else {
1751 /* LJ_GC64 type check only:
1752 **
1753 ** mov r64, [addr]
1754 ** sar r64, 47
1755 ** cmp r32, itype
1756 ** jne ->exit
1757 */
1758 Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, base));
1759 emit_i8(as, irt_toitype(t));
1760 emit_rr(as, XO_ARITHi8, XOg_CMP, tmp);
1761 emit_shifti(as, XOg_SAR|REX_64, tmp, 47);
1762 emit_rmro(as, XO_MOV, tmp|REX_64, base, ofs);
1763#else
1764 } else {
1491 emit_i8(as, irt_toitype(t)); 1765 emit_i8(as, irt_toitype(t));
1492 emit_rmro(as, XO_ARITHi8, XOg_CMP, base, ofs+4); 1766 emit_rmro(as, XO_ARITHi8, XOg_CMP, base, ofs+4);
1767#endif
1493 } 1768 }
1494 } 1769 }
1495} 1770}
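
These three check shapes all exploit the same layout: with the tag in bits 47..63, the high 32-bit dword of a slot is (tag << 15) | (payload bits 32..46). Doubles sort below the tag space, primitives carry an all-ones 47-bit payload (hence the | 0x7fff), and nil is all ones outright. A sketch with placeholder tag values, not LuaJIT's actual constants:

#include <stdint.h>
#include <assert.h>

/* Placeholder 17-bit tags (not LuaJIT's actual values). */
enum { TAG_FALSE = 0x1fffe, TAG_NUM_BOUND = 0x1fff1 };

static uint32_t hi_word(uint64_t tv) { return (uint32_t)(tv >> 32); }

/* Number check: one unsigned compare of the high dword, as in
** cmp dword [addr+4], BOUND<<15; jae ->exit. */
static int is_num(uint64_t tv)
{
  return hi_word(tv) < ((uint32_t)TAG_NUM_BOUND << 15);
}

/* Primitive check: a primitive is (tag<<47) | (2^47-1), so its high
** dword is exactly (tag<<15) | 0x7fff. */
static int is_pri(uint64_t tv, uint32_t tag)
{
  return hi_word(tv) == ((tag << 15) | 0x7fff);
}

/* Nil check: nil encodes as all ones, hence cmp qword [addr], -1. */
static int is_nil(uint64_t tv) { return tv == (uint64_t)-1; }

int main(void)
{
  uint64_t fls = ((uint64_t)TAG_FALSE << 47) | (((uint64_t)1 << 47) - 1);
  assert(is_nil((uint64_t)-1) && is_pri(fls, TAG_FALSE) && !is_num(fls));
  return 0;
}
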
@@ -1500,15 +1775,13 @@ static void asm_sload(ASMState *as, IRIns *ir)
1500static void asm_cnew(ASMState *as, IRIns *ir) 1775static void asm_cnew(ASMState *as, IRIns *ir)
1501{ 1776{
1502 CTState *cts = ctype_ctsG(J2G(as->J)); 1777 CTState *cts = ctype_ctsG(J2G(as->J));
1503 CTypeID ctypeid = (CTypeID)IR(ir->op1)->i; 1778 CTypeID id = (CTypeID)IR(ir->op1)->i;
1504 CTSize sz = (ir->o == IR_CNEWI || ir->op2 == REF_NIL) ? 1779 CTSize sz;
1505 lj_ctype_size(cts, ctypeid) : (CTSize)IR(ir->op2)->i; 1780 CTInfo info = lj_ctype_info(cts, id, &sz);
1506 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_mem_newgco]; 1781 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_mem_newgco];
1507 IRRef args[2]; 1782 IRRef args[4];
1508 lua_assert(sz != CTSIZE_INVALID); 1783 lua_assert(sz != CTSIZE_INVALID || (ir->o == IR_CNEW && ir->op2 != REF_NIL));
1509 1784
1510 args[0] = ASMREF_L; /* lua_State *L */
1511 args[1] = ASMREF_TMP1; /* MSize size */
1512 as->gcsteps++; 1785 as->gcsteps++;
1513 asm_setupresult(as, ir, ci); /* GCcdata * */ 1786 asm_setupresult(as, ir, ci); /* GCcdata * */
1514 1787
@@ -1519,8 +1792,9 @@ static void asm_cnew(ASMState *as, IRIns *ir)
1519 Reg r64 = sz == 8 ? REX_64 : 0; 1792 Reg r64 = sz == 8 ? REX_64 : 0;
1520 if (irref_isk(ir->op2)) { 1793 if (irref_isk(ir->op2)) {
1521 IRIns *irk = IR(ir->op2); 1794 IRIns *irk = IR(ir->op2);
1522 uint64_t k = irk->o == IR_KINT64 ? ir_k64(irk)->u64 : 1795 uint64_t k = (irk->o == IR_KINT64 ||
1523 (uint64_t)(uint32_t)irk->i; 1796 (LJ_GC64 && (irk->o == IR_KPTR || irk->o == IR_KKPTR))) ?
1797 ir_k64(irk)->u64 : (uint64_t)(uint32_t)irk->i;
1524 if (sz == 4 || checki32((int64_t)k)) { 1798 if (sz == 4 || checki32((int64_t)k)) {
1525 emit_i32(as, (int32_t)k); 1799 emit_i32(as, (int32_t)k);
1526 emit_rmro(as, XO_MOVmi, r64, RID_RET, sizeof(GCcdata)); 1800 emit_rmro(as, XO_MOVmi, r64, RID_RET, sizeof(GCcdata));
@@ -1551,15 +1825,26 @@ static void asm_cnew(ASMState *as, IRIns *ir)
1551 } while (1); 1825 } while (1);
1552#endif 1826#endif
1553 lua_assert(sz == 4 || sz == 8); 1827 lua_assert(sz == 4 || sz == 8);
1828 } else if (ir->op2 != REF_NIL) { /* Create VLA/VLS/aligned cdata. */
1829 ci = &lj_ir_callinfo[IRCALL_lj_cdata_newv];
1830 args[0] = ASMREF_L; /* lua_State *L */
1831 args[1] = ir->op1; /* CTypeID id */
1832 args[2] = ir->op2; /* CTSize sz */
1833 args[3] = ASMREF_TMP1; /* CTSize align */
1834 asm_gencall(as, ci, args);
1835 emit_loadi(as, ra_releasetmp(as, ASMREF_TMP1), (int32_t)ctype_align(info));
1836 return;
1554 } 1837 }
1555 1838
1556 /* Combine initialization of marked, gct and ctypeid. */ 1839 /* Combine initialization of marked, gct and ctypeid. */
1557 emit_movtomro(as, RID_ECX, RID_RET, offsetof(GCcdata, marked)); 1840 emit_movtomro(as, RID_ECX, RID_RET, offsetof(GCcdata, marked));
1558 emit_gri(as, XG_ARITHi(XOg_OR), RID_ECX, 1841 emit_gri(as, XG_ARITHi(XOg_OR), RID_ECX,
1559 (int32_t)((~LJ_TCDATA<<8)+(ctypeid<<16))); 1842 (int32_t)((~LJ_TCDATA<<8)+(id<<16)));
1560 emit_gri(as, XG_ARITHi(XOg_AND), RID_ECX, LJ_GC_WHITES); 1843 emit_gri(as, XG_ARITHi(XOg_AND), RID_ECX, LJ_GC_WHITES);
1561 emit_opgl(as, XO_MOVZXb, RID_ECX, gc.currentwhite); 1844 emit_opgl(as, XO_MOVZXb, RID_ECX, gc.currentwhite);
1562 1845
1846 args[0] = ASMREF_L; /* lua_State *L */
1847 args[1] = ASMREF_TMP1; /* MSize size */
1563 asm_gencall(as, ci, args); 1848 asm_gencall(as, ci, args);
1564 emit_loadi(as, ra_releasetmp(as, ASMREF_TMP1), (int32_t)(sz+sizeof(GCcdata))); 1849 emit_loadi(as, ra_releasetmp(as, ASMREF_TMP1), (int32_t)(sz+sizeof(GCcdata)));
1565} 1850}
@@ -1574,7 +1859,7 @@ static void asm_tbar(ASMState *as, IRIns *ir)
1574 Reg tab = ra_alloc1(as, ir->op1, RSET_GPR); 1859 Reg tab = ra_alloc1(as, ir->op1, RSET_GPR);
1575 Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, tab)); 1860 Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, tab));
1576 MCLabel l_end = emit_label(as); 1861 MCLabel l_end = emit_label(as);
1577 emit_movtomro(as, tmp, tab, offsetof(GCtab, gclist)); 1862 emit_movtomro(as, tmp|REX_GC64, tab, offsetof(GCtab, gclist));
1578 emit_setgl(as, tab, gc.grayagain); 1863 emit_setgl(as, tab, gc.grayagain);
1579 emit_getgl(as, tmp, gc.grayagain); 1864 emit_getgl(as, tmp, gc.grayagain);
1580 emit_i8(as, ~LJ_GC_BLACK); 1865 emit_i8(as, ~LJ_GC_BLACK);
@@ -1637,36 +1922,9 @@ static void asm_x87load(ASMState *as, IRRef ref)
1637 } 1922 }
1638} 1923}
1639 1924
1640/* Try to rejoin pow from EXP2, MUL and LOG2 (if still unsplit). */
1641static int fpmjoin_pow(ASMState *as, IRIns *ir)
1642{
1643 IRIns *irp = IR(ir->op1);
1644 if (irp == ir-1 && irp->o == IR_MUL && !ra_used(irp)) {
1645 IRIns *irpp = IR(irp->op1);
1646 if (irpp == ir-2 && irpp->o == IR_FPMATH &&
1647 irpp->op2 == IRFPM_LOG2 && !ra_used(irpp)) {
1648 /* The modified regs must match with the *.dasc implementation. */
1649 RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM2+1)|RID2RSET(RID_EAX);
1650 IRIns *irx;
1651 if (ra_hasreg(ir->r))
1652 rset_clear(drop, ir->r); /* Dest reg handled below. */
1653 ra_evictset(as, drop);
1654 ra_destreg(as, ir, RID_XMM0);
1655 emit_call(as, lj_vm_pow_sse);
1656 irx = IR(irpp->op1);
1657 if (ra_noreg(irx->r) && ra_gethint(irx->r) == RID_XMM1)
1658 irx->r = RID_INIT; /* Avoid allocating xmm1 for x. */
1659 ra_left(as, RID_XMM0, irpp->op1);
1660 ra_left(as, RID_XMM1, irp->op2);
1661 return 1;
1662 }
1663 }
1664 return 0;
1665}
1666
1667static void asm_fpmath(ASMState *as, IRIns *ir) 1925static void asm_fpmath(ASMState *as, IRIns *ir)
1668{ 1926{
1669 IRFPMathOp fpm = ir->o == IR_FPMATH ? (IRFPMathOp)ir->op2 : IRFPM_OTHER; 1927 IRFPMathOp fpm = (IRFPMathOp)ir->op2;
1670 if (fpm == IRFPM_SQRT) { 1928 if (fpm == IRFPM_SQRT) {
1671 Reg dest = ra_dest(as, ir, RSET_FPR); 1929 Reg dest = ra_dest(as, ir, RSET_FPR);
1672 Reg left = asm_fuseload(as, ir->op1, RSET_FPR); 1930 Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
@@ -1697,51 +1955,29 @@ static void asm_fpmath(ASMState *as, IRIns *ir)
1697 fpm == IRFPM_CEIL ? lj_vm_ceil_sse : lj_vm_trunc_sse); 1955 fpm == IRFPM_CEIL ? lj_vm_ceil_sse : lj_vm_trunc_sse);
1698 ra_left(as, RID_XMM0, ir->op1); 1956 ra_left(as, RID_XMM0, ir->op1);
1699 } 1957 }
1700 } else if (fpm == IRFPM_EXP2 && fpmjoin_pow(as, ir)) { 1958 } else if (fpm == IRFPM_EXP2 && asm_fpjoin_pow(as, ir)) {
1701 /* Rejoined to pow(). */ 1959 /* Rejoined to pow(). */
1702 } else { /* Handle x87 ops. */ 1960 } else {
1703 int32_t ofs = sps_scale(ir->s); /* Use spill slot or temp slots. */ 1961 asm_callid(as, ir, IRCALL_lj_vm_floor + fpm);
1704 Reg dest = ir->r; 1962 }
1705 if (ra_hasreg(dest)) { 1963}
1706 ra_free(as, dest); 1964
1707 ra_modified(as, dest); 1965#define asm_atan2(as, ir) asm_callid(as, ir, IRCALL_atan2)
1708 emit_rmro(as, XMM_MOVRM(as), dest, RID_ESP, ofs); 1966
1709 } 1967static void asm_ldexp(ASMState *as, IRIns *ir)
1710 emit_rmro(as, XO_FSTPq, XOg_FSTPq, RID_ESP, ofs); 1968{
1711 switch (fpm) { /* st0 = lj_vm_*(st0) */ 1969 int32_t ofs = sps_scale(ir->s); /* Use spill slot or temp slots. */
1712 case IRFPM_EXP: emit_call(as, lj_vm_exp_x87); break; 1970 Reg dest = ir->r;
1713 case IRFPM_EXP2: emit_call(as, lj_vm_exp2_x87); break; 1971 if (ra_hasreg(dest)) {
1714 case IRFPM_SIN: emit_x87op(as, XI_FSIN); break; 1972 ra_free(as, dest);
1715 case IRFPM_COS: emit_x87op(as, XI_FCOS); break; 1973 ra_modified(as, dest);
1716 case IRFPM_TAN: emit_x87op(as, XI_FPOP); emit_x87op(as, XI_FPTAN); break; 1974 emit_rmro(as, XO_MOVSD, dest, RID_ESP, ofs);
1717 case IRFPM_LOG: case IRFPM_LOG2: case IRFPM_LOG10:
1718 /* Note: the use of fyl2xp1 would be pointless here. When computing
1719 ** log(1.0+eps) the precision is already lost after 1.0 is added.
1720 ** Subtracting 1.0 won't recover it. OTOH math.log1p would make sense.
1721 */
1722 emit_x87op(as, XI_FYL2X); break;
1723 case IRFPM_OTHER:
1724 switch (ir->o) {
1725 case IR_ATAN2:
1726 emit_x87op(as, XI_FPATAN); asm_x87load(as, ir->op2); break;
1727 case IR_LDEXP:
1728 emit_x87op(as, XI_FPOP1); emit_x87op(as, XI_FSCALE); break;
1729 default: lua_assert(0); break;
1730 }
1731 break;
1732 default: lua_assert(0); break;
1733 }
1734 asm_x87load(as, ir->op1);
1735 switch (fpm) {
1736 case IRFPM_LOG: emit_x87op(as, XI_FLDLN2); break;
1737 case IRFPM_LOG2: emit_x87op(as, XI_FLD1); break;
1738 case IRFPM_LOG10: emit_x87op(as, XI_FLDLG2); break;
1739 case IRFPM_OTHER:
1740 if (ir->o == IR_LDEXP) asm_x87load(as, ir->op2);
1741 break;
1742 default: break;
1743 }
1744 } 1975 }
1976 emit_rmro(as, XO_FSTPq, XOg_FSTPq, RID_ESP, ofs);
1977 emit_x87op(as, XI_FPOP1);
1978 emit_x87op(as, XI_FSCALE);
1979 asm_x87load(as, ir->op1);
1980 asm_x87load(as, ir->op2);
1745} 1981}
1746 1982
1747static void asm_fppowi(ASMState *as, IRIns *ir) 1983static void asm_fppowi(ASMState *as, IRIns *ir)
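
Note that asm_fpmath now routes the remaining ops through asm_callid with IRCALL_lj_vm_floor + fpm, which relies on the IRCALL_lj_vm_* entries mirroring the IRFPM_* order. asm_ldexp is emitted backwards, so the executed order is: load op2, load op1, fscale, pop st1, store to the spill slot, then movsd into the destination. FSCALE computes st0 * 2^trunc(st1); a model of that core (link with -lm):

#include <math.h>
#include <assert.h>

/* FSCALE semantics: st0 = st0 * 2^trunc(st1). */
static double fscale_model(double x, double n)
{
  return x * exp2(trunc(n));
}

int main(void)
{
  assert(fscale_model(1.5, 3.0) == 12.0);
  assert(fscale_model(8.0, -2.7) == 2.0);  /* trunc(-2.7) == -2.0 */
  return 0;
}
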
@@ -1757,26 +1993,15 @@ static void asm_fppowi(ASMState *as, IRIns *ir)
1757 ra_left(as, RID_EAX, ir->op2); 1993 ra_left(as, RID_EAX, ir->op2);
1758} 1994}
1759 1995
1760#if LJ_64 && LJ_HASFFI 1996static void asm_pow(ASMState *as, IRIns *ir)
1761static void asm_arith64(ASMState *as, IRIns *ir, IRCallID id)
1762{ 1997{
1763 const CCallInfo *ci = &lj_ir_callinfo[id]; 1998#if LJ_64 && LJ_HASFFI
1764 IRRef args[2]; 1999 if (!irt_isnum(ir->t))
1765 args[0] = ir->op1; 2000 asm_callid(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 :
1766 args[1] = ir->op2; 2001 IRCALL_lj_carith_powu64);
1767 asm_setupresult(as, ir, ci); 2002 else
1768 asm_gencall(as, ci, args);
1769}
1770#endif 2003#endif
1771 2004 asm_fppowi(as, ir);
1772static void asm_intmod(ASMState *as, IRIns *ir)
1773{
1774 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_vm_modi];
1775 IRRef args[2];
1776 args[0] = ir->op1;
1777 args[1] = ir->op2;
1778 asm_setupresult(as, ir, ci);
1779 asm_gencall(as, ci, args);
1780} 2005}
1781 2006
1782static int asm_swapops(ASMState *as, IRIns *ir) 2007static int asm_swapops(ASMState *as, IRIns *ir)
@@ -1959,6 +2184,44 @@ static void asm_add(ASMState *as, IRIns *ir)
1959 asm_intarith(as, ir, XOg_ADD); 2184 asm_intarith(as, ir, XOg_ADD);
1960} 2185}
1961 2186
2187static void asm_sub(ASMState *as, IRIns *ir)
2188{
2189 if (irt_isnum(ir->t))
2190 asm_fparith(as, ir, XO_SUBSD);
2191 else /* Note: no need for LEA trick here. i-k is encoded as i+(-k). */
2192 asm_intarith(as, ir, XOg_SUB);
2193}
2194
2195static void asm_mul(ASMState *as, IRIns *ir)
2196{
2197 if (irt_isnum(ir->t))
2198 asm_fparith(as, ir, XO_MULSD);
2199 else
2200 asm_intarith(as, ir, XOg_X_IMUL);
2201}
2202
2203static void asm_div(ASMState *as, IRIns *ir)
2204{
2205#if LJ_64 && LJ_HASFFI
2206 if (!irt_isnum(ir->t))
2207 asm_callid(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_divi64 :
2208 IRCALL_lj_carith_divu64);
2209 else
2210#endif
2211 asm_fparith(as, ir, XO_DIVSD);
2212}
2213
2214static void asm_mod(ASMState *as, IRIns *ir)
2215{
2216#if LJ_64 && LJ_HASFFI
2217 if (!irt_isint(ir->t))
2218 asm_callid(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_modi64 :
2219 IRCALL_lj_carith_modu64);
2220 else
2221#endif
2222 asm_callid(as, ir, IRCALL_lj_vm_modi);
2223}
2224
1962static void asm_neg_not(ASMState *as, IRIns *ir, x86Group3 xg) 2225static void asm_neg_not(ASMState *as, IRIns *ir, x86Group3 xg)
1963{ 2226{
1964 Reg dest = ra_dest(as, ir, RSET_GPR); 2227 Reg dest = ra_dest(as, ir, RSET_GPR);
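
asm_mod falls back to IRCALL_lj_vm_modi for the plain integer case because Lua's % is floored modulo, which x86 IDIV (truncating) does not provide directly. A sketch of the required semantics, not the actual lj_vm_modi implementation (overflow edge cases elided):

#include <stdint.h>
#include <assert.h>

/* Lua's % is floored; C's % truncates toward zero. */
static int32_t modi_floored(int32_t a, int32_t b)
{
  int32_t r = a % b;
  if (r != 0 && (r ^ b) < 0) r += b;  /* fix sign when operands differ */
  return r;
}

int main(void)
{
  assert(modi_floored(-5, 3) == 1);   /* C: -5 % 3 == -2 */
  assert(modi_floored(5, -3) == -1);  /* C:  5 % -3 ==  2 */
  assert(modi_floored(6, 3) == 0);
  return 0;
}
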
@@ -1966,7 +2229,17 @@ static void asm_neg_not(ASMState *as, IRIns *ir, x86Group3 xg)
1966 ra_left(as, dest, ir->op1); 2229 ra_left(as, dest, ir->op1);
1967} 2230}
1968 2231
1969static void asm_min_max(ASMState *as, IRIns *ir, int cc) 2232static void asm_neg(ASMState *as, IRIns *ir)
2233{
2234 if (irt_isnum(ir->t))
2235 asm_fparith(as, ir, XO_XORPS);
2236 else
2237 asm_neg_not(as, ir, XOg_NEG);
2238}
2239
2240#define asm_abs(as, ir) asm_fparith(as, ir, XO_ANDPS)
2241
2242static void asm_intmin_max(ASMState *as, IRIns *ir, int cc)
1970{ 2243{
1971 Reg right, dest = ra_dest(as, ir, RSET_GPR); 2244 Reg right, dest = ra_dest(as, ir, RSET_GPR);
1972 IRRef lref = ir->op1, rref = ir->op2; 2245 IRRef lref = ir->op1, rref = ir->op2;
@@ -1977,7 +2250,30 @@ static void asm_min_max(ASMState *as, IRIns *ir, int cc)
1977 ra_left(as, dest, lref); 2250 ra_left(as, dest, lref);
1978} 2251}
1979 2252
1980static void asm_bitswap(ASMState *as, IRIns *ir) 2253static void asm_min(ASMState *as, IRIns *ir)
2254{
2255 if (irt_isnum(ir->t))
2256 asm_fparith(as, ir, XO_MINSD);
2257 else
2258 asm_intmin_max(as, ir, CC_G);
2259}
2260
2261static void asm_max(ASMState *as, IRIns *ir)
2262{
2263 if (irt_isnum(ir->t))
2264 asm_fparith(as, ir, XO_MAXSD);
2265 else
2266 asm_intmin_max(as, ir, CC_L);
2267}
2268
2269/* Note: don't use LEA for overflow-checking arithmetic! */
2270#define asm_addov(as, ir) asm_intarith(as, ir, XOg_ADD)
2271#define asm_subov(as, ir) asm_intarith(as, ir, XOg_SUB)
2272#define asm_mulov(as, ir) asm_intarith(as, ir, XOg_X_IMUL)
2273
2274#define asm_bnot(as, ir) asm_neg_not(as, ir, XOg_NOT)
2275
2276static void asm_bswap(ASMState *as, IRIns *ir)
1981{ 2277{
1982 Reg dest = ra_dest(as, ir, RSET_GPR); 2278 Reg dest = ra_dest(as, ir, RSET_GPR);
1983 as->mcp = emit_op(XO_BSWAP + ((dest&7) << 24), 2279 as->mcp = emit_op(XO_BSWAP + ((dest&7) << 24),
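
asm_intmin_max keeps the left operand in dest and conditionally replaces it: cmp dest, right; cmovg for min, cmovl for max (hence the CC_G/CC_L arguments above). In C terms:

#include <stdint.h>
#include <assert.h>

/* dest = left; cmp dest, right; cmovg dest, right  ==> min */
static int32_t intmin_model(int32_t left, int32_t right)
{
  int32_t dest = left;
  if (dest > right) dest = right;  /* cmovg */
  return dest;
}

/* dest = left; cmp dest, right; cmovl dest, right  ==> max */
static int32_t intmax_model(int32_t left, int32_t right)
{
  int32_t dest = left;
  if (dest < right) dest = right;  /* cmovl */
  return dest;
}

int main(void)
{
  assert(intmin_model(-3, 7) == -3 && intmax_model(-3, 7) == 7);
  return 0;
}
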
@@ -1985,7 +2281,11 @@ static void asm_bitswap(ASMState *as, IRIns *ir)
1985 ra_left(as, dest, ir->op1); 2281 ra_left(as, dest, ir->op1);
1986} 2282}
1987 2283
1988static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs) 2284#define asm_band(as, ir) asm_intarith(as, ir, XOg_AND)
2285#define asm_bor(as, ir) asm_intarith(as, ir, XOg_OR)
2286#define asm_bxor(as, ir) asm_intarith(as, ir, XOg_XOR)
2287
2288static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs, x86Op xv)
1989{ 2289{
1990 IRRef rref = ir->op2; 2290 IRRef rref = ir->op2;
1991 IRIns *irr = IR(rref); 2291 IRIns *irr = IR(rref);
@@ -1994,11 +2294,27 @@ static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs)
1994 int shift; 2294 int shift;
1995 dest = ra_dest(as, ir, RSET_GPR); 2295 dest = ra_dest(as, ir, RSET_GPR);
1996 shift = irr->i & (irt_is64(ir->t) ? 63 : 31); 2296 shift = irr->i & (irt_is64(ir->t) ? 63 : 31);
2297 if (!xv && shift && (as->flags & JIT_F_BMI2)) {
2298 Reg left = asm_fuseloadm(as, ir->op1, RSET_GPR, irt_is64(ir->t));
2299 if (left != dest) { /* BMI2 rotate right by constant. */
2300 emit_i8(as, xs == XOg_ROL ? -shift : shift);
2301 emit_mrm(as, VEX_64IR(ir, XV_RORX), dest, left);
2302 return;
2303 }
2304 }
1997 switch (shift) { 2305 switch (shift) {
1998 case 0: break; 2306 case 0: break;
1999 case 1: emit_rr(as, XO_SHIFT1, REX_64IR(ir, xs), dest); break; 2307 case 1: emit_rr(as, XO_SHIFT1, REX_64IR(ir, xs), dest); break;
2000 default: emit_shifti(as, REX_64IR(ir, xs), dest, shift); break; 2308 default: emit_shifti(as, REX_64IR(ir, xs), dest, shift); break;
2001 } 2309 }
2310 } else if ((as->flags & JIT_F_BMI2) && xv) { /* BMI2 variable shifts. */
2311 Reg left, right;
2312 dest = ra_dest(as, ir, RSET_GPR);
2313 right = ra_alloc1(as, rref, RSET_GPR);
2314 left = asm_fuseloadm(as, ir->op1, rset_exclude(RSET_GPR, right),
2315 irt_is64(ir->t));
2316 emit_mrm(as, VEX_64IR(ir, xv) ^ (right << 19), dest, left);
2317 return;
2002 } else { /* Variable shifts implicitly use register cl (i.e. ecx). */ 2318 } else { /* Variable shifts implicitly use register cl (i.e. ecx). */
2003 Reg right; 2319 Reg right;
2004 dest = ra_dest(as, ir, rset_exclude(RSET_GPR, RID_ECX)); 2320 dest = ra_dest(as, ir, rset_exclude(RSET_GPR, RID_ECX));
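
BMI2 SHLX/SHRX/SARX take the shift count in an extra register operand, which is why this path no longer pins ECX. The `^ (right << 19)` plugs that register into what is presumably the VEX.vvvv field of the packed opcode word (an assumption about the emitter's x86Op layout). vvvv encodes a register inverted, so a template whose vvvv is pre-set to 1111b ("unused") turns into the inverted register number under XOR:

#include <stdint.h>
#include <assert.h>

int main(void)
{
  uint32_t tmpl = 0xfu << 19;  /* vvvv field pre-set to all ones */
  uint32_t reg;
  for (reg = 0; reg < 16; reg++) {
    uint32_t vvvv = ((tmpl ^ (reg << 19)) >> 19) & 0xf;
    assert(vvvv == (~reg & 0xfu));  /* XOR yields the inverted register */
  }
  return 0;
}
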
@@ -2024,6 +2340,12 @@ static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs)
2024 */ 2340 */
2025} 2341}
2026 2342
2343#define asm_bshl(as, ir) asm_bitshift(as, ir, XOg_SHL, XV_SHLX)
2344#define asm_bshr(as, ir) asm_bitshift(as, ir, XOg_SHR, XV_SHRX)
2345#define asm_bsar(as, ir) asm_bitshift(as, ir, XOg_SAR, XV_SARX)
2346#define asm_brol(as, ir) asm_bitshift(as, ir, XOg_ROL, 0)
2347#define asm_bror(as, ir) asm_bitshift(as, ir, XOg_ROR, 0)
2348
2027/* -- Comparisons --------------------------------------------------------- */ 2349/* -- Comparisons --------------------------------------------------------- */
2028 2350
2029/* Virtual flags for unordered FP comparisons. */ 2351/* Virtual flags for unordered FP comparisons. */
@@ -2050,8 +2372,9 @@ static const uint16_t asm_compmap[IR_ABC+1] = {
2050}; 2372};
2051 2373
2052/* FP and integer comparisons. */ 2374/* FP and integer comparisons. */
2053static void asm_comp(ASMState *as, IRIns *ir, uint32_t cc) 2375static void asm_comp(ASMState *as, IRIns *ir)
2054{ 2376{
2377 uint32_t cc = asm_compmap[ir->o];
2055 if (irt_isnum(ir->t)) { 2378 if (irt_isnum(ir->t)) {
2056 IRRef lref = ir->op1; 2379 IRRef lref = ir->op1;
2057 IRRef rref = ir->op2; 2380 IRRef rref = ir->op2;
@@ -2072,7 +2395,6 @@ static void asm_comp(ASMState *as, IRIns *ir, uint32_t cc)
2072 cc ^= (VCC_PS|(5<<4)); /* A <-> B, AE <-> BE, PS <-> none */ 2395 cc ^= (VCC_PS|(5<<4)); /* A <-> B, AE <-> BE, PS <-> none */
2073 } 2396 }
2074 left = ra_alloc1(as, lref, RSET_FPR); 2397 left = ra_alloc1(as, lref, RSET_FPR);
2075 right = asm_fuseload(as, rref, rset_exclude(RSET_FPR, left));
2076 l_around = emit_label(as); 2398 l_around = emit_label(as);
2077 asm_guardcc(as, cc >> 4); 2399 asm_guardcc(as, cc >> 4);
2078 if (cc & VCC_P) { /* Extra CC_P branch required? */ 2400 if (cc & VCC_P) { /* Extra CC_P branch required? */
@@ -2089,6 +2411,7 @@ static void asm_comp(ASMState *as, IRIns *ir, uint32_t cc)
2089 emit_jcc(as, CC_P, as->mcp); 2411 emit_jcc(as, CC_P, as->mcp);
2090 } 2412 }
2091 } 2413 }
2414 right = asm_fuseload(as, rref, rset_exclude(RSET_FPR, left));
2092 emit_mrm(as, XO_UCOMISD, left, right); 2415 emit_mrm(as, XO_UCOMISD, left, right);
2093 } else { 2416 } else {
2094 IRRef lref = ir->op1, rref = ir->op2; 2417 IRRef lref = ir->op1, rref = ir->op2;
@@ -2206,6 +2529,8 @@ static void asm_comp(ASMState *as, IRIns *ir, uint32_t cc)
2206 } 2529 }
2207} 2530}
2208 2531
2532#define asm_equal(as, ir) asm_comp(as, ir)
2533
2209#if LJ_32 && LJ_HASFFI 2534#if LJ_32 && LJ_HASFFI
2210/* 64 bit integer comparisons in 32 bit mode. */ 2535/* 64 bit integer comparisons in 32 bit mode. */
2211static void asm_comp_int64(ASMState *as, IRIns *ir) 2536static void asm_comp_int64(ASMState *as, IRIns *ir)
@@ -2288,13 +2613,9 @@ static void asm_hiop(ASMState *as, IRIns *ir)
2288 int uselo = ra_used(ir-1), usehi = ra_used(ir); /* Loword/hiword used? */ 2613 int uselo = ra_used(ir-1), usehi = ra_used(ir); /* Loword/hiword used? */
2289 if (LJ_UNLIKELY(!(as->flags & JIT_F_OPT_DCE))) uselo = usehi = 1; 2614 if (LJ_UNLIKELY(!(as->flags & JIT_F_OPT_DCE))) uselo = usehi = 1;
2290 if ((ir-1)->o == IR_CONV) { /* Conversions to/from 64 bit. */ 2615 if ((ir-1)->o == IR_CONV) { /* Conversions to/from 64 bit. */
2291 if (usehi || uselo) {
2292 if (irt_isfp(ir->t))
2293 asm_conv_fp_int64(as, ir);
2294 else
2295 asm_conv_int64_fp(as, ir);
2296 }
2297 as->curins--; /* Always skip the CONV. */ 2616 as->curins--; /* Always skip the CONV. */
2617 if (usehi || uselo)
2618 asm_conv64(as, ir);
2298 return; 2619 return;
2299 } else if ((ir-1)->o <= IR_NE) { /* 64 bit integer comparisons. ORDER IR. */ 2620 } else if ((ir-1)->o <= IR_NE) { /* 64 bit integer comparisons. ORDER IR. */
2300 asm_comp_int64(as, ir); 2621 asm_comp_int64(as, ir);
@@ -2343,6 +2664,16 @@ static void asm_hiop(ASMState *as, IRIns *ir)
2343#endif 2664#endif
2344} 2665}
2345 2666
2667/* -- Profiling ----------------------------------------------------------- */
2668
2669static void asm_prof(ASMState *as, IRIns *ir)
2670{
2671 UNUSED(ir);
2672 asm_guardcc(as, CC_NE);
2673 emit_i8(as, HOOK_PROFILE);
2674 emit_rma(as, XO_GROUP3b, XOg_TEST, &J2G(as->J)->hookmask);
2675}
2676
2346/* -- Stack handling ------------------------------------------------------ */ 2677/* -- Stack handling ------------------------------------------------------ */
2347 2678
2348/* Check Lua stack size for overflow. Use exit handler as fallback. */ 2679/* Check Lua stack size for overflow. Use exit handler as fallback. */
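
asm_prof above is just a guarded flag test: the trace exits whenever the profiler hook bit is set in g->hookmask. A model (the flag value is illustrative, not lj_dispatch.h's):

#include <stdint.h>
#include <assert.h>

enum { HOOK_PROFILE = 0x10 };  /* illustrative */

/* test byte [g->hookmask], HOOK_PROFILE; jne ->exit */
static int must_exit_trace(uint8_t hookmask)
{
  return (hookmask & HOOK_PROFILE) != 0;  /* ZF clear -> guard fires */
}

int main(void)
{
  assert(!must_exit_trace(0x00));
  assert(must_exit_trace(0x10 | 0x01));
  return 0;
}
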
@@ -2357,14 +2688,19 @@ static void asm_stack_check(ASMState *as, BCReg topslot,
2357 emit_rmro(as, XO_MOV, r|REX_64, RID_ESP, 0); 2688 emit_rmro(as, XO_MOV, r|REX_64, RID_ESP, 0);
2358 else 2689 else
2359 ra_modified(as, r); 2690 ra_modified(as, r);
2360 emit_gri(as, XG_ARITHi(XOg_CMP), r, (int32_t)(8*topslot)); 2691 emit_gri(as, XG_ARITHi(XOg_CMP), r|REX_GC64, (int32_t)(8*topslot));
2361 if (ra_hasreg(pbase) && pbase != r) 2692 if (ra_hasreg(pbase) && pbase != r)
2362 emit_rr(as, XO_ARITH(XOg_SUB), r, pbase); 2693 emit_rr(as, XO_ARITH(XOg_SUB), r|REX_GC64, pbase);
2363 else 2694 else
2695#if LJ_GC64
2696 emit_rmro(as, XO_ARITH(XOg_SUB), r|REX_64, RID_DISPATCH,
2697 (int32_t)dispofs(as, &J2G(as->J)->jit_base));
2698#else
2364 emit_rmro(as, XO_ARITH(XOg_SUB), r, RID_NONE, 2699 emit_rmro(as, XO_ARITH(XOg_SUB), r, RID_NONE,
2365 ptr2addr(&J2G(as->J)->jit_base)); 2700 ptr2addr(&J2G(as->J)->jit_base));
2366 emit_rmro(as, XO_MOV, r, r, offsetof(lua_State, maxstack)); 2701#endif
2367 emit_getgl(as, r, jit_L); 2702 emit_rmro(as, XO_MOV, r|REX_GC64, r, offsetof(lua_State, maxstack));
2703 emit_getgl(as, r, cur_L);
2368 if (allow == RSET_EMPTY) /* Spill temp. register. */ 2704 if (allow == RSET_EMPTY) /* Spill temp. register. */
2369 emit_rmro(as, XO_MOVto, r|REX_64, RID_ESP, 0); 2705 emit_rmro(as, XO_MOVto, r|REX_64, RID_ESP, 0);
2370} 2706}
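
The check computes the remaining stack headroom into a scratch register (maxstack minus the current base, which under LJ_GC64 comes from g->jit_base addressed relative to DISPATCH) and takes the exit when it is below 8*topslot bytes, one 8-byte slot per needed stack position. The comparison, modeled directly:

#include <stdint.h>
#include <assert.h>

/* Headroom must cover 8 bytes per required slot; the guard exits on
** an unsigned below (CC_B). */
static int stack_ok(uintptr_t base, uintptr_t maxstack, uint32_t topslot)
{
  return (maxstack - base) >= (uintptr_t)(8 * topslot);
}

int main(void)
{
  assert(stack_ok(0x1000, 0x1200, 32));   /* 0x200 >= 0x100 */
  assert(!stack_ok(0x1000, 0x1080, 32));  /* 0x080 <  0x100 */
  return 0;
}
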
@@ -2373,13 +2709,15 @@ static void asm_stack_check(ASMState *as, BCReg topslot,
2373static void asm_stack_restore(ASMState *as, SnapShot *snap) 2709static void asm_stack_restore(ASMState *as, SnapShot *snap)
2374{ 2710{
2375 SnapEntry *map = &as->T->snapmap[snap->mapofs]; 2711 SnapEntry *map = &as->T->snapmap[snap->mapofs];
2376 SnapEntry *flinks = &as->T->snapmap[snap_nextofs(as->T, snap)-1]; 2712#if !LJ_FR2 || defined(LUA_USE_ASSERT)
2713 SnapEntry *flinks = &as->T->snapmap[snap_nextofs(as->T, snap)-1-LJ_FR2];
2714#endif
2377 MSize n, nent = snap->nent; 2715 MSize n, nent = snap->nent;
2378 /* Store the value of all modified slots to the Lua stack. */ 2716 /* Store the value of all modified slots to the Lua stack. */
2379 for (n = 0; n < nent; n++) { 2717 for (n = 0; n < nent; n++) {
2380 SnapEntry sn = map[n]; 2718 SnapEntry sn = map[n];
2381 BCReg s = snap_slot(sn); 2719 BCReg s = snap_slot(sn);
2382 int32_t ofs = 8*((int32_t)s-1); 2720 int32_t ofs = 8*((int32_t)s-1-LJ_FR2);
2383 IRRef ref = snap_ref(sn); 2721 IRRef ref = snap_ref(sn);
2384 IRIns *ir = IR(ref); 2722 IRIns *ir = IR(ref);
2385 if ((sn & SNAP_NORESTORE)) 2723 if ((sn & SNAP_NORESTORE))
@@ -2392,16 +2730,44 @@ static void asm_stack_restore(ASMState *as, SnapShot *snap)
2392 (LJ_DUALNUM && irt_isinteger(ir->t))); 2730 (LJ_DUALNUM && irt_isinteger(ir->t)));
2393 if (!irref_isk(ref)) { 2731 if (!irref_isk(ref)) {
2394 Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, RID_BASE)); 2732 Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, RID_BASE));
2733#if LJ_GC64
2734 if (irt_is64(ir->t)) {
2735 /* TODO: 64 bit store + 32 bit load-modify-store is suboptimal. */
2736 emit_u32(as, irt_toitype(ir->t) << 15);
2737 emit_rmro(as, XO_ARITHi, XOg_OR, RID_BASE, ofs+4);
2738 } else if (LJ_DUALNUM && irt_isinteger(ir->t)) {
2739 emit_movmroi(as, RID_BASE, ofs+4, LJ_TISNUM << 15);
2740 } else {
2741 emit_movmroi(as, RID_BASE, ofs+4, (irt_toitype(ir->t)<<15)|0x7fff);
2742 }
2743#endif
2395 emit_movtomro(as, REX_64IR(ir, src), RID_BASE, ofs); 2744 emit_movtomro(as, REX_64IR(ir, src), RID_BASE, ofs);
2745#if LJ_GC64
2746 } else {
2747 TValue k;
2748 lj_ir_kvalue(as->J->L, &k, ir);
2749 if (tvisnil(&k)) {
2750 emit_i32(as, -1);
2751 emit_rmro(as, XO_MOVmi, REX_64, RID_BASE, ofs);
2752 } else {
2753 emit_movmroi(as, RID_BASE, ofs+4, k.u32.hi);
2754 emit_movmroi(as, RID_BASE, ofs, k.u32.lo);
2755 }
2756#else
2396 } else if (!irt_ispri(ir->t)) { 2757 } else if (!irt_ispri(ir->t)) {
2397 emit_movmroi(as, RID_BASE, ofs, ir->i); 2758 emit_movmroi(as, RID_BASE, ofs, ir->i);
2759#endif
2398 } 2760 }
2399 if ((sn & (SNAP_CONT|SNAP_FRAME))) { 2761 if ((sn & (SNAP_CONT|SNAP_FRAME))) {
2762#if !LJ_FR2
2400 if (s != 0) /* Do not overwrite link to previous frame. */ 2763 if (s != 0) /* Do not overwrite link to previous frame. */
2401 emit_movmroi(as, RID_BASE, ofs+4, (int32_t)(*flinks--)); 2764 emit_movmroi(as, RID_BASE, ofs+4, (int32_t)(*flinks--));
2765#endif
2766#if !LJ_GC64
2402 } else { 2767 } else {
2403 if (!(LJ_64 && irt_islightud(ir->t))) 2768 if (!(LJ_64 && irt_islightud(ir->t)))
2404 emit_movmroi(as, RID_BASE, ofs+4, irt_toitype(ir->t)); 2769 emit_movmroi(as, RID_BASE, ofs+4, irt_toitype(ir->t));
2770#endif
2405 } 2771 }
2406 } 2772 }
2407 checkmclim(as); 2773 checkmclim(as);
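
For a non-constant 64-bit value the restore path stores the raw payload with a qword move and then ORs (itype << 15) into the slot's high dword, stamping the tag onto bits 47..63 (the load-modify-store the TODO calls suboptimal). A little-endian model with a placeholder tag value:

#include <stdint.h>
#include <string.h>
#include <assert.h>

/* mov [base+ofs], r64; or dword [base+ofs+4], itype<<15
** (assumes little-endian and a payload with zero upper 17 bits). */
static uint64_t restore_slot(uint64_t payload47, uint32_t itype)
{
  uint8_t slot[8];
  uint64_t v; uint32_t hi;
  memcpy(slot, &payload47, 8);   /* qword store of the payload */
  memcpy(&hi, slot + 4, 4);
  hi |= itype << 15;             /* or dword [ofs+4], itype<<15 */
  memcpy(slot + 4, &hi, 4);
  memcpy(&v, slot, 8);
  return v;
}

int main(void)
{
  uint64_t v = restore_slot(0x12345678ull, 0x1fffb);
  assert((v >> 47) == 0x1fffb && (v & 0xfff) == 0x678);
  return 0;
}
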
@@ -2427,11 +2793,15 @@ static void asm_gc_check(ASMState *as)
2427 args[1] = ASMREF_TMP2; /* MSize steps */ 2793 args[1] = ASMREF_TMP2; /* MSize steps */
2428 asm_gencall(as, ci, args); 2794 asm_gencall(as, ci, args);
2429 tmp = ra_releasetmp(as, ASMREF_TMP1); 2795 tmp = ra_releasetmp(as, ASMREF_TMP1);
2796#if LJ_GC64
2797 emit_rmro(as, XO_LEA, tmp|REX_64, RID_DISPATCH, GG_DISP2G);
2798#else
2430 emit_loada(as, tmp, J2G(as->J)); 2799 emit_loada(as, tmp, J2G(as->J));
2800#endif
2431 emit_loadi(as, ra_releasetmp(as, ASMREF_TMP2), as->gcsteps); 2801 emit_loadi(as, ra_releasetmp(as, ASMREF_TMP2), as->gcsteps);
2432 /* Jump around GC step if GC total < GC threshold. */ 2802 /* Jump around GC step if GC total < GC threshold. */
2433 emit_sjcc(as, CC_B, l_end); 2803 emit_sjcc(as, CC_B, l_end);
2434 emit_opgl(as, XO_ARITH(XOg_CMP), tmp, gc.threshold); 2804 emit_opgl(as, XO_ARITH(XOg_CMP), tmp|REX_GC64, gc.threshold);
2435 emit_getgl(as, tmp, gc.total); 2805 emit_getgl(as, tmp, gc.total);
2436 as->gcsteps = 0; 2806 as->gcsteps = 0;
2437 checkmclim(as); 2807 checkmclim(as);
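
The fast path loads gc.total, compares it against gc.threshold and branches around the lj_gc_step_jit call while the total is still below the threshold (the CC_B jump to l_end). In outline:

#include <stddef.h>

/* Skip the GC step while gc.total < gc.threshold. */
typedef struct { size_t total, threshold; } GCModel;

static int needs_gc_step(const GCModel *g)
{
  return g->total >= g->threshold;  /* otherwise: jb l_end */
}

int main(void)
{
  GCModel g = { 1000, 4096 };
  return needs_gc_step(&g);  /* 0: below threshold, step skipped */
}
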
@@ -2496,7 +2866,7 @@ static void asm_head_root_base(ASMState *as)
2496 if (rset_test(as->modset, r) || irt_ismarked(ir->t)) 2866 if (rset_test(as->modset, r) || irt_ismarked(ir->t))
2497 ir->r = RID_INIT; /* No inheritance for modified BASE register. */ 2867 ir->r = RID_INIT; /* No inheritance for modified BASE register. */
2498 if (r != RID_BASE) 2868 if (r != RID_BASE)
2499 emit_rr(as, XO_MOV, r, RID_BASE); 2869 emit_rr(as, XO_MOV, r|REX_GC64, RID_BASE);
2500 } 2870 }
2501} 2871}
2502 2872
@@ -2512,8 +2882,9 @@ static RegSet asm_head_side_base(ASMState *as, IRIns *irp, RegSet allow)
2512 if (irp->r == r) { 2882 if (irp->r == r) {
2513 rset_clear(allow, r); /* Mark same BASE register as coalesced. */ 2883 rset_clear(allow, r); /* Mark same BASE register as coalesced. */
2514 } else if (ra_hasreg(irp->r) && rset_test(as->freeset, irp->r)) { 2884 } else if (ra_hasreg(irp->r) && rset_test(as->freeset, irp->r)) {
2885 /* Move from coalesced parent reg. */
2515 rset_clear(allow, irp->r); 2886 rset_clear(allow, irp->r);
2516 emit_rr(as, XO_MOV, r, irp->r); /* Move from coalesced parent reg. */ 2887 emit_rr(as, XO_MOV, r|REX_GC64, irp->r);
2517 } else { 2888 } else {
2518 emit_getgl(as, r, jit_base); /* Otherwise reload BASE. */ 2889 emit_getgl(as, r, jit_base); /* Otherwise reload BASE. */
2519 } 2890 }
@@ -2592,163 +2963,6 @@ static void asm_tail_prep(ASMState *as)
2592 } 2963 }
2593} 2964}
2594 2965
2595/* -- Instruction dispatch ------------------------------------------------ */
2596
2597/* Assemble a single instruction. */
2598static void asm_ir(ASMState *as, IRIns *ir)
2599{
2600 switch ((IROp)ir->o) {
2601 /* Miscellaneous ops. */
2602 case IR_LOOP: asm_loop(as); break;
2603 case IR_NOP: case IR_XBAR: lua_assert(!ra_used(ir)); break;
2604 case IR_USE:
2605 ra_alloc1(as, ir->op1, irt_isfp(ir->t) ? RSET_FPR : RSET_GPR); break;
2606 case IR_PHI: asm_phi(as, ir); break;
2607 case IR_HIOP: asm_hiop(as, ir); break;
2608 case IR_GCSTEP: asm_gcstep(as, ir); break;
2609
2610 /* Guarded assertions. */
2611 case IR_LT: case IR_GE: case IR_LE: case IR_GT:
2612 case IR_ULT: case IR_UGE: case IR_ULE: case IR_UGT:
2613 case IR_EQ: case IR_NE: case IR_ABC:
2614 asm_comp(as, ir, asm_compmap[ir->o]);
2615 break;
2616
2617 case IR_RETF: asm_retf(as, ir); break;
2618
2619 /* Bit ops. */
2620 case IR_BNOT: asm_neg_not(as, ir, XOg_NOT); break;
2621 case IR_BSWAP: asm_bitswap(as, ir); break;
2622
2623 case IR_BAND: asm_intarith(as, ir, XOg_AND); break;
2624 case IR_BOR: asm_intarith(as, ir, XOg_OR); break;
2625 case IR_BXOR: asm_intarith(as, ir, XOg_XOR); break;
2626
2627 case IR_BSHL: asm_bitshift(as, ir, XOg_SHL); break;
2628 case IR_BSHR: asm_bitshift(as, ir, XOg_SHR); break;
2629 case IR_BSAR: asm_bitshift(as, ir, XOg_SAR); break;
2630 case IR_BROL: asm_bitshift(as, ir, XOg_ROL); break;
2631 case IR_BROR: asm_bitshift(as, ir, XOg_ROR); break;
2632
2633 /* Arithmetic ops. */
2634 case IR_ADD: asm_add(as, ir); break;
2635 case IR_SUB:
2636 if (irt_isnum(ir->t))
2637 asm_fparith(as, ir, XO_SUBSD);
2638 else /* Note: no need for LEA trick here. i-k is encoded as i+(-k). */
2639 asm_intarith(as, ir, XOg_SUB);
2640 break;
2641 case IR_MUL:
2642 if (irt_isnum(ir->t))
2643 asm_fparith(as, ir, XO_MULSD);
2644 else
2645 asm_intarith(as, ir, XOg_X_IMUL);
2646 break;
2647 case IR_DIV:
2648#if LJ_64 && LJ_HASFFI
2649 if (!irt_isnum(ir->t))
2650 asm_arith64(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_divi64 :
2651 IRCALL_lj_carith_divu64);
2652 else
2653#endif
2654 asm_fparith(as, ir, XO_DIVSD);
2655 break;
2656 case IR_MOD:
2657#if LJ_64 && LJ_HASFFI
2658 if (!irt_isint(ir->t))
2659 asm_arith64(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_modi64 :
2660 IRCALL_lj_carith_modu64);
2661 else
2662#endif
2663 asm_intmod(as, ir);
2664 break;
2665
2666 case IR_NEG:
2667 if (irt_isnum(ir->t))
2668 asm_fparith(as, ir, XO_XORPS);
2669 else
2670 asm_neg_not(as, ir, XOg_NEG);
2671 break;
2672 case IR_ABS: asm_fparith(as, ir, XO_ANDPS); break;
2673
2674 case IR_MIN:
2675 if (irt_isnum(ir->t))
2676 asm_fparith(as, ir, XO_MINSD);
2677 else
2678 asm_min_max(as, ir, CC_G);
2679 break;
2680 case IR_MAX:
2681 if (irt_isnum(ir->t))
2682 asm_fparith(as, ir, XO_MAXSD);
2683 else
2684 asm_min_max(as, ir, CC_L);
2685 break;
2686
2687 case IR_FPMATH: case IR_ATAN2: case IR_LDEXP:
2688 asm_fpmath(as, ir);
2689 break;
2690 case IR_POW:
2691#if LJ_64 && LJ_HASFFI
2692 if (!irt_isnum(ir->t))
2693 asm_arith64(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 :
2694 IRCALL_lj_carith_powu64);
2695 else
2696#endif
2697 asm_fppowi(as, ir);
2698 break;
2699
2700 /* Overflow-checking arithmetic ops. Note: don't use LEA here! */
2701 case IR_ADDOV: asm_intarith(as, ir, XOg_ADD); break;
2702 case IR_SUBOV: asm_intarith(as, ir, XOg_SUB); break;
2703 case IR_MULOV: asm_intarith(as, ir, XOg_X_IMUL); break;
2704
2705 /* Memory references. */
2706 case IR_AREF: asm_aref(as, ir); break;
2707 case IR_HREF: asm_href(as, ir); break;
2708 case IR_HREFK: asm_hrefk(as, ir); break;
2709 case IR_NEWREF: asm_newref(as, ir); break;
2710 case IR_UREFO: case IR_UREFC: asm_uref(as, ir); break;
2711 case IR_FREF: asm_fref(as, ir); break;
2712 case IR_STRREF: asm_strref(as, ir); break;
2713
2714 /* Loads and stores. */
2715 case IR_ALOAD: case IR_HLOAD: case IR_ULOAD: case IR_VLOAD:
2716 asm_ahuvload(as, ir);
2717 break;
2718 case IR_FLOAD: case IR_XLOAD: asm_fxload(as, ir); break;
2719 case IR_SLOAD: asm_sload(as, ir); break;
2720
2721 case IR_ASTORE: case IR_HSTORE: case IR_USTORE: asm_ahustore(as, ir); break;
2722 case IR_FSTORE: case IR_XSTORE: asm_fxstore(as, ir); break;
2723
2724 /* Allocations. */
2725 case IR_SNEW: case IR_XSNEW: asm_snew(as, ir); break;
2726 case IR_TNEW: asm_tnew(as, ir); break;
2727 case IR_TDUP: asm_tdup(as, ir); break;
2728 case IR_CNEW: case IR_CNEWI: asm_cnew(as, ir); break;
2729
2730 /* Write barriers. */
2731 case IR_TBAR: asm_tbar(as, ir); break;
2732 case IR_OBAR: asm_obar(as, ir); break;
2733
2734 /* Type conversions. */
2735 case IR_TOBIT: asm_tobit(as, ir); break;
2736 case IR_CONV: asm_conv(as, ir); break;
2737 case IR_TOSTR: asm_tostr(as, ir); break;
2738 case IR_STRTO: asm_strto(as, ir); break;
2739
2740 /* Calls. */
2741 case IR_CALLN: case IR_CALLL: case IR_CALLS: asm_call(as, ir); break;
2742 case IR_CALLXS: asm_callx(as, ir); break;
2743 case IR_CARG: break;
2744
2745 default:
2746 setintV(&as->J->errinfo, ir->o);
2747 lj_trace_err_info(as->J, LJ_TRERR_NYIIR);
2748 break;
2749 }
2750}
2751
2752/* -- Trace setup --------------------------------------------------------- */ 2966/* -- Trace setup --------------------------------------------------------- */
2753 2967
2754/* Ensure there are enough stack slots for call arguments. */ 2968/* Ensure there are enough stack slots for call arguments. */
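
With every op now behind a uniform asm_<name>(as, ir) entry point (function or macro), the per-instruction switch can be hoisted into the target-independent part of the assembler instead of being repeated per backend. A toy model of that dispatch style, not LuaJIT's actual lj_asm.c machinery:

#include <stdio.h>

typedef struct ASMState { int dummy; } ASMState;
typedef struct IRIns { int dummy; } IRIns;

static void asm_add(ASMState *as, IRIns *ir) { (void)as; (void)ir; puts("ADD"); }
static void asm_sub(ASMState *as, IRIns *ir) { (void)as; (void)ir; puts("SUB"); }

/* One op list drives both the enum and the dispatch table. */
#define OPDEF(_) _(ADD, add) _(SUB, sub)

enum {
#define OPENUM(name, fn) IR_##name,
OPDEF(OPENUM)
#undef OPENUM
IR__MAX
};

typedef void (*AsmFn)(ASMState *, IRIns *);
static const AsmFn asm_dispatch[IR__MAX] = {
#define OPFN(name, fn) asm_##fn,
OPDEF(OPFN)
#undef OPFN
};

int main(void)
{
  ASMState as; IRIns ir;
  asm_dispatch[IR_ADD](&as, &ir);
  asm_dispatch[IR_SUB](&as, &ir);
  return 0;
}
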
@@ -2771,6 +2985,7 @@ static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci)
2771static void asm_setup_target(ASMState *as) 2985static void asm_setup_target(ASMState *as)
2772{ 2986{
2773 asm_exitstub_setup(as, as->T->nsnap); 2987 asm_exitstub_setup(as, as->T->nsnap);
2988 as->mrm.base = 0;
2774} 2989}
2775 2990
2776/* -- Trace patching ------------------------------------------------------ */ 2991/* -- Trace patching ------------------------------------------------------ */
@@ -2883,13 +3098,19 @@ void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target)
2883 MSize len = T->szmcode; 3098 MSize len = T->szmcode;
2884 MCode *px = exitstub_addr(J, exitno) - 6; 3099 MCode *px = exitstub_addr(J, exitno) - 6;
2885 MCode *pe = p+len-6; 3100 MCode *pe = p+len-6;
2886 uint32_t stateaddr = u32ptr(&J2G(J)->vmstate); 3101#if LJ_GC64
3102 uint32_t statei = (uint32_t)(GG_OFS(g.vmstate) - GG_OFS(dispatch));
3103#else
3104 uint32_t statei = u32ptr(&J2G(J)->vmstate);
3105#endif
2887 if (len > 5 && p[len-5] == XI_JMP && p+len-6 + *(int32_t *)(p+len-4) == px) 3106 if (len > 5 && p[len-5] == XI_JMP && p+len-6 + *(int32_t *)(p+len-4) == px)
2888 *(int32_t *)(p+len-4) = jmprel(p+len, target); 3107 *(int32_t *)(p+len-4) = jmprel(p+len, target);
2889 /* Do not patch parent exit for a stack check. Skip beyond vmstate update. */ 3108 /* Do not patch parent exit for a stack check. Skip beyond vmstate update. */
2890 for (; p < pe; p += asm_x86_inslen(p)) 3109 for (; p < pe; p += asm_x86_inslen(p)) {
2891 if (*(uint32_t *)(p+(LJ_64 ? 3 : 2)) == stateaddr && p[0] == XI_MOVmi) 3110 intptr_t ofs = LJ_GC64 ? (p[0] & 0xf0) == 0x40 : LJ_64;
3111 if (*(uint32_t *)(p+2+ofs) == statei && p[ofs+LJ_GC64-LJ_64] == XI_MOVmi)
2892 break; 3112 break;
3113 }
2893 lua_assert(p < pe); 3114 lua_assert(p < pe);
2894 for (; p < pe; p += asm_x86_inslen(p)) 3115 for (; p < pe; p += asm_x86_inslen(p))
2895 if ((*(uint16_t *)p & 0xf0ff) == 0x800f && p + *(int32_t *)(p+2) == px) 3116 if ((*(uint16_t *)p & 0xf0ff) == 0x800f && p + *(int32_t *)(p+2) == px)
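
Under LJ_GC64 the vmstate update is a store relative to the DISPATCH register rather than to an absolute address, so the scan above must allow for a REX prefix (0x40..0x4F) and match a dispatch-relative displacement instead of a pointer. A decoder model for the GC64 shape (the example instruction bytes are illustrative):

#include <stdint.h>
#include <string.h>
#include <assert.h>

enum { XI_MOVmi = 0xC7 };

/* mov dword [rDISPATCH+disp32], imm32: opcode 0xC7 /0, optionally
** preceded by a REX prefix, so disp32 starts at p+2 or p+3. */
static int is_vmstate_store(const uint8_t *p, uint32_t statei)
{
  int rex = (p[0] & 0xf0) == 0x40;  /* REX prefix present? */
  uint32_t disp;
  if (p[rex] != XI_MOVmi) return 0;
  memcpy(&disp, p + 2 + rex, 4);    /* disp32 after opcode+ModRM */
  return disp == statei;
}

int main(void)
{
  /* mov dword [r14+0x500], 42 == REX.B C7 86 disp32 imm32 */
  uint8_t ins[11] = { 0x41, 0xC7, 0x86, 0x00, 0x05, 0x00, 0x00,
                      42, 0, 0, 0 };
  assert(is_vmstate_store(ins, 0x500));
  return 0;
}
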