Diffstat (limited to 'src/lj_asm.c')
 src/lj_asm.c | 111 ++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 72 insertions(+), 39 deletions(-)
diff --git a/src/lj_asm.c b/src/lj_asm.c
index c2cc4342..eb14b0e5 100644
--- a/src/lj_asm.c
+++ b/src/lj_asm.c
@@ -1991,9 +1991,19 @@ static int fpmjoin_pow(ASMState *as, IRIns *ir)
     IRIns *irpp = IR(irp->op1);
     if (irpp == ir-2 && irpp->o == IR_FPMATH &&
	irpp->op2 == IRFPM_LOG2 && !ra_used(irpp)) {
-      emit_call(as, lj_vm_pow);  /* st0 = lj_vm_pow(st1, st0) */
-      asm_x87load(as, irp->op2);
-      asm_x87load(as, irpp->op1);
+      /* The modified regs must match with the *.dasc implementation. */
+      RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM2+1)|RID2RSET(RID_EAX);
+      IRIns *irx;
+      if (ra_hasreg(ir->r))
+	rset_clear(drop, ir->r);  /* Dest reg handled below. */
+      ra_evictset(as, drop);
+      ra_destreg(as, ir, RID_XMM0);
+      emit_call(as, lj_vm_pow_sse);
+      irx = IR(irpp->op1);
+      if (ra_noreg(irx->r) && ra_gethint(irx->r) == RID_XMM1)
+	irx->r = RID_INIT;  /* Avoid allocating xmm1 for x. */
+      ra_left(as, RID_XMM0, irpp->op1);
+      ra_left(as, RID_XMM1, irp->op2);
       return 1;
     }
   }
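
The IR pattern matched above is EXP2(MUL(y, LOG2(x))), i.e. the power identity x^y == 2^(y*log2(x)); the rejoined call hands x to lj_vm_pow_sse in xmm0 and y in xmm1. A minimal C sketch of the identity being exploited (illustrative only; pow_via_exp2 is a hypothetical name, not LuaJIT's helper):

#include <math.h>

/* Sketch: the math behind the join, assuming x > 0 so log2(x) is defined. */
static double pow_via_exp2(double x, double y)
{
  return exp2(y * log2(x));  /* Same shape as the EXP2(MUL(y, LOG2(x))) IR. */
}
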
@@ -2007,30 +2017,35 @@ static void asm_fpmath(ASMState *as, IRIns *ir)
     Reg dest = ra_dest(as, ir, RSET_FPR);
     Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
     emit_mrm(as, XO_SQRTSD, dest, left);
-  } else if ((as->flags & JIT_F_SSE4_1) && fpm <= IRFPM_TRUNC) {
-    Reg dest = ra_dest(as, ir, RSET_FPR);
-    Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
-    /* Round down/up/trunc == 1001/1010/1011. */
-    emit_i8(as, 0x09 + fpm);
-    /* ROUNDSD has a 4-byte opcode which doesn't fit in x86Op. */
-    emit_mrm(as, XO_ROUNDSD, dest, left);
-    /* Let's pretend it's a 3-byte opcode, and compensate afterwards. */
-    /* This is atrocious, but the alternatives are much worse. */
-    if (LJ_64 && as->mcp[1] != (MCode)(XO_ROUNDSD >> 16)) {
-      as->mcp[0] = as->mcp[1]; as->mcp[1] = 0x0f;  /* Swap 0F and REX. */
-    }
-    *--as->mcp = 0x66;  /* 1st byte of ROUNDSD opcode. */
   } else if (fpm <= IRFPM_TRUNC) {
-    /* The modified regs must match with the *.dasc implementation. */
-    RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM3+1)|RID2RSET(RID_EAX);
-    if (ra_hasreg(ir->r))
-      rset_clear(drop, ir->r);  /* Dest reg handled below. */
-    ra_evictset(as, drop);
-    ra_destreg(as, ir, RID_XMM0);
-    emit_call(as, fpm == IRFPM_FLOOR ? lj_vm_floor_sse :
-	      fpm == IRFPM_CEIL ? lj_vm_ceil_sse : lj_vm_trunc_sse);
-    ra_left(as, RID_XMM0, ir->op1);
-  } else {
+    if (as->flags & JIT_F_SSE4_1) {  /* SSE4.1 has a rounding instruction. */
+      Reg dest = ra_dest(as, ir, RSET_FPR);
+      Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
+      /* ROUNDSD has a 4-byte opcode which doesn't fit in x86Op.
+      ** Let's pretend it's a 3-byte opcode, and compensate afterwards.
+      ** This is atrocious, but the alternatives are much worse.
+      */
+      /* Round down/up/trunc == 1001/1010/1011. */
+      emit_i8(as, 0x09 + fpm);
+      emit_mrm(as, XO_ROUNDSD, dest, left);
+      if (LJ_64 && as->mcp[1] != (MCode)(XO_ROUNDSD >> 16)) {
+	as->mcp[0] = as->mcp[1]; as->mcp[1] = 0x0f;  /* Swap 0F and REX. */
+      }
+      *--as->mcp = 0x66;  /* 1st byte of ROUNDSD opcode. */
+    } else {  /* Call helper functions for SSE2 variant. */
+      /* The modified regs must match with the *.dasc implementation. */
+      RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM3+1)|RID2RSET(RID_EAX);
+      if (ra_hasreg(ir->r))
+	rset_clear(drop, ir->r);  /* Dest reg handled below. */
+      ra_evictset(as, drop);
+      ra_destreg(as, ir, RID_XMM0);
+      emit_call(as, fpm == IRFPM_FLOOR ? lj_vm_floor_sse :
+		fpm == IRFPM_CEIL ? lj_vm_ceil_sse : lj_vm_trunc_sse);
+      ra_left(as, RID_XMM0, ir->op1);
+    }
+  } else if (fpm == IRFPM_EXP2 && fpmjoin_pow(as, ir)) {
+    /* Rejoined to pow(). */
+  } else {  /* Handle x87 ops. */
     int32_t ofs = sps_scale(ir->s);  /* Use spill slot or slots SPS_TEMP1/2. */
     Reg dest = ir->r;
     if (ra_hasreg(dest)) {
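
The prefix juggling above exists because ROUNDSD encodes as 66 0F 3A 0B /r ib, and any REX byte must sit between the 66 prefix and the 0F escape. The emitter writes what it thinks is a 3-byte opcode, so the 66 is prepended last and the REX/0F order is fixed up by hand. A byte-level sketch of the target encoding (registers chosen arbitrarily for illustration):

/* Sketch: hand-assembled `roundsd xmm0, xmm1, 0x0B` (truncate). */
static const unsigned char roundsd_trunc[] = {
  0x66, 0x0F, 0x3A, 0x0B,  /* Opcode; a REX byte, if needed, goes after 0x66. */
  0xC1,                    /* ModRM: mod=11, reg=xmm0, rm=xmm1. */
  0x0B                     /* imm8 1011: round toward zero, inexact suppressed. */
};

The immediates 0x09/0x0A/0x0B line up with floor/ceil/trunc because bit 3 of the imm8 suppresses the precision exception and bits 1:0 select the rounding mode, which is exactly what emit_i8(as, 0x09 + fpm) relies on.
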
@@ -2040,14 +2055,8 @@ static void asm_fpmath(ASMState *as, IRIns *ir)
     }
     emit_rmro(as, XO_FSTPq, XOg_FSTPq, RID_ESP, ofs);
     switch (fpm) {  /* st0 = lj_vm_*(st0) */
-    case IRFPM_FLOOR: emit_call(as, lj_vm_floor); break;
-    case IRFPM_CEIL: emit_call(as, lj_vm_ceil); break;
-    case IRFPM_TRUNC: emit_call(as, lj_vm_trunc); break;
     case IRFPM_EXP: emit_call(as, lj_vm_exp); break;
-    case IRFPM_EXP2:
-      if (fpmjoin_pow(as, ir)) return;
-      emit_call(as, lj_vm_exp2);  /* st0 = lj_vm_exp2(st0) */
-      break;
+    case IRFPM_EXP2: emit_call(as, lj_vm_exp2); break;
     case IRFPM_SIN: emit_x87op(as, XI_FSIN); break;
     case IRFPM_COS: emit_x87op(as, XI_FCOS); break;
     case IRFPM_TAN: emit_x87op(as, XI_FPOP); emit_x87op(as, XI_FPTAN); break;
@@ -2063,10 +2072,6 @@ static void asm_fpmath(ASMState *as, IRIns *ir)
       emit_x87op(as, XI_FPATAN); asm_x87load(as, ir->op2); break;
     case IR_LDEXP:
       emit_x87op(as, XI_FPOP1); emit_x87op(as, XI_FSCALE); break;
-    case IR_POWI:
-      emit_call(as, lj_vm_powi);  /* st0 = lj_vm_powi(st0, [esp]) */
-      emit_rmro(as, XO_MOVto, ra_alloc1(as, ir->op2, RSET_GPR), RID_ESP, 0);
-      break;
     default: lua_assert(0); break;
     }
     break;
@@ -2085,6 +2090,19 @@ static void asm_fpmath(ASMState *as, IRIns *ir)
   }
 }
 
+static void asm_powi(ASMState *as, IRIns *ir)
+{
+  /* The modified regs must match with the *.dasc implementation. */
+  RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX);
+  if (ra_hasreg(ir->r))
+    rset_clear(drop, ir->r);  /* Dest reg handled below. */
+  ra_evictset(as, drop);
+  ra_destreg(as, ir, RID_XMM0);
+  emit_call(as, lj_vm_powi_sse);
+  ra_left(as, RID_XMM0, ir->op1);
+  ra_left(as, RID_EAX, ir->op2);
+}
+
 /* Find out whether swapping operands might be beneficial. */
 static int swapops(ASMState *as, IRIns *ir)
 {
@@ -3132,9 +3150,10 @@ static void asm_ir(ASMState *as, IRIns *ir)
   case IR_MIN: asm_fparith(as, ir, XO_MINSD); break;
   case IR_MAX: asm_fparith(as, ir, XO_MAXSD); break;
 
-  case IR_FPMATH: case IR_ATAN2: case IR_LDEXP: case IR_POWI:
+  case IR_FPMATH: case IR_ATAN2: case IR_LDEXP:
     asm_fpmath(as, ir);
     break;
+  case IR_POWI: asm_powi(as, ir); break;
 
   /* Overflow-checking arithmetic ops. Note: don't use LEA here! */
   case IR_ADDOV: asm_intarith(as, ir, XOg_ADD); break;
@@ -3285,8 +3304,22 @@ static void asm_setup_regsp(ASMState *as, Trace *T)
       if (inloop)
	as->modset = RSET_SCRATCH;
       break;
+    case IR_POWI:
+      ir->prev = REGSP_HINT(RID_XMM0);
+      if (inloop)
+	as->modset |= RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX);
+      continue;
     case IR_FPMATH:
-      if (ir->op2 <= IRFPM_TRUNC && !(as->flags & JIT_F_SSE4_1)) {
+      if (ir->op2 == IRFPM_EXP2) {  /* May be joined to lj_vm_pow_sse. */
+	ir->prev = REGSP_HINT(RID_XMM0);
+#if !LJ_64
+	if (as->evenspill < 4)  /* Leave room for 16 byte scratch area. */
+	  as->evenspill = 4;
+#endif
+	if (inloop)
+	  as->modset |= RSET_RANGE(RID_XMM0, RID_XMM2+1)|RID2RSET(RID_EAX);
+	continue;
+      } else if (ir->op2 <= IRFPM_TRUNC && !(as->flags & JIT_F_SSE4_1)) {
	ir->prev = REGSP_HINT(RID_XMM0);
	if (inloop)
	  as->modset |= RSET_RANGE(RID_XMM0, RID_XMM3+1)|RID2RSET(RID_EAX);
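
For reading the hint setup: a RegSet is a bitmask over register IDs, RID2RSET(r) is the single-register set, and RSET_RANGE(lo, hi) is half-open, so RSET_RANGE(RID_XMM0, RID_XMM2+1) covers xmm0-xmm2. A sketch of the assumed helper shapes (the real definitions live in LuaJIT's lj_target.h and may differ in detail):

#include <stdint.h>

/* Sketch: bitmask helpers matching the usage above. RSET_RANGE is [lo, hi). */
typedef uint32_t RegSet;
#define RID2RSET(r)        ((RegSet)1 << (r))
#define RSET_RANGE(lo, hi) ((RID2RSET(hi) - 1) & ~(RID2RSET(lo) - 1))
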