diff options
Diffstat (limited to 'src/lj_asm.c')
-rw-r--r-- | src/lj_asm.c | 111 |
1 files changed, 72 insertions, 39 deletions
diff --git a/src/lj_asm.c b/src/lj_asm.c
index c2cc4342..eb14b0e5 100644
--- a/src/lj_asm.c
+++ b/src/lj_asm.c
@@ -1991,9 +1991,19 @@ static int fpmjoin_pow(ASMState *as, IRIns *ir) | |||
1991 | IRIns *irpp = IR(irp->op1); | 1991 | IRIns *irpp = IR(irp->op1); |
1992 | if (irpp == ir-2 && irpp->o == IR_FPMATH && | 1992 | if (irpp == ir-2 && irpp->o == IR_FPMATH && |
1993 | irpp->op2 == IRFPM_LOG2 && !ra_used(irpp)) { | 1993 | irpp->op2 == IRFPM_LOG2 && !ra_used(irpp)) { |
1994 | emit_call(as, lj_vm_pow); /* st0 = lj_vm_pow(st1, st0) */ | 1994 | /* The modified regs must match with the *.dasc implementation. */ |
1995 | asm_x87load(as, irp->op2); | 1995 | RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM2+1)|RID2RSET(RID_EAX); |
1996 | asm_x87load(as, irpp->op1); | 1996 | IRIns *irx; |
1997 | if (ra_hasreg(ir->r)) | ||
1998 | rset_clear(drop, ir->r); /* Dest reg handled below. */ | ||
1999 | ra_evictset(as, drop); | ||
2000 | ra_destreg(as, ir, RID_XMM0); | ||
2001 | emit_call(as, lj_vm_pow_sse); | ||
2002 | irx = IR(irpp->op1); | ||
2003 | if (ra_noreg(irx->r) && ra_gethint(irx->r) == RID_XMM1) | ||
2004 | irx->r = RID_INIT; /* Avoid allocating xmm1 for x. */ | ||
2005 | ra_left(as, RID_XMM0, irpp->op1); | ||
2006 | ra_left(as, RID_XMM1, irp->op2); | ||
1997 | return 1; | 2007 | return 1; |
1998 | } | 2008 | } |
1999 | } | 2009 | } |
@@ -2007,30 +2017,35 @@ static void asm_fpmath(ASMState *as, IRIns *ir) | |||
2007 | Reg dest = ra_dest(as, ir, RSET_FPR); | 2017 | Reg dest = ra_dest(as, ir, RSET_FPR); |
2008 | Reg left = asm_fuseload(as, ir->op1, RSET_FPR); | 2018 | Reg left = asm_fuseload(as, ir->op1, RSET_FPR); |
2009 | emit_mrm(as, XO_SQRTSD, dest, left); | 2019 | emit_mrm(as, XO_SQRTSD, dest, left); |
2010 | } else if ((as->flags & JIT_F_SSE4_1) && fpm <= IRFPM_TRUNC) { | ||
2011 | Reg dest = ra_dest(as, ir, RSET_FPR); | ||
2012 | Reg left = asm_fuseload(as, ir->op1, RSET_FPR); | ||
2013 | /* Round down/up/trunc == 1001/1010/1011. */ | ||
2014 | emit_i8(as, 0x09 + fpm); | ||
2015 | /* ROUNDSD has a 4-byte opcode which doesn't fit in x86Op. */ | ||
2016 | emit_mrm(as, XO_ROUNDSD, dest, left); | ||
2017 | /* Let's pretend it's a 3-byte opcode, and compensate afterwards. */ | ||
2018 | /* This is atrocious, but the alternatives are much worse. */ | ||
2019 | if (LJ_64 && as->mcp[1] != (MCode)(XO_ROUNDSD >> 16)) { | ||
2020 | as->mcp[0] = as->mcp[1]; as->mcp[1] = 0x0f; /* Swap 0F and REX. */ | ||
2021 | } | ||
2022 | *--as->mcp = 0x66; /* 1st byte of ROUNDSD opcode. */ | ||
2023 | } else if (fpm <= IRFPM_TRUNC) { | 2020 | } else if (fpm <= IRFPM_TRUNC) { |
2024 | /* The modified regs must match with the *.dasc implementation. */ | 2021 | if (as->flags & JIT_F_SSE4_1) { /* SSE4.1 has a rounding instruction. */ |
2025 | RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM3+1)|RID2RSET(RID_EAX); | 2022 | Reg dest = ra_dest(as, ir, RSET_FPR); |
2026 | if (ra_hasreg(ir->r)) | 2023 | Reg left = asm_fuseload(as, ir->op1, RSET_FPR); |
2027 | rset_clear(drop, ir->r); /* Dest reg handled below. */ | 2024 | /* ROUNDSD has a 4-byte opcode which doesn't fit in x86Op. |
2028 | ra_evictset(as, drop); | 2025 | ** Let's pretend it's a 3-byte opcode, and compensate afterwards. |
2029 | ra_destreg(as, ir, RID_XMM0); | 2026 | ** This is atrocious, but the alternatives are much worse. |
2030 | emit_call(as, fpm == IRFPM_FLOOR ? lj_vm_floor_sse : | 2027 | */ |
2031 | fpm == IRFPM_CEIL ? lj_vm_ceil_sse : lj_vm_trunc_sse); | 2028 | /* Round down/up/trunc == 1001/1010/1011. */ |
2032 | ra_left(as, RID_XMM0, ir->op1); | 2029 | emit_i8(as, 0x09 + fpm); |
2033 | } else { | 2030 | emit_mrm(as, XO_ROUNDSD, dest, left); |
2031 | if (LJ_64 && as->mcp[1] != (MCode)(XO_ROUNDSD >> 16)) { | ||
2032 | as->mcp[0] = as->mcp[1]; as->mcp[1] = 0x0f; /* Swap 0F and REX. */ | ||
2033 | } | ||
2034 | *--as->mcp = 0x66; /* 1st byte of ROUNDSD opcode. */ | ||
2035 | } else { /* Call helper functions for SSE2 variant. */ | ||
2036 | /* The modified regs must match with the *.dasc implementation. */ | ||
2037 | RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM3+1)|RID2RSET(RID_EAX); | ||
2038 | if (ra_hasreg(ir->r)) | ||
2039 | rset_clear(drop, ir->r); /* Dest reg handled below. */ | ||
2040 | ra_evictset(as, drop); | ||
2041 | ra_destreg(as, ir, RID_XMM0); | ||
2042 | emit_call(as, fpm == IRFPM_FLOOR ? lj_vm_floor_sse : | ||
2043 | fpm == IRFPM_CEIL ? lj_vm_ceil_sse : lj_vm_trunc_sse); | ||
2044 | ra_left(as, RID_XMM0, ir->op1); | ||
2045 | } | ||
2046 | } else if (fpm == IRFPM_EXP2 && fpmjoin_pow(as, ir)) { | ||
2047 | /* Rejoined to pow(). */ | ||
2048 | } else { /* Handle x87 ops. */ | ||
2034 | int32_t ofs = sps_scale(ir->s); /* Use spill slot or slots SPS_TEMP1/2. */ | 2049 | int32_t ofs = sps_scale(ir->s); /* Use spill slot or slots SPS_TEMP1/2. */ |
2035 | Reg dest = ir->r; | 2050 | Reg dest = ir->r; |
2036 | if (ra_hasreg(dest)) { | 2051 | if (ra_hasreg(dest)) { |
@@ -2040,14 +2055,8 @@ static void asm_fpmath(ASMState *as, IRIns *ir) | |||
2040 | } | 2055 | } |
2041 | emit_rmro(as, XO_FSTPq, XOg_FSTPq, RID_ESP, ofs); | 2056 | emit_rmro(as, XO_FSTPq, XOg_FSTPq, RID_ESP, ofs); |
2042 | switch (fpm) { /* st0 = lj_vm_*(st0) */ | 2057 | switch (fpm) { /* st0 = lj_vm_*(st0) */ |
2043 | case IRFPM_FLOOR: emit_call(as, lj_vm_floor); break; | ||
2044 | case IRFPM_CEIL: emit_call(as, lj_vm_ceil); break; | ||
2045 | case IRFPM_TRUNC: emit_call(as, lj_vm_trunc); break; | ||
2046 | case IRFPM_EXP: emit_call(as, lj_vm_exp); break; | 2058 | case IRFPM_EXP: emit_call(as, lj_vm_exp); break; |
2047 | case IRFPM_EXP2: | 2059 | case IRFPM_EXP2: emit_call(as, lj_vm_exp2); break; |
2048 | if (fpmjoin_pow(as, ir)) return; | ||
2049 | emit_call(as, lj_vm_exp2); /* st0 = lj_vm_exp2(st0) */ | ||
2050 | break; | ||
2051 | case IRFPM_SIN: emit_x87op(as, XI_FSIN); break; | 2060 | case IRFPM_SIN: emit_x87op(as, XI_FSIN); break; |
2052 | case IRFPM_COS: emit_x87op(as, XI_FCOS); break; | 2061 | case IRFPM_COS: emit_x87op(as, XI_FCOS); break; |
2053 | case IRFPM_TAN: emit_x87op(as, XI_FPOP); emit_x87op(as, XI_FPTAN); break; | 2062 | case IRFPM_TAN: emit_x87op(as, XI_FPOP); emit_x87op(as, XI_FPTAN); break; |
@@ -2063,10 +2072,6 @@ static void asm_fpmath(ASMState *as, IRIns *ir) | |||
2063 | emit_x87op(as, XI_FPATAN); asm_x87load(as, ir->op2); break; | 2072 | emit_x87op(as, XI_FPATAN); asm_x87load(as, ir->op2); break; |
2064 | case IR_LDEXP: | 2073 | case IR_LDEXP: |
2065 | emit_x87op(as, XI_FPOP1); emit_x87op(as, XI_FSCALE); break; | 2074 | emit_x87op(as, XI_FPOP1); emit_x87op(as, XI_FSCALE); break; |
2066 | case IR_POWI: | ||
2067 | emit_call(as, lj_vm_powi); /* st0 = lj_vm_powi(st0, [esp]) */ | ||
2068 | emit_rmro(as, XO_MOVto, ra_alloc1(as, ir->op2, RSET_GPR), RID_ESP, 0); | ||
2069 | break; | ||
2070 | default: lua_assert(0); break; | 2075 | default: lua_assert(0); break; |
2071 | } | 2076 | } |
2072 | break; | 2077 | break; |
@@ -2085,6 +2090,19 @@ static void asm_fpmath(ASMState *as, IRIns *ir) | |||
2085 | } | 2090 | } |
2086 | } | 2091 | } |
2087 | 2092 | ||
2093 | static void asm_powi(ASMState *as, IRIns *ir) | ||
2094 | { | ||
2095 | /* The modified regs must match with the *.dasc implementation. */ | ||
2096 | RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX); | ||
2097 | if (ra_hasreg(ir->r)) | ||
2098 | rset_clear(drop, ir->r); /* Dest reg handled below. */ | ||
2099 | ra_evictset(as, drop); | ||
2100 | ra_destreg(as, ir, RID_XMM0); | ||
2101 | emit_call(as, lj_vm_powi_sse); | ||
2102 | ra_left(as, RID_XMM0, ir->op1); | ||
2103 | ra_left(as, RID_EAX, ir->op2); | ||
2104 | } | ||
2105 | |||
2088 | /* Find out whether swapping operands might be beneficial. */ | 2106 | /* Find out whether swapping operands might be beneficial. */ |
2089 | static int swapops(ASMState *as, IRIns *ir) | 2107 | static int swapops(ASMState *as, IRIns *ir) |
2090 | { | 2108 | { |
@@ -3132,9 +3150,10 @@ static void asm_ir(ASMState *as, IRIns *ir) | |||
3132 | case IR_MIN: asm_fparith(as, ir, XO_MINSD); break; | 3150 | case IR_MIN: asm_fparith(as, ir, XO_MINSD); break; |
3133 | case IR_MAX: asm_fparith(as, ir, XO_MAXSD); break; | 3151 | case IR_MAX: asm_fparith(as, ir, XO_MAXSD); break; |
3134 | 3152 | ||
3135 | case IR_FPMATH: case IR_ATAN2: case IR_LDEXP: case IR_POWI: | 3153 | case IR_FPMATH: case IR_ATAN2: case IR_LDEXP: |
3136 | asm_fpmath(as, ir); | 3154 | asm_fpmath(as, ir); |
3137 | break; | 3155 | break; |
3156 | case IR_POWI: asm_powi(as, ir); break; | ||
3138 | 3157 | ||
3139 | /* Overflow-checking arithmetic ops. Note: don't use LEA here! */ | 3158 | /* Overflow-checking arithmetic ops. Note: don't use LEA here! */ |
3140 | case IR_ADDOV: asm_intarith(as, ir, XOg_ADD); break; | 3159 | case IR_ADDOV: asm_intarith(as, ir, XOg_ADD); break; |
@@ -3285,8 +3304,22 @@ static void asm_setup_regsp(ASMState *as, Trace *T) | |||
3285 | if (inloop) | 3304 | if (inloop) |
3286 | as->modset = RSET_SCRATCH; | 3305 | as->modset = RSET_SCRATCH; |
3287 | break; | 3306 | break; |
3307 | case IR_POWI: | ||
3308 | ir->prev = REGSP_HINT(RID_XMM0); | ||
3309 | if (inloop) | ||
3310 | as->modset |= RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX); | ||
3311 | continue; | ||
3288 | case IR_FPMATH: | 3312 | case IR_FPMATH: |
3289 | if (ir->op2 <= IRFPM_TRUNC && !(as->flags & JIT_F_SSE4_1)) { | 3313 | if (ir->op2 == IRFPM_EXP2) { /* May be joined to lj_vm_pow_sse. */ |
3314 | ir->prev = REGSP_HINT(RID_XMM0); | ||
3315 | #if !LJ_64 | ||
3316 | if (as->evenspill < 4) /* Leave room for 16 byte scratch area. */ | ||
3317 | as->evenspill = 4; | ||
3318 | #endif | ||
3319 | if (inloop) | ||
3320 | as->modset |= RSET_RANGE(RID_XMM0, RID_XMM2+1)|RID2RSET(RID_EAX); | ||
3321 | continue; | ||
3322 | } else if (ir->op2 <= IRFPM_TRUNC && !(as->flags & JIT_F_SSE4_1)) { | ||
3290 | ir->prev = REGSP_HINT(RID_XMM0); | 3323 | ir->prev = REGSP_HINT(RID_XMM0); |
3291 | if (inloop) | 3324 | if (inloop) |
3292 | as->modset |= RSET_RANGE(RID_XMM0, RID_XMM3+1)|RID2RSET(RID_EAX); | 3325 | as->modset |= RSET_RANGE(RID_XMM0, RID_XMM3+1)|RID2RSET(RID_EAX); |