about summary refs log tree commit diff
diff options
context:
space:
mode:
author Mike Pall <mike> 2014-12-08 02:02:34 +0100
committer Mike Pall <mike> 2014-12-08 02:02:34 +0100
commit ad03eba715e5e0d0bd0f3c0ddef4b8f5bbb0c626 (patch)
tree 3404e1b148e08f2320a9937ca4849dc794b36bad
parent e03df1e3395bc719d43bd9196d0290757f992b2f (diff)
download luajit-ad03eba715e5e0d0bd0f3c0ddef4b8f5bbb0c626.tar.gz
luajit-ad03eba715e5e0d0bd0f3c0ddef4b8f5bbb0c626.tar.bz2
luajit-ad03eba715e5e0d0bd0f3c0ddef4b8f5bbb0c626.zip
x86/x64: Drop internal x87 math functions. Use libm functions.
-rw-r--r-- src/lj_arch.h 6
-rw-r--r-- src/lj_asm.c 4
-rw-r--r-- src/lj_asm_x86.h 82
-rw-r--r-- src/lj_ircall.h 24
-rw-r--r-- src/lj_vm.h 12
-rw-r--r-- src/lj_vmmath.c 16
-rw-r--r-- src/vm_x86.dasc 425
7 files changed, 114 insertions, 455 deletions
diff --git a/src/lj_arch.h b/src/lj_arch.h
index da16a193..36b38886 100644
--- a/src/lj_arch.h
+++ b/src/lj_arch.h
@@ -426,11 +426,11 @@
426#define LJ_TARGET_UNALIGNED 0 426#define LJ_TARGET_UNALIGNED 0
427#endif 427#endif
428 428
429/* Various workarounds for embedded operating systems. */ 429/* Various workarounds for embedded operating systems or weak C runtimes. */
430#if (defined(__ANDROID__) && !defined(LJ_TARGET_X86ORX64)) || defined(__symbian__) || LJ_TARGET_XBOX360 430#if (defined(__ANDROID__) && !defined(LJ_TARGET_X86ORX64)) || defined(__symbian__) || LJ_TARGET_XBOX360 || LJ_TARGET_WINDOWS
431#define LUAJIT_NO_LOG2 431#define LUAJIT_NO_LOG2
432#endif 432#endif
433#if defined(__symbian__) 433#if defined(__symbian__) || LJ_TARGET_WINDOWS
434#define LUAJIT_NO_EXP2 434#define LUAJIT_NO_EXP2
435#endif 435#endif
436 436
diff --git a/src/lj_asm.c b/src/lj_asm.c
index 0b6738da..aaab3255 100644
--- a/src/lj_asm.c
+++ b/src/lj_asm.c
@@ -1262,9 +1262,6 @@ static void asm_call(ASMState *as, IRIns *ir)
1262} 1262}
1263 1263
1264#if !LJ_SOFTFP 1264#if !LJ_SOFTFP
1265static void asm_fppow(ASMState *as, IRIns *ir, IRRef lref, IRRef rref);
1266
1267#if !LJ_TARGET_X86ORX64
1268static void asm_fppow(ASMState *as, IRIns *ir, IRRef lref, IRRef rref) 1265static void asm_fppow(ASMState *as, IRIns *ir, IRRef lref, IRRef rref)
1269{ 1266{
1270 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_pow]; 1267 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_pow];
@@ -1274,7 +1271,6 @@ static void asm_fppow(ASMState *as, IRIns *ir, IRRef lref, IRRef rref)
1274 asm_setupresult(as, ir, ci); 1271 asm_setupresult(as, ir, ci);
1275 asm_gencall(as, ci, args); 1272 asm_gencall(as, ci, args);
1276} 1273}
1277#endif
1278 1274
1279static int asm_fpjoin_pow(ASMState *as, IRIns *ir) 1275static int asm_fpjoin_pow(ASMState *as, IRIns *ir)
1280{ 1276{
diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h
index 8b541250..bd97764f 100644
--- a/src/lj_asm_x86.h
+++ b/src/lj_asm_x86.h
@@ -1593,26 +1593,9 @@ static void asm_x87load(ASMState *as, IRRef ref)
1593 } 1593 }
1594} 1594}
1595 1595
1596static void asm_fppow(ASMState *as, IRIns *ir, IRRef lref, IRRef rref)
1597{
1598 /* The modified regs must match with the *.dasc implementation. */
1599 RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM2+1)|RID2RSET(RID_EAX);
1600 IRIns *irx;
1601 if (ra_hasreg(ir->r))
1602 rset_clear(drop, ir->r); /* Dest reg handled below. */
1603 ra_evictset(as, drop);
1604 ra_destreg(as, ir, RID_XMM0);
1605 emit_call(as, lj_vm_pow_sse);
1606 irx = IR(lref);
1607 if (ra_noreg(irx->r) && ra_gethint(irx->r) == RID_XMM1)
1608 irx->r = RID_INIT; /* Avoid allocating xmm1 for x. */
1609 ra_left(as, RID_XMM0, lref);
1610 ra_left(as, RID_XMM1, rref);
1611}
1612
1613static void asm_fpmath(ASMState *as, IRIns *ir) 1596static void asm_fpmath(ASMState *as, IRIns *ir)
1614{ 1597{
1615 IRFPMathOp fpm = ir->o == IR_FPMATH ? (IRFPMathOp)ir->op2 : IRFPM_OTHER; 1598 IRFPMathOp fpm = (IRFPMathOp)ir->op2;
1616 if (fpm == IRFPM_SQRT) { 1599 if (fpm == IRFPM_SQRT) {
1617 Reg dest = ra_dest(as, ir, RSET_FPR); 1600 Reg dest = ra_dest(as, ir, RSET_FPR);
1618 Reg left = asm_fuseload(as, ir->op1, RSET_FPR); 1601 Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
@@ -1645,53 +1628,28 @@ static void asm_fpmath(ASMState *as, IRIns *ir)
1645 } 1628 }
1646 } else if (fpm == IRFPM_EXP2 && asm_fpjoin_pow(as, ir)) { 1629 } else if (fpm == IRFPM_EXP2 && asm_fpjoin_pow(as, ir)) {
1647 /* Rejoined to pow(). */ 1630 /* Rejoined to pow(). */
1648 } else { /* Handle x87 ops. */ 1631 } else {
1649 int32_t ofs = sps_scale(ir->s); /* Use spill slot or temp slots. */ 1632 asm_callid(as, ir, IRCALL_lj_vm_floor + fpm);
1650 Reg dest = ir->r;
1651 if (ra_hasreg(dest)) {
1652 ra_free(as, dest);
1653 ra_modified(as, dest);
1654 emit_rmro(as, XO_MOVSD, dest, RID_ESP, ofs);
1655 }
1656 emit_rmro(as, XO_FSTPq, XOg_FSTPq, RID_ESP, ofs);
1657 switch (fpm) { /* st0 = lj_vm_*(st0) */
1658 case IRFPM_EXP: emit_call(as, lj_vm_exp_x87); break;
1659 case IRFPM_EXP2: emit_call(as, lj_vm_exp2_x87); break;
1660 case IRFPM_SIN: emit_x87op(as, XI_FSIN); break;
1661 case IRFPM_COS: emit_x87op(as, XI_FCOS); break;
1662 case IRFPM_TAN: emit_x87op(as, XI_FPOP); emit_x87op(as, XI_FPTAN); break;
1663 case IRFPM_LOG: case IRFPM_LOG2: case IRFPM_LOG10:
1664 /* Note: the use of fyl2xp1 would be pointless here. When computing
1665 ** log(1.0+eps) the precision is already lost after 1.0 is added.
1666 ** Subtracting 1.0 won't recover it. OTOH math.log1p would make sense.
1667 */
1668 emit_x87op(as, XI_FYL2X); break;
1669 case IRFPM_OTHER:
1670 switch (ir->o) {
1671 case IR_ATAN2:
1672 emit_x87op(as, XI_FPATAN); asm_x87load(as, ir->op2); break;
1673 case IR_LDEXP:
1674 emit_x87op(as, XI_FPOP1); emit_x87op(as, XI_FSCALE); break;
1675 default: lua_assert(0); break;
1676 }
1677 break;
1678 default: lua_assert(0); break;
1679 }
1680 asm_x87load(as, ir->op1);
1681 switch (fpm) {
1682 case IRFPM_LOG: emit_x87op(as, XI_FLDLN2); break;
1683 case IRFPM_LOG2: emit_x87op(as, XI_FLD1); break;
1684 case IRFPM_LOG10: emit_x87op(as, XI_FLDLG2); break;
1685 case IRFPM_OTHER:
1686 if (ir->o == IR_LDEXP) asm_x87load(as, ir->op2);
1687 break;
1688 default: break;
1689 }
1690 } 1633 }
1691} 1634}
1692 1635
1693#define asm_atan2(as, ir) asm_fpmath(as, ir) 1636#define asm_atan2(as, ir) asm_callid(as, ir, IRCALL_atan2)
1694#define asm_ldexp(as, ir) asm_fpmath(as, ir) 1637
1638static void asm_ldexp(ASMState *as, IRIns *ir)
1639{
1640 int32_t ofs = sps_scale(ir->s); /* Use spill slot or temp slots. */
1641 Reg dest = ir->r;
1642 if (ra_hasreg(dest)) {
1643 ra_free(as, dest);
1644 ra_modified(as, dest);
1645 emit_rmro(as, XO_MOVSD, dest, RID_ESP, ofs);
1646 }
1647 emit_rmro(as, XO_FSTPq, XOg_FSTPq, RID_ESP, ofs);
1648 emit_x87op(as, XI_FPOP1);
1649 emit_x87op(as, XI_FSCALE);
1650 asm_x87load(as, ir->op1);
1651 asm_x87load(as, ir->op2);
1652}
1695 1653
1696static void asm_fppowi(ASMState *as, IRIns *ir) 1654static void asm_fppowi(ASMState *as, IRIns *ir)
1697{ 1655{
diff --git a/src/lj_ircall.h b/src/lj_ircall.h
index 9bf46918..e71f0432 100644
--- a/src/lj_ircall.h
+++ b/src/lj_ircall.h
@@ -169,18 +169,18 @@ typedef struct CCallInfo {
169 _(FPMATH, lj_vm_ceil, 1, N, NUM, XA_FP) \ 169 _(FPMATH, lj_vm_ceil, 1, N, NUM, XA_FP) \
170 _(FPMATH, lj_vm_trunc, 1, N, NUM, XA_FP) \ 170 _(FPMATH, lj_vm_trunc, 1, N, NUM, XA_FP) \
171 _(FPMATH, sqrt, 1, N, NUM, XA_FP) \ 171 _(FPMATH, sqrt, 1, N, NUM, XA_FP) \
172 _(FPMATH, exp, 1, N, NUM, XA_FP) \ 172 _(ANY, exp, 1, N, NUM, XA_FP) \
173 _(FPMATH, lj_vm_exp2, 1, N, NUM, XA_FP) \ 173 _(ANY, lj_vm_exp2, 1, N, NUM, XA_FP) \
174 _(FPMATH, log, 1, N, NUM, XA_FP) \ 174 _(ANY, log, 1, N, NUM, XA_FP) \
175 _(FPMATH, lj_vm_log2, 1, N, NUM, XA_FP) \ 175 _(ANY, lj_vm_log2, 1, N, NUM, XA_FP) \
176 _(FPMATH, log10, 1, N, NUM, XA_FP) \ 176 _(ANY, log10, 1, N, NUM, XA_FP) \
177 _(FPMATH, sin, 1, N, NUM, XA_FP) \ 177 _(ANY, sin, 1, N, NUM, XA_FP) \
178 _(FPMATH, cos, 1, N, NUM, XA_FP) \ 178 _(ANY, cos, 1, N, NUM, XA_FP) \
179 _(FPMATH, tan, 1, N, NUM, XA_FP) \ 179 _(ANY, tan, 1, N, NUM, XA_FP) \
180 _(FPMATH, lj_vm_powi, 2, N, NUM, XA_FP) \ 180 _(ANY, lj_vm_powi, 2, N, NUM, XA_FP) \
181 _(FPMATH, pow, 2, N, NUM, XA2_FP) \ 181 _(ANY, pow, 2, N, NUM, XA2_FP) \
182 _(FPMATH, atan2, 2, N, NUM, XA2_FP) \ 182 _(ANY, atan2, 2, N, NUM, XA2_FP) \
183 _(FPMATH, ldexp, 2, N, NUM, XA_FP) \ 183 _(ANY, ldexp, 2, N, NUM, XA_FP) \
184 _(SOFTFP, lj_vm_tobit, 2, N, INT, 0) \ 184 _(SOFTFP, lj_vm_tobit, 2, N, INT, 0) \
185 _(SOFTFP, softfp_add, 4, N, NUM, 0) \ 185 _(SOFTFP, softfp_add, 4, N, NUM, 0) \
186 _(SOFTFP, softfp_sub, 4, N, NUM, 0) \ 186 _(SOFTFP, softfp_sub, 4, N, NUM, 0) \
diff --git a/src/lj_vm.h b/src/lj_vm.h
index 83883e2c..a69d699f 100644
--- a/src/lj_vm.h
+++ b/src/lj_vm.h
@@ -55,15 +55,13 @@ LJ_ASMF void lj_vm_exit_interp(void);
55#define lj_vm_ceil ceil 55#define lj_vm_ceil ceil
56#else 56#else
57LJ_ASMF double lj_vm_floor(double); 57LJ_ASMF double lj_vm_floor(double);
58#if !LJ_TARGET_X86ORX64
59LJ_ASMF double lj_vm_ceil(double); 58LJ_ASMF double lj_vm_ceil(double);
60#endif
61#if LJ_TARGET_ARM 59#if LJ_TARGET_ARM
62LJ_ASMF double lj_vm_floor_sf(double); 60LJ_ASMF double lj_vm_floor_sf(double);
63LJ_ASMF double lj_vm_ceil_sf(double); 61LJ_ASMF double lj_vm_ceil_sf(double);
64#endif 62#endif
65#endif 63#endif
66#if defined(LUAJIT_NO_LOG2) || LJ_TARGET_X86ORX64 64#ifdef LUAJIT_NO_LOG2
67LJ_ASMF double lj_vm_log2(double); 65LJ_ASMF double lj_vm_log2(double);
68#else 66#else
69#define lj_vm_log2 log2 67#define lj_vm_log2 log2
@@ -74,11 +72,11 @@ LJ_ASMF double lj_vm_log2(double);
74LJ_ASMF void lj_vm_floor_sse(void); 72LJ_ASMF void lj_vm_floor_sse(void);
75LJ_ASMF void lj_vm_ceil_sse(void); 73LJ_ASMF void lj_vm_ceil_sse(void);
76LJ_ASMF void lj_vm_trunc_sse(void); 74LJ_ASMF void lj_vm_trunc_sse(void);
77LJ_ASMF void lj_vm_exp_x87(void);
78LJ_ASMF void lj_vm_exp2_x87(void);
79LJ_ASMF void lj_vm_pow_sse(void);
80LJ_ASMF void lj_vm_powi_sse(void); 75LJ_ASMF void lj_vm_powi_sse(void);
76#define lj_vm_powi NULL
81#else 77#else
78LJ_ASMF double lj_vm_powi(double, int32_t);
79#endif
82#if LJ_TARGET_PPC 80#if LJ_TARGET_PPC
83#define lj_vm_trunc trunc 81#define lj_vm_trunc trunc
84#else 82#else
@@ -87,13 +85,11 @@ LJ_ASMF double lj_vm_trunc(double);
87LJ_ASMF double lj_vm_trunc_sf(double); 85LJ_ASMF double lj_vm_trunc_sf(double);
88#endif 86#endif
89#endif 87#endif
90LJ_ASMF double lj_vm_powi(double, int32_t);
91#ifdef LUAJIT_NO_EXP2 88#ifdef LUAJIT_NO_EXP2
92LJ_ASMF double lj_vm_exp2(double); 89LJ_ASMF double lj_vm_exp2(double);
93#else 90#else
94#define lj_vm_exp2 exp2 91#define lj_vm_exp2 exp2
95#endif 92#endif
96#endif
97LJ_ASMF int32_t LJ_FASTCALL lj_vm_modi(int32_t, int32_t); 93LJ_ASMF int32_t LJ_FASTCALL lj_vm_modi(int32_t, int32_t);
98#if LJ_HASFFI 94#if LJ_HASFFI
99LJ_ASMF int lj_vm_errno(void); 95LJ_ASMF int lj_vm_errno(void);
diff --git a/src/lj_vmmath.c b/src/lj_vmmath.c
index b60858b2..6ea99d15 100644
--- a/src/lj_vmmath.c
+++ b/src/lj_vmmath.c
@@ -17,14 +17,25 @@
17 17
18#if LJ_TARGET_X86 && __ELF__ && __PIC__ 18#if LJ_TARGET_X86 && __ELF__ && __PIC__
19/* Wrapper functions to deal with the ELF/x86 PIC disaster. */ 19/* Wrapper functions to deal with the ELF/x86 PIC disaster. */
20LJ_FUNCA double lj_wrap_log(double x) { return log(x); }
21LJ_FUNCA double lj_wrap_log10(double x) { return log10(x); }
22LJ_FUNCA double lj_wrap_exp(double x) { return exp(x); }
23LJ_FUNCA double lj_wrap_sin(double x) { return sin(x); }
24LJ_FUNCA double lj_wrap_cos(double x) { return cos(x); }
25LJ_FUNCA double lj_wrap_tan(double x) { return tan(x); }
26LJ_FUNCA double lj_wrap_asin(double x) { return asin(x); }
27LJ_FUNCA double lj_wrap_acos(double x) { return acos(x); }
28LJ_FUNCA double lj_wrap_atan(double x) { return atan(x); }
20LJ_FUNCA double lj_wrap_sinh(double x) { return sinh(x); } 29LJ_FUNCA double lj_wrap_sinh(double x) { return sinh(x); }
21LJ_FUNCA double lj_wrap_cosh(double x) { return cosh(x); } 30LJ_FUNCA double lj_wrap_cosh(double x) { return cosh(x); }
22LJ_FUNCA double lj_wrap_tanh(double x) { return tanh(x); } 31LJ_FUNCA double lj_wrap_tanh(double x) { return tanh(x); }
32LJ_FUNCA double lj_wrap_atan2(double x, double y) { return atan2(x, y); }
33LJ_FUNCA double lj_wrap_pow(double x, double y) { return pow(x, y); }
34LJ_FUNCA double lj_wrap_fmod(double x, double y) { return fmod(x, y); }
23#endif 35#endif
24 36
25/* -- Helper functions for generated machine code ------------------------- */ 37/* -- Helper functions for generated machine code ------------------------- */
26 38
27#if !LJ_TARGET_X86ORX64
28double lj_vm_foldarith(double x, double y, int op) 39double lj_vm_foldarith(double x, double y, int op)
29{ 40{
30 switch (op) { 41 switch (op) {
@@ -45,7 +56,6 @@ double lj_vm_foldarith(double x, double y, int op)
45 default: return x; 56 default: return x;
46 } 57 }
47} 58}
48#endif
49 59
50#if LJ_HASJIT 60#if LJ_HASJIT
51 61
@@ -109,6 +119,7 @@ double lj_vm_powi(double x, int32_t k)
109 else 119 else
110 return 1.0 / lj_vm_powui(x, (uint32_t)-k); 120 return 1.0 / lj_vm_powui(x, (uint32_t)-k);
111} 121}
122#endif
112 123
113/* Computes fpm(x) for extended math functions. */ 124/* Computes fpm(x) for extended math functions. */
114double lj_vm_foldfpm(double x, int fpm) 125double lj_vm_foldfpm(double x, int fpm)
@@ -130,7 +141,6 @@ double lj_vm_foldfpm(double x, int fpm)
130 } 141 }
131 return 0; 142 return 0;
132} 143}
133#endif
134 144
135#if LJ_HASFFI 145#if LJ_HASFFI
136int lj_vm_errno(void) 146int lj_vm_errno(void)
diff --git a/src/vm_x86.dasc b/src/vm_x86.dasc
index cd43afbd..290054dc 100644
--- a/src/vm_x86.dasc
+++ b/src/vm_x86.dasc
@@ -373,7 +373,6 @@
373| fpop 373| fpop
374|.endmacro 374|.endmacro
375| 375|
376|.macro fdup; fld st0; .endmacro
377|.macro fpop1; fstp st1; .endmacro 376|.macro fpop1; fstp st1; .endmacro
378| 377|
379|// Synthesize SSE FP constants. 378|// Synthesize SSE FP constants.
@@ -1329,19 +1328,6 @@ static void build_subroutines(BuildCtx *ctx)
1329 | cmp NARGS:RD, 2+1; jb ->fff_fallback 1328 | cmp NARGS:RD, 2+1; jb ->fff_fallback
1330 |.endmacro 1329 |.endmacro
1331 | 1330 |
1332 |.macro .ffunc_n, name
1333 | .ffunc_1 name
1334 | cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
1335 | fld qword [BASE]
1336 |.endmacro
1337 |
1338 |.macro .ffunc_n, name, op
1339 | .ffunc_1 name
1340 | cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
1341 | op
1342 | fld qword [BASE]
1343 |.endmacro
1344 |
1345 |.macro .ffunc_nsse, name, op 1331 |.macro .ffunc_nsse, name, op
1346 | .ffunc_1 name 1332 | .ffunc_1 name
1347 | cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback 1333 | cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
@@ -1352,14 +1338,6 @@ static void build_subroutines(BuildCtx *ctx)
1352 | .ffunc_nsse name, movsd 1338 | .ffunc_nsse name, movsd
1353 |.endmacro 1339 |.endmacro
1354 | 1340 |
1355 |.macro .ffunc_nn, name
1356 | .ffunc_2 name
1357 | cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
1358 | cmp dword [BASE+12], LJ_TISNUM; jae ->fff_fallback
1359 | fld qword [BASE]
1360 | fld qword [BASE+8]
1361 |.endmacro
1362 |
1363 |.macro .ffunc_nnsse, name 1341 |.macro .ffunc_nnsse, name
1364 | .ffunc_2 name 1342 | .ffunc_2 name
1365 | cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback 1343 | cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
@@ -2029,6 +2007,12 @@ static void build_subroutines(BuildCtx *ctx)
2029 | mov RAa, -8 // Results start at BASE+RA = BASE-8. 2007 | mov RAa, -8 // Results start at BASE+RA = BASE-8.
2030 | jmp ->vm_return 2008 | jmp ->vm_return
2031 | 2009 |
2010 |.if X64
2011 |.define fff_resfp, fff_resxmm0
2012 |.else
2013 |.define fff_resfp, fff_resn
2014 |.endif
2015 |
2032 |.macro math_round, func 2016 |.macro math_round, func
2033 | .ffunc math_ .. func 2017 | .ffunc math_ .. func
2034 |.if DUALNUM 2018 |.if DUALNUM
@@ -2061,22 +2045,14 @@ static void build_subroutines(BuildCtx *ctx)
2061 |.ffunc math_log 2045 |.ffunc math_log
2062 | cmp NARGS:RD, 1+1; jne ->fff_fallback // Exactly one argument. 2046 | cmp NARGS:RD, 1+1; jne ->fff_fallback // Exactly one argument.
2063 | cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback 2047 | cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
2064 | fldln2; fld qword [BASE]; fyl2x; jmp ->fff_resn 2048 | movsd xmm0, qword [BASE]
2065 | 2049 |.if not X64
2066 |.ffunc_n math_log10, fldlg2; fyl2x; jmp ->fff_resn 2050 | movsd FPARG1, xmm0
2067 |.ffunc_n math_exp; call ->vm_exp_x87; jmp ->fff_resn 2051 |.endif
2068 | 2052 | mov RB, BASE
2069 |.ffunc_n math_sin; fsin; jmp ->fff_resn 2053 | call extern log
2070 |.ffunc_n math_cos; fcos; jmp ->fff_resn 2054 | mov BASE, RB
2071 |.ffunc_n math_tan; fptan; fpop; jmp ->fff_resn 2055 | jmp ->fff_resfp
2072 |
2073 |.ffunc_n math_asin
2074 | fdup; fmul st0; fld1; fsubrp st1; fsqrt; fpatan
2075 | jmp ->fff_resn
2076 |.ffunc_n math_acos
2077 | fdup; fmul st0; fld1; fsubrp st1; fsqrt; fxch; fpatan
2078 | jmp ->fff_resn
2079 |.ffunc_n math_atan; fld1; fpatan; jmp ->fff_resn
2080 | 2056 |
2081 |.macro math_extern, func 2057 |.macro math_extern, func
2082 | .ffunc_nsse math_ .. func 2058 | .ffunc_nsse math_ .. func
@@ -2086,18 +2062,36 @@ static void build_subroutines(BuildCtx *ctx)
2086 | mov RB, BASE 2062 | mov RB, BASE
2087 | call extern func 2063 | call extern func
2088 | mov BASE, RB 2064 | mov BASE, RB
2089 |.if X64 2065 | jmp ->fff_resfp
2090 | jmp ->fff_resxmm0 2066 |.endmacro
2091 |.else 2067 |
2092 | jmp ->fff_resn 2068 |.macro math_extern2, func
2069 | .ffunc_nnsse math_ .. func
2070 |.if not X64
2071 | movsd FPARG1, xmm0
2072 | movsd FPARG3, xmm1
2093 |.endif 2073 |.endif
2074 | mov RB, BASE
2075 | call extern func
2076 | mov BASE, RB
2077 | jmp ->fff_resfp
2094 |.endmacro 2078 |.endmacro
2095 | 2079 |
2080 | math_extern log10
2081 | math_extern exp
2082 | math_extern sin
2083 | math_extern cos
2084 | math_extern tan
2085 | math_extern asin
2086 | math_extern acos
2087 | math_extern atan
2096 | math_extern sinh 2088 | math_extern sinh
2097 | math_extern cosh 2089 | math_extern cosh
2098 | math_extern tanh 2090 | math_extern tanh
2091 | math_extern2 pow
2092 | math_extern2 atan2
2093 | math_extern2 fmod
2099 | 2094 |
2100 |.ffunc_nn math_atan2; fpatan; jmp ->fff_resn
2101 |.ffunc_nnr math_ldexp; fscale; fpop1; jmp ->fff_resn 2095 |.ffunc_nnr math_ldexp; fscale; fpop1; jmp ->fff_resn
2102 | 2096 |
2103 |.ffunc_1 math_frexp 2097 |.ffunc_1 math_frexp
@@ -2151,13 +2145,6 @@ static void build_subroutines(BuildCtx *ctx)
2151 |4: 2145 |4:
2152 | xorps xmm4, xmm4; jmp <1 // Return +-Inf and +-0. 2146 | xorps xmm4, xmm4; jmp <1 // Return +-Inf and +-0.
2153 | 2147 |
2154 |.ffunc_nnr math_fmod
2155 |1: ; fprem; fnstsw ax; and ax, 0x400; jnz <1
2156 | fpop1
2157 | jmp ->fff_resn
2158 |
2159 |.ffunc_nnsse math_pow; call ->vm_pow_sse; jmp ->fff_resxmm0
2160 |
2161 |.macro math_minmax, name, cmovop, sseop 2148 |.macro math_minmax, name, cmovop, sseop
2162 | .ffunc name 2149 | .ffunc name
2163 | mov RA, 2 2150 | mov RA, 2
@@ -2899,7 +2886,16 @@ static void build_subroutines(BuildCtx *ctx)
2899 | 2886 |
2900 |// FP value rounding. Called by math.floor/math.ceil fast functions 2887 |// FP value rounding. Called by math.floor/math.ceil fast functions
2901 |// and from JIT code. arg/ret is xmm0. xmm0-xmm3 and RD (eax) modified. 2888 |// and from JIT code. arg/ret is xmm0. xmm0-xmm3 and RD (eax) modified.
2902 |.macro vm_round, name, mode 2889 |.macro vm_round, name, mode, cond
2890 |->name:
2891 |.if not X64 and cond
2892 | movsd xmm0, qword [esp+4]
2893 | call ->name .. _sse
2894 | movsd qword [esp+4], xmm0 // Overwrite callee-owned arg.
2895 | fld qword [esp+4]
2896 | ret
2897 |.endif
2898 |
2903 |->name .. _sse: 2899 |->name .. _sse:
2904 | sseconst_abs xmm2, RDa 2900 | sseconst_abs xmm2, RDa
2905 | sseconst_2p52 xmm3, RDa 2901 | sseconst_2p52 xmm3, RDa
@@ -2936,18 +2932,9 @@ static void build_subroutines(BuildCtx *ctx)
2936 | ret 2932 | ret
2937 |.endmacro 2933 |.endmacro
2938 | 2934 |
2939 |->vm_floor: 2935 | vm_round vm_floor, 0, 1
2940 |.if not X64 2936 | vm_round vm_ceil, 1, JIT
2941 | movsd xmm0, qword [esp+4] 2937 | vm_round vm_trunc, 2, JIT
2942 | call ->vm_floor_sse
2943 | movsd qword [esp+4], xmm0 // Overwrite callee-owned arg.
2944 | fld qword [esp+4]
2945 | ret
2946 |.endif
2947 |
2948 | vm_round vm_floor, 0
2949 | vm_round vm_ceil, 1
2950 | vm_round vm_trunc, 2
2951 | 2938 |
2952 |// FP modulo x%y. Called by BC_MOD* and vm_arith. 2939 |// FP modulo x%y. Called by BC_MOD* and vm_arith.
2953 |->vm_mod: 2940 |->vm_mod:
@@ -2979,65 +2966,6 @@ static void build_subroutines(BuildCtx *ctx)
2979 | subsd xmm0, xmm1 2966 | subsd xmm0, xmm1
2980 | ret 2967 | ret
2981 | 2968 |
2982 |// FP log2(x). Called by math.log(x, base).
2983 |->vm_log2:
2984 |.if X64WIN
2985 | movsd qword [rsp+8], xmm0 // Use scratch area.
2986 | fld1
2987 | fld qword [rsp+8]
2988 | fyl2x
2989 | fstp qword [rsp+8]
2990 | movsd xmm0, qword [rsp+8]
2991 |.elif X64
2992 | movsd qword [rsp-8], xmm0 // Use red zone.
2993 | fld1
2994 | fld qword [rsp-8]
2995 | fyl2x
2996 | fstp qword [rsp-8]
2997 | movsd xmm0, qword [rsp-8]
2998 |.else
2999 | fld1
3000 | fld qword [esp+4]
3001 | fyl2x
3002 |.endif
3003 | ret
3004 |
3005 |// FP exponentiation e^x and 2^x. Called by math.exp fast function and
3006 |// from JIT code. Arg/ret on x87 stack. No int/xmm regs modified.
3007 |// Caveat: needs 3 slots on x87 stack!
3008 |->vm_exp_x87:
3009 | fldl2e; fmulp st1 // e^x ==> 2^(x*log2(e))
3010 |->vm_exp2_x87:
3011 | .if X64WIN
3012 | .define expscratch, dword [rsp+8] // Use scratch area.
3013 | .elif X64
3014 | .define expscratch, dword [rsp-8] // Use red zone.
3015 | .else
3016 | .define expscratch, dword [esp+4] // Needs 4 byte scratch area.
3017 | .endif
3018 | fst expscratch // Caveat: overwrites ARG1.
3019 | cmp expscratch, 0x7f800000; je >1 // Special case: e^+Inf = +Inf
3020 | cmp expscratch, 0xff800000; je >2 // Special case: e^-Inf = 0
3021 |->vm_exp2raw: // Entry point for vm_pow. Without +-Inf check.
3022 | fdup; frndint; fsub st1, st0; fxch // Split into frac/int part.
3023 | f2xm1; fld1; faddp st1; fscale; fpop1 // ==> (2^frac-1 +1) << int
3024 |1:
3025 | ret
3026 |2:
3027 | fpop; fldz; ret
3028 |
3029 |// Generic power function x^y. Called by BC_POW, math.pow fast function,
3030 |// and vm_arith.
3031 |// Args in xmm0/xmm1. Ret in xmm0. xmm0-xmm2 and RC (eax) modified.
3032 |// Needs 16 byte scratch area for x86. Also called from JIT code.
3033 |->vm_pow_sse:
3034 | cvttsd2si eax, xmm1
3035 | cvtsi2sd xmm2, eax
3036 | ucomisd xmm1, xmm2
3037 | jnz >8 // Branch for FP exponents.
3038 | jp >9 // Branch for NaN exponent.
3039 | // Fallthrough.
3040 |
3041 |// Args in xmm0/eax. Ret in xmm0. xmm0-xmm1 and eax modified. 2969 |// Args in xmm0/eax. Ret in xmm0. xmm0-xmm1 and eax modified.
3042 |->vm_powi_sse: 2970 |->vm_powi_sse:
3043 | cmp eax, 1; jle >6 // i<=1? 2971 | cmp eax, 1; jle >6 // i<=1?
@@ -3073,246 +3001,6 @@ static void build_subroutines(BuildCtx *ctx)
3073 | sseconst_1 xmm0, RDa 3001 | sseconst_1 xmm0, RDa
3074 | ret 3002 | ret
3075 | 3003 |
3076 |8: // FP/FP power function x^y.
3077 |.if X64
3078 | movd rax, xmm1; shl rax, 1
3079 | rol rax, 12; cmp rax, 0xffe; je >2 // x^+-Inf?
3080 | movd rax, xmm0; shl rax, 1; je >4 // +-0^y?
3081 | rol rax, 12; cmp rax, 0xffe; je >5 // +-Inf^y?
3082 | .if X64WIN
3083 | movsd qword [rsp+16], xmm1 // Use scratch area.
3084 | movsd qword [rsp+8], xmm0
3085 | fld qword [rsp+16]
3086 | fld qword [rsp+8]
3087 | .else
3088 | movsd qword [rsp-16], xmm1 // Use red zone.
3089 | movsd qword [rsp-8], xmm0
3090 | fld qword [rsp-16]
3091 | fld qword [rsp-8]
3092 | .endif
3093 |.else
3094 | movsd qword [esp+12], xmm1 // Needs 16 byte scratch area.
3095 | movsd qword [esp+4], xmm0
3096 | cmp dword [esp+12], 0; jne >1
3097 | mov eax, [esp+16]; shl eax, 1
3098 | cmp eax, 0xffe00000; je >2 // x^+-Inf?
3099 |1:
3100 | cmp dword [esp+4], 0; jne >1
3101 | mov eax, [esp+8]; shl eax, 1; je >4 // +-0^y?
3102 | cmp eax, 0xffe00000; je >5 // +-Inf^y?
3103 |1:
3104 | fld qword [esp+12]
3105 | fld qword [esp+4]
3106 |.endif
3107 | fyl2x // y*log2(x)
3108 | fdup; frndint; fsub st1, st0; fxch // Split into frac/int part.
3109 | f2xm1; fld1; faddp st1; fscale; fpop1 // ==> (2^frac-1 +1) << int
3110 |.if X64WIN
3111 | fstp qword [rsp+8] // Use scratch area.
3112 | movsd xmm0, qword [rsp+8]
3113 |.elif X64
3114 | fstp qword [rsp-8] // Use red zone.
3115 | movsd xmm0, qword [rsp-8]
3116 |.else
3117 | fstp qword [esp+4] // Needs 8 byte scratch area.
3118 | movsd xmm0, qword [esp+4]
3119 |.endif
3120 | ret
3121 |
3122 |9: // Handle x^NaN.
3123 | sseconst_1 xmm2, RDa
3124 | ucomisd xmm0, xmm2; je >1 // 1^NaN ==> 1
3125 | movaps xmm0, xmm1 // x^NaN ==> NaN
3126 |1:
3127 | ret
3128 |
3129 |2: // Handle x^+-Inf.
3130 | sseconst_abs xmm2, RDa
3131 | andpd xmm0, xmm2 // |x|
3132 | sseconst_1 xmm2, RDa
3133 | ucomisd xmm0, xmm2; je <1 // +-1^+-Inf ==> 1
3134 | movmskpd eax, xmm1
3135 | xorps xmm0, xmm0
3136 | mov ah, al; setc al; xor al, ah; jne <1 // |x|<>1, x^+-Inf ==> +Inf/0
3137 |3:
3138 | sseconst_hi xmm0, RDa, 7ff00000 // +Inf
3139 | ret
3140 |
3141 |4: // Handle +-0^y.
3142 | movmskpd eax, xmm1; test eax, eax; jnz <3 // y < 0, +-0^y ==> +Inf
3143 | xorps xmm0, xmm0 // y >= 0, +-0^y ==> 0
3144 | ret
3145 |
3146 |5: // Handle +-Inf^y.
3147 | movmskpd eax, xmm1; test eax, eax; jz <3 // y >= 0, +-Inf^y ==> +Inf
3148 | xorps xmm0, xmm0 // y < 0, +-Inf^y ==> 0
3149 | ret
3150 |
3151 |// Callable from C: double lj_vm_foldfpm(double x, int fpm)
3152 |// Computes fpm(x) for extended math functions. ORDER FPM.
3153 |->vm_foldfpm:
3154 |.if JIT
3155 |.if X64
3156 | .if X64WIN
3157 | .define fpmop, CARG2d
3158 | .else
3159 | .define fpmop, CARG1d
3160 | .endif
3161 | cmp fpmop, 1; jb ->vm_floor_sse; je ->vm_ceil_sse
3162 | cmp fpmop, 3; jb ->vm_trunc_sse; ja >2
3163 | sqrtsd xmm0, xmm0; ret
3164 |2:
3165 | .if X64WIN
3166 | movsd qword [rsp+8], xmm0 // Use scratch area.
3167 | fld qword [rsp+8]
3168 | .else
3169 | movsd qword [rsp-8], xmm0 // Use red zone.
3170 | fld qword [rsp-8]
3171 | .endif
3172 | cmp fpmop, 5; ja >2
3173 | .if X64WIN; pop rax; .endif
3174 | je >1
3175 | call ->vm_exp_x87
3176 | .if X64WIN; push rax; .endif
3177 | jmp >7
3178 |1:
3179 | call ->vm_exp2_x87
3180 | .if X64WIN; push rax; .endif
3181 | jmp >7
3182 |2: ; cmp fpmop, 7; je >1; ja >2
3183 | fldln2; fxch; fyl2x; jmp >7
3184 |1: ; fld1; fxch; fyl2x; jmp >7
3185 |2: ; cmp fpmop, 9; je >1; ja >2
3186 | fldlg2; fxch; fyl2x; jmp >7
3187 |1: ; fsin; jmp >7
3188 |2: ; cmp fpmop, 11; je >1; ja >9
3189 | fcos; jmp >7
3190 |1: ; fptan; fpop
3191 |7:
3192 | .if X64WIN
3193 | fstp qword [rsp+8] // Use scratch area.
3194 | movsd xmm0, qword [rsp+8]
3195 | .else
3196 | fstp qword [rsp-8] // Use red zone.
3197 | movsd xmm0, qword [rsp-8]
3198 | .endif
3199 | ret
3200 |.else // x86 calling convention.
3201 | .define fpmop, eax
3202 | mov fpmop, [esp+12]
3203 | movsd xmm0, qword [esp+4]
3204 | cmp fpmop, 1; je >1; ja >2
3205 | call ->vm_floor_sse; jmp >7
3206 |1: ; call ->vm_ceil_sse; jmp >7
3207 |2: ; cmp fpmop, 3; je >1; ja >2
3208 | call ->vm_trunc_sse; jmp >7
3209 |1:
3210 | sqrtsd xmm0, xmm0
3211 |7:
3212 | movsd qword [esp+4], xmm0 // Overwrite callee-owned args.
3213 | fld qword [esp+4]
3214 | ret
3215 |2: ; fld qword [esp+4]
3216 | cmp fpmop, 5; jb ->vm_exp_x87; je ->vm_exp2_x87
3217 |2: ; cmp fpmop, 7; je >1; ja >2
3218 | fldln2; fxch; fyl2x; ret
3219 |1: ; fld1; fxch; fyl2x; ret
3220 |2: ; cmp fpmop, 9; je >1; ja >2
3221 | fldlg2; fxch; fyl2x; ret
3222 |1: ; fsin; ret
3223 |2: ; cmp fpmop, 11; je >1; ja >9
3224 | fcos; ret
3225 |1: ; fptan; fpop; ret
3226 |.endif
3227 |9: ; int3 // Bad fpm.
3228 |.endif
3229 |
3230 |// Callable from C: double lj_vm_foldarith(double x, double y, int op)
3231 |// Compute x op y for basic arithmetic operators (+ - * / % ^ and unary -)
3232 |// and basic math functions. ORDER ARITH
3233 |->vm_foldarith:
3234 |.if X64
3235 |
3236 | .if X64WIN
3237 | .define foldop, CARG3d
3238 | .else
3239 | .define foldop, CARG1d
3240 | .endif
3241 | cmp foldop, 1; je >1; ja >2
3242 | addsd xmm0, xmm1; ret
3243 |1: ; subsd xmm0, xmm1; ret
3244 |2: ; cmp foldop, 3; je >1; ja >2
3245 | mulsd xmm0, xmm1; ret
3246 |1: ; divsd xmm0, xmm1; ret
3247 |2: ; cmp foldop, 5; jb ->vm_mod; je ->vm_pow_sse
3248 | cmp foldop, 7; je >1; ja >2
3249 | sseconst_sign xmm1, RDa; xorps xmm0, xmm1; ret
3250 |1: ; sseconst_abs xmm1, RDa; andps xmm0, xmm1; ret
3251 |2: ; cmp foldop, 9; ja >2
3252 |.if X64WIN
3253 | movsd qword [rsp+8], xmm0 // Use scratch area.
3254 | movsd qword [rsp+16], xmm1
3255 | fld qword [rsp+8]
3256 | fld qword [rsp+16]
3257 |.else
3258 | movsd qword [rsp-8], xmm0 // Use red zone.
3259 | movsd qword [rsp-16], xmm1
3260 | fld qword [rsp-8]
3261 | fld qword [rsp-16]
3262 |.endif
3263 | je >1
3264 | fpatan
3265 |7:
3266 |.if X64WIN
3267 | fstp qword [rsp+8] // Use scratch area.
3268 | movsd xmm0, qword [rsp+8]
3269 |.else
3270 | fstp qword [rsp-8] // Use red zone.
3271 | movsd xmm0, qword [rsp-8]
3272 |.endif
3273 | ret
3274 |1: ; fxch; fscale; fpop1; jmp <7
3275 |2: ; cmp foldop, 11; je >1; ja >9
3276 | minsd xmm0, xmm1; ret
3277 |1: ; maxsd xmm0, xmm1; ret
3278 |9: ; int3 // Bad op.
3279 |
3280 |.else // x86 calling convention.
3281 |
3282 | .define foldop, eax
3283 | mov foldop, [esp+20]
3284 | movsd xmm0, qword [esp+4]
3285 | movsd xmm1, qword [esp+12]
3286 | cmp foldop, 1; je >1; ja >2
3287 | addsd xmm0, xmm1
3288 |7:
3289 | movsd qword [esp+4], xmm0 // Overwrite callee-owned args.
3290 | fld qword [esp+4]
3291 | ret
3292 |1: ; subsd xmm0, xmm1; jmp <7
3293 |2: ; cmp foldop, 3; je >1; ja >2
3294 | mulsd xmm0, xmm1; jmp <7
3295 |1: ; divsd xmm0, xmm1; jmp <7
3296 |2: ; cmp foldop, 5
3297 | je >1; ja >2
3298 | call ->vm_mod; jmp <7
3299 |1: ; pop edx; call ->vm_pow_sse; push edx; jmp <7 // Writes to scratch area.
3300 |2: ; cmp foldop, 7; je >1; ja >2
3301 | sseconst_sign xmm1, RDa; xorps xmm0, xmm1; jmp <7
3302 |1: ; sseconst_abs xmm1, RDa; andps xmm0, xmm1; jmp <7
3303 |2: ; cmp foldop, 9; ja >2
3304 | fld qword [esp+4] // Reload from stack
3305 | fld qword [esp+12]
3306 | je >1
3307 | fpatan; ret
3308 |1: ; fxch; fscale; fpop1; ret
3309 |2: ; cmp foldop, 11; je >1; ja >9
3310 | minsd xmm0, xmm1; jmp <7
3311 |1: ; maxsd xmm0, xmm1; jmp <7
3312 |9: ; int3 // Bad op.
3313 |
3314 |.endif
3315 |
3316 |//----------------------------------------------------------------------- 3004 |//-----------------------------------------------------------------------
3317 |//-- Miscellaneous functions -------------------------------------------- 3005 |//-- Miscellaneous functions --------------------------------------------
3318 |//----------------------------------------------------------------------- 3006 |//-----------------------------------------------------------------------
@@ -4107,8 +3795,19 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
4107 break; 3795 break;
4108 case BC_POW: 3796 case BC_POW:
4109 | ins_arithpre movsd, xmm1 3797 | ins_arithpre movsd, xmm1
4110 | call ->vm_pow_sse 3798 | mov RB, BASE
3799 |.if not X64
3800 | movsd FPARG1, xmm0
3801 | movsd FPARG3, xmm1
3802 |.endif
3803 | call extern pow
3804 | movzx RA, PC_RA
3805 | mov BASE, RB
3806 |.if X64
4111 | ins_arithpost 3807 | ins_arithpost
3808 |.else
3809 | fstp qword [BASE+RA*8]
3810 |.endif
4112 | ins_next 3811 | ins_next
4113 break; 3812 break;
4114 3813