about summary refs log tree commit diff
diff options
context:
space:
mode:
author Mike Pall <mike> 2013-02-21 16:56:59 +0100
committer Mike Pall <mike> 2013-02-21 16:56:59 +0100
commit 57768cd5882eb8d39c673d9dd8598946ef7c1843 (patch)
tree f3f5663e8fb76b965e704aca33347d72cbfa3532
parent 61fb587d2c1646cae4c90990b9c4c1f1bff09e5b (diff)
download luajit-57768cd5882eb8d39c673d9dd8598946ef7c1843.tar.gz
luajit-57768cd5882eb8d39c673d9dd8598946ef7c1843.tar.bz2
luajit-57768cd5882eb8d39c673d9dd8598946ef7c1843.zip
x86: Remove x87 support from interpreter.
SSE2 required from now on.
-rw-r--r-- src/Makefile 11
-rw-r--r-- src/lib_jit.c 22
-rw-r--r-- src/lj_asm.c 2
-rw-r--r-- src/lj_jit.h 18
-rw-r--r-- src/lj_vm.h 4
-rw-r--r-- src/msvcbuild.bat 1
-rw-r--r-- src/vm_x86.dasc 687
7 files changed, 100 insertions, 645 deletions
diff --git a/src/Makefile b/src/Makefile
index 278324a1..4ea8c85e 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -42,13 +42,10 @@ CCOPT= -O2 -fomit-frame-pointer
42# 42#
43# Target-specific compiler options: 43# Target-specific compiler options:
44# 44#
45# x86 only: it's recommended to compile at least for i686. Better yet,
46# compile for an architecture that has SSE2, too (-msse -msse2).
47#
48# x86/x64 only: For GCC 4.2 or higher and if you don't intend to distribute 45# x86/x64 only: For GCC 4.2 or higher and if you don't intend to distribute
49# the binaries to a different machine you could also use: -march=native 46# the binaries to a different machine you could also use: -march=native
50# 47#
51CCOPT_x86= -march=i686 48CCOPT_x86= -march=i686 -msse -msse2 -mfpmath=sse
52CCOPT_x64= 49CCOPT_x64=
53CCOPT_arm= 50CCOPT_arm=
54CCOPT_ppc= 51CCOPT_ppc=
@@ -394,11 +391,6 @@ DASM_AFLAGS+= -D VER=$(subst LJ_ARCH_VERSION_,,$(filter LJ_ARCH_VERSION_%,$(subs
394ifeq (Windows,$(TARGET_SYS)) 391ifeq (Windows,$(TARGET_SYS))
395 DASM_AFLAGS+= -D WIN 392 DASM_AFLAGS+= -D WIN
396endif 393endif
397ifeq (x86,$(TARGET_LJARCH))
398 ifneq (,$(findstring __SSE2__ 1,$(TARGET_TESTARCH)))
399 DASM_AFLAGS+= -D SSE
400 endif
401else
402ifeq (x64,$(TARGET_LJARCH)) 394ifeq (x64,$(TARGET_LJARCH))
403 DASM_ARCH= x86 395 DASM_ARCH= x86
404else 396else
@@ -423,7 +415,6 @@ ifeq (ppc,$(TARGET_LJARCH))
423endif 415endif
424endif 416endif
425endif 417endif
426endif
427 418
428DASM_FLAGS= $(DASM_XFLAGS) $(DASM_AFLAGS) 419DASM_FLAGS= $(DASM_XFLAGS) $(DASM_AFLAGS)
429DASM_DASC= vm_$(DASM_ARCH).dasc 420DASM_DASC= vm_$(DASM_ARCH).dasc
diff --git a/src/lib_jit.c b/src/lib_jit.c
index 82e68258..1b69caa5 100644
--- a/src/lib_jit.c
+++ b/src/lib_jit.c
@@ -538,18 +538,14 @@ static uint32_t jit_cpudetect(lua_State *L)
538 uint32_t features[4]; 538 uint32_t features[4];
539 if (lj_vm_cpuid(0, vendor) && lj_vm_cpuid(1, features)) { 539 if (lj_vm_cpuid(0, vendor) && lj_vm_cpuid(1, features)) {
540#if !LJ_HASJIT 540#if !LJ_HASJIT
541#define JIT_F_CMOV 1
542#define JIT_F_SSE2 2 541#define JIT_F_SSE2 2
543#endif 542#endif
544 flags |= ((features[3] >> 15)&1) * JIT_F_CMOV;
545 flags |= ((features[3] >> 26)&1) * JIT_F_SSE2; 543 flags |= ((features[3] >> 26)&1) * JIT_F_SSE2;
546#if LJ_HASJIT 544#if LJ_HASJIT
547 flags |= ((features[2] >> 0)&1) * JIT_F_SSE3; 545 flags |= ((features[2] >> 0)&1) * JIT_F_SSE3;
548 flags |= ((features[2] >> 19)&1) * JIT_F_SSE4_1; 546 flags |= ((features[2] >> 19)&1) * JIT_F_SSE4_1;
549 if (vendor[2] == 0x6c65746e) { /* Intel. */ 547 if (vendor[2] == 0x6c65746e) { /* Intel. */
550 if ((features[0] & 0x0ff00f00) == 0x00000f00) /* P4. */ 548 if ((features[0] & 0x0fff0ff0) == 0x000106c0) /* Atom. */
551 flags |= JIT_F_P4; /* Currently unused. */
552 else if ((features[0] & 0x0fff0ff0) == 0x000106c0) /* Atom. */
553 flags |= JIT_F_LEA_AGU; 549 flags |= JIT_F_LEA_AGU;
554 } else if (vendor[2] == 0x444d4163) { /* AMD. */ 550 } else if (vendor[2] == 0x444d4163) { /* AMD. */
555 uint32_t fam = (features[0] & 0x0ff00f00); 551 uint32_t fam = (features[0] & 0x0ff00f00);
@@ -562,14 +558,8 @@ static uint32_t jit_cpudetect(lua_State *L)
562 } 558 }
563 /* Check for required instruction set support on x86 (unnecessary on x64). */ 559 /* Check for required instruction set support on x86 (unnecessary on x64). */
564#if LJ_TARGET_X86 560#if LJ_TARGET_X86
565#if !defined(LUAJIT_CPU_NOCMOV)
566 if (!(flags & JIT_F_CMOV))
567 luaL_error(L, "CPU not supported");
568#endif
569#if defined(LUAJIT_CPU_SSE2)
570 if (!(flags & JIT_F_SSE2)) 561 if (!(flags & JIT_F_SSE2))
571 luaL_error(L, "CPU does not support SSE2 (recompile without -DLUAJIT_CPU_SSE2)"); 562 luaL_error(L, "CPU with SSE2 required");
572#endif
573#endif 563#endif
574#elif LJ_TARGET_ARM 564#elif LJ_TARGET_ARM
575#if LJ_HASJIT 565#if LJ_HASJIT
@@ -631,11 +621,7 @@ static void jit_init(lua_State *L)
631 uint32_t flags = jit_cpudetect(L); 621 uint32_t flags = jit_cpudetect(L);
632#if LJ_HASJIT 622#if LJ_HASJIT
633 jit_State *J = L2J(L); 623 jit_State *J = L2J(L);
634#if LJ_TARGET_X86 624 J->flags = flags | JIT_F_ON | JIT_F_OPT_DEFAULT;
635 /* Silently turn off the JIT compiler on CPUs without SSE2. */
636 if ((flags & JIT_F_SSE2))
637#endif
638 J->flags = flags | JIT_F_ON | JIT_F_OPT_DEFAULT;
639 memcpy(J->param, jit_param_default, sizeof(J->param)); 625 memcpy(J->param, jit_param_default, sizeof(J->param));
640 lj_dispatch_update(G(L)); 626 lj_dispatch_update(G(L));
641#else 627#else
@@ -645,6 +631,7 @@ static void jit_init(lua_State *L)
645 631
646LUALIB_API int luaopen_jit(lua_State *L) 632LUALIB_API int luaopen_jit(lua_State *L)
647{ 633{
634 jit_init(L);
648 lua_pushliteral(L, LJ_OS_NAME); 635 lua_pushliteral(L, LJ_OS_NAME);
649 lua_pushliteral(L, LJ_ARCH_NAME); 636 lua_pushliteral(L, LJ_ARCH_NAME);
650 lua_pushinteger(L, LUAJIT_VERSION_NUM); 637 lua_pushinteger(L, LUAJIT_VERSION_NUM);
@@ -657,7 +644,6 @@ LUALIB_API int luaopen_jit(lua_State *L)
657 LJ_LIB_REG(L, "jit.opt", jit_opt); 644 LJ_LIB_REG(L, "jit.opt", jit_opt);
658#endif 645#endif
659 L->top -= 2; 646 L->top -= 2;
660 jit_init(L);
661 return 1; 647 return 1;
662} 648}
663 649
diff --git a/src/lj_asm.c b/src/lj_asm.c
index c7365404..a01b4e52 100644
--- a/src/lj_asm.c
+++ b/src/lj_asm.c
@@ -1730,7 +1730,7 @@ static void asm_setup_regsp(ASMState *as)
1730 break; 1730 break;
1731 case IR_FPMATH: 1731 case IR_FPMATH:
1732#if LJ_TARGET_X86ORX64 1732#if LJ_TARGET_X86ORX64
1733 if (ir->op2 == IRFPM_EXP2) { /* May be joined to lj_vm_pow_sse. */ 1733 if (ir->op2 == IRFPM_EXP2) { /* May be joined to pow. */
1734 ir->prev = REGSP_HINT(RID_XMM0); 1734 ir->prev = REGSP_HINT(RID_XMM0);
1735#if !LJ_64 1735#if !LJ_64
1736 if (as->evenspill < 4) /* Leave room for 16 byte scratch area. */ 1736 if (as->evenspill < 4) /* Leave room for 16 byte scratch area. */
diff --git a/src/lj_jit.h b/src/lj_jit.h
index c0b1c41e..8b42dd4e 100644
--- a/src/lj_jit.h
+++ b/src/lj_jit.h
@@ -14,18 +14,16 @@
14 14
15/* CPU-specific JIT engine flags. */ 15/* CPU-specific JIT engine flags. */
16#if LJ_TARGET_X86ORX64 16#if LJ_TARGET_X86ORX64
17#define JIT_F_CMOV 0x00000010 17#define JIT_F_SSE2 0x00000010
18#define JIT_F_SSE2 0x00000020 18#define JIT_F_SSE3 0x00000020
19#define JIT_F_SSE3 0x00000040 19#define JIT_F_SSE4_1 0x00000040
20#define JIT_F_SSE4_1 0x00000080 20#define JIT_F_PREFER_IMUL 0x00000080
21#define JIT_F_P4 0x00000100 21#define JIT_F_SPLIT_XMM 0x00000100
22#define JIT_F_PREFER_IMUL 0x00000200 22#define JIT_F_LEA_AGU 0x00000200
23#define JIT_F_SPLIT_XMM 0x00000400
24#define JIT_F_LEA_AGU 0x00000800
25 23
26/* Names for the CPU-specific flags. Must match the order above. */ 24/* Names for the CPU-specific flags. Must match the order above. */
27#define JIT_F_CPU_FIRST JIT_F_CMOV 25#define JIT_F_CPU_FIRST JIT_F_SSE2
28#define JIT_F_CPUSTRING "\4CMOV\4SSE2\4SSE3\6SSE4.1\2P4\3AMD\2K8\4ATOM" 26#define JIT_F_CPUSTRING "\4SSE2\4SSE3\6SSE4.1\3AMD\2K8\4ATOM"
29#elif LJ_TARGET_ARM 27#elif LJ_TARGET_ARM
30#define JIT_F_ARMV6_ 0x00000010 28#define JIT_F_ARMV6_ 0x00000010
31#define JIT_F_ARMV6T2_ 0x00000020 29#define JIT_F_ARMV6T2_ 0x00000020
diff --git a/src/lj_vm.h b/src/lj_vm.h
index c5d05de4..948d63c2 100644
--- a/src/lj_vm.h
+++ b/src/lj_vm.h
@@ -49,12 +49,14 @@ LJ_ASMF void lj_vm_exit_handler(void);
49LJ_ASMF void lj_vm_exit_interp(void); 49LJ_ASMF void lj_vm_exit_interp(void);
50 50
51/* Internal math helper functions. */ 51/* Internal math helper functions. */
52#if LJ_TARGET_X86ORX64 || LJ_TARGET_PPC 52#if LJ_TARGET_PPC
53#define lj_vm_floor floor 53#define lj_vm_floor floor
54#define lj_vm_ceil ceil 54#define lj_vm_ceil ceil
55#else 55#else
56LJ_ASMF double lj_vm_floor(double); 56LJ_ASMF double lj_vm_floor(double);
57#if !LJ_TARGET_X86ORX64
57LJ_ASMF double lj_vm_ceil(double); 58LJ_ASMF double lj_vm_ceil(double);
59#endif
58#if LJ_TARGET_ARM 60#if LJ_TARGET_ARM
59LJ_ASMF double lj_vm_floor_sf(double); 61LJ_ASMF double lj_vm_floor_sf(double);
60LJ_ASMF double lj_vm_ceil_sf(double); 62LJ_ASMF double lj_vm_ceil_sf(double);
diff --git a/src/msvcbuild.bat b/src/msvcbuild.bat
index 745c93ff..1d5bd55a 100644
--- a/src/msvcbuild.bat
+++ b/src/msvcbuild.bat
@@ -35,6 +35,7 @@ if exist minilua.exe.manifest^
35@if errorlevel 8 goto :X64 35@if errorlevel 8 goto :X64
36@set DASMFLAGS=-D WIN -D JIT -D FFI 36@set DASMFLAGS=-D WIN -D JIT -D FFI
37@set LJARCH=x86 37@set LJARCH=x86
38@set LJCOMPILE=%LJCOMPILE% /arch:SSE2
38:X64 39:X64
39minilua %DASM% -LN %DASMFLAGS% -o host\buildvm_arch.h vm_x86.dasc 40minilua %DASM% -LN %DASMFLAGS% -o host\buildvm_arch.h vm_x86.dasc
40@if errorlevel 1 goto :BAD 41@if errorlevel 1 goto :BAD
diff --git a/src/vm_x86.dasc b/src/vm_x86.dasc
index b4674e2b..7020eb27 100644
--- a/src/vm_x86.dasc
+++ b/src/vm_x86.dasc
@@ -18,7 +18,6 @@
18| 18|
19|.if P64 19|.if P64
20|.define X64, 1 20|.define X64, 1
21|.define SSE, 1
22|.if WIN 21|.if WIN
23|.define X64WIN, 1 22|.define X64WIN, 1
24|.endif 23|.endif
@@ -856,13 +855,9 @@ static void build_subroutines(BuildCtx *ctx)
856 |.if DUALNUM 855 |.if DUALNUM
857 | mov TMP2, LJ_TISNUM 856 | mov TMP2, LJ_TISNUM
858 | mov TMP1, RC 857 | mov TMP1, RC
859 |.elif SSE 858 |.else
860 | cvtsi2sd xmm0, RC 859 | cvtsi2sd xmm0, RC
861 | movsd TMPQ, xmm0 860 | movsd TMPQ, xmm0
862 |.else
863 | mov ARG4, RC
864 | fild ARG4
865 | fstp TMPQ
866 |.endif 861 |.endif
867 | lea RCa, TMPQ // Store temp. TValue in TMPQ. 862 | lea RCa, TMPQ // Store temp. TValue in TMPQ.
868 | jmp >1 863 | jmp >1
@@ -935,13 +930,9 @@ static void build_subroutines(BuildCtx *ctx)
935 |.if DUALNUM 930 |.if DUALNUM
936 | mov TMP2, LJ_TISNUM 931 | mov TMP2, LJ_TISNUM
937 | mov TMP1, RC 932 | mov TMP1, RC
938 |.elif SSE 933 |.else
939 | cvtsi2sd xmm0, RC 934 | cvtsi2sd xmm0, RC
940 | movsd TMPQ, xmm0 935 | movsd TMPQ, xmm0
941 |.else
942 | mov ARG4, RC
943 | fild ARG4
944 | fstp TMPQ
945 |.endif 936 |.endif
946 | lea RCa, TMPQ // Store temp. TValue in TMPQ. 937 | lea RCa, TMPQ // Store temp. TValue in TMPQ.
947 | jmp >1 938 | jmp >1
@@ -1509,11 +1500,7 @@ static void build_subroutines(BuildCtx *ctx)
1509 |.else 1500 |.else
1510 | jae ->fff_fallback 1501 | jae ->fff_fallback
1511 |.endif 1502 |.endif
1512 |.if SSE
1513 | movsd xmm0, qword [BASE]; jmp ->fff_resxmm0 1503 | movsd xmm0, qword [BASE]; jmp ->fff_resxmm0
1514 |.else
1515 | fld qword [BASE]; jmp ->fff_resn
1516 |.endif
1517 | 1504 |
1518 |.ffunc_1 tostring 1505 |.ffunc_1 tostring
1519 | // Only handles the string or number case inline. 1506 | // Only handles the string or number case inline.
@@ -1631,19 +1618,12 @@ static void build_subroutines(BuildCtx *ctx)
1631 | add RD, 1 1618 | add RD, 1
1632 | mov dword [BASE-4], LJ_TISNUM 1619 | mov dword [BASE-4], LJ_TISNUM
1633 | mov dword [BASE-8], RD 1620 | mov dword [BASE-8], RD
1634 |.elif SSE 1621 |.else
1635 | movsd xmm0, qword [BASE+8] 1622 | movsd xmm0, qword [BASE+8]
1636 | sseconst_1 xmm1, RBa 1623 | sseconst_1 xmm1, RBa
1637 | addsd xmm0, xmm1 1624 | addsd xmm0, xmm1
1638 | cvtsd2si RD, xmm0 1625 | cvtsd2si RD, xmm0
1639 | movsd qword [BASE-8], xmm0 1626 | movsd qword [BASE-8], xmm0
1640 |.else
1641 | fld qword [BASE+8]
1642 | fld1
1643 | faddp st1
1644 | fist ARG1
1645 | fstp qword [BASE-8]
1646 | mov RD, ARG1
1647 |.endif 1627 |.endif
1648 | mov TAB:RB, [BASE] 1628 | mov TAB:RB, [BASE]
1649 | cmp RD, TAB:RB->asize; jae >2 // Not in array part? 1629 | cmp RD, TAB:RB->asize; jae >2 // Not in array part?
@@ -1690,12 +1670,9 @@ static void build_subroutines(BuildCtx *ctx)
1690 |.if DUALNUM 1670 |.if DUALNUM
1691 | mov dword [BASE+12], LJ_TISNUM 1671 | mov dword [BASE+12], LJ_TISNUM
1692 | mov dword [BASE+8], 0 1672 | mov dword [BASE+8], 0
1693 |.elif SSE 1673 |.else
1694 | xorps xmm0, xmm0 1674 | xorps xmm0, xmm0
1695 | movsd qword [BASE+8], xmm0 1675 | movsd qword [BASE+8], xmm0
1696 |.else
1697 | fldz
1698 | fstp qword [BASE+8]
1699 |.endif 1676 |.endif
1700 | mov RD, 1+3 1677 | mov RD, 1+3
1701 | jmp ->fff_res 1678 | jmp ->fff_res
@@ -1925,12 +1902,10 @@ static void build_subroutines(BuildCtx *ctx)
1925 |->fff_resi: // Dummy. 1902 |->fff_resi: // Dummy.
1926 |.endif 1903 |.endif
1927 | 1904 |
1928 |.if SSE
1929 |->fff_resn: 1905 |->fff_resn:
1930 | mov PC, [BASE-4] 1906 | mov PC, [BASE-4]
1931 | fstp qword [BASE-8] 1907 | fstp qword [BASE-8]
1932 | jmp ->fff_res1 1908 | jmp ->fff_res1
1933 |.endif
1934 | 1909 |
1935 | .ffunc_1 math_abs 1910 | .ffunc_1 math_abs
1936 |.if DUALNUM 1911 |.if DUALNUM
@@ -1954,8 +1929,6 @@ static void build_subroutines(BuildCtx *ctx)
1954 |.else 1929 |.else
1955 | cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback 1930 | cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
1956 |.endif 1931 |.endif
1957 |
1958 |.if SSE
1959 | movsd xmm0, qword [BASE] 1932 | movsd xmm0, qword [BASE]
1960 | sseconst_abs xmm1, RDa 1933 | sseconst_abs xmm1, RDa
1961 | andps xmm0, xmm1 1934 | andps xmm0, xmm1
@@ -1963,15 +1936,6 @@ static void build_subroutines(BuildCtx *ctx)
1963 | mov PC, [BASE-4] 1936 | mov PC, [BASE-4]
1964 | movsd qword [BASE-8], xmm0 1937 | movsd qword [BASE-8], xmm0
1965 | // fallthrough 1938 | // fallthrough
1966 |.else
1967 | fld qword [BASE]
1968 | fabs
1969 | // fallthrough
1970 |->fff_resxmm0: // Dummy.
1971 |->fff_resn:
1972 | mov PC, [BASE-4]
1973 | fstp qword [BASE-8]
1974 |.endif
1975 | 1939 |
1976 |->fff_res1: 1940 |->fff_res1:
1977 | mov RD, 1+1 1941 | mov RD, 1+1
@@ -2008,48 +1972,24 @@ static void build_subroutines(BuildCtx *ctx)
2008 |.else 1972 |.else
2009 | cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback 1973 | cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
2010 |.endif 1974 |.endif
2011 |.if SSE
2012 | movsd xmm0, qword [BASE] 1975 | movsd xmm0, qword [BASE]
2013 | call ->vm_ .. func 1976 | call ->vm_ .. func .. _sse
2014 | .if DUALNUM 1977 |.if DUALNUM
2015 | cvtsd2si RB, xmm0 1978 | cvtsd2si RB, xmm0
2016 | cmp RB, 0x80000000 1979 | cmp RB, 0x80000000
2017 | jne ->fff_resi 1980 | jne ->fff_resi
2018 | cvtsi2sd xmm1, RB 1981 | cvtsi2sd xmm1, RB
2019 | ucomisd xmm0, xmm1 1982 | ucomisd xmm0, xmm1
2020 | jp ->fff_resxmm0 1983 | jp ->fff_resxmm0
2021 | je ->fff_resi 1984 | je ->fff_resi
2022 | .endif
2023 | jmp ->fff_resxmm0
2024 |.else
2025 | fld qword [BASE]
2026 | call ->vm_ .. func
2027 | .if DUALNUM
2028 | fist ARG1
2029 | mov RB, ARG1
2030 | cmp RB, 0x80000000; jne >2
2031 | fdup
2032 | fild ARG1
2033 | fcomparepp
2034 | jp ->fff_resn
2035 | jne ->fff_resn
2036 |2:
2037 | fpop
2038 | jmp ->fff_resi
2039 | .else
2040 | jmp ->fff_resn
2041 | .endif
2042 |.endif 1985 |.endif
1986 | jmp ->fff_resxmm0
2043 |.endmacro 1987 |.endmacro
2044 | 1988 |
2045 | math_round floor 1989 | math_round floor
2046 | math_round ceil 1990 | math_round ceil
2047 | 1991 |
2048 |.if SSE
2049 |.ffunc_nsse math_sqrt, sqrtsd; jmp ->fff_resxmm0 1992 |.ffunc_nsse math_sqrt, sqrtsd; jmp ->fff_resxmm0
2050 |.else
2051 |.ffunc_n math_sqrt; fsqrt; jmp ->fff_resn
2052 |.endif
2053 | 1993 |
2054 |.ffunc math_log 1994 |.ffunc math_log
2055 | cmp NARGS:RD, 1+1; jne ->fff_fallback // Exactly one argument. 1995 | cmp NARGS:RD, 1+1; jne ->fff_fallback // Exactly one argument.
@@ -2072,23 +2012,18 @@ static void build_subroutines(BuildCtx *ctx)
2072 |.ffunc_n math_atan; fld1; fpatan; jmp ->fff_resn 2012 |.ffunc_n math_atan; fld1; fpatan; jmp ->fff_resn
2073 | 2013 |
2074 |.macro math_extern, func 2014 |.macro math_extern, func
2075 |.if SSE
2076 | .ffunc_nsse math_ .. func 2015 | .ffunc_nsse math_ .. func
2077 | .if not X64 2016 |.if not X64
2078 | movsd FPARG1, xmm0 2017 | movsd FPARG1, xmm0
2079 | .endif
2080 |.else
2081 | .ffunc_n math_ .. func
2082 | fstp FPARG1
2083 |.endif 2018 |.endif
2084 | mov RB, BASE 2019 | mov RB, BASE
2085 | call extern lj_vm_ .. func 2020 | call extern lj_vm_ .. func
2086 | mov BASE, RB 2021 | mov BASE, RB
2087 | .if X64 2022 |.if X64
2088 | jmp ->fff_resxmm0 2023 | jmp ->fff_resxmm0
2089 | .else 2024 |.else
2090 | jmp ->fff_resn 2025 | jmp ->fff_resn
2091 | .endif 2026 |.endif
2092 |.endmacro 2027 |.endmacro
2093 | 2028 |
2094 | math_extern sinh 2029 | math_extern sinh
@@ -2096,17 +2031,10 @@ static void build_subroutines(BuildCtx *ctx)
2096 | math_extern tanh 2031 | math_extern tanh
2097 | 2032 |
2098 |->ff_math_deg: 2033 |->ff_math_deg:
2099 |.if SSE
2100 |.ffunc_nsse math_rad 2034 |.ffunc_nsse math_rad
2101 | mov CFUNC:RB, [BASE-8] 2035 | mov CFUNC:RB, [BASE-8]
2102 | mulsd xmm0, qword CFUNC:RB->upvalue[0] 2036 | mulsd xmm0, qword CFUNC:RB->upvalue[0]
2103 | jmp ->fff_resxmm0 2037 | jmp ->fff_resxmm0
2104 |.else
2105 |.ffunc_n math_rad
2106 | mov CFUNC:RB, [BASE-8]
2107 | fmul qword CFUNC:RB->upvalue[0]
2108 | jmp ->fff_resn
2109 |.endif
2110 | 2038 |
2111 |.ffunc_nn math_atan2; fpatan; jmp ->fff_resn 2039 |.ffunc_nn math_atan2; fpatan; jmp ->fff_resn
2112 |.ffunc_nnr math_ldexp; fscale; fpop1; jmp ->fff_resn 2040 |.ffunc_nnr math_ldexp; fscale; fpop1; jmp ->fff_resn
@@ -2123,65 +2051,34 @@ static void build_subroutines(BuildCtx *ctx)
2123 | cmp RB, 0x00200000; jb >4 2051 | cmp RB, 0x00200000; jb >4
2124 |1: 2052 |1:
2125 | shr RB, 21; sub RB, RC // Extract and unbias exponent. 2053 | shr RB, 21; sub RB, RC // Extract and unbias exponent.
2126 |.if SSE
2127 | cvtsi2sd xmm0, RB 2054 | cvtsi2sd xmm0, RB
2128 |.else
2129 | mov TMP1, RB; fild TMP1
2130 |.endif
2131 | mov RB, [BASE-4] 2055 | mov RB, [BASE-4]
2132 | and RB, 0x800fffff // Mask off exponent. 2056 | and RB, 0x800fffff // Mask off exponent.
2133 | or RB, 0x3fe00000 // Put mantissa in range [0.5,1) or 0. 2057 | or RB, 0x3fe00000 // Put mantissa in range [0.5,1) or 0.
2134 | mov [BASE-4], RB 2058 | mov [BASE-4], RB
2135 |2: 2059 |2:
2136 |.if SSE
2137 | movsd qword [BASE], xmm0 2060 | movsd qword [BASE], xmm0
2138 |.else
2139 | fstp qword [BASE]
2140 |.endif
2141 | mov RD, 1+2 2061 | mov RD, 1+2
2142 | jmp ->fff_res 2062 | jmp ->fff_res
2143 |3: // Return +-0, +-Inf, NaN unmodified and an exponent of 0. 2063 |3: // Return +-0, +-Inf, NaN unmodified and an exponent of 0.
2144 |.if SSE
2145 | xorps xmm0, xmm0; jmp <2 2064 | xorps xmm0, xmm0; jmp <2
2146 |.else
2147 | fldz; jmp <2
2148 |.endif
2149 |4: // Handle denormals by multiplying with 2^54 and adjusting the bias. 2065 |4: // Handle denormals by multiplying with 2^54 and adjusting the bias.
2150 |.if SSE
2151 | movsd xmm0, qword [BASE] 2066 | movsd xmm0, qword [BASE]
2152 | sseconst_hi xmm1, RBa, 43500000 // 2^54. 2067 | sseconst_hi xmm1, RBa, 43500000 // 2^54.
2153 | mulsd xmm0, xmm1 2068 | mulsd xmm0, xmm1
2154 | movsd qword [BASE-8], xmm0 2069 | movsd qword [BASE-8], xmm0
2155 |.else
2156 | fld qword [BASE]
2157 | mov TMP1, 0x5a800000; fmul TMP1 // x = x*2^54
2158 | fstp qword [BASE-8]
2159 |.endif
2160 | mov RB, [BASE-4]; mov RC, 1076; shl RB, 1; jmp <1 2070 | mov RB, [BASE-4]; mov RC, 1076; shl RB, 1; jmp <1
2161 | 2071 |
2162 |.if SSE
2163 |.ffunc_nsse math_modf 2072 |.ffunc_nsse math_modf
2164 |.else
2165 |.ffunc_n math_modf
2166 |.endif
2167 | mov RB, [BASE+4] 2073 | mov RB, [BASE+4]
2168 | mov PC, [BASE-4] 2074 | mov PC, [BASE-4]
2169 | shl RB, 1; cmp RB, 0xffe00000; je >4 // +-Inf? 2075 | shl RB, 1; cmp RB, 0xffe00000; je >4 // +-Inf?
2170 |.if SSE
2171 | movaps xmm4, xmm0 2076 | movaps xmm4, xmm0
2172 | call ->vm_trunc 2077 | call ->vm_trunc_sse
2173 | subsd xmm4, xmm0 2078 | subsd xmm4, xmm0
2174 |1: 2079 |1:
2175 | movsd qword [BASE-8], xmm0 2080 | movsd qword [BASE-8], xmm0
2176 | movsd qword [BASE], xmm4 2081 | movsd qword [BASE], xmm4
2177 |.else
2178 | fdup
2179 | call ->vm_trunc
2180 | fsub st1, st0
2181 |1:
2182 | fstp qword [BASE-8]
2183 | fstp qword [BASE]
2184 |.endif
2185 | mov RC, [BASE-4]; mov RB, [BASE+4] 2082 | mov RC, [BASE-4]; mov RB, [BASE+4]
2186 | xor RC, RB; js >3 // Need to adjust sign? 2083 | xor RC, RB; js >3 // Need to adjust sign?
2187 |2: 2084 |2:
@@ -2191,24 +2088,16 @@ static void build_subroutines(BuildCtx *ctx)
2191 | xor RB, 0x80000000; mov [BASE+4], RB // Flip sign of fraction. 2088 | xor RB, 0x80000000; mov [BASE+4], RB // Flip sign of fraction.
2192 | jmp <2 2089 | jmp <2
2193 |4: 2090 |4:
2194 |.if SSE
2195 | xorps xmm4, xmm4; jmp <1 // Return +-Inf and +-0. 2091 | xorps xmm4, xmm4; jmp <1 // Return +-Inf and +-0.
2196 |.else
2197 | fldz; fxch; jmp <1 // Return +-Inf and +-0.
2198 |.endif
2199 | 2092 |
2200 |.ffunc_nnr math_fmod 2093 |.ffunc_nnr math_fmod
2201 |1: ; fprem; fnstsw ax; sahf; jp <1 2094 |1: ; fprem; fnstsw ax; sahf; jp <1
2202 | fpop1 2095 | fpop1
2203 | jmp ->fff_resn 2096 | jmp ->fff_resn
2204 | 2097 |
2205 |.if SSE 2098 |.ffunc_nnsse math_pow; call ->vm_pow_sse; jmp ->fff_resxmm0
2206 |.ffunc_nnsse math_pow; call ->vm_pow; jmp ->fff_resxmm0
2207 |.else
2208 |.ffunc_nn math_pow; call ->vm_pow; jmp ->fff_resn
2209 |.endif
2210 | 2099 |
2211 |.macro math_minmax, name, cmovop, fcmovop, sseop 2100 |.macro math_minmax, name, cmovop, sseop
2212 | .ffunc name 2101 | .ffunc name
2213 | mov RA, 2 2102 | mov RA, 2
2214 | cmp dword [BASE+4], LJ_TISNUM 2103 | cmp dword [BASE+4], LJ_TISNUM
@@ -2225,12 +2114,7 @@ static void build_subroutines(BuildCtx *ctx)
2225 |3: 2114 |3:
2226 | ja ->fff_fallback 2115 | ja ->fff_fallback
2227 | // Convert intermediate result to number and continue below. 2116 | // Convert intermediate result to number and continue below.
2228 |.if SSE
2229 | cvtsi2sd xmm0, RB 2117 | cvtsi2sd xmm0, RB
2230 |.else
2231 | mov TMP1, RB
2232 | fild TMP1
2233 |.endif
2234 | jmp >6 2118 | jmp >6
2235 |4: 2119 |4:
2236 | ja ->fff_fallback 2120 | ja ->fff_fallback
@@ -2238,7 +2122,6 @@ static void build_subroutines(BuildCtx *ctx)
2238 | jae ->fff_fallback 2122 | jae ->fff_fallback
2239 |.endif 2123 |.endif
2240 | 2124 |
2241 |.if SSE
2242 | movsd xmm0, qword [BASE] 2125 | movsd xmm0, qword [BASE]
2243 |5: // Handle numbers or integers. 2126 |5: // Handle numbers or integers.
2244 | cmp RA, RD; jae ->fff_resxmm0 2127 | cmp RA, RD; jae ->fff_resxmm0
@@ -2257,34 +2140,10 @@ static void build_subroutines(BuildCtx *ctx)
2257 | sseop xmm0, xmm1 2140 | sseop xmm0, xmm1
2258 | add RA, 1 2141 | add RA, 1
2259 | jmp <5 2142 | jmp <5
2260 |.else
2261 | fld qword [BASE]
2262 |5: // Handle numbers or integers.
2263 | cmp RA, RD; jae ->fff_resn
2264 | cmp dword [BASE+RA*8-4], LJ_TISNUM
2265 |.if DUALNUM
2266 | jb >6
2267 | ja >9
2268 | fild dword [BASE+RA*8-8]
2269 | jmp >7
2270 |.else
2271 | jae >9
2272 |.endif
2273 |6:
2274 | fld qword [BASE+RA*8-8]
2275 |7:
2276 | fucomi st1; fcmovop st1; fpop1
2277 | add RA, 1
2278 | jmp <5
2279 |.endif
2280 |.endmacro 2143 |.endmacro
2281 | 2144 |
2282 | math_minmax math_min, cmovg, fcmovnbe, minsd 2145 | math_minmax math_min, cmovg, minsd
2283 | math_minmax math_max, cmovl, fcmovbe, maxsd 2146 | math_minmax math_max, cmovl, maxsd
2284 |.if not SSE
2285 |9:
2286 | fpop; jmp ->fff_fallback
2287 |.endif
2288 | 2147 |
2289 |//-- String library ----------------------------------------------------- 2148 |//-- String library -----------------------------------------------------
2290 | 2149 |
@@ -2293,10 +2152,8 @@ static void build_subroutines(BuildCtx *ctx)
2293 | mov STR:RB, [BASE] 2152 | mov STR:RB, [BASE]
2294 |.if DUALNUM 2153 |.if DUALNUM
2295 | mov RB, dword STR:RB->len; jmp ->fff_resi 2154 | mov RB, dword STR:RB->len; jmp ->fff_resi
2296 |.elif SSE
2297 | cvtsi2sd xmm0, dword STR:RB->len; jmp ->fff_resxmm0
2298 |.else 2155 |.else
2299 | fild dword STR:RB->len; jmp ->fff_resn 2156 | cvtsi2sd xmm0, dword STR:RB->len; jmp ->fff_resxmm0
2300 |.endif 2157 |.endif
2301 | 2158 |
2302 |.ffunc string_byte // Only handle the 1-arg case here. 2159 |.ffunc string_byte // Only handle the 1-arg case here.
@@ -2309,10 +2166,8 @@ static void build_subroutines(BuildCtx *ctx)
2309 | movzx RB, byte STR:RB[1] 2166 | movzx RB, byte STR:RB[1]
2310 |.if DUALNUM 2167 |.if DUALNUM
2311 | jmp ->fff_resi 2168 | jmp ->fff_resi
2312 |.elif SSE
2313 | cvtsi2sd xmm0, RB; jmp ->fff_resxmm0
2314 |.else 2169 |.else
2315 | mov TMP1, RB; fild TMP1; jmp ->fff_resn 2170 | cvtsi2sd xmm0, RB; jmp ->fff_resxmm0
2316 |.endif 2171 |.endif
2317 | 2172 |
2318 |.ffunc string_char // Only handle the 1-arg case here. 2173 |.ffunc string_char // Only handle the 1-arg case here.
@@ -2324,16 +2179,11 @@ static void build_subroutines(BuildCtx *ctx)
2324 | mov RB, dword [BASE] 2179 | mov RB, dword [BASE]
2325 | cmp RB, 255; ja ->fff_fallback 2180 | cmp RB, 255; ja ->fff_fallback
2326 | mov TMP2, RB 2181 | mov TMP2, RB
2327 |.elif SSE 2182 |.else
2328 | jae ->fff_fallback 2183 | jae ->fff_fallback
2329 | cvttsd2si RB, qword [BASE] 2184 | cvttsd2si RB, qword [BASE]
2330 | cmp RB, 255; ja ->fff_fallback 2185 | cmp RB, 255; ja ->fff_fallback
2331 | mov TMP2, RB 2186 | mov TMP2, RB
2332 |.else
2333 | jae ->fff_fallback
2334 | fld qword [BASE]
2335 | fistp TMP2
2336 | cmp TMP2, 255; ja ->fff_fallback
2337 |.endif 2187 |.endif
2338 |.if X64 2188 |.if X64
2339 | mov TMP3, 1 2189 | mov TMP3, 1
@@ -2371,14 +2221,10 @@ static void build_subroutines(BuildCtx *ctx)
2371 | jne ->fff_fallback 2221 | jne ->fff_fallback
2372 | mov RB, dword [BASE+16] 2222 | mov RB, dword [BASE+16]
2373 | mov TMP2, RB 2223 | mov TMP2, RB
2374 |.elif SSE 2224 |.else
2375 | jae ->fff_fallback 2225 | jae ->fff_fallback
2376 | cvttsd2si RB, qword [BASE+16] 2226 | cvttsd2si RB, qword [BASE+16]
2377 | mov TMP2, RB 2227 | mov TMP2, RB
2378 |.else
2379 | jae ->fff_fallback
2380 | fld qword [BASE+16]
2381 | fistp TMP2
2382 |.endif 2228 |.endif
2383 |1: 2229 |1:
2384 | cmp dword [BASE+4], LJ_TSTR; jne ->fff_fallback 2230 | cmp dword [BASE+4], LJ_TSTR; jne ->fff_fallback
@@ -2393,12 +2239,8 @@ static void build_subroutines(BuildCtx *ctx)
2393 | mov RB, STR:RB->len 2239 | mov RB, STR:RB->len
2394 |.if DUALNUM 2240 |.if DUALNUM
2395 | mov RA, dword [BASE+8] 2241 | mov RA, dword [BASE+8]
2396 |.elif SSE
2397 | cvttsd2si RA, qword [BASE+8]
2398 |.else 2242 |.else
2399 | fld qword [BASE+8] 2243 | cvttsd2si RA, qword [BASE+8]
2400 | fistp ARG3
2401 | mov RA, ARG3
2402 |.endif 2244 |.endif
2403 | mov RC, TMP2 2245 | mov RC, TMP2
2404 | cmp RB, RC // len < end? (unsigned compare) 2246 | cmp RB, RC // len < end? (unsigned compare)
@@ -2451,14 +2293,9 @@ static void build_subroutines(BuildCtx *ctx)
2451 |.if DUALNUM 2293 |.if DUALNUM
2452 | jne ->fff_fallback 2294 | jne ->fff_fallback
2453 | mov RC, dword [BASE+8] 2295 | mov RC, dword [BASE+8]
2454 |.elif SSE
2455 | jae ->fff_fallback
2456 | cvttsd2si RC, qword [BASE+8]
2457 |.else 2296 |.else
2458 | jae ->fff_fallback 2297 | jae ->fff_fallback
2459 | fld qword [BASE+8] 2298 | cvttsd2si RC, qword [BASE+8]
2460 | fistp TMP2
2461 | mov RC, TMP2
2462 |.endif 2299 |.endif
2463 | test RC, RC 2300 | test RC, RC
2464 | jle ->fff_emptystr // Count <= 0? (or non-int) 2301 | jle ->fff_emptystr // Count <= 0? (or non-int)
@@ -2554,10 +2391,8 @@ static void build_subroutines(BuildCtx *ctx)
2554 | mov BASE, RB // Restore BASE. 2391 | mov BASE, RB // Restore BASE.
2555 |.if DUALNUM 2392 |.if DUALNUM
2556 | mov RB, RD; jmp ->fff_resi 2393 | mov RB, RD; jmp ->fff_resi
2557 |.elif SSE
2558 | cvtsi2sd xmm0, RD; jmp ->fff_resxmm0
2559 |.else 2394 |.else
2560 | mov ARG1, RD; fild ARG1; jmp ->fff_resn 2395 | cvtsi2sd xmm0, RD; jmp ->fff_resxmm0
2561 |.endif 2396 |.endif
2562 | 2397 |
2563 |//-- Bit library -------------------------------------------------------- 2398 |//-- Bit library --------------------------------------------------------
@@ -2567,11 +2402,7 @@ static void build_subroutines(BuildCtx *ctx)
2567 |.macro .ffunc_bit, name, kind 2402 |.macro .ffunc_bit, name, kind
2568 | .ffunc_1 name 2403 | .ffunc_1 name
2569 |.if kind == 2 2404 |.if kind == 2
2570 |.if SSE
2571 | sseconst_tobit xmm1, RBa 2405 | sseconst_tobit xmm1, RBa
2572 |.else
2573 | mov TMP1, TOBIT_BIAS
2574 |.endif
2575 |.endif 2406 |.endif
2576 | cmp dword [BASE+4], LJ_TISNUM 2407 | cmp dword [BASE+4], LJ_TISNUM
2577 |.if DUALNUM 2408 |.if DUALNUM
@@ -2587,37 +2418,17 @@ static void build_subroutines(BuildCtx *ctx)
2587 |.else 2418 |.else
2588 | jae ->fff_fallback 2419 | jae ->fff_fallback
2589 |.endif 2420 |.endif
2590 |.if SSE
2591 | movsd xmm0, qword [BASE] 2421 | movsd xmm0, qword [BASE]
2592 |.if kind < 2 2422 |.if kind < 2
2593 | sseconst_tobit xmm1, RBa 2423 | sseconst_tobit xmm1, RBa
2594 |.endif 2424 |.endif
2595 | addsd xmm0, xmm1 2425 | addsd xmm0, xmm1
2596 | movd RB, xmm0 2426 | movd RB, xmm0
2597 |.else
2598 | fld qword [BASE]
2599 |.if kind < 2
2600 | mov TMP1, TOBIT_BIAS
2601 |.endif
2602 | fadd TMP1
2603 | fstp FPARG1
2604 |.if kind > 0
2605 | mov RB, ARG1
2606 |.endif
2607 |.endif
2608 |2: 2427 |2:
2609 |.endmacro 2428 |.endmacro
2610 | 2429 |
2611 |.ffunc_bit bit_tobit, 0 2430 |.ffunc_bit bit_tobit, 0
2612 |.if DUALNUM or SSE
2613 |.if not SSE
2614 | mov RB, ARG1
2615 |.endif
2616 | jmp ->fff_resbit 2431 | jmp ->fff_resbit
2617 |.else
2618 | fild ARG1
2619 | jmp ->fff_resn
2620 |.endif
2621 | 2432 |
2622 |.macro .ffunc_bit_op, name, ins 2433 |.macro .ffunc_bit_op, name, ins
2623 | .ffunc_bit name, 2 2434 | .ffunc_bit name, 2
@@ -2637,17 +2448,10 @@ static void build_subroutines(BuildCtx *ctx)
2637 |.else 2448 |.else
2638 | jae ->fff_fallback_bit_op 2449 | jae ->fff_fallback_bit_op
2639 |.endif 2450 |.endif
2640 |.if SSE
2641 | movsd xmm0, qword [RD] 2451 | movsd xmm0, qword [RD]
2642 | addsd xmm0, xmm1 2452 | addsd xmm0, xmm1
2643 | movd RA, xmm0 2453 | movd RA, xmm0
2644 | ins RB, RA 2454 | ins RB, RA
2645 |.else
2646 | fld qword [RD]
2647 | fadd TMP1
2648 | fstp FPARG1
2649 | ins RB, ARG1
2650 |.endif
2651 | sub RD, 8 2455 | sub RD, 8
2652 | jmp <1 2456 | jmp <1
2653 |.endmacro 2457 |.endmacro
@@ -2664,15 +2468,10 @@ static void build_subroutines(BuildCtx *ctx)
2664 | not RB 2468 | not RB
2665 |.if DUALNUM 2469 |.if DUALNUM
2666 | jmp ->fff_resbit 2470 | jmp ->fff_resbit
2667 |.elif SSE 2471 |.else
2668 |->fff_resbit: 2472 |->fff_resbit:
2669 | cvtsi2sd xmm0, RB 2473 | cvtsi2sd xmm0, RB
2670 | jmp ->fff_resxmm0 2474 | jmp ->fff_resxmm0
2671 |.else
2672 |->fff_resbit:
2673 | mov ARG1, RB
2674 | fild ARG1
2675 | jmp ->fff_resn
2676 |.endif 2475 |.endif
2677 | 2476 |
2678 |->fff_fallback_bit_op: 2477 |->fff_fallback_bit_op:
@@ -2685,22 +2484,13 @@ static void build_subroutines(BuildCtx *ctx)
2685 | // Note: no inline conversion from number for 2nd argument! 2484 | // Note: no inline conversion from number for 2nd argument!
2686 | cmp dword [BASE+12], LJ_TISNUM; jne ->fff_fallback 2485 | cmp dword [BASE+12], LJ_TISNUM; jne ->fff_fallback
2687 | mov RA, dword [BASE+8] 2486 | mov RA, dword [BASE+8]
2688 |.elif SSE 2487 |.else
2689 | .ffunc_nnsse name 2488 | .ffunc_nnsse name
2690 | sseconst_tobit xmm2, RBa 2489 | sseconst_tobit xmm2, RBa
2691 | addsd xmm0, xmm2 2490 | addsd xmm0, xmm2
2692 | addsd xmm1, xmm2 2491 | addsd xmm1, xmm2
2693 | movd RB, xmm0 2492 | movd RB, xmm0
2694 | movd RA, xmm1 2493 | movd RA, xmm1
2695 |.else
2696 | .ffunc_nn name
2697 | mov TMP1, TOBIT_BIAS
2698 | fadd TMP1
2699 | fstp FPARG3
2700 | fadd TMP1
2701 | fstp FPARG1
2702 | mov RA, ARG3
2703 | mov RB, ARG1
2704 |.endif 2494 |.endif
2705 | ins RB, cl // Assumes RA is ecx. 2495 | ins RB, cl // Assumes RA is ecx.
2706 | jmp ->fff_resbit 2496 | jmp ->fff_resbit
@@ -3051,27 +2841,9 @@ static void build_subroutines(BuildCtx *ctx)
3051 |//----------------------------------------------------------------------- 2841 |//-----------------------------------------------------------------------
3052 | 2842 |
3053 |// FP value rounding. Called by math.floor/math.ceil fast functions 2843 |// FP value rounding. Called by math.floor/math.ceil fast functions
3054 |// and from JIT code. 2844 |// and from JIT code. arg/ret is xmm0. xmm0-xmm3 and RD (eax) modified.
3055 | 2845 |.macro vm_round, name, mode
3056 |// x87 variant: Arg/ret on x87 stack. No int/xmm registers modified. 2846 |->name .. _sse:
3057 |.macro vm_round_x87, mode1, mode2
3058 | fnstcw word [esp+4] // Caveat: overwrites ARG1 and ARG2.
3059 | mov [esp+8], eax
3060 | mov ax, mode1
3061 | or ax, [esp+4]
3062 |.if mode2 ~= 0xffff
3063 | and ax, mode2
3064 |.endif
3065 | mov [esp+6], ax
3066 | fldcw word [esp+6]
3067 | frndint
3068 | fldcw word [esp+4]
3069 | mov eax, [esp+8]
3070 | ret
3071 |.endmacro
3072 |
3073 |// SSE variant: arg/ret is xmm0. xmm0-xmm3 and RD (eax) modified.
3074 |.macro vm_round_sse, mode
3075 | sseconst_abs xmm2, RDa 2847 | sseconst_abs xmm2, RDa
3076 | sseconst_2p52 xmm3, RDa 2848 | sseconst_2p52 xmm3, RDa
3077 | movaps xmm1, xmm0 2849 | movaps xmm1, xmm0
@@ -3107,22 +2879,21 @@ static void build_subroutines(BuildCtx *ctx)
3107 | ret 2879 | ret
3108 |.endmacro 2880 |.endmacro
3109 | 2881 |
3110 |.macro vm_round, name, ssemode, mode1, mode2 2882 |->vm_floor:
3111 |->name: 2883 |.if not X64
3112 |.if not SSE 2884 | movsd xmm0, qword [esp+4]
3113 | vm_round_x87 mode1, mode2 2885 | call ->vm_floor_sse
2886 | movsd qword [esp+4], xmm0 // Overwrite callee-owned arg.
2887 | fld qword [esp+4]
2888 | ret
3114 |.endif 2889 |.endif
3115 |->name .. _sse:
3116 | vm_round_sse ssemode
3117 |.endmacro
3118 | 2890 |
3119 | vm_round vm_floor, 0, 0x0400, 0xf7ff 2891 | vm_round vm_floor, 0
3120 | vm_round vm_ceil, 1, 0x0800, 0xfbff 2892 | vm_round vm_ceil, 1
3121 | vm_round vm_trunc, 2, 0x0c00, 0xffff 2893 | vm_round vm_trunc, 2
3122 | 2894 |
3123 |// FP modulo x%y. Called by BC_MOD* and vm_arith. 2895 |// FP modulo x%y. Called by BC_MOD* and vm_arith.
3124 |->vm_mod: 2896 |->vm_mod:
3125 |.if SSE
3126 |// Args in xmm0/xmm1, return value in xmm0. 2897 |// Args in xmm0/xmm1, return value in xmm0.
3127 |// Caveat: xmm0-xmm5 and RC (eax) modified! 2898 |// Caveat: xmm0-xmm5 and RC (eax) modified!
3128 | movaps xmm5, xmm0 2899 | movaps xmm5, xmm0
@@ -3150,23 +2921,6 @@ static void build_subroutines(BuildCtx *ctx)
3150 | movaps xmm0, xmm5 2921 | movaps xmm0, xmm5
3151 | subsd xmm0, xmm1 2922 | subsd xmm0, xmm1
3152 | ret 2923 | ret
3153 |.else
3154 |// Args/ret on x87 stack (y on top). No xmm registers modified.
3155 |// Caveat: needs 3 slots on x87 stack! RC (eax) modified!
3156 | fld st1
3157 | fdiv st1
3158 | fnstcw word [esp+4]
3159 | mov ax, 0x0400
3160 | or ax, [esp+4]
3161 | and ax, 0xf7ff
3162 | mov [esp+6], ax
3163 | fldcw word [esp+6]
3164 | frndint
3165 | fldcw word [esp+4]
3166 | fmulp st1
3167 | fsubp st1
3168 | ret
3169 |.endif
3170 | 2924 |
3171 |// FP log2(x). Called by math.log(x, base). 2925 |// FP log2(x). Called by math.log(x, base).
3172 |->vm_log2: 2926 |->vm_log2:
@@ -3217,96 +2971,6 @@ static void build_subroutines(BuildCtx *ctx)
3217 | 2971 |
3218 |// Generic power function x^y. Called by BC_POW, math.pow fast function, 2972 |// Generic power function x^y. Called by BC_POW, math.pow fast function,
3219 |// and vm_arith. 2973 |// and vm_arith.
3220 |// Args/ret on x87 stack (y on top). RC (eax) modified.
3221 |// Caveat: needs 3 slots on x87 stack!
3222 |->vm_pow:
3223 |.if not SSE
3224 | fist dword [esp+4] // Store/reload int before comparison.
3225 | fild dword [esp+4] // Integral exponent used in vm_powi.
3226 | fucomip st1
3227 | jnz >8 // Branch for FP exponents.
3228 | jp >9 // Branch for NaN exponent.
3229 | fpop // Pop y and fallthrough to vm_powi.
3230 |
3231 |// FP/int power function x^i. Arg1/ret on x87 stack.
3232 |// Arg2 (int) on C stack. RC (eax) modified.
3233 |// Caveat: needs 2 slots on x87 stack!
3234 | mov eax, [esp+4]
3235 | cmp eax, 1; jle >6 // i<=1?
3236 | // Now 1 < (unsigned)i <= 0x80000000.
3237 |1: // Handle leading zeros.
3238 | test eax, 1; jnz >2
3239 | fmul st0
3240 | shr eax, 1
3241 | jmp <1
3242 |2:
3243 | shr eax, 1; jz >5
3244 | fdup
3245 |3: // Handle trailing bits.
3246 | fmul st0
3247 | shr eax, 1; jz >4
3248 | jnc <3
3249 | fmul st1, st0
3250 | jmp <3
3251 |4:
3252 | fmulp st1
3253 |5:
3254 | ret
3255 |6:
3256 | je <5 // x^1 ==> x
3257 | jb >7
3258 | fld1; fdivrp st1
3259 | neg eax
3260 | cmp eax, 1; je <5 // x^-1 ==> 1/x
3261 | jmp <1 // x^-i ==> (1/x)^i
3262 |7:
3263 | fpop; fld1 // x^0 ==> 1
3264 | ret
3265 |
3266 |8: // FP/FP power function x^y.
3267 | fst dword [esp+4]
3268 | fxch
3269 | fst dword [esp+8]
3270 | mov eax, [esp+4]; shl eax, 1
3271 | cmp eax, 0xff000000; je >2 // x^+-Inf?
3272 | mov eax, [esp+8]; shl eax, 1; je >4 // +-0^y?
3273 | cmp eax, 0xff000000; je >4 // +-Inf^y?
3274 | fyl2x
3275 | jmp ->vm_exp2raw
3276 |
3277 |9: // Handle x^NaN.
3278 | fld1
3279 | fucomip st2
3280 | je >1 // 1^NaN ==> 1
3281 | fxch // x^NaN ==> NaN
3282 |1:
3283 | fpop
3284 | ret
3285 |
3286 |2: // Handle x^+-Inf.
3287 | fabs
3288 | fld1
3289 | fucomip st1
3290 | je >3 // +-1^+-Inf ==> 1
3291 | fpop; fabs; fldz; mov eax, 0; setc al
3292 | ror eax, 1; xor eax, [esp+4]; jns >3 // |x|<>1, x^+-Inf ==> +Inf/0
3293 | fxch
3294 |3:
3295 | fpop1; fabs
3296 | ret
3297 |
3298 |4: // Handle +-0^y or +-Inf^y.
3299 | cmp dword [esp+4], 0; jge <3 // y >= 0, x^y ==> |x|
3300 | fpop; fpop
3301 | test eax, eax; jz >5 // y < 0, +-0^y ==> +Inf
3302 | fldz // y < 0, +-Inf^y ==> 0
3303 | ret
3304 |5:
3305 | mov dword [esp+4], 0x7f800000 // Return +Inf.
3306 | fld dword [esp+4]
3307 | ret
3308 |.endif
3309 |
3310 |// Args in xmm0/xmm1. Ret in xmm0. xmm0-xmm2 and RC (eax) modified. 2974 |// Args in xmm0/xmm1. Ret in xmm0. xmm0-xmm2 and RC (eax) modified.
3311 |// Needs 16 byte scratch area for x86. Also called from JIT code. 2975 |// Needs 16 byte scratch area for x86. Also called from JIT code.
3312 |->vm_pow_sse: 2976 |->vm_pow_sse:
@@ -3315,7 +2979,7 @@ static void build_subroutines(BuildCtx *ctx)
3315 | ucomisd xmm1, xmm2 2979 | ucomisd xmm1, xmm2
3316 | jnz >8 // Branch for FP exponents. 2980 | jnz >8 // Branch for FP exponents.
3317 | jp >9 // Branch for NaN exponent. 2981 | jp >9 // Branch for NaN exponent.
3318 | // Fallthrough to vm_powi_sse. 2982 | // Fallthrough.
3319 | 2983 |
3320 |// Args in xmm0/eax. Ret in xmm0. xmm0-xmm1 and eax modified. 2984 |// Args in xmm0/eax. Ret in xmm0. xmm0-xmm1 and eax modified.
3321 |->vm_powi_sse: 2985 |->vm_powi_sse:
@@ -3437,8 +3101,8 @@ static void build_subroutines(BuildCtx *ctx)
3437 | .else 3101 | .else
3438 | .define fpmop, CARG1d 3102 | .define fpmop, CARG1d
3439 | .endif 3103 | .endif
3440 | cmp fpmop, 1; jb ->vm_floor; je ->vm_ceil 3104 | cmp fpmop, 1; jb ->vm_floor_sse; je ->vm_ceil_sse
3441 | cmp fpmop, 3; jb ->vm_trunc; ja >2 3105 | cmp fpmop, 3; jb ->vm_trunc_sse; ja >2
3442 | sqrtsd xmm0, xmm0; ret 3106 | sqrtsd xmm0, xmm0; ret
3443 |2: 3107 |2:
3444 | .if X64WIN 3108 | .if X64WIN
@@ -3478,14 +3142,13 @@ static void build_subroutines(BuildCtx *ctx)
3478 | ret 3142 | ret
3479 |.else // x86 calling convention. 3143 |.else // x86 calling convention.
3480 | .define fpmop, eax 3144 | .define fpmop, eax
3481 |.if SSE
3482 | mov fpmop, [esp+12] 3145 | mov fpmop, [esp+12]
3483 | movsd xmm0, qword [esp+4] 3146 | movsd xmm0, qword [esp+4]
3484 | cmp fpmop, 1; je >1; ja >2 3147 | cmp fpmop, 1; je >1; ja >2
3485 | call ->vm_floor; jmp >7 3148 | call ->vm_floor_sse; jmp >7
3486 |1: ; call ->vm_ceil; jmp >7 3149 |1: ; call ->vm_ceil_sse; jmp >7
3487 |2: ; cmp fpmop, 3; je >1; ja >2 3150 |2: ; cmp fpmop, 3; je >1; ja >2
3488 | call ->vm_trunc; jmp >7 3151 | call ->vm_trunc_sse; jmp >7
3489 |1: 3152 |1:
3490 | sqrtsd xmm0, xmm0 3153 | sqrtsd xmm0, xmm0
3491 |7: 3154 |7:
@@ -3503,23 +3166,6 @@ static void build_subroutines(BuildCtx *ctx)
3503 |2: ; cmp fpmop, 11; je >1; ja >9 3166 |2: ; cmp fpmop, 11; je >1; ja >9
3504 | fcos; ret 3167 | fcos; ret
3505 |1: ; fptan; fpop; ret 3168 |1: ; fptan; fpop; ret
3506 |.else
3507 | mov fpmop, [esp+12]
3508 | fld qword [esp+4]
3509 | cmp fpmop, 1; jb ->vm_floor; je ->vm_ceil
3510 | cmp fpmop, 3; jb ->vm_trunc; ja >2
3511 | fsqrt; ret
3512 |2: ; cmp fpmop, 5; jb ->vm_exp_x87; je ->vm_exp2_x87
3513 | cmp fpmop, 7; je >1; ja >2
3514 | fldln2; fxch; fyl2x; ret
3515 |1: ; fld1; fxch; fyl2x; ret
3516 |2: ; cmp fpmop, 9; je >1; ja >2
3517 | fldlg2; fxch; fyl2x; ret
3518 |1: ; fsin; ret
3519 |2: ; cmp fpmop, 11; je >1; ja >9
3520 | fcos; ret
3521 |1: ; fptan; fpop; ret
3522 |.endif
3523 |.endif 3169 |.endif
3524 |9: ; int3 // Bad fpm. 3170 |9: ; int3 // Bad fpm.
3525 |.endif 3171 |.endif
@@ -3541,7 +3187,7 @@ static void build_subroutines(BuildCtx *ctx)
3541 |2: ; cmp foldop, 3; je >1; ja >2 3187 |2: ; cmp foldop, 3; je >1; ja >2
3542 | mulsd xmm0, xmm1; ret 3188 | mulsd xmm0, xmm1; ret
3543 |1: ; divsd xmm0, xmm1; ret 3189 |1: ; divsd xmm0, xmm1; ret
3544 |2: ; cmp foldop, 5; jb ->vm_mod; je ->vm_pow 3190 |2: ; cmp foldop, 5; jb ->vm_mod; je ->vm_pow_sse
3545 | cmp foldop, 7; je >1; ja >2 3191 | cmp foldop, 7; je >1; ja >2
3546 | sseconst_sign xmm1, RDa; xorps xmm0, xmm1; ret 3192 | sseconst_sign xmm1, RDa; xorps xmm0, xmm1; ret
3547 |1: ; sseconst_abs xmm1, RDa; andps xmm0, xmm1; ret 3193 |1: ; sseconst_abs xmm1, RDa; andps xmm0, xmm1; ret
@@ -3574,7 +3220,7 @@ static void build_subroutines(BuildCtx *ctx)
3574 |1: ; maxsd xmm0, xmm1; ret 3220 |1: ; maxsd xmm0, xmm1; ret
3575 |9: ; int3 // Bad op. 3221 |9: ; int3 // Bad op.
3576 | 3222 |
3577 |.elif SSE // x86 calling convention with SSE ops. 3223 |.else // x86 calling convention.
3578 | 3224 |
3579 | .define foldop, eax 3225 | .define foldop, eax
3580 | mov foldop, [esp+20] 3226 | mov foldop, [esp+20]
@@ -3593,7 +3239,7 @@ static void build_subroutines(BuildCtx *ctx)
3593 |2: ; cmp foldop, 5 3239 |2: ; cmp foldop, 5
3594 | je >1; ja >2 3240 | je >1; ja >2
3595 | call ->vm_mod; jmp <7 3241 | call ->vm_mod; jmp <7
3596 |1: ; pop edx; call ->vm_pow; push edx; jmp <7 // Writes to scratch area. 3242 |1: ; pop edx; call ->vm_pow_sse; push edx; jmp <7 // Writes to scratch area.
3597 |2: ; cmp foldop, 7; je >1; ja >2 3243 |2: ; cmp foldop, 7; je >1; ja >2
3598 | sseconst_sign xmm1, RDa; xorps xmm0, xmm1; jmp <7 3244 | sseconst_sign xmm1, RDa; xorps xmm0, xmm1; jmp <7
3599 |1: ; sseconst_abs xmm1, RDa; andps xmm0, xmm1; jmp <7 3245 |1: ; sseconst_abs xmm1, RDa; andps xmm0, xmm1; jmp <7
@@ -3608,29 +3254,6 @@ static void build_subroutines(BuildCtx *ctx)
3608 |1: ; maxsd xmm0, xmm1; jmp <7 3254 |1: ; maxsd xmm0, xmm1; jmp <7
3609 |9: ; int3 // Bad op. 3255 |9: ; int3 // Bad op.
3610 | 3256 |
3611 |.else // x86 calling convention with x87 ops.
3612 |
3613 | mov eax, [esp+20]
3614 | fld qword [esp+4]
3615 | fld qword [esp+12]
3616 | cmp eax, 1; je >1; ja >2
3617 | faddp st1; ret
3618 |1: ; fsubp st1; ret
3619 |2: ; cmp eax, 3; je >1; ja >2
3620 | fmulp st1; ret
3621 |1: ; fdivp st1; ret
3622 |2: ; cmp eax, 5; jb ->vm_mod; je ->vm_pow
3623 | cmp eax, 7; je >1; ja >2
3624 | fpop; fchs; ret
3625 |1: ; fpop; fabs; ret
3626 |2: ; cmp eax, 9; je >1; ja >2
3627 | fpatan; ret
3628 |1: ; fxch; fscale; fpop1; ret
3629 |2: ; cmp eax, 11; je >1; ja >9
3630 | fucomi st1; fcmovnbe st1; fpop1; ret
3631 |1: ; fucomi st1; fcmovbe st1; fpop1; ret
3632 |9: ; int3 // Bad op.
3633 |
3634 |.endif 3257 |.endif
3635 | 3258 |
3636 |//----------------------------------------------------------------------- 3259 |//-----------------------------------------------------------------------
@@ -3943,19 +3566,12 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
3943 | // RA is a number. 3566 | // RA is a number.
3944 | cmp dword [BASE+RD*8+4], LJ_TISNUM; jb >1; jne ->vmeta_comp 3567 | cmp dword [BASE+RD*8+4], LJ_TISNUM; jb >1; jne ->vmeta_comp
3945 | // RA is a number, RD is an integer. 3568 | // RA is a number, RD is an integer.
3946 |.if SSE
3947 | cvtsi2sd xmm0, dword [BASE+RD*8] 3569 | cvtsi2sd xmm0, dword [BASE+RD*8]
3948 | jmp >2 3570 | jmp >2
3949 |.else
3950 | fld qword [BASE+RA*8]
3951 | fild dword [BASE+RD*8]
3952 | jmp >3
3953 |.endif
3954 | 3571 |
3955 |8: // RA is an integer, RD is not an integer. 3572 |8: // RA is an integer, RD is not an integer.
3956 | ja ->vmeta_comp 3573 | ja ->vmeta_comp
3957 | // RA is an integer, RD is a number. 3574 | // RA is an integer, RD is a number.
3958 |.if SSE
3959 | cvtsi2sd xmm1, dword [BASE+RA*8] 3575 | cvtsi2sd xmm1, dword [BASE+RA*8]
3960 | movsd xmm0, qword [BASE+RD*8] 3576 | movsd xmm0, qword [BASE+RD*8]
3961 | add PC, 4 3577 | add PC, 4
@@ -3963,29 +3579,15 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
3963 | jmp_comp jbe, ja, jb, jae, <9 3579 | jmp_comp jbe, ja, jb, jae, <9
3964 | jmp <6 3580 | jmp <6
3965 |.else 3581 |.else
3966 | fild dword [BASE+RA*8]
3967 | jmp >2
3968 |.endif
3969 |.else
3970 | checknum RA, ->vmeta_comp 3582 | checknum RA, ->vmeta_comp
3971 | checknum RD, ->vmeta_comp 3583 | checknum RD, ->vmeta_comp
3972 |.endif 3584 |.endif
3973 |.if SSE
3974 |1: 3585 |1:
3975 | movsd xmm0, qword [BASE+RD*8] 3586 | movsd xmm0, qword [BASE+RD*8]
3976 |2: 3587 |2:
3977 | add PC, 4 3588 | add PC, 4
3978 | ucomisd xmm0, qword [BASE+RA*8] 3589 | ucomisd xmm0, qword [BASE+RA*8]
3979 |3: 3590 |3:
3980 |.else
3981 |1:
3982 | fld qword [BASE+RA*8] // Reverse order, i.e like cmp D, A.
3983 |2:
3984 | fld qword [BASE+RD*8]
3985 |3:
3986 | add PC, 4
3987 | fcomparepp
3988 |.endif
3989 | // Unordered: all of ZF CF PF set, ordered: PF clear. 3591 | // Unordered: all of ZF CF PF set, ordered: PF clear.
3990 | // To preserve NaN semantics GE/GT branch on unordered, but LT/LE don't. 3592 | // To preserve NaN semantics GE/GT branch on unordered, but LT/LE don't.
3991 |.if DUALNUM 3593 |.if DUALNUM
@@ -4025,43 +3627,25 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
4025 | // RD is a number. 3627 | // RD is a number.
4026 | cmp dword [BASE+RA*8+4], LJ_TISNUM; jb >1; jne >5 3628 | cmp dword [BASE+RA*8+4], LJ_TISNUM; jb >1; jne >5
4027 | // RD is a number, RA is an integer. 3629 | // RD is a number, RA is an integer.
4028 |.if SSE
4029 | cvtsi2sd xmm0, dword [BASE+RA*8] 3630 | cvtsi2sd xmm0, dword [BASE+RA*8]
4030 |.else
4031 | fild dword [BASE+RA*8]
4032 |.endif
4033 | jmp >2 3631 | jmp >2
4034 | 3632 |
4035 |8: // RD is an integer, RA is not an integer. 3633 |8: // RD is an integer, RA is not an integer.
4036 | ja >5 3634 | ja >5
4037 | // RD is an integer, RA is a number. 3635 | // RD is an integer, RA is a number.
4038 |.if SSE
4039 | cvtsi2sd xmm0, dword [BASE+RD*8] 3636 | cvtsi2sd xmm0, dword [BASE+RD*8]
4040 | ucomisd xmm0, qword [BASE+RA*8] 3637 | ucomisd xmm0, qword [BASE+RA*8]
4041 |.else
4042 | fild dword [BASE+RD*8]
4043 | fld qword [BASE+RA*8]
4044 |.endif
4045 | jmp >4 3638 | jmp >4
4046 | 3639 |
4047 |.else 3640 |.else
4048 | cmp RB, LJ_TISNUM; jae >5 3641 | cmp RB, LJ_TISNUM; jae >5
4049 | checknum RA, >5 3642 | checknum RA, >5
4050 |.endif 3643 |.endif
4051 |.if SSE
4052 |1: 3644 |1:
4053 | movsd xmm0, qword [BASE+RA*8] 3645 | movsd xmm0, qword [BASE+RA*8]
4054 |2: 3646 |2:
4055 | ucomisd xmm0, qword [BASE+RD*8] 3647 | ucomisd xmm0, qword [BASE+RD*8]
4056 |4: 3648 |4:
4057 |.else
4058 |1:
4059 | fld qword [BASE+RA*8]
4060 |2:
4061 | fld qword [BASE+RD*8]
4062 |4:
4063 | fcomparepp
4064 |.endif
4065 iseqne_fp: 3649 iseqne_fp:
4066 if (vk) { 3650 if (vk) {
4067 | jp >2 // Unordered means not equal. 3651 | jp >2 // Unordered means not equal.
@@ -4184,39 +3768,21 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
4184 | // RA is a number. 3768 | // RA is a number.
4185 | cmp dword [KBASE+RD*8+4], LJ_TISNUM; jb >1 3769 | cmp dword [KBASE+RD*8+4], LJ_TISNUM; jb >1
4186 | // RA is a number, RD is an integer. 3770 | // RA is a number, RD is an integer.
4187 |.if SSE
4188 | cvtsi2sd xmm0, dword [KBASE+RD*8] 3771 | cvtsi2sd xmm0, dword [KBASE+RD*8]
4189 |.else
4190 | fild dword [KBASE+RD*8]
4191 |.endif
4192 | jmp >2 3772 | jmp >2
4193 | 3773 |
4194 |8: // RA is an integer, RD is a number. 3774 |8: // RA is an integer, RD is a number.
4195 |.if SSE
4196 | cvtsi2sd xmm0, dword [BASE+RA*8] 3775 | cvtsi2sd xmm0, dword [BASE+RA*8]
4197 | ucomisd xmm0, qword [KBASE+RD*8] 3776 | ucomisd xmm0, qword [KBASE+RD*8]
4198 |.else
4199 | fild dword [BASE+RA*8]
4200 | fld qword [KBASE+RD*8]
4201 |.endif
4202 | jmp >4 3777 | jmp >4
4203 |.else 3778 |.else
4204 | cmp RB, LJ_TISNUM; jae >3 3779 | cmp RB, LJ_TISNUM; jae >3
4205 |.endif 3780 |.endif
4206 |.if SSE
4207 |1: 3781 |1:
4208 | movsd xmm0, qword [KBASE+RD*8] 3782 | movsd xmm0, qword [KBASE+RD*8]
4209 |2: 3783 |2:
4210 | ucomisd xmm0, qword [BASE+RA*8] 3784 | ucomisd xmm0, qword [BASE+RA*8]
4211 |4: 3785 |4:
4212 |.else
4213 |1:
4214 | fld qword [KBASE+RD*8]
4215 |2:
4216 | fld qword [BASE+RA*8]
4217 |4:
4218 | fcomparepp
4219 |.endif
4220 goto iseqne_fp; 3786 goto iseqne_fp;
4221 case BC_ISEQP: case BC_ISNEP: 3787 case BC_ISEQP: case BC_ISNEP:
4222 vk = op == BC_ISEQP; 3788 vk = op == BC_ISEQP;
@@ -4310,16 +3876,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
4310 |.else 3876 |.else
4311 | checknum RD, ->vmeta_unm 3877 | checknum RD, ->vmeta_unm
4312 |.endif 3878 |.endif
4313 |.if SSE
4314 | movsd xmm0, qword [BASE+RD*8] 3879 | movsd xmm0, qword [BASE+RD*8]
4315 | sseconst_sign xmm1, RDa 3880 | sseconst_sign xmm1, RDa
4316 | xorps xmm0, xmm1 3881 | xorps xmm0, xmm1
4317 | movsd qword [BASE+RA*8], xmm0 3882 | movsd qword [BASE+RA*8], xmm0
4318 |.else
4319 | fld qword [BASE+RD*8]
4320 | fchs
4321 | fstp qword [BASE+RA*8]
4322 |.endif
4323 |.if DUALNUM 3883 |.if DUALNUM
4324 | jmp <9 3884 | jmp <9
4325 |.else 3885 |.else
@@ -4335,15 +3895,11 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
4335 |1: 3895 |1:
4336 | mov dword [BASE+RA*8+4], LJ_TISNUM 3896 | mov dword [BASE+RA*8+4], LJ_TISNUM
4337 | mov dword [BASE+RA*8], RD 3897 | mov dword [BASE+RA*8], RD
4338 |.elif SSE 3898 |.else
4339 | xorps xmm0, xmm0 3899 | xorps xmm0, xmm0
4340 | cvtsi2sd xmm0, dword STR:RD->len 3900 | cvtsi2sd xmm0, dword STR:RD->len
4341 |1: 3901 |1:
4342 | movsd qword [BASE+RA*8], xmm0 3902 | movsd qword [BASE+RA*8], xmm0
4343 |.else
4344 | fild dword STR:RD->len
4345 |1:
4346 | fstp qword [BASE+RA*8]
4347 |.endif 3903 |.endif
4348 | ins_next 3904 | ins_next
4349 |2: 3905 |2:
@@ -4361,11 +3917,8 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
4361 | // Length of table returned in eax (RD). 3917 | // Length of table returned in eax (RD).
4362 |.if DUALNUM 3918 |.if DUALNUM
4363 | // Nothing to do. 3919 | // Nothing to do.
4364 |.elif SSE
4365 | cvtsi2sd xmm0, RD
4366 |.else 3920 |.else
4367 | mov ARG1, RD 3921 | cvtsi2sd xmm0, RD
4368 | fild ARG1
4369 |.endif 3922 |.endif
4370 | mov BASE, RB // Restore BASE. 3923 | mov BASE, RB // Restore BASE.
4371 | movzx RA, PC_RA 3924 | movzx RA, PC_RA
@@ -4380,7 +3933,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
4380 3933
4381 /* -- Binary ops -------------------------------------------------------- */ 3934 /* -- Binary ops -------------------------------------------------------- */
4382 3935
4383 |.macro ins_arithpre, x87ins, sseins, ssereg 3936 |.macro ins_arithpre, sseins, ssereg
4384 | ins_ABC 3937 | ins_ABC
4385 ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN); 3938 ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN);
4386 ||switch (vk) { 3939 ||switch (vk) {
@@ -4389,37 +3942,22 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
4389 | .if DUALNUM 3942 | .if DUALNUM
4390 | cmp dword [KBASE+RC*8+4], LJ_TISNUM; jae ->vmeta_arith_vn 3943 | cmp dword [KBASE+RC*8+4], LJ_TISNUM; jae ->vmeta_arith_vn
4391 | .endif 3944 | .endif
4392 | .if SSE 3945 | movsd xmm0, qword [BASE+RB*8]
4393 | movsd xmm0, qword [BASE+RB*8] 3946 | sseins ssereg, qword [KBASE+RC*8]
4394 | sseins ssereg, qword [KBASE+RC*8]
4395 | .else
4396 | fld qword [BASE+RB*8]
4397 | x87ins qword [KBASE+RC*8]
4398 | .endif
4399 || break; 3947 || break;
4400 ||case 1: 3948 ||case 1:
4401 | checknum RB, ->vmeta_arith_nv 3949 | checknum RB, ->vmeta_arith_nv
4402 | .if DUALNUM 3950 | .if DUALNUM
4403 | cmp dword [KBASE+RC*8+4], LJ_TISNUM; jae ->vmeta_arith_nv 3951 | cmp dword [KBASE+RC*8+4], LJ_TISNUM; jae ->vmeta_arith_nv
4404 | .endif 3952 | .endif
4405 | .if SSE 3953 | movsd xmm0, qword [KBASE+RC*8]
4406 | movsd xmm0, qword [KBASE+RC*8] 3954 | sseins ssereg, qword [BASE+RB*8]
4407 | sseins ssereg, qword [BASE+RB*8]
4408 | .else
4409 | fld qword [KBASE+RC*8]
4410 | x87ins qword [BASE+RB*8]
4411 | .endif
4412 || break; 3955 || break;
4413 ||default: 3956 ||default:
4414 | checknum RB, ->vmeta_arith_vv 3957 | checknum RB, ->vmeta_arith_vv
4415 | checknum RC, ->vmeta_arith_vv 3958 | checknum RC, ->vmeta_arith_vv
4416 | .if SSE 3959 | movsd xmm0, qword [BASE+RB*8]
4417 | movsd xmm0, qword [BASE+RB*8] 3960 | sseins ssereg, qword [BASE+RC*8]
4418 | sseins ssereg, qword [BASE+RC*8]
4419 | .else
4420 | fld qword [BASE+RB*8]
4421 | x87ins qword [BASE+RC*8]
4422 | .endif
4423 || break; 3961 || break;
4424 ||} 3962 ||}
4425 |.endmacro 3963 |.endmacro
@@ -4457,54 +3995,50 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
4457 |.endmacro 3995 |.endmacro
4458 | 3996 |
4459 |.macro ins_arithpost 3997 |.macro ins_arithpost
4460 |.if SSE
4461 | movsd qword [BASE+RA*8], xmm0 3998 | movsd qword [BASE+RA*8], xmm0
4462 |.else
4463 | fstp qword [BASE+RA*8]
4464 |.endif
4465 |.endmacro 3999 |.endmacro
4466 | 4000 |
4467 |.macro ins_arith, x87ins, sseins 4001 |.macro ins_arith, sseins
4468 | ins_arithpre x87ins, sseins, xmm0 4002 | ins_arithpre sseins, xmm0
4469 | ins_arithpost 4003 | ins_arithpost
4470 | ins_next 4004 | ins_next
4471 |.endmacro 4005 |.endmacro
4472 | 4006 |
4473 |.macro ins_arith, intins, x87ins, sseins 4007 |.macro ins_arith, intins, sseins
4474 |.if DUALNUM 4008 |.if DUALNUM
4475 | ins_arithdn intins 4009 | ins_arithdn intins
4476 |.else 4010 |.else
4477 | ins_arith, x87ins, sseins 4011 | ins_arith, sseins
4478 |.endif 4012 |.endif
4479 |.endmacro 4013 |.endmacro
4480 4014
4481 | // RA = dst, RB = src1 or num const, RC = src2 or num const 4015 | // RA = dst, RB = src1 or num const, RC = src2 or num const
4482 case BC_ADDVN: case BC_ADDNV: case BC_ADDVV: 4016 case BC_ADDVN: case BC_ADDNV: case BC_ADDVV:
4483 | ins_arith add, fadd, addsd 4017 | ins_arith add, addsd
4484 break; 4018 break;
4485 case BC_SUBVN: case BC_SUBNV: case BC_SUBVV: 4019 case BC_SUBVN: case BC_SUBNV: case BC_SUBVV:
4486 | ins_arith sub, fsub, subsd 4020 | ins_arith sub, subsd
4487 break; 4021 break;
4488 case BC_MULVN: case BC_MULNV: case BC_MULVV: 4022 case BC_MULVN: case BC_MULNV: case BC_MULVV:
4489 | ins_arith imul, fmul, mulsd 4023 | ins_arith imul, mulsd
4490 break; 4024 break;
4491 case BC_DIVVN: case BC_DIVNV: case BC_DIVVV: 4025 case BC_DIVVN: case BC_DIVNV: case BC_DIVVV:
4492 | ins_arith fdiv, divsd 4026 | ins_arith divsd
4493 break; 4027 break;
4494 case BC_MODVN: 4028 case BC_MODVN:
4495 | ins_arithpre fld, movsd, xmm1 4029 | ins_arithpre movsd, xmm1
4496 |->BC_MODVN_Z: 4030 |->BC_MODVN_Z:
4497 | call ->vm_mod 4031 | call ->vm_mod
4498 | ins_arithpost 4032 | ins_arithpost
4499 | ins_next 4033 | ins_next
4500 break; 4034 break;
4501 case BC_MODNV: case BC_MODVV: 4035 case BC_MODNV: case BC_MODVV:
4502 | ins_arithpre fld, movsd, xmm1 4036 | ins_arithpre movsd, xmm1
4503 | jmp ->BC_MODVN_Z // Avoid 3 copies. It's slow anyway. 4037 | jmp ->BC_MODVN_Z // Avoid 3 copies. It's slow anyway.
4504 break; 4038 break;
4505 case BC_POW: 4039 case BC_POW:
4506 | ins_arithpre fld, movsd, xmm1 4040 | ins_arithpre movsd, xmm1
4507 | call ->vm_pow 4041 | call ->vm_pow_sse
4508 | ins_arithpost 4042 | ins_arithpost
4509 | ins_next 4043 | ins_next
4510 break; 4044 break;
@@ -4573,25 +4107,17 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
4573 | movsx RD, RDW 4107 | movsx RD, RDW
4574 | mov dword [BASE+RA*8+4], LJ_TISNUM 4108 | mov dword [BASE+RA*8+4], LJ_TISNUM
4575 | mov dword [BASE+RA*8], RD 4109 | mov dword [BASE+RA*8], RD
4576 |.elif SSE 4110 |.else
4577 | movsx RD, RDW // Sign-extend literal. 4111 | movsx RD, RDW // Sign-extend literal.
4578 | cvtsi2sd xmm0, RD 4112 | cvtsi2sd xmm0, RD
4579 | movsd qword [BASE+RA*8], xmm0 4113 | movsd qword [BASE+RA*8], xmm0
4580 |.else
4581 | fild PC_RD // Refetch signed RD from instruction.
4582 | fstp qword [BASE+RA*8]
4583 |.endif 4114 |.endif
4584 | ins_next 4115 | ins_next
4585 break; 4116 break;
4586 case BC_KNUM: 4117 case BC_KNUM:
4587 | ins_AD // RA = dst, RD = num const 4118 | ins_AD // RA = dst, RD = num const
4588 |.if SSE
4589 | movsd xmm0, qword [KBASE+RD*8] 4119 | movsd xmm0, qword [KBASE+RD*8]
4590 | movsd qword [BASE+RA*8], xmm0 4120 | movsd qword [BASE+RA*8], xmm0
4591 |.else
4592 | fld qword [KBASE+RD*8]
4593 | fstp qword [BASE+RA*8]
4594 |.endif
4595 | ins_next 4121 | ins_next
4596 break; 4122 break;
4597 case BC_KPRI: 4123 case BC_KPRI:
@@ -4698,18 +4224,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
4698 case BC_USETN: 4224 case BC_USETN:
4699 | ins_AD // RA = upvalue #, RD = num const 4225 | ins_AD // RA = upvalue #, RD = num const
4700 | mov LFUNC:RB, [BASE-8] 4226 | mov LFUNC:RB, [BASE-8]
4701 |.if SSE
4702 | movsd xmm0, qword [KBASE+RD*8] 4227 | movsd xmm0, qword [KBASE+RD*8]
4703 |.else
4704 | fld qword [KBASE+RD*8]
4705 |.endif
4706 | mov UPVAL:RB, [LFUNC:RB+RA*4+offsetof(GCfuncL, uvptr)] 4228 | mov UPVAL:RB, [LFUNC:RB+RA*4+offsetof(GCfuncL, uvptr)]
4707 | mov RA, UPVAL:RB->v 4229 | mov RA, UPVAL:RB->v
4708 |.if SSE
4709 | movsd qword [RA], xmm0 4230 | movsd qword [RA], xmm0
4710 |.else
4711 | fstp qword [RA]
4712 |.endif
4713 | ins_next 4231 | ins_next
4714 break; 4232 break;
4715 case BC_USETP: 4233 case BC_USETP:
@@ -4863,18 +4381,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
4863 |.else 4381 |.else
4864 | // Convert number to int and back and compare. 4382 | // Convert number to int and back and compare.
4865 | checknum RC, >5 4383 | checknum RC, >5
4866 |.if SSE
4867 | movsd xmm0, qword [BASE+RC*8] 4384 | movsd xmm0, qword [BASE+RC*8]
4868 | cvtsd2si RC, xmm0 4385 | cvtsd2si RC, xmm0
4869 | cvtsi2sd xmm1, RC 4386 | cvtsi2sd xmm1, RC
4870 | ucomisd xmm0, xmm1 4387 | ucomisd xmm0, xmm1
4871 |.else
4872 | fld qword [BASE+RC*8]
4873 | fist ARG1
4874 | fild ARG1
4875 | fcomparepp
4876 | mov RC, ARG1
4877 |.endif
4878 | jne ->vmeta_tgetv // Generic numeric key? Use fallback. 4388 | jne ->vmeta_tgetv // Generic numeric key? Use fallback.
4879 |.endif 4389 |.endif
4880 | cmp RC, TAB:RB->asize // Takes care of unordered, too. 4390 | cmp RC, TAB:RB->asize // Takes care of unordered, too.
@@ -5011,18 +4521,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
5011 |.else 4521 |.else
5012 | // Convert number to int and back and compare. 4522 | // Convert number to int and back and compare.
5013 | checknum RC, >5 4523 | checknum RC, >5
5014 |.if SSE
5015 | movsd xmm0, qword [BASE+RC*8] 4524 | movsd xmm0, qword [BASE+RC*8]
5016 | cvtsd2si RC, xmm0 4525 | cvtsd2si RC, xmm0
5017 | cvtsi2sd xmm1, RC 4526 | cvtsi2sd xmm1, RC
5018 | ucomisd xmm0, xmm1 4527 | ucomisd xmm0, xmm1
5019 |.else
5020 | fld qword [BASE+RC*8]
5021 | fist ARG1
5022 | fild ARG1
5023 | fcomparepp
5024 | mov RC, ARG1
5025 |.endif
5026 | jne ->vmeta_tsetv // Generic numeric key? Use fallback. 4528 | jne ->vmeta_tsetv // Generic numeric key? Use fallback.
5027 |.endif 4529 |.endif
5028 | cmp RC, TAB:RB->asize // Takes care of unordered, too. 4530 | cmp RC, TAB:RB->asize // Takes care of unordered, too.
@@ -5386,10 +4888,8 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
5386 |.if DUALNUM 4888 |.if DUALNUM
5387 | mov dword [BASE+RA*8+4], LJ_TISNUM 4889 | mov dword [BASE+RA*8+4], LJ_TISNUM
5388 | mov dword [BASE+RA*8], RC 4890 | mov dword [BASE+RA*8], RC
5389 |.elif SSE
5390 | cvtsi2sd xmm0, RC
5391 |.else 4891 |.else
5392 | fild dword [BASE+RA*8-8] 4892 | cvtsi2sd xmm0, RC
5393 |.endif 4893 |.endif
5394 | // Copy array slot to returned value. 4894 | // Copy array slot to returned value.
5395 |.if X64 4895 |.if X64
@@ -5405,10 +4905,8 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
5405 | // Return array index as a numeric key. 4905 | // Return array index as a numeric key.
5406 |.if DUALNUM 4906 |.if DUALNUM
5407 | // See above. 4907 | // See above.
5408 |.elif SSE
5409 | movsd qword [BASE+RA*8], xmm0
5410 |.else 4908 |.else
5411 | fstp qword [BASE+RA*8] 4909 | movsd qword [BASE+RA*8], xmm0
5412 |.endif 4910 |.endif
5413 | mov [BASE+RA*8-8], RC // Update control var. 4911 | mov [BASE+RA*8-8], RC // Update control var.
5414 |2: 4912 |2:
@@ -5421,9 +4919,6 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
5421 | 4919 |
5422 |4: // Skip holes in array part. 4920 |4: // Skip holes in array part.
5423 | add RC, 1 4921 | add RC, 1
5424 |.if not (DUALNUM or SSE)
5425 | mov [BASE+RA*8-8], RC
5426 |.endif
5427 | jmp <1 4922 | jmp <1
5428 | 4923 |
5429 |5: // Traverse hash part. 4924 |5: // Traverse hash part.
@@ -5757,7 +5252,6 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
5757 if (!vk) { 5252 if (!vk) {
5758 | cmp RB, LJ_TISNUM; jae ->vmeta_for 5253 | cmp RB, LJ_TISNUM; jae ->vmeta_for
5759 } 5254 }
5760 |.if SSE
5761 | movsd xmm0, qword FOR_IDX 5255 | movsd xmm0, qword FOR_IDX
5762 | movsd xmm1, qword FOR_STOP 5256 | movsd xmm1, qword FOR_STOP
5763 if (vk) { 5257 if (vk) {
@@ -5770,22 +5264,6 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
5770 | ucomisd xmm1, xmm0 5264 | ucomisd xmm1, xmm0
5771 |1: 5265 |1:
5772 | movsd qword FOR_EXT, xmm0 5266 | movsd qword FOR_EXT, xmm0
5773 |.else
5774 | fld qword FOR_STOP
5775 | fld qword FOR_IDX
5776 if (vk) {
5777 | fadd qword FOR_STEP // nidx = idx + step
5778 | fst qword FOR_IDX
5779 | fst qword FOR_EXT
5780 | test RB, RB; js >1
5781 } else {
5782 | fst qword FOR_EXT
5783 | jl >1
5784 }
5785 | fxch // Swap lim/(n)idx if step non-negative.
5786 |1:
5787 | fcomparepp
5788 |.endif
5789 if (op == BC_FORI) { 5267 if (op == BC_FORI) {
5790 |.if DUALNUM 5268 |.if DUALNUM
5791 | jnb <7 5269 | jnb <7
@@ -5813,11 +5291,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
5813 |2: 5291 |2:
5814 | ins_next 5292 | ins_next
5815 |.endif 5293 |.endif
5816 |.if SSE 5294 |
5817 |3: // Invert comparison if step is negative. 5295 |3: // Invert comparison if step is negative.
5818 | ucomisd xmm0, xmm1 5296 | ucomisd xmm0, xmm1
5819 | jmp <1 5297 | jmp <1
5820 |.endif
5821 break; 5298 break;
5822 5299
5823 case BC_ITERL: 5300 case BC_ITERL: