summaryrefslogtreecommitdiff
path: root/src/buildvm_x86.dasc
diff options
context:
space:
mode:
Diffstat (limited to 'src/buildvm_x86.dasc')
-rw-r--r--src/buildvm_x86.dasc683
1 files changed, 531 insertions, 152 deletions
diff --git a/src/buildvm_x86.dasc b/src/buildvm_x86.dasc
index b220c58f..58767c1e 100644
--- a/src/buildvm_x86.dasc
+++ b/src/buildvm_x86.dasc
@@ -34,6 +34,7 @@
34|.if X64; .define RAa, rcx; .else; .define RAa, RA; .endif 34|.if X64; .define RAa, rcx; .else; .define RAa, RA; .endif
35|.define RAL, cl 35|.define RAL, cl
36|.define RB, ebp // Must be ebp (C callee-save). 36|.define RB, ebp // Must be ebp (C callee-save).
37|.if X64; .define RBa, rbp; .else; .define RBa, RB; .endif
37|.define RC, eax // Must be eax (fcomparepp and others). 38|.define RC, eax // Must be eax (fcomparepp and others).
38|.define RCW, ax 39|.define RCW, ax
39|.define RCH, ah 40|.define RCH, ah
@@ -41,6 +42,7 @@
41|.define OP, RB 42|.define OP, RB
42|.define RD, RC 43|.define RD, RC
43|.if X64; .define RDa, rax; .else; .define RDa, RD; .endif 44|.if X64; .define RDa, rax; .else; .define RDa, RD; .endif
45|.define RDW, RCW
44|.define RDL, RCL 46|.define RDL, RCL
45| 47|
46|.if not X64 48|.if not X64
@@ -323,14 +325,6 @@
323|.macro fpop1; fstp st1; .endmacro 325|.macro fpop1; fstp st1; .endmacro
324| 326|
325|// Synthesize SSE FP constants. 327|// Synthesize SSE FP constants.
326|.macro sseconst_sign, reg, tmp // Synthesize sign mask.
327|.if X64
328| mov64 tmp, U64x(80000000,00000000); movd reg, tmp
329|.else
330| mov tmp, 0x80000000; movd xmm1, tmp; pshufd reg, reg, 0x51
331|.endif
332|.endmacro
333|
334|.macro sseconst_abs, reg, tmp // Synthesize abs mask. 328|.macro sseconst_abs, reg, tmp // Synthesize abs mask.
335|.if X64 329|.if X64
336| mov64 tmp, U64x(7fffffff,ffffffff); movd reg, tmp 330| mov64 tmp, U64x(7fffffff,ffffffff); movd reg, tmp
@@ -339,21 +333,28 @@
339|.endif 333|.endif
340|.endmacro 334|.endmacro
341| 335|
342|.macro sseconst_1, reg, tmp // Synthesize 1.0. 336|.macro sseconst_hi, reg, tmp, val // Synthesize hi-32 bit const.
343|.if X64 337|.if X64
344| mov64 tmp, U64x(3ff00000,00000000) 338| mov64 tmp, U64x(val,00000000); movd reg, tmp
345| movd reg, tmp
346|.else 339|.else
347| mov tmp, 0x3ff00000; movd reg, tmp; pshufd reg, reg, 0x51 340| mov tmp, 0x .. val; movd reg, tmp; pshufd reg, reg, 0x51
348|.endif 341|.endif
349|.endmacro 342|.endmacro
350| 343|
344|.macro sseconst_sign, reg, tmp // Synthesize sign mask.
345| sseconst_hi reg, tmp, 80000000
346|.endmacro
347|.macro sseconst_1, reg, tmp // Synthesize 1.0.
348| sseconst_hi reg, tmp, 3ff00000
349|.endmacro
350|.macro sseconst_m1, reg, tmp // Synthesize -1.0.
351| sseconst_hi reg, tmp, bff00000
352|.endmacro
351|.macro sseconst_2p52, reg, tmp // Synthesize 2^52. 353|.macro sseconst_2p52, reg, tmp // Synthesize 2^52.
352|.if X64 354| sseconst_hi reg, tmp, 43300000
353| mov64 tmp, U64x(43300000,00000000); movd reg, tmp 355|.endmacro
354|.else 356|.macro sseconst_tobit, reg, tmp // Synthesize 2^52 + 2^51.
355| mov tmp, 0x43300000; movd reg, tmp; pshufd reg, reg, 0x51 357| sseconst_hi reg, tmp, 43380000
356|.endif
357|.endmacro 358|.endmacro
358| 359|
359|// Move table write barrier back. Overwrites reg. 360|// Move table write barrier back. Overwrites reg.
@@ -894,10 +895,15 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
894 | jmp >2 895 | jmp >2
895 | 896 |
896 |->vmeta_tgetb: 897 |->vmeta_tgetb:
897 | movzx RC, PC_RC // Ugly, cannot fild from a byte. 898 | movzx RC, PC_RC
898 | mov ARG4, RC 899 if (sse) {
899 | fild ARG4 900 | cvtsi2sd xmm0, RC
900 | fstp TMPQ 901 | movsd TMPQ, xmm0
902 } else {
903 | mov ARG4, RC
904 | fild ARG4
905 | fstp TMPQ
906 }
901 | lea RC, TMP1 // Store temp. TValue in TMP1/TMP2. 907 | lea RC, TMP1 // Store temp. TValue in TMP1/TMP2.
902 | jmp >1 908 | jmp >1
903 | 909 |
@@ -960,10 +966,15 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
960 | jmp >2 966 | jmp >2
961 | 967 |
962 |->vmeta_tsetb: 968 |->vmeta_tsetb:
963 | movzx RC, PC_RC // Ugly, cannot fild from a byte. 969 | movzx RC, PC_RC
964 | mov ARG4, RC 970 if (sse) {
965 | fild ARG4 971 | cvtsi2sd xmm0, RC
966 | fstp TMPQ 972 | movsd TMPQ, xmm0
973 } else {
974 | mov ARG4, RC
975 | fild ARG4
976 | fstp TMPQ
977 }
967 | lea RC, TMP1 // Store temp. TValue in TMP1/TMP2. 978 | lea RC, TMP1 // Store temp. TValue in TMP1/TMP2.
968 | jmp >1 979 | jmp >1
969 | 980 |
@@ -1274,6 +1285,16 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
1274 | fld qword [RA] 1285 | fld qword [RA]
1275 |.endmacro 1286 |.endmacro
1276 | 1287 |
1288 |.macro .ffunc_nsse, name, op
1289 | .ffunc_1 name
1290 | cmp dword [RA+4], LJ_TISNUM; ja ->fff_fallback
1291 | op xmm0, qword [RA]
1292 |.endmacro
1293 |
1294 |.macro .ffunc_nsse, name
1295 | .ffunc_nsse name, movsd
1296 |.endmacro
1297 |
1277 |.macro .ffunc_nn, name 1298 |.macro .ffunc_nn, name
1278 | .ffunc_2 name 1299 | .ffunc_2 name
1279 | cmp dword [RA+4], LJ_TISNUM; ja ->fff_fallback 1300 | cmp dword [RA+4], LJ_TISNUM; ja ->fff_fallback
@@ -1282,6 +1303,14 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
1282 | fld qword [RA+8] 1303 | fld qword [RA+8]
1283 |.endmacro 1304 |.endmacro
1284 | 1305 |
1306 |.macro .ffunc_nnsse, name
1307 | .ffunc_1 name
1308 | cmp dword [RA+4], LJ_TISNUM; ja ->fff_fallback
1309 | cmp dword [RA+12], LJ_TISNUM; ja ->fff_fallback
1310 | movsd xmm0, qword [RA]
1311 | movsd xmm1, qword [RA+8]
1312 |.endmacro
1313 |
1285 |.macro .ffunc_nnr, name 1314 |.macro .ffunc_nnr, name
1286 | .ffunc_2 name 1315 | .ffunc_2 name
1287 | cmp dword [RA+4], LJ_TISNUM; ja ->fff_fallback 1316 | cmp dword [RA+4], LJ_TISNUM; ja ->fff_fallback
@@ -1440,8 +1469,11 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
1440 | // Only handles the number case inline (without a base argument). 1469 | // Only handles the number case inline (without a base argument).
1441 | cmp NARGS:RC, 1+1; jne ->fff_fallback // Exactly one argument. 1470 | cmp NARGS:RC, 1+1; jne ->fff_fallback // Exactly one argument.
1442 | cmp dword [RA+4], LJ_TISNUM; ja ->fff_fallback 1471 | cmp dword [RA+4], LJ_TISNUM; ja ->fff_fallback
1443 | fld qword [RA] 1472 if (sse) {
1444 | jmp ->fff_resn 1473 | movsd xmm0, qword [RA]; jmp ->fff_resxmm0
1474 } else {
1475 | fld qword [RA]; jmp ->fff_resn
1476 }
1445 | 1477 |
1446 |.ffunc_1 tostring 1478 |.ffunc_1 tostring
1447 | // Only handles the string or number case inline. 1479 | // Only handles the string or number case inline.
@@ -1531,13 +1563,33 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
1531 |.ffunc_1 ipairs_aux 1563 |.ffunc_1 ipairs_aux
1532 | cmp dword [RA+4], LJ_TTAB; jne ->fff_fallback 1564 | cmp dword [RA+4], LJ_TTAB; jne ->fff_fallback
1533 | cmp dword [RA+12], LJ_TISNUM; ja ->fff_fallback 1565 | cmp dword [RA+12], LJ_TISNUM; ja ->fff_fallback
1534 | fld qword [RA+8] 1566 | // Caveat: xmm0/xmm1/ARG2 used in getinth call, too.
1535 | fld1 1567 if (sse) {
1536 | faddp st1 1568 | movsd xmm0, qword [RA+8]
1537 | fist ARG2 // Caveat: used in getinth call, too. 1569 | sseconst_1 xmm1, RBa
1538 | fstp qword [RA-8] 1570 |.if X64WIN
1571 | addsd xmm1, xmm0
1572 | cvtsd2si RC, xmm1
1573 | movsd qword [RA-8], xmm1
1574 |.else
1575 | addsd xmm0, xmm1
1576 | cvtsd2si RC, xmm0
1577 | movsd qword [RA-8], xmm0
1578 | .if not X64
1579 | mov ARG2, RC
1580 | .endif
1581 |.endif
1582 } else {
1583 |.if not X64
1584 | fld qword [RA+8]
1585 | fld1
1586 | faddp st1
1587 | fist ARG2
1588 | fstp qword [RA-8]
1589 | mov RC, ARG2
1590 |.endif
1591 }
1539 | mov TAB:RB, [RA] 1592 | mov TAB:RB, [RA]
1540 | mov RC, ARG2
1541 | cmp RC, TAB:RB->asize; jae >2 // Not in array part? 1593 | cmp RC, TAB:RB->asize; jae >2 // Not in array part?
1542 | shl RC, 3 1594 | shl RC, 3
1543 | add RC, TAB:RB->array 1595 | add RC, TAB:RB->array
@@ -1572,8 +1624,13 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
1572 | mov CFUNC:RC, CFUNC:RB->upvalue[0] 1624 | mov CFUNC:RC, CFUNC:RB->upvalue[0]
1573 | mov dword [RA-4], LJ_TFUNC 1625 | mov dword [RA-4], LJ_TFUNC
1574 | mov [RA-8], CFUNC:RC 1626 | mov [RA-8], CFUNC:RC
1575 | fldz 1627 if (sse) {
1576 | fstp qword [RA+8] 1628 | xorps xmm0, xmm0
1629 | movsd qword [RA+8], xmm0
1630 } else {
1631 | fldz
1632 | fstp qword [RA+8]
1633 }
1577 | mov RD, 1+3 1634 | mov RD, 1+3
1578 | jmp ->fff_res 1635 | jmp ->fff_res
1579 | 1636 |
@@ -1804,11 +1861,25 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
1804 | 1861 |
1805 |//-- Math library ------------------------------------------------------- 1862 |//-- Math library -------------------------------------------------------
1806 | 1863 |
1807 |.ffunc_n math_abs 1864 if (sse) {
1808 | fabs 1865 |->fff_resn:
1809 | // fallthrough 1866 | fstp qword [RA-8]
1810 |->fff_resn: 1867 | jmp ->fff_res1
1811 | fstp qword [RA-8] 1868 |
1869 |.ffunc_nsse math_abs
1870 | sseconst_abs xmm1, RDa
1871 | andps xmm0, xmm1
1872 |->fff_resxmm0:
1873 | movsd qword [RA-8], xmm0
1874 | // fallthrough
1875 } else {
1876 |.ffunc_n math_abs
1877 | fabs
1878 | // fallthrough
1879 |->fff_resxmm0: // Dummy.
1880 |->fff_resn:
1881 | fstp qword [RA-8]
1882 }
1812 |->fff_res1: 1883 |->fff_res1:
1813 | mov RD, 1+1 1884 | mov RD, 1+1
1814 |->fff_res: 1885 |->fff_res:
@@ -1832,10 +1903,15 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
1832 | mov RA, -8 // Results start at BASE+RA = BASE-8. 1903 | mov RA, -8 // Results start at BASE+RA = BASE-8.
1833 | jmp ->vm_return 1904 | jmp ->vm_return
1834 | 1905 |
1835 |.ffunc_n math_floor; call ->vm_floor; jmp ->fff_resn 1906 if (sse) {
1836 |.ffunc_n math_ceil; call ->vm_ceil; jmp ->fff_resn 1907 |.ffunc_nsse math_sqrt, sqrtsd; jmp ->fff_resxmm0
1837 | 1908 |.ffunc_nsse math_floor; call ->vm_floor; jmp ->fff_resxmm0
1838 |.ffunc_n math_sqrt; fsqrt; jmp ->fff_resn 1909 |.ffunc_nsse math_ceil; call ->vm_ceil; jmp ->fff_resxmm0
1910 } else {
1911 |.ffunc_n math_sqrt; fsqrt; jmp ->fff_resn
1912 |.ffunc_n math_floor; call ->vm_floor; jmp ->fff_resn
1913 |.ffunc_n math_ceil; call ->vm_ceil; jmp ->fff_resn
1914 }
1839 | 1915 |
1840 |.ffunc_n math_log, fldln2; fyl2x; jmp ->fff_resn 1916 |.ffunc_n math_log, fldln2; fyl2x; jmp ->fff_resn
1841 |.ffunc_n math_log10, fldlg2; fyl2x; jmp ->fff_resn 1917 |.ffunc_n math_log10, fldlg2; fyl2x; jmp ->fff_resn
@@ -1854,14 +1930,27 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
1854 |.ffunc_n math_atan; fld1; fpatan; jmp ->fff_resn 1930 |.ffunc_n math_atan; fld1; fpatan; jmp ->fff_resn
1855 | 1931 |
1856 |.macro math_extern, func 1932 |.macro math_extern, func
1857 |.ffunc_n math_ .. func 1933 ||if (sse) {
1934 | .ffunc_nsse math_ .. func
1935 | .if not X64
1936 | movsd FPARG1, xmm0
1937 | .endif
1938 ||} else {
1939 | .if not X64
1940 | .ffunc_n math_ .. func
1941 | fstp FPARG1
1942 | .endif
1943 ||}
1858 | mov TMP1, RA 1944 | mov TMP1, RA
1859 | fstp FPARG1
1860 | mov RB, BASE 1945 | mov RB, BASE
1861 | call extern lj_wrapper_ .. func 1946 | call extern lj_wrapper_ .. func
1862 | mov RA, TMP1 1947 | mov RA, TMP1
1863 | mov BASE, RB 1948 | mov BASE, RB
1864 | jmp ->fff_resn 1949 | .if X64
1950 | jmp ->fff_resxmm0
1951 | .else
1952 | jmp ->fff_resn
1953 | .endif
1865 |.endmacro 1954 |.endmacro
1866 | 1955 |
1867 | math_extern sinh 1956 | math_extern sinh
@@ -1869,7 +1958,15 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
1869 | math_extern tanh 1958 | math_extern tanh
1870 | 1959 |
1871 |->ff_math_deg: 1960 |->ff_math_deg:
1872 |.ffunc_n math_rad; fmul qword CFUNC:RB->upvalue[0]; jmp ->fff_resn 1961 if (sse) {
1962 |.ffunc_nsse math_rad
1963 | mulsd xmm0, qword CFUNC:RB->upvalue[0]
1964 | jmp ->fff_resxmm0
1965 } else {
1966 |.ffunc_n math_rad
1967 | fmul qword CFUNC:RB->upvalue[0]
1968 | jmp ->fff_resn
1969 }
1873 | 1970 |
1874 |.ffunc_nn math_atan2; fpatan; jmp ->fff_resn 1971 |.ffunc_nn math_atan2; fpatan; jmp ->fff_resn
1875 |.ffunc_nnr math_ldexp; fscale; fpop1; jmp ->fff_resn 1972 |.ffunc_nnr math_ldexp; fscale; fpop1; jmp ->fff_resn
@@ -1885,31 +1982,64 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
1885 | cmp RB, 0x00200000; jb >4 1982 | cmp RB, 0x00200000; jb >4
1886 |1: 1983 |1:
1887 | shr RB, 21; sub RB, RC // Extract and unbias exponent. 1984 | shr RB, 21; sub RB, RC // Extract and unbias exponent.
1888 | mov TMP1, RB; fild TMP1 1985 if (sse) {
1986 | cvtsi2sd xmm0, RB
1987 } else {
1988 | mov TMP1, RB; fild TMP1
1989 }
1889 | mov RB, [RA-4] 1990 | mov RB, [RA-4]
1890 | and RB, 0x800fffff // Mask off exponent. 1991 | and RB, 0x800fffff // Mask off exponent.
1891 | or RB, 0x3fe00000 // Put mantissa in range [0.5,1) or 0. 1992 | or RB, 0x3fe00000 // Put mantissa in range [0.5,1) or 0.
1892 | mov [RA-4], RB 1993 | mov [RA-4], RB
1893 |2: 1994 |2:
1894 | fstp qword [RA] 1995 if (sse) {
1996 | movsd qword [RA], xmm0
1997 } else {
1998 | fstp qword [RA]
1999 }
1895 | mov RD, 1+2 2000 | mov RD, 1+2
1896 | jmp ->fff_res 2001 | jmp ->fff_res
1897 |3: // Return +-0, +-Inf, NaN unmodified and an exponent of 0. 2002 |3: // Return +-0, +-Inf, NaN unmodified and an exponent of 0.
1898 | fldz; jmp <2 2003 if (sse) {
2004 | xorps xmm0, xmm0; jmp <2
2005 } else {
2006 | fldz; jmp <2
2007 }
1899 |4: // Handle denormals by multiplying with 2^54 and adjusting the bias. 2008 |4: // Handle denormals by multiplying with 2^54 and adjusting the bias.
1900 | fld qword [RA] 2009 if (sse) {
1901 | mov TMP1, 0x5a800000; fmul TMP1 // x = x*2^54 2010 | movsd xmm0, qword [RA]
1902 | fstp qword [RA-8] 2011 | sseconst_hi xmm1, RBa, 43500000 // 2^54.
2012 | mulsd xmm0, xmm1
2013 | movsd qword [RA-8], xmm0
2014 } else {
2015 | fld qword [RA]
2016 | mov TMP1, 0x5a800000; fmul TMP1 // x = x*2^54
2017 | fstp qword [RA-8]
2018 }
1903 | mov RB, [RA-4]; mov RC, 1076; shl RB, 1; jmp <1 2019 | mov RB, [RA-4]; mov RC, 1076; shl RB, 1; jmp <1
1904 | 2020 |
1905 |.ffunc_n math_modf 2021 if (sse) {
2022 |.ffunc_nsse math_modf
2023 } else {
2024 |.ffunc_n math_modf
2025 }
1906 | mov RB, [RA+4] 2026 | mov RB, [RA+4]
1907 | shl RB, 1; cmp RB, 0xffe00000; je >4 // +-Inf? 2027 | shl RB, 1; cmp RB, 0xffe00000; je >4 // +-Inf?
1908 | fdup 2028 if (sse) {
1909 | call ->vm_trunc 2029 | movaps xmm4, xmm0
1910 | fsub st1, st0 2030 | call ->vm_trunc
1911 |1: 2031 | subsd xmm4, xmm0
1912 | fstp qword [RA-8]; fstp qword [RA] 2032 |1:
2033 | movsd qword [RA-8], xmm0
2034 | movsd qword [RA], xmm4
2035 } else {
2036 | fdup
2037 | call ->vm_trunc
2038 | fsub st1, st0
2039 |1:
2040 | fstp qword [RA-8]
2041 | fstp qword [RA]
2042 }
1913 | mov RC, [RA-4]; mov RB, [RA+4] 2043 | mov RC, [RA-4]; mov RB, [RA+4]
1914 | xor RC, RB; js >3 // Need to adjust sign? 2044 | xor RC, RB; js >3 // Need to adjust sign?
1915 |2: 2045 |2:
@@ -1918,20 +2048,41 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
1918 |3: 2048 |3:
1919 | xor RB, 0x80000000; mov [RA+4], RB; jmp <2 // Flip sign of fraction. 2049 | xor RB, 0x80000000; mov [RA+4], RB; jmp <2 // Flip sign of fraction.
1920 |4: 2050 |4:
1921 | fldz; fxch; jmp <1 // Return +-Inf and +-0. 2051 if (sse) {
2052 | xorps xmm4, xmm4; jmp <1 // Return +-Inf and +-0.
2053 } else {
2054 | fldz; fxch; jmp <1 // Return +-Inf and +-0.
2055 }
1922 | 2056 |
1923 |.ffunc_nnr math_fmod 2057 |.ffunc_nnr math_fmod
1924 |1: ; fprem; fnstsw ax; sahf; jp <1 2058 |1: ; fprem; fnstsw ax; sahf; jp <1
1925 | fpop1 2059 | fpop1
1926 | jmp ->fff_resn 2060 | jmp ->fff_resn
1927 | 2061 |
1928 |.ffunc_nn math_pow; call ->vm_pow; jmp ->fff_resn 2062 if (0 && sse) { // NYI
2063 |.ffunc_nnsse math_pow; call ->vm_pow; jmp ->fff_resxmm0
2064 } else {
2065 |.ffunc_nn math_pow; call ->vm_pow; jmp ->fff_resn
2066 }
1929 | 2067 |
1930 |.macro math_minmax, name, cmovop, nocmovop 2068 |.macro math_minmax, name, cmovop, nocmovop, sseop
2069 ||if (sse) {
2070 |.ffunc_nsse name
2071 | mov RB, 2
2072 |1:
2073 | cmp RB, RD
2074 | jae ->fff_resxmm0
2075 | cmp dword [RA+RB*8-4], LJ_TISNUM; ja ->fff_fallback
2076 | movsd xmm1, qword [RA+RB*8-8]
2077 | sseop xmm0, xmm1
2078 | add RB, 1
2079 | jmp <1
2080 ||} else {
1931 |.ffunc_n name 2081 |.ffunc_n name
1932 | mov RB, 2 2082 | mov RB, 2
1933 |1: 2083 |1:
1934 | cmp RB, RD; jae ->fff_resn 2084 | cmp RB, RD
2085 | jae ->fff_resn
1935 | cmp dword [RA+RB*8-4], LJ_TISNUM; ja >5 2086 | cmp dword [RA+RB*8-4], LJ_TISNUM; ja >5
1936 | fld qword [RA+RB*8-8] 2087 | fld qword [RA+RB*8-8]
1937 ||if (cmov) { 2088 ||if (cmov) {
@@ -1943,20 +2094,26 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
1943 ||} 2094 ||}
1944 | add RB, 1 2095 | add RB, 1
1945 | jmp <1 2096 | jmp <1
2097 ||}
1946 |.endmacro 2098 |.endmacro
1947 | 2099 |
1948 | math_minmax math_min, fcmovnbe, jz 2100 | math_minmax math_min, fcmovnbe, jz, minsd
1949 | math_minmax math_max, fcmovbe, jnz 2101 | math_minmax math_max, fcmovbe, jnz, maxsd
1950 |5: 2102 if (!sse) {
1951 | fpop; jmp ->fff_fallback 2103 |5:
2104 | fpop; jmp ->fff_fallback
2105 }
1952 | 2106 |
1953 |//-- String library ----------------------------------------------------- 2107 |//-- String library -----------------------------------------------------
1954 | 2108 |
1955 |.ffunc_1 string_len 2109 |.ffunc_1 string_len
1956 | cmp dword [RA+4], LJ_TSTR; jne ->fff_fallback 2110 | cmp dword [RA+4], LJ_TSTR; jne ->fff_fallback
1957 | mov STR:RB, [RA] 2111 | mov STR:RB, [RA]
1958 | fild dword STR:RB->len 2112 if (sse) {
1959 | jmp ->fff_resn 2113 | cvtsi2sd xmm0, dword STR:RB->len; jmp ->fff_resxmm0
2114 } else {
2115 | fild dword STR:RB->len; jmp ->fff_resn
2116 }
1960 | 2117 |
1961 |.ffunc string_byte // Only handle the 1-arg case here. 2118 |.ffunc string_byte // Only handle the 1-arg case here.
1962 | cmp NARGS:RC, 1+1; jne ->fff_fallback 2119 | cmp NARGS:RC, 1+1; jne ->fff_fallback
@@ -1965,17 +2122,25 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
1965 | cmp dword STR:RB->len, 1 2122 | cmp dword STR:RB->len, 1
1966 | jb ->fff_res0 // Return no results for empty string. 2123 | jb ->fff_res0 // Return no results for empty string.
1967 | movzx RB, byte STR:RB[1] 2124 | movzx RB, byte STR:RB[1]
1968 | mov TMP1, RB 2125 if (sse) {
1969 | fild TMP1 2126 | cvtsi2sd xmm0, RB; jmp ->fff_resxmm0
1970 | jmp ->fff_resn 2127 } else {
2128 | mov TMP1, RB; fild TMP1; jmp ->fff_resn
2129 }
1971 | 2130 |
1972 |.ffunc string_char // Only handle the 1-arg case here. 2131 |.ffunc string_char // Only handle the 1-arg case here.
1973 | ffgccheck 2132 | ffgccheck
1974 | cmp NARGS:RC, 1+1; jne ->fff_fallback // *Exactly* 1 arg. 2133 | cmp NARGS:RC, 1+1; jne ->fff_fallback // *Exactly* 1 arg.
1975 | cmp dword [RA+4], LJ_TISNUM; ja ->fff_fallback 2134 | cmp dword [RA+4], LJ_TISNUM; ja ->fff_fallback
1976 | fld qword [RA] 2135 if (sse) {
1977 | fistp TMP2 2136 | cvtsd2si RC, qword [RA]
1978 | cmp TMP2, 255; ja ->fff_fallback 2137 | cmp RC, 255; ja ->fff_fallback
2138 | mov TMP2, RC
2139 } else {
2140 | fld qword [RA]
2141 | fistp TMP2
2142 | cmp TMP2, 255; ja ->fff_fallback
2143 }
1979 | lea RC, TMP2 // Little-endian. 2144 | lea RC, TMP2 // Little-endian.
1980 | mov TMP1, RA // Save RA. 2145 | mov TMP1, RA // Save RA.
1981 | mov ARG3, 1 2146 | mov ARG3, 1
@@ -2000,16 +2165,26 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
2000 | cmp NARGS:RC, 1+2; jb ->fff_fallback 2165 | cmp NARGS:RC, 1+2; jb ->fff_fallback
2001 | jna >1 2166 | jna >1
2002 | cmp dword [RA+20], LJ_TISNUM; ja ->fff_fallback 2167 | cmp dword [RA+20], LJ_TISNUM; ja ->fff_fallback
2003 | fld qword [RA+16] 2168 if (sse) {
2004 | fistp TMP2 2169 | cvtsd2si RB, qword [RA+16]
2170 | mov TMP2, RB
2171 } else {
2172 | fld qword [RA+16]
2173 | fistp TMP2
2174 }
2005 |1: 2175 |1:
2006 | cmp dword [RA+4], LJ_TSTR; jne ->fff_fallback 2176 | cmp dword [RA+4], LJ_TSTR; jne ->fff_fallback
2007 | cmp dword [RA+12], LJ_TISNUM; ja ->fff_fallback 2177 | cmp dword [RA+12], LJ_TISNUM; ja ->fff_fallback
2008 | mov STR:RB, [RA] 2178 | mov STR:RB, [RA]
2009 | mov ARG2, STR:RB 2179 | mov ARG2, STR:RB
2010 | mov RB, STR:RB->len 2180 | mov RB, STR:RB->len
2011 | fld qword [RA+8] 2181 if (sse) {
2012 | fistp ARG3 2182 | cvtsd2si RC, qword [RA+8]
2183 | mov ARG3, RC
2184 } else {
2185 | fld qword [RA+8]
2186 | fistp ARG3
2187 }
2013 | mov RC, TMP2 2188 | mov RC, TMP2
2014 | cmp RB, RC // len < end? (unsigned compare) 2189 | cmp RB, RC // len < end? (unsigned compare)
2015 | jb >5 2190 | jb >5
@@ -2055,9 +2230,13 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
2055 | cmp dword [RA+4], LJ_TSTR; jne ->fff_fallback 2230 | cmp dword [RA+4], LJ_TSTR; jne ->fff_fallback
2056 | cmp dword [RA+12], LJ_TISNUM; ja ->fff_fallback 2231 | cmp dword [RA+12], LJ_TISNUM; ja ->fff_fallback
2057 | mov STR:RB, [RA] 2232 | mov STR:RB, [RA]
2058 | fld qword [RA+8] 2233 if (sse) {
2059 | fistp TMP2 2234 | cvtsd2si RC, qword [RA+8]
2060 | mov RC, TMP2 2235 } else {
2236 | fld qword [RA+8]
2237 | fistp TMP2
2238 | mov RC, TMP2
2239 }
2061 | test RC, RC 2240 | test RC, RC
2062 | jle ->fff_emptystr // Count <= 0? (or non-int) 2241 | jle ->fff_emptystr // Count <= 0? (or non-int)
2063 | cmp dword STR:RB->len, 1 2242 | cmp dword STR:RB->len, 1
@@ -2140,43 +2319,73 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
2140 | mov TAB:FCARG1, [RA] // Caveat: FCARG1 == RA 2319 | mov TAB:FCARG1, [RA] // Caveat: FCARG1 == RA
2141 | call extern lj_tab_len@4 // LJ_FASTCALL (GCtab *t) 2320 | call extern lj_tab_len@4 // LJ_FASTCALL (GCtab *t)
2142 | // Length of table returned in eax (RC). 2321 | // Length of table returned in eax (RC).
2143 | mov ARG1, RC
2144 | mov RA, RB // Restore RA and BASE. 2322 | mov RA, RB // Restore RA and BASE.
2145 | mov BASE, TMP1 2323 | mov BASE, TMP1
2146 | fild ARG1 2324 if (sse) {
2147 | jmp ->fff_resn 2325 | cvtsi2sd xmm0, RC; jmp ->fff_resxmm0
2326 } else {
2327 | mov ARG1, RC; fild ARG1; jmp ->fff_resn
2328 }
2148 | 2329 |
2149 |//-- Bit library -------------------------------------------------------- 2330 |//-- Bit library --------------------------------------------------------
2150 | 2331 |
2151 |.define TOBIT_BIAS, 0x59c00000 // 2^52 + 2^51 (float, not double!). 2332 |.define TOBIT_BIAS, 0x59c00000 // 2^52 + 2^51 (float, not double!).
2152 | 2333 |
2153 |.ffunc_n bit_tobit 2334 if (sse) {
2154 | mov TMP1, TOBIT_BIAS 2335 |.ffunc_nsse bit_tobit
2155 | fadd TMP1 2336 | sseconst_tobit xmm1, RBa
2156 | fstp FPARG1 // 64 bit FP store. 2337 | addsd xmm0, xmm1
2157 | fild ARG1 // 32 bit integer load (s2lfwd ok). 2338 | movd RB, xmm0
2158 | jmp ->fff_resn 2339 | cvtsi2sd xmm0, RB
2340 | jmp ->fff_resxmm0
2341 } else {
2342 |.ffunc_n bit_tobit
2343 | mov TMP1, TOBIT_BIAS
2344 | fadd TMP1
2345 | fstp FPARG1 // 64 bit FP store.
2346 | fild ARG1 // 32 bit integer load (s2lfwd ok).
2347 | jmp ->fff_resn
2348 }
2159 | 2349 |
2160 |.macro .ffunc_bit, name 2350 |.macro .ffunc_bit, name
2351 ||if (sse) {
2352 | .ffunc_nsse name
2353 | sseconst_tobit xmm1, RBa
2354 | addsd xmm0, xmm1
2355 | movd RB, xmm0
2356 ||} else {
2161 | .ffunc_n name 2357 | .ffunc_n name
2162 | mov TMP1, TOBIT_BIAS 2358 | mov TMP1, TOBIT_BIAS
2163 | fadd TMP1 2359 | fadd TMP1
2164 | fstp FPARG1 2360 | fstp FPARG1
2165 | mov RB, ARG1 2361 | mov RB, ARG1
2362 ||}
2166 |.endmacro 2363 |.endmacro
2167 | 2364 |
2168 |.macro .ffunc_bit_op, name, ins 2365 |.macro .ffunc_bit_op, name, ins
2169 | .ffunc_bit name 2366 | .ffunc_bit name
2170 | mov NRESULTS, NARGS:RC // Save for fallback. 2367 | mov TMP2, NARGS:RC // Save for fallback.
2171 | lea RC, [RA+NARGS:RC*8-16] 2368 | lea RC, [RA+NARGS:RC*8-16]
2369 ||if (sse) {
2370 | mov TMP1, BASE // Need BASE as a scratch register.
2371 ||}
2172 |1: 2372 |1:
2173 | cmp RC, RA 2373 | cmp RC, RA
2174 | jbe ->fff_resbit 2374 | jbe ->fff_resbit_op
2175 | cmp dword [RC+4], LJ_TISNUM; ja ->fff_fallback_bit_op 2375 | cmp dword [RC+4], LJ_TISNUM; ja ->fff_fallback_bit_op
2376 ||if (sse) {
2377 | movsd xmm0, qword [RC]
2378 | addsd xmm0, xmm1
2379 | movd BASE, xmm0
2380 | ins RB, BASE
2381 ||} else {
2382 |.if not X64
2176 | fld qword [RC] 2383 | fld qword [RC]
2177 | fadd TMP1 2384 | fadd TMP1
2178 | fstp FPARG1 2385 | fstp FPARG1
2179 | ins RB, ARG1 2386 | ins RB, ARG1
2387 |.endif
2388 ||}
2180 | sub RC, 8 2389 | sub RC, 8
2181 | jmp <1 2390 | jmp <1
2182 |.endmacro 2391 |.endmacro
@@ -2191,16 +2400,39 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
2191 | 2400 |
2192 |.ffunc_bit bit_bnot 2401 |.ffunc_bit bit_bnot
2193 | not RB 2402 | not RB
2194 |->fff_resbit: 2403 if (sse) {
2195 | mov ARG1, RB 2404 |->fff_resbit:
2196 | fild ARG1 2405 | cvtsi2sd xmm0, RB
2197 | jmp ->fff_resn 2406 | jmp ->fff_resxmm0
2407 |->fff_resbit_op:
2408 | cvtsi2sd xmm0, RB
2409 | mov BASE, TMP1
2410 | jmp ->fff_resxmm0
2411 } else {
2412 |->fff_resbit:
2413 |->fff_resbit_op:
2414 | mov ARG1, RB
2415 | fild ARG1
2416 | jmp ->fff_resn
2417 }
2198 | 2418 |
2199 |->fff_fallback_bit_op: 2419 |->fff_fallback_bit_op:
2200 | mov NARGS:RC, NRESULTS // Restore for fallback 2420 if (sse) {
2421 | mov BASE, TMP1
2422 }
2423 | mov NARGS:RC, TMP2 // Restore for fallback
2201 | jmp ->fff_fallback 2424 | jmp ->fff_fallback
2202 | 2425 |
2203 |.macro .ffunc_bit_sh, name, ins 2426 |.macro .ffunc_bit_sh, name, ins
2427 ||if (sse) {
2428 | .ffunc_nnsse name
2429 | sseconst_tobit xmm2, RBa
2430 | addsd xmm0, xmm2
2431 | addsd xmm1, xmm2
2432 | mov RC, RA // Assumes RA is ecx.
2433 | movd RB, xmm0
2434 | movd RA, xmm1
2435 ||} else {
2204 | .ffunc_nn name 2436 | .ffunc_nn name
2205 | mov TMP1, TOBIT_BIAS 2437 | mov TMP1, TOBIT_BIAS
2206 | fadd TMP1 2438 | fadd TMP1
@@ -2210,6 +2442,7 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
2210 | mov RC, RA // Assumes RA is ecx. 2442 | mov RC, RA // Assumes RA is ecx.
2211 | mov RA, ARG3 2443 | mov RA, ARG3
2212 | mov RB, ARG1 2444 | mov RB, ARG1
2445 ||}
2213 | ins RB, cl 2446 | ins RB, cl
2214 | mov RA, RC 2447 | mov RA, RC
2215 | jmp ->fff_resbit 2448 | jmp ->fff_resbit
@@ -2461,8 +2694,10 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
2461 |//----------------------------------------------------------------------- 2694 |//-----------------------------------------------------------------------
2462 | 2695 |
2463 |// FP value rounding. Called by math.floor/math.ceil fast functions 2696 |// FP value rounding. Called by math.floor/math.ceil fast functions
2464 |// and from JIT code. Arg/ret on x87 stack. No int/xmm registers modified. 2697 |// and from JIT code.
2465 |.macro vm_round, mode1, mode2 2698 |
2699 |// x87 variant: Arg/ret on x87 stack. No int/xmm registers modified.
2700 |.macro vm_round_x87, mode1, mode2
2466 | fnstcw word [esp+4] // Caveat: overwrites ARG1 and ARG2. 2701 | fnstcw word [esp+4] // Caveat: overwrites ARG1 and ARG2.
2467 | mov [esp+8], eax 2702 | mov [esp+8], eax
2468 | mov ax, mode1 2703 | mov ax, mode1
@@ -2478,14 +2713,55 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
2478 | ret 2713 | ret
2479 |.endmacro 2714 |.endmacro
2480 | 2715 |
2481 |->vm_floor: 2716 |// SSE variant: arg/ret is xmm0. xmm0-xmm3 and RD (eax) modified.
2482 | vm_round 0x0400, 0xf7ff 2717 |.macro vm_round_sse, mode
2718 | sseconst_abs xmm2, RDa
2719 | sseconst_2p52 xmm3, RDa
2720 | movaps xmm1, xmm0
2721 | andpd xmm1, xmm2 // |x|
2722 | ucomisd xmm3, xmm1 // No truncation if 2^52 <= |x|.
2723 | jbe >1
2724 | andnpd xmm2, xmm0 // Isolate sign bit.
2725 |.if mode == 2 // trunc(x)?
2726 | movaps xmm0, xmm1
2727 | addsd xmm1, xmm3 // (|x| + 2^52) - 2^52
2728 | subsd xmm1, xmm3
2729 | sseconst_1 xmm3, RDa
2730 | cmpsd xmm0, xmm1, 1 // |x| < result?
2731 | andpd xmm0, xmm3
2732 | subsd xmm1, xmm0 // If yes, subtract 1.
2733 | orpd xmm1, xmm2 // Merge sign bit back in.
2734 |.else
2735 | addsd xmm1, xmm3 // (|x| + 2^52) - 2^52
2736 | subsd xmm1, xmm3
2737 | orpd xmm1, xmm2 // Merge sign bit back in.
2738 | .if mode == 1 // ceil(x)?
2739 | sseconst_m1 xmm2, RDa // Must subtract -1 to preserve -0.
2740 | cmpsd xmm0, xmm1, 6 // x > result?
2741 | .else // floor(x)?
2742 | sseconst_1 xmm2, RDa
2743 | cmpsd xmm0, xmm1, 1 // x < result?
2744 | .endif
2745 | andpd xmm0, xmm2
2746 | subsd xmm1, xmm0 // If yes, subtract +-1.
2747 |.endif
2748 | movaps xmm0, xmm1
2749 |1:
2750 | ret
2751 |.endmacro
2483 | 2752 |
2484 |->vm_ceil: 2753 |.macro vm_round, name, ssemode, mode1, mode2
2485 | vm_round 0x0800, 0xfbff 2754 |->name:
2755 ||if (!sse) {
2756 | vm_round_x87 mode1, mode2
2757 ||}
2758 |->name .. _sse:
2759 | vm_round_sse ssemode
2760 |.endmacro
2486 | 2761 |
2487 |->vm_trunc: 2762 | vm_round vm_floor, 0, 0x0400, 0xf7ff
2488 | vm_round 0x0c00, 0xffff 2763 | vm_round vm_ceil, 1, 0x0800, 0xfbff
2764 | vm_round vm_trunc, 2, 0x0c00, 0xffff
2489 | 2765 |
2490 |// FP modulo x%y. Called by BC_MOD* and vm_arith. 2766 |// FP modulo x%y. Called by BC_MOD* and vm_arith.
2491 |->vm_mod: 2767 |->vm_mod:
@@ -2532,8 +2808,8 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
2532 | fldcw word [esp+4] 2808 | fldcw word [esp+4]
2533 | fmulp st1 2809 | fmulp st1
2534 | fsubp st1 2810 | fsubp st1
2811 | ret
2535 } 2812 }
2536 | ret
2537 | 2813 |
2538 |// FP exponentiation e^x and 2^x. Called by math.exp fast function and 2814 |// FP exponentiation e^x and 2^x. Called by math.exp fast function and
2539 |// from JIT code. Arg/ret on x87 stack. No int/xmm regs modified. 2815 |// from JIT code. Arg/ret on x87 stack. No int/xmm regs modified.
@@ -2662,19 +2938,51 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
2662 |// Callable from C: double lj_vm_foldfpm(double x, int fpm) 2938 |// Callable from C: double lj_vm_foldfpm(double x, int fpm)
2663 |// Computes fpm(x) for extended math functions. ORDER FPM. 2939 |// Computes fpm(x) for extended math functions. ORDER FPM.
2664 |->vm_foldfpm: 2940 |->vm_foldfpm:
2665 | mov eax, [esp+12] 2941 if (sse) {
2666 | fld qword [esp+4] 2942 |.if X64WIN
2667 | cmp eax, 1; jb ->vm_floor; je ->vm_ceil 2943 | .define fpmop, CARG2d
2668 | cmp eax, 3; jb ->vm_trunc; ja >1 2944 |.elif X64
2669 | fsqrt; ret 2945 | .define fpmop, CARG1d
2670 |1: ; cmp eax, 5; jb ->vm_exp; je ->vm_exp2 2946 |.else
2671 | cmp eax, 7; je >1; ja >2 2947 | .define fpmop, eax
2948 | mov fpmop, [esp+12]
2949 | movsd xmm0, qword [esp+4]
2950 |.endif
2951 |.if X64
2952 | cmp fpmop, 1; jb ->vm_floor; je ->vm_ceil
2953 | cmp fpmop, 3; jb ->vm_trunc; ja >2
2954 | sqrtsd xmm0, xmm0; ret
2955 |.else
2956 | cmp fpmop, 1; je >1; ja >2
2957 | call ->vm_floor; jmp >7
2958 |1: ; call ->vm_ceil; jmp >7
2959 |2: ; cmp fpmop, 3; je >1; ja >2
2960 | call ->vm_trunc; jmp >7
2961 |1:
2962 | sqrtsd xmm0, xmm0
2963 |7:
2964 | movsd qword [esp+4], xmm0 // Overwrite callee-owned args.
2965 | fld qword [esp+4]
2966 | ret
2967 |.endif
2968 |2:
2969 | fld qword [esp+4]
2970 } else {
2971 | mov fpmop, [esp+12]
2972 | fld qword [esp+4]
2973 | cmp fpmop, 1; jb ->vm_floor; je ->vm_ceil
2974 | cmp fpmop, 3; jb ->vm_trunc; ja >2
2975 | fsqrt; ret
2976 |2:
2977 }
2978 | cmp fpmop, 5; jb ->vm_exp; je ->vm_exp2
2979 | cmp fpmop, 7; je >1; ja >2
2672 | fldln2; fxch; fyl2x; ret 2980 | fldln2; fxch; fyl2x; ret
2673 |1: ; fld1; fxch; fyl2x; ret 2981 |1: ; fld1; fxch; fyl2x; ret
2674 |2: ; cmp eax, 9; je >1; ja >2 2982 |2: ; cmp fpmop, 9; je >1; ja >2
2675 | fldlg2; fxch; fyl2x; ret 2983 | fldlg2; fxch; fyl2x; ret
2676 |1: ; fsin; ret 2984 |1: ; fsin; ret
2677 |2: ; cmp eax, 11; je >1; ja >9 2985 |2: ; cmp fpmop, 11; je >1; ja >9
2678 | fcos; ret 2986 | fcos; ret
2679 |1: ; fptan; fpop; ret 2987 |1: ; fptan; fpop; ret
2680 |9: ; int3 // Bad fpm. 2988 |9: ; int3 // Bad fpm.
@@ -3198,14 +3506,25 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov, int sse)
3198 break; 3506 break;
3199 case BC_KSHORT: 3507 case BC_KSHORT:
// Stores the signed 16-bit literal as a double in stack slot BASE[RA].
// SSE build: sign-extend the low word of RD in place, then convert with
// cvtsi2sd. x87 build: fild reads the literal back from the instruction
// (PC_RD), so RD itself is not used.
3200 | ins_AD // RA = dst, RD = signed int16 literal 3508 | ins_AD // RA = dst, RD = signed int16 literal
3201 | fild PC_RD // Refetch signed RD from instruction. 3509 if (sse) {
3202 | fstp qword [BASE+RA*8] 3510 | movsx RD, RDW // Sign-extend literal.
3511 | cvtsi2sd xmm0, RD
3512 | movsd qword [BASE+RA*8], xmm0
3513 } else {
3514 | fild PC_RD // Refetch signed RD from instruction.
3515 | fstp qword [BASE+RA*8]
3516 }
3203 | ins_next 3517 | ins_next
3204 break; 3518 break;
3205 case BC_KNUM: 3519 case BC_KNUM:
// Copies the 8-byte number constant at KBASE[RD] into stack slot BASE[RA],
// going through xmm0 in the SSE build and through the x87 stack otherwise.
3206 | ins_AD // RA = dst, RD = num const 3520 | ins_AD // RA = dst, RD = num const
3207 | fld qword [KBASE+RD*8] 3521 if (sse) {
3208 | fstp qword [BASE+RA*8] 3522 | movsd xmm0, qword [KBASE+RD*8]
3523 | movsd qword [BASE+RA*8], xmm0
3524 } else {
3525 | fld qword [KBASE+RD*8]
3526 | fstp qword [BASE+RA*8]
3527 }
3209 | ins_next 3528 | ins_next
3210 break; 3529 break;
3211 case BC_KPRI: 3530 case BC_KPRI:
@@ -3307,10 +3626,18 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov, int sse)
3307 case BC_USETN: 3626 case BC_USETN:
// Stores the number constant KBASE[RD] into upvalue #RA of the function
// object found at [BASE-8]. The constant is loaded into xmm0 (SSE) or st0
// (x87) first. RB is then walked from the function to its uvptr[RA] entry,
// RA is overwritten with that upvalue's value pointer (UPVAL->v), and the
// constant is stored through it.
3308 | ins_AD // RA = upvalue #, RD = num const 3627 | ins_AD // RA = upvalue #, RD = num const
3309 | mov LFUNC:RB, [BASE-8] 3628 | mov LFUNC:RB, [BASE-8]
3310 | fld qword [KBASE+RD*8] 3629 if (sse) {
3630 | movsd xmm0, qword [KBASE+RD*8]
3631 } else {
3632 | fld qword [KBASE+RD*8]
3633 }
3311 | mov UPVAL:RB, [LFUNC:RB+RA*4+offsetof(GCfuncL, uvptr)] 3634 | mov UPVAL:RB, [LFUNC:RB+RA*4+offsetof(GCfuncL, uvptr)]
3312 | mov RA, UPVAL:RB->v 3635 | mov RA, UPVAL:RB->v
3313 | fstp qword [RA] 3636 if (sse) {
3637 | movsd qword [RA], xmm0
3638 } else {
3639 | fstp qword [RA]
3640 }
3314 | ins_next 3641 | ins_next
3315 break; 3642 break;
3316 case BC_USETP: 3643 case BC_USETP:
@@ -3438,11 +3765,20 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov, int sse)
3438 | 3765 |
3439 | // Integer key? Convert number to int and back and compare. 3766 | // Integer key? Convert number to int and back and compare.
3440 | checknum RC, >5 3767 | checknum RC, >5
3441 | fld qword [BASE+RC*8] 3768 if (sse) {
3442 | fist ARG1 3769 | movsd xmm0, qword [BASE+RC*8]
3443 | fild ARG1 3770 | cvtsd2si RC, xmm0
3444 | fcomparepp // eax (RC) modified! 3771 | cvtsi2sd xmm1, RC
3445 | mov RC, ARG1 3772 | ucomisd xmm0, xmm1
3773 } else {
3774 |.if not X64
3775 | fld qword [BASE+RC*8]
3776 | fist ARG1
3777 | fild ARG1
3778 | fcomparepp // eax (RC) modified!
3779 | mov RC, ARG1
3780 |.endif
3781 }
3446 | jne ->vmeta_tgetv // Generic numeric key? Use fallback. 3782 | jne ->vmeta_tgetv // Generic numeric key? Use fallback.
3447 | cmp RC, TAB:RB->asize // Takes care of unordered, too. 3783 | cmp RC, TAB:RB->asize // Takes care of unordered, too.
3448 | jae ->vmeta_tgetv // Not in array part? Use fallback. 3784 | jae ->vmeta_tgetv // Not in array part? Use fallback.
@@ -3551,11 +3887,20 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov, int sse)
3551 | 3887 |
3552 | // Integer key? Convert number to int and back and compare. 3888 | // Integer key? Convert number to int and back and compare.
3553 | checknum RC, >5 3889 | checknum RC, >5
3554 | fld qword [BASE+RC*8] 3890 if (sse) {
3555 | fist ARG1 3891 | movsd xmm0, qword [BASE+RC*8]
3556 | fild ARG1 3892 | cvtsd2si RC, xmm0
3557 | fcomparepp // eax (RC) modified! 3893 | cvtsi2sd xmm1, RC
3558 | mov RC, ARG1 3894 | ucomisd xmm0, xmm1
3895 } else {
3896 |.if not X64
3897 | fld qword [BASE+RC*8]
3898 | fist ARG1
3899 | fild ARG1
3900 | fcomparepp // eax (RC) modified!
3901 | mov RC, ARG1
3902 |.endif
3903 }
3559 | jne ->vmeta_tsetv // Generic numeric key? Use fallback. 3904 | jne ->vmeta_tsetv // Generic numeric key? Use fallback.
3560 | cmp RC, TAB:RB->asize // Takes care of unordered, too. 3905 | cmp RC, TAB:RB->asize // Takes care of unordered, too.
3561 | jae ->vmeta_tsetv 3906 | jae ->vmeta_tsetv
@@ -3626,11 +3971,11 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov, int sse)
3626 |4: // Check for __newindex if previous value is nil. 3971 |4: // Check for __newindex if previous value is nil.
3627 | cmp dword TAB:RB->metatable, 0 // Shouldn't overwrite RA for fastpath. 3972 | cmp dword TAB:RB->metatable, 0 // Shouldn't overwrite RA for fastpath.
3628 | jz <2 3973 | jz <2
3629 | mov ARG1, RA // Save RA. 3974 | mov TMP1, RA // Save RA.
3630 | mov TAB:RA, TAB:RB->metatable 3975 | mov TAB:RA, TAB:RB->metatable
3631 | test byte TAB:RA->nomm, 1<<MM_newindex 3976 | test byte TAB:RA->nomm, 1<<MM_newindex
3632 | jz ->vmeta_tsets // 'no __newindex' flag NOT set: check. 3977 | jz ->vmeta_tsets // 'no __newindex' flag NOT set: check.
3633 | mov RA, ARG1 // Restore RA. 3978 | mov RA, TMP1 // Restore RA.
3634 | jmp <2 3979 | jmp <2
3635 | 3980 |
3636 |5: // Follow hash chain. 3981 |5: // Follow hash chain.
@@ -3705,8 +4050,14 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov, int sse)
3705 case BC_TSETM: 4050 case BC_TSETM:
3706 | ins_AD // RA = base (table at base-1), RD = num const (start index) 4051 | ins_AD // RA = base (table at base-1), RD = num const (start index)
3707 | mov TMP1, KBASE // Need one more free register. 4052 | mov TMP1, KBASE // Need one more free register.
3708 | fld qword [KBASE+RD*8] 4053 if (sse) {
3709 | fistp ARG4 // Const is guaranteed to be an int. 4054 | movsd xmm0, qword [KBASE+RD*8]
4055 } else {
4056 |.if not X64
4057 | fld qword [KBASE+RD*8]
4058 | fistp ARG4 // Const is guaranteed to be an int.
4059 |.endif
4060 }
3710 |1: 4061 |1:
3711 | lea RA, [BASE+RA*8] 4062 | lea RA, [BASE+RA*8]
3712 | mov TAB:RB, [RA-8] // Guaranteed to be a table. 4063 | mov TAB:RB, [RA-8] // Guaranteed to be a table.
@@ -3714,7 +4065,13 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov, int sse)
3714 | jnz >7 4065 | jnz >7
3715 |2: 4066 |2:
3716 | mov RD, NRESULTS 4067 | mov RD, NRESULTS
3717 | mov KBASE, ARG4 4068 if (sse) {
4069 | cvtsd2si KBASE, xmm0 // Const is guaranteed to be an int.
4070 } else {
4071 |.if not X64
4072 | mov KBASE, ARG4
4073 |.endif
4074 }
3718 | sub RD, 1 4075 | sub RD, 1
3719 | jz >4 // Nothing to copy? 4076 | jz >4 // Nothing to copy?
3720 | add RD, KBASE // Compute needed size. 4077 | add RD, KBASE // Compute needed size.
@@ -4034,20 +4391,37 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov, int sse)
4034 if (!vk) { 4391 if (!vk) {
4035 | cmp RB, LJ_TISNUM; ja ->vmeta_for 4392 | cmp RB, LJ_TISNUM; ja ->vmeta_for
4036 } 4393 }
4037 | fld FOR_STOP 4394 if (sse) {
4038 | fld FOR_IDX 4395 | movsd xmm0, FOR_IDX
4039 if (vk) { 4396 | movsd xmm1, FOR_STOP
4040 | fadd FOR_STEP // nidx = idx + step 4397 if (vk) {
4041 | fst FOR_IDX 4398 | addsd xmm0, FOR_STEP
4042 } 4399 | movsd FOR_IDX, xmm0
4043 | fst FOR_EXT 4400 | test RB, RB; js >3
4044 | test RB, RB // Swap lim/(n)idx if step non-negative. 4401 } else {
4045 | js >1 4402 | jl >3
4046 | fxch 4403 }
4047 |1: 4404 | ucomisd xmm1, xmm0
4048 | fcomparepp // eax (RD) modified if !cmov. 4405 |1:
4049 if (!cmov) { 4406 | movsd FOR_EXT, xmm0
4050 | movzx RD, PC_RD // Need to reload RD. 4407 } else {
4408 | fld FOR_STOP
4409 | fld FOR_IDX
4410 if (vk) {
4411 | fadd FOR_STEP // nidx = idx + step
4412 | fst FOR_IDX
4413 | fst FOR_EXT
4414 | test RB, RB; js >1
4415 } else {
4416 | fst FOR_EXT
4417 | jl >1
4418 }
4419 | fxch // Swap lim/(n)idx if step non-negative.
4420 |1:
4421 | fcomparepp // eax (RD) modified if !cmov.
4422 if (!cmov) {
4423 | movzx RD, PC_RD // Need to reload RD.
4424 }
4051 } 4425 }
4052 if (op == BC_FORI) { 4426 if (op == BC_FORI) {
4053 | jnb >2 4427 | jnb >2
@@ -4064,6 +4438,11 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov, int sse)
4064 } 4438 }
4065 |2: 4439 |2:
4066 | ins_next 4440 | ins_next
4441 if (sse) {
4442 |3: // Invert comparison if step is negative.
4443 | ucomisd xmm0, xmm1
4444 | jmp <1
4445 }
4067 break; 4446 break;
4068 4447
4069 case BC_ITERL: 4448 case BC_ITERL: