diff options
Diffstat (limited to 'src/buildvm_x86.dasc')
-rw-r--r-- | src/buildvm_x86.dasc | 683 |
1 files changed, 531 insertions, 152 deletions
diff --git a/src/buildvm_x86.dasc b/src/buildvm_x86.dasc index b220c58f..58767c1e 100644 --- a/src/buildvm_x86.dasc +++ b/src/buildvm_x86.dasc | |||
@@ -34,6 +34,7 @@ | |||
34 | |.if X64; .define RAa, rcx; .else; .define RAa, RA; .endif | 34 | |.if X64; .define RAa, rcx; .else; .define RAa, RA; .endif |
35 | |.define RAL, cl | 35 | |.define RAL, cl |
36 | |.define RB, ebp // Must be ebp (C callee-save). | 36 | |.define RB, ebp // Must be ebp (C callee-save). |
37 | |.if X64; .define RBa, rbp; .else; .define RBa, RB; .endif | ||
37 | |.define RC, eax // Must be eax (fcomparepp and others). | 38 | |.define RC, eax // Must be eax (fcomparepp and others). |
38 | |.define RCW, ax | 39 | |.define RCW, ax |
39 | |.define RCH, ah | 40 | |.define RCH, ah |
@@ -41,6 +42,7 @@ | |||
41 | |.define OP, RB | 42 | |.define OP, RB |
42 | |.define RD, RC | 43 | |.define RD, RC |
43 | |.if X64; .define RDa, rax; .else; .define RDa, RD; .endif | 44 | |.if X64; .define RDa, rax; .else; .define RDa, RD; .endif |
45 | |.define RDW, RCW | ||
44 | |.define RDL, RCL | 46 | |.define RDL, RCL |
45 | | | 47 | | |
46 | |.if not X64 | 48 | |.if not X64 |
@@ -323,14 +325,6 @@ | |||
323 | |.macro fpop1; fstp st1; .endmacro | 325 | |.macro fpop1; fstp st1; .endmacro |
324 | | | 326 | | |
325 | |// Synthesize SSE FP constants. | 327 | |// Synthesize SSE FP constants. |
326 | |.macro sseconst_sign, reg, tmp // Synthesize sign mask. | ||
327 | |.if X64 | ||
328 | | mov64 tmp, U64x(80000000,00000000); movd reg, tmp | ||
329 | |.else | ||
330 | | mov tmp, 0x80000000; movd xmm1, tmp; pshufd reg, reg, 0x51 | ||
331 | |.endif | ||
332 | |.endmacro | ||
333 | | | ||
334 | |.macro sseconst_abs, reg, tmp // Synthesize abs mask. | 328 | |.macro sseconst_abs, reg, tmp // Synthesize abs mask. |
335 | |.if X64 | 329 | |.if X64 |
336 | | mov64 tmp, U64x(7fffffff,ffffffff); movd reg, tmp | 330 | | mov64 tmp, U64x(7fffffff,ffffffff); movd reg, tmp |
@@ -339,21 +333,28 @@ | |||
339 | |.endif | 333 | |.endif |
340 | |.endmacro | 334 | |.endmacro |
341 | | | 335 | | |
342 | |.macro sseconst_1, reg, tmp // Synthesize 1.0. | 336 | |.macro sseconst_hi, reg, tmp, val // Synthesize hi-32 bit const. |
343 | |.if X64 | 337 | |.if X64 |
344 | | mov64 tmp, U64x(3ff00000,00000000) | 338 | | mov64 tmp, U64x(val,00000000); movd reg, tmp |
345 | | movd reg, tmp | ||
346 | |.else | 339 | |.else |
347 | | mov tmp, 0x3ff00000; movd reg, tmp; pshufd reg, reg, 0x51 | 340 | | mov tmp, 0x .. val; movd reg, tmp; pshufd reg, reg, 0x51 |
348 | |.endif | 341 | |.endif |
349 | |.endmacro | 342 | |.endmacro |
350 | | | 343 | | |
344 | |.macro sseconst_sign, reg, tmp // Synthesize sign mask. | ||
345 | | sseconst_hi reg, tmp, 80000000 | ||
346 | |.endmacro | ||
347 | |.macro sseconst_1, reg, tmp // Synthesize 1.0. | ||
348 | | sseconst_hi reg, tmp, 3ff00000 | ||
349 | |.endmacro | ||
350 | |.macro sseconst_m1, reg, tmp // Synthesize -1.0. | ||
351 | | sseconst_hi reg, tmp, bff00000 | ||
352 | |.endmacro | ||
351 | |.macro sseconst_2p52, reg, tmp // Synthesize 2^52. | 353 | |.macro sseconst_2p52, reg, tmp // Synthesize 2^52. |
352 | |.if X64 | 354 | | sseconst_hi reg, tmp, 43300000 |
353 | | mov64 tmp, U64x(43300000,00000000); movd reg, tmp | 355 | |.endmacro |
354 | |.else | 356 | |.macro sseconst_tobit, reg, tmp // Synthesize 2^52 + 2^51. |
355 | | mov tmp, 0x43300000; movd reg, tmp; pshufd reg, reg, 0x51 | 357 | | sseconst_hi reg, tmp, 43380000 |
356 | |.endif | ||
357 | |.endmacro | 358 | |.endmacro |
358 | | | 359 | | |
359 | |// Move table write barrier back. Overwrites reg. | 360 | |// Move table write barrier back. Overwrites reg. |
@@ -894,10 +895,15 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
894 | | jmp >2 | 895 | | jmp >2 |
895 | | | 896 | | |
896 | |->vmeta_tgetb: | 897 | |->vmeta_tgetb: |
897 | | movzx RC, PC_RC // Ugly, cannot fild from a byte. | 898 | | movzx RC, PC_RC |
898 | | mov ARG4, RC | 899 | if (sse) { |
899 | | fild ARG4 | 900 | | cvtsi2sd xmm0, RC |
900 | | fstp TMPQ | 901 | | movsd TMPQ, xmm0 |
902 | } else { | ||
903 | | mov ARG4, RC | ||
904 | | fild ARG4 | ||
905 | | fstp TMPQ | ||
906 | } | ||
901 | | lea RC, TMP1 // Store temp. TValue in TMP1/TMP2. | 907 | | lea RC, TMP1 // Store temp. TValue in TMP1/TMP2. |
902 | | jmp >1 | 908 | | jmp >1 |
903 | | | 909 | | |
@@ -960,10 +966,15 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
960 | | jmp >2 | 966 | | jmp >2 |
961 | | | 967 | | |
962 | |->vmeta_tsetb: | 968 | |->vmeta_tsetb: |
963 | | movzx RC, PC_RC // Ugly, cannot fild from a byte. | 969 | | movzx RC, PC_RC |
964 | | mov ARG4, RC | 970 | if (sse) { |
965 | | fild ARG4 | 971 | | cvtsi2sd xmm0, RC |
966 | | fstp TMPQ | 972 | | movsd TMPQ, xmm0 |
973 | } else { | ||
974 | | mov ARG4, RC | ||
975 | | fild ARG4 | ||
976 | | fstp TMPQ | ||
977 | } | ||
967 | | lea RC, TMP1 // Store temp. TValue in TMP1/TMP2. | 978 | | lea RC, TMP1 // Store temp. TValue in TMP1/TMP2. |
968 | | jmp >1 | 979 | | jmp >1 |
969 | | | 980 | | |
@@ -1274,6 +1285,16 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
1274 | | fld qword [RA] | 1285 | | fld qword [RA] |
1275 | |.endmacro | 1286 | |.endmacro |
1276 | | | 1287 | | |
1288 | |.macro .ffunc_nsse, name, op | ||
1289 | | .ffunc_1 name | ||
1290 | | cmp dword [RA+4], LJ_TISNUM; ja ->fff_fallback | ||
1291 | | op xmm0, qword [RA] | ||
1292 | |.endmacro | ||
1293 | | | ||
1294 | |.macro .ffunc_nsse, name | ||
1295 | | .ffunc_nsse name, movsd | ||
1296 | |.endmacro | ||
1297 | | | ||
1277 | |.macro .ffunc_nn, name | 1298 | |.macro .ffunc_nn, name |
1278 | | .ffunc_2 name | 1299 | | .ffunc_2 name |
1279 | | cmp dword [RA+4], LJ_TISNUM; ja ->fff_fallback | 1300 | | cmp dword [RA+4], LJ_TISNUM; ja ->fff_fallback |
@@ -1282,6 +1303,14 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
1282 | | fld qword [RA+8] | 1303 | | fld qword [RA+8] |
1283 | |.endmacro | 1304 | |.endmacro |
1284 | | | 1305 | | |
1306 | |.macro .ffunc_nnsse, name // Two number args -> xmm0/xmm1, else fallback. | ||
1307 | | .ffunc_2 name // Like .ffunc_nn: the second arg slot is read below. | ||
1308 | | cmp dword [RA+4], LJ_TISNUM; ja ->fff_fallback | ||
1309 | | cmp dword [RA+12], LJ_TISNUM; ja ->fff_fallback | ||
1310 | | movsd xmm0, qword [RA] | ||
1311 | | movsd xmm1, qword [RA+8] | ||
1312 | |.endmacro | ||
1313 | | | ||
1285 | |.macro .ffunc_nnr, name | 1314 | |.macro .ffunc_nnr, name |
1286 | | .ffunc_2 name | 1315 | | .ffunc_2 name |
1287 | | cmp dword [RA+4], LJ_TISNUM; ja ->fff_fallback | 1316 | | cmp dword [RA+4], LJ_TISNUM; ja ->fff_fallback |
@@ -1440,8 +1469,11 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
1440 | | // Only handles the number case inline (without a base argument). | 1469 | | // Only handles the number case inline (without a base argument). |
1441 | | cmp NARGS:RC, 1+1; jne ->fff_fallback // Exactly one argument. | 1470 | | cmp NARGS:RC, 1+1; jne ->fff_fallback // Exactly one argument. |
1442 | | cmp dword [RA+4], LJ_TISNUM; ja ->fff_fallback | 1471 | | cmp dword [RA+4], LJ_TISNUM; ja ->fff_fallback |
1443 | | fld qword [RA] | 1472 | if (sse) { |
1444 | | jmp ->fff_resn | 1473 | | movsd xmm0, qword [RA]; jmp ->fff_resxmm0 |
1474 | } else { | ||
1475 | | fld qword [RA]; jmp ->fff_resn | ||
1476 | } | ||
1445 | | | 1477 | | |
1446 | |.ffunc_1 tostring | 1478 | |.ffunc_1 tostring |
1447 | | // Only handles the string or number case inline. | 1479 | | // Only handles the string or number case inline. |
@@ -1531,13 +1563,33 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
1531 | |.ffunc_1 ipairs_aux | 1563 | |.ffunc_1 ipairs_aux |
1532 | | cmp dword [RA+4], LJ_TTAB; jne ->fff_fallback | 1564 | | cmp dword [RA+4], LJ_TTAB; jne ->fff_fallback |
1533 | | cmp dword [RA+12], LJ_TISNUM; ja ->fff_fallback | 1565 | | cmp dword [RA+12], LJ_TISNUM; ja ->fff_fallback |
1534 | | fld qword [RA+8] | 1566 | | // Caveat: xmm0/xmm1/ARG2 used in getinth call, too. |
1535 | | fld1 | 1567 | if (sse) { |
1536 | | faddp st1 | 1568 | | movsd xmm0, qword [RA+8] |
1537 | | fist ARG2 // Caveat: used in getinth call, too. | 1569 | | sseconst_1 xmm1, RBa |
1538 | | fstp qword [RA-8] | 1570 | |.if X64WIN |
1571 | | addsd xmm1, xmm0 | ||
1572 | | cvtsd2si RC, xmm1 | ||
1573 | | movsd qword [RA-8], xmm1 | ||
1574 | |.else | ||
1575 | | addsd xmm0, xmm1 | ||
1576 | | cvtsd2si RC, xmm0 | ||
1577 | | movsd qword [RA-8], xmm0 | ||
1578 | | .if not X64 | ||
1579 | | mov ARG2, RC | ||
1580 | | .endif | ||
1581 | |.endif | ||
1582 | } else { | ||
1583 | |.if not X64 | ||
1584 | | fld qword [RA+8] | ||
1585 | | fld1 | ||
1586 | | faddp st1 | ||
1587 | | fist ARG2 | ||
1588 | | fstp qword [RA-8] | ||
1589 | | mov RC, ARG2 | ||
1590 | |.endif | ||
1591 | } | ||
1539 | | mov TAB:RB, [RA] | 1592 | | mov TAB:RB, [RA] |
1540 | | mov RC, ARG2 | ||
1541 | | cmp RC, TAB:RB->asize; jae >2 // Not in array part? | 1593 | | cmp RC, TAB:RB->asize; jae >2 // Not in array part? |
1542 | | shl RC, 3 | 1594 | | shl RC, 3 |
1543 | | add RC, TAB:RB->array | 1595 | | add RC, TAB:RB->array |
@@ -1572,8 +1624,13 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
1572 | | mov CFUNC:RC, CFUNC:RB->upvalue[0] | 1624 | | mov CFUNC:RC, CFUNC:RB->upvalue[0] |
1573 | | mov dword [RA-4], LJ_TFUNC | 1625 | | mov dword [RA-4], LJ_TFUNC |
1574 | | mov [RA-8], CFUNC:RC | 1626 | | mov [RA-8], CFUNC:RC |
1575 | | fldz | 1627 | if (sse) { |
1576 | | fstp qword [RA+8] | 1628 | | xorps xmm0, xmm0 |
1629 | | movsd qword [RA+8], xmm0 | ||
1630 | } else { | ||
1631 | | fldz | ||
1632 | | fstp qword [RA+8] | ||
1633 | } | ||
1577 | | mov RD, 1+3 | 1634 | | mov RD, 1+3 |
1578 | | jmp ->fff_res | 1635 | | jmp ->fff_res |
1579 | | | 1636 | | |
@@ -1804,11 +1861,25 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
1804 | | | 1861 | | |
1805 | |//-- Math library ------------------------------------------------------- | 1862 | |//-- Math library ------------------------------------------------------- |
1806 | | | 1863 | | |
1807 | |.ffunc_n math_abs | 1864 | if (sse) { |
1808 | | fabs | 1865 | |->fff_resn: |
1809 | | // fallthrough | 1866 | | fstp qword [RA-8] |
1810 | |->fff_resn: | 1867 | | jmp ->fff_res1 |
1811 | | fstp qword [RA-8] | 1868 | | |
1869 | |.ffunc_nsse math_abs | ||
1870 | | sseconst_abs xmm1, RDa | ||
1871 | | andps xmm0, xmm1 | ||
1872 | |->fff_resxmm0: | ||
1873 | | movsd qword [RA-8], xmm0 | ||
1874 | | // fallthrough | ||
1875 | } else { | ||
1876 | |.ffunc_n math_abs | ||
1877 | | fabs | ||
1878 | | // fallthrough | ||
1879 | |->fff_resxmm0: // Dummy. | ||
1880 | |->fff_resn: | ||
1881 | | fstp qword [RA-8] | ||
1882 | } | ||
1812 | |->fff_res1: | 1883 | |->fff_res1: |
1813 | | mov RD, 1+1 | 1884 | | mov RD, 1+1 |
1814 | |->fff_res: | 1885 | |->fff_res: |
@@ -1832,10 +1903,15 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
1832 | | mov RA, -8 // Results start at BASE+RA = BASE-8. | 1903 | | mov RA, -8 // Results start at BASE+RA = BASE-8. |
1833 | | jmp ->vm_return | 1904 | | jmp ->vm_return |
1834 | | | 1905 | | |
1835 | |.ffunc_n math_floor; call ->vm_floor; jmp ->fff_resn | 1906 | if (sse) { |
1836 | |.ffunc_n math_ceil; call ->vm_ceil; jmp ->fff_resn | 1907 | |.ffunc_nsse math_sqrt, sqrtsd; jmp ->fff_resxmm0 |
1837 | | | 1908 | |.ffunc_nsse math_floor; call ->vm_floor; jmp ->fff_resxmm0 |
1838 | |.ffunc_n math_sqrt; fsqrt; jmp ->fff_resn | 1909 | |.ffunc_nsse math_ceil; call ->vm_ceil; jmp ->fff_resxmm0 |
1910 | } else { | ||
1911 | |.ffunc_n math_sqrt; fsqrt; jmp ->fff_resn | ||
1912 | |.ffunc_n math_floor; call ->vm_floor; jmp ->fff_resn | ||
1913 | |.ffunc_n math_ceil; call ->vm_ceil; jmp ->fff_resn | ||
1914 | } | ||
1839 | | | 1915 | | |
1840 | |.ffunc_n math_log, fldln2; fyl2x; jmp ->fff_resn | 1916 | |.ffunc_n math_log, fldln2; fyl2x; jmp ->fff_resn |
1841 | |.ffunc_n math_log10, fldlg2; fyl2x; jmp ->fff_resn | 1917 | |.ffunc_n math_log10, fldlg2; fyl2x; jmp ->fff_resn |
@@ -1854,14 +1930,27 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
1854 | |.ffunc_n math_atan; fld1; fpatan; jmp ->fff_resn | 1930 | |.ffunc_n math_atan; fld1; fpatan; jmp ->fff_resn |
1855 | | | 1931 | | |
1856 | |.macro math_extern, func | 1932 | |.macro math_extern, func |
1857 | |.ffunc_n math_ .. func | 1933 | ||if (sse) { |
1934 | | .ffunc_nsse math_ .. func | ||
1935 | | .if not X64 | ||
1936 | | movsd FPARG1, xmm0 | ||
1937 | | .endif | ||
1938 | ||} else { | ||
1939 | | .if not X64 | ||
1940 | | .ffunc_n math_ .. func | ||
1941 | | fstp FPARG1 | ||
1942 | | .endif | ||
1943 | ||} | ||
1858 | | mov TMP1, RA | 1944 | | mov TMP1, RA |
1859 | | fstp FPARG1 | ||
1860 | | mov RB, BASE | 1945 | | mov RB, BASE |
1861 | | call extern lj_wrapper_ .. func | 1946 | | call extern lj_wrapper_ .. func |
1862 | | mov RA, TMP1 | 1947 | | mov RA, TMP1 |
1863 | | mov BASE, RB | 1948 | | mov BASE, RB |
1864 | | jmp ->fff_resn | 1949 | | .if X64 |
1950 | | jmp ->fff_resxmm0 | ||
1951 | | .else | ||
1952 | | jmp ->fff_resn | ||
1953 | | .endif | ||
1865 | |.endmacro | 1954 | |.endmacro |
1866 | | | 1955 | | |
1867 | | math_extern sinh | 1956 | | math_extern sinh |
@@ -1869,7 +1958,15 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
1869 | | math_extern tanh | 1958 | | math_extern tanh |
1870 | | | 1959 | | |
1871 | |->ff_math_deg: | 1960 | |->ff_math_deg: |
1872 | |.ffunc_n math_rad; fmul qword CFUNC:RB->upvalue[0]; jmp ->fff_resn | 1961 | if (sse) { |
1962 | |.ffunc_nsse math_rad | ||
1963 | | mulsd xmm0, qword CFUNC:RB->upvalue[0] | ||
1964 | | jmp ->fff_resxmm0 | ||
1965 | } else { | ||
1966 | |.ffunc_n math_rad | ||
1967 | | fmul qword CFUNC:RB->upvalue[0] | ||
1968 | | jmp ->fff_resn | ||
1969 | } | ||
1873 | | | 1970 | | |
1874 | |.ffunc_nn math_atan2; fpatan; jmp ->fff_resn | 1971 | |.ffunc_nn math_atan2; fpatan; jmp ->fff_resn |
1875 | |.ffunc_nnr math_ldexp; fscale; fpop1; jmp ->fff_resn | 1972 | |.ffunc_nnr math_ldexp; fscale; fpop1; jmp ->fff_resn |
@@ -1885,31 +1982,64 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
1885 | | cmp RB, 0x00200000; jb >4 | 1982 | | cmp RB, 0x00200000; jb >4 |
1886 | |1: | 1983 | |1: |
1887 | | shr RB, 21; sub RB, RC // Extract and unbias exponent. | 1984 | | shr RB, 21; sub RB, RC // Extract and unbias exponent. |
1888 | | mov TMP1, RB; fild TMP1 | 1985 | if (sse) { |
1986 | | cvtsi2sd xmm0, RB | ||
1987 | } else { | ||
1988 | | mov TMP1, RB; fild TMP1 | ||
1989 | } | ||
1889 | | mov RB, [RA-4] | 1990 | | mov RB, [RA-4] |
1890 | | and RB, 0x800fffff // Mask off exponent. | 1991 | | and RB, 0x800fffff // Mask off exponent. |
1891 | | or RB, 0x3fe00000 // Put mantissa in range [0.5,1) or 0. | 1992 | | or RB, 0x3fe00000 // Put mantissa in range [0.5,1) or 0. |
1892 | | mov [RA-4], RB | 1993 | | mov [RA-4], RB |
1893 | |2: | 1994 | |2: |
1894 | | fstp qword [RA] | 1995 | if (sse) { |
1996 | | movsd qword [RA], xmm0 | ||
1997 | } else { | ||
1998 | | fstp qword [RA] | ||
1999 | } | ||
1895 | | mov RD, 1+2 | 2000 | | mov RD, 1+2 |
1896 | | jmp ->fff_res | 2001 | | jmp ->fff_res |
1897 | |3: // Return +-0, +-Inf, NaN unmodified and an exponent of 0. | 2002 | |3: // Return +-0, +-Inf, NaN unmodified and an exponent of 0. |
1898 | | fldz; jmp <2 | 2003 | if (sse) { |
2004 | | xorps xmm0, xmm0; jmp <2 | ||
2005 | } else { | ||
2006 | | fldz; jmp <2 | ||
2007 | } | ||
1899 | |4: // Handle denormals by multiplying with 2^54 and adjusting the bias. | 2008 | |4: // Handle denormals by multiplying with 2^54 and adjusting the bias. |
1900 | | fld qword [RA] | 2009 | if (sse) { |
1901 | | mov TMP1, 0x5a800000; fmul TMP1 // x = x*2^54 | 2010 | | movsd xmm0, qword [RA] |
1902 | | fstp qword [RA-8] | 2011 | | sseconst_hi xmm1, RBa, 43500000 // 2^54. |
2012 | | mulsd xmm0, xmm1 | ||
2013 | | movsd qword [RA-8], xmm0 | ||
2014 | } else { | ||
2015 | | fld qword [RA] | ||
2016 | | mov TMP1, 0x5a800000; fmul TMP1 // x = x*2^54 | ||
2017 | | fstp qword [RA-8] | ||
2018 | } | ||
1903 | | mov RB, [RA-4]; mov RC, 1076; shl RB, 1; jmp <1 | 2019 | | mov RB, [RA-4]; mov RC, 1076; shl RB, 1; jmp <1 |
1904 | | | 2020 | | |
1905 | |.ffunc_n math_modf | 2021 | if (sse) { |
2022 | |.ffunc_nsse math_modf | ||
2023 | } else { | ||
2024 | |.ffunc_n math_modf | ||
2025 | } | ||
1906 | | mov RB, [RA+4] | 2026 | | mov RB, [RA+4] |
1907 | | shl RB, 1; cmp RB, 0xffe00000; je >4 // +-Inf? | 2027 | | shl RB, 1; cmp RB, 0xffe00000; je >4 // +-Inf? |
1908 | | fdup | 2028 | if (sse) { |
1909 | | call ->vm_trunc | 2029 | | movaps xmm4, xmm0 |
1910 | | fsub st1, st0 | 2030 | | call ->vm_trunc |
1911 | |1: | 2031 | | subsd xmm4, xmm0 |
1912 | | fstp qword [RA-8]; fstp qword [RA] | 2032 | |1: |
2033 | | movsd qword [RA-8], xmm0 | ||
2034 | | movsd qword [RA], xmm4 | ||
2035 | } else { | ||
2036 | | fdup | ||
2037 | | call ->vm_trunc | ||
2038 | | fsub st1, st0 | ||
2039 | |1: | ||
2040 | | fstp qword [RA-8] | ||
2041 | | fstp qword [RA] | ||
2042 | } | ||
1913 | | mov RC, [RA-4]; mov RB, [RA+4] | 2043 | | mov RC, [RA-4]; mov RB, [RA+4] |
1914 | | xor RC, RB; js >3 // Need to adjust sign? | 2044 | | xor RC, RB; js >3 // Need to adjust sign? |
1915 | |2: | 2045 | |2: |
@@ -1918,20 +2048,41 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
1918 | |3: | 2048 | |3: |
1919 | | xor RB, 0x80000000; mov [RA+4], RB; jmp <2 // Flip sign of fraction. | 2049 | | xor RB, 0x80000000; mov [RA+4], RB; jmp <2 // Flip sign of fraction. |
1920 | |4: | 2050 | |4: |
1921 | | fldz; fxch; jmp <1 // Return +-Inf and +-0. | 2051 | if (sse) { |
2052 | | xorps xmm4, xmm4; jmp <1 // Return +-Inf and +-0. | ||
2053 | } else { | ||
2054 | | fldz; fxch; jmp <1 // Return +-Inf and +-0. | ||
2055 | } | ||
1922 | | | 2056 | | |
1923 | |.ffunc_nnr math_fmod | 2057 | |.ffunc_nnr math_fmod |
1924 | |1: ; fprem; fnstsw ax; sahf; jp <1 | 2058 | |1: ; fprem; fnstsw ax; sahf; jp <1 |
1925 | | fpop1 | 2059 | | fpop1 |
1926 | | jmp ->fff_resn | 2060 | | jmp ->fff_resn |
1927 | | | 2061 | | |
1928 | |.ffunc_nn math_pow; call ->vm_pow; jmp ->fff_resn | 2062 | if (0 && sse) { // NYI |
2063 | |.ffunc_nnsse math_pow; call ->vm_pow; jmp ->fff_resxmm0 | ||
2064 | } else { | ||
2065 | |.ffunc_nn math_pow; call ->vm_pow; jmp ->fff_resn | ||
2066 | } | ||
1929 | | | 2067 | | |
1930 | |.macro math_minmax, name, cmovop, nocmovop | 2068 | |.macro math_minmax, name, cmovop, nocmovop, sseop |
2069 | ||if (sse) { | ||
2070 | |.ffunc_nsse name | ||
2071 | | mov RB, 2 | ||
2072 | |1: | ||
2073 | | cmp RB, RD | ||
2074 | | jae ->fff_resxmm0 | ||
2075 | | cmp dword [RA+RB*8-4], LJ_TISNUM; ja ->fff_fallback | ||
2076 | | movsd xmm1, qword [RA+RB*8-8] | ||
2077 | | sseop xmm0, xmm1 | ||
2078 | | add RB, 1 | ||
2079 | | jmp <1 | ||
2080 | ||} else { | ||
1931 | |.ffunc_n name | 2081 | |.ffunc_n name |
1932 | | mov RB, 2 | 2082 | | mov RB, 2 |
1933 | |1: | 2083 | |1: |
1934 | | cmp RB, RD; jae ->fff_resn | 2084 | | cmp RB, RD |
2085 | | jae ->fff_resn | ||
1935 | | cmp dword [RA+RB*8-4], LJ_TISNUM; ja >5 | 2086 | | cmp dword [RA+RB*8-4], LJ_TISNUM; ja >5 |
1936 | | fld qword [RA+RB*8-8] | 2087 | | fld qword [RA+RB*8-8] |
1937 | ||if (cmov) { | 2088 | ||if (cmov) { |
@@ -1943,20 +2094,26 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
1943 | ||} | 2094 | ||} |
1944 | | add RB, 1 | 2095 | | add RB, 1 |
1945 | | jmp <1 | 2096 | | jmp <1 |
2097 | ||} | ||
1946 | |.endmacro | 2098 | |.endmacro |
1947 | | | 2099 | | |
1948 | | math_minmax math_min, fcmovnbe, jz | 2100 | | math_minmax math_min, fcmovnbe, jz, minsd |
1949 | | math_minmax math_max, fcmovbe, jnz | 2101 | | math_minmax math_max, fcmovbe, jnz, maxsd |
1950 | |5: | 2102 | if (!sse) { |
1951 | | fpop; jmp ->fff_fallback | 2103 | |5: |
2104 | | fpop; jmp ->fff_fallback | ||
2105 | } | ||
1952 | | | 2106 | | |
1953 | |//-- String library ----------------------------------------------------- | 2107 | |//-- String library ----------------------------------------------------- |
1954 | | | 2108 | | |
1955 | |.ffunc_1 string_len | 2109 | |.ffunc_1 string_len |
1956 | | cmp dword [RA+4], LJ_TSTR; jne ->fff_fallback | 2110 | | cmp dword [RA+4], LJ_TSTR; jne ->fff_fallback |
1957 | | mov STR:RB, [RA] | 2111 | | mov STR:RB, [RA] |
1958 | | fild dword STR:RB->len | 2112 | if (sse) { |
1959 | | jmp ->fff_resn | 2113 | | cvtsi2sd xmm0, dword STR:RB->len; jmp ->fff_resxmm0 |
2114 | } else { | ||
2115 | | fild dword STR:RB->len; jmp ->fff_resn | ||
2116 | } | ||
1960 | | | 2117 | | |
1961 | |.ffunc string_byte // Only handle the 1-arg case here. | 2118 | |.ffunc string_byte // Only handle the 1-arg case here. |
1962 | | cmp NARGS:RC, 1+1; jne ->fff_fallback | 2119 | | cmp NARGS:RC, 1+1; jne ->fff_fallback |
@@ -1965,17 +2122,25 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
1965 | | cmp dword STR:RB->len, 1 | 2122 | | cmp dword STR:RB->len, 1 |
1966 | | jb ->fff_res0 // Return no results for empty string. | 2123 | | jb ->fff_res0 // Return no results for empty string. |
1967 | | movzx RB, byte STR:RB[1] | 2124 | | movzx RB, byte STR:RB[1] |
1968 | | mov TMP1, RB | 2125 | if (sse) { |
1969 | | fild TMP1 | 2126 | | cvtsi2sd xmm0, RB; jmp ->fff_resxmm0 |
1970 | | jmp ->fff_resn | 2127 | } else { |
2128 | | mov TMP1, RB; fild TMP1; jmp ->fff_resn | ||
2129 | } | ||
1971 | | | 2130 | | |
1972 | |.ffunc string_char // Only handle the 1-arg case here. | 2131 | |.ffunc string_char // Only handle the 1-arg case here. |
1973 | | ffgccheck | 2132 | | ffgccheck |
1974 | | cmp NARGS:RC, 1+1; jne ->fff_fallback // *Exactly* 1 arg. | 2133 | | cmp NARGS:RC, 1+1; jne ->fff_fallback // *Exactly* 1 arg. |
1975 | | cmp dword [RA+4], LJ_TISNUM; ja ->fff_fallback | 2134 | | cmp dword [RA+4], LJ_TISNUM; ja ->fff_fallback |
1976 | | fld qword [RA] | 2135 | if (sse) { |
1977 | | fistp TMP2 | 2136 | | cvtsd2si RC, qword [RA] |
1978 | | cmp TMP2, 255; ja ->fff_fallback | 2137 | | cmp RC, 255; ja ->fff_fallback |
2138 | | mov TMP2, RC | ||
2139 | } else { | ||
2140 | | fld qword [RA] | ||
2141 | | fistp TMP2 | ||
2142 | | cmp TMP2, 255; ja ->fff_fallback | ||
2143 | } | ||
1979 | | lea RC, TMP2 // Little-endian. | 2144 | | lea RC, TMP2 // Little-endian. |
1980 | | mov TMP1, RA // Save RA. | 2145 | | mov TMP1, RA // Save RA. |
1981 | | mov ARG3, 1 | 2146 | | mov ARG3, 1 |
@@ -2000,16 +2165,26 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
2000 | | cmp NARGS:RC, 1+2; jb ->fff_fallback | 2165 | | cmp NARGS:RC, 1+2; jb ->fff_fallback |
2001 | | jna >1 | 2166 | | jna >1 |
2002 | | cmp dword [RA+20], LJ_TISNUM; ja ->fff_fallback | 2167 | | cmp dword [RA+20], LJ_TISNUM; ja ->fff_fallback |
2003 | | fld qword [RA+16] | 2168 | if (sse) { |
2004 | | fistp TMP2 | 2169 | | cvtsd2si RB, qword [RA+16] |
2170 | | mov TMP2, RB | ||
2171 | } else { | ||
2172 | | fld qword [RA+16] | ||
2173 | | fistp TMP2 | ||
2174 | } | ||
2005 | |1: | 2175 | |1: |
2006 | | cmp dword [RA+4], LJ_TSTR; jne ->fff_fallback | 2176 | | cmp dword [RA+4], LJ_TSTR; jne ->fff_fallback |
2007 | | cmp dword [RA+12], LJ_TISNUM; ja ->fff_fallback | 2177 | | cmp dword [RA+12], LJ_TISNUM; ja ->fff_fallback |
2008 | | mov STR:RB, [RA] | 2178 | | mov STR:RB, [RA] |
2009 | | mov ARG2, STR:RB | 2179 | | mov ARG2, STR:RB |
2010 | | mov RB, STR:RB->len | 2180 | | mov RB, STR:RB->len |
2011 | | fld qword [RA+8] | 2181 | if (sse) { |
2012 | | fistp ARG3 | 2182 | | cvtsd2si RC, qword [RA+8] |
2183 | | mov ARG3, RC | ||
2184 | } else { | ||
2185 | | fld qword [RA+8] | ||
2186 | | fistp ARG3 | ||
2187 | } | ||
2013 | | mov RC, TMP2 | 2188 | | mov RC, TMP2 |
2014 | | cmp RB, RC // len < end? (unsigned compare) | 2189 | | cmp RB, RC // len < end? (unsigned compare) |
2015 | | jb >5 | 2190 | | jb >5 |
@@ -2055,9 +2230,13 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
2055 | | cmp dword [RA+4], LJ_TSTR; jne ->fff_fallback | 2230 | | cmp dword [RA+4], LJ_TSTR; jne ->fff_fallback |
2056 | | cmp dword [RA+12], LJ_TISNUM; ja ->fff_fallback | 2231 | | cmp dword [RA+12], LJ_TISNUM; ja ->fff_fallback |
2057 | | mov STR:RB, [RA] | 2232 | | mov STR:RB, [RA] |
2058 | | fld qword [RA+8] | 2233 | if (sse) { |
2059 | | fistp TMP2 | 2234 | | cvtsd2si RC, qword [RA+8] |
2060 | | mov RC, TMP2 | 2235 | } else { |
2236 | | fld qword [RA+8] | ||
2237 | | fistp TMP2 | ||
2238 | | mov RC, TMP2 | ||
2239 | } | ||
2061 | | test RC, RC | 2240 | | test RC, RC |
2062 | | jle ->fff_emptystr // Count <= 0? (or non-int) | 2241 | | jle ->fff_emptystr // Count <= 0? (or non-int) |
2063 | | cmp dword STR:RB->len, 1 | 2242 | | cmp dword STR:RB->len, 1 |
@@ -2140,43 +2319,73 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
2140 | | mov TAB:FCARG1, [RA] // Caveat: FCARG1 == RA | 2319 | | mov TAB:FCARG1, [RA] // Caveat: FCARG1 == RA |
2141 | | call extern lj_tab_len@4 // LJ_FASTCALL (GCtab *t) | 2320 | | call extern lj_tab_len@4 // LJ_FASTCALL (GCtab *t) |
2142 | | // Length of table returned in eax (RC). | 2321 | | // Length of table returned in eax (RC). |
2143 | | mov ARG1, RC | ||
2144 | | mov RA, RB // Restore RA and BASE. | 2322 | | mov RA, RB // Restore RA and BASE. |
2145 | | mov BASE, TMP1 | 2323 | | mov BASE, TMP1 |
2146 | | fild ARG1 | 2324 | if (sse) { |
2147 | | jmp ->fff_resn | 2325 | | cvtsi2sd xmm0, RC; jmp ->fff_resxmm0 |
2326 | } else { | ||
2327 | | mov ARG1, RC; fild ARG1; jmp ->fff_resn | ||
2328 | } | ||
2148 | | | 2329 | | |
2149 | |//-- Bit library -------------------------------------------------------- | 2330 | |//-- Bit library -------------------------------------------------------- |
2150 | | | 2331 | | |
2151 | |.define TOBIT_BIAS, 0x59c00000 // 2^52 + 2^51 (float, not double!). | 2332 | |.define TOBIT_BIAS, 0x59c00000 // 2^52 + 2^51 (float, not double!). |
2152 | | | 2333 | | |
2153 | |.ffunc_n bit_tobit | 2334 | if (sse) { |
2154 | | mov TMP1, TOBIT_BIAS | 2335 | |.ffunc_nsse bit_tobit |
2155 | | fadd TMP1 | 2336 | | sseconst_tobit xmm1, RBa |
2156 | | fstp FPARG1 // 64 bit FP store. | 2337 | | addsd xmm0, xmm1 |
2157 | | fild ARG1 // 32 bit integer load (s2lfwd ok). | 2338 | | movd RB, xmm0 |
2158 | | jmp ->fff_resn | 2339 | | cvtsi2sd xmm0, RB |
2340 | | jmp ->fff_resxmm0 | ||
2341 | } else { | ||
2342 | |.ffunc_n bit_tobit | ||
2343 | | mov TMP1, TOBIT_BIAS | ||
2344 | | fadd TMP1 | ||
2345 | | fstp FPARG1 // 64 bit FP store. | ||
2346 | | fild ARG1 // 32 bit integer load (s2lfwd ok). | ||
2347 | | jmp ->fff_resn | ||
2348 | } | ||
2159 | | | 2349 | | |
2160 | |.macro .ffunc_bit, name | 2350 | |.macro .ffunc_bit, name |
2351 | ||if (sse) { | ||
2352 | | .ffunc_nsse name | ||
2353 | | sseconst_tobit xmm1, RBa | ||
2354 | | addsd xmm0, xmm1 | ||
2355 | | movd RB, xmm0 | ||
2356 | ||} else { | ||
2161 | | .ffunc_n name | 2357 | | .ffunc_n name |
2162 | | mov TMP1, TOBIT_BIAS | 2358 | | mov TMP1, TOBIT_BIAS |
2163 | | fadd TMP1 | 2359 | | fadd TMP1 |
2164 | | fstp FPARG1 | 2360 | | fstp FPARG1 |
2165 | | mov RB, ARG1 | 2361 | | mov RB, ARG1 |
2362 | ||} | ||
2166 | |.endmacro | 2363 | |.endmacro |
2167 | | | 2364 | | |
2168 | |.macro .ffunc_bit_op, name, ins | 2365 | |.macro .ffunc_bit_op, name, ins |
2169 | | .ffunc_bit name | 2366 | | .ffunc_bit name |
2170 | | mov NRESULTS, NARGS:RC // Save for fallback. | 2367 | | mov TMP2, NARGS:RC // Save for fallback. |
2171 | | lea RC, [RA+NARGS:RC*8-16] | 2368 | | lea RC, [RA+NARGS:RC*8-16] |
2369 | ||if (sse) { | ||
2370 | | mov TMP1, BASE // Need BASE as a scratch register. | ||
2371 | ||} | ||
2172 | |1: | 2372 | |1: |
2173 | | cmp RC, RA | 2373 | | cmp RC, RA |
2174 | | jbe ->fff_resbit | 2374 | | jbe ->fff_resbit_op |
2175 | | cmp dword [RC+4], LJ_TISNUM; ja ->fff_fallback_bit_op | 2375 | | cmp dword [RC+4], LJ_TISNUM; ja ->fff_fallback_bit_op |
2376 | ||if (sse) { | ||
2377 | | movsd xmm0, qword [RC] | ||
2378 | | addsd xmm0, xmm1 | ||
2379 | | movd BASE, xmm0 | ||
2380 | | ins RB, BASE | ||
2381 | ||} else { | ||
2382 | |.if not X64 | ||
2176 | | fld qword [RC] | 2383 | | fld qword [RC] |
2177 | | fadd TMP1 | 2384 | | fadd TMP1 |
2178 | | fstp FPARG1 | 2385 | | fstp FPARG1 |
2179 | | ins RB, ARG1 | 2386 | | ins RB, ARG1 |
2387 | |.endif | ||
2388 | ||} | ||
2180 | | sub RC, 8 | 2389 | | sub RC, 8 |
2181 | | jmp <1 | 2390 | | jmp <1 |
2182 | |.endmacro | 2391 | |.endmacro |
@@ -2191,16 +2400,39 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
2191 | | | 2400 | | |
2192 | |.ffunc_bit bit_bnot | 2401 | |.ffunc_bit bit_bnot |
2193 | | not RB | 2402 | | not RB |
2194 | |->fff_resbit: | 2403 | if (sse) { |
2195 | | mov ARG1, RB | 2404 | |->fff_resbit: |
2196 | | fild ARG1 | 2405 | | cvtsi2sd xmm0, RB |
2197 | | jmp ->fff_resn | 2406 | | jmp ->fff_resxmm0 |
2407 | |->fff_resbit_op: | ||
2408 | | cvtsi2sd xmm0, RB | ||
2409 | | mov BASE, TMP1 | ||
2410 | | jmp ->fff_resxmm0 | ||
2411 | } else { | ||
2412 | |->fff_resbit: | ||
2413 | |->fff_resbit_op: | ||
2414 | | mov ARG1, RB | ||
2415 | | fild ARG1 | ||
2416 | | jmp ->fff_resn | ||
2417 | } | ||
2198 | | | 2418 | | |
2199 | |->fff_fallback_bit_op: | 2419 | |->fff_fallback_bit_op: |
2200 | | mov NARGS:RC, NRESULTS // Restore for fallback | 2420 | if (sse) { |
2421 | | mov BASE, TMP1 | ||
2422 | } | ||
2423 | | mov NARGS:RC, TMP2 // Restore for fallback | ||
2201 | | jmp ->fff_fallback | 2424 | | jmp ->fff_fallback |
2202 | | | 2425 | | |
2203 | |.macro .ffunc_bit_sh, name, ins | 2426 | |.macro .ffunc_bit_sh, name, ins |
2427 | ||if (sse) { | ||
2428 | | .ffunc_nnsse name | ||
2429 | | sseconst_tobit xmm2, RBa | ||
2430 | | addsd xmm0, xmm2 | ||
2431 | | addsd xmm1, xmm2 | ||
2432 | | mov RC, RA // Assumes RA is ecx. | ||
2433 | | movd RB, xmm0 | ||
2434 | | movd RA, xmm1 | ||
2435 | ||} else { | ||
2204 | | .ffunc_nn name | 2436 | | .ffunc_nn name |
2205 | | mov TMP1, TOBIT_BIAS | 2437 | | mov TMP1, TOBIT_BIAS |
2206 | | fadd TMP1 | 2438 | | fadd TMP1 |
@@ -2210,6 +2442,7 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
2210 | | mov RC, RA // Assumes RA is ecx. | 2442 | | mov RC, RA // Assumes RA is ecx. |
2211 | | mov RA, ARG3 | 2443 | | mov RA, ARG3 |
2212 | | mov RB, ARG1 | 2444 | | mov RB, ARG1 |
2445 | ||} | ||
2213 | | ins RB, cl | 2446 | | ins RB, cl |
2214 | | mov RA, RC | 2447 | | mov RA, RC |
2215 | | jmp ->fff_resbit | 2448 | | jmp ->fff_resbit |
@@ -2461,8 +2694,10 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
2461 | |//----------------------------------------------------------------------- | 2694 | |//----------------------------------------------------------------------- |
2462 | | | 2695 | | |
2463 | |// FP value rounding. Called by math.floor/math.ceil fast functions | 2696 | |// FP value rounding. Called by math.floor/math.ceil fast functions |
2464 | |// and from JIT code. Arg/ret on x87 stack. No int/xmm registers modified. | 2697 | |// and from JIT code. |
2465 | |.macro vm_round, mode1, mode2 | 2698 | | |
2699 | |// x87 variant: Arg/ret on x87 stack. No int/xmm registers modified. | ||
2700 | |.macro vm_round_x87, mode1, mode2 | ||
2466 | | fnstcw word [esp+4] // Caveat: overwrites ARG1 and ARG2. | 2701 | | fnstcw word [esp+4] // Caveat: overwrites ARG1 and ARG2. |
2467 | | mov [esp+8], eax | 2702 | | mov [esp+8], eax |
2468 | | mov ax, mode1 | 2703 | | mov ax, mode1 |
@@ -2478,14 +2713,55 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
2478 | | ret | 2713 | | ret |
2479 | |.endmacro | 2714 | |.endmacro |
2480 | | | 2715 | | |
2481 | |->vm_floor: | 2716 | |// SSE variant: arg/ret is xmm0. xmm0-xmm3 and RD (eax) modified. mode: 0 = floor, 1 = ceil, 2 = trunc. |
2482 | | vm_round 0x0400, 0xf7ff | 2717 | |.macro vm_round_sse, mode |
2718 | | sseconst_abs xmm2, RDa | ||
2719 | | sseconst_2p52 xmm3, RDa | ||
2720 | | movaps xmm1, xmm0 | ||
2721 | | andpd xmm1, xmm2 // |x| | ||
2722 | | ucomisd xmm3, xmm1 // No rounding needed if 2^52 <= |x| (jbe is also taken for unordered, i.e. NaN). | ||
2723 | | jbe >1 | ||
2724 | | andnpd xmm2, xmm0 // Isolate sign bit. | ||
2725 | |.if mode == 2 // trunc(x)? | ||
2726 | | movaps xmm0, xmm1 | ||
2727 | | addsd xmm1, xmm3 // (|x| + 2^52) - 2^52 | ||
2728 | | subsd xmm1, xmm3 | ||
2729 | | sseconst_1 xmm3, RDa | ||
2730 | | cmpsd xmm0, xmm1, 1 // |x| < result? | ||
2731 | | andpd xmm0, xmm3 | ||
2732 | | subsd xmm1, xmm0 // If yes, subtract 1. | ||
2733 | | orpd xmm1, xmm2 // Merge sign bit back in. | ||
2734 | |.else | ||
2735 | | addsd xmm1, xmm3 // (|x| + 2^52) - 2^52 | ||
2736 | | subsd xmm1, xmm3 | ||
2737 | | orpd xmm1, xmm2 // Merge sign bit back in. | ||
2738 | | .if mode == 1 // ceil(x)? | ||
2739 | | sseconst_m1 xmm2, RDa // Must subtract -1 to preserve -0. | ||
2740 | | cmpsd xmm0, xmm1, 6 // x > result? | ||
2741 | | .else // floor(x)? | ||
2742 | | sseconst_1 xmm2, RDa | ||
2743 | | cmpsd xmm0, xmm1, 1 // x < result? | ||
2744 | | .endif | ||
2745 | | andpd xmm0, xmm2 | ||
2746 | | subsd xmm1, xmm0 // If yes, subtract +-1. | ||
2747 | |.endif | ||
2748 | | movaps xmm0, xmm1 | ||
2749 | |1: | ||
2750 | | ret | ||
2751 | |.endmacro | ||
2483 | | | 2752 | | |
2484 | |->vm_ceil: | 2753 | |.macro vm_round, name, ssemode, mode1, mode2 |
2485 | | vm_round 0x0800, 0xfbff | 2754 | |->name: |
2755 | ||if (!sse) { | ||
2756 | | vm_round_x87 mode1, mode2 | ||
2757 | ||} | ||
2758 | |->name .. _sse: | ||
2759 | | vm_round_sse ssemode | ||
2760 | |.endmacro | ||
2486 | | | 2761 | | |
2487 | |->vm_trunc: | 2762 | | vm_round vm_floor, 0, 0x0400, 0xf7ff |
2488 | | vm_round 0x0c00, 0xffff | 2763 | | vm_round vm_ceil, 1, 0x0800, 0xfbff |
2764 | | vm_round vm_trunc, 2, 0x0c00, 0xffff | ||
2489 | | | 2765 | | |
2490 | |// FP modulo x%y. Called by BC_MOD* and vm_arith. | 2766 | |// FP modulo x%y. Called by BC_MOD* and vm_arith. |
2491 | |->vm_mod: | 2767 | |->vm_mod: |
@@ -2532,8 +2808,8 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
2532 | | fldcw word [esp+4] | 2808 | | fldcw word [esp+4] |
2533 | | fmulp st1 | 2809 | | fmulp st1 |
2534 | | fsubp st1 | 2810 | | fsubp st1 |
2811 | | ret | ||
2535 | } | 2812 | } |
2536 | | ret | ||
2537 | | | 2813 | | |
2538 | |// FP exponentiation e^x and 2^x. Called by math.exp fast function and | 2814 | |// FP exponentiation e^x and 2^x. Called by math.exp fast function and |
2539 | |// from JIT code. Arg/ret on x87 stack. No int/xmm regs modified. | 2815 | |// from JIT code. Arg/ret on x87 stack. No int/xmm regs modified. |
@@ -2662,19 +2938,51 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
2662 | |// Callable from C: double lj_vm_foldfpm(double x, int fpm) | 2938 | |// Callable from C: double lj_vm_foldfpm(double x, int fpm) |
2663 | |// Computes fpm(x) for extended math functions. ORDER FPM. | 2939 | |// Computes fpm(x) for extended math functions. ORDER FPM. |
2664 | |->vm_foldfpm: | 2940 | |->vm_foldfpm: |
2665 | | mov eax, [esp+12] | 2941 | if (sse) { |
2666 | | fld qword [esp+4] | 2942 | |.if X64WIN |
2667 | | cmp eax, 1; jb ->vm_floor; je ->vm_ceil | 2943 | | .define fpmop, CARG2d |
2668 | | cmp eax, 3; jb ->vm_trunc; ja >1 | 2944 | |.elif X64 |
2669 | | fsqrt; ret | 2945 | | .define fpmop, CARG1d |
2670 | |1: ; cmp eax, 5; jb ->vm_exp; je ->vm_exp2 | 2946 | |.else |
2671 | | cmp eax, 7; je >1; ja >2 | 2947 | | .define fpmop, eax |
2948 | | mov fpmop, [esp+12] | ||
2949 | | movsd xmm0, qword [esp+4] | ||
2950 | |.endif | ||
2951 | |.if X64 | ||
2952 | | cmp fpmop, 1; jb ->vm_floor; je ->vm_ceil | ||
2953 | | cmp fpmop, 3; jb ->vm_trunc; ja >2 | ||
2954 | | sqrtsd xmm0, xmm0; ret | ||
2955 | |.else | ||
2956 | | cmp fpmop, 1; je >1; ja >2 | ||
2957 | | call ->vm_floor; jmp >7 | ||
2958 | |1: ; call ->vm_ceil; jmp >7 | ||
2959 | |2: ; cmp fpmop, 3; je >1; ja >2 | ||
2960 | | call ->vm_trunc; jmp >7 | ||
2961 | |1: | ||
2962 | | sqrtsd xmm0, xmm0 | ||
2963 | |7: | ||
2964 | | movsd qword [esp+4], xmm0 // Overwrite callee-owned args. | ||
2965 | | fld qword [esp+4] | ||
2966 | | ret | ||
2967 | |.endif | ||
2968 | |2: | ||
2969 | | fld qword [esp+4] | ||
2970 | } else { | ||
2971 | | mov fpmop, [esp+12] | ||
2972 | | fld qword [esp+4] | ||
2973 | | cmp fpmop, 1; jb ->vm_floor; je ->vm_ceil | ||
2974 | | cmp fpmop, 3; jb ->vm_trunc; ja >2 | ||
2975 | | fsqrt; ret | ||
2976 | |2: | ||
2977 | } | ||
2978 | | cmp fpmop, 5; jb ->vm_exp; je ->vm_exp2 | ||
2979 | | cmp fpmop, 7; je >1; ja >2 | ||
2672 | | fldln2; fxch; fyl2x; ret | 2980 | | fldln2; fxch; fyl2x; ret |
2673 | |1: ; fld1; fxch; fyl2x; ret | 2981 | |1: ; fld1; fxch; fyl2x; ret |
2674 | |2: ; cmp eax, 9; je >1; ja >2 | 2982 | |2: ; cmp fpmop, 9; je >1; ja >2 |
2675 | | fldlg2; fxch; fyl2x; ret | 2983 | | fldlg2; fxch; fyl2x; ret |
2676 | |1: ; fsin; ret | 2984 | |1: ; fsin; ret |
2677 | |2: ; cmp eax, 11; je >1; ja >9 | 2985 | |2: ; cmp fpmop, 11; je >1; ja >9 |
2678 | | fcos; ret | 2986 | | fcos; ret |
2679 | |1: ; fptan; fpop; ret | 2987 | |1: ; fptan; fpop; ret |
2680 | |9: ; int3 // Bad fpm. | 2988 | |9: ; int3 // Bad fpm. |
@@ -3198,14 +3506,25 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov, int sse) | |||
3198 | break; | 3506 | break; |
3199 | case BC_KSHORT: | 3507 | case BC_KSHORT: |
3200 | | ins_AD // RA = dst, RD = signed int16 literal | 3508 | | ins_AD // RA = dst, RD = signed int16 literal |
3201 | | fild PC_RD // Refetch signed RD from instruction. | 3509 | if (sse) { |
3202 | | fstp qword [BASE+RA*8] | 3510 | | movsx RD, RDW // Sign-extend literal. |
3511 | | cvtsi2sd xmm0, RD | ||
3512 | | movsd qword [BASE+RA*8], xmm0 | ||
3513 | } else { | ||
3514 | | fild PC_RD // Refetch signed RD from instruction. | ||
3515 | | fstp qword [BASE+RA*8] | ||
3516 | } | ||
3203 | | ins_next | 3517 | | ins_next |
3204 | break; | 3518 | break; |
3205 | case BC_KNUM: | 3519 | case BC_KNUM: |
3206 | | ins_AD // RA = dst, RD = num const | 3520 | | ins_AD // RA = dst, RD = num const |
3207 | | fld qword [KBASE+RD*8] | 3521 | if (sse) { |
3208 | | fstp qword [BASE+RA*8] | 3522 | | movsd xmm0, qword [KBASE+RD*8] |
3523 | | movsd qword [BASE+RA*8], xmm0 | ||
3524 | } else { | ||
3525 | | fld qword [KBASE+RD*8] | ||
3526 | | fstp qword [BASE+RA*8] | ||
3527 | } | ||
3209 | | ins_next | 3528 | | ins_next |
3210 | break; | 3529 | break; |
3211 | case BC_KPRI: | 3530 | case BC_KPRI: |
@@ -3307,10 +3626,18 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov, int sse) | |||
3307 | case BC_USETN: | 3626 | case BC_USETN: |
3308 | | ins_AD // RA = upvalue #, RD = num const | 3627 | | ins_AD // RA = upvalue #, RD = num const |
3309 | | mov LFUNC:RB, [BASE-8] | 3628 | | mov LFUNC:RB, [BASE-8] |
3310 | | fld qword [KBASE+RD*8] | 3629 | if (sse) { |
3630 | | movsd xmm0, qword [KBASE+RD*8] | ||
3631 | } else { | ||
3632 | | fld qword [KBASE+RD*8] | ||
3633 | } | ||
3311 | | mov UPVAL:RB, [LFUNC:RB+RA*4+offsetof(GCfuncL, uvptr)] | 3634 | | mov UPVAL:RB, [LFUNC:RB+RA*4+offsetof(GCfuncL, uvptr)] |
3312 | | mov RA, UPVAL:RB->v | 3635 | | mov RA, UPVAL:RB->v |
3313 | | fstp qword [RA] | 3636 | if (sse) { |
3637 | | movsd qword [RA], xmm0 | ||
3638 | } else { | ||
3639 | | fstp qword [RA] | ||
3640 | } | ||
3314 | | ins_next | 3641 | | ins_next |
3315 | break; | 3642 | break; |
3316 | case BC_USETP: | 3643 | case BC_USETP: |
@@ -3438,11 +3765,20 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov, int sse) | |||
3438 | | | 3765 | | |
3439 | | // Integer key? Convert number to int and back and compare. | 3766 | | // Integer key? Convert number to int and back and compare. |
3440 | | checknum RC, >5 | 3767 | | checknum RC, >5 |
3441 | | fld qword [BASE+RC*8] | 3768 | if (sse) { |
3442 | | fist ARG1 | 3769 | | movsd xmm0, qword [BASE+RC*8] |
3443 | | fild ARG1 | 3770 | | cvtsd2si RC, xmm0 |
3444 | | fcomparepp // eax (RC) modified! | 3771 | | cvtsi2sd xmm1, RC |
3445 | | mov RC, ARG1 | 3772 | | ucomisd xmm0, xmm1 |
3773 | } else { | ||
3774 | |.if not X64 | ||
3775 | | fld qword [BASE+RC*8] | ||
3776 | | fist ARG1 | ||
3777 | | fild ARG1 | ||
3778 | | fcomparepp // eax (RC) modified! | ||
3779 | | mov RC, ARG1 | ||
3780 | |.endif | ||
3781 | } | ||
3446 | | jne ->vmeta_tgetv // Generic numeric key? Use fallback. | 3782 | | jne ->vmeta_tgetv // Generic numeric key? Use fallback. |
3447 | | cmp RC, TAB:RB->asize // Takes care of unordered, too. | 3783 | | cmp RC, TAB:RB->asize // Takes care of unordered, too. |
3448 | | jae ->vmeta_tgetv // Not in array part? Use fallback. | 3784 | | jae ->vmeta_tgetv // Not in array part? Use fallback. |
@@ -3551,11 +3887,20 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov, int sse) | |||
3551 | | | 3887 | | |
3552 | | // Integer key? Convert number to int and back and compare. | 3888 | | // Integer key? Convert number to int and back and compare. |
3553 | | checknum RC, >5 | 3889 | | checknum RC, >5 |
3554 | | fld qword [BASE+RC*8] | 3890 | if (sse) { |
3555 | | fist ARG1 | 3891 | | movsd xmm0, qword [BASE+RC*8] |
3556 | | fild ARG1 | 3892 | | cvtsd2si RC, xmm0 |
3557 | | fcomparepp // eax (RC) modified! | 3893 | | cvtsi2sd xmm1, RC |
3558 | | mov RC, ARG1 | 3894 | | ucomisd xmm0, xmm1 |
3895 | } else { | ||
3896 | |.if not X64 | ||
3897 | | fld qword [BASE+RC*8] | ||
3898 | | fist ARG1 | ||
3899 | | fild ARG1 | ||
3900 | | fcomparepp // eax (RC) modified! | ||
3901 | | mov RC, ARG1 | ||
3902 | |.endif | ||
3903 | } | ||
3559 | | jne ->vmeta_tsetv // Generic numeric key? Use fallback. | 3904 | | jne ->vmeta_tsetv // Generic numeric key? Use fallback. |
3560 | | cmp RC, TAB:RB->asize // Takes care of unordered, too. | 3905 | | cmp RC, TAB:RB->asize // Takes care of unordered, too. |
3561 | | jae ->vmeta_tsetv | 3906 | | jae ->vmeta_tsetv |
@@ -3626,11 +3971,11 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov, int sse) | |||
3626 | |4: // Check for __newindex if previous value is nil. | 3971 | |4: // Check for __newindex if previous value is nil. |
3627 | | cmp dword TAB:RB->metatable, 0 // Shouldn't overwrite RA for fastpath. | 3972 | | cmp dword TAB:RB->metatable, 0 // Shouldn't overwrite RA for fastpath. |
3628 | | jz <2 | 3973 | | jz <2 |
3629 | | mov ARG1, RA // Save RA. | 3974 | | mov TMP1, RA // Save RA. |
3630 | | mov TAB:RA, TAB:RB->metatable | 3975 | | mov TAB:RA, TAB:RB->metatable |
3631 | | test byte TAB:RA->nomm, 1<<MM_newindex | 3976 | | test byte TAB:RA->nomm, 1<<MM_newindex |
3632 | | jz ->vmeta_tsets // 'no __newindex' flag NOT set: check. | 3977 | | jz ->vmeta_tsets // 'no __newindex' flag NOT set: check. |
3633 | | mov RA, ARG1 // Restore RA. | 3978 | | mov RA, TMP1 // Restore RA. |
3634 | | jmp <2 | 3979 | | jmp <2 |
3635 | | | 3980 | | |
3636 | |5: // Follow hash chain. | 3981 | |5: // Follow hash chain. |
@@ -3705,8 +4050,14 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov, int sse) | |||
3705 | case BC_TSETM: | 4050 | case BC_TSETM: |
3706 | | ins_AD // RA = base (table at base-1), RD = num const (start index) | 4051 | | ins_AD // RA = base (table at base-1), RD = num const (start index) |
3707 | | mov TMP1, KBASE // Need one more free register. | 4052 | | mov TMP1, KBASE // Need one more free register. |
3708 | | fld qword [KBASE+RD*8] | 4053 | if (sse) { |
3709 | | fistp ARG4 // Const is guaranteed to be an int. | 4054 | | movsd xmm0, qword [KBASE+RD*8] |
4055 | } else { | ||
4056 | |.if not X64 | ||
4057 | | fld qword [KBASE+RD*8] | ||
4058 | | fistp ARG4 // Const is guaranteed to be an int. | ||
4059 | |.endif | ||
4060 | } | ||
3710 | |1: | 4061 | |1: |
3711 | | lea RA, [BASE+RA*8] | 4062 | | lea RA, [BASE+RA*8] |
3712 | | mov TAB:RB, [RA-8] // Guaranteed to be a table. | 4063 | | mov TAB:RB, [RA-8] // Guaranteed to be a table. |
@@ -3714,7 +4065,13 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov, int sse) | |||
3714 | | jnz >7 | 4065 | | jnz >7 |
3715 | |2: | 4066 | |2: |
3716 | | mov RD, NRESULTS | 4067 | | mov RD, NRESULTS |
3717 | | mov KBASE, ARG4 | 4068 | if (sse) { |
4069 | | cvtsd2si KBASE, xmm0 // Const is guaranteed to be an int. | ||
4070 | } else { | ||
4071 | |.if not X64 | ||
4072 | | mov KBASE, ARG4 | ||
4073 | |.endif | ||
4074 | } | ||
3718 | | sub RD, 1 | 4075 | | sub RD, 1 |
3719 | | jz >4 // Nothing to copy? | 4076 | | jz >4 // Nothing to copy? |
3720 | | add RD, KBASE // Compute needed size. | 4077 | | add RD, KBASE // Compute needed size. |
@@ -4034,20 +4391,37 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov, int sse) | |||
4034 | if (!vk) { | 4391 | if (!vk) { |
4035 | | cmp RB, LJ_TISNUM; ja ->vmeta_for | 4392 | | cmp RB, LJ_TISNUM; ja ->vmeta_for |
4036 | } | 4393 | } |
4037 | | fld FOR_STOP | 4394 | if (sse) { |
4038 | | fld FOR_IDX | 4395 | | movsd xmm0, FOR_IDX |
4039 | if (vk) { | 4396 | | movsd xmm1, FOR_STOP |
4040 | | fadd FOR_STEP // nidx = idx + step | 4397 | if (vk) { |
4041 | | fst FOR_IDX | 4398 | | addsd xmm0, FOR_STEP |
4042 | } | 4399 | | movsd FOR_IDX, xmm0 |
4043 | | fst FOR_EXT | 4400 | | test RB, RB; js >3 |
4044 | | test RB, RB // Swap lim/(n)idx if step non-negative. | 4401 | } else { |
4045 | | js >1 | 4402 | | jl >3 |
4046 | | fxch | 4403 | } |
4047 | |1: | 4404 | | ucomisd xmm1, xmm0 |
4048 | | fcomparepp // eax (RD) modified if !cmov. | 4405 | |1: |
4049 | if (!cmov) { | 4406 | | movsd FOR_EXT, xmm0 |
4050 | | movzx RD, PC_RD // Need to reload RD. | 4407 | } else { |
4408 | | fld FOR_STOP | ||
4409 | | fld FOR_IDX | ||
4410 | if (vk) { | ||
4411 | | fadd FOR_STEP // nidx = idx + step | ||
4412 | | fst FOR_IDX | ||
4413 | | fst FOR_EXT | ||
4414 | | test RB, RB; js >1 | ||
4415 | } else { | ||
4416 | | fst FOR_EXT | ||
4417 | | jl >1 | ||
4418 | } | ||
4419 | | fxch // Swap lim/(n)idx if step non-negative. | ||
4420 | |1: | ||
4421 | | fcomparepp // eax (RD) modified if !cmov. | ||
4422 | if (!cmov) { | ||
4423 | | movzx RD, PC_RD // Need to reload RD. | ||
4424 | } | ||
4051 | } | 4425 | } |
4052 | if (op == BC_FORI) { | 4426 | if (op == BC_FORI) { |
4053 | | jnb >2 | 4427 | | jnb >2 |
@@ -4064,6 +4438,11 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov, int sse) | |||
4064 | } | 4438 | } |
4065 | |2: | 4439 | |2: |
4066 | | ins_next | 4440 | | ins_next |
4441 | if (sse) { | ||
4442 | |3: // Invert comparison if step is negative. | ||
4443 | | ucomisd xmm0, xmm1 | ||
4444 | | jmp <1 | ||
4445 | } | ||
4067 | break; | 4446 | break; |
4068 | 4447 | ||
4069 | case BC_ITERL: | 4448 | case BC_ITERL: |