diff options
Diffstat (limited to 'src/buildvm_x86.dasc')
-rw-r--r-- | src/buildvm_x86.dasc | 353 |
1 files changed, 278 insertions, 75 deletions
diff --git a/src/buildvm_x86.dasc b/src/buildvm_x86.dasc index 960afa1d..b220c58f 100644 --- a/src/buildvm_x86.dasc +++ b/src/buildvm_x86.dasc | |||
@@ -322,6 +322,40 @@ | |||
322 | |.macro fdup; fld st0; .endmacro | 322 | |.macro fdup; fld st0; .endmacro |
323 | |.macro fpop1; fstp st1; .endmacro | 323 | |.macro fpop1; fstp st1; .endmacro |
324 | | | 324 | | |
325 | |// Synthesize SSE FP constants. | ||
326 | |.macro sseconst_sign, reg, tmp // Synthesize sign mask (only bit 63 set) in the low qword of reg; tmp is a scratch GPR that gets overwritten. | ||
327 | |.if X64 | ||
328 | | mov64 tmp, U64x(80000000,00000000); movd reg, tmp | ||
329 | |.else | ||
330 | | mov tmp, 0x80000000; movd reg, tmp; pshufd reg, reg, 0x51 // Load into reg itself (not a hard-coded xmm1), then shuffle dword 0 up into dword 1. | ||
331 | |.endif | ||
332 | |.endmacro | ||
333 | | | ||
334 | |.macro sseconst_abs, reg, tmp // Synthesize abs mask (every bit set except the sign bit, 0x7fffffff_ffffffff). | ||
335 | |.if X64 | ||
336 | | mov64 tmp, U64x(7fffffff,ffffffff); movd reg, tmp | ||
337 | |.else | ||
338 | | pxor reg, reg; pcmpeqd reg, reg; psrlq reg, 1 // Set all bits, then shift each qword right by one to clear the sign bit; tmp is unused in this branch. | ||
339 | |.endif | ||
340 | |.endmacro | ||
341 | | | ||
342 | |.macro sseconst_1, reg, tmp // Synthesize 1.0 (bit pattern 0x3ff00000_00000000) in reg; tmp is overwritten. | ||
343 | |.if X64 | ||
344 | | mov64 tmp, U64x(3ff00000,00000000) | ||
345 | | movd reg, tmp | ||
346 | |.else | ||
347 | | mov tmp, 0x3ff00000; movd reg, tmp; pshufd reg, reg, 0x51 // Shuffle moves the constant from dword 0 up into dword 1 (the high half of the double). | ||
348 | |.endif | ||
349 | |.endmacro | ||
350 | | | ||
351 | |.macro sseconst_2p52, reg, tmp // Synthesize 2^52 (bit pattern 0x43300000_00000000) in reg; tmp is overwritten. | ||
352 | |.if X64 | ||
353 | | mov64 tmp, U64x(43300000,00000000); movd reg, tmp | ||
354 | |.else | ||
355 | | mov tmp, 0x43300000; movd reg, tmp; pshufd reg, reg, 0x51 // Shuffle moves the constant from dword 0 up into dword 1 (the high half of the double). | ||
356 | |.endif | ||
357 | |.endmacro | ||
358 | | | ||
325 | |// Move table write barrier back. Overwrites reg. | 359 | |// Move table write barrier back. Overwrites reg. |
326 | |.macro barrierback, tab, reg | 360 | |.macro barrierback, tab, reg |
327 | | and byte tab->marked, cast_byte(~LJ_GC_BLACK) // black2gray(tab) | 361 | | and byte tab->marked, cast_byte(~LJ_GC_BLACK) // black2gray(tab) |
@@ -334,7 +368,7 @@ | |||
334 | 368 | ||
335 | /* Generate subroutines used by opcodes and other parts of the VM. */ | 369 | /* Generate subroutines used by opcodes and other parts of the VM. */ |
336 | /* The .code_sub section should be last to help static branch prediction. */ | 370 | /* The .code_sub section should be last to help static branch prediction. */ |
337 | static void build_subroutines(BuildCtx *ctx, int cmov) | 371 | static void build_subroutines(BuildCtx *ctx, int cmov, int sse) |
338 | { | 372 | { |
339 | |.code_sub | 373 | |.code_sub |
340 | | | 374 | | |
@@ -2454,21 +2488,51 @@ static void build_subroutines(BuildCtx *ctx, int cmov) | |||
2454 | | vm_round 0x0c00, 0xffff | 2488 | | vm_round 0x0c00, 0xffff |
2455 | | | 2489 | | |
2456 | |// FP modulo x%y. Called by BC_MOD* and vm_arith. | 2490 | |// FP modulo x%y. Called by BC_MOD* and vm_arith. |
2457 | |// Args/ret on x87 stack (y on top). No xmm registers modified. | ||
2458 | |// Caveat: needs 3 slots on x87 stack! RC (eax) modified! | ||
2459 | |->vm_mod: | 2491 | |->vm_mod: |
2460 | | fld st1 | 2492 | if (sse) { |
2461 | | fdiv st1 | 2493 | |// Args in xmm0/xmm1, return value in xmm0. |
2462 | | fnstcw word [esp+4] | 2494 | |// Caveat: xmm0-xmm5 and RC (eax) modified! |
2463 | | mov ax, 0x0400 | 2495 | | movaps xmm5, xmm0 |
2464 | | or ax, [esp+4] | 2496 | | divsd xmm0, xmm1 |
2465 | | and ax, 0xf7ff | 2497 | | sseconst_abs xmm2, RDa |
2466 | | mov [esp+6], ax | 2498 | | sseconst_2p52 xmm3, RDa |
2467 | | fldcw word [esp+6] | 2499 | | movaps xmm4, xmm0 |
2468 | | frndint | 2500 | | andpd xmm4, xmm2 // |x/y| |
2469 | | fldcw word [esp+4] | 2501 | | ucomisd xmm3, xmm4 // No truncation if 2^52 <= |x/y|. |
2470 | | fmulp st1 | 2502 | | jbe >1 |
2471 | | fsubp st1 | 2503 | | andnpd xmm2, xmm0 // Isolate sign bit. |
2504 | | addsd xmm4, xmm3 // (|x/y| + 2^52) - 2^52 | ||
2505 | | subsd xmm4, xmm3 | ||
2506 | | orpd xmm4, xmm2 // Merge sign bit back in. | ||
2507 | | sseconst_1 xmm2, RDa | ||
2508 | | cmpsd xmm0, xmm4, 1 // x/y < result? | ||
2509 | | andpd xmm0, xmm2 | ||
2510 | | subsd xmm4, xmm0 // If yes, subtract 1.0. | ||
2511 | | movaps xmm0, xmm5 | ||
2512 | | mulsd xmm1, xmm4 | ||
2513 | | subsd xmm0, xmm1 | ||
2514 | | ret | ||
2515 | |1: | ||
2516 | | mulsd xmm1, xmm0 | ||
2517 | | movaps xmm0, xmm5 | ||
2518 | | subsd xmm0, xmm1 | ||
2519 | | ret | ||
2520 | } else { | ||
2521 | |// Args/ret on x87 stack (y on top). No xmm registers modified. | ||
2522 | |// Caveat: needs 3 slots on x87 stack! RC (eax) modified! | ||
2523 | | fld st1 | ||
2524 | | fdiv st1 | ||
2525 | | fnstcw word [esp+4] | ||
2526 | | mov ax, 0x0400 | ||
2527 | | or ax, [esp+4] | ||
2528 | | and ax, 0xf7ff | ||
2529 | | mov [esp+6], ax | ||
2530 | | fldcw word [esp+6] | ||
2531 | | frndint | ||
2532 | | fldcw word [esp+4] | ||
2533 | | fmulp st1 | ||
2534 | | fsubp st1 | ||
2535 | } | ||
2472 | | ret | 2536 | | ret |
2473 | | | 2537 | | |
2474 | |// FP exponentiation e^x and 2^x. Called by math.exp fast function and | 2538 | |// FP exponentiation e^x and 2^x. Called by math.exp fast function and |
@@ -2619,31 +2683,100 @@ static void build_subroutines(BuildCtx *ctx, int cmov) | |||
2619 | |// Compute x op y for basic arithmetic operators (+ - * / % ^ and unary -) | 2683 | |// Compute x op y for basic arithmetic operators (+ - * / % ^ and unary -) |
2620 | |// and basic math functions. ORDER ARITH | 2684 | |// and basic math functions. ORDER ARITH |
2621 | |->vm_foldarith: | 2685 | |->vm_foldarith: |
2622 | | mov eax, [esp+20] | 2686 | if (sse) { |
2623 | | fld qword [esp+4] | 2687 | |.macro retxmm0; .if X64; ret; .else; jmp >7; .endif; .endmacro |
2624 | | fld qword [esp+12] | 2688 | |.macro retst0; .if X64; jmp >7; .else; ret; .endif; .endmacro |
2625 | | cmp eax, 1; je >1; ja >2 | 2689 | | |
2626 | | faddp st1; ret | 2690 | |.if X64WIN |
2627 | |1: ; fsubp st1; ret | 2691 | | .define foldop, CARG3d |
2628 | |2: ; cmp eax, 3; je >1; ja >2 | 2692 | |.elif X64 |
2629 | | fmulp st1; ret | 2693 | | .define foldop, CARG1d |
2630 | |1: ; fdivp st1; ret | 2694 | |.else |
2631 | |2: ; cmp eax, 5; jb ->vm_mod; je ->vm_pow | 2695 | | .define foldop, eax |
2632 | | cmp eax, 7; je >1; ja >2 | 2696 | | mov foldop, [esp+20] |
2633 | | fpop; fchs; ret | 2697 | | movsd xmm0, qword [esp+4] |
2634 | |1: ; fpop; fabs; ret | 2698 | | movsd xmm1, qword [esp+12] |
2635 | |2: ; cmp eax, 9; je >1; ja >2 | 2699 | |.endif |
2636 | | fpatan; ret | 2700 | | cmp foldop, 1; je >1; ja >2 |
2637 | |1: ; fxch; fscale; fpop1; ret | 2701 | | addsd xmm0, xmm1; retxmm0 |
2638 | |2: ; cmp eax, 11; je >1; ja >9 | 2702 | |1: ; subsd xmm0, xmm1; retxmm0 |
2639 | ||if (cmov) { | 2703 | |2: ; cmp foldop, 3; je >1; ja >2 |
2640 | | fucomi st1; fcmovnbe st1; fpop1; ret | 2704 | | mulsd xmm0, xmm1; retxmm0 |
2641 | |1: ; fucomi st1; fcmovbe st1; fpop1; ret | 2705 | |1: ; divsd xmm0, xmm1; retxmm0 |
2642 | ||} else { | 2706 | |2: ; cmp foldop, 5 |
2643 | | fucom st1; fnstsw ax; test ah, 1; jz >2; fxch; 2: ; fpop; ret | 2707 | |.if X64 |
2644 | |1: ; fucom st1; fnstsw ax; test ah, 1; jnz >2; fxch; 2: ; fpop; ret | 2708 | | jb ->vm_mod; je ->vm_pow // NYI: broken without SSE vm_pow. |
2645 | ||} | 2709 | |.else |
2646 | |9: ; int3 // Bad op. | 2710 | | je >1; ja >2 |
2711 | | call ->vm_mod; retxmm0 | ||
2712 | |1: ; fld qword [esp+4]; fld qword [esp+12]; jmp ->vm_pow // NYI | ||
2713 | |2: | ||
2714 | |.endif | ||
2715 | | cmp foldop, 7; je >1; ja >2 | ||
2716 | | sseconst_sign xmm1, RDa; xorps xmm0, xmm1; retxmm0 | ||
2717 | |1: | ||
2718 | | sseconst_abs xmm1, RDa; andps xmm0, xmm1; retxmm0 | ||
2719 | |2: ; cmp foldop, 9; ja >2 | ||
2720 | |.if X64WIN | ||
2721 | | movsd qword [esp+8], xmm0 // Use scratch area. | ||
2722 | | movsd qword [esp+16], xmm1 | ||
2723 | | fld qword [esp+8] | ||
2724 | | fld qword [esp+16] | ||
2725 | |.elif X64 | ||
2726 | | movsd qword [esp-8], xmm0 // Use red zone. | ||
2727 | | movsd qword [esp-16], xmm1 | ||
2728 | | fld qword [esp-8] | ||
2729 | | fld qword [esp-16] | ||
2730 | |.else | ||
2731 | | fld qword [esp+4] // Reload from stack | ||
2732 | | fld qword [esp+12] | ||
2733 | |.endif | ||
2734 | | je >1 | ||
2735 | | fpatan; retst0 | ||
2736 | |1: ; fxch; fscale; fpop1; retst0 | ||
2737 | |2: ; cmp foldop, 11; je >1; ja >9 | ||
2738 | | minsd xmm0, xmm1; retxmm0 | ||
2739 | |1: ; maxsd xmm0, xmm1; retxmm0 | ||
2740 | |9: ; int3 // Bad op. | ||
2741 | |7: // Move return value depending on calling convention. | ||
2742 | |.if X64WIN | ||
2743 | | fstp qword [esp+8] // Use scratch area. | ||
2744 | | movsd xmm0, qword [esp+8] | ||
2745 | |.elif X64 | ||
2746 | | fstp qword [esp-8] // Use red zone. | ||
2747 | | movsd xmm0, qword [esp-8] | ||
2748 | |.else | ||
2749 | | movsd qword [esp+4], xmm0 // Overwrite callee-owned args. | ||
2750 | | fld qword [esp+4] | ||
2751 | |.endif | ||
2752 | | ret | ||
2753 | } else { | ||
2754 | | mov eax, [esp+20] | ||
2755 | | fld qword [esp+4] | ||
2756 | | fld qword [esp+12] | ||
2757 | | cmp eax, 1; je >1; ja >2 | ||
2758 | | faddp st1; ret | ||
2759 | |1: ; fsubp st1; ret | ||
2760 | |2: ; cmp eax, 3; je >1; ja >2 | ||
2761 | | fmulp st1; ret | ||
2762 | |1: ; fdivp st1; ret | ||
2763 | |2: ; cmp eax, 5; jb ->vm_mod; je ->vm_pow | ||
2764 | | cmp eax, 7; je >1; ja >2 | ||
2765 | | fpop; fchs; ret | ||
2766 | |1: ; fpop; fabs; ret | ||
2767 | |2: ; cmp eax, 9; je >1; ja >2 | ||
2768 | | fpatan; ret | ||
2769 | |1: ; fxch; fscale; fpop1; ret | ||
2770 | |2: ; cmp eax, 11; je >1; ja >9 | ||
2771 | ||if (cmov) { | ||
2772 | | fucomi st1; fcmovnbe st1; fpop1; ret | ||
2773 | |1: ; fucomi st1; fcmovbe st1; fpop1; ret | ||
2774 | ||} else { | ||
2775 | | fucom st1; fnstsw ax; test ah, 1; jz >2; fxch; 2: ; fpop; ret | ||
2776 | |1: ; fucom st1; fnstsw ax; test ah, 1; jnz >2; fxch; 2: ; fpop; ret | ||
2777 | ||} | ||
2778 | |9: ; int3 // Bad op. | ||
2779 | } | ||
2647 | | | 2780 | | |
2648 | |//----------------------------------------------------------------------- | 2781 | |//----------------------------------------------------------------------- |
2649 | |//-- Miscellaneous functions -------------------------------------------- | 2782 | |//-- Miscellaneous functions -------------------------------------------- |
@@ -2694,7 +2827,7 @@ static void build_subroutines(BuildCtx *ctx, int cmov) | |||
2694 | } | 2827 | } |
2695 | 2828 | ||
2696 | /* Generate the code for a single instruction. */ | 2829 | /* Generate the code for a single instruction. */ |
2697 | static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov) | 2830 | static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov, int sse) |
2698 | { | 2831 | { |
2699 | int vk = 0; | 2832 | int vk = 0; |
2700 | |// Note: aligning all instructions does not pay off. | 2833 | |// Note: aligning all instructions does not pay off. |
@@ -2711,10 +2844,16 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov) | |||
2711 | | ins_AD | 2844 | | ins_AD |
2712 | | checknum RA, ->vmeta_comp | 2845 | | checknum RA, ->vmeta_comp |
2713 | | checknum RD, ->vmeta_comp | 2846 | | checknum RD, ->vmeta_comp |
2714 | | fld qword [BASE+RA*8] // Reverse order, i.e like cmp D, A. | 2847 | if (sse) { |
2715 | | fld qword [BASE+RD*8] | 2848 | | movsd xmm0, qword [BASE+RD*8] |
2716 | | add PC, 4 | 2849 | | add PC, 4 |
2717 | | fcomparepp // eax (RD) modified! | 2850 | | ucomisd xmm0, qword [BASE+RA*8] |
2851 | } else { | ||
2852 | | fld qword [BASE+RA*8] // Reverse order, i.e like cmp D, A. | ||
2853 | | fld qword [BASE+RD*8] | ||
2854 | | add PC, 4 | ||
2855 | | fcomparepp // eax (RD) modified! | ||
2856 | } | ||
2718 | | // Unordered: all of ZF CF PF set, ordered: PF clear. | 2857 | | // Unordered: all of ZF CF PF set, ordered: PF clear. |
2719 | | // To preserve NaN semantics GE/GT branch on unordered, but LT/LE don't. | 2858 | | // To preserve NaN semantics GE/GT branch on unordered, but LT/LE don't. |
2720 | switch (op) { | 2859 | switch (op) { |
@@ -2746,9 +2885,14 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov) | |||
2746 | | add PC, 4 | 2885 | | add PC, 4 |
2747 | | cmp RB, LJ_TISNUM; ja >5 | 2886 | | cmp RB, LJ_TISNUM; ja >5 |
2748 | | checknum RA, >5 | 2887 | | checknum RA, >5 |
2749 | | fld qword [BASE+RA*8] | 2888 | if (sse) { |
2750 | | fld qword [BASE+RD*8] | 2889 | | movsd xmm0, qword [BASE+RD*8] |
2751 | | fcomparepp // eax (RD) modified! | 2890 | | ucomisd xmm0, qword [BASE+RA*8] |
2891 | } else { | ||
2892 | | fld qword [BASE+RA*8] | ||
2893 | | fld qword [BASE+RD*8] | ||
2894 | | fcomparepp // eax (RD) modified! | ||
2895 | } | ||
2752 | iseqne_fp: | 2896 | iseqne_fp: |
2753 | if (vk) { | 2897 | if (vk) { |
2754 | | jp >2 // Unordered means not equal. | 2898 | | jp >2 // Unordered means not equal. |
@@ -2820,9 +2964,14 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov) | |||
2820 | | ins_AD // RA = src, RD = num const, JMP with RD = target | 2964 | | ins_AD // RA = src, RD = num const, JMP with RD = target |
2821 | | add PC, 4 | 2965 | | add PC, 4 |
2822 | | checknum RA, >2 | 2966 | | checknum RA, >2 |
2823 | | fld qword [BASE+RA*8] | 2967 | if (sse) { |
2824 | | fld qword [KBASE+RD*8] | 2968 | | movsd xmm0, qword [KBASE+RD*8] |
2825 | | fcomparepp // eax (RD) modified! | 2969 | | ucomisd xmm0, qword [BASE+RA*8] |
2970 | } else { | ||
2971 | | fld qword [BASE+RA*8] | ||
2972 | | fld qword [KBASE+RD*8] | ||
2973 | | fcomparepp // eax (RD) modified! | ||
2974 | } | ||
2826 | goto iseqne_fp; | 2975 | goto iseqne_fp; |
2827 | case BC_ISEQP: case BC_ISNEP: | 2976 | case BC_ISEQP: case BC_ISNEP: |
2828 | vk = op == BC_ISEQP; | 2977 | vk = op == BC_ISEQP; |
@@ -2875,18 +3024,32 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov) | |||
2875 | case BC_UNM: | 3024 | case BC_UNM: |
2876 | | ins_AD // RA = dst, RD = src | 3025 | | ins_AD // RA = dst, RD = src |
2877 | | checknum RD, ->vmeta_unm | 3026 | | checknum RD, ->vmeta_unm |
2878 | | fld qword [BASE+RD*8] | 3027 | if (sse) { |
2879 | | fchs | 3028 | | movsd xmm0, qword [BASE+RD*8] |
2880 | | fstp qword [BASE+RA*8] | 3029 | | sseconst_sign xmm1, RDa |
3030 | | xorps xmm0, xmm1 | ||
3031 | | movsd qword [BASE+RA*8], xmm0 | ||
3032 | } else { | ||
3033 | | fld qword [BASE+RD*8] | ||
3034 | | fchs | ||
3035 | | fstp qword [BASE+RA*8] | ||
3036 | } | ||
2881 | | ins_next | 3037 | | ins_next |
2882 | break; | 3038 | break; |
2883 | case BC_LEN: | 3039 | case BC_LEN: |
2884 | | ins_AD // RA = dst, RD = src | 3040 | | ins_AD // RA = dst, RD = src |
2885 | | checkstr RD, >2 | 3041 | | checkstr RD, >2 |
2886 | | mov STR:RD, [BASE+RD*8] | 3042 | | mov STR:RD, [BASE+RD*8] |
2887 | | fild dword STR:RD->len | 3043 | if (sse) { |
2888 | |1: | 3044 | | xorps xmm0, xmm0 |
2889 | | fstp qword [BASE+RA*8] | 3045 | | cvtsi2sd xmm0, dword STR:RD->len |
3046 | |1: | ||
3047 | | movsd qword [BASE+RA*8], xmm0 | ||
3048 | } else { | ||
3049 | | fild dword STR:RD->len | ||
3050 | |1: | ||
3051 | | fstp qword [BASE+RA*8] | ||
3052 | } | ||
2890 | | ins_next | 3053 | | ins_next |
2891 | |2: | 3054 | |2: |
2892 | | checktab RD, ->vmeta_len | 3055 | | checktab RD, ->vmeta_len |
@@ -2894,72 +3057,108 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov) | |||
2894 | | mov RB, BASE // Save BASE. | 3057 | | mov RB, BASE // Save BASE. |
2895 | | call extern lj_tab_len@4 // (GCtab *t) | 3058 | | call extern lj_tab_len@4 // (GCtab *t) |
2896 | | // Length of table returned in eax (RC). | 3059 | | // Length of table returned in eax (RC). |
2897 | | mov ARG1, RC | 3060 | if (sse) { |
2898 | | mov BASE, RB // Restore BASE. | 3061 | | cvtsi2sd xmm0, RC |
2899 | | fild ARG1 | 3062 | | mov BASE, RB // Restore BASE. |
3063 | } else { | ||
3064 | | mov ARG1, RC | ||
3065 | | mov BASE, RB // Restore BASE. | ||
3066 | | fild ARG1 | ||
3067 | } | ||
2900 | | movzx RA, PC_RA | 3068 | | movzx RA, PC_RA |
2901 | | jmp <1 | 3069 | | jmp <1 |
2902 | break; | 3070 | break; |
2903 | 3071 | ||
2904 | /* -- Binary ops -------------------------------------------------------- */ | 3072 | /* -- Binary ops -------------------------------------------------------- */ |
2905 | 3073 | ||
2906 | |.macro ins_arithpre, ins | 3074 | |.macro ins_arithpre, ins, sseins, ssereg |
2907 | | ins_ABC | 3075 | | ins_ABC |
2908 | ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN); | 3076 | ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN); |
2909 | ||switch (vk) { | 3077 | ||switch (vk) { |
2910 | ||case 0: | 3078 | ||case 0: |
2911 | | checknum RB, ->vmeta_arith_vn | 3079 | | checknum RB, ->vmeta_arith_vn |
3080 | ||if (sse) { | ||
3081 | | movsd xmm0, qword [BASE+RB*8] | ||
3082 | | sseins ssereg, qword [KBASE+RC*8] | ||
3083 | ||} else { | ||
2912 | | fld qword [BASE+RB*8] | 3084 | | fld qword [BASE+RB*8] |
2913 | | ins qword [KBASE+RC*8] | 3085 | | ins qword [KBASE+RC*8] |
3086 | ||} | ||
2914 | || break; | 3087 | || break; |
2915 | ||case 1: | 3088 | ||case 1: |
2916 | | checknum RB, ->vmeta_arith_nv | 3089 | | checknum RB, ->vmeta_arith_nv |
3090 | ||if (sse) { | ||
3091 | | movsd xmm0, qword [KBASE+RC*8] | ||
3092 | | sseins ssereg, qword [BASE+RB*8] | ||
3093 | ||} else { | ||
2917 | | fld qword [KBASE+RC*8] | 3094 | | fld qword [KBASE+RC*8] |
2918 | | ins qword [BASE+RB*8] | 3095 | | ins qword [BASE+RB*8] |
3096 | ||} | ||
2919 | || break; | 3097 | || break; |
2920 | ||default: | 3098 | ||default: |
2921 | | checknum RB, ->vmeta_arith_vv | 3099 | | checknum RB, ->vmeta_arith_vv |
2922 | | checknum RC, ->vmeta_arith_vv | 3100 | | checknum RC, ->vmeta_arith_vv |
3101 | ||if (sse) { | ||
3102 | | movsd xmm0, qword [BASE+RB*8] | ||
3103 | | sseins ssereg, qword [BASE+RC*8] | ||
3104 | ||} else { | ||
2923 | | fld qword [BASE+RB*8] | 3105 | | fld qword [BASE+RB*8] |
2924 | | ins qword [BASE+RC*8] | 3106 | | ins qword [BASE+RC*8] |
3107 | ||} | ||
2925 | || break; | 3108 | || break; |
2926 | ||} | 3109 | ||} |
2927 | |.endmacro | 3110 | |.endmacro |
2928 | | | 3111 | | |
2929 | |.macro ins_arith, ins | 3112 | |.macro ins_arithpost |
2930 | | ins_arithpre ins | 3113 | ||if (sse) { |
3114 | | movsd qword [BASE+RA*8], xmm0 | ||
3115 | ||} else { | ||
2931 | | fstp qword [BASE+RA*8] | 3116 | | fstp qword [BASE+RA*8] |
3117 | ||} | ||
3118 | |.endmacro | ||
3119 | | | ||
3120 | |.macro ins_arith, ins, sseins | ||
3121 | | ins_arithpre ins, sseins, xmm0 | ||
3122 | | ins_arithpost | ||
2932 | | ins_next | 3123 | | ins_next |
2933 | |.endmacro | 3124 | |.endmacro |
2934 | 3125 | ||
2935 | | // RA = dst, RB = src1 or num const, RC = src2 or num const | 3126 | | // RA = dst, RB = src1 or num const, RC = src2 or num const |
2936 | case BC_ADDVN: case BC_ADDNV: case BC_ADDVV: | 3127 | case BC_ADDVN: case BC_ADDNV: case BC_ADDVV: |
2937 | | ins_arith fadd | 3128 | | ins_arith fadd, addsd |
2938 | break; | 3129 | break; |
2939 | case BC_SUBVN: case BC_SUBNV: case BC_SUBVV: | 3130 | case BC_SUBVN: case BC_SUBNV: case BC_SUBVV: |
2940 | | ins_arith fsub | 3131 | | ins_arith fsub, subsd |
2941 | break; | 3132 | break; |
2942 | case BC_MULVN: case BC_MULNV: case BC_MULVV: | 3133 | case BC_MULVN: case BC_MULNV: case BC_MULVV: |
2943 | | ins_arith fmul | 3134 | | ins_arith fmul, mulsd |
2944 | break; | 3135 | break; |
2945 | case BC_DIVVN: case BC_DIVNV: case BC_DIVVV: | 3136 | case BC_DIVVN: case BC_DIVNV: case BC_DIVVV: |
2946 | | ins_arith fdiv | 3137 | | ins_arith fdiv, divsd |
2947 | break; | 3138 | break; |
2948 | case BC_MODVN: | 3139 | case BC_MODVN: |
2949 | | ins_arithpre fld | 3140 | | ins_arithpre fld, movsd, xmm1 |
2950 | |->BC_MODVN_Z: | 3141 | |->BC_MODVN_Z: |
2951 | | call ->vm_mod | 3142 | | call ->vm_mod |
2952 | | fstp qword [BASE+RA*8] | 3143 | | ins_arithpost |
2953 | | ins_next | 3144 | | ins_next |
2954 | break; | 3145 | break; |
2955 | case BC_MODNV: case BC_MODVV: | 3146 | case BC_MODNV: case BC_MODVV: |
2956 | | ins_arithpre fld | 3147 | | ins_arithpre fld, movsd, xmm1 |
2957 | | jmp ->BC_MODVN_Z // Avoid 3 copies. It's slow anyway. | 3148 | | jmp ->BC_MODVN_Z // Avoid 3 copies. It's slow anyway. |
2958 | break; | 3149 | break; |
2959 | case BC_POW: | 3150 | case BC_POW: |
2960 | | ins_arithpre fld | 3151 | if (sse) { |
2961 | | call ->vm_pow | 3152 | sse = 0; /* NYI: temporary workaround. */ |
2962 | | fstp qword [BASE+RA*8] | 3153 | | ins_arithpre fld, movsd, xmm1 |
3154 | | call ->vm_pow | ||
3155 | | ins_arithpost | ||
3156 | sse = 1; | ||
3157 | } else { | ||
3158 | | ins_arithpre fld, movsd, xmm1 | ||
3159 | | call ->vm_pow | ||
3160 | | ins_arithpost | ||
3161 | } | ||
2963 | | ins_next | 3162 | | ins_next |
2964 | break; | 3163 | break; |
2965 | 3164 | ||
@@ -3945,17 +4144,21 @@ static int build_backend(BuildCtx *ctx) | |||
3945 | { | 4144 | { |
3946 | int op; | 4145 | int op; |
3947 | int cmov = 1; | 4146 | int cmov = 1; |
4147 | int sse = 0; | ||
3948 | #ifdef LUAJIT_CPU_NOCMOV | 4148 | #ifdef LUAJIT_CPU_NOCMOV |
3949 | cmov = 0; | 4149 | cmov = 0; |
3950 | #endif | 4150 | #endif |
4151 | #ifdef LUAJIT_CPU_SSE2 | ||
4152 | sse = 1; | ||
4153 | #endif | ||
3951 | 4154 | ||
3952 | dasm_growpc(Dst, BC__MAX); | 4155 | dasm_growpc(Dst, BC__MAX); |
3953 | 4156 | ||
3954 | build_subroutines(ctx, cmov); | 4157 | build_subroutines(ctx, cmov, sse); |
3955 | 4158 | ||
3956 | |.code_op | 4159 | |.code_op |
3957 | for (op = 0; op < BC__MAX; op++) | 4160 | for (op = 0; op < BC__MAX; op++) |
3958 | build_ins(ctx, (BCOp)op, op, cmov); | 4161 | build_ins(ctx, (BCOp)op, op, cmov, sse); |
3959 | 4162 | ||
3960 | return BC__MAX; | 4163 | return BC__MAX; |
3961 | } | 4164 | } |