summaryrefslogtreecommitdiff
path: root/src/buildvm_x86.dasc
diff options
context:
space:
mode:
Diffstat (limited to 'src/buildvm_x86.dasc')
-rw-r--r--src/buildvm_x86.dasc353
1 files changed, 278 insertions, 75 deletions
diff --git a/src/buildvm_x86.dasc b/src/buildvm_x86.dasc
index 960afa1d..b220c58f 100644
--- a/src/buildvm_x86.dasc
+++ b/src/buildvm_x86.dasc
@@ -322,6 +322,40 @@
322|.macro fdup; fld st0; .endmacro 322|.macro fdup; fld st0; .endmacro
323|.macro fpop1; fstp st1; .endmacro 323|.macro fpop1; fstp st1; .endmacro
324| 324|
325|// Synthesize SSE FP constants.
326|.macro sseconst_sign, reg, tmp // Synthesize sign mask (0x8000000000000000) in the low qword of reg. Overwrites tmp.
327|.if X64
328| mov64 tmp, U64x(80000000,00000000); movd reg, tmp
329|.else
330| mov tmp, 0x80000000; movd reg, tmp; pshufd reg, reg, 0x51 // Load via reg (not a fixed xmm1), then shuffle dword 0 up to dword 1.
331|.endif
332|.endmacro
333|
334|.macro sseconst_abs, reg, tmp // Synthesize abs mask (0x7fffffffffffffff: sign bit clear, all other bits set) in the low qword of reg.
335|.if X64
336| mov64 tmp, U64x(7fffffff,ffffffff); movd reg, tmp // Overwrites tmp.
337|.else
338| pxor reg, reg; pcmpeqd reg, reg; psrlq reg, 1 // All-ones, then shift each qword right by one bit. tmp is not used here.
339|.endif
340|.endmacro
341|
342|.macro sseconst_1, reg, tmp // Synthesize 1.0 (double bit pattern 0x3ff0000000000000) in the low qword of reg. Overwrites tmp.
343|.if X64
344| mov64 tmp, U64x(3ff00000,00000000)
345| movd reg, tmp
346|.else
347| mov tmp, 0x3ff00000; movd reg, tmp; pshufd reg, reg, 0x51 // Load the high dword via tmp, then shuffle it from dword 0 up to dword 1.
348|.endif
349|.endmacro
350|
351|.macro sseconst_2p52, reg, tmp // Synthesize 2^52 (double bit pattern 0x4330000000000000) in the low qword of reg. Overwrites tmp.
352|.if X64
353| mov64 tmp, U64x(43300000,00000000); movd reg, tmp
354|.else
355| mov tmp, 0x43300000; movd reg, tmp; pshufd reg, reg, 0x51 // Load the high dword via tmp, then shuffle it from dword 0 up to dword 1.
356|.endif
357|.endmacro
358|
325|// Move table write barrier back. Overwrites reg. 359|// Move table write barrier back. Overwrites reg.
326|.macro barrierback, tab, reg 360|.macro barrierback, tab, reg
327| and byte tab->marked, cast_byte(~LJ_GC_BLACK) // black2gray(tab) 361| and byte tab->marked, cast_byte(~LJ_GC_BLACK) // black2gray(tab)
@@ -334,7 +368,7 @@
334 368
335/* Generate subroutines used by opcodes and other parts of the VM. */ 369/* Generate subroutines used by opcodes and other parts of the VM. */
336/* The .code_sub section should be last to help static branch prediction. */ 370/* The .code_sub section should be last to help static branch prediction. */
337static void build_subroutines(BuildCtx *ctx, int cmov) 371static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
338{ 372{
339 |.code_sub 373 |.code_sub
340 | 374 |
@@ -2454,21 +2488,51 @@ static void build_subroutines(BuildCtx *ctx, int cmov)
2454 | vm_round 0x0c00, 0xffff 2488 | vm_round 0x0c00, 0xffff
2455 | 2489 |
2456 |// FP modulo x%y. Called by BC_MOD* and vm_arith. 2490 |// FP modulo x%y. Called by BC_MOD* and vm_arith.
2457 |// Args/ret on x87 stack (y on top). No xmm registers modified.
2458 |// Caveat: needs 3 slots on x87 stack! RC (eax) modified!
2459 |->vm_mod: 2491 |->vm_mod:
2460 | fld st1 2492 if (sse) {
2461 | fdiv st1 2493 |// Args in xmm0/xmm1, return value in xmm0.
2462 | fnstcw word [esp+4] 2494 |// Caveat: xmm0-xmm5 and RC (eax) modified!
2463 | mov ax, 0x0400 2495 | movaps xmm5, xmm0
2464 | or ax, [esp+4] 2496 | divsd xmm0, xmm1
2465 | and ax, 0xf7ff 2497 | sseconst_abs xmm2, RDa
2466 | mov [esp+6], ax 2498 | sseconst_2p52 xmm3, RDa
2467 | fldcw word [esp+6] 2499 | movaps xmm4, xmm0
2468 | frndint 2500 | andpd xmm4, xmm2 // |x/y|
2469 | fldcw word [esp+4] 2501 | ucomisd xmm3, xmm4 // No truncation if 2^52 <= |x/y|.
2470 | fmulp st1 2502 | jbe >1
2471 | fsubp st1 2503 | andnpd xmm2, xmm0 // Isolate sign bit.
2504 | addsd xmm4, xmm3 // (|x/y| + 2^52) - 2^52
2505 | subsd xmm4, xmm3
2506 | orpd xmm4, xmm2 // Merge sign bit back in.
2507 | sseconst_1 xmm2, RDa
2508 | cmpsd xmm0, xmm4, 1 // x/y < result?
2509 | andpd xmm0, xmm2
2510 | subsd xmm4, xmm0 // If yes, subtract 1.0.
2511 | movaps xmm0, xmm5
2512 | mulsd xmm1, xmm4
2513 | subsd xmm0, xmm1
2514 | ret
2515 |1:
2516 | mulsd xmm1, xmm0
2517 | movaps xmm0, xmm5
2518 | subsd xmm0, xmm1
2519 | ret
2520 } else {
2521 |// Args/ret on x87 stack (y on top). No xmm registers modified.
2522 |// Caveat: needs 3 slots on x87 stack! RC (eax) modified!
2523 | fld st1
2524 | fdiv st1
2525 | fnstcw word [esp+4]
2526 | mov ax, 0x0400
2527 | or ax, [esp+4]
2528 | and ax, 0xf7ff
2529 | mov [esp+6], ax
2530 | fldcw word [esp+6]
2531 | frndint
2532 | fldcw word [esp+4]
2533 | fmulp st1
2534 | fsubp st1
2535 }
2472 | ret 2536 | ret
2473 | 2537 |
2474 |// FP exponentiation e^x and 2^x. Called by math.exp fast function and 2538 |// FP exponentiation e^x and 2^x. Called by math.exp fast function and
@@ -2619,31 +2683,100 @@ static void build_subroutines(BuildCtx *ctx, int cmov)
2619 |// Compute x op y for basic arithmetic operators (+ - * / % ^ and unary -) 2683 |// Compute x op y for basic arithmetic operators (+ - * / % ^ and unary -)
2620 |// and basic math functions. ORDER ARITH 2684 |// and basic math functions. ORDER ARITH
2621 |->vm_foldarith: 2685 |->vm_foldarith:
2622 | mov eax, [esp+20] 2686 if (sse) {
2623 | fld qword [esp+4] 2687 |.macro retxmm0; .if X64; ret; .else; jmp >7; .endif; .endmacro
2624 | fld qword [esp+12] 2688 |.macro retst0; .if X64; jmp >7; .else; ret; .endif; .endmacro
2625 | cmp eax, 1; je >1; ja >2 2689 |
2626 | faddp st1; ret 2690 |.if X64WIN
2627 |1: ; fsubp st1; ret 2691 | .define foldop, CARG3d
2628 |2: ; cmp eax, 3; je >1; ja >2 2692 |.elif X64
2629 | fmulp st1; ret 2693 | .define foldop, CARG1d
2630 |1: ; fdivp st1; ret 2694 |.else
2631 |2: ; cmp eax, 5; jb ->vm_mod; je ->vm_pow 2695 | .define foldop, eax
2632 | cmp eax, 7; je >1; ja >2 2696 | mov foldop, [esp+20]
2633 | fpop; fchs; ret 2697 | movsd xmm0, qword [esp+4]
2634 |1: ; fpop; fabs; ret 2698 | movsd xmm1, qword [esp+12]
2635 |2: ; cmp eax, 9; je >1; ja >2 2699 |.endif
2636 | fpatan; ret 2700 | cmp foldop, 1; je >1; ja >2
2637 |1: ; fxch; fscale; fpop1; ret 2701 | addsd xmm0, xmm1; retxmm0
2638 |2: ; cmp eax, 11; je >1; ja >9 2702 |1: ; subsd xmm0, xmm1; retxmm0
2639 ||if (cmov) { 2703 |2: ; cmp foldop, 3; je >1; ja >2
2640 | fucomi st1; fcmovnbe st1; fpop1; ret 2704 | mulsd xmm0, xmm1; retxmm0
2641 |1: ; fucomi st1; fcmovbe st1; fpop1; ret 2705 |1: ; divsd xmm0, xmm1; retxmm0
2642 ||} else { 2706 |2: ; cmp foldop, 5
2643 | fucom st1; fnstsw ax; test ah, 1; jz >2; fxch; 2: ; fpop; ret 2707 |.if X64
2644 |1: ; fucom st1; fnstsw ax; test ah, 1; jnz >2; fxch; 2: ; fpop; ret 2708 | jb ->vm_mod; je ->vm_pow // NYI: broken without SSE vm_pow.
2645 ||} 2709 |.else
2646 |9: ; int3 // Bad op. 2710 | je >1; ja >2
2711 | call ->vm_mod; retxmm0
2712 |1: ; fld qword [esp+4]; fld qword [esp+12]; jmp ->vm_pow // NYI
2713 |2:
2714 |.endif
2715 | cmp foldop, 7; je >1; ja >2
2716 | sseconst_sign xmm1, RDa; xorps xmm0, xmm1; retxmm0
2717 |1:
2718 | sseconst_abs xmm1, RDa; andps xmm0, xmm1; retxmm0
2719 |2: ; cmp foldop, 9; ja >2
2720 |.if X64WIN
2721 | movsd qword [esp+8], xmm0 // Use scratch area.
2722 | movsd qword [esp+16], xmm1
2723 | fld qword [esp+8]
2724 | fld qword [esp+16]
2725 |.elif X64
2726 | movsd qword [esp-8], xmm0 // Use red zone.
2727 | movsd qword [esp-16], xmm1
2728 | fld qword [esp-8]
2729 | fld qword [esp-16]
2730 |.else
2731 | fld qword [esp+4] // Reload from stack
2732 | fld qword [esp+12]
2733 |.endif
2734 | je >1
2735 | fpatan; retst0
2736 |1: ; fxch; fscale; fpop1; retst0
2737 |2: ; cmp foldop, 11; je >1; ja >9
2738 | minsd xmm0, xmm1; retxmm0
2739 |1: ; maxsd xmm0, xmm1; retxmm0
2740 |9: ; int3 // Bad op.
2741 |7: // Move return value depending on calling convention.
2742 |.if X64WIN
2743 | fstp qword [esp+8] // Use scratch area.
2744 | movsd xmm0, qword [esp+8]
2745 |.elif X64
2746 | fstp qword [esp-8] // Use red zone.
2747 | movsd xmm0, qword [esp-8]
2748 |.else
2749 | movsd qword [esp+4], xmm0 // Overwrite callee-owned args.
2750 | fld qword [esp+4]
2751 |.endif
2752 | ret
2753 } else {
2754 | mov eax, [esp+20]
2755 | fld qword [esp+4]
2756 | fld qword [esp+12]
2757 | cmp eax, 1; je >1; ja >2
2758 | faddp st1; ret
2759 |1: ; fsubp st1; ret
2760 |2: ; cmp eax, 3; je >1; ja >2
2761 | fmulp st1; ret
2762 |1: ; fdivp st1; ret
2763 |2: ; cmp eax, 5; jb ->vm_mod; je ->vm_pow
2764 | cmp eax, 7; je >1; ja >2
2765 | fpop; fchs; ret
2766 |1: ; fpop; fabs; ret
2767 |2: ; cmp eax, 9; je >1; ja >2
2768 | fpatan; ret
2769 |1: ; fxch; fscale; fpop1; ret
2770 |2: ; cmp eax, 11; je >1; ja >9
2771 ||if (cmov) {
2772 | fucomi st1; fcmovnbe st1; fpop1; ret
2773 |1: ; fucomi st1; fcmovbe st1; fpop1; ret
2774 ||} else {
2775 | fucom st1; fnstsw ax; test ah, 1; jz >2; fxch; 2: ; fpop; ret
2776 |1: ; fucom st1; fnstsw ax; test ah, 1; jnz >2; fxch; 2: ; fpop; ret
2777 ||}
2778 |9: ; int3 // Bad op.
2779 }
2647 | 2780 |
2648 |//----------------------------------------------------------------------- 2781 |//-----------------------------------------------------------------------
2649 |//-- Miscellaneous functions -------------------------------------------- 2782 |//-- Miscellaneous functions --------------------------------------------
@@ -2694,7 +2827,7 @@ static void build_subroutines(BuildCtx *ctx, int cmov)
2694} 2827}
2695 2828
2696/* Generate the code for a single instruction. */ 2829/* Generate the code for a single instruction. */
2697static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov) 2830static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov, int sse)
2698{ 2831{
2699 int vk = 0; 2832 int vk = 0;
2700 |// Note: aligning all instructions does not pay off. 2833 |// Note: aligning all instructions does not pay off.
@@ -2711,10 +2844,16 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov)
2711 | ins_AD 2844 | ins_AD
2712 | checknum RA, ->vmeta_comp 2845 | checknum RA, ->vmeta_comp
2713 | checknum RD, ->vmeta_comp 2846 | checknum RD, ->vmeta_comp
2714 | fld qword [BASE+RA*8] // Reverse order, i.e like cmp D, A. 2847 if (sse) {
2715 | fld qword [BASE+RD*8] 2848 | movsd xmm0, qword [BASE+RD*8]
2716 | add PC, 4 2849 | add PC, 4
2717 | fcomparepp // eax (RD) modified! 2850 | ucomisd xmm0, qword [BASE+RA*8]
2851 } else {
2852 | fld qword [BASE+RA*8] // Reverse order, i.e like cmp D, A.
2853 | fld qword [BASE+RD*8]
2854 | add PC, 4
2855 | fcomparepp // eax (RD) modified!
2856 }
2718 | // Unordered: all of ZF CF PF set, ordered: PF clear. 2857 | // Unordered: all of ZF CF PF set, ordered: PF clear.
2719 | // To preserve NaN semantics GE/GT branch on unordered, but LT/LE don't. 2858 | // To preserve NaN semantics GE/GT branch on unordered, but LT/LE don't.
2720 switch (op) { 2859 switch (op) {
@@ -2746,9 +2885,14 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov)
2746 | add PC, 4 2885 | add PC, 4
2747 | cmp RB, LJ_TISNUM; ja >5 2886 | cmp RB, LJ_TISNUM; ja >5
2748 | checknum RA, >5 2887 | checknum RA, >5
2749 | fld qword [BASE+RA*8] 2888 if (sse) {
2750 | fld qword [BASE+RD*8] 2889 | movsd xmm0, qword [BASE+RD*8]
2751 | fcomparepp // eax (RD) modified! 2890 | ucomisd xmm0, qword [BASE+RA*8]
2891 } else {
2892 | fld qword [BASE+RA*8]
2893 | fld qword [BASE+RD*8]
2894 | fcomparepp // eax (RD) modified!
2895 }
2752 iseqne_fp: 2896 iseqne_fp:
2753 if (vk) { 2897 if (vk) {
2754 | jp >2 // Unordered means not equal. 2898 | jp >2 // Unordered means not equal.
@@ -2820,9 +2964,14 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov)
2820 | ins_AD // RA = src, RD = num const, JMP with RD = target 2964 | ins_AD // RA = src, RD = num const, JMP with RD = target
2821 | add PC, 4 2965 | add PC, 4
2822 | checknum RA, >2 2966 | checknum RA, >2
2823 | fld qword [BASE+RA*8] 2967 if (sse) {
2824 | fld qword [KBASE+RD*8] 2968 | movsd xmm0, qword [KBASE+RD*8]
2825 | fcomparepp // eax (RD) modified! 2969 | ucomisd xmm0, qword [BASE+RA*8]
2970 } else {
2971 | fld qword [BASE+RA*8]
2972 | fld qword [KBASE+RD*8]
2973 | fcomparepp // eax (RD) modified!
2974 }
2826 goto iseqne_fp; 2975 goto iseqne_fp;
2827 case BC_ISEQP: case BC_ISNEP: 2976 case BC_ISEQP: case BC_ISNEP:
2828 vk = op == BC_ISEQP; 2977 vk = op == BC_ISEQP;
@@ -2875,18 +3024,32 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov)
2875 case BC_UNM: 3024 case BC_UNM:
2876 | ins_AD // RA = dst, RD = src 3025 | ins_AD // RA = dst, RD = src
2877 | checknum RD, ->vmeta_unm 3026 | checknum RD, ->vmeta_unm
2878 | fld qword [BASE+RD*8] 3027 if (sse) {
2879 | fchs 3028 | movsd xmm0, qword [BASE+RD*8]
2880 | fstp qword [BASE+RA*8] 3029 | sseconst_sign xmm1, RDa
3030 | xorps xmm0, xmm1
3031 | movsd qword [BASE+RA*8], xmm0
3032 } else {
3033 | fld qword [BASE+RD*8]
3034 | fchs
3035 | fstp qword [BASE+RA*8]
3036 }
2881 | ins_next 3037 | ins_next
2882 break; 3038 break;
2883 case BC_LEN: 3039 case BC_LEN:
2884 | ins_AD // RA = dst, RD = src 3040 | ins_AD // RA = dst, RD = src
2885 | checkstr RD, >2 3041 | checkstr RD, >2
2886 | mov STR:RD, [BASE+RD*8] 3042 | mov STR:RD, [BASE+RD*8]
2887 | fild dword STR:RD->len 3043 if (sse) {
2888 |1: 3044 | xorps xmm0, xmm0
2889 | fstp qword [BASE+RA*8] 3045 | cvtsi2sd xmm0, dword STR:RD->len
3046 |1:
3047 | movsd qword [BASE+RA*8], xmm0
3048 } else {
3049 | fild dword STR:RD->len
3050 |1:
3051 | fstp qword [BASE+RA*8]
3052 }
2890 | ins_next 3053 | ins_next
2891 |2: 3054 |2:
2892 | checktab RD, ->vmeta_len 3055 | checktab RD, ->vmeta_len
@@ -2894,72 +3057,108 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov)
2894 | mov RB, BASE // Save BASE. 3057 | mov RB, BASE // Save BASE.
2895 | call extern lj_tab_len@4 // (GCtab *t) 3058 | call extern lj_tab_len@4 // (GCtab *t)
2896 | // Length of table returned in eax (RC). 3059 | // Length of table returned in eax (RC).
2897 | mov ARG1, RC 3060 if (sse) {
2898 | mov BASE, RB // Restore BASE. 3061 | cvtsi2sd xmm0, RC
2899 | fild ARG1 3062 | mov BASE, RB // Restore BASE.
3063 } else {
3064 | mov ARG1, RC
3065 | mov BASE, RB // Restore BASE.
3066 | fild ARG1
3067 }
2900 | movzx RA, PC_RA 3068 | movzx RA, PC_RA
2901 | jmp <1 3069 | jmp <1
2902 break; 3070 break;
2903 3071
2904 /* -- Binary ops -------------------------------------------------------- */ 3072 /* -- Binary ops -------------------------------------------------------- */
2905 3073
2906 |.macro ins_arithpre, ins 3074 |.macro ins_arithpre, ins, sseins, ssereg
2907 | ins_ABC 3075 | ins_ABC
2908 ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN); 3076 ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN);
2909 ||switch (vk) { 3077 ||switch (vk) {
2910 ||case 0: 3078 ||case 0:
2911 | checknum RB, ->vmeta_arith_vn 3079 | checknum RB, ->vmeta_arith_vn
3080 ||if (sse) {
3081 | movsd xmm0, qword [BASE+RB*8]
3082 | sseins ssereg, qword [KBASE+RC*8]
3083 ||} else {
2912 | fld qword [BASE+RB*8] 3084 | fld qword [BASE+RB*8]
2913 | ins qword [KBASE+RC*8] 3085 | ins qword [KBASE+RC*8]
3086 ||}
2914 || break; 3087 || break;
2915 ||case 1: 3088 ||case 1:
2916 | checknum RB, ->vmeta_arith_nv 3089 | checknum RB, ->vmeta_arith_nv
3090 ||if (sse) {
3091 | movsd xmm0, qword [KBASE+RC*8]
3092 | sseins ssereg, qword [BASE+RB*8]
3093 ||} else {
2917 | fld qword [KBASE+RC*8] 3094 | fld qword [KBASE+RC*8]
2918 | ins qword [BASE+RB*8] 3095 | ins qword [BASE+RB*8]
3096 ||}
2919 || break; 3097 || break;
2920 ||default: 3098 ||default:
2921 | checknum RB, ->vmeta_arith_vv 3099 | checknum RB, ->vmeta_arith_vv
2922 | checknum RC, ->vmeta_arith_vv 3100 | checknum RC, ->vmeta_arith_vv
3101 ||if (sse) {
3102 | movsd xmm0, qword [BASE+RB*8]
3103 | sseins ssereg, qword [BASE+RC*8]
3104 ||} else {
2923 | fld qword [BASE+RB*8] 3105 | fld qword [BASE+RB*8]
2924 | ins qword [BASE+RC*8] 3106 | ins qword [BASE+RC*8]
3107 ||}
2925 || break; 3108 || break;
2926 ||} 3109 ||}
2927 |.endmacro 3110 |.endmacro
2928 | 3111 |
2929 |.macro ins_arith, ins 3112 |.macro ins_arithpost
2930 | ins_arithpre ins 3113 ||if (sse) {
3114 | movsd qword [BASE+RA*8], xmm0
3115 ||} else {
2931 | fstp qword [BASE+RA*8] 3116 | fstp qword [BASE+RA*8]
3117 ||}
3118 |.endmacro
3119 |
3120 |.macro ins_arith, ins, sseins
3121 | ins_arithpre ins, sseins, xmm0
3122 | ins_arithpost
2932 | ins_next 3123 | ins_next
2933 |.endmacro 3124 |.endmacro
2934 3125
2935 | // RA = dst, RB = src1 or num const, RC = src2 or num const 3126 | // RA = dst, RB = src1 or num const, RC = src2 or num const
2936 case BC_ADDVN: case BC_ADDNV: case BC_ADDVV: 3127 case BC_ADDVN: case BC_ADDNV: case BC_ADDVV:
2937 | ins_arith fadd 3128 | ins_arith fadd, addsd
2938 break; 3129 break;
2939 case BC_SUBVN: case BC_SUBNV: case BC_SUBVV: 3130 case BC_SUBVN: case BC_SUBNV: case BC_SUBVV:
2940 | ins_arith fsub 3131 | ins_arith fsub, subsd
2941 break; 3132 break;
2942 case BC_MULVN: case BC_MULNV: case BC_MULVV: 3133 case BC_MULVN: case BC_MULNV: case BC_MULVV:
2943 | ins_arith fmul 3134 | ins_arith fmul, mulsd
2944 break; 3135 break;
2945 case BC_DIVVN: case BC_DIVNV: case BC_DIVVV: 3136 case BC_DIVVN: case BC_DIVNV: case BC_DIVVV:
2946 | ins_arith fdiv 3137 | ins_arith fdiv, divsd
2947 break; 3138 break;
2948 case BC_MODVN: 3139 case BC_MODVN:
2949 | ins_arithpre fld 3140 | ins_arithpre fld, movsd, xmm1
2950 |->BC_MODVN_Z: 3141 |->BC_MODVN_Z:
2951 | call ->vm_mod 3142 | call ->vm_mod
2952 | fstp qword [BASE+RA*8] 3143 | ins_arithpost
2953 | ins_next 3144 | ins_next
2954 break; 3145 break;
2955 case BC_MODNV: case BC_MODVV: 3146 case BC_MODNV: case BC_MODVV:
2956 | ins_arithpre fld 3147 | ins_arithpre fld, movsd, xmm1
2957 | jmp ->BC_MODVN_Z // Avoid 3 copies. It's slow anyway. 3148 | jmp ->BC_MODVN_Z // Avoid 3 copies. It's slow anyway.
2958 break; 3149 break;
2959 case BC_POW: 3150 case BC_POW:
2960 | ins_arithpre fld 3151 if (sse) {
2961 | call ->vm_pow 3152 sse = 0; /* NYI: temporary workaround. */
2962 | fstp qword [BASE+RA*8] 3153 | ins_arithpre fld, movsd, xmm1
3154 | call ->vm_pow
3155 | ins_arithpost
3156 sse = 1;
3157 } else {
3158 | ins_arithpre fld, movsd, xmm1
3159 | call ->vm_pow
3160 | ins_arithpost
3161 }
2963 | ins_next 3162 | ins_next
2964 break; 3163 break;
2965 3164
@@ -3945,17 +4144,21 @@ static int build_backend(BuildCtx *ctx)
3945{ 4144{
3946 int op; 4145 int op;
3947 int cmov = 1; 4146 int cmov = 1;
4147 int sse = 0;
3948#ifdef LUAJIT_CPU_NOCMOV 4148#ifdef LUAJIT_CPU_NOCMOV
3949 cmov = 0; 4149 cmov = 0;
3950#endif 4150#endif
4151#ifdef LUAJIT_CPU_SSE2
4152 sse = 1;
4153#endif
3951 4154
3952 dasm_growpc(Dst, BC__MAX); 4155 dasm_growpc(Dst, BC__MAX);
3953 4156
3954 build_subroutines(ctx, cmov); 4157 build_subroutines(ctx, cmov, sse);
3955 4158
3956 |.code_op 4159 |.code_op
3957 for (op = 0; op < BC__MAX; op++) 4160 for (op = 0; op < BC__MAX; op++)
3958 build_ins(ctx, (BCOp)op, op, cmov); 4161 build_ins(ctx, (BCOp)op, op, cmov, sse);
3959 4162
3960 return BC__MAX; 4163 return BC__MAX;
3961} 4164}