aboutsummaryrefslogtreecommitdiff
path: root/src/buildvm_x86.dasc
diff options
context:
space:
mode:
author Mike Pall <mike> 2009-12-25 23:12:30 +0100
committer Mike Pall <mike> 2009-12-25 23:12:30 +0100
commit 690760aa3853e63331f46e40c8276d9f5939261d (patch)
tree b68fb518d22c3a08d8886bc532de91fdfdcc9360 /src/buildvm_x86.dasc
parent 6ce0c90ed642157f019b50ad1eb06246471a47b1 (diff)
download luajit-690760aa3853e63331f46e40c8276d9f5939261d.tar.gz
luajit-690760aa3853e63331f46e40c8276d9f5939261d.tar.bz2
luajit-690760aa3853e63331f46e40c8276d9f5939261d.zip
Add SSE variant of pow/powi to interpreter.
Use SSE pow/powi helper functions from compiled code. Cleanup use of helper functions. Related cleanups of folding functions in x64 interpreter.
Diffstat (limited to 'src/buildvm_x86.dasc')
-rw-r--r-- src/buildvm_x86.dasc 418
1 files changed, 300 insertions, 118 deletions
diff --git a/src/buildvm_x86.dasc b/src/buildvm_x86.dasc
index 99842d08..9ce8ef16 100644
--- a/src/buildvm_x86.dasc
+++ b/src/buildvm_x86.dasc
@@ -96,10 +96,6 @@
96|.type TRACE, Trace 96|.type TRACE, Trace
97|.type EXITINFO, ExitInfo 97|.type EXITINFO, ExitInfo
98| 98|
99|// x86/x64 portability macros
100|.macro push_eax; .if X64; push rax; .else; push eax; .endif; .endmacro
101|.macro pop_eax; .if X64; pop rax; .else; pop eax; .endif; .endmacro
102|
103|// Stack layout while in interpreter. Must match with lj_frame.h. 99|// Stack layout while in interpreter. Must match with lj_frame.h.
104|//----------------------------------------------------------------------- 100|//-----------------------------------------------------------------------
105|.if not X64 // x86 stack layout. 101|.if not X64 // x86 stack layout.
@@ -2072,10 +2068,10 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
2072 | fpop1 2068 | fpop1
2073 | jmp ->fff_resn 2069 | jmp ->fff_resn
2074 | 2070 |
2075 if (0 && sse) { // NYI 2071 if (sse) {
2076 |.ffunc_nnsse math_pow; call ->vm_pow; jmp ->fff_resxmm0 2072 |.ffunc_nnsse math_pow; call ->vm_pow; jmp ->fff_resxmm0
2077 } else { 2073 } else {
2078 |.ffunc_nn math_pow; call ->vm_pow; jmp ->fff_resn 2074 |.ffunc_nn math_pow; call ->vm_pow; jmp ->fff_resn
2079 } 2075 }
2080 | 2076 |
2081 |.macro math_minmax, name, cmovop, nocmovop, sseop 2077 |.macro math_minmax, name, cmovop, nocmovop, sseop
@@ -2091,6 +2087,7 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
2091 | add RB, 1 2087 | add RB, 1
2092 | jmp <1 2088 | jmp <1
2093 ||} else { 2089 ||} else {
2090 |.if not X64
2094 |.ffunc_n name 2091 |.ffunc_n name
2095 | mov RB, 2 2092 | mov RB, 2
2096 |1: 2093 |1:
@@ -2101,12 +2098,13 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
2101 ||if (cmov) { 2098 ||if (cmov) {
2102 | fucomi st1; cmovop st1; fpop1 2099 | fucomi st1; cmovop st1; fpop1
2103 ||} else { 2100 ||} else {
2104 | push_eax 2101 | push eax
2105 | fucom st1; fnstsw ax; test ah, 1; nocmovop >2; fxch; 2: ; fpop 2102 | fucom st1; fnstsw ax; test ah, 1; nocmovop >2; fxch; 2: ; fpop
2106 | pop_eax 2103 | pop eax
2107 ||} 2104 ||}
2108 | add RB, 1 2105 | add RB, 1
2109 | jmp <1 2106 | jmp <1
2107 |.endif
2110 ||} 2108 ||}
2111 |.endmacro 2109 |.endmacro
2112 | 2110 |
@@ -2842,19 +2840,29 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
2842 |->vm_exp: 2840 |->vm_exp:
2843 | fldl2e; fmulp st1 // e^x ==> 2^(x*log2(e)) 2841 | fldl2e; fmulp st1 // e^x ==> 2^(x*log2(e))
2844 |->vm_exp2: 2842 |->vm_exp2:
2845 | fst dword [esp+4] // Caveat: overwrites ARG1. 2843 | .if X64WIN
2846 | cmp dword [esp+4], 0x7f800000; je >1 // Special case: e^+Inf = +Inf 2844 | .define expscratch, dword [rsp+8] // Use scratch area.
2847 | cmp dword [esp+4], 0xff800000; je >2 // Special case: e^-Inf = 0 2845 | .elif X64
2846 | .define expscratch, dword [rsp-8] // Use red zone.
2847 | .else
2848 | .define expscratch, dword [esp+4] // Needs 4 byte scratch area.
2849 | .endif
2850 | fst expscratch // Caveat: overwrites ARG1.
2851 | cmp expscratch, 0x7f800000; je >1 // Special case: e^+Inf = +Inf
2852 | cmp expscratch, 0xff800000; je >2 // Special case: e^-Inf = 0
2848 |->vm_exp2raw: // Entry point for vm_pow. Without +-Inf check. 2853 |->vm_exp2raw: // Entry point for vm_pow. Without +-Inf check.
2849 | fdup; frndint; fsub st1, st0; fxch // Split into frac/int part. 2854 | fdup; frndint; fsub st1, st0; fxch // Split into frac/int part.
2850 | f2xm1; fld1; faddp st1; fscale; fpop1 // ==> (2^frac-1 +1) << int 2855 | f2xm1; fld1; faddp st1; fscale; fpop1 // ==> (2^frac-1 +1) << int
2851 |1: 2856 |1:
2852 | ret 2857 | ret
2853 |2: 2858 |2:
2854 | fpop; fldz; ret 2859 | fpop; fldz; ret
2855 | 2860 |
2856 |// Generic power function x^y. Called by BC_POW, math.pow fast function 2861 |// Generic power function x^y. Called by BC_POW, math.pow fast function,
2857 |// and vm_arith. Args/ret on x87 stack (y on top). No int/xmm regs modified. 2862 |// and vm_arith.
2863 if (!sse) {
2864 |.if not X64
2865 |// Args/ret on x87 stack (y on top). RC (eax) modified.
2858 |// Caveat: needs 3 slots on x87 stack! 2866 |// Caveat: needs 3 slots on x87 stack!
2859 |->vm_pow: 2867 |->vm_pow:
2860 | fist dword [esp+4] // Store/reload int before comparison. 2868 | fist dword [esp+4] // Store/reload int before comparison.
@@ -2862,18 +2870,16 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
2862 ||if (cmov) { 2870 ||if (cmov) {
2863 | fucomip st1 2871 | fucomip st1
2864 ||} else { 2872 ||} else {
2865 | push_eax; fucomp st1; fnstsw ax; sahf; pop_eax 2873 | fucomp st1; fnstsw ax; sahf
2866 ||} 2874 ||}
2867 | jnz >8 // Branch for FP exponents. 2875 | jnz >8 // Branch for FP exponents.
2868 | jp >9 // Branch for NaN exponent. 2876 | jp >9 // Branch for NaN exponent.
2869 | fpop // Pop y and fallthrough to vm_powi. 2877 | fpop // Pop y and fallthrough to vm_powi.
2870 | 2878 |
2871 |// FP/int power function x^i. Called from JIT code. Arg1/ret on x87 stack. 2879 |// FP/int power function x^i. Arg1/ret on x87 stack.
2872 |// Arg2 (int) on C stack. No int/xmm regs modified. 2880 |// Arg2 (int) on C stack. RC (eax) modified.
2873 |// Caveat: needs 2 slots on x87 stack! 2881 |// Caveat: needs 2 slots on x87 stack!
2874 |->vm_powi: 2882 | mov eax, [esp+4]
2875 | push_eax
2876 | mov eax, [esp+8]
2877 | cmp eax, 1; jle >6 // i<=1? 2883 | cmp eax, 1; jle >6 // i<=1?
2878 | // Now 1 < (unsigned)i <= 0x80000000. 2884 | // Now 1 < (unsigned)i <= 0x80000000.
2879 |1: // Handle leading zeros. 2885 |1: // Handle leading zeros.
@@ -2893,7 +2899,6 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
2893 |4: 2899 |4:
2894 | fmulp st1 2900 | fmulp st1
2895 |5: 2901 |5:
2896 | pop_eax
2897 | ret 2902 | ret
2898 |6: 2903 |6:
2899 | je <5 // x^1 ==> x 2904 | je <5 // x^1 ==> x
@@ -2904,19 +2909,16 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
2904 | jmp <1 // x^-i ==> (1/x)^i 2909 | jmp <1 // x^-i ==> (1/x)^i
2905 |7: 2910 |7:
2906 | fpop; fld1 // x^0 ==> 1 2911 | fpop; fld1 // x^0 ==> 1
2907 | pop_eax
2908 | ret 2912 | ret
2909 | 2913 |
2910 |8: // FP/FP power function x^y. 2914 |8: // FP/FP power function x^y.
2911 | push_eax 2915 | fst dword [esp+4]
2912 | fst dword [esp+8]
2913 | fxch 2916 | fxch
2914 | fst dword [esp+12] 2917 | fst dword [esp+8]
2915 | mov eax, [esp+8]; shl eax, 1 2918 | mov eax, [esp+4]; shl eax, 1
2916 | cmp eax, 0xff000000; je >2 // x^+-Inf? 2919 | cmp eax, 0xff000000; je >2 // x^+-Inf?
2917 | mov eax, [esp+12]; shl eax, 1; je >4 // +-0^y? 2920 | mov eax, [esp+8]; shl eax, 1; je >4 // +-0^y?
2918 | cmp eax, 0xff000000; je >4 // +-Inf^y? 2921 | cmp eax, 0xff000000; je >4 // +-Inf^y?
2919 | pop_eax
2920 | fyl2x 2922 | fyl2x
2921 | jmp ->vm_exp2raw 2923 | jmp ->vm_exp2raw
2922 | 2924 |
@@ -2925,7 +2927,7 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
2925 ||if (cmov) { 2927 ||if (cmov) {
2926 | fucomip st2 2928 | fucomip st2
2927 ||} else { 2929 ||} else {
2928 | push_eax; fucomp st2; fnstsw ax; sahf; pop_eax 2930 | fucomp st2; fnstsw ax; sahf
2929 ||} 2931 ||}
2930 | je >1 // 1^NaN ==> 1 2932 | je >1 // 1^NaN ==> 1
2931 | fxch // x^NaN ==> NaN 2933 | fxch // x^NaN ==> NaN
@@ -2943,41 +2945,205 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
2943 ||} 2945 ||}
2944 | je >3 // +-1^+-Inf ==> 1 2946 | je >3 // +-1^+-Inf ==> 1
2945 | fpop; fabs; fldz; mov eax, 0; setc al 2947 | fpop; fabs; fldz; mov eax, 0; setc al
2946 | ror eax, 1; xor eax, [esp+8]; jns >3 // |x|<>1, x^+-Inf ==> +Inf/0 2948 | ror eax, 1; xor eax, [esp+4]; jns >3 // |x|<>1, x^+-Inf ==> +Inf/0
2947 | fxch 2949 | fxch
2948 |3: 2950 |3:
2949 | fpop1; fabs; pop_eax 2951 | fpop1; fabs
2950 | ret 2952 | ret
2951 | 2953 |
2952 |4: // Handle +-0^y or +-Inf^y. 2954 |4: // Handle +-0^y or +-Inf^y.
2953 | cmp dword [esp+8], 0; jge <3 // y >= 0, x^y ==> |x| 2955 | cmp dword [esp+4], 0; jge <3 // y >= 0, x^y ==> |x|
2954 | fpop; fpop 2956 | fpop; fpop
2955 | test eax, eax; pop_eax; jz >5 // y < 0, +-0^y ==> +Inf 2957 | test eax, eax; jz >5 // y < 0, +-0^y ==> +Inf
2956 | fldz // y < 0, +-Inf^y ==> 0 2958 | fldz // y < 0, +-Inf^y ==> 0
2957 | ret 2959 | ret
2958 |5: 2960 |5:
2959 | mov dword [esp+8], 0x7f800000 // Return +Inf. 2961 | mov dword [esp+4], 0x7f800000 // Return +Inf.
2960 | fld dword [esp+8] 2962 | fld dword [esp+4]
2963 | ret
2964 |.endif
2965 } else {
2966 |->vm_pow:
2967 }
2968 |
2969 |// Args in xmm0/xmm1. Ret in xmm0. xmm0-xmm2 and RC (eax) modified.
2970 |// Needs 16 byte scratch area for x86. Also called from JIT code.
2971 |->vm_pow_sse:
2972 | cvtsd2si eax, xmm1
2973 | cvtsi2sd xmm2, eax
2974 | ucomisd xmm1, xmm2
2975 | jnz >8 // Branch for FP exponents.
2976 | jp >9 // Branch for NaN exponent.
2977 | // Fallthrough to vm_powi_sse.
2978 |
2979 |// Args in xmm0/eax. Ret in xmm0. xmm0-xmm1 and eax modified.
2980 |->vm_powi_sse:
2981 | cmp eax, 1; jle >6 // i<=1?
2982 | // Now 1 < (unsigned)i <= 0x80000000.
2983 |1: // Handle leading zeros.
2984 | test eax, 1; jnz >2
2985 | mulsd xmm0, xmm0
2986 | shr eax, 1
2987 | jmp <1
2988 |2:
2989 | shr eax, 1; jz >5
2990 | movaps xmm1, xmm0
2991 |3: // Handle trailing bits.
2992 | mulsd xmm0, xmm0
2993 | shr eax, 1; jz >4
2994 | jnc <3
2995 | mulsd xmm1, xmm0
2996 | jmp <3
2997 |4:
2998 | mulsd xmm0, xmm1
2999 |5:
3000 | ret
3001 |6:
3002 | je <5 // x^1 ==> x
3003 | jb >7
3004 | push RDa
3005 | sseconst_1 xmm1, RDa
3006 | divsd xmm1, xmm0
3007 | pop RDa
3008 | movaps xmm0, xmm1
3009 | neg eax
3010 | cmp eax, 1; je <5 // x^-1 ==> 1/x
3011 | jmp <1 // x^-i ==> (1/x)^i
3012 |7:
3013 | sseconst_1 xmm0, RDa
3014 | ret
3015 |
3016 |8: // FP/FP power function x^y.
3017 |.if X64
3018 | movd rax, xmm1; shl rax, 1
3019 | ror rax, 32; cmp rax, 0xffe00000; je >2 // x^+-Inf?
3020 | movd rax, xmm0; shl rax, 1; je >4 // +-0^y?
3021 | ror rax, 32; cmp rax, 0xffe00000; je >5 // +-Inf^y?
3022 | .if X64WIN
3023 | movsd qword [rsp+16], xmm1 // Use scratch area.
3024 | movsd qword [rsp+8], xmm0
3025 | fld qword [rsp+16]
3026 | fld qword [rsp+8]
3027 | .else
3028 | movsd qword [rsp-16], xmm1 // Use red zone.
3029 | movsd qword [rsp-8], xmm0
3030 | fld qword [rsp-16]
3031 | fld qword [rsp-8]
3032 | .endif
3033 |.else
3034 | movsd qword [esp+12], xmm1 // Needs 16 byte scratch area.
3035 | movsd qword [esp+4], xmm0
3036 | cmp dword [esp+12], 0; jne >1
3037 | mov eax, [esp+16]; shl eax, 1
3038 | cmp eax, 0xffe00000; je >2 // x^+-Inf?
3039 |1:
3040 | cmp dword [esp+4], 0; jne >1
3041 | mov eax, [esp+8]; shl eax, 1; je >4 // +-0^y?
3042 | cmp eax, 0xffe00000; je >5 // +-Inf^y?
3043 |1:
3044 | fld qword [esp+12]
3045 | fld qword [esp+4]
3046 |.endif
3047 | fyl2x // y*log2(x)
3048 | fdup; frndint; fsub st1, st0; fxch // Split into frac/int part.
3049 | f2xm1; fld1; faddp st1; fscale; fpop1 // ==> (2^frac-1 +1) << int
3050 |.if X64WIN
3051 | fstp qword [rsp+8] // Use scratch area.
3052 | movsd xmm0, qword [rsp+8]
3053 |.elif X64
3054 | fstp qword [rsp-8] // Use red zone.
3055 | movsd xmm0, qword [rsp-8]
3056 |.else
3057 | fstp qword [esp+4] // Needs 8 byte scratch area.
3058 | movsd xmm0, qword [esp+4]
3059 |.endif
3060 | ret
3061 |
3062 |9: // Handle x^NaN.
3063 | sseconst_1 xmm2, RDa
3064 | ucomisd xmm0, xmm2; je >1 // 1^NaN ==> 1
3065 | movaps xmm0, xmm1 // x^NaN ==> NaN
3066 |1:
3067 | ret
3068 |
3069 |2: // Handle x^+-Inf.
3070 | sseconst_abs xmm2, RDa
3071 | andpd xmm0, xmm2 // |x|
3072 | sseconst_1 xmm2, RDa
3073 | ucomisd xmm0, xmm2; je <1 // +-1^+-Inf ==> 1
3074 | movmskpd eax, xmm1
3075 | xorps xmm0, xmm0
3076 | mov ah, al; setc al; xor al, ah; jne <1 // |x|<>1, x^+-Inf ==> +Inf/0
3077 |3:
3078 | sseconst_hi xmm0, RDa, 7ff00000 // +Inf
3079 | ret
3080 |
3081 |4: // Handle +-0^y.
3082 | movmskpd eax, xmm1; test eax, eax; jnz <3 // y < 0, +-0^y ==> +Inf
3083 | xorps xmm0, xmm0 // y >= 0, +-0^y ==> 0
3084 | ret
3085 |
3086 |5: // Handle +-Inf^y.
3087 | movmskpd eax, xmm1; test eax, eax; jz <3 // y >= 0, +-Inf^y ==> +Inf
3088 | xorps xmm0, xmm0 // y < 0, +-Inf^y ==> 0
2961 | ret 3089 | ret
2962 | 3090 |
2963 |// Callable from C: double lj_vm_foldfpm(double x, int fpm) 3091 |// Callable from C: double lj_vm_foldfpm(double x, int fpm)
2964 |// Computes fpm(x) for extended math functions. ORDER FPM. 3092 |// Computes fpm(x) for extended math functions. ORDER FPM.
2965 |->vm_foldfpm: 3093 |->vm_foldfpm:
2966 if (sse) { 3094 if (sse) {
2967 |.if X64WIN
2968 | .define fpmop, CARG2d
2969 |.elif X64
2970 | .define fpmop, CARG1d
2971 |.else
2972 | .define fpmop, eax
2973 | mov fpmop, [esp+12]
2974 | movsd xmm0, qword [esp+4]
2975 |.endif
2976 |.if X64 3095 |.if X64
3096 |
3097 | .if X64WIN
3098 | .define fpmop, CARG2d
3099 | .else
3100 | .define fpmop, CARG1d
3101 | .endif
2977 | cmp fpmop, 1; jb ->vm_floor; je ->vm_ceil 3102 | cmp fpmop, 1; jb ->vm_floor; je ->vm_ceil
2978 | cmp fpmop, 3; jb ->vm_trunc; ja >2 3103 | cmp fpmop, 3; jb ->vm_trunc; ja >2
2979 | sqrtsd xmm0, xmm0; ret 3104 | sqrtsd xmm0, xmm0; ret
2980 |.else 3105 |2:
3106 | .if X64WIN
3107 | movsd qword [rsp+8], xmm0 // Use scratch area.
3108 | fld qword [rsp+8]
3109 | .else
3110 | movsd qword [rsp-8], xmm0 // Use red zone.
3111 | fld qword [rsp-8]
3112 | .endif
3113 | cmp fpmop, 5; ja >2
3114 | .if X64WIN; pop rax; .endif
3115 | je >1
3116 | call ->vm_exp
3117 | .if X64WIN; push rax; .endif
3118 | jmp >7
3119 |1:
3120 | call ->vm_exp2
3121 | .if X64WIN; push rax; .endif
3122 | jmp >7
3123 |2: ; cmp fpmop, 7; je >1; ja >2
3124 | fldln2; fxch; fyl2x; jmp >7
3125 |1: ; fld1; fxch; fyl2x; jmp >7
3126 |2: ; cmp fpmop, 9; je >1; ja >2
3127 | fldlg2; fxch; fyl2x; jmp >7
3128 |1: ; fsin; jmp >7
3129 |2: ; cmp fpmop, 11; je >1; ja >9
3130 | fcos; jmp >7
3131 |1: ; fptan; fpop
3132 |7:
3133 | .if X64WIN
3134 | fstp qword [rsp+8] // Use scratch area.
3135 | movsd xmm0, qword [rsp+8]
3136 | .else
3137 | fstp qword [rsp-8] // Use red zone.
3138 | movsd xmm0, qword [rsp-8]
3139 | .endif
3140 | ret
3141 |
3142 |.else // x86 calling convention.
3143 |
3144 | .define fpmop, eax
3145 | mov fpmop, [esp+12]
3146 | movsd xmm0, qword [esp+4]
2981 | cmp fpmop, 1; je >1; ja >2 3147 | cmp fpmop, 1; je >1; ja >2
2982 | call ->vm_floor; jmp >7 3148 | call ->vm_floor; jmp >7
2983 |1: ; call ->vm_ceil; jmp >7 3149 |1: ; call ->vm_ceil; jmp >7
@@ -2989,27 +3155,36 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
2989 | movsd qword [esp+4], xmm0 // Overwrite callee-owned args. 3155 | movsd qword [esp+4], xmm0 // Overwrite callee-owned args.
2990 | fld qword [esp+4] 3156 | fld qword [esp+4]
2991 | ret 3157 | ret
3158 |2: ; fld qword [esp+4]
3159 | cmp fpmop, 5; jb ->vm_exp; je ->vm_exp2
3160 |2: ; cmp fpmop, 7; je >1; ja >2
3161 | fldln2; fxch; fyl2x; ret
3162 |1: ; fld1; fxch; fyl2x; ret
3163 |2: ; cmp fpmop, 9; je >1; ja >2
3164 | fldlg2; fxch; fyl2x; ret
3165 |1: ; fsin; ret
3166 |2: ; cmp fpmop, 11; je >1; ja >9
3167 | fcos; ret
3168 |1: ; fptan; fpop; ret
3169 |
2992 |.endif 3170 |.endif
2993 |2:
2994 | fld qword [esp+4]
2995 } else { 3171 } else {
2996 | mov fpmop, [esp+12] 3172 | mov fpmop, [esp+12]
2997 | fld qword [esp+4] 3173 | fld qword [esp+4]
2998 | cmp fpmop, 1; jb ->vm_floor; je ->vm_ceil 3174 | cmp fpmop, 1; jb ->vm_floor; je ->vm_ceil
2999 | cmp fpmop, 3; jb ->vm_trunc; ja >2 3175 | cmp fpmop, 3; jb ->vm_trunc; ja >2
3000 | fsqrt; ret 3176 | fsqrt; ret
3001 |2: 3177 |2: ; cmp fpmop, 5; jb ->vm_exp; je ->vm_exp2
3178 | cmp fpmop, 7; je >1; ja >2
3179 | fldln2; fxch; fyl2x; ret
3180 |1: ; fld1; fxch; fyl2x; ret
3181 |2: ; cmp fpmop, 9; je >1; ja >2
3182 | fldlg2; fxch; fyl2x; ret
3183 |1: ; fsin; ret
3184 |2: ; cmp fpmop, 11; je >1; ja >9
3185 | fcos; ret
3186 |1: ; fptan; fpop; ret
3002 } 3187 }
3003 | cmp fpmop, 5; jb ->vm_exp; je ->vm_exp2
3004 | cmp fpmop, 7; je >1; ja >2
3005 | fldln2; fxch; fyl2x; ret
3006 |1: ; fld1; fxch; fyl2x; ret
3007 |2: ; cmp fpmop, 9; je >1; ja >2
3008 | fldlg2; fxch; fyl2x; ret
3009 |1: ; fsin; ret
3010 |2: ; cmp fpmop, 11; je >1; ja >9
3011 | fcos; ret
3012 |1: ; fptan; fpop; ret
3013 |9: ; int3 // Bad fpm. 3188 |9: ; int3 // Bad fpm.
3014 | 3189 |
3015 |// Callable from C: double lj_vm_foldarith(double x, double y, int op) 3190 |// Callable from C: double lj_vm_foldarith(double x, double y, int op)
@@ -3017,72 +3192,87 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
3017 |// and basic math functions. ORDER ARITH 3192 |// and basic math functions. ORDER ARITH
3018 |->vm_foldarith: 3193 |->vm_foldarith:
3019 if (sse) { 3194 if (sse) {
3020 |.macro retxmm0; .if X64; ret; .else; jmp >7; .endif; .endmacro 3195 |.if X64
3021 |.macro retst0; .if X64; jmp >7; .else; ret; .endif; .endmacro
3022 | 3196 |
3197 | .if X64WIN
3198 | .define foldop, CARG3d
3199 | .else
3200 | .define foldop, CARG1d
3201 | .endif
3202 | cmp foldop, 1; je >1; ja >2
3203 | addsd xmm0, xmm1; ret
3204 |1: ; subsd xmm0, xmm1; ret
3205 |2: ; cmp foldop, 3; je >1; ja >2
3206 | mulsd xmm0, xmm1; ret
3207 |1: ; divsd xmm0, xmm1; ret
3208 |2: ; cmp foldop, 5; jb ->vm_mod; je ->vm_pow
3209 | cmp foldop, 7; je >1; ja >2
3210 | sseconst_sign xmm1, RDa; xorps xmm0, xmm1; ret
3211 |1: ; sseconst_abs xmm1, RDa; andps xmm0, xmm1; ret
3212 |2: ; cmp foldop, 9; ja >2
3023 |.if X64WIN 3213 |.if X64WIN
3024 | .define foldop, CARG3d 3214 | movsd qword [rsp+8], xmm0 // Use scratch area.
3025 |.elif X64 3215 | movsd qword [rsp+16], xmm1
3026 | .define foldop, CARG1d 3216 | fld qword [rsp+8]
3217 | fld qword [rsp+16]
3027 |.else 3218 |.else
3219 | movsd qword [rsp-8], xmm0 // Use red zone.
3220 | movsd qword [rsp-16], xmm1
3221 | fld qword [rsp-8]
3222 | fld qword [rsp-16]
3223 |.endif
3224 | je >1
3225 | fpatan
3226 |7:
3227 |.if X64WIN
3228 | fstp qword [rsp+8] // Use scratch area.
3229 | movsd xmm0, qword [rsp+8]
3230 |.else
3231 | fstp qword [rsp-8] // Use red zone.
3232 | movsd xmm0, qword [rsp-8]
3233 |.endif
3234 | ret
3235 |1: ; fxch; fscale; fpop1; jmp <7
3236 |2: ; cmp foldop, 11; je >1; ja >9
3237 | minsd xmm0, xmm1; ret
3238 |1: ; maxsd xmm0, xmm1; ret
3239 |9: ; int3 // Bad op.
3240 |
3241 |.else // x86 calling convention.
3242 |
3028 | .define foldop, eax 3243 | .define foldop, eax
3029 | mov foldop, [esp+20] 3244 | mov foldop, [esp+20]
3030 | movsd xmm0, qword [esp+4] 3245 | movsd xmm0, qword [esp+4]
3031 | movsd xmm1, qword [esp+12] 3246 | movsd xmm1, qword [esp+12]
3032 |.endif
3033 | cmp foldop, 1; je >1; ja >2 3247 | cmp foldop, 1; je >1; ja >2
3034 | addsd xmm0, xmm1; retxmm0 3248 | addsd xmm0, xmm1
3035 |1: ; subsd xmm0, xmm1; retxmm0 3249 |7:
3250 | movsd qword [esp+4], xmm0 // Overwrite callee-owned args.
3251 | fld qword [esp+4]
3252 | ret
3253 |1: ; subsd xmm0, xmm1; jmp <7
3036 |2: ; cmp foldop, 3; je >1; ja >2 3254 |2: ; cmp foldop, 3; je >1; ja >2
3037 | mulsd xmm0, xmm1; retxmm0 3255 | mulsd xmm0, xmm1; jmp <7
3038 |1: ; divsd xmm0, xmm1; retxmm0 3256 |1: ; divsd xmm0, xmm1; jmp <7
3039 |2: ; cmp foldop, 5 3257 |2: ; cmp foldop, 5
3040 |.if X64
3041 | jb ->vm_mod; je ->vm_pow // NYI: broken without SSE vm_pow.
3042 |.else
3043 | je >1; ja >2 3258 | je >1; ja >2
3044 | call ->vm_mod; retxmm0 3259 | call ->vm_mod; jmp <7
3045 |1: ; fld qword [esp+4]; fld qword [esp+12]; jmp ->vm_pow // NYI 3260 |1: ; pop edx; call ->vm_pow; push edx; jmp <7 // Writes to scratch area.
3046 |2: 3261 |2: ; cmp foldop, 7; je >1; ja >2
3047 |.endif 3262 | sseconst_sign xmm1, RDa; xorps xmm0, xmm1; jmp <7
3048 | cmp foldop, 7; je >1; ja >2 3263 |1: ; sseconst_abs xmm1, RDa; andps xmm0, xmm1; jmp <7
3049 | sseconst_sign xmm1, RDa; xorps xmm0, xmm1; retxmm0
3050 |1:
3051 | sseconst_abs xmm1, RDa; andps xmm0, xmm1; retxmm0
3052 |2: ; cmp foldop, 9; ja >2 3264 |2: ; cmp foldop, 9; ja >2
3053 |.if X64WIN
3054 | movsd qword [esp+8], xmm0 // Use scratch area.
3055 | movsd qword [esp+16], xmm1
3056 | fld qword [esp+8]
3057 | fld qword [esp+16]
3058 |.elif X64
3059 | movsd qword [esp-8], xmm0 // Use red zone.
3060 | movsd qword [esp-16], xmm1
3061 | fld qword [esp-8]
3062 | fld qword [esp-16]
3063 |.else
3064 | fld qword [esp+4] // Reload from stack 3265 | fld qword [esp+4] // Reload from stack
3065 | fld qword [esp+12] 3266 | fld qword [esp+12]
3066 |.endif
3067 | je >1 3267 | je >1
3068 | fpatan; retst0 3268 | fpatan; ret
3069 |1: ; fxch; fscale; fpop1; retst0 3269 |1: ; fxch; fscale; fpop1; ret
3070 |2: ; cmp foldop, 11; je >1; ja >9 3270 |2: ; cmp foldop, 11; je >1; ja >9
3071 | minsd xmm0, xmm1; retxmm0 3271 | minsd xmm0, xmm1; jmp <7
3072 |1: ; maxsd xmm0, xmm1; retxmm0 3272 |1: ; maxsd xmm0, xmm1; jmp <7
3073 |9: ; int3 // Bad op. 3273 |9: ; int3 // Bad op.
3074 |7: // Move return value depending on calling convention. 3274 |
3075 |.if X64WIN
3076 | fstp qword [esp+8] // Use scratch area.
3077 | movsd xmm0, qword [esp+8]
3078 |.elif X64
3079 | fstp qword [esp-8] // Use red zone.
3080 | movsd xmm0, qword [esp-8]
3081 |.else
3082 | movsd qword [esp+4], xmm0 // Overwrite callee-owned args.
3083 | fld qword [esp+4]
3084 |.endif 3275 |.endif
3085 | ret
3086 } else { 3276 } else {
3087 | mov eax, [esp+20] 3277 | mov eax, [esp+20]
3088 | fld qword [esp+4] 3278 | fld qword [esp+4]
@@ -3483,17 +3673,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov, int sse)
3483 | jmp ->BC_MODVN_Z // Avoid 3 copies. It's slow anyway. 3673 | jmp ->BC_MODVN_Z // Avoid 3 copies. It's slow anyway.
3484 break; 3674 break;
3485 case BC_POW: 3675 case BC_POW:
3486 if (sse) { 3676 | ins_arithpre fld, movsd, xmm1
3487 sse = 0; /* NYI: temporary workaround. */ 3677 | call ->vm_pow
3488 | ins_arithpre fld, movsd, xmm1 3678 | ins_arithpost
3489 | call ->vm_pow
3490 | ins_arithpost
3491 sse = 1;
3492 } else {
3493 | ins_arithpre fld, movsd, xmm1
3494 | call ->vm_pow
3495 | ins_arithpost
3496 }
3497 | ins_next 3679 | ins_next
3498 break; 3680 break;
3499 3681