diff options
author | Mike Pall <mike> | 2009-12-25 23:12:30 +0100 |
---|---|---|
committer | Mike Pall <mike> | 2009-12-25 23:12:30 +0100 |
commit | 690760aa3853e63331f46e40c8276d9f5939261d (patch) | |
tree | b68fb518d22c3a08d8886bc532de91fdfdcc9360 /src/buildvm_x86.dasc | |
parent | 6ce0c90ed642157f019b50ad1eb06246471a47b1 (diff) | |
download | luajit-690760aa3853e63331f46e40c8276d9f5939261d.tar.gz luajit-690760aa3853e63331f46e40c8276d9f5939261d.tar.bz2 luajit-690760aa3853e63331f46e40c8276d9f5939261d.zip |
Add SSE variant of pow/powi to interpreter.
Use SSE pow/powi helper functions from compiled code.
Cleanup use of helper functions.
Related cleanups of folding functions in x64 interpreter.
Diffstat (limited to 'src/buildvm_x86.dasc')
-rw-r--r-- | src/buildvm_x86.dasc | 418 |
1 files changed, 300 insertions, 118 deletions
diff --git a/src/buildvm_x86.dasc b/src/buildvm_x86.dasc index 99842d08..9ce8ef16 100644 --- a/src/buildvm_x86.dasc +++ b/src/buildvm_x86.dasc | |||
@@ -96,10 +96,6 @@ | |||
96 | |.type TRACE, Trace | 96 | |.type TRACE, Trace |
97 | |.type EXITINFO, ExitInfo | 97 | |.type EXITINFO, ExitInfo |
98 | | | 98 | | |
99 | |// x86/x64 portability macros | ||
100 | |.macro push_eax; .if X64; push rax; .else; push eax; .endif; .endmacro | ||
101 | |.macro pop_eax; .if X64; pop rax; .else; pop eax; .endif; .endmacro | ||
102 | | | ||
103 | |// Stack layout while in interpreter. Must match with lj_frame.h. | 99 | |// Stack layout while in interpreter. Must match with lj_frame.h. |
104 | |//----------------------------------------------------------------------- | 100 | |//----------------------------------------------------------------------- |
105 | |.if not X64 // x86 stack layout. | 101 | |.if not X64 // x86 stack layout. |
@@ -2072,10 +2068,10 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
2072 | | fpop1 | 2068 | | fpop1 |
2073 | | jmp ->fff_resn | 2069 | | jmp ->fff_resn |
2074 | | | 2070 | | |
2075 | if (0 && sse) { // NYI | 2071 | if (sse) { |
2076 | |.ffunc_nnsse math_pow; call ->vm_pow; jmp ->fff_resxmm0 | 2072 | |.ffunc_nnsse math_pow; call ->vm_pow; jmp ->fff_resxmm0 |
2077 | } else { | 2073 | } else { |
2078 | |.ffunc_nn math_pow; call ->vm_pow; jmp ->fff_resn | 2074 | |.ffunc_nn math_pow; call ->vm_pow; jmp ->fff_resn |
2079 | } | 2075 | } |
2080 | | | 2076 | | |
2081 | |.macro math_minmax, name, cmovop, nocmovop, sseop | 2077 | |.macro math_minmax, name, cmovop, nocmovop, sseop |
@@ -2091,6 +2087,7 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
2091 | | add RB, 1 | 2087 | | add RB, 1 |
2092 | | jmp <1 | 2088 | | jmp <1 |
2093 | ||} else { | 2089 | ||} else { |
2090 | |.if not X64 | ||
2094 | |.ffunc_n name | 2091 | |.ffunc_n name |
2095 | | mov RB, 2 | 2092 | | mov RB, 2 |
2096 | |1: | 2093 | |1: |
@@ -2101,12 +2098,13 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
2101 | ||if (cmov) { | 2098 | ||if (cmov) { |
2102 | | fucomi st1; cmovop st1; fpop1 | 2099 | | fucomi st1; cmovop st1; fpop1 |
2103 | ||} else { | 2100 | ||} else { |
2104 | | push_eax | 2101 | | push eax |
2105 | | fucom st1; fnstsw ax; test ah, 1; nocmovop >2; fxch; 2: ; fpop | 2102 | | fucom st1; fnstsw ax; test ah, 1; nocmovop >2; fxch; 2: ; fpop |
2106 | | pop_eax | 2103 | | pop eax |
2107 | ||} | 2104 | ||} |
2108 | | add RB, 1 | 2105 | | add RB, 1 |
2109 | | jmp <1 | 2106 | | jmp <1 |
2107 | |.endif | ||
2110 | ||} | 2108 | ||} |
2111 | |.endmacro | 2109 | |.endmacro |
2112 | | | 2110 | | |
@@ -2842,19 +2840,29 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
2842 | |->vm_exp: | 2840 | |->vm_exp: |
2843 | | fldl2e; fmulp st1 // e^x ==> 2^(x*log2(e)) | 2841 | | fldl2e; fmulp st1 // e^x ==> 2^(x*log2(e)) |
2844 | |->vm_exp2: | 2842 | |->vm_exp2: |
2845 | | fst dword [esp+4] // Caveat: overwrites ARG1. | 2843 | | .if X64WIN |
2846 | | cmp dword [esp+4], 0x7f800000; je >1 // Special case: e^+Inf = +Inf | 2844 | | .define expscratch, dword [rsp+8] // Use scratch area. |
2847 | | cmp dword [esp+4], 0xff800000; je >2 // Special case: e^-Inf = 0 | 2845 | | .elif X64 |
2846 | | .define expscratch, dword [rsp-8] // Use red zone. | ||
2847 | | .else | ||
2848 | | .define expscratch, dword [esp+4] // Needs 4 byte scratch area. | ||
2849 | | .endif | ||
2850 | | fst expscratch // Caveat: overwrites ARG1. | ||
2851 | | cmp expscratch, 0x7f800000; je >1 // Special case: e^+Inf = +Inf | ||
2852 | | cmp expscratch, 0xff800000; je >2 // Special case: e^-Inf = 0 | ||
2848 | |->vm_exp2raw: // Entry point for vm_pow. Without +-Inf check. | 2853 | |->vm_exp2raw: // Entry point for vm_pow. Without +-Inf check. |
2849 | | fdup; frndint; fsub st1, st0; fxch // Split into frac/int part. | 2854 | | fdup; frndint; fsub st1, st0; fxch // Split into frac/int part. |
2850 | | f2xm1; fld1; faddp st1; fscale; fpop1 // ==> (2^frac-1 +1) << int | 2855 | | f2xm1; fld1; faddp st1; fscale; fpop1 // ==> (2^frac-1 +1) << int |
2851 | |1: | 2856 | |1: |
2852 | | ret | 2857 | | ret |
2853 | |2: | 2858 | |2: |
2854 | | fpop; fldz; ret | 2859 | | fpop; fldz; ret |
2855 | | | 2860 | | |
2856 | |// Generic power function x^y. Called by BC_POW, math.pow fast function | 2861 | |// Generic power function x^y. Called by BC_POW, math.pow fast function, |
2857 | |// and vm_arith. Args/ret on x87 stack (y on top). No int/xmm regs modified. | 2862 | |// and vm_arith. |
2863 | if (!sse) { | ||
2864 | |.if not X64 | ||
2865 | |// Args/ret on x87 stack (y on top). RC (eax) modified. | ||
2858 | |// Caveat: needs 3 slots on x87 stack! | 2866 | |// Caveat: needs 3 slots on x87 stack! |
2859 | |->vm_pow: | 2867 | |->vm_pow: |
2860 | | fist dword [esp+4] // Store/reload int before comparison. | 2868 | | fist dword [esp+4] // Store/reload int before comparison. |
@@ -2862,18 +2870,16 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
2862 | ||if (cmov) { | 2870 | ||if (cmov) { |
2863 | | fucomip st1 | 2871 | | fucomip st1 |
2864 | ||} else { | 2872 | ||} else { |
2865 | | push_eax; fucomp st1; fnstsw ax; sahf; pop_eax | 2873 | | fucomp st1; fnstsw ax; sahf |
2866 | ||} | 2874 | ||} |
2867 | | jnz >8 // Branch for FP exponents. | 2875 | | jnz >8 // Branch for FP exponents. |
2868 | | jp >9 // Branch for NaN exponent. | 2876 | | jp >9 // Branch for NaN exponent. |
2869 | | fpop // Pop y and fallthrough to vm_powi. | 2877 | | fpop // Pop y and fallthrough to vm_powi. |
2870 | | | 2878 | | |
2871 | |// FP/int power function x^i. Called from JIT code. Arg1/ret on x87 stack. | 2879 | |// FP/int power function x^i. Arg1/ret on x87 stack. |
2872 | |// Arg2 (int) on C stack. No int/xmm regs modified. | 2880 | |// Arg2 (int) on C stack. RC (eax) modified. |
2873 | |// Caveat: needs 2 slots on x87 stack! | 2881 | |// Caveat: needs 2 slots on x87 stack! |
2874 | |->vm_powi: | 2882 | | mov eax, [esp+4] |
2875 | | push_eax | ||
2876 | | mov eax, [esp+8] | ||
2877 | | cmp eax, 1; jle >6 // i<=1? | 2883 | | cmp eax, 1; jle >6 // i<=1? |
2878 | | // Now 1 < (unsigned)i <= 0x80000000. | 2884 | | // Now 1 < (unsigned)i <= 0x80000000. |
2879 | |1: // Handle leading zeros. | 2885 | |1: // Handle leading zeros. |
@@ -2893,7 +2899,6 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
2893 | |4: | 2899 | |4: |
2894 | | fmulp st1 | 2900 | | fmulp st1 |
2895 | |5: | 2901 | |5: |
2896 | | pop_eax | ||
2897 | | ret | 2902 | | ret |
2898 | |6: | 2903 | |6: |
2899 | | je <5 // x^1 ==> x | 2904 | | je <5 // x^1 ==> x |
@@ -2904,19 +2909,16 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
2904 | | jmp <1 // x^-i ==> (1/x)^i | 2909 | | jmp <1 // x^-i ==> (1/x)^i |
2905 | |7: | 2910 | |7: |
2906 | | fpop; fld1 // x^0 ==> 1 | 2911 | | fpop; fld1 // x^0 ==> 1 |
2907 | | pop_eax | ||
2908 | | ret | 2912 | | ret |
2909 | | | 2913 | | |
2910 | |8: // FP/FP power function x^y. | 2914 | |8: // FP/FP power function x^y. |
2911 | | push_eax | 2915 | | fst dword [esp+4] |
2912 | | fst dword [esp+8] | ||
2913 | | fxch | 2916 | | fxch |
2914 | | fst dword [esp+12] | 2917 | | fst dword [esp+8] |
2915 | | mov eax, [esp+8]; shl eax, 1 | 2918 | | mov eax, [esp+4]; shl eax, 1 |
2916 | | cmp eax, 0xff000000; je >2 // x^+-Inf? | 2919 | | cmp eax, 0xff000000; je >2 // x^+-Inf? |
2917 | | mov eax, [esp+12]; shl eax, 1; je >4 // +-0^y? | 2920 | | mov eax, [esp+8]; shl eax, 1; je >4 // +-0^y? |
2918 | | cmp eax, 0xff000000; je >4 // +-Inf^y? | 2921 | | cmp eax, 0xff000000; je >4 // +-Inf^y? |
2919 | | pop_eax | ||
2920 | | fyl2x | 2922 | | fyl2x |
2921 | | jmp ->vm_exp2raw | 2923 | | jmp ->vm_exp2raw |
2922 | | | 2924 | | |
@@ -2925,7 +2927,7 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
2925 | ||if (cmov) { | 2927 | ||if (cmov) { |
2926 | | fucomip st2 | 2928 | | fucomip st2 |
2927 | ||} else { | 2929 | ||} else { |
2928 | | push_eax; fucomp st2; fnstsw ax; sahf; pop_eax | 2930 | | fucomp st2; fnstsw ax; sahf |
2929 | ||} | 2931 | ||} |
2930 | | je >1 // 1^NaN ==> 1 | 2932 | | je >1 // 1^NaN ==> 1 |
2931 | | fxch // x^NaN ==> NaN | 2933 | | fxch // x^NaN ==> NaN |
@@ -2943,41 +2945,205 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
2943 | ||} | 2945 | ||} |
2944 | | je >3 // +-1^+-Inf ==> 1 | 2946 | | je >3 // +-1^+-Inf ==> 1 |
2945 | | fpop; fabs; fldz; mov eax, 0; setc al | 2947 | | fpop; fabs; fldz; mov eax, 0; setc al |
2946 | | ror eax, 1; xor eax, [esp+8]; jns >3 // |x|<>1, x^+-Inf ==> +Inf/0 | 2948 | | ror eax, 1; xor eax, [esp+4]; jns >3 // |x|<>1, x^+-Inf ==> +Inf/0 |
2947 | | fxch | 2949 | | fxch |
2948 | |3: | 2950 | |3: |
2949 | | fpop1; fabs; pop_eax | 2951 | | fpop1; fabs |
2950 | | ret | 2952 | | ret |
2951 | | | 2953 | | |
2952 | |4: // Handle +-0^y or +-Inf^y. | 2954 | |4: // Handle +-0^y or +-Inf^y. |
2953 | | cmp dword [esp+8], 0; jge <3 // y >= 0, x^y ==> |x| | 2955 | | cmp dword [esp+4], 0; jge <3 // y >= 0, x^y ==> |x| |
2954 | | fpop; fpop | 2956 | | fpop; fpop |
2955 | | test eax, eax; pop_eax; jz >5 // y < 0, +-0^y ==> +Inf | 2957 | | test eax, eax; jz >5 // y < 0, +-0^y ==> +Inf |
2956 | | fldz // y < 0, +-Inf^y ==> 0 | 2958 | | fldz // y < 0, +-Inf^y ==> 0 |
2957 | | ret | 2959 | | ret |
2958 | |5: | 2960 | |5: |
2959 | | mov dword [esp+8], 0x7f800000 // Return +Inf. | 2961 | | mov dword [esp+4], 0x7f800000 // Return +Inf. |
2960 | | fld dword [esp+8] | 2962 | | fld dword [esp+4] |
2963 | | ret | ||
2964 | |.endif | ||
2965 | } else { | ||
2966 | |->vm_pow: | ||
2967 | } | ||
2968 | | | ||
2969 | |// Args in xmm0/xmm1. Ret in xmm0. xmm0-xmm2 and RC (eax) modified. | ||
2970 | |// Needs 16 byte scratch area for x86. Also called from JIT code. | ||
2971 | |->vm_pow_sse: | ||
2972 | | cvtsd2si eax, xmm1 | ||
2973 | | cvtsi2sd xmm2, eax | ||
2974 | | ucomisd xmm1, xmm2 | ||
2975 | | jnz >8 // Branch for FP exponents. | ||
2976 | | jp >9 // Branch for NaN exponent. | ||
2977 | | // Fallthrough to vm_powi_sse. | ||
2978 | | | ||
2979 | |// Args in xmm0/eax. Ret in xmm0. xmm0-xmm1 and eax modified. | ||
2980 | |->vm_powi_sse: | ||
2981 | | cmp eax, 1; jle >6 // i<=1? | ||
2982 | | // Now 1 < (unsigned)i <= 0x80000000. | ||
2983 | |1: // Handle leading zeros. | ||
2984 | | test eax, 1; jnz >2 | ||
2985 | | mulsd xmm0, xmm0 | ||
2986 | | shr eax, 1 | ||
2987 | | jmp <1 | ||
2988 | |2: | ||
2989 | | shr eax, 1; jz >5 | ||
2990 | | movaps xmm1, xmm0 | ||
2991 | |3: // Handle trailing bits. | ||
2992 | | mulsd xmm0, xmm0 | ||
2993 | | shr eax, 1; jz >4 | ||
2994 | | jnc <3 | ||
2995 | | mulsd xmm1, xmm0 | ||
2996 | | jmp <3 | ||
2997 | |4: | ||
2998 | | mulsd xmm0, xmm1 | ||
2999 | |5: | ||
3000 | | ret | ||
3001 | |6: | ||
3002 | | je <5 // x^1 ==> x | ||
3003 | | jb >7 | ||
3004 | | push RDa | ||
3005 | | sseconst_1 xmm1, RDa | ||
3006 | | divsd xmm1, xmm0 | ||
3007 | | pop RDa | ||
3008 | | movaps xmm0, xmm1 | ||
3009 | | neg eax | ||
3010 | | cmp eax, 1; je <5 // x^-1 ==> 1/x | ||
3011 | | jmp <1 // x^-i ==> (1/x)^i | ||
3012 | |7: | ||
3013 | | sseconst_1 xmm0, RDa | ||
3014 | | ret | ||
3015 | | | ||
3016 | |8: // FP/FP power function x^y. | ||
3017 | |.if X64 | ||
3018 | | movd rax, xmm1; shl rax, 1 | ||
3019 | | ror rax, 32; cmp rax, 0xffe00000; je >2 // x^+-Inf? | ||
3020 | | movd rax, xmm0; shl rax, 1; je >4 // +-0^y? | ||
3021 | | ror rax, 32; cmp rax, 0xffe00000; je >5 // +-Inf^y? | ||
3022 | | .if X64WIN | ||
3023 | | movsd qword [rsp+16], xmm1 // Use scratch area. | ||
3024 | | movsd qword [rsp+8], xmm0 | ||
3025 | | fld qword [rsp+16] | ||
3026 | | fld qword [rsp+8] | ||
3027 | | .else | ||
3028 | | movsd qword [rsp-16], xmm1 // Use red zone. | ||
3029 | | movsd qword [rsp-8], xmm0 | ||
3030 | | fld qword [rsp-16] | ||
3031 | | fld qword [rsp-8] | ||
3032 | | .endif | ||
3033 | |.else | ||
3034 | | movsd qword [esp+12], xmm1 // Needs 16 byte scratch area. | ||
3035 | | movsd qword [esp+4], xmm0 | ||
3036 | | cmp dword [esp+12], 0; jne >1 | ||
3037 | | mov eax, [esp+16]; shl eax, 1 | ||
3038 | | cmp eax, 0xffe00000; je >2 // x^+-Inf? | ||
3039 | |1: | ||
3040 | | cmp dword [esp+4], 0; jne >1 | ||
3041 | | mov eax, [esp+8]; shl eax, 1; je >4 // +-0^y? | ||
3042 | | cmp eax, 0xffe00000; je >5 // +-Inf^y? | ||
3043 | |1: | ||
3044 | | fld qword [esp+12] | ||
3045 | | fld qword [esp+4] | ||
3046 | |.endif | ||
3047 | | fyl2x // y*log2(x) | ||
3048 | | fdup; frndint; fsub st1, st0; fxch // Split into frac/int part. | ||
3049 | | f2xm1; fld1; faddp st1; fscale; fpop1 // ==> (2^frac-1 +1) << int | ||
3050 | |.if X64WIN | ||
3051 | | fstp qword [rsp+8] // Use scratch area. | ||
3052 | | movsd xmm0, qword [rsp+8] | ||
3053 | |.elif X64 | ||
3054 | | fstp qword [rsp-8] // Use red zone. | ||
3055 | | movsd xmm0, qword [rsp-8] | ||
3056 | |.else | ||
3057 | | fstp qword [esp+4] // Needs 8 byte scratch area. | ||
3058 | | movsd xmm0, qword [esp+4] | ||
3059 | |.endif | ||
3060 | | ret | ||
3061 | | | ||
3062 | |9: // Handle x^NaN. | ||
3063 | | sseconst_1 xmm2, RDa | ||
3064 | | ucomisd xmm0, xmm2; je >1 // 1^NaN ==> 1 | ||
3065 | | movaps xmm0, xmm1 // x^NaN ==> NaN | ||
3066 | |1: | ||
3067 | | ret | ||
3068 | | | ||
3069 | |2: // Handle x^+-Inf. | ||
3070 | | sseconst_abs xmm2, RDa | ||
3071 | | andpd xmm0, xmm2 // |x| | ||
3072 | | sseconst_1 xmm2, RDa | ||
3073 | | ucomisd xmm0, xmm2; je <1 // +-1^+-Inf ==> 1 | ||
3074 | | movmskpd eax, xmm1 | ||
3075 | | xorps xmm0, xmm0 | ||
3076 | | mov ah, al; setc al; xor al, ah; jne <1 // |x|<>1, x^+-Inf ==> +Inf/0 | ||
3077 | |3: | ||
3078 | | sseconst_hi xmm0, RDa, 7ff00000 // +Inf | ||
3079 | | ret | ||
3080 | | | ||
3081 | |4: // Handle +-0^y. | ||
3082 | | movmskpd eax, xmm1; test eax, eax; jnz <3 // y < 0, +-0^y ==> +Inf | ||
3083 | | xorps xmm0, xmm0 // y >= 0, +-0^y ==> 0 | ||
3084 | | ret | ||
3085 | | | ||
3086 | |5: // Handle +-Inf^y. | ||
3087 | | movmskpd eax, xmm1; test eax, eax; jz <3 // y >= 0, +-Inf^y ==> +Inf | ||
3088 | | xorps xmm0, xmm0 // y < 0, +-Inf^y ==> 0 | ||
2961 | | ret | 3089 | | ret |
2962 | | | 3090 | | |
2963 | |// Callable from C: double lj_vm_foldfpm(double x, int fpm) | 3091 | |// Callable from C: double lj_vm_foldfpm(double x, int fpm) |
2964 | |// Computes fpm(x) for extended math functions. ORDER FPM. | 3092 | |// Computes fpm(x) for extended math functions. ORDER FPM. |
2965 | |->vm_foldfpm: | 3093 | |->vm_foldfpm: |
2966 | if (sse) { | 3094 | if (sse) { |
2967 | |.if X64WIN | ||
2968 | | .define fpmop, CARG2d | ||
2969 | |.elif X64 | ||
2970 | | .define fpmop, CARG1d | ||
2971 | |.else | ||
2972 | | .define fpmop, eax | ||
2973 | | mov fpmop, [esp+12] | ||
2974 | | movsd xmm0, qword [esp+4] | ||
2975 | |.endif | ||
2976 | |.if X64 | 3095 | |.if X64 |
3096 | | | ||
3097 | | .if X64WIN | ||
3098 | | .define fpmop, CARG2d | ||
3099 | | .else | ||
3100 | | .define fpmop, CARG1d | ||
3101 | | .endif | ||
2977 | | cmp fpmop, 1; jb ->vm_floor; je ->vm_ceil | 3102 | | cmp fpmop, 1; jb ->vm_floor; je ->vm_ceil |
2978 | | cmp fpmop, 3; jb ->vm_trunc; ja >2 | 3103 | | cmp fpmop, 3; jb ->vm_trunc; ja >2 |
2979 | | sqrtsd xmm0, xmm0; ret | 3104 | | sqrtsd xmm0, xmm0; ret |
2980 | |.else | 3105 | |2: |
3106 | | .if X64WIN | ||
3107 | | movsd qword [rsp+8], xmm0 // Use scratch area. | ||
3108 | | fld qword [rsp+8] | ||
3109 | | .else | ||
3110 | | movsd qword [rsp-8], xmm0 // Use red zone. | ||
3111 | | fld qword [rsp-8] | ||
3112 | | .endif | ||
3113 | | cmp fpmop, 5; ja >2 | ||
3114 | | .if X64WIN; pop rax; .endif | ||
3115 | | je >1 | ||
3116 | | call ->vm_exp | ||
3117 | | .if X64WIN; push rax; .endif | ||
3118 | | jmp >7 | ||
3119 | |1: | ||
3120 | | call ->vm_exp2 | ||
3121 | | .if X64WIN; push rax; .endif | ||
3122 | | jmp >7 | ||
3123 | |2: ; cmp fpmop, 7; je >1; ja >2 | ||
3124 | | fldln2; fxch; fyl2x; jmp >7 | ||
3125 | |1: ; fld1; fxch; fyl2x; jmp >7 | ||
3126 | |2: ; cmp fpmop, 9; je >1; ja >2 | ||
3127 | | fldlg2; fxch; fyl2x; jmp >7 | ||
3128 | |1: ; fsin; jmp >7 | ||
3129 | |2: ; cmp fpmop, 11; je >1; ja >9 | ||
3130 | | fcos; jmp >7 | ||
3131 | |1: ; fptan; fpop | ||
3132 | |7: | ||
3133 | | .if X64WIN | ||
3134 | | fstp qword [rsp+8] // Use scratch area. | ||
3135 | | movsd xmm0, qword [rsp+8] | ||
3136 | | .else | ||
3137 | | fstp qword [rsp-8] // Use red zone. | ||
3138 | | movsd xmm0, qword [rsp-8] | ||
3139 | | .endif | ||
3140 | | ret | ||
3141 | | | ||
3142 | |.else // x86 calling convention. | ||
3143 | | | ||
3144 | | .define fpmop, eax | ||
3145 | | mov fpmop, [esp+12] | ||
3146 | | movsd xmm0, qword [esp+4] | ||
2981 | | cmp fpmop, 1; je >1; ja >2 | 3147 | | cmp fpmop, 1; je >1; ja >2 |
2982 | | call ->vm_floor; jmp >7 | 3148 | | call ->vm_floor; jmp >7 |
2983 | |1: ; call ->vm_ceil; jmp >7 | 3149 | |1: ; call ->vm_ceil; jmp >7 |
@@ -2989,27 +3155,36 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
2989 | | movsd qword [esp+4], xmm0 // Overwrite callee-owned args. | 3155 | | movsd qword [esp+4], xmm0 // Overwrite callee-owned args. |
2990 | | fld qword [esp+4] | 3156 | | fld qword [esp+4] |
2991 | | ret | 3157 | | ret |
3158 | |2: ; fld qword [esp+4] | ||
3159 | | cmp fpmop, 5; jb ->vm_exp; je ->vm_exp2 | ||
3160 | |2: ; cmp fpmop, 7; je >1; ja >2 | ||
3161 | | fldln2; fxch; fyl2x; ret | ||
3162 | |1: ; fld1; fxch; fyl2x; ret | ||
3163 | |2: ; cmp fpmop, 9; je >1; ja >2 | ||
3164 | | fldlg2; fxch; fyl2x; ret | ||
3165 | |1: ; fsin; ret | ||
3166 | |2: ; cmp fpmop, 11; je >1; ja >9 | ||
3167 | | fcos; ret | ||
3168 | |1: ; fptan; fpop; ret | ||
3169 | | | ||
2992 | |.endif | 3170 | |.endif |
2993 | |2: | ||
2994 | | fld qword [esp+4] | ||
2995 | } else { | 3171 | } else { |
2996 | | mov fpmop, [esp+12] | 3172 | | mov fpmop, [esp+12] |
2997 | | fld qword [esp+4] | 3173 | | fld qword [esp+4] |
2998 | | cmp fpmop, 1; jb ->vm_floor; je ->vm_ceil | 3174 | | cmp fpmop, 1; jb ->vm_floor; je ->vm_ceil |
2999 | | cmp fpmop, 3; jb ->vm_trunc; ja >2 | 3175 | | cmp fpmop, 3; jb ->vm_trunc; ja >2 |
3000 | | fsqrt; ret | 3176 | | fsqrt; ret |
3001 | |2: | 3177 | |2: ; cmp fpmop, 5; jb ->vm_exp; je ->vm_exp2 |
3178 | | cmp fpmop, 7; je >1; ja >2 | ||
3179 | | fldln2; fxch; fyl2x; ret | ||
3180 | |1: ; fld1; fxch; fyl2x; ret | ||
3181 | |2: ; cmp fpmop, 9; je >1; ja >2 | ||
3182 | | fldlg2; fxch; fyl2x; ret | ||
3183 | |1: ; fsin; ret | ||
3184 | |2: ; cmp fpmop, 11; je >1; ja >9 | ||
3185 | | fcos; ret | ||
3186 | |1: ; fptan; fpop; ret | ||
3002 | } | 3187 | } |
3003 | | cmp fpmop, 5; jb ->vm_exp; je ->vm_exp2 | ||
3004 | | cmp fpmop, 7; je >1; ja >2 | ||
3005 | | fldln2; fxch; fyl2x; ret | ||
3006 | |1: ; fld1; fxch; fyl2x; ret | ||
3007 | |2: ; cmp fpmop, 9; je >1; ja >2 | ||
3008 | | fldlg2; fxch; fyl2x; ret | ||
3009 | |1: ; fsin; ret | ||
3010 | |2: ; cmp fpmop, 11; je >1; ja >9 | ||
3011 | | fcos; ret | ||
3012 | |1: ; fptan; fpop; ret | ||
3013 | |9: ; int3 // Bad fpm. | 3188 | |9: ; int3 // Bad fpm. |
3014 | | | 3189 | | |
3015 | |// Callable from C: double lj_vm_foldarith(double x, double y, int op) | 3190 | |// Callable from C: double lj_vm_foldarith(double x, double y, int op) |
@@ -3017,72 +3192,87 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse) | |||
3017 | |// and basic math functions. ORDER ARITH | 3192 | |// and basic math functions. ORDER ARITH |
3018 | |->vm_foldarith: | 3193 | |->vm_foldarith: |
3019 | if (sse) { | 3194 | if (sse) { |
3020 | |.macro retxmm0; .if X64; ret; .else; jmp >7; .endif; .endmacro | 3195 | |.if X64 |
3021 | |.macro retst0; .if X64; jmp >7; .else; ret; .endif; .endmacro | ||
3022 | | | 3196 | | |
3197 | | .if X64WIN | ||
3198 | | .define foldop, CARG3d | ||
3199 | | .else | ||
3200 | | .define foldop, CARG1d | ||
3201 | | .endif | ||
3202 | | cmp foldop, 1; je >1; ja >2 | ||
3203 | | addsd xmm0, xmm1; ret | ||
3204 | |1: ; subsd xmm0, xmm1; ret | ||
3205 | |2: ; cmp foldop, 3; je >1; ja >2 | ||
3206 | | mulsd xmm0, xmm1; ret | ||
3207 | |1: ; divsd xmm0, xmm1; ret | ||
3208 | |2: ; cmp foldop, 5; jb ->vm_mod; je ->vm_pow | ||
3209 | | cmp foldop, 7; je >1; ja >2 | ||
3210 | | sseconst_sign xmm1, RDa; xorps xmm0, xmm1; ret | ||
3211 | |1: ; sseconst_abs xmm1, RDa; andps xmm0, xmm1; ret | ||
3212 | |2: ; cmp foldop, 9; ja >2 | ||
3023 | |.if X64WIN | 3213 | |.if X64WIN |
3024 | | .define foldop, CARG3d | 3214 | | movsd qword [rsp+8], xmm0 // Use scratch area. |
3025 | |.elif X64 | 3215 | | movsd qword [rsp+16], xmm1 |
3026 | | .define foldop, CARG1d | 3216 | | fld qword [rsp+8] |
3217 | | fld qword [rsp+16] | ||
3027 | |.else | 3218 | |.else |
3219 | | movsd qword [rsp-8], xmm0 // Use red zone. | ||
3220 | | movsd qword [rsp-16], xmm1 | ||
3221 | | fld qword [rsp-8] | ||
3222 | | fld qword [rsp-16] | ||
3223 | |.endif | ||
3224 | | je >1 | ||
3225 | | fpatan | ||
3226 | |7: | ||
3227 | |.if X64WIN | ||
3228 | | fstp qword [rsp+8] // Use scratch area. | ||
3229 | | movsd xmm0, qword [rsp+8] | ||
3230 | |.else | ||
3231 | | fstp qword [rsp-8] // Use red zone. | ||
3232 | | movsd xmm0, qword [rsp-8] | ||
3233 | |.endif | ||
3234 | | ret | ||
3235 | |1: ; fxch; fscale; fpop1; jmp <7 | ||
3236 | |2: ; cmp foldop, 11; je >1; ja >9 | ||
3237 | | minsd xmm0, xmm1; ret | ||
3238 | |1: ; maxsd xmm0, xmm1; ret | ||
3239 | |9: ; int3 // Bad op. | ||
3240 | | | ||
3241 | |.else // x86 calling convention. | ||
3242 | | | ||
3028 | | .define foldop, eax | 3243 | | .define foldop, eax |
3029 | | mov foldop, [esp+20] | 3244 | | mov foldop, [esp+20] |
3030 | | movsd xmm0, qword [esp+4] | 3245 | | movsd xmm0, qword [esp+4] |
3031 | | movsd xmm1, qword [esp+12] | 3246 | | movsd xmm1, qword [esp+12] |
3032 | |.endif | ||
3033 | | cmp foldop, 1; je >1; ja >2 | 3247 | | cmp foldop, 1; je >1; ja >2 |
3034 | | addsd xmm0, xmm1; retxmm0 | 3248 | | addsd xmm0, xmm1 |
3035 | |1: ; subsd xmm0, xmm1; retxmm0 | 3249 | |7: |
3250 | | movsd qword [esp+4], xmm0 // Overwrite callee-owned args. | ||
3251 | | fld qword [esp+4] | ||
3252 | | ret | ||
3253 | |1: ; subsd xmm0, xmm1; jmp <7 | ||
3036 | |2: ; cmp foldop, 3; je >1; ja >2 | 3254 | |2: ; cmp foldop, 3; je >1; ja >2 |
3037 | | mulsd xmm0, xmm1; retxmm0 | 3255 | | mulsd xmm0, xmm1; jmp <7 |
3038 | |1: ; divsd xmm0, xmm1; retxmm0 | 3256 | |1: ; divsd xmm0, xmm1; jmp <7 |
3039 | |2: ; cmp foldop, 5 | 3257 | |2: ; cmp foldop, 5 |
3040 | |.if X64 | ||
3041 | | jb ->vm_mod; je ->vm_pow // NYI: broken without SSE vm_pow. | ||
3042 | |.else | ||
3043 | | je >1; ja >2 | 3258 | | je >1; ja >2 |
3044 | | call ->vm_mod; retxmm0 | 3259 | | call ->vm_mod; jmp <7 |
3045 | |1: ; fld qword [esp+4]; fld qword [esp+12]; jmp ->vm_pow // NYI | 3260 | |1: ; pop edx; call ->vm_pow; push edx; jmp <7 // Writes to scratch area. |
3046 | |2: | 3261 | |2: ; cmp foldop, 7; je >1; ja >2 |
3047 | |.endif | 3262 | | sseconst_sign xmm1, RDa; xorps xmm0, xmm1; jmp <7 |
3048 | | cmp foldop, 7; je >1; ja >2 | 3263 | |1: ; sseconst_abs xmm1, RDa; andps xmm0, xmm1; jmp <7 |
3049 | | sseconst_sign xmm1, RDa; xorps xmm0, xmm1; retxmm0 | ||
3050 | |1: | ||
3051 | | sseconst_abs xmm1, RDa; andps xmm0, xmm1; retxmm0 | ||
3052 | |2: ; cmp foldop, 9; ja >2 | 3264 | |2: ; cmp foldop, 9; ja >2 |
3053 | |.if X64WIN | ||
3054 | | movsd qword [esp+8], xmm0 // Use scratch area. | ||
3055 | | movsd qword [esp+16], xmm1 | ||
3056 | | fld qword [esp+8] | ||
3057 | | fld qword [esp+16] | ||
3058 | |.elif X64 | ||
3059 | | movsd qword [esp-8], xmm0 // Use red zone. | ||
3060 | | movsd qword [esp-16], xmm1 | ||
3061 | | fld qword [esp-8] | ||
3062 | | fld qword [esp-16] | ||
3063 | |.else | ||
3064 | | fld qword [esp+4] // Reload from stack | 3265 | | fld qword [esp+4] // Reload from stack |
3065 | | fld qword [esp+12] | 3266 | | fld qword [esp+12] |
3066 | |.endif | ||
3067 | | je >1 | 3267 | | je >1 |
3068 | | fpatan; retst0 | 3268 | | fpatan; ret |
3069 | |1: ; fxch; fscale; fpop1; retst0 | 3269 | |1: ; fxch; fscale; fpop1; ret |
3070 | |2: ; cmp foldop, 11; je >1; ja >9 | 3270 | |2: ; cmp foldop, 11; je >1; ja >9 |
3071 | | minsd xmm0, xmm1; retxmm0 | 3271 | | minsd xmm0, xmm1; jmp <7 |
3072 | |1: ; maxsd xmm0, xmm1; retxmm0 | 3272 | |1: ; maxsd xmm0, xmm1; jmp <7 |
3073 | |9: ; int3 // Bad op. | 3273 | |9: ; int3 // Bad op. |
3074 | |7: // Move return value depending on calling convention. | 3274 | | |
3075 | |.if X64WIN | ||
3076 | | fstp qword [esp+8] // Use scratch area. | ||
3077 | | movsd xmm0, qword [esp+8] | ||
3078 | |.elif X64 | ||
3079 | | fstp qword [esp-8] // Use red zone. | ||
3080 | | movsd xmm0, qword [esp-8] | ||
3081 | |.else | ||
3082 | | movsd qword [esp+4], xmm0 // Overwrite callee-owned args. | ||
3083 | | fld qword [esp+4] | ||
3084 | |.endif | 3275 | |.endif |
3085 | | ret | ||
3086 | } else { | 3276 | } else { |
3087 | | mov eax, [esp+20] | 3277 | | mov eax, [esp+20] |
3088 | | fld qword [esp+4] | 3278 | | fld qword [esp+4] |
@@ -3483,17 +3673,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov, int sse) | |||
3483 | | jmp ->BC_MODVN_Z // Avoid 3 copies. It's slow anyway. | 3673 | | jmp ->BC_MODVN_Z // Avoid 3 copies. It's slow anyway. |
3484 | break; | 3674 | break; |
3485 | case BC_POW: | 3675 | case BC_POW: |
3486 | if (sse) { | 3676 | | ins_arithpre fld, movsd, xmm1 |
3487 | sse = 0; /* NYI: temporary workaround. */ | 3677 | | call ->vm_pow |
3488 | | ins_arithpre fld, movsd, xmm1 | 3678 | | ins_arithpost |
3489 | | call ->vm_pow | ||
3490 | | ins_arithpost | ||
3491 | sse = 1; | ||
3492 | } else { | ||
3493 | | ins_arithpre fld, movsd, xmm1 | ||
3494 | | call ->vm_pow | ||
3495 | | ins_arithpost | ||
3496 | } | ||
3497 | | ins_next | 3679 | | ins_next |
3498 | break; | 3680 | break; |
3499 | 3681 | ||