diff options
| author | Mike Pall <mike> | 2010-09-08 00:09:36 +0200 |
|---|---|---|
| committer | Mike Pall <mike> | 2010-09-08 00:09:36 +0200 |
| commit | e9e7df5bfe9205eb3559a81f5af083cd8ae4aa63 (patch) | |
| tree | 9cbd8789fde25317d95dadb8c35769d26c2ebb89 /src | |
| parent | db735e051951ceed2cd9f097dd7b09879b8d8790 (diff) | |
| download | luajit-e9e7df5bfe9205eb3559a81f5af083cd8ae4aa63.tar.gz luajit-e9e7df5bfe9205eb3559a81f5af083cd8ae4aa63.tar.bz2 luajit-e9e7df5bfe9205eb3559a81f5af083cd8ae4aa63.zip | |
PPC: Add heavily optimized floor/ceil/trunc functions.
Diffstat (limited to 'src')
| -rw-r--r-- | src/buildvm_ppc.dasc | 72 |
1 files changed, 69 insertions, 3 deletions
diff --git a/src/buildvm_ppc.dasc b/src/buildvm_ppc.dasc index 6b71ec84..dd99c077 100644 --- a/src/buildvm_ppc.dasc +++ b/src/buildvm_ppc.dasc | |||
| @@ -887,10 +887,75 @@ static void build_subroutines(BuildCtx *ctx) | |||
| 887 | | | 887 | | |
| 888 | |// FP value rounding. Called by math.floor/math.ceil fast functions | 888 | |// FP value rounding. Called by math.floor/math.ceil fast functions |
| 889 | |// and from JIT code. | 889 | |// and from JIT code. |
| 890 | | | 890 | |// |
| 891 | |// This can be inlined if the CPU has the frin/friz/frip/frim instructions. | ||
| 892 | |// The alternative hard-float approaches have a deep dependency chain. | ||
| 893 | |// The resulting latency is at least 3x-7x the double-precision FP latency | ||
| 894 | |// (e500v2: 6cy, e600: 5cy, Cell: 10cy) or around 20-70 cycles. | ||
| 895 | |// | ||
| 896 | |// The soft-float approach is tedious, but much faster (e500v2: ~11cy/~6cy). | ||
| 897 | |// However it relies on a fast way to transfer the FP value to GPRs | ||
| 898 | |// (e500v2: 0cy for lo-word, 1cy for hi-word). | ||
| 899 | |// | ||
| 891 | |.macro vm_round, name, mode | 900 | |.macro vm_round, name, mode |
| 892 | |->name: | 901 | | // Used temporaries: TMP0, TMP1, TMP2, TMP3. |
| 893 | | NYI | 902 | |->name: // Input: CARG2, output: CRET2 |
| 903 | | evmergehi CARG1, CARG2, CARG2 | ||
| 904 | |->name.._hilo: | ||
| 905 | | // Input: CARG1 (hi), CARG2 (hi, lo), output: CRET2 | ||
| 906 | | rlwinm TMP2, CARG1, 12, 21, 31 | ||
| 907 | | addic. TMP2, TMP2, -1023 // exp = exponent(x) - 1023 | ||
| 908 | | li TMP1, -1 | ||
| 909 | | cmplwi cr1, TMP2, 51 // 0 <= exp <= 51? (unsigned compare, so exp < 0 fails it, too) | ||
| 910 | | subfic TMP0, TMP2, 52 | ||
| 911 | | bgt cr1, >1 | ||
| 912 | | lus TMP3, 0xfff0 | ||
| 913 | | slw TMP0, TMP1, TMP0 // lomask = -1 << (52-exp), which is 0 for shift counts >= 32 (exp <= 20) | ||
| 914 | | sraw TMP1, TMP3, TMP2 // himask = (int32_t)0xfff00000 >> exp, which is -1 for exp >= 32 | ||
| 915 | |.if mode == 2 // trunc(x): | ||
| 916 | | evmergelo TMP0, TMP1, TMP0 | ||
| 917 | | evand CRET2, CARG2, TMP0 // hi &= himask, lo &= lomask | ||
| 918 | |.else | ||
| 919 | | andc TMP2, CARG2, TMP0 | ||
| 920 | | andc TMP3, CARG1, TMP1 | ||
| 921 | | or TMP2, TMP2, TMP3 // ztest = (hi&~himask) | (lo&~lomask), non-zero iff x has fraction bits | ||
| 922 | | srawi TMP3, CARG1, 31 // signmask = (int32_t)hi >> 31 | ||
| 923 | |.if mode == 0 // floor(x): | ||
| 924 | | and. TMP2, TMP2, TMP3 // iszero = ((ztest & signmask) == 0), i.e. adjust only negative non-integers | ||
| 925 | |.else // ceil(x): | ||
| 926 | | andc. TMP2, TMP2, TMP3 // iszero = ((ztest & ~signmask) == 0), i.e. adjust only positive non-integers | ||
| 927 | |.endif | ||
| 928 | | and CARG2, CARG2, TMP0 // lo &= lomask | ||
| 929 | | and CARG1, CARG1, TMP1 // hi &= himask | ||
| 930 | | subc TMP0, CARG2, TMP0 | ||
| 931 | | iseleq TMP0, CARG2, TMP0 // lo = iszero ? lo : lo-lomask | ||
| 932 | | sube TMP1, CARG1, TMP1 | ||
| 933 | | iseleq TMP1, CARG1, TMP1 // hi = iszero ? hi : hi-himask-1+carry (carry from the subc above) | ||
| 934 | | evmergelo CRET2, TMP1, TMP0 | ||
| 935 | |.endif | ||
| 936 | | blr | ||
| 937 | |1: | ||
| 938 | | bgtlr // Already done if >=2^52, +-inf or nan. Falls through only for exp < 0 (|x| < 1). | ||
| 939 | |.if mode == 2 // trunc(x): | ||
| 940 | | rlwinm TMP1, CARG1, 0, 0, 0 // hi = sign(x) | ||
| 941 | | li TMP0, 0 | ||
| 942 | | evmergelo CRET2, TMP1, TMP0 | ||
| 943 | |.else | ||
| 944 | | rlwinm TMP2, CARG1, 0, 1, 31 | ||
| 945 | | srawi TMP0, CARG1, 31 // signmask = (int32_t)hi >> 31 | ||
| 946 | | or TMP2, TMP2, CARG2 // ztest = abs(hi) | lo | ||
| 947 | | lus TMP1, 0x3ff0 | ||
| 948 | |.if mode == 0 // floor(x): | ||
| 949 | | and. TMP2, TMP2, TMP0 // iszero = ((ztest & signmask) == 0) | ||
| 950 | |.else // ceil(x): | ||
| 951 | | andc. TMP2, TMP2, TMP0 // iszero = ((ztest & ~signmask) == 0) | ||
| 952 | |.endif | ||
| 953 | | li TMP0, 0 | ||
| 954 | | iseleq TMP1, r0, TMP1 | ||
| 955 | | rlwimi CARG1, TMP1, 0, 1, 31 // hi = sign(x) | (iszero ? 0.0 : 1.0) | ||
| 956 | | evmergelo CRET2, CARG1, TMP0 | ||
| 957 | |.endif | ||
| 958 | | blr | ||
| 894 | |.endmacro | 959 | |.endmacro |
| 895 | | | 960 | | |
| 896 | | vm_round vm_floor, 0 | 961 | | vm_round vm_floor, 0 |
| @@ -899,6 +964,7 @@ static void build_subroutines(BuildCtx *ctx) | |||
| 899 | | vm_round vm_trunc, 2 | 964 | | vm_round vm_trunc, 2 |
| 900 | #else | 965 | #else |
| 901 | |->vm_trunc: | 966 | |->vm_trunc: |
| 967 | |->vm_trunc_hilo: | ||
| 902 | #endif | 968 | #endif |
| 903 | | | 969 | | |
| 904 | |->vm_powi: | 970 | |->vm_powi: |
