aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author: Mike Pall <mike> 2010-09-08 00:09:36 +0200
committer: Mike Pall <mike> 2010-09-08 00:09:36 +0200
commit: e9e7df5bfe9205eb3559a81f5af083cd8ae4aa63 (patch)
tree: 9cbd8789fde25317d95dadb8c35769d26c2ebb89 /src
parent: db735e051951ceed2cd9f097dd7b09879b8d8790 (diff)
download: luajit-e9e7df5bfe9205eb3559a81f5af083cd8ae4aa63.tar.gz
luajit-e9e7df5bfe9205eb3559a81f5af083cd8ae4aa63.tar.bz2
luajit-e9e7df5bfe9205eb3559a81f5af083cd8ae4aa63.zip
PPC: Add heavily optimized floor/ceil/trunc functions.
Diffstat (limited to 'src')
-rw-r--r-- src/buildvm_ppc.dasc 72
1 files changed, 69 insertions, 3 deletions
diff --git a/src/buildvm_ppc.dasc b/src/buildvm_ppc.dasc
index 6b71ec84..dd99c077 100644
--- a/src/buildvm_ppc.dasc
+++ b/src/buildvm_ppc.dasc
@@ -887,10 +887,75 @@ static void build_subroutines(BuildCtx *ctx)
887 | 887 |
888 |// FP value rounding. Called by math.floor/math.ceil fast functions 888 |// FP value rounding. Called by math.floor/math.ceil fast functions
889 |// and from JIT code. 889 |// and from JIT code.
890 | 890 |//
891 |// This can be inlined if the CPU has the frin/friz/frip/frim instructions.
892 |// The alternative hard-float approaches have a deep dependency chain.
893 |// The resulting latency is at least 3x-7x the double-precision FP latency
894 |// (e500v2: 6cy, e600: 5cy, Cell: 10cy) or around 20-70 cycles.
895 |//
896 |// The soft-float approach is tedious, but much faster (e500v2: ~11cy/~6cy).
897 |// However it relies on a fast way to transfer the FP value to GPRs
898 |// (e500v2: 0cy for lo-word, 1cy for hi-word).
899 |//
891 |.macro vm_round, name, mode 900 |.macro vm_round, name, mode // mode: 0 = floor, 2 = trunc, anything else = ceil.
892 |->name: 901 | // Used temporaries: TMP0, TMP1, TMP2, TMP3.
893 | NYI 902 |->name: // Input: CARG2, output: CRET2
903 | evmergehi CARG1, CARG2, CARG2 // Copy the hi word of x into CARG1 (see the _hilo inputs below).
904 |->name.._hilo:
905 | // Input: CARG1 (hi), CARG2 (hi, lo), output: CRET2
906 | rlwinm TMP2, CARG1, 12, 21, 31 // TMP2 = 11-bit biased exponent field of hi
907 | addic. TMP2, TMP2, -1023 // exp = exponent(x) - 1023
908 | li TMP1, -1 // All-ones, shifted below to form lomask
909 | cmplwi cr1, TMP2, 51 // 0 <= exp <= 51? (unsigned compare)
910 | subfic TMP0, TMP2, 52 // TMP0 = 52-exp, the shift count for lomask
911 | bgt cr1, >1 // No: exp is negative or above 51, handled at label 1.
912 | lus TMP3, 0xfff0 // TMP3 = 0xfff00000 (see himask below)
913 | slw TMP0, TMP1, TMP0 // lomask = -1 << (52-exp)
914 | sraw TMP1, TMP3, TMP2 // himask = (int32_t)0xfff00000 >> exp
915 |.if mode == 2 // trunc(x):
916 | evmergelo TMP0, TMP1, TMP0 // Pack himask (hi) and lomask (lo) into TMP0
917 | evand CRET2, CARG2, TMP0 // hi &= himask, lo &= lomask
918 |.else
919 | andc TMP2, CARG2, TMP0
920 | andc TMP3, CARG1, TMP1
921 | or TMP2, TMP2, TMP3 // ztest = (hi&~himask) | (lo&~lomask)
922 | srawi TMP3, CARG1, 31 // signmask = (int32_t)hi >> 31
923 |.if mode == 0 // floor(x):
924 | and. TMP2, TMP2, TMP3 // iszero = ((ztest & signmask) == 0)
925 |.else // ceil(x):
926 | andc. TMP2, TMP2, TMP3 // iszero = ((ztest & ~signmask) == 0)
927 |.endif
928 | and CARG2, CARG2, TMP0 // lo &= lomask
929 | and CARG1, CARG1, TMP1 // hi &= himask
930 | subc TMP0, CARG2, TMP0
931 | iseleq TMP0, CARG2, TMP0 // lo = iszero ? lo : lo-lomask
932 | sube TMP1, CARG1, TMP1
933 | iseleq TMP1, CARG1, TMP1 // hi = iszero ? hi : hi-himask+carry
934 | evmergelo CRET2, TMP1, TMP0 // Recombine hi and lo words into the result
935 |.endif
936 | blr
937 |1:
938 | bgtlr // Already done if >=2^52, +-inf or nan.
939 |.if mode == 2 // trunc(x):
940 | rlwinm TMP1, CARG1, 0, 0, 0 // hi = sign(x)
941 | li TMP0, 0 // lo = 0
942 | evmergelo CRET2, TMP1, TMP0 // Result is +-0
943 |.else
944 | rlwinm TMP2, CARG1, 0, 1, 31 // TMP2 = abs(hi), i.e. hi with the sign bit cleared
945 | srawi TMP0, CARG1, 31 // signmask = (int32_t)hi >> 31
946 | or TMP2, TMP2, CARG2 // ztest = abs(hi) | lo
947 | lus TMP1, 0x3ff0 // TMP1 = hi word of 1.0
948 |.if mode == 0 // floor(x):
949 | and. TMP2, TMP2, TMP0 // iszero = ((ztest & signmask) == 0)
950 |.else // ceil(x):
951 | andc. TMP2, TMP2, TMP0 // iszero = ((ztest & ~signmask) == 0)
952 |.endif
953 | li TMP0, 0 // lo = 0
954 | iseleq TMP1, r0, TMP1 // TMP1 = iszero ? 0 : hi word of 1.0
955 | rlwimi CARG1, TMP1, 0, 1, 31 // hi = sign(x) | (iszero ? 0.0 : 1.0)
956 | evmergelo CRET2, CARG1, TMP0 // Recombine hi and lo words into the result
957 |.endif
958 | blr
894 |.endmacro 959 |.endmacro
895 | 960 |
896 | vm_round vm_floor, 0 961 | vm_round vm_floor, 0
@@ -899,6 +964,7 @@ static void build_subroutines(BuildCtx *ctx)
899 | vm_round vm_trunc, 2 964 | vm_round vm_trunc, 2
900#else 965#else
901 |->vm_trunc: 966 |->vm_trunc:
967 |->vm_trunc_hilo:
902#endif 968#endif
903 | 969 |
904 |->vm_powi: 970 |->vm_powi: