aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author: Mike Pall <mike> 2012-10-15 16:53:03 +0200
committer: Mike Pall <mike> 2012-10-15 16:53:03 +0200
commit: 2621617a9292ea821a0630339e20e83e11858a5e (patch)
tree: b9fc47ee02e0afeb124466455ee07d27da32fc58 /src
parent: 894d2d6ef4bf50a7c355e49e4508de5d07edad2d (diff)
download: luajit-2621617a9292ea821a0630339e20e83e11858a5e.tar.gz
luajit-2621617a9292ea821a0630339e20e83e11858a5e.tar.bz2
luajit-2621617a9292ea821a0630339e20e83e11858a5e.zip
ARM: Drop hard-fp variants of floor/ceil/trunc.
Soft-fp variants are faster on a Cortex-A9. Duh.
Diffstat (limited to 'src')
-rw-r--r-- src/lj_asm_arm.h 24
-rw-r--r-- src/lj_vm.h 6
-rw-r--r-- src/vm_arm.dasc 115
3 files changed, 57 insertions, 88 deletions
diff --git a/src/lj_asm_arm.h b/src/lj_asm_arm.h
index a7465cb7..ef907fbe 100644
--- a/src/lj_asm_arm.h
+++ b/src/lj_asm_arm.h
@@ -1574,15 +1574,23 @@ static void asm_callid(ASMState *as, IRIns *ir, IRCallID id)
1574static void asm_callround(ASMState *as, IRIns *ir, int id) 1574static void asm_callround(ASMState *as, IRIns *ir, int id)
1575{ 1575{
1576 /* The modified regs must match with the *.dasc implementation. */ 1576 /* The modified regs must match with the *.dasc implementation. */
1577 RegSet drop = RID2RSET(RID_D0)|RID2RSET(RID_D1)|RID2RSET(RID_D2)| 1577 RegSet drop = RID2RSET(RID_R0)|RID2RSET(RID_R1)|RID2RSET(RID_R2)|
1578 RID2RSET(RID_R0)|RID2RSET(RID_R1); 1578 RID2RSET(RID_R3)|RID2RSET(RID_R12);
1579 if (ra_hasreg(ir->r)) rset_clear(drop, ir->r); 1579 RegSet of;
1580 Reg dest, src;
1580 ra_evictset(as, drop); 1581 ra_evictset(as, drop);
1581 ra_destreg(as, ir, RID_FPRET); 1582 dest = ra_dest(as, ir, RSET_FPR);
1582 emit_call(as, id == IRFPM_FLOOR ? (void *)lj_vm_floor_hf : 1583 emit_dnm(as, ARMI_VMOV_D_RR, RID_RETLO, RID_RETHI, (dest & 15));
1583 id == IRFPM_CEIL ? (void *)lj_vm_ceil_hf : 1584 emit_call(as, id == IRFPM_FLOOR ? (void *)lj_vm_floor_sf :
1584 (void *)lj_vm_trunc_hf); 1585 id == IRFPM_CEIL ? (void *)lj_vm_ceil_sf :
1585 ra_leftov(as, RID_D0, ir->op1); 1586 (void *)lj_vm_trunc_sf);
1587 /* Workaround to protect argument GPRs from being used for remat. */
1588 of = as->freeset;
1589 as->freeset &= ~RSET_RANGE(RID_R0, RID_R1+1);
1590 as->cost[RID_R0] = as->cost[RID_R1] = REGCOST(~0u, ASMREF_L);
1591 src = ra_alloc1(as, ir->op1, RSET_FPR); /* May alloc GPR to remat FPR. */
1592 as->freeset |= (of & RSET_RANGE(RID_R0, RID_R1+1));
1593 emit_dnm(as, ARMI_VMOV_RR_D, RID_R0, RID_R1, (src & 15));
1586} 1594}
1587#endif 1595#endif
1588 1596
diff --git a/src/lj_vm.h b/src/lj_vm.h
index 813335e3..b6b3e7e5 100644
--- a/src/lj_vm.h
+++ b/src/lj_vm.h
@@ -56,8 +56,8 @@ LJ_ASMF void lj_vm_exit_interp(void);
56LJ_ASMF double lj_vm_floor(double); 56LJ_ASMF double lj_vm_floor(double);
57LJ_ASMF double lj_vm_ceil(double); 57LJ_ASMF double lj_vm_ceil(double);
58#if LJ_TARGET_ARM 58#if LJ_TARGET_ARM
59LJ_ASMF double lj_vm_floor_hf(double); 59LJ_ASMF double lj_vm_floor_sf(double);
60LJ_ASMF double lj_vm_ceil_hf(double); 60LJ_ASMF double lj_vm_ceil_sf(double);
61#endif 61#endif
62#endif 62#endif
63#if defined(LUAJIT_NO_LOG2) || LJ_TARGET_X86ORX64 63#if defined(LUAJIT_NO_LOG2) || LJ_TARGET_X86ORX64
@@ -81,7 +81,7 @@ LJ_ASMF void lj_vm_powi_sse(void);
81#else 81#else
82LJ_ASMF double lj_vm_trunc(double); 82LJ_ASMF double lj_vm_trunc(double);
83#if LJ_TARGET_ARM 83#if LJ_TARGET_ARM
84LJ_ASMF double lj_vm_trunc_hf(double); 84LJ_ASMF double lj_vm_trunc_sf(double);
85#endif 85#endif
86#endif 86#endif
87LJ_ASMF double lj_vm_powi(double, int32_t); 87LJ_ASMF double lj_vm_powi(double, int32_t);
diff --git a/src/vm_arm.dasc b/src/vm_arm.dasc
index f00b3028..fb9363e4 100644
--- a/src/vm_arm.dasc
+++ b/src/vm_arm.dasc
@@ -1368,14 +1368,8 @@ static void build_subroutines(BuildCtx *ctx)
1368 | movmi CARG1, #0x80000000 1368 | movmi CARG1, #0x80000000
1369 | bmi <1 1369 | bmi <1
1370 |4: 1370 |4:
1371 |.if HFABI 1371 | bl ->vm_..func.._sf
1372 | vmov d0, CARG1, CARG2
1373 | bl ->vm_..func.._hf
1374 | b ->fff_resd
1375 |.else
1376 | bl ->vm_..func
1377 | b ->fff_restv 1372 | b ->fff_restv
1378 |.endif
1379 |.endmacro 1373 |.endmacro
1380 | 1374 |
1381 | math_round floor 1375 | math_round floor
@@ -2221,52 +2215,9 @@ static void build_subroutines(BuildCtx *ctx)
2221 |// 2215 |//
2222 |// double lj_vm_floor/ceil/trunc(double x); 2216 |// double lj_vm_floor/ceil/trunc(double x);
2223 |.macro vm_round, func, hf 2217 |.macro vm_round, func, hf
2224 |.if FPU 2218 |.if hf == 1
2225 |.if hf == 0
2226 | vmov d0, CARG1, CARG2
2227 | vldr d2, <8 // 2^52
2228 |.else
2229 | vldr d2, <8 // 2^52
2230 | vmov CARG1, CARG2, d0 2219 | vmov CARG1, CARG2, d0
2231 |.endif 2220 |.endif
2232 | vabs.f64 d1, d0
2233 | vcmp.f64 d1, d2 // |x| >= 2^52 or NaN?
2234 | vmrs
2235 |.if "func" == "trunc"
2236 | bxpl lr // Return argument unchanged.
2237 | vadd.f64 d0, d1, d2
2238 | vsub.f64 d0, d0, d2 // (|x| + 2^52) - 2^52
2239 | vldr d2, <9 // +1.0
2240 | vcmp.f64 d1, d0 // |x| < result: subtract +1.0
2241 | vmrs
2242 | vsubmi.f64 d0, d0, d2
2243 | cmp CARG2, #0
2244 | vnegmi.f64 d0, d0 // Merge sign bit back in.
2245 |.else
2246 | vadd.f64 d1, d1, d2
2247 | bxpl lr // Return argument unchanged.
2248 | cmp CARG2, #0
2249 | vsub.f64 d1, d1, d2 // (|x| + 2^52) - 2^52
2250 | vldr d2, <9 // +1.0
2251 | vnegmi.f64 d1, d1 // Merge sign bit back in.
2252 |.if "func" == "floor"
2253 | vcmp.f64 d0, d1 // x < result: subtract +1.0.
2254 | vmrs
2255 | vsubmi.f64 d0, d1, d2
2256 |.else
2257 | vcmp.f64 d1, d0 // x > result: add +1.0.
2258 | vmrs
2259 | vaddmi.f64 d0, d1, d2
2260 |.endif
2261 | vmovpl.f64 d0, d1
2262 |.endif
2263 |.if hf == 0
2264 | vmov CARG1, CARG2, d0
2265 |.endif
2266 | bx lr
2267 |
2268 |.else
2269 |
2270 | lsl CARG3, CARG2, #1 2221 | lsl CARG3, CARG2, #1
2271 | adds RB, CARG3, #0x00200000 2222 | adds RB, CARG3, #0x00200000
2272 | bpl >2 // |x| < 1? 2223 | bpl >2 // |x| < 1?
@@ -2286,6 +2237,9 @@ static void build_subroutines(BuildCtx *ctx)
2286 |.else 2237 |.else
2287 | bics CARG3, CARG3, CARG2, asr #31 // iszero = ((ztest & ~signmask) == 0) 2238 | bics CARG3, CARG3, CARG2, asr #31 // iszero = ((ztest & ~signmask) == 0)
2288 |.endif 2239 |.endif
2240 |.if hf == 1
2241 | vmoveq d0, CARG1, CARG2
2242 |.endif
2289 | bxeq lr // iszero: done. 2243 | bxeq lr // iszero: done.
2290 | mvn CARG4, #1 2244 | mvn CARG4, #1
2291 | cmp RB, #0 2245 | cmp RB, #0
@@ -2294,6 +2248,9 @@ static void build_subroutines(BuildCtx *ctx)
2294 | add RB, RB, #32 2248 | add RB, RB, #32
2295 | subs CARG1, CARG1, CARG4, lsl RB // lo = lo-lomask 2249 | subs CARG1, CARG1, CARG4, lsl RB // lo = lo-lomask
2296 | sbc CARG2, CARG2, CARG3 // hi = hi-himask+carry 2250 | sbc CARG2, CARG2, CARG3 // hi = hi-himask+carry
2251 |.if hf == 1
2252 | vmov d0, CARG1, CARG2
2253 |.endif
2297 | bx lr 2254 | bx lr
2298 | 2255 |
2299 |2: // |x| < 1: 2256 |2: // |x| < 1:
@@ -2308,45 +2265,41 @@ static void build_subroutines(BuildCtx *ctx)
2308 | and CARG2, CARG2, #0x80000000 2265 | and CARG2, CARG2, #0x80000000
2309 | ldrne CARG4, <9 // hi = sign(x) | (iszero ? 0.0 : 1.0) 2266 | ldrne CARG4, <9 // hi = sign(x) | (iszero ? 0.0 : 1.0)
2310 | orrne CARG2, CARG2, CARG4 2267 | orrne CARG2, CARG2, CARG4
2311 | bx lr 2268 |.if hf == 1
2269 | vmov d0, CARG1, CARG2
2312 |.endif 2270 |.endif
2271 | bx lr
2313 |.endmacro 2272 |.endmacro
2314 | 2273 |
2315 |.if FPU
2316 |.align 8
2317 |9:
2318 | .long 0, 0x3ff00000 // +1.0
2319 |8:
2320 | .long 0, 0x43300000 // 2^52
2321 |.else
2322 |9: 2274 |9:
2323 | .long 0x3ff00000 // hiword(+1.0) 2275 | .long 0x3ff00000 // hiword(+1.0)
2324 |.endif
2325 | 2276 |
2326 |->vm_floor: 2277 |->vm_floor:
2327 |.if not HFABI 2278 |.if HFABI
2328 | vm_round floor, 0
2329 |.endif
2330 |->vm_floor_hf:
2331 |.if FPU
2332 | vm_round floor, 1 2279 | vm_round floor, 1
2333 |.endif 2280 |.endif
2281 |->vm_floor_sf:
2282 | vm_round floor, 0
2334 | 2283 |
2335 |->vm_ceil: 2284 |->vm_ceil:
2336 |.if not HFABI 2285 |.if HFABI
2337 | vm_round ceil, 0
2338 |.endif
2339 |->vm_ceil_hf:
2340 |.if FPU
2341 | vm_round ceil, 1 2286 | vm_round ceil, 1
2342 |.endif 2287 |.endif
2288 |->vm_ceil_sf:
2289 | vm_round ceil, 0
2343 | 2290 |
2344 |->vm_trunc: 2291 |.macro vm_trunc, hf
2345 |.if JIT and not HFABI 2292 |.if JIT
2293 |.if hf == 1
2294 | vmov CARG1, CARG2, d0
2295 |.endif
2346 | lsl CARG3, CARG2, #1 2296 | lsl CARG3, CARG2, #1
2347 | adds RB, CARG3, #0x00200000 2297 | adds RB, CARG3, #0x00200000
2348 | andpl CARG2, CARG2, #0x80000000 // |x| < 1? hi = sign(x), lo = 0. 2298 | andpl CARG2, CARG2, #0x80000000 // |x| < 1? hi = sign(x), lo = 0.
2349 | movpl CARG1, #0 2299 | movpl CARG1, #0
2300 |.if hf == 1
2301 | vmovpl d0, CARG1, CARG2
2302 |.endif
2350 | bxpl lr 2303 | bxpl lr
2351 | mvn CARG4, #0x3cc 2304 | mvn CARG4, #0x3cc
2352 | subs RB, CARG4, RB, asr #21 // 2^0: RB = 51, 2^51: RB = 0. 2305 | subs RB, CARG4, RB, asr #21 // 2^0: RB = 51, 2^51: RB = 0.
@@ -2355,13 +2308,19 @@ static void build_subroutines(BuildCtx *ctx)
2355 | and CARG1, CARG1, CARG4, lsl RB // lo &= lomask 2308 | and CARG1, CARG1, CARG4, lsl RB // lo &= lomask
2356 | subs RB, RB, #32 2309 | subs RB, RB, #32
2357 | andpl CARG2, CARG2, CARG4, lsl RB // |x| <= 2^20: hi &= himask 2310 | andpl CARG2, CARG2, CARG4, lsl RB // |x| <= 2^20: hi &= himask
2311 |.if hf == 1
2312 | vmov d0, CARG1, CARG2
2313 |.endif
2358 | bx lr 2314 | bx lr
2359 |.endif 2315 |.endif
2316 |.endmacro
2360 | 2317 |
2361 |->vm_trunc_hf: 2318 |->vm_trunc:
2362 |.if JIT and FPU 2319 |.if HFABI
2363 | vm_round trunc, 1 2320 | vm_trunc 1
2364 |.endif 2321 |.endif
2322 |->vm_trunc_sf:
2323 | vm_trunc 0
2365 | 2324 |
2366 | // double lj_vm_mod(double dividend, double divisor); 2325 | // double lj_vm_mod(double dividend, double divisor);
2367 |->vm_mod: 2326 |->vm_mod:
@@ -2369,7 +2328,9 @@ static void build_subroutines(BuildCtx *ctx)
2369 | // Special calling convention. Also, RC (r11) is not preserved. 2328 | // Special calling convention. Also, RC (r11) is not preserved.
2370 | vdiv.f64 d0, d6, d7 2329 | vdiv.f64 d0, d6, d7
2371 | mov RC, lr 2330 | mov RC, lr
2372 | bl ->vm_floor_hf 2331 | vmov CARG1, CARG2, d0
2332 | bl ->vm_floor_sf
2333 | vmov d0, CARG1, CARG2
2373 | vmul.f64 d0, d0, d7 2334 | vmul.f64 d0, d0, d7
2374 | mov lr, RC 2335 | mov lr, RC
2375 | vsub.f64 d6, d6, d0 2336 | vsub.f64 d6, d6, d0
@@ -2377,7 +2338,7 @@ static void build_subroutines(BuildCtx *ctx)
2377 |.else 2338 |.else
2378 | push {r0, r1, r2, r3, r4, lr} 2339 | push {r0, r1, r2, r3, r4, lr}
2379 | bl extern __aeabi_ddiv 2340 | bl extern __aeabi_ddiv
2380 | bl ->vm_floor 2341 | bl ->vm_floor_sf
2381 | ldrd CARG34, [sp, #8] 2342 | ldrd CARG34, [sp, #8]
2382 | bl extern __aeabi_dmul 2343 | bl extern __aeabi_dmul
2383 | ldrd CARG34, [sp] 2344 | ldrd CARG34, [sp]