author     Mike Pall <mike>                      2012-10-15 16:53:03 +0200
committer  Mike Pall <mike>                      2012-10-15 16:53:03 +0200
commit     2621617a9292ea821a0630339e20e83e11858a5e (patch)
tree       b9fc47ee02e0afeb124466455ee07d27da32fc58 /src
parent     894d2d6ef4bf50a7c355e49e4508de5d07edad2d (diff)
download   luajit-2621617a9292ea821a0630339e20e83e11858a5e.tar.gz
           luajit-2621617a9292ea821a0630339e20e83e11858a5e.tar.bz2
           luajit-2621617a9292ea821a0630339e20e83e11858a5e.zip
ARM: Drop hard-fp variants of floor/ceil/trunc.
Soft-fp variants are faster on a Cortex-A9. Duh.
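The soft-fp variants operate on the raw IEEE-754 bit pattern of the double, passed in a GPR pair (CARG1/CARG2), instead of going through VFP instructions. A minimal C sketch of that integer-only rounding idea (an illustration of the technique only, not LuaJIT's code; the name trunc_bits is hypothetical), assuming IEEE-754 doubles:

#include <stdint.h>
#include <string.h>

/* Truncate toward zero by clearing fraction bits below the binary point. */
static double trunc_bits(double x)
{
  uint64_t bits;
  memcpy(&bits, &x, sizeof(bits));               /* Reinterpret as raw bits. */
  int exp = (int)((bits >> 52) & 0x7ff) - 1023;  /* Unbiased exponent. */
  if (exp < 0)
    bits &= 1ULL << 63;                          /* |x| < 1: keep only the sign. */
  else if (exp < 52)
    bits &= ~((1ULL << (52 - exp)) - 1);         /* Drop fractional mantissa bits. */
  /* exp >= 52 (incl. inf/NaN): already integral, return unchanged. */
  memcpy(&x, &bits, sizeof(x));
  return x;
}

floor and ceil follow the same pattern, with an extra adjustment step when the discarded fraction bits require rounding away from the truncated value.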
Diffstat (limited to 'src')
-rw-r--r--  src/lj_asm_arm.h  |  24
-rw-r--r--  src/lj_vm.h       |   6
-rw-r--r--  src/vm_arm.dasc   | 115
3 files changed, 57 insertions, 88 deletions
diff --git a/src/lj_asm_arm.h b/src/lj_asm_arm.h
index a7465cb7..ef907fbe 100644
--- a/src/lj_asm_arm.h
+++ b/src/lj_asm_arm.h
@@ -1574,15 +1574,23 @@ static void asm_callid(ASMState *as, IRIns *ir, IRCallID id)
 static void asm_callround(ASMState *as, IRIns *ir, int id)
 {
   /* The modified regs must match with the *.dasc implementation. */
-  RegSet drop = RID2RSET(RID_D0)|RID2RSET(RID_D1)|RID2RSET(RID_D2)|
-		RID2RSET(RID_R0)|RID2RSET(RID_R1);
-  if (ra_hasreg(ir->r)) rset_clear(drop, ir->r);
+  RegSet drop = RID2RSET(RID_R0)|RID2RSET(RID_R1)|RID2RSET(RID_R2)|
+		RID2RSET(RID_R3)|RID2RSET(RID_R12);
+  RegSet of;
+  Reg dest, src;
   ra_evictset(as, drop);
-  ra_destreg(as, ir, RID_FPRET);
-  emit_call(as, id == IRFPM_FLOOR ? (void *)lj_vm_floor_hf :
-	    id == IRFPM_CEIL ? (void *)lj_vm_ceil_hf :
-	    (void *)lj_vm_trunc_hf);
-  ra_leftov(as, RID_D0, ir->op1);
+  dest = ra_dest(as, ir, RSET_FPR);
+  emit_dnm(as, ARMI_VMOV_D_RR, RID_RETLO, RID_RETHI, (dest & 15));
+  emit_call(as, id == IRFPM_FLOOR ? (void *)lj_vm_floor_sf :
+	    id == IRFPM_CEIL ? (void *)lj_vm_ceil_sf :
+	    (void *)lj_vm_trunc_sf);
+  /* Workaround to protect argument GPRs from being used for remat. */
+  of = as->freeset;
+  as->freeset &= ~RSET_RANGE(RID_R0, RID_R1+1);
+  as->cost[RID_R0] = as->cost[RID_R1] = REGCOST(~0u, ASMREF_L);
+  src = ra_alloc1(as, ir->op1, RSET_FPR);  /* May alloc GPR to remat FPR. */
+  as->freeset |= (of & RSET_RANGE(RID_R0, RID_R1+1));
+  emit_dnm(as, ARMI_VMOV_RR_D, RID_R0, RID_R1, (src & 15));
 }
 #endif
 
diff --git a/src/lj_vm.h b/src/lj_vm.h
index 813335e3..b6b3e7e5 100644
--- a/src/lj_vm.h
+++ b/src/lj_vm.h
@@ -56,8 +56,8 @@ LJ_ASMF void lj_vm_exit_interp(void);
 LJ_ASMF double lj_vm_floor(double);
 LJ_ASMF double lj_vm_ceil(double);
 #if LJ_TARGET_ARM
-LJ_ASMF double lj_vm_floor_hf(double);
-LJ_ASMF double lj_vm_ceil_hf(double);
+LJ_ASMF double lj_vm_floor_sf(double);
+LJ_ASMF double lj_vm_ceil_sf(double);
 #endif
 #endif
 #if defined(LUAJIT_NO_LOG2) || LJ_TARGET_X86ORX64
@@ -81,7 +81,7 @@ LJ_ASMF void lj_vm_powi_sse(void);
 #else
 LJ_ASMF double lj_vm_trunc(double);
 #if LJ_TARGET_ARM
-LJ_ASMF double lj_vm_trunc_hf(double);
+LJ_ASMF double lj_vm_trunc_sf(double);
 #endif
 #endif
 LJ_ASMF double lj_vm_powi(double, int32_t);
diff --git a/src/vm_arm.dasc b/src/vm_arm.dasc
index f00b3028..fb9363e4 100644
--- a/src/vm_arm.dasc
+++ b/src/vm_arm.dasc
@@ -1368,14 +1368,8 @@ static void build_subroutines(BuildCtx *ctx)
 | movmi CARG1, #0x80000000
 | bmi <1
 |4:
-|.if HFABI
-| vmov d0, CARG1, CARG2
-| bl ->vm_..func.._hf
-| b ->fff_resd
-|.else
-| bl ->vm_..func
+| bl ->vm_..func.._sf
 | b ->fff_restv
-|.endif
 |.endmacro
 |
 | math_round floor
@@ -2221,52 +2215,9 @@ static void build_subroutines(BuildCtx *ctx)
 |//
 |// double lj_vm_floor/ceil/trunc(double x);
 |.macro vm_round, func, hf
-|.if FPU
-|.if hf == 0
-| vmov d0, CARG1, CARG2
-| vldr d2, <8 // 2^52
-|.else
-| vldr d2, <8 // 2^52
+|.if hf == 1
 | vmov CARG1, CARG2, d0
 |.endif
-| vabs.f64 d1, d0
-| vcmp.f64 d1, d2 // |x| >= 2^52 or NaN?
-| vmrs
-|.if "func" == "trunc"
-| bxpl lr // Return argument unchanged.
-| vadd.f64 d0, d1, d2
-| vsub.f64 d0, d0, d2 // (|x| + 2^52) - 2^52
-| vldr d2, <9 // +1.0
-| vcmp.f64 d1, d0 // |x| < result: subtract +1.0
-| vmrs
-| vsubmi.f64 d0, d0, d2
-| cmp CARG2, #0
-| vnegmi.f64 d0, d0 // Merge sign bit back in.
-|.else
-| vadd.f64 d1, d1, d2
-| bxpl lr // Return argument unchanged.
-| cmp CARG2, #0
-| vsub.f64 d1, d1, d2 // (|x| + 2^52) - 2^52
-| vldr d2, <9 // +1.0
-| vnegmi.f64 d1, d1 // Merge sign bit back in.
-|.if "func" == "floor"
-| vcmp.f64 d0, d1 // x < result: subtract +1.0.
-| vmrs
-| vsubmi.f64 d0, d1, d2
-|.else
-| vcmp.f64 d1, d0 // x > result: add +1.0.
-| vmrs
-| vaddmi.f64 d0, d1, d2
-|.endif
-| vmovpl.f64 d0, d1
-|.endif
-|.if hf == 0
-| vmov CARG1, CARG2, d0
-|.endif
-| bx lr
-|
-|.else
-|
 | lsl CARG3, CARG2, #1
 | adds RB, CARG3, #0x00200000
 | bpl >2 // |x| < 1?
@@ -2286,6 +2237,9 @@ static void build_subroutines(BuildCtx *ctx)
 |.else
 | bics CARG3, CARG3, CARG2, asr #31 // iszero = ((ztest & ~signmask) == 0)
 |.endif
+|.if hf == 1
+| vmoveq d0, CARG1, CARG2
+|.endif
 | bxeq lr // iszero: done.
 | mvn CARG4, #1
 | cmp RB, #0
@@ -2294,6 +2248,9 @@ static void build_subroutines(BuildCtx *ctx)
 | add RB, RB, #32
 | subs CARG1, CARG1, CARG4, lsl RB // lo = lo-lomask
 | sbc CARG2, CARG2, CARG3 // hi = hi-himask+carry
+|.if hf == 1
+| vmov d0, CARG1, CARG2
+|.endif
 | bx lr
 |
 |2: // |x| < 1:
@@ -2308,45 +2265,41 @@ static void build_subroutines(BuildCtx *ctx)
 | and CARG2, CARG2, #0x80000000
 | ldrne CARG4, <9 // hi = sign(x) | (iszero ? 0.0 : 1.0)
 | orrne CARG2, CARG2, CARG4
-| bx lr
+|.if hf == 1
+| vmov d0, CARG1, CARG2
 |.endif
+| bx lr
 |.endmacro
 |
-|.if FPU
-|.align 8
-|9:
-| .long 0, 0x3ff00000 // +1.0
-|8:
-| .long 0, 0x43300000 // 2^52
-|.else
 |9:
 | .long 0x3ff00000 // hiword(+1.0)
-|.endif
 |
 |->vm_floor:
-|.if not HFABI
-| vm_round floor, 0
-|.endif
-|->vm_floor_hf:
-|.if FPU
+|.if HFABI
 | vm_round floor, 1
 |.endif
+|->vm_floor_sf:
+| vm_round floor, 0
 |
 |->vm_ceil:
-|.if not HFABI
-| vm_round ceil, 0
-|.endif
-|->vm_ceil_hf:
-|.if FPU
+|.if HFABI
 | vm_round ceil, 1
 |.endif
+|->vm_ceil_sf:
+| vm_round ceil, 0
 |
-|->vm_trunc:
-|.if JIT and not HFABI
+|.macro vm_trunc, hf
+|.if JIT
+|.if hf == 1
+| vmov CARG1, CARG2, d0
+|.endif
 | lsl CARG3, CARG2, #1
 | adds RB, CARG3, #0x00200000
 | andpl CARG2, CARG2, #0x80000000 // |x| < 1? hi = sign(x), lo = 0.
 | movpl CARG1, #0
+|.if hf == 1
+| vmovpl d0, CARG1, CARG2
+|.endif
 | bxpl lr
 | mvn CARG4, #0x3cc
 | subs RB, CARG4, RB, asr #21 // 2^0: RB = 51, 2^51: RB = 0.
@@ -2355,13 +2308,19 @@ static void build_subroutines(BuildCtx *ctx)
 | and CARG1, CARG1, CARG4, lsl RB // lo &= lomask
 | subs RB, RB, #32
 | andpl CARG2, CARG2, CARG4, lsl RB // |x| <= 2^20: hi &= himask
+|.if hf == 1
+| vmov d0, CARG1, CARG2
+|.endif
 | bx lr
 |.endif
+|.endmacro
 |
-|->vm_trunc_hf:
-|.if JIT and FPU
-| vm_round trunc, 1
+|->vm_trunc:
+|.if HFABI
+| vm_trunc 1
 |.endif
+|->vm_trunc_sf:
+| vm_trunc 0
 |
 | // double lj_vm_mod(double dividend, double divisor);
 |->vm_mod:
@@ -2369,7 +2328,9 @@ static void build_subroutines(BuildCtx *ctx)
 | // Special calling convention. Also, RC (r11) is not preserved.
 | vdiv.f64 d0, d6, d7
 | mov RC, lr
-| bl ->vm_floor_hf
+| vmov CARG1, CARG2, d0
+| bl ->vm_floor_sf
+| vmov d0, CARG1, CARG2
 | vmul.f64 d0, d0, d7
 | mov lr, RC
 | vsub.f64 d6, d6, d0
@@ -2377,7 +2338,7 @@ static void build_subroutines(BuildCtx *ctx)
 |.else
 | push {r0, r1, r2, r3, r4, lr}
 | bl extern __aeabi_ddiv
-| bl ->vm_floor
+| bl ->vm_floor_sf
 | ldrd CARG34, [sp, #8]
 | bl extern __aeabi_dmul
 | ldrd CARG34, [sp]