diff options
| author | Mike Pall <mike> | 2012-10-15 16:53:03 +0200 |
|---|---|---|
| committer | Mike Pall <mike> | 2012-10-15 16:53:03 +0200 |
| commit | 2621617a9292ea821a0630339e20e83e11858a5e (patch) | |
| tree | b9fc47ee02e0afeb124466455ee07d27da32fc58 /src | |
| parent | 894d2d6ef4bf50a7c355e49e4508de5d07edad2d (diff) | |
| download | luajit-2621617a9292ea821a0630339e20e83e11858a5e.tar.gz luajit-2621617a9292ea821a0630339e20e83e11858a5e.tar.bz2 luajit-2621617a9292ea821a0630339e20e83e11858a5e.zip | |
ARM: Drop hard-fp variants of floor/ceil/trunc.
Soft-fp variants are faster on a Cortex-A9. Duh.
Diffstat (limited to 'src')
| -rw-r--r-- | src/lj_asm_arm.h | 24 | ||||
| -rw-r--r-- | src/lj_vm.h | 6 | ||||
| -rw-r--r-- | src/vm_arm.dasc | 115 |
3 files changed, 57 insertions, 88 deletions
```diff
diff --git a/src/lj_asm_arm.h b/src/lj_asm_arm.h
index a7465cb7..ef907fbe 100644
--- a/src/lj_asm_arm.h
+++ b/src/lj_asm_arm.h
```
| @@ -1574,15 +1574,23 @@ static void asm_callid(ASMState *as, IRIns *ir, IRCallID id) | |||
| 1574 | static void asm_callround(ASMState *as, IRIns *ir, int id) | 1574 | static void asm_callround(ASMState *as, IRIns *ir, int id) |
| 1575 | { | 1575 | { |
| 1576 | /* The modified regs must match with the *.dasc implementation. */ | 1576 | /* The modified regs must match with the *.dasc implementation. */ |
| 1577 | RegSet drop = RID2RSET(RID_D0)|RID2RSET(RID_D1)|RID2RSET(RID_D2)| | 1577 | RegSet drop = RID2RSET(RID_R0)|RID2RSET(RID_R1)|RID2RSET(RID_R2)| |
| 1578 | RID2RSET(RID_R0)|RID2RSET(RID_R1); | 1578 | RID2RSET(RID_R3)|RID2RSET(RID_R12); |
| 1579 | if (ra_hasreg(ir->r)) rset_clear(drop, ir->r); | 1579 | RegSet of; |
| 1580 | Reg dest, src; | ||
| 1580 | ra_evictset(as, drop); | 1581 | ra_evictset(as, drop); |
| 1581 | ra_destreg(as, ir, RID_FPRET); | 1582 | dest = ra_dest(as, ir, RSET_FPR); |
| 1582 | emit_call(as, id == IRFPM_FLOOR ? (void *)lj_vm_floor_hf : | 1583 | emit_dnm(as, ARMI_VMOV_D_RR, RID_RETLO, RID_RETHI, (dest & 15)); |
| 1583 | id == IRFPM_CEIL ? (void *)lj_vm_ceil_hf : | 1584 | emit_call(as, id == IRFPM_FLOOR ? (void *)lj_vm_floor_sf : |
| 1584 | (void *)lj_vm_trunc_hf); | 1585 | id == IRFPM_CEIL ? (void *)lj_vm_ceil_sf : |
| 1585 | ra_leftov(as, RID_D0, ir->op1); | 1586 | (void *)lj_vm_trunc_sf); |
| 1587 | /* Workaround to protect argument GPRs from being used for remat. */ | ||
| 1588 | of = as->freeset; | ||
| 1589 | as->freeset &= ~RSET_RANGE(RID_R0, RID_R1+1); | ||
| 1590 | as->cost[RID_R0] = as->cost[RID_R1] = REGCOST(~0u, ASMREF_L); | ||
| 1591 | src = ra_alloc1(as, ir->op1, RSET_FPR); /* May alloc GPR to remat FPR. */ | ||
| 1592 | as->freeset |= (of & RSET_RANGE(RID_R0, RID_R1+1)); | ||
| 1593 | emit_dnm(as, ARMI_VMOV_RR_D, RID_R0, RID_R1, (src & 15)); | ||
| 1586 | } | 1594 | } |
| 1587 | #endif | 1595 | #endif |
| 1588 | 1596 | ||
```diff
diff --git a/src/lj_vm.h b/src/lj_vm.h
index 813335e3..b6b3e7e5 100644
--- a/src/lj_vm.h
+++ b/src/lj_vm.h
```
| @@ -56,8 +56,8 @@ LJ_ASMF void lj_vm_exit_interp(void); | |||
| 56 | LJ_ASMF double lj_vm_floor(double); | 56 | LJ_ASMF double lj_vm_floor(double); |
| 57 | LJ_ASMF double lj_vm_ceil(double); | 57 | LJ_ASMF double lj_vm_ceil(double); |
| 58 | #if LJ_TARGET_ARM | 58 | #if LJ_TARGET_ARM |
| 59 | LJ_ASMF double lj_vm_floor_hf(double); | 59 | LJ_ASMF double lj_vm_floor_sf(double); |
| 60 | LJ_ASMF double lj_vm_ceil_hf(double); | 60 | LJ_ASMF double lj_vm_ceil_sf(double); |
| 61 | #endif | 61 | #endif |
| 62 | #endif | 62 | #endif |
| 63 | #if defined(LUAJIT_NO_LOG2) || LJ_TARGET_X86ORX64 | 63 | #if defined(LUAJIT_NO_LOG2) || LJ_TARGET_X86ORX64 |
| @@ -81,7 +81,7 @@ LJ_ASMF void lj_vm_powi_sse(void); | |||
| 81 | #else | 81 | #else |
| 82 | LJ_ASMF double lj_vm_trunc(double); | 82 | LJ_ASMF double lj_vm_trunc(double); |
| 83 | #if LJ_TARGET_ARM | 83 | #if LJ_TARGET_ARM |
| 84 | LJ_ASMF double lj_vm_trunc_hf(double); | 84 | LJ_ASMF double lj_vm_trunc_sf(double); |
| 85 | #endif | 85 | #endif |
| 86 | #endif | 86 | #endif |
| 87 | LJ_ASMF double lj_vm_powi(double, int32_t); | 87 | LJ_ASMF double lj_vm_powi(double, int32_t); |
```diff
diff --git a/src/vm_arm.dasc b/src/vm_arm.dasc
index f00b3028..fb9363e4 100644
--- a/src/vm_arm.dasc
+++ b/src/vm_arm.dasc
```
| @@ -1368,14 +1368,8 @@ static void build_subroutines(BuildCtx *ctx) | |||
| 1368 | | movmi CARG1, #0x80000000 | 1368 | | movmi CARG1, #0x80000000 |
| 1369 | | bmi <1 | 1369 | | bmi <1 |
| 1370 | |4: | 1370 | |4: |
| 1371 | |.if HFABI | 1371 | | bl ->vm_..func.._sf |
| 1372 | | vmov d0, CARG1, CARG2 | ||
| 1373 | | bl ->vm_..func.._hf | ||
| 1374 | | b ->fff_resd | ||
| 1375 | |.else | ||
| 1376 | | bl ->vm_..func | ||
| 1377 | | b ->fff_restv | 1372 | | b ->fff_restv |
| 1378 | |.endif | ||
| 1379 | |.endmacro | 1373 | |.endmacro |
| 1380 | | | 1374 | | |
| 1381 | | math_round floor | 1375 | | math_round floor |
| @@ -2221,52 +2215,9 @@ static void build_subroutines(BuildCtx *ctx) | |||
| 2221 | |// | 2215 | |// |
| 2222 | |// double lj_vm_floor/ceil/trunc(double x); | 2216 | |// double lj_vm_floor/ceil/trunc(double x); |
| 2223 | |.macro vm_round, func, hf | 2217 | |.macro vm_round, func, hf |
| 2224 | |.if FPU | 2218 | |.if hf == 1 |
| 2225 | |.if hf == 0 | ||
| 2226 | | vmov d0, CARG1, CARG2 | ||
| 2227 | | vldr d2, <8 // 2^52 | ||
| 2228 | |.else | ||
| 2229 | | vldr d2, <8 // 2^52 | ||
| 2230 | | vmov CARG1, CARG2, d0 | 2219 | | vmov CARG1, CARG2, d0 |
| 2231 | |.endif | 2220 | |.endif |
| 2232 | | vabs.f64 d1, d0 | ||
| 2233 | | vcmp.f64 d1, d2 // |x| >= 2^52 or NaN? | ||
| 2234 | | vmrs | ||
| 2235 | |.if "func" == "trunc" | ||
| 2236 | | bxpl lr // Return argument unchanged. | ||
| 2237 | | vadd.f64 d0, d1, d2 | ||
| 2238 | | vsub.f64 d0, d0, d2 // (|x| + 2^52) - 2^52 | ||
| 2239 | | vldr d2, <9 // +1.0 | ||
| 2240 | | vcmp.f64 d1, d0 // |x| < result: subtract +1.0 | ||
| 2241 | | vmrs | ||
| 2242 | | vsubmi.f64 d0, d0, d2 | ||
| 2243 | | cmp CARG2, #0 | ||
| 2244 | | vnegmi.f64 d0, d0 // Merge sign bit back in. | ||
| 2245 | |.else | ||
| 2246 | | vadd.f64 d1, d1, d2 | ||
| 2247 | | bxpl lr // Return argument unchanged. | ||
| 2248 | | cmp CARG2, #0 | ||
| 2249 | | vsub.f64 d1, d1, d2 // (|x| + 2^52) - 2^52 | ||
| 2250 | | vldr d2, <9 // +1.0 | ||
| 2251 | | vnegmi.f64 d1, d1 // Merge sign bit back in. | ||
| 2252 | |.if "func" == "floor" | ||
| 2253 | | vcmp.f64 d0, d1 // x < result: subtract +1.0. | ||
| 2254 | | vmrs | ||
| 2255 | | vsubmi.f64 d0, d1, d2 | ||
| 2256 | |.else | ||
| 2257 | | vcmp.f64 d1, d0 // x > result: add +1.0. | ||
| 2258 | | vmrs | ||
| 2259 | | vaddmi.f64 d0, d1, d2 | ||
| 2260 | |.endif | ||
| 2261 | | vmovpl.f64 d0, d1 | ||
| 2262 | |.endif | ||
| 2263 | |.if hf == 0 | ||
| 2264 | | vmov CARG1, CARG2, d0 | ||
| 2265 | |.endif | ||
| 2266 | | bx lr | ||
| 2267 | | | ||
| 2268 | |.else | ||
| 2269 | | | ||
| 2270 | | lsl CARG3, CARG2, #1 | 2221 | | lsl CARG3, CARG2, #1 |
| 2271 | | adds RB, CARG3, #0x00200000 | 2222 | | adds RB, CARG3, #0x00200000 |
| 2272 | | bpl >2 // |x| < 1? | 2223 | | bpl >2 // |x| < 1? |
| @@ -2286,6 +2237,9 @@ static void build_subroutines(BuildCtx *ctx) | |||
| 2286 | |.else | 2237 | |.else |
| 2287 | | bics CARG3, CARG3, CARG2, asr #31 // iszero = ((ztest & ~signmask) == 0) | 2238 | | bics CARG3, CARG3, CARG2, asr #31 // iszero = ((ztest & ~signmask) == 0) |
| 2288 | |.endif | 2239 | |.endif |
| 2240 | |.if hf == 1 | ||
| 2241 | | vmoveq d0, CARG1, CARG2 | ||
| 2242 | |.endif | ||
| 2289 | | bxeq lr // iszero: done. | 2243 | | bxeq lr // iszero: done. |
| 2290 | | mvn CARG4, #1 | 2244 | | mvn CARG4, #1 |
| 2291 | | cmp RB, #0 | 2245 | | cmp RB, #0 |
| @@ -2294,6 +2248,9 @@ static void build_subroutines(BuildCtx *ctx) | |||
| 2294 | | add RB, RB, #32 | 2248 | | add RB, RB, #32 |
| 2295 | | subs CARG1, CARG1, CARG4, lsl RB // lo = lo-lomask | 2249 | | subs CARG1, CARG1, CARG4, lsl RB // lo = lo-lomask |
| 2296 | | sbc CARG2, CARG2, CARG3 // hi = hi-himask+carry | 2250 | | sbc CARG2, CARG2, CARG3 // hi = hi-himask+carry |
| 2251 | |.if hf == 1 | ||
| 2252 | | vmov d0, CARG1, CARG2 | ||
| 2253 | |.endif | ||
| 2297 | | bx lr | 2254 | | bx lr |
| 2298 | | | 2255 | | |
| 2299 | |2: // |x| < 1: | 2256 | |2: // |x| < 1: |
| @@ -2308,45 +2265,41 @@ static void build_subroutines(BuildCtx *ctx) | |||
| 2308 | | and CARG2, CARG2, #0x80000000 | 2265 | | and CARG2, CARG2, #0x80000000 |
| 2309 | | ldrne CARG4, <9 // hi = sign(x) | (iszero ? 0.0 : 1.0) | 2266 | | ldrne CARG4, <9 // hi = sign(x) | (iszero ? 0.0 : 1.0) |
| 2310 | | orrne CARG2, CARG2, CARG4 | 2267 | | orrne CARG2, CARG2, CARG4 |
| 2311 | | bx lr | 2268 | |.if hf == 1 |
| 2269 | | vmov d0, CARG1, CARG2 | ||
| 2312 | |.endif | 2270 | |.endif |
| 2271 | | bx lr | ||
| 2313 | |.endmacro | 2272 | |.endmacro |
| 2314 | | | 2273 | | |
| 2315 | |.if FPU | ||
| 2316 | |.align 8 | ||
| 2317 | |9: | ||
| 2318 | | .long 0, 0x3ff00000 // +1.0 | ||
| 2319 | |8: | ||
| 2320 | | .long 0, 0x43300000 // 2^52 | ||
| 2321 | |.else | ||
| 2322 | |9: | 2274 | |9: |
| 2323 | | .long 0x3ff00000 // hiword(+1.0) | 2275 | | .long 0x3ff00000 // hiword(+1.0) |
| 2324 | |.endif | ||
| 2325 | | | 2276 | | |
| 2326 | |->vm_floor: | 2277 | |->vm_floor: |
| 2327 | |.if not HFABI | 2278 | |.if HFABI |
| 2328 | | vm_round floor, 0 | ||
| 2329 | |.endif | ||
| 2330 | |->vm_floor_hf: | ||
| 2331 | |.if FPU | ||
| 2332 | | vm_round floor, 1 | 2279 | | vm_round floor, 1 |
| 2333 | |.endif | 2280 | |.endif |
| 2281 | |->vm_floor_sf: | ||
| 2282 | | vm_round floor, 0 | ||
| 2334 | | | 2283 | | |
| 2335 | |->vm_ceil: | 2284 | |->vm_ceil: |
| 2336 | |.if not HFABI | 2285 | |.if HFABI |
| 2337 | | vm_round ceil, 0 | ||
| 2338 | |.endif | ||
| 2339 | |->vm_ceil_hf: | ||
| 2340 | |.if FPU | ||
| 2341 | | vm_round ceil, 1 | 2286 | | vm_round ceil, 1 |
| 2342 | |.endif | 2287 | |.endif |
| 2288 | |->vm_ceil_sf: | ||
| 2289 | | vm_round ceil, 0 | ||
| 2343 | | | 2290 | | |
| 2344 | |->vm_trunc: | 2291 | |.macro vm_trunc, hf |
| 2345 | |.if JIT and not HFABI | 2292 | |.if JIT |
| 2293 | |.if hf == 1 | ||
| 2294 | | vmov CARG1, CARG2, d0 | ||
| 2295 | |.endif | ||
| 2346 | | lsl CARG3, CARG2, #1 | 2296 | | lsl CARG3, CARG2, #1 |
| 2347 | | adds RB, CARG3, #0x00200000 | 2297 | | adds RB, CARG3, #0x00200000 |
| 2348 | | andpl CARG2, CARG2, #0x80000000 // |x| < 1? hi = sign(x), lo = 0. | 2298 | | andpl CARG2, CARG2, #0x80000000 // |x| < 1? hi = sign(x), lo = 0. |
| 2349 | | movpl CARG1, #0 | 2299 | | movpl CARG1, #0 |
| 2300 | |.if hf == 1 | ||
| 2301 | | vmovpl d0, CARG1, CARG2 | ||
| 2302 | |.endif | ||
| 2350 | | bxpl lr | 2303 | | bxpl lr |
| 2351 | | mvn CARG4, #0x3cc | 2304 | | mvn CARG4, #0x3cc |
| 2352 | | subs RB, CARG4, RB, asr #21 // 2^0: RB = 51, 2^51: RB = 0. | 2305 | | subs RB, CARG4, RB, asr #21 // 2^0: RB = 51, 2^51: RB = 0. |
| @@ -2355,13 +2308,19 @@ static void build_subroutines(BuildCtx *ctx) | |||
| 2355 | | and CARG1, CARG1, CARG4, lsl RB // lo &= lomask | 2308 | | and CARG1, CARG1, CARG4, lsl RB // lo &= lomask |
| 2356 | | subs RB, RB, #32 | 2309 | | subs RB, RB, #32 |
| 2357 | | andpl CARG2, CARG2, CARG4, lsl RB // |x| <= 2^20: hi &= himask | 2310 | | andpl CARG2, CARG2, CARG4, lsl RB // |x| <= 2^20: hi &= himask |
| 2311 | |.if hf == 1 | ||
| 2312 | | vmov d0, CARG1, CARG2 | ||
| 2313 | |.endif | ||
| 2358 | | bx lr | 2314 | | bx lr |
| 2359 | |.endif | 2315 | |.endif |
| 2316 | |.endmacro | ||
| 2360 | | | 2317 | | |
| 2361 | |->vm_trunc_hf: | 2318 | |->vm_trunc: |
| 2362 | |.if JIT and FPU | 2319 | |.if HFABI |
| 2363 | | vm_round trunc, 1 | 2320 | | vm_trunc 1 |
| 2364 | |.endif | 2321 | |.endif |
| 2322 | |->vm_trunc_sf: | ||
| 2323 | | vm_trunc 0 | ||
| 2365 | | | 2324 | | |
| 2366 | | // double lj_vm_mod(double dividend, double divisor); | 2325 | | // double lj_vm_mod(double dividend, double divisor); |
| 2367 | |->vm_mod: | 2326 | |->vm_mod: |
| @@ -2369,7 +2328,9 @@ static void build_subroutines(BuildCtx *ctx) | |||
| 2369 | | // Special calling convention. Also, RC (r11) is not preserved. | 2328 | | // Special calling convention. Also, RC (r11) is not preserved. |
| 2370 | | vdiv.f64 d0, d6, d7 | 2329 | | vdiv.f64 d0, d6, d7 |
| 2371 | | mov RC, lr | 2330 | | mov RC, lr |
| 2372 | | bl ->vm_floor_hf | 2331 | | vmov CARG1, CARG2, d0 |
| 2332 | | bl ->vm_floor_sf | ||
| 2333 | | vmov d0, CARG1, CARG2 | ||
| 2373 | | vmul.f64 d0, d0, d7 | 2334 | | vmul.f64 d0, d0, d7 |
| 2374 | | mov lr, RC | 2335 | | mov lr, RC |
| 2375 | | vsub.f64 d6, d6, d0 | 2336 | | vsub.f64 d6, d6, d0 |
| @@ -2377,7 +2338,7 @@ static void build_subroutines(BuildCtx *ctx) | |||
| 2377 | |.else | 2338 | |.else |
| 2378 | | push {r0, r1, r2, r3, r4, lr} | 2339 | | push {r0, r1, r2, r3, r4, lr} |
| 2379 | | bl extern __aeabi_ddiv | 2340 | | bl extern __aeabi_ddiv |
| 2380 | | bl ->vm_floor | 2341 | | bl ->vm_floor_sf |
| 2381 | | ldrd CARG34, [sp, #8] | 2342 | | ldrd CARG34, [sp, #8] |
| 2382 | | bl extern __aeabi_dmul | 2343 | | bl extern __aeabi_dmul |
| 2383 | | ldrd CARG34, [sp] | 2344 | | ldrd CARG34, [sp] |
