diff options
| -rw-r--r-- | src/lj_asm.c | 4 | ||||
| -rw-r--r-- | src/lj_emit_arm64.h | 73 |
2 files changed, 41 insertions(+), 36 deletions(-)
diff --git a/src/lj_asm.c b/src/lj_asm.c index c02a1b9e..844910ad 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c | |||
| @@ -606,7 +606,11 @@ static Reg ra_allock(ASMState *as, intptr_t k, RegSet allow) | |||
| 606 | IRIns *ir = IR(ref); | 606 | IRIns *ir = IR(ref); |
| 607 | if ((ir->o == IR_KINT64 && k == (int64_t)ir_kint64(ir)->u64) || | 607 | if ((ir->o == IR_KINT64 && k == (int64_t)ir_kint64(ir)->u64) || |
| 608 | #if LJ_GC64 | 608 | #if LJ_GC64 |
| 609 | #if LJ_TARGET_ARM64 | ||
| 610 | (ir->o == IR_KINT && (uint64_t)k == (uint32_t)ir->i) || | ||
| 611 | #else | ||
| 609 | (ir->o == IR_KINT && k == ir->i) || | 612 | (ir->o == IR_KINT && k == ir->i) || |
| 613 | #endif | ||
| 610 | (ir->o == IR_KGC && k == (intptr_t)ir_kgc(ir)) || | 614 | (ir->o == IR_KGC && k == (intptr_t)ir_kgc(ir)) || |
| 611 | ((ir->o == IR_KPTR || ir->o == IR_KKPTR) && | 615 | ((ir->o == IR_KPTR || ir->o == IR_KKPTR) && |
| 612 | k == (intptr_t)ir_kptr(ir)) | 616 | k == (intptr_t)ir_kptr(ir)) |
diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h index 86626177..50e658dd 100644 --- a/src/lj_emit_arm64.h +++ b/src/lj_emit_arm64.h | |||
| @@ -20,7 +20,7 @@ static uint64_t get_k64val(ASMState *as, IRRef ref) | |||
| 20 | } else { | 20 | } else { |
| 21 | lj_assertA(ir->o == IR_KINT || ir->o == IR_KNULL, | 21 | lj_assertA(ir->o == IR_KINT || ir->o == IR_KNULL, |
| 22 | "bad 64 bit const IR op %d", ir->o); | 22 | "bad 64 bit const IR op %d", ir->o); |
| 23 | return ir->i; /* Sign-extended. */ | 23 | return (uint32_t)ir->i; /* Zero-extended. */ |
| 24 | } | 24 | } |
| 25 | } | 25 | } |
| 26 | 26 | ||
| @@ -152,11 +152,10 @@ nopair: | |||
| 152 | /* Prefer rematerialization of BASE/L from global_State over spills. */ | 152 | /* Prefer rematerialization of BASE/L from global_State over spills. */ |
| 153 | #define emit_canremat(ref) ((ref) <= ASMREF_L) | 153 | #define emit_canremat(ref) ((ref) <= ASMREF_L) |
| 154 | 154 | ||
| 155 | /* Try to find an N-step delta relative to other consts with N < lim. */ | 155 | /* Try to find a one-step delta relative to other consts. */ |
| 156 | static int emit_kdelta(ASMState *as, Reg rd, uint64_t k, int lim) | 156 | static int emit_kdelta(ASMState *as, Reg rd, uint64_t k, int is64) |
| 157 | { | 157 | { |
| 158 | RegSet work = (~as->freeset & RSET_GPR) | RID2RSET(RID_GL); | 158 | RegSet work = (~as->freeset & RSET_GPR) | RID2RSET(RID_GL); |
| 159 | if (lim <= 1) return 0; /* Can't beat that. */ | ||
| 160 | while (work) { | 159 | while (work) { |
| 161 | Reg r = rset_picktop(work); | 160 | Reg r = rset_picktop(work); |
| 162 | IRRef ref = regcost_ref(as->cost[r]); | 161 | IRRef ref = regcost_ref(as->cost[r]); |
| @@ -165,13 +164,14 @@ static int emit_kdelta(ASMState *as, Reg rd, uint64_t k, int lim) | |||
| 165 | uint64_t kx = ra_iskref(ref) ? (uint64_t)ra_krefk(as, ref) : | 164 | uint64_t kx = ra_iskref(ref) ? (uint64_t)ra_krefk(as, ref) : |
| 166 | get_k64val(as, ref); | 165 | get_k64val(as, ref); |
| 167 | int64_t delta = (int64_t)(k - kx); | 166 | int64_t delta = (int64_t)(k - kx); |
| 167 | if (!is64) delta = (int64_t)(int32_t)delta; /* Sign-extend. */ | ||
| 168 | if (delta == 0) { | 168 | if (delta == 0) { |
| 169 | emit_dm(as, A64I_MOVx, rd, r); | 169 | emit_dm(as, is64|A64I_MOVw, rd, r); |
| 170 | return 1; | 170 | return 1; |
| 171 | } else { | 171 | } else { |
| 172 | uint32_t k12 = emit_isk12(delta < 0 ? (int64_t)(~(uint64_t)delta+1u) : delta); | 172 | uint32_t k12 = emit_isk12(delta < 0 ? (int64_t)(~(uint64_t)delta+1u) : delta); |
| 173 | if (k12) { | 173 | if (k12) { |
| 174 | emit_dn(as, (delta < 0 ? A64I_SUBx : A64I_ADDx)^k12, rd, r); | 174 | emit_dn(as, (delta < 0 ? A64I_SUBw : A64I_ADDw)^is64^k12, rd, r); |
| 175 | return 1; | 175 | return 1; |
| 176 | } | 176 | } |
| 177 | /* Do other ops or multi-step deltas pay off? Probably not. | 177 | /* Do other ops or multi-step deltas pay off? Probably not. |
| @@ -184,51 +184,52 @@ static int emit_kdelta(ASMState *as, Reg rd, uint64_t k, int lim) | |||
| 184 | return 0; /* Failed. */ | 184 | return 0; /* Failed. */ |
| 185 | } | 185 | } |
| 186 | 186 | ||
| 187 | static void emit_loadk(ASMState *as, Reg rd, uint64_t u64, int is64) | 187 | static void emit_loadk(ASMState *as, Reg rd, uint64_t u64) |
| 188 | { | 188 | { |
| 189 | int i, zeros = 0, ones = 0, neg; | 189 | int zeros = 0, ones = 0, neg, lshift = 0; |
| 190 | if (!is64) u64 = (int64_t)(int32_t)u64; /* Sign-extend. */ | 190 | int is64 = (u64 >> 32) ? A64I_X : 0, i = is64 ? 4 : 2; |
| 191 | /* Count homogeneous 16 bit fragments. */ | 191 | /* Count non-homogeneous 16 bit fragments. */ |
| 192 | for (i = 0; i < 4; i++) { | 192 | while (--i >= 0) { |
| 193 | uint64_t frag = (u64 >> i*16) & 0xffff; | 193 | uint32_t frag = (u64 >> i*16) & 0xffff; |
| 194 | zeros += (frag == 0); | 194 | zeros += (frag != 0); |
| 195 | ones += (frag == 0xffff); | 195 | ones += (frag != 0xffff); |
| 196 | } | 196 | } |
| 197 | neg = ones > zeros; /* Use MOVN if it pays off. */ | 197 | neg = ones < zeros; /* Use MOVN if it pays off. */ |
| 198 | if ((neg ? ones : zeros) < 3) { /* Need 2+ ins. Try shorter K13 encoding. */ | 198 | if ((neg ? ones : zeros) > 1) { /* Need 2+ ins. Try 1 ins encodings. */ |
| 199 | uint32_t k13 = emit_isk13(u64, is64); | 199 | uint32_t k13 = emit_isk13(u64, is64); |
| 200 | if (k13) { | 200 | if (k13) { |
| 201 | emit_dn(as, (is64|A64I_ORRw)^k13, rd, RID_ZERO); | 201 | emit_dn(as, (is64|A64I_ORRw)^k13, rd, RID_ZERO); |
| 202 | return; | 202 | return; |
| 203 | } | 203 | } |
| 204 | } | 204 | if (emit_kdelta(as, rd, u64, is64)) { |
| 205 | if (!emit_kdelta(as, rd, u64, 4 - (neg ? ones : zeros))) { | 205 | return; |
| 206 | int shift = 0, lshift = 0; | ||
| 207 | uint64_t n64 = neg ? ~u64 : u64; | ||
| 208 | if (n64 != 0) { | ||
| 209 | /* Find first/last fragment to be filled. */ | ||
| 210 | shift = (63-emit_clz64(n64)) & ~15; | ||
| 211 | lshift = emit_ctz64(n64) & ~15; | ||
| 212 | } | 206 | } |
| 213 | /* MOVK requires the original value (u64). */ | 207 | } |
| 214 | while (shift > lshift) { | 208 | if (neg) { |
| 215 | uint32_t u16 = (u64 >> shift) & 0xffff; | 209 | u64 = ~u64; |
| 216 | /* Skip fragments that are correctly filled by MOVN/MOVZ. */ | 210 | if (!is64) u64 = (uint32_t)u64; |
| 217 | if (u16 != (neg ? 0xffff : 0)) | 211 | } |
| 218 | emit_d(as, is64 | A64I_MOVKw | A64F_U16(u16) | A64F_LSL16(shift), rd); | 212 | if (u64) { |
| 219 | shift -= 16; | 213 | /* Find first/last fragment to be filled. */ |
| 214 | int shift = (63-emit_clz64(u64)) & ~15; | ||
| 215 | lshift = emit_ctz64(u64) & ~15; | ||
| 216 | for (; shift > lshift; shift -= 16) { | ||
| 217 | uint32_t frag = (u64 >> shift) & 0xffff; | ||
| 218 | if (frag == 0) continue; /* Will be correctly filled by MOVN/MOVZ. */ | ||
| 219 | if (neg) frag ^= 0xffff; /* MOVK requires the original value. */ | ||
| 220 | emit_d(as, is64 | A64I_MOVKw | A64F_U16(frag) | A64F_LSL16(shift), rd); | ||
| 220 | } | 221 | } |
| 221 | /* But MOVN needs an inverted value (n64). */ | ||
| 222 | emit_d(as, (neg ? A64I_MOVNx : A64I_MOVZx) | | ||
| 223 | A64F_U16((n64 >> lshift) & 0xffff) | A64F_LSL16(lshift), rd); | ||
| 224 | } | 222 | } |
| 223 | /* But MOVN needs an inverted value. */ | ||
| 224 | emit_d(as, is64 | (neg ? A64I_MOVNw : A64I_MOVZw) | | ||
| 225 | A64F_U16((u64 >> lshift) & 0xffff) | A64F_LSL16(lshift), rd); | ||
| 225 | } | 226 | } |
| 226 | 227 | ||
| 227 | /* Load a 32 bit constant into a GPR. */ | 228 | /* Load a 32 bit constant into a GPR. */ |
| 228 | #define emit_loadi(as, rd, i) emit_loadk(as, rd, i, 0) | 229 | #define emit_loadi(as, rd, i) emit_loadk(as, rd, (uint32_t)i) |
| 229 | 230 | ||
| 230 | /* Load a 64 bit constant into a GPR. */ | 231 | /* Load a 64 bit constant into a GPR. */ |
| 231 | #define emit_loadu64(as, rd, i) emit_loadk(as, rd, i, A64I_X) | 232 | #define emit_loadu64(as, rd, i) emit_loadk(as, rd, i) |
| 232 | 233 | ||
| 233 | #define glofs(as, k) \ | 234 | #define glofs(as, k) \ |
| 234 | ((intptr_t)((uintptr_t)(k) - (uintptr_t)&J2GG(as->J)->g)) | 235 | ((intptr_t)((uintptr_t)(k) - (uintptr_t)&J2GG(as->J)->g)) |
