diff options
author | Mike Pall <mike> | 2016-03-28 23:05:20 +0200 |
---|---|---|
committer | Mike Pall <mike> | 2016-03-28 23:05:20 +0200 |
commit | 892887e5841fc91d8f954e780310a66404cbaadc (patch) | |
tree | 44aa22b73bb795a6d21f76c22db0ad84882a0d2b | |
parent | 6801e7165c3a5031db3cfe0e52f50cebb918695f (diff) | |
download | luajit-892887e5841fc91d8f954e780310a66404cbaadc.tar.gz luajit-892887e5841fc91d8f954e780310a66404cbaadc.tar.bz2 luajit-892887e5841fc91d8f954e780310a66404cbaadc.zip |
x86: Generate BMI2 shifts and rotates, if available.
Contributed by Peter Cawley.
Diffstat (limited to '')
-rw-r--r-- | src/jit/dis_x86.lua | 3 | ||||
-rw-r--r-- | src/lj_asm.c | 5 | ||||
-rw-r--r-- | src/lj_asm_x86.h | 28 | ||||
-rw-r--r-- | src/lj_emit_x86.h | 11 | ||||
-rw-r--r-- | src/lj_target_x86.h | 11 |
5 files changed, 51 insertions, 7 deletions
diff --git a/src/jit/dis_x86.lua b/src/jit/dis_x86.lua index f8a21ff3..d564988e 100644 --- a/src/jit/dis_x86.lua +++ b/src/jit/dis_x86.lua | |||
@@ -244,6 +244,7 @@ nil,"||psrlvVSXrvm","||psravdXrvm","||psllvVSXrvm", | |||
244 | [0xde] = "||aesdecXrvm", [0xdf] = "||aesdeclastXrvm", | 244 | [0xde] = "||aesdecXrvm", [0xdf] = "||aesdeclastXrvm", |
245 | --Fx | 245 | --Fx |
246 | [0xf0] = "|||crc32TrBmt",[0xf1] = "|||crc32TrVmt", | 246 | [0xf0] = "|||crc32TrBmt",[0xf1] = "|||crc32TrVmt", |
247 | [0xf7] = "|sarxVrmv|shlxVrmv|shrxVrmv", | ||
247 | }, | 248 | }, |
248 | 249 | ||
249 | ["3a"] = { -- [66] 0f 3a xx | 250 | ["3a"] = { -- [66] 0f 3a xx |
@@ -273,6 +274,8 @@ nil,nil,nil,nil, | |||
273 | [0x60] = "||pcmpestrmXrmu",[0x61] = "||pcmpestriXrmu", | 274 | [0x60] = "||pcmpestrmXrmu",[0x61] = "||pcmpestriXrmu", |
274 | [0x62] = "||pcmpistrmXrmu",[0x63] = "||pcmpistriXrmu", | 275 | [0x62] = "||pcmpistrmXrmu",[0x63] = "||pcmpistriXrmu", |
275 | [0xdf] = "||aeskeygenassistXrmu", | 276 | [0xdf] = "||aeskeygenassistXrmu", |
277 | --Fx | ||
278 | [0xf0] = "|||rorxVrmu", | ||
276 | }, | 279 | }, |
277 | } | 280 | } |
278 | 281 | ||
diff --git a/src/lj_asm.c b/src/lj_asm.c index 93f6bcd6..94d7bfc4 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c | |||
@@ -2150,7 +2150,10 @@ static void asm_setup_regsp(ASMState *as) | |||
2150 | #endif | 2150 | #endif |
2151 | #if LJ_TARGET_X86ORX64 | 2151 | #if LJ_TARGET_X86ORX64 |
2152 | /* Non-constant shift counts need to be in RID_ECX on x86/x64. */ | 2152 | /* Non-constant shift counts need to be in RID_ECX on x86/x64. */ |
2153 | case IR_BSHL: case IR_BSHR: case IR_BSAR: case IR_BROL: case IR_BROR: | 2153 | case IR_BSHL: case IR_BSHR: case IR_BSAR: |
2154 | if ((as->flags & JIT_F_BMI2)) /* Except if BMI2 is available. */ | ||
2155 | break; | ||
2156 | case IR_BROL: case IR_BROR: | ||
2154 | if (!irref_isk(ir->op2) && !ra_hashint(IR(ir->op2)->r)) { | 2157 | if (!irref_isk(ir->op2) && !ra_hashint(IR(ir->op2)->r)) { |
2155 | IR(ir->op2)->r = REGSP_HINT(RID_ECX); | 2158 | IR(ir->op2)->r = REGSP_HINT(RID_ECX); |
2156 | if (inloop) | 2159 | if (inloop) |
diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index 512e0534..718cb12e 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h | |||
@@ -1956,7 +1956,7 @@ static void asm_bswap(ASMState *as, IRIns *ir) | |||
1956 | #define asm_bor(as, ir) asm_intarith(as, ir, XOg_OR) | 1956 | #define asm_bor(as, ir) asm_intarith(as, ir, XOg_OR) |
1957 | #define asm_bxor(as, ir) asm_intarith(as, ir, XOg_XOR) | 1957 | #define asm_bxor(as, ir) asm_intarith(as, ir, XOg_XOR) |
1958 | 1958 | ||
1959 | static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs) | 1959 | static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs, x86Op xv) |
1960 | { | 1960 | { |
1961 | IRRef rref = ir->op2; | 1961 | IRRef rref = ir->op2; |
1962 | IRIns *irr = IR(rref); | 1962 | IRIns *irr = IR(rref); |
@@ -1965,11 +1965,27 @@ static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs) | |||
1965 | int shift; | 1965 | int shift; |
1966 | dest = ra_dest(as, ir, RSET_GPR); | 1966 | dest = ra_dest(as, ir, RSET_GPR); |
1967 | shift = irr->i & (irt_is64(ir->t) ? 63 : 31); | 1967 | shift = irr->i & (irt_is64(ir->t) ? 63 : 31); |
1968 | if (!xv && shift && (as->flags & JIT_F_BMI2)) { | ||
1969 | Reg left = asm_fuseloadm(as, ir->op1, RSET_GPR, irt_is64(ir->t)); | ||
1970 | if (left != dest) { /* BMI2 rotate right by constant. */ | ||
1971 | emit_i8(as, xs == XOg_ROL ? -shift : shift); | ||
1972 | emit_mrm(as, VEX_64IR(ir, XV_RORX), dest, left); | ||
1973 | return; | ||
1974 | } | ||
1975 | } | ||
1968 | switch (shift) { | 1976 | switch (shift) { |
1969 | case 0: break; | 1977 | case 0: break; |
1970 | case 1: emit_rr(as, XO_SHIFT1, REX_64IR(ir, xs), dest); break; | 1978 | case 1: emit_rr(as, XO_SHIFT1, REX_64IR(ir, xs), dest); break; |
1971 | default: emit_shifti(as, REX_64IR(ir, xs), dest, shift); break; | 1979 | default: emit_shifti(as, REX_64IR(ir, xs), dest, shift); break; |
1972 | } | 1980 | } |
1981 | } else if ((as->flags & JIT_F_BMI2) && xv) { /* BMI2 variable shifts. */ | ||
1982 | Reg left, right; | ||
1983 | dest = ra_dest(as, ir, RSET_GPR); | ||
1984 | right = ra_alloc1(as, rref, RSET_GPR); | ||
1985 | left = asm_fuseloadm(as, ir->op1, rset_exclude(RSET_GPR, right), | ||
1986 | irt_is64(ir->t)); | ||
1987 | emit_mrm(as, VEX_64IR(ir, xv) ^ (right << 19), dest, left); | ||
1988 | return; | ||
1973 | } else { /* Variable shifts implicitly use register cl (i.e. ecx). */ | 1989 | } else { /* Variable shifts implicitly use register cl (i.e. ecx). */ |
1974 | Reg right; | 1990 | Reg right; |
1975 | dest = ra_dest(as, ir, rset_exclude(RSET_GPR, RID_ECX)); | 1991 | dest = ra_dest(as, ir, rset_exclude(RSET_GPR, RID_ECX)); |
@@ -1995,11 +2011,11 @@ static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs) | |||
1995 | */ | 2011 | */ |
1996 | } | 2012 | } |
1997 | 2013 | ||
1998 | #define asm_bshl(as, ir) asm_bitshift(as, ir, XOg_SHL) | 2014 | #define asm_bshl(as, ir) asm_bitshift(as, ir, XOg_SHL, XV_SHLX) |
1999 | #define asm_bshr(as, ir) asm_bitshift(as, ir, XOg_SHR) | 2015 | #define asm_bshr(as, ir) asm_bitshift(as, ir, XOg_SHR, XV_SHRX) |
2000 | #define asm_bsar(as, ir) asm_bitshift(as, ir, XOg_SAR) | 2016 | #define asm_bsar(as, ir) asm_bitshift(as, ir, XOg_SAR, XV_SARX) |
2001 | #define asm_brol(as, ir) asm_bitshift(as, ir, XOg_ROL) | 2017 | #define asm_brol(as, ir) asm_bitshift(as, ir, XOg_ROL, 0) |
2002 | #define asm_bror(as, ir) asm_bitshift(as, ir, XOg_ROR) | 2018 | #define asm_bror(as, ir) asm_bitshift(as, ir, XOg_ROR, 0) |
2003 | 2019 | ||
2004 | /* -- Comparisons --------------------------------------------------------- */ | 2020 | /* -- Comparisons --------------------------------------------------------- */ |
2005 | 2021 | ||
diff --git a/src/lj_emit_x86.h b/src/lj_emit_x86.h index caf30859..cbaf4e85 100644 --- a/src/lj_emit_x86.h +++ b/src/lj_emit_x86.h | |||
@@ -13,10 +13,12 @@ | |||
13 | if (rex != 0x40) *--(p) = rex; } | 13 | if (rex != 0x40) *--(p) = rex; } |
14 | #define FORCE_REX 0x200 | 14 | #define FORCE_REX 0x200 |
15 | #define REX_64 (FORCE_REX|0x080000) | 15 | #define REX_64 (FORCE_REX|0x080000) |
16 | #define VEX_64 0x800000 | ||
16 | #else | 17 | #else |
17 | #define REXRB(p, rr, rb) ((void)0) | 18 | #define REXRB(p, rr, rb) ((void)0) |
18 | #define FORCE_REX 0 | 19 | #define FORCE_REX 0 |
19 | #define REX_64 0 | 20 | #define REX_64 0 |
21 | #define VEX_64 0 | ||
20 | #endif | 22 | #endif |
21 | 23 | ||
22 | #define emit_i8(as, i) (*--as->mcp = (MCode)(i)) | 24 | #define emit_i8(as, i) (*--as->mcp = (MCode)(i)) |
@@ -31,6 +33,13 @@ static LJ_AINLINE MCode *emit_op(x86Op xo, Reg rr, Reg rb, Reg rx, | |||
31 | MCode *p, int delta) | 33 | MCode *p, int delta) |
32 | { | 34 | { |
33 | int n = (int8_t)xo; | 35 | int n = (int8_t)xo; |
36 | if (n == -60) { /* VEX-encoded instruction */ | ||
37 | #if LJ_64 | ||
38 | xo ^= (((rr>>1)&4)+((rx>>2)&2)+((rb>>3)&1))<<13; | ||
39 | #endif | ||
40 | *(uint32_t *)(p+delta-5) = (uint32_t)xo; | ||
41 | return p+delta-5; | ||
42 | } | ||
34 | #if defined(__GNUC__) | 43 | #if defined(__GNUC__) |
35 | if (__builtin_constant_p(xo) && n == -2) | 44 | if (__builtin_constant_p(xo) && n == -2) |
36 | p[delta-2] = (MCode)(xo >> 24); | 45 | p[delta-2] = (MCode)(xo >> 24); |
@@ -412,8 +421,10 @@ static void emit_call_(ASMState *as, MCode *target) | |||
412 | /* Use 64 bit operations to handle 64 bit IR types. */ | 421 | /* Use 64 bit operations to handle 64 bit IR types. */ |
413 | #if LJ_64 | 422 | #if LJ_64 |
414 | #define REX_64IR(ir, r) ((r) + (irt_is64((ir)->t) ? REX_64 : 0)) | 423 | #define REX_64IR(ir, r) ((r) + (irt_is64((ir)->t) ? REX_64 : 0)) |
424 | #define VEX_64IR(ir, r) ((r) + (irt_is64((ir)->t) ? VEX_64 : 0)) | ||
415 | #else | 425 | #else |
416 | #define REX_64IR(ir, r) (r) | 426 | #define REX_64IR(ir, r) (r) |
427 | #define VEX_64IR(ir, r) (r) | ||
417 | #endif | 428 | #endif |
418 | 429 | ||
419 | /* Generic move between two regs. */ | 430 | /* Generic move between two regs. */ |
diff --git a/src/lj_target_x86.h b/src/lj_target_x86.h index 289f83e1..e29f4748 100644 --- a/src/lj_target_x86.h +++ b/src/lj_target_x86.h | |||
@@ -189,6 +189,11 @@ typedef struct { | |||
189 | #define XO_f20f(o) ((uint32_t)(0x0ff2fc + (0x##o<<24))) | 189 | #define XO_f20f(o) ((uint32_t)(0x0ff2fc + (0x##o<<24))) |
190 | #define XO_f30f(o) ((uint32_t)(0x0ff3fc + (0x##o<<24))) | 190 | #define XO_f30f(o) ((uint32_t)(0x0ff3fc + (0x##o<<24))) |
191 | 191 | ||
192 | #define XV_660f38(o) ((uint32_t)(0x79e2c4 + (0x##o<<24))) | ||
193 | #define XV_f20f38(o) ((uint32_t)(0x7be2c4 + (0x##o<<24))) | ||
194 | #define XV_f20f3a(o) ((uint32_t)(0x7be3c4 + (0x##o<<24))) | ||
195 | #define XV_f30f38(o) ((uint32_t)(0x7ae2c4 + (0x##o<<24))) | ||
196 | |||
192 | /* This list of x86 opcodes is not intended to be complete. Opcodes are only | 197 | /* This list of x86 opcodes is not intended to be complete. Opcodes are only |
193 | ** included when needed. Take a look at DynASM or jit.dis_x86 to see the | 198 | ** included when needed. Take a look at DynASM or jit.dis_x86 to see the |
194 | ** whole mess. | 199 | ** whole mess. |
@@ -231,6 +236,12 @@ typedef enum { | |||
231 | XI_FSCALE = 0xfdd9, | 236 | XI_FSCALE = 0xfdd9, |
232 | XI_FYL2X = 0xf1d9, | 237 | XI_FYL2X = 0xf1d9, |
233 | 238 | ||
239 | /* VEX-encoded instructions. XV_* prefix. */ | ||
240 | XV_RORX = XV_f20f3a(f0), | ||
241 | XV_SARX = XV_f30f38(f7), | ||
242 | XV_SHLX = XV_660f38(f7), | ||
243 | XV_SHRX = XV_f20f38(f7), | ||
244 | |||
234 | /* Variable-length opcodes. XO_* prefix. */ | 245 | /* Variable-length opcodes. XO_* prefix. */ |
235 | XO_MOV = XO_(8b), | 246 | XO_MOV = XO_(8b), |
236 | XO_MOVto = XO_(89), | 247 | XO_MOVto = XO_(89), |