diff options
author | Mike Pall <mike> | 2015-10-24 18:43:47 +0200 |
---|---|---|
committer | Mike Pall <mike> | 2015-10-24 18:43:47 +0200 |
commit | 8a13c9cebf368de9338c3c3c8c30c48d45d717bd (patch) | |
tree | 85480a97cf3b02dcf2412c7540b0f463726f4b42 /dynasm | |
parent | 7e22082480028a467c27d9c32852ec7a12f8235f (diff) | |
download | luajit-8a13c9cebf368de9338c3c3c8c30c48d45d717bd.tar.gz luajit-8a13c9cebf368de9338c3c3c8c30c48d45d717bd.tar.bz2 luajit-8a13c9cebf368de9338c3c3c8c30c48d45d717bd.zip |
DynASM/x86: Add AVX and AVX2 opcodes.
Thanks to Peter Cawley.
Diffstat (limited to 'dynasm')
-rw-r--r-- | dynasm/dasm_x86.h | 7 | ||||
-rw-r--r-- | dynasm/dasm_x86.lua | 402 |
2 files changed, 338 insertions, 71 deletions
diff --git a/dynasm/dasm_x86.h b/dynasm/dasm_x86.h index 652e8c99..175febe0 100644 --- a/dynasm/dasm_x86.h +++ b/dynasm/dasm_x86.h | |||
@@ -391,7 +391,12 @@ int dasm_encode(Dst_DECL, void *buffer) | |||
391 | case DASM_IMM_D: wd: dasmd(n); break; | 391 | case DASM_IMM_D: wd: dasmd(n); break; |
392 | case DASM_IMM_WB: if (((n+128)&-256) == 0) goto db; else mark = NULL; | 392 | case DASM_IMM_WB: if (((n+128)&-256) == 0) goto db; else mark = NULL; |
393 | case DASM_IMM_W: dasmw(n); break; | 393 | case DASM_IMM_W: dasmw(n); break; |
394 | case DASM_VREG: { int t = *p++; if (t >= 2) n<<=3; cp[-1] |= n; break; } | 394 | case DASM_VREG: { |
395 | int t = *p++; | ||
396 | if (t >= 5) n <<= 4; else if (t >= 2) n <<= 3; | ||
397 | cp[-1] ^= n; | ||
398 | break; | ||
399 | } | ||
395 | case DASM_REL_LG: p++; if (n >= 0) goto rel_pc; | 400 | case DASM_REL_LG: p++; if (n >= 0) goto rel_pc; |
396 | b++; n = (int)(ptrdiff_t)D->globals[-n]; | 401 | b++; n = (int)(ptrdiff_t)D->globals[-n]; |
397 | case DASM_REL_A: rel_a: n -= (int)(ptrdiff_t)(cp+4); goto wd; /* !x64 */ | 402 | case DASM_REL_A: rel_a: n -= (int)(ptrdiff_t)(cp+4); goto wd; /* !x64 */ |
diff --git a/dynasm/dasm_x86.lua b/dynasm/dasm_x86.lua index d8203e3d..d85dfec4 100644 --- a/dynasm/dasm_x86.lua +++ b/dynasm/dasm_x86.lua | |||
@@ -27,9 +27,9 @@ local assert, unpack, setmetatable = assert, unpack or table.unpack, setmetatabl | |||
27 | local _s = string | 27 | local _s = string |
28 | local sub, format, byte, char = _s.sub, _s.format, _s.byte, _s.char | 28 | local sub, format, byte, char = _s.sub, _s.format, _s.byte, _s.char |
29 | local find, match, gmatch, gsub = _s.find, _s.match, _s.gmatch, _s.gsub | 29 | local find, match, gmatch, gsub = _s.find, _s.match, _s.gmatch, _s.gsub |
30 | local concat, sort = table.concat, table.sort | 30 | local concat, sort, remove = table.concat, table.sort, table.remove |
31 | local bit = bit or require("bit") | 31 | local bit = bit or require("bit") |
32 | local band, shl, shr = bit.band, bit.lshift, bit.rshift | 32 | local band, bxor, shl, shr = bit.band, bit.bxor, bit.lshift, bit.rshift |
33 | 33 | ||
34 | -- Inherited tables and callbacks. | 34 | -- Inherited tables and callbacks. |
35 | local g_opt, g_arch | 35 | local g_opt, g_arch |
@@ -299,7 +299,7 @@ local function mkrmap(sz, cl, names) | |||
299 | local iname = format("@%s%x%s", sz, i, needrex and "R" or "") | 299 | local iname = format("@%s%x%s", sz, i, needrex and "R" or "") |
300 | if needrex then map_reg_needrex[iname] = true end | 300 | if needrex then map_reg_needrex[iname] = true end |
301 | local name | 301 | local name |
302 | if sz == "o" then name = format("xmm%d", i) | 302 | if sz == "o" or sz == "y" then name = format("%s%d", cl, i) |
303 | elseif sz == "f" then name = format("st%d", i) | 303 | elseif sz == "f" then name = format("st%d", i) |
304 | else name = format("r%d%s", i, sz == addrsize and "" or sz) end | 304 | else name = format("r%d%s", i, sz == addrsize and "" or sz) end |
305 | map_archdef[name] = iname | 305 | map_archdef[name] = iname |
@@ -334,21 +334,24 @@ mkrmap("f", "Rf") | |||
334 | -- SSE registers (oword sized, but qword and dword accessible). | 334 | -- SSE registers (oword sized, but qword and dword accessible). |
335 | mkrmap("o", "xmm") | 335 | mkrmap("o", "xmm") |
336 | 336 | ||
337 | -- AVX registers (yword sized, but oword, qword and dword accessible). | ||
338 | mkrmap("y", "ymm") | ||
339 | |||
337 | -- Operand size prefixes to codes. | 340 | -- Operand size prefixes to codes. |
338 | local map_opsize = { | 341 | local map_opsize = { |
339 | byte = "b", word = "w", dword = "d", qword = "q", oword = "o", tword = "t", | 342 | byte = "b", word = "w", dword = "d", qword = "q", oword = "o", yword = "y", |
340 | aword = addrsize, | 343 | tword = "t", aword = addrsize, |
341 | } | 344 | } |
342 | 345 | ||
343 | -- Operand size code to number. | 346 | -- Operand size code to number. |
344 | local map_opsizenum = { | 347 | local map_opsizenum = { |
345 | b = 1, w = 2, d = 4, q = 8, o = 16, t = 10, | 348 | b = 1, w = 2, d = 4, q = 8, o = 16, y = 32, t = 10, |
346 | } | 349 | } |
347 | 350 | ||
348 | -- Operand size code to name. | 351 | -- Operand size code to name. |
349 | local map_opsizename = { | 352 | local map_opsizename = { |
350 | b = "byte", w = "word", d = "dword", q = "qword", o = "oword", t = "tword", | 353 | b = "byte", w = "word", d = "dword", q = "qword", o = "oword", y = "yword", |
351 | f = "fpword", | 354 | t = "tword", f = "fpword", |
352 | } | 355 | } |
353 | 356 | ||
354 | -- Valid index register scale factors. | 357 | -- Valid index register scale factors. |
@@ -460,7 +463,29 @@ local function wputszarg(sz, n) | |||
460 | end | 463 | end |
461 | 464 | ||
462 | -- Put multi-byte opcode with operand-size dependent modifications. | 465 | -- Put multi-byte opcode with operand-size dependent modifications. |
463 | local function wputop(sz, op, rex) | 466 | local function wputop(sz, op, rex, vex) |
467 | if vex then | ||
468 | local tail | ||
469 | if vex.m == 1 and band(rex, 11) == 0 then | ||
470 | wputb(0xc5) | ||
471 | tail = shl(bxor(band(rex, 4), 4), 5) | ||
472 | else | ||
473 | wputb(0xc4) | ||
474 | wputb(shl(bxor(band(rex, 7), 7), 5) + vex.m) | ||
475 | tail = shl(band(rex, 8), 4) | ||
476 | end | ||
477 | local reg, vreg = 0, nil | ||
478 | if vex.v then | ||
479 | reg = vex.v.reg | ||
480 | if not reg then werror("bad vex operand") end | ||
481 | if reg < 0 then reg = 0; vreg = vex.v.vreg end | ||
482 | end | ||
483 | if sz == "y" or vex.l then tail = tail + 4 end | ||
484 | wputb(tail + shl(bxor(reg, 15), 3) + vex.p) | ||
485 | if vreg then waction("VREG", vreg); wputxb(4) end | ||
486 | rex = 0 | ||
487 | if op >= 256 then werror("bad vex opcode") end | ||
488 | end | ||
464 | local r | 489 | local r |
465 | if rex ~= 0 and not x64 then werror("bad operand size") end | 490 | if rex ~= 0 and not x64 then werror("bad operand size") end |
466 | if sz == "w" then wputb(102) end | 491 | if sz == "w" then wputb(102) end |
@@ -881,9 +906,15 @@ end | |||
881 | -- "m"/"M" generates ModRM/SIB from the 1st/2nd operand. | 906 | -- "m"/"M" generates ModRM/SIB from the 1st/2nd operand. |
882 | -- The spare 3 bits are either filled with the last hex digit or | 907 | -- The spare 3 bits are either filled with the last hex digit or |
883 | -- the result from a previous "r"/"R". The opcode is restored. | 908 | -- the result from a previous "r"/"R". The opcode is restored. |
909 | -- "u" Use VEX encoding, vvvv unused. | ||
910 | -- "v"/"V" Use VEX encoding, vvvv from 1st/2nd operand (the operand is | ||
911 | -- removed from the list used by future characters). | ||
912 | -- "L" Force VEX.L | ||
884 | -- | 913 | -- |
885 | -- All of the following characters force a flush of the opcode: | 914 | -- All of the following characters force a flush of the opcode: |
886 | -- "o"/"O" stores a pure 32 bit disp (offset) from the 1st/2nd operand. | 915 | -- "o"/"O" stores a pure 32 bit disp (offset) from the 1st/2nd operand. |
916 | -- "s" stores a 4 bit immediate from the last register operand, | ||
917 | -- followed by 4 zero bits. | ||
887 | -- "S" stores a signed 8 bit immediate from the last operand. | 918 | -- "S" stores a signed 8 bit immediate from the last operand. |
888 | -- "U" stores an unsigned 8 bit immediate from the last operand. | 919 | -- "U" stores an unsigned 8 bit immediate from the last operand. |
889 | -- "W" stores an unsigned 16 bit immediate from the last operand. | 920 | -- "W" stores an unsigned 16 bit immediate from the last operand. |
@@ -1225,46 +1256,14 @@ local map_op = { | |||
1225 | movups_2 = "rmo:0F10rM|mro:0F11Rm", | 1256 | movups_2 = "rmo:0F10rM|mro:0F11Rm", |
1226 | orpd_2 = "rmo:660F56rM", | 1257 | orpd_2 = "rmo:660F56rM", |
1227 | orps_2 = "rmo:0F56rM", | 1258 | orps_2 = "rmo:0F56rM", |
1228 | packssdw_2 = "rmo:660F6BrM", | ||
1229 | packsswb_2 = "rmo:660F63rM", | ||
1230 | packuswb_2 = "rmo:660F67rM", | ||
1231 | paddb_2 = "rmo:660FFCrM", | ||
1232 | paddd_2 = "rmo:660FFErM", | ||
1233 | paddq_2 = "rmo:660FD4rM", | ||
1234 | paddsb_2 = "rmo:660FECrM", | ||
1235 | paddsw_2 = "rmo:660FEDrM", | ||
1236 | paddusb_2 = "rmo:660FDCrM", | ||
1237 | paddusw_2 = "rmo:660FDDrM", | ||
1238 | paddw_2 = "rmo:660FFDrM", | ||
1239 | pand_2 = "rmo:660FDBrM", | ||
1240 | pandn_2 = "rmo:660FDFrM", | ||
1241 | pause_0 = "F390", | 1259 | pause_0 = "F390", |
1242 | pavgb_2 = "rmo:660FE0rM", | ||
1243 | pavgw_2 = "rmo:660FE3rM", | ||
1244 | pcmpeqb_2 = "rmo:660F74rM", | ||
1245 | pcmpeqd_2 = "rmo:660F76rM", | ||
1246 | pcmpeqw_2 = "rmo:660F75rM", | ||
1247 | pcmpgtb_2 = "rmo:660F64rM", | ||
1248 | pcmpgtd_2 = "rmo:660F66rM", | ||
1249 | pcmpgtw_2 = "rmo:660F65rM", | ||
1250 | pextrw_3 = "rri/do:660FC5rMU|xri/wo:660F3A15nRmU", -- Mem op: SSE4.1 only. | 1260 | pextrw_3 = "rri/do:660FC5rMU|xri/wo:660F3A15nRmU", -- Mem op: SSE4.1 only. |
1251 | pinsrw_3 = "rri/od:660FC4rMU|rxi/ow:", | 1261 | pinsrw_3 = "rri/od:660FC4rMU|rxi/ow:", |
1252 | pmaddwd_2 = "rmo:660FF5rM", | ||
1253 | pmaxsw_2 = "rmo:660FEErM", | ||
1254 | pmaxub_2 = "rmo:660FDErM", | ||
1255 | pminsw_2 = "rmo:660FEArM", | ||
1256 | pminub_2 = "rmo:660FDArM", | ||
1257 | pmovmskb_2 = "rr/do:660FD7rM", | 1262 | pmovmskb_2 = "rr/do:660FD7rM", |
1258 | pmulhuw_2 = "rmo:660FE4rM", | ||
1259 | pmulhw_2 = "rmo:660FE5rM", | ||
1260 | pmullw_2 = "rmo:660FD5rM", | ||
1261 | pmuludq_2 = "rmo:660FF4rM", | ||
1262 | por_2 = "rmo:660FEBrM", | ||
1263 | prefetchnta_1 = "xb:n0F180m", | 1263 | prefetchnta_1 = "xb:n0F180m", |
1264 | prefetcht0_1 = "xb:n0F181m", | 1264 | prefetcht0_1 = "xb:n0F181m", |
1265 | prefetcht1_1 = "xb:n0F182m", | 1265 | prefetcht1_1 = "xb:n0F182m", |
1266 | prefetcht2_1 = "xb:n0F183m", | 1266 | prefetcht2_1 = "xb:n0F183m", |
1267 | psadbw_2 = "rmo:660FF6rM", | ||
1268 | pshufd_3 = "rmio:660F70rMU", | 1267 | pshufd_3 = "rmio:660F70rMU", |
1269 | pshufhw_3 = "rmio:F30F70rMU", | 1268 | pshufhw_3 = "rmio:F30F70rMU", |
1270 | pshuflw_3 = "rmio:F20F70rMU", | 1269 | pshuflw_3 = "rmio:F20F70rMU", |
@@ -1278,23 +1277,6 @@ local map_op = { | |||
1278 | psrldq_2 = "rio:660F733mU", | 1277 | psrldq_2 = "rio:660F733mU", |
1279 | psrlq_2 = "rmo:660FD3rM|rio:660F732mU", | 1278 | psrlq_2 = "rmo:660FD3rM|rio:660F732mU", |
1280 | psrlw_2 = "rmo:660FD1rM|rio:660F712mU", | 1279 | psrlw_2 = "rmo:660FD1rM|rio:660F712mU", |
1281 | psubb_2 = "rmo:660FF8rM", | ||
1282 | psubd_2 = "rmo:660FFArM", | ||
1283 | psubq_2 = "rmo:660FFBrM", | ||
1284 | psubsb_2 = "rmo:660FE8rM", | ||
1285 | psubsw_2 = "rmo:660FE9rM", | ||
1286 | psubusb_2 = "rmo:660FD8rM", | ||
1287 | psubusw_2 = "rmo:660FD9rM", | ||
1288 | psubw_2 = "rmo:660FF9rM", | ||
1289 | punpckhbw_2 = "rmo:660F68rM", | ||
1290 | punpckhdq_2 = "rmo:660F6ArM", | ||
1291 | punpckhqdq_2 = "rmo:660F6DrM", | ||
1292 | punpckhwd_2 = "rmo:660F69rM", | ||
1293 | punpcklbw_2 = "rmo:660F60rM", | ||
1294 | punpckldq_2 = "rmo:660F62rM", | ||
1295 | punpcklqdq_2 = "rmo:660F6CrM", | ||
1296 | punpcklwd_2 = "rmo:660F61rM", | ||
1297 | pxor_2 = "rmo:660FEFrM", | ||
1298 | rcpps_2 = "rmo:0F53rM", | 1280 | rcpps_2 = "rmo:0F53rM", |
1299 | rcpss_2 = "rro:F30F53rM|rx/od:", | 1281 | rcpss_2 = "rro:F30F53rM|rx/od:", |
1300 | rsqrtps_2 = "rmo:0F52rM", | 1282 | rsqrtps_2 = "rmo:0F52rM", |
@@ -1421,6 +1403,223 @@ local map_op = { | |||
1421 | aesimc_2 = "rmo:660F38DBrM", | 1403 | aesimc_2 = "rmo:660F38DBrM", |
1422 | aeskeygenassist_3 = "rmio:660F3ADFrMU", | 1404 | aeskeygenassist_3 = "rmio:660F3ADFrMU", |
1423 | pclmulqdq_3 = "rmio:660F3A44rMU", | 1405 | pclmulqdq_3 = "rmio:660F3A44rMU", |
1406 | |||
1407 | -- AVX FP ops | ||
1408 | vaddsubpd_3 = "rrmoy:660FVD0rM", | ||
1409 | vaddsubps_3 = "rrmoy:F20FVD0rM", | ||
1410 | vandpd_3 = "rrmoy:660FV54rM", | ||
1411 | vandps_3 = "rrmoy:0FV54rM", | ||
1412 | vandnpd_3 = "rrmoy:660FV55rM", | ||
1413 | vandnps_3 = "rrmoy:0FV55rM", | ||
1414 | vblendpd_4 = "rrmioy:660F3AV0DrMU", | ||
1415 | vblendps_4 = "rrmioy:660F3AV0CrMU", | ||
1416 | vblendvpd_4 = "rrmroy:660F3AV4BrMs", | ||
1417 | vblendvps_4 = "rrmroy:660F3AV4ArMs", | ||
1418 | vbroadcastf128_2 = "rx/yo:660F38u1ArM", | ||
1419 | vcmppd_4 = "rrmioy:660FVC2rMU", | ||
1420 | vcmpps_4 = "rrmioy:0FVC2rMU", | ||
1421 | vcmpsd_4 = "rrrio:F20FVC2rMU|rrxi/ooq:", | ||
1422 | vcmpss_4 = "rrrio:F30FVC2rMU|rrxi/ood:", | ||
1423 | vcomisd_2 = "rro:660Fu2FrM|rx/oq:", | ||
1424 | vcomiss_2 = "rro:0Fu2FrM|rx/od:", | ||
1425 | vcvtdq2pd_2 = "rro:F30FuE6rM|rx/oq:|rm/yo:", | ||
1426 | vcvtdq2ps_2 = "rmoy:0Fu5BrM", | ||
1427 | vcvtpd2dq_2 = "rmoy:F20FuE6rM", | ||
1428 | vcvtpd2ps_2 = "rmoy:660Fu5ArM", | ||
1429 | vcvtps2dq_2 = "rmoy:660Fu5BrM", | ||
1430 | vcvtps2pd_2 = "rro:0Fu5ArM|rx/oq:|rm/yo:", | ||
1431 | vcvtsd2si_2 = "rr/do:F20Fu2DrM|rx/dq:|rr/qo:|rxq:", | ||
1432 | vcvtsd2ss_3 = "rrro:F20FV5ArM|rrx/ooq:", | ||
1433 | vcvtsi2sd_3 = "rrm/ood:F20FV2ArM|rrm/ooq:F20FVX2ArM", | ||
1434 | vcvtsi2ss_3 = "rrm/ood:F30FV2ArM|rrm/ooq:F30FVX2ArM", | ||
1435 | vcvtss2sd_3 = "rrro:F30FV5ArM|rrx/ood:", | ||
1436 | vcvtss2si_2 = "rr/do:F30Fu2DrM|rxd:|rr/qo:|rx/qd:", | ||
1437 | vcvttpd2dq_2 = "rmo:660FuE6rM|rm/oy:660FuLE6rM", | ||
1438 | vcvttps2dq_2 = "rmoy:F30Fu5BrM", | ||
1439 | vcvttsd2si_2 = "rr/do:F20Fu2CrM|rx/dq:|rr/qo:|rxq:", | ||
1440 | vcvttss2si_2 = "rr/do:F30Fu2CrM|rxd:|rr/qo:|rx/qd:", | ||
1441 | vdppd_4 = "rrmio:660F3AV41rMU", | ||
1442 | vdpps_4 = "rrmioy:660F3AV40rMU", | ||
1443 | vextractf128_3 = "mri/oy:660F3AuL19RmU", | ||
1444 | vextractps_3 = "mri/do:660F3Au17RmU", | ||
1445 | vhaddpd_3 = "rrmoy:660FV7CrM", | ||
1446 | vhaddps_3 = "rrmoy:F20FV7CrM", | ||
1447 | vhsubpd_3 = "rrmoy:660FV7DrM", | ||
1448 | vhsubps_3 = "rrmoy:F20FV7DrM", | ||
1449 | vinsertf128_4 = "rrmi/yyo:660F3AV18rMU", | ||
1450 | vinsertps_4 = "rrrio:660F3AV21rMU|rrxi/ood:", | ||
1451 | vldmxcsr_1 = "xd:0FuAE2m", | ||
1452 | vmaskmovps_3 = "rrxoy:660F38V2CrM|xrroy:660F38V2ERm", | ||
1453 | vmaskmovpd_3 = "rrxoy:660F38V2DrM|xrroy:660F38V2FRm", | ||
1454 | vmovapd_2 = "rmoy:660Fu28rM|mroy:660Fu29Rm", | ||
1455 | vmovaps_2 = "rmoy:0Fu28rM|mroy:0Fu29Rm", | ||
1456 | vmovd_2 = "rm/od:660Fu6ErM|rm/oq:660FuX6ErM|mr/do:660Fu7ERm|mr/qo:", | ||
1457 | vmovq_2 = "rro:F30Fu7ErM|rx/oq:|xr/qo:660FuD6Rm", | ||
1458 | vmovddup_2 = "rmy:F20Fu12rM|rro:|rx/oq:", | ||
1459 | vmovhlps_3 = "rrro:0FV12rM", | ||
1460 | vmovhpd_2 = "xr/qo:660Fu17Rm", | ||
1461 | vmovhpd_3 = "rrx/ooq:660FV16rM", | ||
1462 | vmovhps_2 = "xr/qo:0Fu17Rm", | ||
1463 | vmovhps_3 = "rrx/ooq:0FV16rM", | ||
1464 | vmovlhps_3 = "rrro:0FV16rM", | ||
1465 | vmovlpd_2 = "xr/qo:660Fu13Rm", | ||
1466 | vmovlpd_3 = "rrx/ooq:660FV12rM", | ||
1467 | vmovlps_2 = "xr/qo:0Fu13Rm", | ||
1468 | vmovlps_3 = "rrx/ooq:0FV12rM", | ||
1469 | vmovmskpd_2 = "rr/do:660Fu50rM|rr/dy:660FuL50rM", | ||
1470 | vmovmskps_2 = "rr/do:0Fu50rM|rr/dy:0FuL50rM", | ||
1471 | vmovntpd_2 = "xroy:660Fu2BRm", | ||
1472 | vmovntps_2 = "xroy:0Fu2BRm", | ||
1473 | vmovsd_2 = "rx/oq:F20Fu10rM|xr/qo:F20Fu11Rm", | ||
1474 | vmovsd_3 = "rrro:F20FV10rM", | ||
1475 | vmovshdup_2 = "rmoy:F30Fu16rM", | ||
1476 | vmovsldup_2 = "rmoy:F30Fu12rM", | ||
1477 | vmovss_2 = "rx/od:F30Fu10rM|xr/do:F30Fu11Rm", | ||
1478 | vmovss_3 = "rrro:F30FV10rM", | ||
1479 | vmovupd_2 = "rmoy:660Fu10rM|mroy:660Fu11Rm", | ||
1480 | vmovups_2 = "rmoy:0Fu10rM|mroy:0Fu11Rm", | ||
1481 | vorpd_3 = "rrmoy:660FV56rM", | ||
1482 | vorps_3 = "rrmoy:0FV56rM", | ||
1483 | vpermilpd_3 = "rrmoy:660F38V0DrM|rmioy:660F3Au05rMU", | ||
1484 | vpermilps_3 = "rrmoy:660F38V0CrM|rmioy:660F3Au04rMU", | ||
1485 | vperm2f128_4 = "rrmiy:660F3AV06rMU", | ||
1486 | vptestpd_2 = "rmoy:660F38u0FrM", | ||
1487 | vptestps_2 = "rmoy:660F38u0ErM", | ||
1488 | vrcpps_2 = "rmoy:0Fu53rM", | ||
1489 | vrcpss_3 = "rrro:F30FV53rM|rrx/ood:", | ||
1490 | vrsqrtps_2 = "rmoy:0Fu52rM", | ||
1491 | vrsqrtss_3 = "rrro:F30FV52rM|rrx/ood:", | ||
1492 | vroundpd_3 = "rmioy:660F3AV09rMU", | ||
1493 | vroundps_3 = "rmioy:660F3AV08rMU", | ||
1494 | vroundsd_4 = "rrrio:660F3AV0BrMU|rrxi/ooq:", | ||
1495 | vroundss_4 = "rrrio:660F3AV0ArMU|rrxi/ood:", | ||
1496 | vshufpd_4 = "rrmioy:660FVC6rMU", | ||
1497 | vshufps_4 = "rrmioy:0FVC6rMU", | ||
1498 | vsqrtps_2 = "rmoy:0Fu51rM", | ||
1499 | vsqrtss_2 = "rro:F30Fu51rM|rx/od:", | ||
1500 | vsqrtpd_2 = "rmoy:660Fu51rM", | ||
1501 | vsqrtsd_2 = "rro:F20Fu51rM|rx/oq:", | ||
1502 | vstmxcsr_1 = "xd:0FuAE3m", | ||
1503 | vucomisd_2 = "rro:660Fu2ErM|rx/oq:", | ||
1504 | vucomiss_2 = "rro:0Fu2ErM|rx/od:", | ||
1505 | vunpckhpd_3 = "rrmoy:660FV15rM", | ||
1506 | vunpckhps_3 = "rrmoy:0FV15rM", | ||
1507 | vunpcklpd_3 = "rrmoy:660FV14rM", | ||
1508 | vunpcklps_3 = "rrmoy:0FV14rM", | ||
1509 | vxorpd_3 = "rrmoy:660FV57rM", | ||
1510 | vxorps_3 = "rrmoy:0FV57rM", | ||
1511 | vzeroall_0 = "0FuL77", | ||
1512 | vzeroupper_0 = "0Fu77", | ||
1513 | |||
1514 | -- AVX2 FP ops | ||
1515 | vbroadcastss_2 = "rx/od:660F38u18rM|rx/yd:|rro:|rr/yo:", | ||
1516 | vbroadcastsd_2 = "rx/yq:660F38u19rM|rr/yo:", | ||
1517 | -- *vgather* (!vsib) | ||
1518 | vpermpd_3 = "rmiy:660F3AuX01rMU", | ||
1519 | vpermps_3 = "rrmy:660F38V16rM", | ||
1520 | |||
1521 | -- AVX, AVX2 integer ops | ||
1522 | -- In general, xmm requires AVX, ymm requires AVX2. | ||
1523 | vlddqu_2 = "rxoy:F20FuF0rM", | ||
1524 | vmaskmovdqu_2 = "rro:660FuF7rM", | ||
1525 | vmovdqa_2 = "rmoy:660Fu6FrM|mroy:660Fu7FRm", | ||
1526 | vmovdqu_2 = "rmoy:F30Fu6FrM|mroy:F30Fu7FRm", | ||
1527 | vmovntdq_2 = "xroy:660FuE7Rm", | ||
1528 | vmovntdqa_2 = "rxoy:660F38u2ArM", | ||
1529 | vmpsadbw_4 = "rrmioy:660F3AV42rMU", | ||
1530 | vpabsb_2 = "rmoy:660F38u1CrM", | ||
1531 | vpabsd_2 = "rmoy:660F38u1ErM", | ||
1532 | vpabsw_2 = "rmoy:660F38u1DrM", | ||
1533 | vpackusdw_3 = "rrmoy:660F38V2BrM", | ||
1534 | vpalignr_4 = "rrmioy:660F3AV0FrMU", | ||
1535 | vpblendvb_4 = "rrmroy:660F3AV4CrMs", | ||
1536 | vpblendw_4 = "rrmioy:660F3AV0ErMU", | ||
1537 | vpclmulqdq_4 = "rrmio:660F3AV44rMU", | ||
1538 | vpcmpeqq_3 = "rrmoy:660F38V29rM", | ||
1539 | vpcmpestri_3 = "rmio:660F3Au61rMU", | ||
1540 | vpcmpestrm_3 = "rmio:660F3Au60rMU", | ||
1541 | vpcmpgtq_3 = "rrmoy:660F38V37rM", | ||
1542 | vpcmpistri_3 = "rmio:660F3Au63rMU", | ||
1543 | vpcmpistrm_3 = "rmio:660F3Au62rMU", | ||
1544 | vpextrb_3 = "rri/do:660F3Au14nRmU|rri/qo:|xri/bo:", | ||
1545 | vpextrw_3 = "rri/do:660FuC5rMU|xri/wo:660F3Au15nRmU", | ||
1546 | vpextrd_3 = "mri/do:660F3Au16RmU", | ||
1547 | vpextrq_3 = "mri/qo:660F3Au16RmU", | ||
1548 | vphaddw_3 = "rrmoy:660F38V01rM", | ||
1549 | vphaddd_3 = "rrmoy:660F38V02rM", | ||
1550 | vphaddsw_3 = "rrmoy:660F38V03rM", | ||
1551 | vphminposuw_2 = "rmo:660F38u41rM", | ||
1552 | vphsubw_3 = "rrmoy:660F38V05rM", | ||
1553 | vphsubd_3 = "rrmoy:660F38V06rM", | ||
1554 | vphsubsw_3 = "rrmoy:660F38V07rM", | ||
1555 | vpinsrb_4 = "rrri/ood:660F3AV20rMU|rrxi/oob:", | ||
1556 | vpinsrw_4 = "rrri/ood:660FVC4rMU|rrxi/oow:", | ||
1557 | vpinsrd_4 = "rrmi/ood:660F3AV22rMU", | ||
1558 | vpinsrq_4 = "rrmi/ooq:660F3AVX22rMU", | ||
1559 | vpmaddubsw_3 = "rrmoy:660F38V04rM", | ||
1560 | vpmaxsb_3 = "rrmoy:660F38V3CrM", | ||
1561 | vpmaxsd_3 = "rrmoy:660F38V3DrM", | ||
1562 | vpmaxuw_3 = "rrmoy:660F38V3ErM", | ||
1563 | vpmaxud_3 = "rrmoy:660F38V3FrM", | ||
1564 | vpminsb_3 = "rrmoy:660F38V38rM", | ||
1565 | vpminsd_3 = "rrmoy:660F38V39rM", | ||
1566 | vpminuw_3 = "rrmoy:660F38V3ArM", | ||
1567 | vpminud_3 = "rrmoy:660F38V3BrM", | ||
1568 | vpmovmskb_2 = "rr/do:660FuD7rM|rr/dy:660FuLD7rM", | ||
1569 | vpmovsxbw_2 = "rroy:660F38u20rM|rx/oq:|rx/yo:", | ||
1570 | vpmovsxbd_2 = "rroy:660F38u21rM|rx/od:|rx/yq:", | ||
1571 | vpmovsxbq_2 = "rroy:660F38u22rM|rx/ow:|rx/yd:", | ||
1572 | vpmovsxwd_2 = "rroy:660F38u23rM|rx/oq:|rx/yo:", | ||
1573 | vpmovsxwq_2 = "rroy:660F38u24rM|rx/od:|rx/yq:", | ||
1574 | vpmovsxdq_2 = "rroy:660F38u25rM|rx/oq:|rx/yo:", | ||
1575 | vpmovzxbw_2 = "rroy:660F38u30rM|rx/oq:|rx/yo:", | ||
1576 | vpmovzxbd_2 = "rroy:660F38u31rM|rx/od:|rx/yq:", | ||
1577 | vpmovzxbq_2 = "rroy:660F38u32rM|rx/ow:|rx/yd:", | ||
1578 | vpmovzxwd_2 = "rroy:660F38u33rM|rx/oq:|rx/yo:", | ||
1579 | vpmovzxwq_2 = "rroy:660F38u34rM|rx/od:|rx/yq:", | ||
1580 | vpmovzxdq_2 = "rroy:660F38u35rM|rx/oq:|rx/yo:", | ||
1581 | vpmuldq_3 = "rrmoy:660F38V28rM", | ||
1582 | vpmulhrsw_3 = "rrmoy:660F38V0BrM", | ||
1583 | vpmulld_3 = "rrmoy:660F38V40rM", | ||
1584 | vpshufb_3 = "rrmoy:660F38V00rM", | ||
1585 | vpshufd_3 = "rmioy:660Fu70rMU", | ||
1586 | vpshufhw_3 = "rmioy:F30Fu70rMU", | ||
1587 | vpshuflw_3 = "rmioy:F20Fu70rMU", | ||
1588 | vpsignb_3 = "rrmoy:660F38V08rM", | ||
1589 | vpsignw_3 = "rrmoy:660F38V09rM", | ||
1590 | vpsignd_3 = "rrmoy:660F38V0ArM", | ||
1591 | vpslldq_3 = "rrioy:660Fv737mU", | ||
1592 | vpsllw_3 = "rrmoy:660FVF1rM|rrioy:660Fv716mU", | ||
1593 | vpslld_3 = "rrmoy:660FVF2rM|rrioy:660Fv726mU", | ||
1594 | vpsllq_3 = "rrmoy:660FVF3rM|rrioy:660Fv736mU", | ||
1595 | vpsraw_3 = "rrmoy:660FVE1rM|rrioy:660Fv714mU", | ||
1596 | vpsrad_3 = "rrmoy:660FVE2rM|rrioy:660Fv724mU", | ||
1597 | vpsrldq_3 = "rrioy:660Fv733mU", | ||
1598 | vpsrlw_3 = "rrmoy:660FVD1rM|rrioy:660Fv712mU", | ||
1599 | vpsrld_3 = "rrmoy:660FVD2rM|rrioy:660Fv722mU", | ||
1600 | vpsrlq_3 = "rrmoy:660FVD3rM|rrioy:660Fv732mU", | ||
1601 | vptest_2 = "rmoy:660F38u17rM", | ||
1602 | |||
1603 | -- AVX2 integer ops | ||
1604 | vbroadcasti128_2 = "rx/yo:660F38u5ArM", | ||
1605 | vinserti128_4 = "rrmi/yyo:660F3AV38rMU", | ||
1606 | vextracti128_3 = "mri/oy:660F3AuL39RmU", | ||
1607 | vpblendd_4 = "rrmioy:660F3AV02rMU", | ||
1608 | vpbroadcastb_2 = "rro:660F38u78rM|rx/ob:|rr/yo:|rx/yb:", | ||
1609 | vpbroadcastw_2 = "rro:660F38u79rM|rx/ow:|rr/yo:|rx/yw:", | ||
1610 | vpbroadcastd_2 = "rro:660F38u58rM|rx/od:|rr/yo:|rx/yd:", | ||
1611 | vpbroadcastq_2 = "rro:660F38u59rM|rx/oq:|rr/yo:|rx/yq:", | ||
1612 | vpermd_3 = "rrmy:660F38V36rM", | ||
1613 | vpermq_3 = "rmiy:660F3AuX00rMU", | ||
1614 | -- *vpgather* (!vsib) | ||
1615 | vperm2i128_4 = "rrmiy:660F3AV46rMU", | ||
1616 | vpmaskmovd_3 = "rrxoy:660F38V8CrM|xrroy:660F38V8ERm", | ||
1617 | vpmaskmovq_3 = "rrxoy:660F38VX8CrM|xrroy:660F38VX8ERm", | ||
1618 | vpsllvd_3 = "rrmoy:660F38V47rM", | ||
1619 | vpsllvq_3 = "rrmoy:660F38VX47rM", | ||
1620 | vpsravd_3 = "rrmoy:660F38V46rM", | ||
1621 | vpsrlvd_3 = "rrmoy:660F38V45rM", | ||
1622 | vpsrlvq_3 = "rrmoy:660F38VX45rM", | ||
1424 | } | 1623 | } |
1425 | 1624 | ||
1426 | ------------------------------------------------------------------------------ | 1625 | ------------------------------------------------------------------------------ |
@@ -1471,28 +1670,58 @@ for cc,n in pairs{ b=0, e=1, be=2, u=3, nb=4, ne=5, nbe=6, nu=7 } do | |||
1471 | map_op["fcmov"..cc.."_2"] = format("Fff:%04XR", nc) -- P6+ | 1670 | map_op["fcmov"..cc.."_2"] = format("Fff:%04XR", nc) -- P6+ |
1472 | end | 1671 | end |
1473 | 1672 | ||
1474 | -- SSE FP arithmetic ops. | 1673 | -- SSE / AVX FP arithmetic ops. |
1475 | for name,n in pairs{ sqrt = 1, add = 8, mul = 9, | 1674 | for name,n in pairs{ sqrt = 1, add = 8, mul = 9, |
1476 | sub = 12, min = 13, div = 14, max = 15 } do | 1675 | sub = 12, min = 13, div = 14, max = 15 } do |
1477 | map_op[name.."ps_2"] = format("rmo:0F5%XrM", n) | 1676 | map_op[name.."ps_2"] = format("rmo:0F5%XrM", n) |
1478 | map_op[name.."ss_2"] = format("rro:F30F5%XrM|rx/od:", n) | 1677 | map_op[name.."ss_2"] = format("rro:F30F5%XrM|rx/od:", n) |
1479 | map_op[name.."pd_2"] = format("rmo:660F5%XrM", n) | 1678 | map_op[name.."pd_2"] = format("rmo:660F5%XrM", n) |
1480 | map_op[name.."sd_2"] = format("rro:F20F5%XrM|rx/oq:", n) | 1679 | map_op[name.."sd_2"] = format("rro:F20F5%XrM|rx/oq:", n) |
1680 | if n ~= 1 then | ||
1681 | map_op["v"..name.."ps_3"] = format("rrmoy:0FV5%XrM", n) | ||
1682 | map_op["v"..name.."ss_3"] = format("rrro:F30FV5%XrM|rrx/ood:", n) | ||
1683 | map_op["v"..name.."pd_3"] = format("rrmoy:660FV5%XrM", n) | ||
1684 | map_op["v"..name.."sd_3"] = format("rrro:F20FV5%XrM|rrx/ooq:", n) | ||
1685 | end | ||
1686 | end | ||
1687 | |||
1688 | -- SSE2 / AVX / AVX2 integer arithmetic ops (66 0F leaf). | ||
1689 | for name,n in pairs{ | ||
1690 | paddb = 0xFC, paddw = 0xFD, paddd = 0xFE, paddq = 0xD4, | ||
1691 | paddsb = 0xEC, paddsw = 0xED, packssdw = 0x6B, | ||
1692 | packsswb = 0x63, packuswb = 0x67, paddusb = 0xDC, | ||
1693 | paddusw = 0xDD, pand = 0xDB, pandn = 0xDF, pavgb = 0xE0, | ||
1694 | pavgw = 0xE3, pcmpeqb = 0x74, pcmpeqd = 0x76, | ||
1695 | pcmpeqw = 0x75, pcmpgtb = 0x64, pcmpgtd = 0x66, | ||
1696 | pcmpgtw = 0x65, pmaddwd = 0xF5, pmaxsw = 0xEE, | ||
1697 | pmaxub = 0xDE, pminsw = 0xEA, pminub = 0xDA, | ||
1698 | pmulhuw = 0xE4, pmulhw = 0xE5, pmullw = 0xD5, | ||
1699 | pmuludq = 0xF4, por = 0xEB, psadbw = 0xF6, psubb = 0xF8, | ||
1700 | psubw = 0xF9, psubd = 0xFA, psubq = 0xFB, psubsb = 0xE8, | ||
1701 | psubsw = 0xE9, psubusb = 0xD8, psubusw = 0xD9, | ||
1702 | punpckhbw = 0x68, punpckhwd = 0x69, punpckhdq = 0x6A, | ||
1703 | punpckhqdq = 0x6D, punpcklbw = 0x60, punpcklwd = 0x61, | ||
1704 | punpckldq = 0x62, punpcklqdq = 0x6C, pxor = 0xEF | ||
1705 | } do | ||
1706 | map_op[name.."_2"] = format("rmo:660F%02XrM", n) | ||
1707 | map_op["v"..name.."_3"] = format("rrmoy:660FV%02XrM", n) | ||
1481 | end | 1708 | end |
1482 | 1709 | ||
1483 | ------------------------------------------------------------------------------ | 1710 | ------------------------------------------------------------------------------ |
1484 | 1711 | ||
1712 | local map_vexarg = { u = false, v = 1, V = 2 } | ||
1713 | |||
1485 | -- Process pattern string. | 1714 | -- Process pattern string. |
1486 | local function dopattern(pat, args, sz, op, needrex) | 1715 | local function dopattern(pat, args, sz, op, needrex) |
1487 | local digit, addin | 1716 | local digit, addin, vex |
1488 | local opcode = 0 | 1717 | local opcode = 0 |
1489 | local szov = sz | 1718 | local szov = sz |
1490 | local narg = 1 | 1719 | local narg = 1 |
1491 | local rex = 0 | 1720 | local rex = 0 |
1492 | 1721 | ||
1493 | -- Limit number of section buffer positions used by a single dasm_put(). | 1722 | -- Limit number of section buffer positions used by a single dasm_put(). |
1494 | -- A single opcode needs a maximum of 5 positions. | 1723 | -- A single opcode needs a maximum of 6 positions. |
1495 | if secpos+5 > maxsecpos then wflush() end | 1724 | if secpos+6 > maxsecpos then wflush() end |
1496 | 1725 | ||
1497 | -- Process each character. | 1726 | -- Process each character. |
1498 | for c in gmatch(pat.."|", ".") do | 1727 | for c in gmatch(pat.."|", ".") do |
@@ -1506,6 +1735,8 @@ local function dopattern(pat, args, sz, op, needrex) | |||
1506 | szov = nil | 1735 | szov = nil |
1507 | elseif c == "X" then -- Force REX.W. | 1736 | elseif c == "X" then -- Force REX.W. |
1508 | rex = 8 | 1737 | rex = 8 |
1738 | elseif c == "L" then -- Force VEX.L. | ||
1739 | vex.l = true | ||
1509 | elseif c == "r" then -- Merge 1st operand regno. into opcode. | 1740 | elseif c == "r" then -- Merge 1st operand regno. into opcode. |
1510 | addin = args[1]; opcode = opcode + (addin.reg % 8) | 1741 | addin = args[1]; opcode = opcode + (addin.reg % 8) |
1511 | if narg < 2 then narg = 2 end | 1742 | if narg < 2 then narg = 2 end |
@@ -1529,21 +1760,41 @@ local function dopattern(pat, args, sz, op, needrex) | |||
1529 | if t.xreg and t.xreg > 7 then rex = rex + 2 end | 1760 | if t.xreg and t.xreg > 7 then rex = rex + 2 end |
1530 | if s > 7 then rex = rex + 4 end | 1761 | if s > 7 then rex = rex + 4 end |
1531 | if needrex then rex = rex + 16 end | 1762 | if needrex then rex = rex + 16 end |
1532 | wputop(szov, opcode, rex); opcode = nil | 1763 | wputop(szov, opcode, rex, vex); opcode = nil |
1533 | local imark = sub(pat, -1) -- Force a mark (ugly). | 1764 | local imark = sub(pat, -1) -- Force a mark (ugly). |
1534 | -- Put ModRM/SIB with regno/last digit as spare. | 1765 | -- Put ModRM/SIB with regno/last digit as spare. |
1535 | wputmrmsib(t, imark, s, addin and addin.vreg) | 1766 | wputmrmsib(t, imark, s, addin and addin.vreg) |
1536 | addin = nil | 1767 | addin = nil |
1768 | elseif map_vexarg[c] ~= nil then -- Encode using VEX prefix | ||
1769 | local b = band(opcode, 255); opcode = shr(opcode, 8) | ||
1770 | local m = 1 | ||
1771 | if b == 0x38 then m = 2 | ||
1772 | elseif b == 0x3a then m = 3 end | ||
1773 | if m ~= 1 then b = band(opcode, 255); opcode = shr(opcode, 8) end | ||
1774 | if b ~= 0x0f then | ||
1775 | werror("expected `0F', `0F38', or `0F3A' to precede `"..c.. | ||
1776 | "' in pattern `"..pat.."' for `"..op.."'") | ||
1777 | end | ||
1778 | local v = map_vexarg[c] | ||
1779 | if v then v = remove(args, v) end | ||
1780 | b = band(opcode, 255) | ||
1781 | local p = 0 | ||
1782 | if b == 0x66 then p = 1 | ||
1783 | elseif b == 0xf3 then p = 2 | ||
1784 | elseif b == 0xf2 then p = 3 end | ||
1785 | if p ~= 0 then opcode = shr(opcode, 8) end | ||
1786 | if opcode ~= 0 then wputop(nil, opcode, 0); opcode = 0 end | ||
1787 | vex = { m = m, p = p, v = v } | ||
1537 | else | 1788 | else |
1538 | if opcode then -- Flush opcode. | 1789 | if opcode then -- Flush opcode. |
1539 | if szov == "q" and rex == 0 then rex = rex + 8 end | 1790 | if szov == "q" and rex == 0 then rex = rex + 8 end |
1540 | if needrex then rex = rex + 16 end | 1791 | if needrex then rex = rex + 16 end |
1541 | if addin and addin.reg == -1 then | 1792 | if addin and addin.reg == -1 then |
1542 | wputop(szov, opcode - 7, rex) | 1793 | wputop(szov, opcode - 7, rex, vex) |
1543 | waction("VREG", addin.vreg); wputxb(0) | 1794 | waction("VREG", addin.vreg); wputxb(0) |
1544 | else | 1795 | else |
1545 | if addin and addin.reg > 7 then rex = rex + 1 end | 1796 | if addin and addin.reg > 7 then rex = rex + 1 end |
1546 | wputop(szov, opcode, rex) | 1797 | wputop(szov, opcode, rex, vex) |
1547 | end | 1798 | end |
1548 | opcode = nil | 1799 | opcode = nil |
1549 | end | 1800 | end |
@@ -1580,6 +1831,14 @@ local function dopattern(pat, args, sz, op, needrex) | |||
1580 | else | 1831 | else |
1581 | wputlabel("REL_", imm, 2) | 1832 | wputlabel("REL_", imm, 2) |
1582 | end | 1833 | end |
1834 | elseif c == "s" then | ||
1835 | local reg = a.reg | ||
1836 | if reg < 0 then | ||
1837 | wputb(0) | ||
1838 | waction("VREG", a.vreg); wputxb(5) | ||
1839 | else | ||
1840 | wputb(shl(reg, 4)) | ||
1841 | end | ||
1583 | else | 1842 | else |
1584 | werror("bad char `"..c.."' in pattern `"..pat.."' for `"..op.."'") | 1843 | werror("bad char `"..c.."' in pattern `"..pat.."' for `"..op.."'") |
1585 | end | 1844 | end |
@@ -1656,11 +1915,14 @@ map_op[".template__"] = function(params, template, nparams) | |||
1656 | if pat == "" then pat = lastpat else lastpat = pat end | 1915 | if pat == "" then pat = lastpat else lastpat = pat end |
1657 | if matchtm(tm, args) then | 1916 | if matchtm(tm, args) then |
1658 | local prefix = sub(szm, 1, 1) | 1917 | local prefix = sub(szm, 1, 1) |
1659 | if prefix == "/" then -- Match both operand sizes. | 1918 | if prefix == "/" then -- Exactly match leading operand sizes. |
1660 | if args[1].opsize == sub(szm, 2, 2) and | 1919 | for i = #szm, 1, -1 do |
1661 | args[2].opsize == sub(szm, 3, 3) then | 1920 | if i == 1 then |
1662 | dopattern(pat, args, sz, params.op, needrex) -- Process pattern. | 1921 | dopattern(pat, args, sz, params.op, needrex) -- Process pattern. |
1663 | return | 1922 | return |
1923 | elseif args[i-1].opsize ~= sub(szm, i, i) then | ||
1924 | break | ||
1925 | end | ||
1664 | end | 1926 | end |
1665 | else -- Match common operand size. | 1927 | else -- Match common operand size. |
1666 | local szp = sz | 1928 | local szp = sz |