about summary refs log tree commit diff
path: root/dynasm
diff options
context:
space:
mode:
author Mike Pall <mike> 2015-10-24 18:43:47 +0200
committer Mike Pall <mike> 2015-10-24 18:43:47 +0200
commit 8a13c9cebf368de9338c3c3c8c30c48d45d717bd (patch)
tree 85480a97cf3b02dcf2412c7540b0f463726f4b42 /dynasm
parent 7e22082480028a467c27d9c32852ec7a12f8235f (diff)
download luajit-8a13c9cebf368de9338c3c3c8c30c48d45d717bd.tar.gz
luajit-8a13c9cebf368de9338c3c3c8c30c48d45d717bd.tar.bz2
luajit-8a13c9cebf368de9338c3c3c8c30c48d45d717bd.zip
DynASM/x86: Add AVX and AVX2 opcodes.
Thanks to Peter Cawley.
Diffstat (limited to 'dynasm')
-rw-r--r-- dynasm/dasm_x86.h 7
-rw-r--r-- dynasm/dasm_x86.lua 402
2 files changed, 338 insertions, 71 deletions
diff --git a/dynasm/dasm_x86.h b/dynasm/dasm_x86.h
index 652e8c99..175febe0 100644
--- a/dynasm/dasm_x86.h
+++ b/dynasm/dasm_x86.h
@@ -391,7 +391,12 @@ int dasm_encode(Dst_DECL, void *buffer)
391 case DASM_IMM_D: wd: dasmd(n); break; 391 case DASM_IMM_D: wd: dasmd(n); break;
392 case DASM_IMM_WB: if (((n+128)&-256) == 0) goto db; else mark = NULL; 392 case DASM_IMM_WB: if (((n+128)&-256) == 0) goto db; else mark = NULL;
393 case DASM_IMM_W: dasmw(n); break; 393 case DASM_IMM_W: dasmw(n); break;
394 case DASM_VREG: { int t = *p++; if (t >= 2) n<<=3; cp[-1] |= n; break; } 394 case DASM_VREG: {
395 int t = *p++;
396 if (t >= 5) n <<= 4; else if (t >= 2) n <<= 3;
397 cp[-1] ^= n;
398 break;
399 }
395 case DASM_REL_LG: p++; if (n >= 0) goto rel_pc; 400 case DASM_REL_LG: p++; if (n >= 0) goto rel_pc;
396 b++; n = (int)(ptrdiff_t)D->globals[-n]; 401 b++; n = (int)(ptrdiff_t)D->globals[-n];
397 case DASM_REL_A: rel_a: n -= (int)(ptrdiff_t)(cp+4); goto wd; /* !x64 */ 402 case DASM_REL_A: rel_a: n -= (int)(ptrdiff_t)(cp+4); goto wd; /* !x64 */
diff --git a/dynasm/dasm_x86.lua b/dynasm/dasm_x86.lua
index d8203e3d..d85dfec4 100644
--- a/dynasm/dasm_x86.lua
+++ b/dynasm/dasm_x86.lua
@@ -27,9 +27,9 @@ local assert, unpack, setmetatable = assert, unpack or table.unpack, setmetatabl
27local _s = string 27local _s = string
28local sub, format, byte, char = _s.sub, _s.format, _s.byte, _s.char 28local sub, format, byte, char = _s.sub, _s.format, _s.byte, _s.char
29local find, match, gmatch, gsub = _s.find, _s.match, _s.gmatch, _s.gsub 29local find, match, gmatch, gsub = _s.find, _s.match, _s.gmatch, _s.gsub
30local concat, sort = table.concat, table.sort 30local concat, sort, remove = table.concat, table.sort, table.remove
31local bit = bit or require("bit") 31local bit = bit or require("bit")
32local band, shl, shr = bit.band, bit.lshift, bit.rshift 32local band, bxor, shl, shr = bit.band, bit.bxor, bit.lshift, bit.rshift
33 33
34-- Inherited tables and callbacks. 34-- Inherited tables and callbacks.
35local g_opt, g_arch 35local g_opt, g_arch
@@ -299,7 +299,7 @@ local function mkrmap(sz, cl, names)
299 local iname = format("@%s%x%s", sz, i, needrex and "R" or "") 299 local iname = format("@%s%x%s", sz, i, needrex and "R" or "")
300 if needrex then map_reg_needrex[iname] = true end 300 if needrex then map_reg_needrex[iname] = true end
301 local name 301 local name
302 if sz == "o" then name = format("xmm%d", i) 302 if sz == "o" or sz == "y" then name = format("%s%d", cl, i)
303 elseif sz == "f" then name = format("st%d", i) 303 elseif sz == "f" then name = format("st%d", i)
304 else name = format("r%d%s", i, sz == addrsize and "" or sz) end 304 else name = format("r%d%s", i, sz == addrsize and "" or sz) end
305 map_archdef[name] = iname 305 map_archdef[name] = iname
@@ -334,21 +334,24 @@ mkrmap("f", "Rf")
334-- SSE registers (oword sized, but qword and dword accessible). 334-- SSE registers (oword sized, but qword and dword accessible).
335mkrmap("o", "xmm") 335mkrmap("o", "xmm")
336 336
337-- AVX registers (yword sized, but oword, qword and dword accessible).
338mkrmap("y", "ymm")
339
337-- Operand size prefixes to codes. 340-- Operand size prefixes to codes.
338local map_opsize = { 341local map_opsize = {
339 byte = "b", word = "w", dword = "d", qword = "q", oword = "o", tword = "t", 342 byte = "b", word = "w", dword = "d", qword = "q", oword = "o", yword = "y",
340 aword = addrsize, 343 tword = "t", aword = addrsize,
341} 344}
342 345
343-- Operand size code to number. 346-- Operand size code to number.
344local map_opsizenum = { 347local map_opsizenum = {
345 b = 1, w = 2, d = 4, q = 8, o = 16, t = 10, 348 b = 1, w = 2, d = 4, q = 8, o = 16, y = 32, t = 10,
346} 349}
347 350
348-- Operand size code to name. 351-- Operand size code to name.
349local map_opsizename = { 352local map_opsizename = {
350 b = "byte", w = "word", d = "dword", q = "qword", o = "oword", t = "tword", 353 b = "byte", w = "word", d = "dword", q = "qword", o = "oword", y = "yword",
351 f = "fpword", 354 t = "tword", f = "fpword",
352} 355}
353 356
354-- Valid index register scale factors. 357-- Valid index register scale factors.
@@ -460,7 +463,29 @@ local function wputszarg(sz, n)
460end 463end
461 464
462-- Put multi-byte opcode with operand-size dependent modifications. 465-- Put multi-byte opcode with operand-size dependent modifications.
463local function wputop(sz, op, rex) 466local function wputop(sz, op, rex, vex)
467 if vex then
468 local tail
469 if vex.m == 1 and band(rex, 11) == 0 then
470 wputb(0xc5)
471 tail = shl(bxor(band(rex, 4), 4), 5)
472 else
473 wputb(0xc4)
474 wputb(shl(bxor(band(rex, 7), 7), 5) + vex.m)
475 tail = shl(band(rex, 8), 4)
476 end
477 local reg, vreg = 0, nil
478 if vex.v then
479 reg = vex.v.reg
480 if not reg then werror("bad vex operand") end
481 if reg < 0 then reg = 0; vreg = vex.v.vreg end
482 end
483 if sz == "y" or vex.l then tail = tail + 4 end
484 wputb(tail + shl(bxor(reg, 15), 3) + vex.p)
485 if vreg then waction("VREG", vreg); wputxb(4) end
486 rex = 0
487 if op >= 256 then werror("bad vex opcode") end
488 end
464 local r 489 local r
465 if rex ~= 0 and not x64 then werror("bad operand size") end 490 if rex ~= 0 and not x64 then werror("bad operand size") end
466 if sz == "w" then wputb(102) end 491 if sz == "w" then wputb(102) end
@@ -881,9 +906,15 @@ end
881-- "m"/"M" generates ModRM/SIB from the 1st/2nd operand. 906-- "m"/"M" generates ModRM/SIB from the 1st/2nd operand.
882-- The spare 3 bits are either filled with the last hex digit or 907-- The spare 3 bits are either filled with the last hex digit or
883-- the result from a previous "r"/"R". The opcode is restored. 908-- the result from a previous "r"/"R". The opcode is restored.
909-- "u" Use VEX encoding, vvvv unused.
910-- "v"/"V" Use VEX encoding, vvvv from 1st/2nd operand (the operand is
911-- removed from the list used by future characters).
912-- "L" Force VEX.L
884-- 913--
885-- All of the following characters force a flush of the opcode: 914-- All of the following characters force a flush of the opcode:
886-- "o"/"O" stores a pure 32 bit disp (offset) from the 1st/2nd operand. 915-- "o"/"O" stores a pure 32 bit disp (offset) from the 1st/2nd operand.
916-- "s" stores a 4 bit immediate from the last register operand,
917-- followed by 4 zero bits.
887-- "S" stores a signed 8 bit immediate from the last operand. 918-- "S" stores a signed 8 bit immediate from the last operand.
888-- "U" stores an unsigned 8 bit immediate from the last operand. 919-- "U" stores an unsigned 8 bit immediate from the last operand.
889-- "W" stores an unsigned 16 bit immediate from the last operand. 920-- "W" stores an unsigned 16 bit immediate from the last operand.
@@ -1225,46 +1256,14 @@ local map_op = {
1225 movups_2 = "rmo:0F10rM|mro:0F11Rm", 1256 movups_2 = "rmo:0F10rM|mro:0F11Rm",
1226 orpd_2 = "rmo:660F56rM", 1257 orpd_2 = "rmo:660F56rM",
1227 orps_2 = "rmo:0F56rM", 1258 orps_2 = "rmo:0F56rM",
1228 packssdw_2 = "rmo:660F6BrM",
1229 packsswb_2 = "rmo:660F63rM",
1230 packuswb_2 = "rmo:660F67rM",
1231 paddb_2 = "rmo:660FFCrM",
1232 paddd_2 = "rmo:660FFErM",
1233 paddq_2 = "rmo:660FD4rM",
1234 paddsb_2 = "rmo:660FECrM",
1235 paddsw_2 = "rmo:660FEDrM",
1236 paddusb_2 = "rmo:660FDCrM",
1237 paddusw_2 = "rmo:660FDDrM",
1238 paddw_2 = "rmo:660FFDrM",
1239 pand_2 = "rmo:660FDBrM",
1240 pandn_2 = "rmo:660FDFrM",
1241 pause_0 = "F390", 1259 pause_0 = "F390",
1242 pavgb_2 = "rmo:660FE0rM",
1243 pavgw_2 = "rmo:660FE3rM",
1244 pcmpeqb_2 = "rmo:660F74rM",
1245 pcmpeqd_2 = "rmo:660F76rM",
1246 pcmpeqw_2 = "rmo:660F75rM",
1247 pcmpgtb_2 = "rmo:660F64rM",
1248 pcmpgtd_2 = "rmo:660F66rM",
1249 pcmpgtw_2 = "rmo:660F65rM",
1250 pextrw_3 = "rri/do:660FC5rMU|xri/wo:660F3A15nRmU", -- Mem op: SSE4.1 only. 1260 pextrw_3 = "rri/do:660FC5rMU|xri/wo:660F3A15nRmU", -- Mem op: SSE4.1 only.
1251 pinsrw_3 = "rri/od:660FC4rMU|rxi/ow:", 1261 pinsrw_3 = "rri/od:660FC4rMU|rxi/ow:",
1252 pmaddwd_2 = "rmo:660FF5rM",
1253 pmaxsw_2 = "rmo:660FEErM",
1254 pmaxub_2 = "rmo:660FDErM",
1255 pminsw_2 = "rmo:660FEArM",
1256 pminub_2 = "rmo:660FDArM",
1257 pmovmskb_2 = "rr/do:660FD7rM", 1262 pmovmskb_2 = "rr/do:660FD7rM",
1258 pmulhuw_2 = "rmo:660FE4rM",
1259 pmulhw_2 = "rmo:660FE5rM",
1260 pmullw_2 = "rmo:660FD5rM",
1261 pmuludq_2 = "rmo:660FF4rM",
1262 por_2 = "rmo:660FEBrM",
1263 prefetchnta_1 = "xb:n0F180m", 1263 prefetchnta_1 = "xb:n0F180m",
1264 prefetcht0_1 = "xb:n0F181m", 1264 prefetcht0_1 = "xb:n0F181m",
1265 prefetcht1_1 = "xb:n0F182m", 1265 prefetcht1_1 = "xb:n0F182m",
1266 prefetcht2_1 = "xb:n0F183m", 1266 prefetcht2_1 = "xb:n0F183m",
1267 psadbw_2 = "rmo:660FF6rM",
1268 pshufd_3 = "rmio:660F70rMU", 1267 pshufd_3 = "rmio:660F70rMU",
1269 pshufhw_3 = "rmio:F30F70rMU", 1268 pshufhw_3 = "rmio:F30F70rMU",
1270 pshuflw_3 = "rmio:F20F70rMU", 1269 pshuflw_3 = "rmio:F20F70rMU",
@@ -1278,23 +1277,6 @@ local map_op = {
1278 psrldq_2 = "rio:660F733mU", 1277 psrldq_2 = "rio:660F733mU",
1279 psrlq_2 = "rmo:660FD3rM|rio:660F732mU", 1278 psrlq_2 = "rmo:660FD3rM|rio:660F732mU",
1280 psrlw_2 = "rmo:660FD1rM|rio:660F712mU", 1279 psrlw_2 = "rmo:660FD1rM|rio:660F712mU",
1281 psubb_2 = "rmo:660FF8rM",
1282 psubd_2 = "rmo:660FFArM",
1283 psubq_2 = "rmo:660FFBrM",
1284 psubsb_2 = "rmo:660FE8rM",
1285 psubsw_2 = "rmo:660FE9rM",
1286 psubusb_2 = "rmo:660FD8rM",
1287 psubusw_2 = "rmo:660FD9rM",
1288 psubw_2 = "rmo:660FF9rM",
1289 punpckhbw_2 = "rmo:660F68rM",
1290 punpckhdq_2 = "rmo:660F6ArM",
1291 punpckhqdq_2 = "rmo:660F6DrM",
1292 punpckhwd_2 = "rmo:660F69rM",
1293 punpcklbw_2 = "rmo:660F60rM",
1294 punpckldq_2 = "rmo:660F62rM",
1295 punpcklqdq_2 = "rmo:660F6CrM",
1296 punpcklwd_2 = "rmo:660F61rM",
1297 pxor_2 = "rmo:660FEFrM",
1298 rcpps_2 = "rmo:0F53rM", 1280 rcpps_2 = "rmo:0F53rM",
1299 rcpss_2 = "rro:F30F53rM|rx/od:", 1281 rcpss_2 = "rro:F30F53rM|rx/od:",
1300 rsqrtps_2 = "rmo:0F52rM", 1282 rsqrtps_2 = "rmo:0F52rM",
@@ -1421,6 +1403,223 @@ local map_op = {
1421 aesimc_2 = "rmo:660F38DBrM", 1403 aesimc_2 = "rmo:660F38DBrM",
1422 aeskeygenassist_3 = "rmio:660F3ADFrMU", 1404 aeskeygenassist_3 = "rmio:660F3ADFrMU",
1423 pclmulqdq_3 = "rmio:660F3A44rMU", 1405 pclmulqdq_3 = "rmio:660F3A44rMU",
1406
1407 -- AVX FP ops
1408 vaddsubpd_3 = "rrmoy:660FVD0rM",
1409 vaddsubps_3 = "rrmoy:F20FVD0rM",
1410 vandpd_3 = "rrmoy:660FV54rM",
1411 vandps_3 = "rrmoy:0FV54rM",
1412 vandnpd_3 = "rrmoy:660FV55rM",
1413 vandnps_3 = "rrmoy:0FV55rM",
1414 vblendpd_4 = "rrmioy:660F3AV0DrMU",
1415 vblendps_4 = "rrmioy:660F3AV0CrMU",
1416 vblendvpd_4 = "rrmroy:660F3AV4BrMs",
1417 vblendvps_4 = "rrmroy:660F3AV4ArMs",
1418 vbroadcastf128_2 = "rx/yo:660F38u1ArM",
1419 vcmppd_4 = "rrmioy:660FVC2rMU",
1420 vcmpps_4 = "rrmioy:0FVC2rMU",
1421 vcmpsd_4 = "rrrio:F20FVC2rMU|rrxi/ooq:",
1422 vcmpss_4 = "rrrio:F30FVC2rMU|rrxi/ood:",
1423 vcomisd_2 = "rro:660Fu2FrM|rx/oq:",
1424 vcomiss_2 = "rro:0Fu2FrM|rx/od:",
1425 vcvtdq2pd_2 = "rro:F30FuE6rM|rx/oq:|rm/yo:",
1426 vcvtdq2ps_2 = "rmoy:0Fu5BrM",
1427 vcvtpd2dq_2 = "rmoy:F20FuE6rM",
1428 vcvtpd2ps_2 = "rmoy:660Fu5ArM",
1429 vcvtps2dq_2 = "rmoy:660Fu5BrM",
1430 vcvtps2pd_2 = "rro:0Fu5ArM|rx/oq:|rm/yo:",
1431 vcvtsd2si_2 = "rr/do:F20Fu2DrM|rx/dq:|rr/qo:|rxq:",
1432 vcvtsd2ss_3 = "rrro:F20FV5ArM|rrx/ooq:",
1433 vcvtsi2sd_3 = "rrm/ood:F20FV2ArM|rrm/ooq:F20FVX2ArM",
1434 vcvtsi2ss_3 = "rrm/ood:F30FV2ArM|rrm/ooq:F30FVX2ArM",
1435 vcvtss2sd_3 = "rrro:F30FV5ArM|rrx/ood:",
1436 vcvtss2si_2 = "rr/do:F30Fu2DrM|rxd:|rr/qo:|rx/qd:",
1437 vcvttpd2dq_2 = "rmo:660FuE6rM|rm/oy:660FuLE6rM",
1438 vcvttps2dq_2 = "rmoy:F30Fu5BrM",
1439 vcvttsd2si_2 = "rr/do:F20Fu2CrM|rx/dq:|rr/qo:|rxq:",
1440 vcvttss2si_2 = "rr/do:F30Fu2CrM|rxd:|rr/qo:|rx/qd:",
1441 vdppd_4 = "rrmio:660F3AV41rMU",
1442 vdpps_4 = "rrmioy:660F3AV40rMU",
1443 vextractf128_3 = "mri/oy:660F3AuL19RmU",
1444 vextractps_3 = "mri/do:660F3Au17RmU",
1445 vhaddpd_3 = "rrmoy:660FV7CrM",
1446 vhaddps_3 = "rrmoy:F20FV7CrM",
1447 vhsubpd_3 = "rrmoy:660FV7DrM",
1448 vhsubps_3 = "rrmoy:F20FV7DrM",
1449 vinsertf128_4 = "rrmi/yyo:660F3AV18rMU",
1450 vinsertps_4 = "rrrio:660F3AV21rMU|rrxi/ood:",
1451 vldmxcsr_1 = "xd:0FuAE2m",
1452 vmaskmovps_3 = "rrxoy:660F38V2CrM|xrroy:660F38V2ERm",
1453 vmaskmovpd_3 = "rrxoy:660F38V2DrM|xrroy:660F38V2FRm",
1454 vmovapd_2 = "rmoy:660Fu28rM|mroy:660Fu29Rm",
1455 vmovaps_2 = "rmoy:0Fu28rM|mroy:0Fu29Rm",
1456 vmovd_2 = "rm/od:660Fu6ErM|rm/oq:660FuX6ErM|mr/do:660Fu7ERm|mr/qo:",
1457 vmovq_2 = "rro:F30Fu7ErM|rx/oq:|xr/qo:660FuD6Rm",
1458 vmovddup_2 = "rmy:F20Fu12rM|rro:|rx/oq:",
1459 vmovhlps_3 = "rrro:0FV12rM",
1460 vmovhpd_2 = "xr/qo:660Fu17Rm",
1461 vmovhpd_3 = "rrx/ooq:660FV16rM",
1462 vmovhps_2 = "xr/qo:0Fu17Rm",
1463 vmovhps_3 = "rrx/ooq:0FV16rM",
1464 vmovlhps_3 = "rrro:0FV16rM",
1465 vmovlpd_2 = "xr/qo:660Fu13Rm",
1466 vmovlpd_3 = "rrx/ooq:660FV12rM",
1467 vmovlps_2 = "xr/qo:0Fu13Rm",
1468 vmovlps_3 = "rrx/ooq:0FV12rM",
1469 vmovmskpd_2 = "rr/do:660Fu50rM|rr/dy:660FuL50rM",
1470 vmovmskps_2 = "rr/do:0Fu50rM|rr/dy:0FuL50rM",
1471 vmovntpd_2 = "xroy:660Fu2BRm",
1472 vmovntps_2 = "xroy:0Fu2BRm",
1473 vmovsd_2 = "rx/oq:F20Fu10rM|xr/qo:F20Fu11Rm",
1474 vmovsd_3 = "rrro:F20FV10rM",
1475 vmovshdup_2 = "rmoy:F30Fu16rM",
1476 vmovsldup_2 = "rmoy:F30Fu12rM",
1477 vmovss_2 = "rx/od:F30Fu10rM|xr/do:F30Fu11Rm",
1478 vmovss_3 = "rrro:F30FV10rM",
1479 vmovupd_2 = "rmoy:660Fu10rM|mroy:660Fu11Rm",
1480 vmovups_2 = "rmoy:0Fu10rM|mroy:0Fu11Rm",
1481 vorpd_3 = "rrmoy:660FV56rM",
1482 vorps_3 = "rrmoy:0FV56rM",
1483 vpermilpd_3 = "rrmoy:660F38V0DrM|rmioy:660F3Au05rMU",
1484 vpermilps_3 = "rrmoy:660F38V0CrM|rmioy:660F3Au04rMU",
1485 vperm2f128_4 = "rrmiy:660F3AV06rMU",
1486 vptestpd_2 = "rmoy:660F38u0FrM",
1487 vptestps_2 = "rmoy:660F38u0ErM",
1488 vrcpps_2 = "rmoy:0Fu53rM",
1489 vrcpss_3 = "rrro:F30FV53rM|rrx/ood:",
1490 vrsqrtps_2 = "rmoy:0Fu52rM",
1491 vrsqrtss_3 = "rrro:F30FV52rM|rrx/ood:",
1492 vroundpd_3 = "rmioy:660F3AV09rMU",
1493 vroundps_3 = "rmioy:660F3AV08rMU",
1494 vroundsd_4 = "rrrio:660F3AV0BrMU|rrxi/ooq:",
1495 vroundss_4 = "rrrio:660F3AV0ArMU|rrxi/ood:",
1496 vshufpd_4 = "rrmioy:660FVC6rMU",
1497 vshufps_4 = "rrmioy:0FVC6rMU",
1498 vsqrtps_2 = "rmoy:0Fu51rM",
1499 vsqrtss_2 = "rro:F30Fu51rM|rx/od:",
1500 vsqrtpd_2 = "rmoy:660Fu51rM",
1501 vsqrtsd_2 = "rro:F20Fu51rM|rx/oq:",
1502 vstmxcsr_1 = "xd:0FuAE3m",
1503 vucomisd_2 = "rro:660Fu2ErM|rx/oq:",
1504 vucomiss_2 = "rro:0Fu2ErM|rx/od:",
1505 vunpckhpd_3 = "rrmoy:660FV15rM",
1506 vunpckhps_3 = "rrmoy:0FV15rM",
1507 vunpcklpd_3 = "rrmoy:660FV14rM",
1508 vunpcklps_3 = "rrmoy:0FV14rM",
1509 vxorpd_3 = "rrmoy:660FV57rM",
1510 vxorps_3 = "rrmoy:0FV57rM",
1511 vzeroall_0 = "0FuL77",
1512 vzeroupper_0 = "0Fu77",
1513
1514 -- AVX2 FP ops
1515 vbroadcastss_2 = "rx/od:660F38u18rM|rx/yd:|rro:|rr/yo:",
1516 vbroadcastsd_2 = "rx/yq:660F38u19rM|rr/yo:",
1517 -- *vgather* (!vsib)
1518 vpermpd_3 = "rmiy:660F3AuX01rMU",
1519 vpermps_3 = "rrmy:660F38V16rM",
1520
1521 -- AVX, AVX2 integer ops
1522 -- In general, xmm requires AVX, ymm requires AVX2.
1523 vlddqu_2 = "rxoy:F20FuF0rM",
1524 vmaskmovdqu_2 = "rro:660FuF7rM",
1525 vmovdqa_2 = "rmoy:660Fu6FrM|mroy:660Fu7FRm",
1526 vmovdqu_2 = "rmoy:F30Fu6FrM|mroy:F30Fu7FRm",
1527 vmovntdq_2 = "xroy:660FuE7Rm",
1528 vmovntdqa_2 = "rxoy:660F38u2ArM",
1529 vmpsadbw_4 = "rrmioy:660F3AV42rMU",
1530 vpabsb_2 = "rmoy:660F38u1CrM",
1531 vpabsd_2 = "rmoy:660F38u1ErM",
1532 vpabsw_2 = "rmoy:660F38u1DrM",
1533 vpackusdw_3 = "rrmoy:660F38V2BrM",
1534 vpalignr_4 = "rrmioy:660F3AV0FrMU",
1535 vpblendvb_4 = "rrmroy:660F3AV4CrMs",
1536 vpblendw_4 = "rrmioy:660F3AV0ErMU",
1537 vpclmulqdq_4 = "rrmio:660F3AV44rMU",
1538 vpcmpeqq_3 = "rrmoy:660F38V29rM",
1539 vpcmpestri_3 = "rmio:660F3Au61rMU",
1540 vpcmpestrm_3 = "rmio:660F3Au60rMU",
1541 vpcmpgtq_3 = "rrmoy:660F38V37rM",
1542 vpcmpistri_3 = "rmio:660F3Au63rMU",
1543 vpcmpistrm_3 = "rmio:660F3Au62rMU",
1544 vpextrb_3 = "rri/do:660F3Au14nRmU|rri/qo:|xri/bo:",
1545 vpextrw_3 = "rri/do:660FuC5rMU|xri/wo:660F3Au15nRmU",
1546 vpextrd_3 = "mri/do:660F3Au16RmU",
1547 vpextrq_3 = "mri/qo:660F3Au16RmU",
1548 vphaddw_3 = "rrmoy:660F38V01rM",
1549 vphaddd_3 = "rrmoy:660F38V02rM",
1550 vphaddsw_3 = "rrmoy:660F38V03rM",
1551 vphminposuw_2 = "rmo:660F38u41rM",
1552 vphsubw_3 = "rrmoy:660F38V05rM",
1553 vphsubd_3 = "rrmoy:660F38V06rM",
1554 vphsubsw_3 = "rrmoy:660F38V07rM",
1555 vpinsrb_4 = "rrri/ood:660F3AV20rMU|rrxi/oob:",
1556 vpinsrw_4 = "rrri/ood:660FVC4rMU|rrxi/oow:",
1557 vpinsrd_4 = "rrmi/ood:660F3AV22rMU",
1558 vpinsrq_4 = "rrmi/ooq:660F3AVX22rMU",
1559 vpmaddubsw_3 = "rrmoy:660F38V04rM",
1560 vpmaxsb_3 = "rrmoy:660F38V3CrM",
1561 vpmaxsd_3 = "rrmoy:660F38V3DrM",
1562 vpmaxuw_3 = "rrmoy:660F38V3ErM",
1563 vpmaxud_3 = "rrmoy:660F38V3FrM",
1564 vpminsb_3 = "rrmoy:660F38V38rM",
1565 vpminsd_3 = "rrmoy:660F38V39rM",
1566 vpminuw_3 = "rrmoy:660F38V3ArM",
1567 vpminud_3 = "rrmoy:660F38V3BrM",
1568 vpmovmskb_2 = "rr/do:660FuD7rM|rr/dy:660FuLD7rM",
1569 vpmovsxbw_2 = "rroy:660F38u20rM|rx/oq:|rx/yo:",
1570 vpmovsxbd_2 = "rroy:660F38u21rM|rx/od:|rx/yq:",
1571 vpmovsxbq_2 = "rroy:660F38u22rM|rx/ow:|rx/yd:",
1572 vpmovsxwd_2 = "rroy:660F38u23rM|rx/oq:|rx/yo:",
1573 vpmovsxwq_2 = "rroy:660F38u24rM|rx/od:|rx/yq:",
1574 vpmovsxdq_2 = "rroy:660F38u25rM|rx/oq:|rx/yo:",
1575 vpmovzxbw_2 = "rroy:660F38u30rM|rx/oq:|rx/yo:",
1576 vpmovzxbd_2 = "rroy:660F38u31rM|rx/od:|rx/yq:",
1577 vpmovzxbq_2 = "rroy:660F38u32rM|rx/ow:|rx/yd:",
1578 vpmovzxwd_2 = "rroy:660F38u33rM|rx/oq:|rx/yo:",
1579 vpmovzxwq_2 = "rroy:660F38u34rM|rx/od:|rx/yq:",
1580 vpmovzxdq_2 = "rroy:660F38u35rM|rx/oq:|rx/yo:",
1581 vpmuldq_3 = "rrmoy:660F38V28rM",
1582 vpmulhrsw_3 = "rrmoy:660F38V0BrM",
1583 vpmulld_3 = "rrmoy:660F38V40rM",
1584 vpshufb_3 = "rrmoy:660F38V00rM",
1585 vpshufd_3 = "rmioy:660Fu70rMU",
1586 vpshufhw_3 = "rmioy:F30Fu70rMU",
1587 vpshuflw_3 = "rmioy:F20Fu70rMU",
1588 vpsignb_3 = "rrmoy:660F38V08rM",
1589 vpsignw_3 = "rrmoy:660F38V09rM",
1590 vpsignd_3 = "rrmoy:660F38V0ArM",
1591 vpslldq_3 = "rrioy:660Fv737mU",
1592 vpsllw_3 = "rrmoy:660FVF1rM|rrioy:660Fv716mU",
1593 vpslld_3 = "rrmoy:660FVF2rM|rrioy:660Fv726mU",
1594 vpsllq_3 = "rrmoy:660FVF3rM|rrioy:660Fv736mU",
1595 vpsraw_3 = "rrmoy:660FVE1rM|rrioy:660Fv714mU",
1596 vpsrad_3 = "rrmoy:660FVE2rM|rrioy:660Fv724mU",
1597 vpsrldq_3 = "rrioy:660Fv733mU",
1598 vpsrlw_3 = "rrmoy:660FVD1rM|rrioy:660Fv712mU",
1599 vpsrld_3 = "rrmoy:660FVD2rM|rrioy:660Fv722mU",
1600 vpsrlq_3 = "rrmoy:660FVD3rM|rrioy:660Fv732mU",
1601 vptest_2 = "rmoy:660F38u17rM",
1602
1603 -- AVX2 integer ops
1604 vbroadcasti128_2 = "rx/yo:660F38u5ArM",
1605 vinserti128_4 = "rrmi/yyo:660F3AV38rMU",
1606 vextracti128_3 = "mri/oy:660F3AuL39RmU",
1607 vpblendd_4 = "rrmioy:660F3AV02rMU",
1608 vpbroadcastb_2 = "rro:660F38u78rM|rx/ob:|rr/yo:|rx/yb:",
1609 vpbroadcastw_2 = "rro:660F38u79rM|rx/ow:|rr/yo:|rx/yw:",
1610 vpbroadcastd_2 = "rro:660F38u58rM|rx/od:|rr/yo:|rx/yd:",
1611 vpbroadcastq_2 = "rro:660F38u59rM|rx/oq:|rr/yo:|rx/yq:",
1612 vpermd_3 = "rrmy:660F38V36rM",
1613 vpermq_3 = "rmiy:660F3AuX00rMU",
1614 -- *vpgather* (!vsib)
1615 vperm2i128_4 = "rrmiy:660F3AV46rMU",
1616 vpmaskmovd_3 = "rrxoy:660F38V8CrM|xrroy:660F38V8ERm",
1617 vpmaskmovq_3 = "rrxoy:660F38VX8CrM|xrroy:660F38VX8ERm",
1618 vpsllvd_3 = "rrmoy:660F38V47rM",
1619 vpsllvq_3 = "rrmoy:660F38VX47rM",
1620 vpsravd_3 = "rrmoy:660F38V46rM",
1621 vpsrlvd_3 = "rrmoy:660F38V45rM",
1622 vpsrlvq_3 = "rrmoy:660F38VX45rM",
1424} 1623}
1425 1624
1426------------------------------------------------------------------------------ 1625------------------------------------------------------------------------------
@@ -1471,28 +1670,58 @@ for cc,n in pairs{ b=0, e=1, be=2, u=3, nb=4, ne=5, nbe=6, nu=7 } do
1471 map_op["fcmov"..cc.."_2"] = format("Fff:%04XR", nc) -- P6+ 1670 map_op["fcmov"..cc.."_2"] = format("Fff:%04XR", nc) -- P6+
1472end 1671end
1473 1672
1474-- SSE FP arithmetic ops. 1673-- SSE / AVX FP arithmetic ops.
1475for name,n in pairs{ sqrt = 1, add = 8, mul = 9, 1674for name,n in pairs{ sqrt = 1, add = 8, mul = 9,
1476 sub = 12, min = 13, div = 14, max = 15 } do 1675 sub = 12, min = 13, div = 14, max = 15 } do
1477 map_op[name.."ps_2"] = format("rmo:0F5%XrM", n) 1676 map_op[name.."ps_2"] = format("rmo:0F5%XrM", n)
1478 map_op[name.."ss_2"] = format("rro:F30F5%XrM|rx/od:", n) 1677 map_op[name.."ss_2"] = format("rro:F30F5%XrM|rx/od:", n)
1479 map_op[name.."pd_2"] = format("rmo:660F5%XrM", n) 1678 map_op[name.."pd_2"] = format("rmo:660F5%XrM", n)
1480 map_op[name.."sd_2"] = format("rro:F20F5%XrM|rx/oq:", n) 1679 map_op[name.."sd_2"] = format("rro:F20F5%XrM|rx/oq:", n)
1680 if n ~= 1 then
1681 map_op["v"..name.."ps_3"] = format("rrmoy:0FV5%XrM", n)
1682 map_op["v"..name.."ss_3"] = format("rrro:F30FV5%XrM|rrx/ood:", n)
1683 map_op["v"..name.."pd_3"] = format("rrmoy:660FV5%XrM", n)
1684 map_op["v"..name.."sd_3"] = format("rrro:F20FV5%XrM|rrx/ooq:", n)
1685 end
1686end
1687
1688-- SSE2 / AVX / AVX2 integer arithmetic ops (66 0F leaf).
1689for name,n in pairs{
1690 paddb = 0xFC, paddw = 0xFD, paddd = 0xFE, paddq = 0xD4,
1691 paddsb = 0xEC, paddsw = 0xED, packssdw = 0x6B,
1692 packsswb = 0x63, packuswb = 0x67, paddusb = 0xDC,
1693 paddusw = 0xDD, pand = 0xDB, pandn = 0xDF, pavgb = 0xE0,
1694 pavgw = 0xE3, pcmpeqb = 0x74, pcmpeqd = 0x76,
1695 pcmpeqw = 0x75, pcmpgtb = 0x64, pcmpgtd = 0x66,
1696 pcmpgtw = 0x65, pmaddwd = 0xF5, pmaxsw = 0xEE,
1697 pmaxub = 0xDE, pminsw = 0xEA, pminub = 0xDA,
1698 pmulhuw = 0xE4, pmulhw = 0xE5, pmullw = 0xD5,
1699 pmuludq = 0xF4, por = 0xEB, psadbw = 0xF6, psubb = 0xF8,
1700 psubw = 0xF9, psubd = 0xFA, psubq = 0xFB, psubsb = 0xE8,
1701 psubsw = 0xE9, psubusb = 0xD8, psubusw = 0xD9,
1702 punpckhbw = 0x68, punpckhwd = 0x69, punpckhdq = 0x6A,
1703 punpckhqdq = 0x6D, punpcklbw = 0x60, punpcklwd = 0x61,
1704 punpckldq = 0x62, punpcklqdq = 0x6C, pxor = 0xEF
1705} do
1706 map_op[name.."_2"] = format("rmo:660F%02XrM", n)
1707 map_op["v"..name.."_3"] = format("rrmoy:660FV%02XrM", n)
1481end 1708end
1482 1709
1483------------------------------------------------------------------------------ 1710------------------------------------------------------------------------------
1484 1711
1712local map_vexarg = { u = false, v = 1, V = 2 }
1713
1485-- Process pattern string. 1714-- Process pattern string.
1486local function dopattern(pat, args, sz, op, needrex) 1715local function dopattern(pat, args, sz, op, needrex)
1487 local digit, addin 1716 local digit, addin, vex
1488 local opcode = 0 1717 local opcode = 0
1489 local szov = sz 1718 local szov = sz
1490 local narg = 1 1719 local narg = 1
1491 local rex = 0 1720 local rex = 0
1492 1721
1493 -- Limit number of section buffer positions used by a single dasm_put(). 1722 -- Limit number of section buffer positions used by a single dasm_put().
1494 -- A single opcode needs a maximum of 5 positions. 1723 -- A single opcode needs a maximum of 6 positions.
1495 if secpos+5 > maxsecpos then wflush() end 1724 if secpos+6 > maxsecpos then wflush() end
1496 1725
1497 -- Process each character. 1726 -- Process each character.
1498 for c in gmatch(pat.."|", ".") do 1727 for c in gmatch(pat.."|", ".") do
@@ -1506,6 +1735,8 @@ local function dopattern(pat, args, sz, op, needrex)
1506 szov = nil 1735 szov = nil
1507 elseif c == "X" then -- Force REX.W. 1736 elseif c == "X" then -- Force REX.W.
1508 rex = 8 1737 rex = 8
1738 elseif c == "L" then -- Force VEX.L.
1739 vex.l = true
1509 elseif c == "r" then -- Merge 1st operand regno. into opcode. 1740 elseif c == "r" then -- Merge 1st operand regno. into opcode.
1510 addin = args[1]; opcode = opcode + (addin.reg % 8) 1741 addin = args[1]; opcode = opcode + (addin.reg % 8)
1511 if narg < 2 then narg = 2 end 1742 if narg < 2 then narg = 2 end
@@ -1529,21 +1760,41 @@ local function dopattern(pat, args, sz, op, needrex)
1529 if t.xreg and t.xreg > 7 then rex = rex + 2 end 1760 if t.xreg and t.xreg > 7 then rex = rex + 2 end
1530 if s > 7 then rex = rex + 4 end 1761 if s > 7 then rex = rex + 4 end
1531 if needrex then rex = rex + 16 end 1762 if needrex then rex = rex + 16 end
1532 wputop(szov, opcode, rex); opcode = nil 1763 wputop(szov, opcode, rex, vex); opcode = nil
1533 local imark = sub(pat, -1) -- Force a mark (ugly). 1764 local imark = sub(pat, -1) -- Force a mark (ugly).
1534 -- Put ModRM/SIB with regno/last digit as spare. 1765 -- Put ModRM/SIB with regno/last digit as spare.
1535 wputmrmsib(t, imark, s, addin and addin.vreg) 1766 wputmrmsib(t, imark, s, addin and addin.vreg)
1536 addin = nil 1767 addin = nil
1768 elseif map_vexarg[c] ~= nil then -- Encode using VEX prefix
1769 local b = band(opcode, 255); opcode = shr(opcode, 8)
1770 local m = 1
1771 if b == 0x38 then m = 2
1772 elseif b == 0x3a then m = 3 end
1773 if m ~= 1 then b = band(opcode, 255); opcode = shr(opcode, 8) end
1774 if b ~= 0x0f then
1775 werror("expected `0F', `0F38', or `0F3A' to precede `"..c..
1776 "' in pattern `"..pat.."' for `"..op.."'")
1777 end
1778 local v = map_vexarg[c]
1779 if v then v = remove(args, v) end
1780 b = band(opcode, 255)
1781 local p = 0
1782 if b == 0x66 then p = 1
1783 elseif b == 0xf3 then p = 2
1784 elseif b == 0xf2 then p = 3 end
1785 if p ~= 0 then opcode = shr(opcode, 8) end
1786 if opcode ~= 0 then wputop(nil, opcode, 0); opcode = 0 end
1787 vex = { m = m, p = p, v = v }
1537 else 1788 else
1538 if opcode then -- Flush opcode. 1789 if opcode then -- Flush opcode.
1539 if szov == "q" and rex == 0 then rex = rex + 8 end 1790 if szov == "q" and rex == 0 then rex = rex + 8 end
1540 if needrex then rex = rex + 16 end 1791 if needrex then rex = rex + 16 end
1541 if addin and addin.reg == -1 then 1792 if addin and addin.reg == -1 then
1542 wputop(szov, opcode - 7, rex) 1793 wputop(szov, opcode - 7, rex, vex)
1543 waction("VREG", addin.vreg); wputxb(0) 1794 waction("VREG", addin.vreg); wputxb(0)
1544 else 1795 else
1545 if addin and addin.reg > 7 then rex = rex + 1 end 1796 if addin and addin.reg > 7 then rex = rex + 1 end
1546 wputop(szov, opcode, rex) 1797 wputop(szov, opcode, rex, vex)
1547 end 1798 end
1548 opcode = nil 1799 opcode = nil
1549 end 1800 end
@@ -1580,6 +1831,14 @@ local function dopattern(pat, args, sz, op, needrex)
1580 else 1831 else
1581 wputlabel("REL_", imm, 2) 1832 wputlabel("REL_", imm, 2)
1582 end 1833 end
1834 elseif c == "s" then
1835 local reg = a.reg
1836 if reg < 0 then
1837 wputb(0)
1838 waction("VREG", a.vreg); wputxb(5)
1839 else
1840 wputb(shl(reg, 4))
1841 end
1583 else 1842 else
1584 werror("bad char `"..c.."' in pattern `"..pat.."' for `"..op.."'") 1843 werror("bad char `"..c.."' in pattern `"..pat.."' for `"..op.."'")
1585 end 1844 end
@@ -1656,11 +1915,14 @@ map_op[".template__"] = function(params, template, nparams)
1656 if pat == "" then pat = lastpat else lastpat = pat end 1915 if pat == "" then pat = lastpat else lastpat = pat end
1657 if matchtm(tm, args) then 1916 if matchtm(tm, args) then
1658 local prefix = sub(szm, 1, 1) 1917 local prefix = sub(szm, 1, 1)
1659 if prefix == "/" then -- Match both operand sizes. 1918 if prefix == "/" then -- Exactly match leading operand sizes.
1660 if args[1].opsize == sub(szm, 2, 2) and 1919 for i = #szm, 1, -1 do
1661 args[2].opsize == sub(szm, 3, 3) then 1920 if i == 1 then
1662 dopattern(pat, args, sz, params.op, needrex) -- Process pattern. 1921 dopattern(pat, args, sz, params.op, needrex) -- Process pattern.
1663 return 1922 return
1923 elseif args[i-1].opsize ~= sub(szm, i, i) then
1924 break
1925 end
1664 end 1926 end
1665 else -- Match common operand size. 1927 else -- Match common operand size.
1666 local szp = sz 1928 local szp = sz