about | summary | refs | log | tree | commit | diff
path: root/dynasm/dasm_x86.lua
diff options
context:
space:
mode:
Diffstat (limited to 'dynasm/dasm_x86.lua')
-rw-r--r-- dynasm/dasm_x86.lua | 606
1 files changed, 510 insertions, 96 deletions
diff --git a/dynasm/dasm_x86.lua b/dynasm/dasm_x86.lua
index 13aa68ff..c5c8c17b 100644
--- a/dynasm/dasm_x86.lua
+++ b/dynasm/dasm_x86.lua
@@ -11,9 +11,9 @@ local x64 = x64
11local _info = { 11local _info = {
12 arch = x64 and "x64" or "x86", 12 arch = x64 and "x64" or "x86",
13 description = "DynASM x86/x64 module", 13 description = "DynASM x86/x64 module",
14 version = "1.3.0", 14 version = "1.4.0",
15 vernum = 10300, 15 vernum = 10400,
16 release = "2011-05-05", 16 release = "2015-10-18",
17 author = "Mike Pall", 17 author = "Mike Pall",
18 license = "MIT", 18 license = "MIT",
19} 19}
@@ -27,9 +27,9 @@ local assert, unpack, setmetatable = assert, unpack or table.unpack, setmetatabl
27local _s = string 27local _s = string
28local sub, format, byte, char = _s.sub, _s.format, _s.byte, _s.char 28local sub, format, byte, char = _s.sub, _s.format, _s.byte, _s.char
29local find, match, gmatch, gsub = _s.find, _s.match, _s.gmatch, _s.gsub 29local find, match, gmatch, gsub = _s.find, _s.match, _s.gmatch, _s.gsub
30local concat, sort = table.concat, table.sort 30local concat, sort, remove = table.concat, table.sort, table.remove
31local bit = bit or require("bit") 31local bit = bit or require("bit")
32local band, shl, shr = bit.band, bit.lshift, bit.rshift 32local band, bxor, shl, shr = bit.band, bit.bxor, bit.lshift, bit.rshift
33 33
34-- Inherited tables and callbacks. 34-- Inherited tables and callbacks.
35local g_opt, g_arch 35local g_opt, g_arch
@@ -41,7 +41,7 @@ local action_names = {
41 -- int arg, 1 buffer pos: 41 -- int arg, 1 buffer pos:
42 "DISP", "IMM_S", "IMM_B", "IMM_W", "IMM_D", "IMM_WB", "IMM_DB", 42 "DISP", "IMM_S", "IMM_B", "IMM_W", "IMM_D", "IMM_WB", "IMM_DB",
43 -- action arg (1 byte), int arg, 1 buffer pos (reg/num): 43 -- action arg (1 byte), int arg, 1 buffer pos (reg/num):
44 "VREG", "SPACE", -- !x64: VREG support NYI. 44 "VREG", "SPACE",
45 -- ptrdiff_t arg, 1 buffer pos (address): !x64 45 -- ptrdiff_t arg, 1 buffer pos (address): !x64
46 "SETLABEL", "REL_A", 46 "SETLABEL", "REL_A",
47 -- action arg (1 byte) or int arg, 2 buffer pos (link, offset): 47 -- action arg (1 byte) or int arg, 2 buffer pos (link, offset):
@@ -83,6 +83,21 @@ local actargs = { 0 }
83-- Current number of section buffer positions for dasm_put(). 83-- Current number of section buffer positions for dasm_put().
84local secpos = 1 84local secpos = 1
85 85
-- VREG kind encodings, pre-shifted by 5 bits.
-- Each entry is spelled as <kind number> * 0x20 so the shift is explicit.
-- wputop() uses the "modrm.reg" and "vex.v" entries as thresholds, so the
-- relative order of these values matters.
local map_vreg = {
  ["modrm.rm.m"] = 0 * 0x20,
  ["modrm.rm.r"] = 1 * 0x20,
  ["opcode"]     = 1 * 0x20,
  ["sib.base"]   = 1 * 0x20,
  ["sib.index"]  = 2 * 0x20,
  ["modrm.reg"]  = 4 * 0x20,
  ["vex.v"]      = 5 * 0x20,
  ["imm.hi"]     = 6 * 0x20,
}

-- Current number of VREG actions contributing to REX/VEX shrinkage.
local vreg_shrink_count = 0
100
86------------------------------------------------------------------------------ 101------------------------------------------------------------------------------
87 102
88-- Compute action numbers for action names. 103-- Compute action numbers for action names.
@@ -134,6 +149,21 @@ local function waction(action, a, num)
134 if a or num then secpos = secpos + (num or 1) end 149 if a or num then secpos = secpos + (num or 1) end
135end 150end
136 151
-- Optionally add a VREG action.
--   kind:  key into map_vreg selecting the base value of the action byte.
--   vreg:  variable register expression; nil/false makes this a no-op.
--   psz:   added to the emitted action byte (defaults to 0).
--   sk:    threshold; a kind whose encoding is below it increments
--          vreg_shrink_count (defaults to 0, i.e. never).
--   defer: when true, leave the accumulated shrink count pending for a
--          later call instead of folding it into this action byte.
local function wvreg(kind, vreg, psz, sk, defer)
  if not vreg then return end
  waction("VREG", vreg)
  -- Name the offending kind in the diagnostic (not the register expression).
  local b = assert(map_vreg[kind], "bad vreg kind `"..tostring(kind).."'")
  if b < (sk or 0) then
    vreg_shrink_count = vreg_shrink_count + 1
  end
  if not defer then
    -- Fold the pending shrink count (scaled by 8) into the byte and reset it.
    b = b + vreg_shrink_count * 8
    vreg_shrink_count = 0
  end
  wputxb(b + (psz or 0))
end
166
137-- Add call to embedded DynASM C code. 167-- Add call to embedded DynASM C code.
138local function wcall(func, args) 168local function wcall(func, args)
139 wline(format("dasm_%s(Dst, %s);", func, concat(args, ", ")), true) 169 wline(format("dasm_%s(Dst, %s);", func, concat(args, ", ")), true)
@@ -299,7 +329,7 @@ local function mkrmap(sz, cl, names)
299 local iname = format("@%s%x%s", sz, i, needrex and "R" or "") 329 local iname = format("@%s%x%s", sz, i, needrex and "R" or "")
300 if needrex then map_reg_needrex[iname] = true end 330 if needrex then map_reg_needrex[iname] = true end
301 local name 331 local name
302 if sz == "o" then name = format("xmm%d", i) 332 if sz == "o" or sz == "y" then name = format("%s%d", cl, i)
303 elseif sz == "f" then name = format("st%d", i) 333 elseif sz == "f" then name = format("st%d", i)
304 else name = format("r%d%s", i, sz == addrsize and "" or sz) end 334 else name = format("r%d%s", i, sz == addrsize and "" or sz) end
305 map_archdef[name] = iname 335 map_archdef[name] = iname
@@ -326,6 +356,7 @@ mkrmap("w", "Rw", {"ax", "cx", "dx", "bx", "sp", "bp", "si", "di"})
326mkrmap("b", "Rb", {"al", "cl", "dl", "bl", "ah", "ch", "dh", "bh"}) 356mkrmap("b", "Rb", {"al", "cl", "dl", "bl", "ah", "ch", "dh", "bh"})
327map_reg_valid_index[map_archdef.esp] = false 357map_reg_valid_index[map_archdef.esp] = false
328if x64 then map_reg_valid_index[map_archdef.rsp] = false end 358if x64 then map_reg_valid_index[map_archdef.rsp] = false end
359if x64 then map_reg_needrex[map_archdef.Rb] = true end
329map_archdef["Ra"] = "@"..addrsize 360map_archdef["Ra"] = "@"..addrsize
330 361
331-- FP registers (internally tword sized, but use "f" as operand size). 362-- FP registers (internally tword sized, but use "f" as operand size).
@@ -334,21 +365,24 @@ mkrmap("f", "Rf")
334-- SSE registers (oword sized, but qword and dword accessible). 365-- SSE registers (oword sized, but qword and dword accessible).
335mkrmap("o", "xmm") 366mkrmap("o", "xmm")
336 367
368-- AVX registers (yword sized, but oword, qword and dword accessible).
369mkrmap("y", "ymm")
370
337-- Operand size prefixes to codes. 371-- Operand size prefixes to codes.
338local map_opsize = { 372local map_opsize = {
339 byte = "b", word = "w", dword = "d", qword = "q", oword = "o", tword = "t", 373 byte = "b", word = "w", dword = "d", qword = "q", oword = "o", yword = "y",
340 aword = addrsize, 374 tword = "t", aword = addrsize,
341} 375}
342 376
343-- Operand size code to number. 377-- Operand size code to number.
344local map_opsizenum = { 378local map_opsizenum = {
345 b = 1, w = 2, d = 4, q = 8, o = 16, t = 10, 379 b = 1, w = 2, d = 4, q = 8, o = 16, y = 32, t = 10,
346} 380}
347 381
348-- Operand size code to name. 382-- Operand size code to name.
349local map_opsizename = { 383local map_opsizename = {
350 b = "byte", w = "word", d = "dword", q = "qword", o = "oword", t = "tword", 384 b = "byte", w = "word", d = "dword", q = "qword", o = "oword", y = "yword",
351 f = "fpword", 385 t = "tword", f = "fpword",
352} 386}
353 387
354-- Valid index register scale factors. 388-- Valid index register scale factors.
@@ -460,9 +494,45 @@ local function wputszarg(sz, n)
460end 494end
461 495
462-- Put multi-byte opcode with operand-size dependent modifications. 496-- Put multi-byte opcode with operand-size dependent modifications.
463local function wputop(sz, op, rex) 497local function wputop(sz, op, rex, vex, vregr, vregxb)
498 local psz, sk = 0, nil
499 if vex then
500 local tail
501 if vex.m == 1 and band(rex, 11) == 0 then
502 if x64 and vregxb then
503 sk = map_vreg["modrm.reg"]
504 else
505 wputb(0xc5)
506 tail = shl(bxor(band(rex, 4), 4), 5)
507 psz = 3
508 end
509 end
510 if not tail then
511 wputb(0xc4)
512 wputb(shl(bxor(band(rex, 7), 7), 5) + vex.m)
513 tail = shl(band(rex, 8), 4)
514 psz = 4
515 end
516 local reg, vreg = 0, nil
517 if vex.v then
518 reg = vex.v.reg
519 if not reg then werror("bad vex operand") end
520 if reg < 0 then reg = 0; vreg = vex.v.vreg end
521 end
522 if sz == "y" or vex.l then tail = tail + 4 end
523 wputb(tail + shl(bxor(reg, 15), 3) + vex.p)
524 wvreg("vex.v", vreg)
525 rex = 0
526 if op >= 256 then werror("bad vex opcode") end
527 else
528 if rex ~= 0 then
529 if not x64 then werror("bad operand size") end
530 elseif (vregr or vregxb) and x64 then
531 rex = 0x10
532 sk = map_vreg["vex.v"]
533 end
534 end
464 local r 535 local r
465 if rex ~= 0 and not x64 then werror("bad operand size") end
466 if sz == "w" then wputb(102) end 536 if sz == "w" then wputb(102) end
467 -- Needs >32 bit numbers, but only for crc32 eax, word [ebx] 537 -- Needs >32 bit numbers, but only for crc32 eax, word [ebx]
468 if op >= 4294967296 then r = op%4294967296 wputb((op-r)/4294967296) op = r end 538 if op >= 4294967296 then r = op%4294967296 wputb((op-r)/4294967296) op = r end
@@ -471,20 +541,20 @@ local function wputop(sz, op, rex)
471 if rex ~= 0 then 541 if rex ~= 0 then
472 local opc3 = band(op, 0xffff00) 542 local opc3 = band(op, 0xffff00)
473 if opc3 == 0x0f3a00 or opc3 == 0x0f3800 then 543 if opc3 == 0x0f3a00 or opc3 == 0x0f3800 then
474 wputb(64 + band(rex, 15)); rex = 0 544 wputb(64 + band(rex, 15)); rex = 0; psz = 2
475 end 545 end
476 end 546 end
477 wputb(shr(op, 16)); op = band(op, 0xffff) 547 wputb(shr(op, 16)); op = band(op, 0xffff); psz = psz + 1
478 end 548 end
479 if op >= 256 then 549 if op >= 256 then
480 local b = shr(op, 8) 550 local b = shr(op, 8)
481 if b == 15 and rex ~= 0 then wputb(64 + band(rex, 15)); rex = 0 end 551 if b == 15 and rex ~= 0 then wputb(64 + band(rex, 15)); rex = 0; psz = 2 end
482 wputb(b) 552 wputb(b); op = band(op, 255); psz = psz + 1
483 op = band(op, 255)
484 end 553 end
485 if rex ~= 0 then wputb(64 + band(rex, 15)) end 554 if rex ~= 0 then wputb(64 + band(rex, 15)); psz = 2 end
486 if sz == "b" then op = op - 1 end 555 if sz == "b" then op = op - 1 end
487 wputb(op) 556 wputb(op)
557 return psz, sk
488end 558end
489 559
490-- Put ModRM or SIB formatted byte. 560-- Put ModRM or SIB formatted byte.
@@ -494,7 +564,7 @@ local function wputmodrm(m, s, rm, vs, vrm)
494end 564end
495 565
496-- Put ModRM/SIB plus optional displacement. 566-- Put ModRM/SIB plus optional displacement.
497local function wputmrmsib(t, imark, s, vsreg) 567local function wputmrmsib(t, imark, s, vsreg, psz, sk)
498 local vreg, vxreg 568 local vreg, vxreg
499 local reg, xreg = t.reg, t.xreg 569 local reg, xreg = t.reg, t.xreg
500 if reg and reg < 0 then reg = 0; vreg = t.vreg end 570 if reg and reg < 0 then reg = 0; vreg = t.vreg end
@@ -504,8 +574,8 @@ local function wputmrmsib(t, imark, s, vsreg)
504 -- Register mode. 574 -- Register mode.
505 if sub(t.mode, 1, 1) == "r" then 575 if sub(t.mode, 1, 1) == "r" then
506 wputmodrm(3, s, reg) 576 wputmodrm(3, s, reg)
507 if vsreg then waction("VREG", vsreg); wputxb(2) end 577 wvreg("modrm.reg", vsreg, psz+1, sk, vreg)
508 if vreg then waction("VREG", vreg); wputxb(0) end 578 wvreg("modrm.rm.r", vreg, psz+1, sk)
509 return 579 return
510 end 580 end
511 581
@@ -519,21 +589,22 @@ local function wputmrmsib(t, imark, s, vsreg)
519 -- [xreg*xsc+disp] -> (0, s, esp) (xsc, xreg, ebp) 589 -- [xreg*xsc+disp] -> (0, s, esp) (xsc, xreg, ebp)
520 wputmodrm(0, s, 4) 590 wputmodrm(0, s, 4)
521 if imark == "I" then waction("MARK") end 591 if imark == "I" then waction("MARK") end
522 if vsreg then waction("VREG", vsreg); wputxb(2) end 592 wvreg("modrm.reg", vsreg, psz+1, sk, vxreg)
523 wputmodrm(t.xsc, xreg, 5) 593 wputmodrm(t.xsc, xreg, 5)
524 if vxreg then waction("VREG", vxreg); wputxb(3) end 594 wvreg("sib.index", vxreg, psz+2, sk)
525 else 595 else
526 -- Pure 32 bit displacement. 596 -- Pure 32 bit displacement.
527 if x64 and tdisp ~= "table" then 597 if x64 and tdisp ~= "table" then
528 wputmodrm(0, s, 4) -- [disp] -> (0, s, esp) (0, esp, ebp) 598 wputmodrm(0, s, 4) -- [disp] -> (0, s, esp) (0, esp, ebp)
599 wvreg("modrm.reg", vsreg, psz+1, sk)
529 if imark == "I" then waction("MARK") end 600 if imark == "I" then waction("MARK") end
530 wputmodrm(0, 4, 5) 601 wputmodrm(0, 4, 5)
531 else 602 else
532 riprel = x64 603 riprel = x64
533 wputmodrm(0, s, 5) -- [disp|rip-label] -> (0, s, ebp) 604 wputmodrm(0, s, 5) -- [disp|rip-label] -> (0, s, ebp)
605 wvreg("modrm.reg", vsreg, psz+1, sk)
534 if imark == "I" then waction("MARK") end 606 if imark == "I" then waction("MARK") end
535 end 607 end
536 if vsreg then waction("VREG", vsreg); wputxb(2) end
537 end 608 end
538 if riprel then -- Emit rip-relative displacement. 609 if riprel then -- Emit rip-relative displacement.
539 if match("UWSiI", imark) then 610 if match("UWSiI", imark) then
@@ -561,16 +632,16 @@ local function wputmrmsib(t, imark, s, vsreg)
561 if xreg or band(reg, 7) == 4 then 632 if xreg or band(reg, 7) == 4 then
562 wputmodrm(m or 2, s, 4) -- ModRM. 633 wputmodrm(m or 2, s, 4) -- ModRM.
563 if m == nil or imark == "I" then waction("MARK") end 634 if m == nil or imark == "I" then waction("MARK") end
564 if vsreg then waction("VREG", vsreg); wputxb(2) end 635 wvreg("modrm.reg", vsreg, psz+1, sk, vxreg or vreg)
565 wputmodrm(t.xsc or 0, xreg or 4, reg) -- SIB. 636 wputmodrm(t.xsc or 0, xreg or 4, reg) -- SIB.
566 if vxreg then waction("VREG", vxreg); wputxb(3) end 637 wvreg("sib.index", vxreg, psz+2, sk, vreg)
567 if vreg then waction("VREG", vreg); wputxb(1) end 638 wvreg("sib.base", vreg, psz+2, sk)
568 else 639 else
569 wputmodrm(m or 2, s, reg) -- ModRM. 640 wputmodrm(m or 2, s, reg) -- ModRM.
570 if (imark == "I" and (m == 1 or m == 2)) or 641 if (imark == "I" and (m == 1 or m == 2)) or
571 (m == nil and (vsreg or vreg)) then waction("MARK") end 642 (m == nil and (vsreg or vreg)) then waction("MARK") end
572 if vsreg then waction("VREG", vsreg); wputxb(2) end 643 wvreg("modrm.reg", vsreg, psz+1, sk, vreg)
573 if vreg then waction("VREG", vreg); wputxb(1) end 644 wvreg("modrm.rm.m", vreg, psz+1, sk)
574 end 645 end
575 646
576 -- Put displacement. 647 -- Put displacement.
@@ -881,9 +952,16 @@ end
881-- "m"/"M" generates ModRM/SIB from the 1st/2nd operand. 952-- "m"/"M" generates ModRM/SIB from the 1st/2nd operand.
882-- The spare 3 bits are either filled with the last hex digit or 953-- The spare 3 bits are either filled with the last hex digit or
883-- the result from a previous "r"/"R". The opcode is restored. 954-- the result from a previous "r"/"R". The opcode is restored.
955-- "u" Use VEX encoding, vvvv unused.
956-- "v"/"V" Use VEX encoding, vvvv from 1st/2nd operand (the operand is
957-- removed from the list used by future characters).
958-- "w" Use VEX encoding, vvvv from 3rd operand.
959-- "L" Force VEX.L
884-- 960--
885-- All of the following characters force a flush of the opcode: 961-- All of the following characters force a flush of the opcode:
886-- "o"/"O" stores a pure 32 bit disp (offset) from the 1st/2nd operand. 962-- "o"/"O" stores a pure 32 bit disp (offset) from the 1st/2nd operand.
963-- "s" stores a 4 bit immediate from the last register operand,
964-- followed by 4 zero bits.
887-- "S" stores a signed 8 bit immediate from the last operand. 965-- "S" stores a signed 8 bit immediate from the last operand.
888-- "U" stores an unsigned 8 bit immediate from the last operand. 966-- "U" stores an unsigned 8 bit immediate from the last operand.
889-- "W" stores an unsigned 16 bit immediate from the last operand. 967-- "W" stores an unsigned 16 bit immediate from the last operand.
@@ -1226,46 +1304,14 @@ local map_op = {
1226 movups_2 = "rmo:0F10rM|mro:0F11Rm", 1304 movups_2 = "rmo:0F10rM|mro:0F11Rm",
1227 orpd_2 = "rmo:660F56rM", 1305 orpd_2 = "rmo:660F56rM",
1228 orps_2 = "rmo:0F56rM", 1306 orps_2 = "rmo:0F56rM",
1229 packssdw_2 = "rmo:660F6BrM",
1230 packsswb_2 = "rmo:660F63rM",
1231 packuswb_2 = "rmo:660F67rM",
1232 paddb_2 = "rmo:660FFCrM",
1233 paddd_2 = "rmo:660FFErM",
1234 paddq_2 = "rmo:660FD4rM",
1235 paddsb_2 = "rmo:660FECrM",
1236 paddsw_2 = "rmo:660FEDrM",
1237 paddusb_2 = "rmo:660FDCrM",
1238 paddusw_2 = "rmo:660FDDrM",
1239 paddw_2 = "rmo:660FFDrM",
1240 pand_2 = "rmo:660FDBrM",
1241 pandn_2 = "rmo:660FDFrM",
1242 pause_0 = "F390", 1307 pause_0 = "F390",
1243 pavgb_2 = "rmo:660FE0rM",
1244 pavgw_2 = "rmo:660FE3rM",
1245 pcmpeqb_2 = "rmo:660F74rM",
1246 pcmpeqd_2 = "rmo:660F76rM",
1247 pcmpeqw_2 = "rmo:660F75rM",
1248 pcmpgtb_2 = "rmo:660F64rM",
1249 pcmpgtd_2 = "rmo:660F66rM",
1250 pcmpgtw_2 = "rmo:660F65rM",
1251 pextrw_3 = "rri/do:660FC5rMU|xri/wo:660F3A15nRmU", -- Mem op: SSE4.1 only. 1308 pextrw_3 = "rri/do:660FC5rMU|xri/wo:660F3A15nRmU", -- Mem op: SSE4.1 only.
1252 pinsrw_3 = "rri/od:660FC4rMU|rxi/ow:", 1309 pinsrw_3 = "rri/od:660FC4rMU|rxi/ow:",
1253 pmaddwd_2 = "rmo:660FF5rM",
1254 pmaxsw_2 = "rmo:660FEErM",
1255 pmaxub_2 = "rmo:660FDErM",
1256 pminsw_2 = "rmo:660FEArM",
1257 pminub_2 = "rmo:660FDArM",
1258 pmovmskb_2 = "rr/do:660FD7rM", 1310 pmovmskb_2 = "rr/do:660FD7rM",
1259 pmulhuw_2 = "rmo:660FE4rM",
1260 pmulhw_2 = "rmo:660FE5rM",
1261 pmullw_2 = "rmo:660FD5rM",
1262 pmuludq_2 = "rmo:660FF4rM",
1263 por_2 = "rmo:660FEBrM",
1264 prefetchnta_1 = "xb:n0F180m", 1311 prefetchnta_1 = "xb:n0F180m",
1265 prefetcht0_1 = "xb:n0F181m", 1312 prefetcht0_1 = "xb:n0F181m",
1266 prefetcht1_1 = "xb:n0F182m", 1313 prefetcht1_1 = "xb:n0F182m",
1267 prefetcht2_1 = "xb:n0F183m", 1314 prefetcht2_1 = "xb:n0F183m",
1268 psadbw_2 = "rmo:660FF6rM",
1269 pshufd_3 = "rmio:660F70rMU", 1315 pshufd_3 = "rmio:660F70rMU",
1270 pshufhw_3 = "rmio:F30F70rMU", 1316 pshufhw_3 = "rmio:F30F70rMU",
1271 pshuflw_3 = "rmio:F20F70rMU", 1317 pshuflw_3 = "rmio:F20F70rMU",
@@ -1279,23 +1325,6 @@ local map_op = {
1279 psrldq_2 = "rio:660F733mU", 1325 psrldq_2 = "rio:660F733mU",
1280 psrlq_2 = "rmo:660FD3rM|rio:660F732mU", 1326 psrlq_2 = "rmo:660FD3rM|rio:660F732mU",
1281 psrlw_2 = "rmo:660FD1rM|rio:660F712mU", 1327 psrlw_2 = "rmo:660FD1rM|rio:660F712mU",
1282 psubb_2 = "rmo:660FF8rM",
1283 psubd_2 = "rmo:660FFArM",
1284 psubq_2 = "rmo:660FFBrM",
1285 psubsb_2 = "rmo:660FE8rM",
1286 psubsw_2 = "rmo:660FE9rM",
1287 psubusb_2 = "rmo:660FD8rM",
1288 psubusw_2 = "rmo:660FD9rM",
1289 psubw_2 = "rmo:660FF9rM",
1290 punpckhbw_2 = "rmo:660F68rM",
1291 punpckhdq_2 = "rmo:660F6ArM",
1292 punpckhqdq_2 = "rmo:660F6DrM",
1293 punpckhwd_2 = "rmo:660F69rM",
1294 punpcklbw_2 = "rmo:660F60rM",
1295 punpckldq_2 = "rmo:660F62rM",
1296 punpcklqdq_2 = "rmo:660F6CrM",
1297 punpcklwd_2 = "rmo:660F61rM",
1298 pxor_2 = "rmo:660FEFrM",
1299 rcpps_2 = "rmo:0F53rM", 1328 rcpps_2 = "rmo:0F53rM",
1300 rcpss_2 = "rro:F30F53rM|rx/od:", 1329 rcpss_2 = "rro:F30F53rM|rx/od:",
1301 rsqrtps_2 = "rmo:0F52rM", 1330 rsqrtps_2 = "rmo:0F52rM",
@@ -1413,6 +1442,327 @@ local map_op = {
1413 movntsd_2 = "xr/qo:nF20F2BRm", 1442 movntsd_2 = "xr/qo:nF20F2BRm",
1414 movntss_2 = "xr/do:F30F2BRm", 1443 movntss_2 = "xr/do:F30F2BRm",
1415 -- popcnt is also in SSE4.2 1444 -- popcnt is also in SSE4.2
1445
1446 -- AES-NI
1447 aesdec_2 = "rmo:660F38DErM",
1448 aesdeclast_2 = "rmo:660F38DFrM",
1449 aesenc_2 = "rmo:660F38DCrM",
1450 aesenclast_2 = "rmo:660F38DDrM",
1451 aesimc_2 = "rmo:660F38DBrM",
1452 aeskeygenassist_3 = "rmio:660F3ADFrMU",
1453 pclmulqdq_3 = "rmio:660F3A44rMU",
1454
1455 -- AVX FP ops
1456 vaddsubpd_3 = "rrmoy:660FVD0rM",
1457 vaddsubps_3 = "rrmoy:F20FVD0rM",
1458 vandpd_3 = "rrmoy:660FV54rM",
1459 vandps_3 = "rrmoy:0FV54rM",
1460 vandnpd_3 = "rrmoy:660FV55rM",
1461 vandnps_3 = "rrmoy:0FV55rM",
1462 vblendpd_4 = "rrmioy:660F3AV0DrMU",
1463 vblendps_4 = "rrmioy:660F3AV0CrMU",
1464 vblendvpd_4 = "rrmroy:660F3AV4BrMs",
1465 vblendvps_4 = "rrmroy:660F3AV4ArMs",
1466 vbroadcastf128_2 = "rx/yo:660F38u1ArM",
1467 vcmppd_4 = "rrmioy:660FVC2rMU",
1468 vcmpps_4 = "rrmioy:0FVC2rMU",
1469 vcmpsd_4 = "rrrio:F20FVC2rMU|rrxi/ooq:",
1470 vcmpss_4 = "rrrio:F30FVC2rMU|rrxi/ood:",
1471 vcomisd_2 = "rro:660Fu2FrM|rx/oq:",
1472 vcomiss_2 = "rro:0Fu2FrM|rx/od:",
1473 vcvtdq2pd_2 = "rro:F30FuE6rM|rx/oq:|rm/yo:",
1474 vcvtdq2ps_2 = "rmoy:0Fu5BrM",
1475 vcvtpd2dq_2 = "rmoy:F20FuE6rM",
1476 vcvtpd2ps_2 = "rmoy:660Fu5ArM",
1477 vcvtps2dq_2 = "rmoy:660Fu5BrM",
1478 vcvtps2pd_2 = "rro:0Fu5ArM|rx/oq:|rm/yo:",
1479 vcvtsd2si_2 = "rr/do:F20Fu2DrM|rx/dq:|rr/qo:|rxq:",
1480 vcvtsd2ss_3 = "rrro:F20FV5ArM|rrx/ooq:",
1481 vcvtsi2sd_3 = "rrm/ood:F20FV2ArM|rrm/ooq:F20FVX2ArM",
1482 vcvtsi2ss_3 = "rrm/ood:F30FV2ArM|rrm/ooq:F30FVX2ArM",
1483 vcvtss2sd_3 = "rrro:F30FV5ArM|rrx/ood:",
1484 vcvtss2si_2 = "rr/do:F30Fu2DrM|rxd:|rr/qo:|rx/qd:",
1485 vcvttpd2dq_2 = "rmo:660FuE6rM|rm/oy:660FuLE6rM",
1486 vcvttps2dq_2 = "rmoy:F30Fu5BrM",
1487 vcvttsd2si_2 = "rr/do:F20Fu2CrM|rx/dq:|rr/qo:|rxq:",
1488 vcvttss2si_2 = "rr/do:F30Fu2CrM|rxd:|rr/qo:|rx/qd:",
1489 vdppd_4 = "rrmio:660F3AV41rMU",
1490 vdpps_4 = "rrmioy:660F3AV40rMU",
1491 vextractf128_3 = "mri/oy:660F3AuL19RmU",
1492 vextractps_3 = "mri/do:660F3Au17RmU",
1493 vhaddpd_3 = "rrmoy:660FV7CrM",
1494 vhaddps_3 = "rrmoy:F20FV7CrM",
1495 vhsubpd_3 = "rrmoy:660FV7DrM",
1496 vhsubps_3 = "rrmoy:F20FV7DrM",
1497 vinsertf128_4 = "rrmi/yyo:660F3AV18rMU",
1498 vinsertps_4 = "rrrio:660F3AV21rMU|rrxi/ood:",
1499 vldmxcsr_1 = "xd:0FuAE2m",
1500 vmaskmovps_3 = "rrxoy:660F38V2CrM|xrroy:660F38V2ERm",
1501 vmaskmovpd_3 = "rrxoy:660F38V2DrM|xrroy:660F38V2FRm",
1502 vmovapd_2 = "rmoy:660Fu28rM|mroy:660Fu29Rm",
1503 vmovaps_2 = "rmoy:0Fu28rM|mroy:0Fu29Rm",
1504 vmovd_2 = "rm/od:660Fu6ErM|rm/oq:660FuX6ErM|mr/do:660Fu7ERm|mr/qo:",
1505 vmovq_2 = "rro:F30Fu7ErM|rx/oq:|xr/qo:660FuD6Rm",
1506 vmovddup_2 = "rmy:F20Fu12rM|rro:|rx/oq:",
1507 vmovhlps_3 = "rrro:0FV12rM",
1508 vmovhpd_2 = "xr/qo:660Fu17Rm",
1509 vmovhpd_3 = "rrx/ooq:660FV16rM",
1510 vmovhps_2 = "xr/qo:0Fu17Rm",
1511 vmovhps_3 = "rrx/ooq:0FV16rM",
1512 vmovlhps_3 = "rrro:0FV16rM",
1513 vmovlpd_2 = "xr/qo:660Fu13Rm",
1514 vmovlpd_3 = "rrx/ooq:660FV12rM",
1515 vmovlps_2 = "xr/qo:0Fu13Rm",
1516 vmovlps_3 = "rrx/ooq:0FV12rM",
1517 vmovmskpd_2 = "rr/do:660Fu50rM|rr/dy:660FuL50rM",
1518 vmovmskps_2 = "rr/do:0Fu50rM|rr/dy:0FuL50rM",
1519 vmovntpd_2 = "xroy:660Fu2BRm",
1520 vmovntps_2 = "xroy:0Fu2BRm",
1521 vmovsd_2 = "rx/oq:F20Fu10rM|xr/qo:F20Fu11Rm",
1522 vmovsd_3 = "rrro:F20FV10rM",
1523 vmovshdup_2 = "rmoy:F30Fu16rM",
1524 vmovsldup_2 = "rmoy:F30Fu12rM",
1525 vmovss_2 = "rx/od:F30Fu10rM|xr/do:F30Fu11Rm",
1526 vmovss_3 = "rrro:F30FV10rM",
1527 vmovupd_2 = "rmoy:660Fu10rM|mroy:660Fu11Rm",
1528 vmovups_2 = "rmoy:0Fu10rM|mroy:0Fu11Rm",
1529 vorpd_3 = "rrmoy:660FV56rM",
1530 vorps_3 = "rrmoy:0FV56rM",
1531 vpermilpd_3 = "rrmoy:660F38V0DrM|rmioy:660F3Au05rMU",
1532 vpermilps_3 = "rrmoy:660F38V0CrM|rmioy:660F3Au04rMU",
1533 vperm2f128_4 = "rrmiy:660F3AV06rMU",
1534 vptestpd_2 = "rmoy:660F38u0FrM",
1535 vptestps_2 = "rmoy:660F38u0ErM",
1536 vrcpps_2 = "rmoy:0Fu53rM",
1537 vrcpss_3 = "rrro:F30FV53rM|rrx/ood:",
1538 vrsqrtps_2 = "rmoy:0Fu52rM",
1539 vrsqrtss_3 = "rrro:F30FV52rM|rrx/ood:",
1540 vroundpd_3 = "rmioy:660F3Au09rMU",
1541 vroundps_3 = "rmioy:660F3Au08rMU",
1542 vroundsd_4 = "rrrio:660F3AV0BrMU|rrxi/ooq:",
1543 vroundss_4 = "rrrio:660F3AV0ArMU|rrxi/ood:",
1544 vshufpd_4 = "rrmioy:660FVC6rMU",
1545 vshufps_4 = "rrmioy:0FVC6rMU",
1546 vsqrtps_2 = "rmoy:0Fu51rM",
1547 vsqrtss_2 = "rro:F30Fu51rM|rx/od:",
1548 vsqrtpd_2 = "rmoy:660Fu51rM",
1549 vsqrtsd_2 = "rro:F20Fu51rM|rx/oq:",
1550 vstmxcsr_1 = "xd:0FuAE3m",
1551 vucomisd_2 = "rro:660Fu2ErM|rx/oq:",
1552 vucomiss_2 = "rro:0Fu2ErM|rx/od:",
1553 vunpckhpd_3 = "rrmoy:660FV15rM",
1554 vunpckhps_3 = "rrmoy:0FV15rM",
1555 vunpcklpd_3 = "rrmoy:660FV14rM",
1556 vunpcklps_3 = "rrmoy:0FV14rM",
1557 vxorpd_3 = "rrmoy:660FV57rM",
1558 vxorps_3 = "rrmoy:0FV57rM",
1559 vzeroall_0 = "0FuL77",
1560 vzeroupper_0 = "0Fu77",
1561
1562 -- AVX2 FP ops
1563 vbroadcastss_2 = "rx/od:660F38u18rM|rx/yd:|rro:|rr/yo:",
1564 vbroadcastsd_2 = "rx/yq:660F38u19rM|rr/yo:",
1565 -- *vgather* (!vsib)
1566 vpermpd_3 = "rmiy:660F3AuX01rMU",
1567 vpermps_3 = "rrmy:660F38V16rM",
1568
1569 -- AVX, AVX2 integer ops
1570 -- In general, xmm requires AVX, ymm requires AVX2.
1571 vaesdec_3 = "rrmo:660F38VDErM",
1572 vaesdeclast_3 = "rrmo:660F38VDFrM",
1573 vaesenc_3 = "rrmo:660F38VDCrM",
1574 vaesenclast_3 = "rrmo:660F38VDDrM",
1575 vaesimc_2 = "rmo:660F38uDBrM",
1576 vaeskeygenassist_3 = "rmio:660F3AuDFrMU",
1577 vlddqu_2 = "rxoy:F20FuF0rM",
1578 vmaskmovdqu_2 = "rro:660FuF7rM",
1579 vmovdqa_2 = "rmoy:660Fu6FrM|mroy:660Fu7FRm",
1580 vmovdqu_2 = "rmoy:F30Fu6FrM|mroy:F30Fu7FRm",
1581 vmovntdq_2 = "xroy:660FuE7Rm",
1582 vmovntdqa_2 = "rxoy:660F38u2ArM",
1583 vmpsadbw_4 = "rrmioy:660F3AV42rMU",
1584 vpabsb_2 = "rmoy:660F38u1CrM",
1585 vpabsd_2 = "rmoy:660F38u1ErM",
1586 vpabsw_2 = "rmoy:660F38u1DrM",
1587 vpackusdw_3 = "rrmoy:660F38V2BrM",
1588 vpalignr_4 = "rrmioy:660F3AV0FrMU",
1589 vpblendvb_4 = "rrmroy:660F3AV4CrMs",
1590 vpblendw_4 = "rrmioy:660F3AV0ErMU",
1591 vpclmulqdq_4 = "rrmio:660F3AV44rMU",
1592 vpcmpeqq_3 = "rrmoy:660F38V29rM",
1593 vpcmpestri_3 = "rmio:660F3Au61rMU",
1594 vpcmpestrm_3 = "rmio:660F3Au60rMU",
1595 vpcmpgtq_3 = "rrmoy:660F38V37rM",
1596 vpcmpistri_3 = "rmio:660F3Au63rMU",
1597 vpcmpistrm_3 = "rmio:660F3Au62rMU",
1598 vpextrb_3 = "rri/do:660F3Au14nRmU|rri/qo:|xri/bo:",
1599 vpextrw_3 = "rri/do:660FuC5rMU|xri/wo:660F3Au15nRmU",
1600 vpextrd_3 = "mri/do:660F3Au16RmU",
1601 vpextrq_3 = "mri/qo:660F3Au16RmU",
1602 vphaddw_3 = "rrmoy:660F38V01rM",
1603 vphaddd_3 = "rrmoy:660F38V02rM",
1604 vphaddsw_3 = "rrmoy:660F38V03rM",
1605 vphminposuw_2 = "rmo:660F38u41rM",
1606 vphsubw_3 = "rrmoy:660F38V05rM",
1607 vphsubd_3 = "rrmoy:660F38V06rM",
1608 vphsubsw_3 = "rrmoy:660F38V07rM",
1609 vpinsrb_4 = "rrri/ood:660F3AV20rMU|rrxi/oob:",
1610 vpinsrw_4 = "rrri/ood:660FVC4rMU|rrxi/oow:",
1611 vpinsrd_4 = "rrmi/ood:660F3AV22rMU",
1612 vpinsrq_4 = "rrmi/ooq:660F3AVX22rMU",
1613 vpmaddubsw_3 = "rrmoy:660F38V04rM",
1614 vpmaxsb_3 = "rrmoy:660F38V3CrM",
1615 vpmaxsd_3 = "rrmoy:660F38V3DrM",
1616 vpmaxuw_3 = "rrmoy:660F38V3ErM",
1617 vpmaxud_3 = "rrmoy:660F38V3FrM",
1618 vpminsb_3 = "rrmoy:660F38V38rM",
1619 vpminsd_3 = "rrmoy:660F38V39rM",
1620 vpminuw_3 = "rrmoy:660F38V3ArM",
1621 vpminud_3 = "rrmoy:660F38V3BrM",
1622 vpmovmskb_2 = "rr/do:660FuD7rM|rr/dy:660FuLD7rM",
1623 vpmovsxbw_2 = "rroy:660F38u20rM|rx/oq:|rx/yo:",
1624 vpmovsxbd_2 = "rroy:660F38u21rM|rx/od:|rx/yq:",
1625 vpmovsxbq_2 = "rroy:660F38u22rM|rx/ow:|rx/yd:",
1626 vpmovsxwd_2 = "rroy:660F38u23rM|rx/oq:|rx/yo:",
1627 vpmovsxwq_2 = "rroy:660F38u24rM|rx/od:|rx/yq:",
1628 vpmovsxdq_2 = "rroy:660F38u25rM|rx/oq:|rx/yo:",
1629 vpmovzxbw_2 = "rroy:660F38u30rM|rx/oq:|rx/yo:",
1630 vpmovzxbd_2 = "rroy:660F38u31rM|rx/od:|rx/yq:",
1631 vpmovzxbq_2 = "rroy:660F38u32rM|rx/ow:|rx/yd:",
1632 vpmovzxwd_2 = "rroy:660F38u33rM|rx/oq:|rx/yo:",
1633 vpmovzxwq_2 = "rroy:660F38u34rM|rx/od:|rx/yq:",
1634 vpmovzxdq_2 = "rroy:660F38u35rM|rx/oq:|rx/yo:",
1635 vpmuldq_3 = "rrmoy:660F38V28rM",
1636 vpmulhrsw_3 = "rrmoy:660F38V0BrM",
1637 vpmulld_3 = "rrmoy:660F38V40rM",
1638 vpshufb_3 = "rrmoy:660F38V00rM",
1639 vpshufd_3 = "rmioy:660Fu70rMU",
1640 vpshufhw_3 = "rmioy:F30Fu70rMU",
1641 vpshuflw_3 = "rmioy:F20Fu70rMU",
1642 vpsignb_3 = "rrmoy:660F38V08rM",
1643 vpsignw_3 = "rrmoy:660F38V09rM",
1644 vpsignd_3 = "rrmoy:660F38V0ArM",
1645 vpslldq_3 = "rrioy:660Fv737mU",
1646 vpsllw_3 = "rrmoy:660FVF1rM|rrioy:660Fv716mU",
1647 vpslld_3 = "rrmoy:660FVF2rM|rrioy:660Fv726mU",
1648 vpsllq_3 = "rrmoy:660FVF3rM|rrioy:660Fv736mU",
1649 vpsraw_3 = "rrmoy:660FVE1rM|rrioy:660Fv714mU",
1650 vpsrad_3 = "rrmoy:660FVE2rM|rrioy:660Fv724mU",
1651 vpsrldq_3 = "rrioy:660Fv733mU",
1652 vpsrlw_3 = "rrmoy:660FVD1rM|rrioy:660Fv712mU",
1653 vpsrld_3 = "rrmoy:660FVD2rM|rrioy:660Fv722mU",
1654 vpsrlq_3 = "rrmoy:660FVD3rM|rrioy:660Fv732mU",
1655 vptest_2 = "rmoy:660F38u17rM",
1656
1657 -- AVX2 integer ops
1658 vbroadcasti128_2 = "rx/yo:660F38u5ArM",
1659 vinserti128_4 = "rrmi/yyo:660F3AV38rMU",
1660 vextracti128_3 = "mri/oy:660F3AuL39RmU",
1661 vpblendd_4 = "rrmioy:660F3AV02rMU",
1662 vpbroadcastb_2 = "rro:660F38u78rM|rx/ob:|rr/yo:|rx/yb:",
1663 vpbroadcastw_2 = "rro:660F38u79rM|rx/ow:|rr/yo:|rx/yw:",
1664 vpbroadcastd_2 = "rro:660F38u58rM|rx/od:|rr/yo:|rx/yd:",
1665 vpbroadcastq_2 = "rro:660F38u59rM|rx/oq:|rr/yo:|rx/yq:",
1666 vpermd_3 = "rrmy:660F38V36rM",
1667 vpermq_3 = "rmiy:660F3AuX00rMU",
1668 -- *vpgather* (!vsib)
1669 vperm2i128_4 = "rrmiy:660F3AV46rMU",
1670 vpmaskmovd_3 = "rrxoy:660F38V8CrM|xrroy:660F38V8ERm",
1671 vpmaskmovq_3 = "rrxoy:660F38VX8CrM|xrroy:660F38VX8ERm",
1672 vpsllvd_3 = "rrmoy:660F38V47rM",
1673 vpsllvq_3 = "rrmoy:660F38VX47rM",
1674 vpsravd_3 = "rrmoy:660F38V46rM",
1675 vpsrlvd_3 = "rrmoy:660F38V45rM",
1676 vpsrlvq_3 = "rrmoy:660F38VX45rM",
1677
1678 -- Intel ADX
1679 adcx_2 = "rmqd:660F38F6rM",
1680 adox_2 = "rmqd:F30F38F6rM",
1681
1682 -- BMI1
1683 andn_3 = "rrmqd:0F38VF2rM",
1684 bextr_3 = "rmrqd:0F38wF7rM",
1685 blsi_2 = "rmqd:0F38vF33m",
1686 blsmsk_2 = "rmqd:0F38vF32m",
1687 blsr_2 = "rmqd:0F38vF31m",
1688 tzcnt_2 = "rmqdw:F30FBCrM",
1689
1690 -- BMI2
1691 bzhi_3 = "rmrqd:0F38wF5rM",
1692 mulx_3 = "rrmqd:F20F38VF6rM",
1693 pdep_3 = "rrmqd:F20F38VF5rM",
1694 pext_3 = "rrmqd:F30F38VF5rM",
1695 rorx_3 = "rmSqd:F20F3AuF0rMS",
1696 sarx_3 = "rmrqd:F30F38wF7rM",
1697 shrx_3 = "rmrqd:F20F38wF7rM",
1698 shlx_3 = "rmrqd:660F38wF7rM",
1699
1700 -- FMA3
1701 vfmaddsub132pd_3 = "rrmoy:660F38VX96rM",
1702 vfmaddsub132ps_3 = "rrmoy:660F38V96rM",
1703 vfmaddsub213pd_3 = "rrmoy:660F38VXA6rM",
1704 vfmaddsub213ps_3 = "rrmoy:660F38VA6rM",
1705 vfmaddsub231pd_3 = "rrmoy:660F38VXB6rM",
1706 vfmaddsub231ps_3 = "rrmoy:660F38VB6rM",
1707
1708 vfmsubadd132pd_3 = "rrmoy:660F38VX97rM",
1709 vfmsubadd132ps_3 = "rrmoy:660F38V97rM",
1710 vfmsubadd213pd_3 = "rrmoy:660F38VXA7rM",
1711 vfmsubadd213ps_3 = "rrmoy:660F38VA7rM",
1712 vfmsubadd231pd_3 = "rrmoy:660F38VXB7rM",
1713 vfmsubadd231ps_3 = "rrmoy:660F38VB7rM",
1714
1715 vfmadd132pd_3 = "rrmoy:660F38VX98rM",
1716 vfmadd132ps_3 = "rrmoy:660F38V98rM",
1717 vfmadd132sd_3 = "rrro:660F38VX99rM|rrx/ooq:",
1718 vfmadd132ss_3 = "rrro:660F38V99rM|rrx/ood:",
1719 vfmadd213pd_3 = "rrmoy:660F38VXA8rM",
1720 vfmadd213ps_3 = "rrmoy:660F38VA8rM",
1721 vfmadd213sd_3 = "rrro:660F38VXA9rM|rrx/ooq:",
1722 vfmadd213ss_3 = "rrro:660F38VA9rM|rrx/ood:",
1723 vfmadd231pd_3 = "rrmoy:660F38VXB8rM",
1724 vfmadd231ps_3 = "rrmoy:660F38VB8rM",
1725 vfmadd231sd_3 = "rrro:660F38VXB9rM|rrx/ooq:",
1726 vfmadd231ss_3 = "rrro:660F38VB9rM|rrx/ood:",
1727
1728 vfmsub132pd_3 = "rrmoy:660F38VX9ArM",
1729 vfmsub132ps_3 = "rrmoy:660F38V9ArM",
1730 vfmsub132sd_3 = "rrro:660F38VX9BrM|rrx/ooq:",
1731 vfmsub132ss_3 = "rrro:660F38V9BrM|rrx/ood:",
1732 vfmsub213pd_3 = "rrmoy:660F38VXAArM",
1733 vfmsub213ps_3 = "rrmoy:660F38VAArM",
1734 vfmsub213sd_3 = "rrro:660F38VXABrM|rrx/ooq:",
1735 vfmsub213ss_3 = "rrro:660F38VABrM|rrx/ood:",
1736 vfmsub231pd_3 = "rrmoy:660F38VXBArM",
1737 vfmsub231ps_3 = "rrmoy:660F38VBArM",
1738 vfmsub231sd_3 = "rrro:660F38VXBBrM|rrx/ooq:",
1739 vfmsub231ss_3 = "rrro:660F38VBBrM|rrx/ood:",
1740
1741 vfnmadd132pd_3 = "rrmoy:660F38VX9CrM",
1742 vfnmadd132ps_3 = "rrmoy:660F38V9CrM",
1743 vfnmadd132sd_3 = "rrro:660F38VX9DrM|rrx/ooq:",
1744 vfnmadd132ss_3 = "rrro:660F38V9DrM|rrx/ood:",
1745 vfnmadd213pd_3 = "rrmoy:660F38VXACrM",
1746 vfnmadd213ps_3 = "rrmoy:660F38VACrM",
1747 vfnmadd213sd_3 = "rrro:660F38VXADrM|rrx/ooq:",
1748 vfnmadd213ss_3 = "rrro:660F38VADrM|rrx/ood:",
1749 vfnmadd231pd_3 = "rrmoy:660F38VXBCrM",
1750 vfnmadd231ps_3 = "rrmoy:660F38VBCrM",
1751 vfnmadd231sd_3 = "rrro:660F38VXBDrM|rrx/ooq:",
1752 vfnmadd231ss_3 = "rrro:660F38VBDrM|rrx/ood:",
1753
1754 vfnmsub132pd_3 = "rrmoy:660F38VX9ErM",
1755 vfnmsub132ps_3 = "rrmoy:660F38V9ErM",
1756 vfnmsub132sd_3 = "rrro:660F38VX9FrM|rrx/ooq:",
1757 vfnmsub132ss_3 = "rrro:660F38V9FrM|rrx/ood:",
1758 vfnmsub213pd_3 = "rrmoy:660F38VXAErM",
1759 vfnmsub213ps_3 = "rrmoy:660F38VAErM",
1760 vfnmsub213sd_3 = "rrro:660F38VXAFrM|rrx/ooq:",
1761 vfnmsub213ss_3 = "rrro:660F38VAFrM|rrx/ood:",
1762 vfnmsub231pd_3 = "rrmoy:660F38VXBErM",
1763 vfnmsub231ps_3 = "rrmoy:660F38VBErM",
1764 vfnmsub231sd_3 = "rrro:660F38VXBFrM|rrx/ooq:",
1765 vfnmsub231ss_3 = "rrro:660F38VBFrM|rrx/ood:",
1416} 1766}
1417 1767
1418------------------------------------------------------------------------------ 1768------------------------------------------------------------------------------
@@ -1463,28 +1813,58 @@ for cc,n in pairs{ b=0, e=1, be=2, u=3, nb=4, ne=5, nbe=6, nu=7 } do
1463 map_op["fcmov"..cc.."_2"] = format("Fff:%04XR", nc) -- P6+ 1813 map_op["fcmov"..cc.."_2"] = format("Fff:%04XR", nc) -- P6+
1464end 1814end
1465 1815
1466-- SSE FP arithmetic ops. 1816-- SSE / AVX FP arithmetic ops.
1467for name,n in pairs{ sqrt = 1, add = 8, mul = 9, 1817for name,n in pairs{ sqrt = 1, add = 8, mul = 9,
1468 sub = 12, min = 13, div = 14, max = 15 } do 1818 sub = 12, min = 13, div = 14, max = 15 } do
1469 map_op[name.."ps_2"] = format("rmo:0F5%XrM", n) 1819 map_op[name.."ps_2"] = format("rmo:0F5%XrM", n)
1470 map_op[name.."ss_2"] = format("rro:F30F5%XrM|rx/od:", n) 1820 map_op[name.."ss_2"] = format("rro:F30F5%XrM|rx/od:", n)
1471 map_op[name.."pd_2"] = format("rmo:660F5%XrM", n) 1821 map_op[name.."pd_2"] = format("rmo:660F5%XrM", n)
1472 map_op[name.."sd_2"] = format("rro:F20F5%XrM|rx/oq:", n) 1822 map_op[name.."sd_2"] = format("rro:F20F5%XrM|rx/oq:", n)
1823 if n ~= 1 then
1824 map_op["v"..name.."ps_3"] = format("rrmoy:0FV5%XrM", n)
1825 map_op["v"..name.."ss_3"] = format("rrro:F30FV5%XrM|rrx/ood:", n)
1826 map_op["v"..name.."pd_3"] = format("rrmoy:660FV5%XrM", n)
1827 map_op["v"..name.."sd_3"] = format("rrro:F20FV5%XrM|rrx/ooq:", n)
1828 end
1829end
1830
1831-- SSE2 / AVX / AVX2 integer arithmetic ops (66 0F leaf).
1832for name,n in pairs{
1833 paddb = 0xFC, paddw = 0xFD, paddd = 0xFE, paddq = 0xD4,
1834 paddsb = 0xEC, paddsw = 0xED, packssdw = 0x6B,
1835 packsswb = 0x63, packuswb = 0x67, paddusb = 0xDC,
1836 paddusw = 0xDD, pand = 0xDB, pandn = 0xDF, pavgb = 0xE0,
1837 pavgw = 0xE3, pcmpeqb = 0x74, pcmpeqd = 0x76,
1838 pcmpeqw = 0x75, pcmpgtb = 0x64, pcmpgtd = 0x66,
1839 pcmpgtw = 0x65, pmaddwd = 0xF5, pmaxsw = 0xEE,
1840 pmaxub = 0xDE, pminsw = 0xEA, pminub = 0xDA,
1841 pmulhuw = 0xE4, pmulhw = 0xE5, pmullw = 0xD5,
1842 pmuludq = 0xF4, por = 0xEB, psadbw = 0xF6, psubb = 0xF8,
1843 psubw = 0xF9, psubd = 0xFA, psubq = 0xFB, psubsb = 0xE8,
1844 psubsw = 0xE9, psubusb = 0xD8, psubusw = 0xD9,
1845 punpckhbw = 0x68, punpckhwd = 0x69, punpckhdq = 0x6A,
1846 punpckhqdq = 0x6D, punpcklbw = 0x60, punpcklwd = 0x61,
1847 punpckldq = 0x62, punpcklqdq = 0x6C, pxor = 0xEF
1848} do
1849 map_op[name.."_2"] = format("rmo:660F%02XrM", n)
1850 map_op["v"..name.."_3"] = format("rrmoy:660FV%02XrM", n)
1473end 1851end
1474 1852
1475------------------------------------------------------------------------------ 1853------------------------------------------------------------------------------
1476 1854
1855local map_vexarg = { u = false, v = 1, V = 2, w = 3 }
1856
1477-- Process pattern string. 1857-- Process pattern string.
1478local function dopattern(pat, args, sz, op, needrex) 1858local function dopattern(pat, args, sz, op, needrex)
1479 local digit, addin 1859 local digit, addin, vex
1480 local opcode = 0 1860 local opcode = 0
1481 local szov = sz 1861 local szov = sz
1482 local narg = 1 1862 local narg = 1
1483 local rex = 0 1863 local rex = 0
1484 1864
1485 -- Limit number of section buffer positions used by a single dasm_put(). 1865 -- Limit number of section buffer positions used by a single dasm_put().
1486 -- A single opcode needs a maximum of 5 positions. 1866 -- A single opcode needs a maximum of 6 positions.
1487 if secpos+5 > maxsecpos then wflush() end 1867 if secpos+6 > maxsecpos then wflush() end
1488 1868
1489 -- Process each character. 1869 -- Process each character.
1490 for c in gmatch(pat.."|", ".") do 1870 for c in gmatch(pat.."|", ".") do
@@ -1498,6 +1878,8 @@ local function dopattern(pat, args, sz, op, needrex)
1498 szov = nil 1878 szov = nil
1499 elseif c == "X" then -- Force REX.W. 1879 elseif c == "X" then -- Force REX.W.
1500 rex = 8 1880 rex = 8
1881 elseif c == "L" then -- Force VEX.L.
1882 vex.l = true
1501 elseif c == "r" then -- Merge 1st operand regno. into opcode. 1883 elseif c == "r" then -- Merge 1st operand regno. into opcode.
1502 addin = args[1]; opcode = opcode + (addin.reg % 8) 1884 addin = args[1]; opcode = opcode + (addin.reg % 8)
1503 if narg < 2 then narg = 2 end 1885 if narg < 2 then narg = 2 end
@@ -1521,21 +1903,42 @@ local function dopattern(pat, args, sz, op, needrex)
1521 if t.xreg and t.xreg > 7 then rex = rex + 2 end 1903 if t.xreg and t.xreg > 7 then rex = rex + 2 end
1522 if s > 7 then rex = rex + 4 end 1904 if s > 7 then rex = rex + 4 end
1523 if needrex then rex = rex + 16 end 1905 if needrex then rex = rex + 16 end
1524 wputop(szov, opcode, rex); opcode = nil 1906 local psz, sk = wputop(szov, opcode, rex, vex, s < 0, t.vreg or t.vxreg)
1907 opcode = nil
1525 local imark = sub(pat, -1) -- Force a mark (ugly). 1908 local imark = sub(pat, -1) -- Force a mark (ugly).
1526 -- Put ModRM/SIB with regno/last digit as spare. 1909 -- Put ModRM/SIB with regno/last digit as spare.
1527 wputmrmsib(t, imark, s, addin and addin.vreg) 1910 wputmrmsib(t, imark, s, addin and addin.vreg, psz, sk)
1528 addin = nil 1911 addin = nil
1912 elseif map_vexarg[c] ~= nil then -- Encode using VEX prefix
1913 local b = band(opcode, 255); opcode = shr(opcode, 8)
1914 local m = 1
1915 if b == 0x38 then m = 2
1916 elseif b == 0x3a then m = 3 end
1917 if m ~= 1 then b = band(opcode, 255); opcode = shr(opcode, 8) end
1918 if b ~= 0x0f then
1919 werror("expected `0F', `0F38', or `0F3A' to precede `"..c..
1920 "' in pattern `"..pat.."' for `"..op.."'")
1921 end
1922 local v = map_vexarg[c]
1923 if v then v = remove(args, v) end
1924 b = band(opcode, 255)
1925 local p = 0
1926 if b == 0x66 then p = 1
1927 elseif b == 0xf3 then p = 2
1928 elseif b == 0xf2 then p = 3 end
1929 if p ~= 0 then opcode = shr(opcode, 8) end
1930 if opcode ~= 0 then wputop(nil, opcode, 0); opcode = 0 end
1931 vex = { m = m, p = p, v = v }
1529 else 1932 else
1530 if opcode then -- Flush opcode. 1933 if opcode then -- Flush opcode.
1531 if szov == "q" and rex == 0 then rex = rex + 8 end 1934 if szov == "q" and rex == 0 then rex = rex + 8 end
1532 if needrex then rex = rex + 16 end 1935 if needrex then rex = rex + 16 end
1533 if addin and addin.reg == -1 then 1936 if addin and addin.reg == -1 then
1534 wputop(szov, opcode - 7, rex) 1937 local psz, sk = wputop(szov, opcode - 7, rex, vex, true)
1535 waction("VREG", addin.vreg); wputxb(0) 1938 wvreg("opcode", addin.vreg, psz, sk)
1536 else 1939 else
1537 if addin and addin.reg > 7 then rex = rex + 1 end 1940 if addin and addin.reg > 7 then rex = rex + 1 end
1538 wputop(szov, opcode, rex) 1941 wputop(szov, opcode, rex, vex)
1539 end 1942 end
1540 opcode = nil 1943 opcode = nil
1541 end 1944 end
@@ -1572,6 +1975,14 @@ local function dopattern(pat, args, sz, op, needrex)
1572 else 1975 else
1573 wputlabel("REL_", imm, 2) 1976 wputlabel("REL_", imm, 2)
1574 end 1977 end
1978 elseif c == "s" then
1979 local reg = a.reg
1980 if reg < 0 then
1981 wputb(0)
1982 wvreg("imm.hi", a.vreg)
1983 else
1984 wputb(shl(reg, 4))
1985 end
1575 else 1986 else
1576 werror("bad char `"..c.."' in pattern `"..pat.."' for `"..op.."'") 1987 werror("bad char `"..c.."' in pattern `"..pat.."' for `"..op.."'")
1577 end 1988 end
@@ -1648,11 +2059,14 @@ map_op[".template__"] = function(params, template, nparams)
1648 if pat == "" then pat = lastpat else lastpat = pat end 2059 if pat == "" then pat = lastpat else lastpat = pat end
1649 if matchtm(tm, args) then 2060 if matchtm(tm, args) then
1650 local prefix = sub(szm, 1, 1) 2061 local prefix = sub(szm, 1, 1)
1651 if prefix == "/" then -- Match both operand sizes. 2062 if prefix == "/" then -- Exactly match leading operand sizes.
1652 if args[1].opsize == sub(szm, 2, 2) and 2063 for i = #szm,1,-1 do
1653 args[2].opsize == sub(szm, 3, 3) then 2064 if i == 1 then
1654 dopattern(pat, args, sz, params.op, needrex) -- Process pattern. 2065 dopattern(pat, args, sz, params.op, needrex) -- Process pattern.
1655 return 2066 return
2067 elseif args[i-1].opsize ~= sub(szm, i, i) then
2068 break
2069 end
1656 end 2070 end
1657 else -- Match common operand size. 2071 else -- Match common operand size.
1658 local szp = sz 2072 local szp = sz
@@ -1717,8 +2131,8 @@ if x64 then
1717 rex = a.reg > 7 and 9 or 8 2131 rex = a.reg > 7 and 9 or 8
1718 end 2132 end
1719 end 2133 end
1720 wputop(sz, opcode, rex) 2134 local psz, sk = wputop(sz, opcode, rex, nil, vreg)
1721 if vreg then waction("VREG", vreg); wputxb(0) end 2135 wvreg("opcode", vreg, psz, sk)
1722 waction("IMM_D", format("(unsigned int)(%s)", op64)) 2136 waction("IMM_D", format("(unsigned int)(%s)", op64))
1723 waction("IMM_D", format("(unsigned int)((%s)>>32)", op64)) 2137 waction("IMM_D", format("(unsigned int)((%s)>>32)", op64))
1724 end 2138 end