aboutsummaryrefslogtreecommitdiff
path: root/dynasm/dasm_x86.lua
diff options
context:
space:
mode:
Diffstat (limited to 'dynasm/dasm_x86.lua')
-rw-r--r--dynasm/dasm_x86.lua648
1 files changed, 545 insertions, 103 deletions
diff --git a/dynasm/dasm_x86.lua b/dynasm/dasm_x86.lua
index 24c07f37..b442cd0d 100644
--- a/dynasm/dasm_x86.lua
+++ b/dynasm/dasm_x86.lua
@@ -11,9 +11,9 @@ local x64 = x64
11local _info = { 11local _info = {
12 arch = x64 and "x64" or "x86", 12 arch = x64 and "x64" or "x86",
13 description = "DynASM x86/x64 module", 13 description = "DynASM x86/x64 module",
14 version = "1.3.0", 14 version = "1.5.0",
15 vernum = 10300, 15 vernum = 10500,
16 release = "2011-05-05", 16 release = "2021-05-02",
17 author = "Mike Pall", 17 author = "Mike Pall",
18 license = "MIT", 18 license = "MIT",
19} 19}
@@ -27,9 +27,9 @@ local assert, unpack, setmetatable = assert, unpack or table.unpack, setmetatabl
27local _s = string 27local _s = string
28local sub, format, byte, char = _s.sub, _s.format, _s.byte, _s.char 28local sub, format, byte, char = _s.sub, _s.format, _s.byte, _s.char
29local find, match, gmatch, gsub = _s.find, _s.match, _s.gmatch, _s.gsub 29local find, match, gmatch, gsub = _s.find, _s.match, _s.gmatch, _s.gsub
30local concat, sort = table.concat, table.sort 30local concat, sort, remove = table.concat, table.sort, table.remove
31local bit = bit or require("bit") 31local bit = bit or require("bit")
32local band, shl, shr = bit.band, bit.lshift, bit.rshift 32local band, bxor, shl, shr = bit.band, bit.bxor, bit.lshift, bit.rshift
33 33
34-- Inherited tables and callbacks. 34-- Inherited tables and callbacks.
35local g_opt, g_arch 35local g_opt, g_arch
@@ -41,7 +41,7 @@ local action_names = {
41 -- int arg, 1 buffer pos: 41 -- int arg, 1 buffer pos:
42 "DISP", "IMM_S", "IMM_B", "IMM_W", "IMM_D", "IMM_WB", "IMM_DB", 42 "DISP", "IMM_S", "IMM_B", "IMM_W", "IMM_D", "IMM_WB", "IMM_DB",
43 -- action arg (1 byte), int arg, 1 buffer pos (reg/num): 43 -- action arg (1 byte), int arg, 1 buffer pos (reg/num):
44 "VREG", "SPACE", -- !x64: VREG support NYI. 44 "VREG", "SPACE",
45 -- ptrdiff_t arg, 1 buffer pos (address): !x64 45 -- ptrdiff_t arg, 1 buffer pos (address): !x64
46 "SETLABEL", "REL_A", 46 "SETLABEL", "REL_A",
47 -- action arg (1 byte) or int arg, 2 buffer pos (link, offset): 47 -- action arg (1 byte) or int arg, 2 buffer pos (link, offset):
@@ -83,6 +83,21 @@ local actargs = { 0 }
83-- Current number of section buffer positions for dasm_put(). 83-- Current number of section buffer positions for dasm_put().
84local secpos = 1 84local secpos = 1
85 85
86-- VREG kind encodings, pre-shifted by 5 bits.
87local map_vreg = {
88 ["modrm.rm.m"] = 0x00,
89 ["modrm.rm.r"] = 0x20,
90 ["opcode"] = 0x20,
91 ["sib.base"] = 0x20,
92 ["sib.index"] = 0x40,
93 ["modrm.reg"] = 0x80,
94 ["vex.v"] = 0xa0,
95 ["imm.hi"] = 0xc0,
96}
97
98-- Current number of VREG actions contributing to REX/VEX shrinkage.
99local vreg_shrink_count = 0
100
86------------------------------------------------------------------------------ 101------------------------------------------------------------------------------
87 102
88-- Compute action numbers for action names. 103-- Compute action numbers for action names.
@@ -134,6 +149,21 @@ local function waction(action, a, num)
134 if a or num then secpos = secpos + (num or 1) end 149 if a or num then secpos = secpos + (num or 1) end
135end 150end
136 151
152-- Optionally add a VREG action.
153local function wvreg(kind, vreg, psz, sk, defer)
154 if not vreg then return end
155 waction("VREG", vreg)
156 local b = assert(map_vreg[kind], "bad vreg kind `"..vreg.."'")
157 if b < (sk or 0) then
158 vreg_shrink_count = vreg_shrink_count + 1
159 end
160 if not defer then
161 b = b + vreg_shrink_count * 8
162 vreg_shrink_count = 0
163 end
164 wputxb(b + (psz or 0))
165end
166
137-- Add call to embedded DynASM C code. 167-- Add call to embedded DynASM C code.
138local function wcall(func, args) 168local function wcall(func, args)
139 wline(format("dasm_%s(Dst, %s);", func, concat(args, ", ")), true) 169 wline(format("dasm_%s(Dst, %s);", func, concat(args, ", ")), true)
@@ -299,7 +329,7 @@ local function mkrmap(sz, cl, names)
299 local iname = format("@%s%x%s", sz, i, needrex and "R" or "") 329 local iname = format("@%s%x%s", sz, i, needrex and "R" or "")
300 if needrex then map_reg_needrex[iname] = true end 330 if needrex then map_reg_needrex[iname] = true end
301 local name 331 local name
302 if sz == "o" then name = format("xmm%d", i) 332 if sz == "o" or sz == "y" then name = format("%s%d", cl, i)
303 elseif sz == "f" then name = format("st%d", i) 333 elseif sz == "f" then name = format("st%d", i)
304 else name = format("r%d%s", i, sz == addrsize and "" or sz) end 334 else name = format("r%d%s", i, sz == addrsize and "" or sz) end
305 map_archdef[name] = iname 335 map_archdef[name] = iname
@@ -326,6 +356,7 @@ mkrmap("w", "Rw", {"ax", "cx", "dx", "bx", "sp", "bp", "si", "di"})
326mkrmap("b", "Rb", {"al", "cl", "dl", "bl", "ah", "ch", "dh", "bh"}) 356mkrmap("b", "Rb", {"al", "cl", "dl", "bl", "ah", "ch", "dh", "bh"})
327map_reg_valid_index[map_archdef.esp] = false 357map_reg_valid_index[map_archdef.esp] = false
328if x64 then map_reg_valid_index[map_archdef.rsp] = false end 358if x64 then map_reg_valid_index[map_archdef.rsp] = false end
359if x64 then map_reg_needrex[map_archdef.Rb] = true end
329map_archdef["Ra"] = "@"..addrsize 360map_archdef["Ra"] = "@"..addrsize
330 361
331-- FP registers (internally tword sized, but use "f" as operand size). 362-- FP registers (internally tword sized, but use "f" as operand size).
@@ -334,21 +365,24 @@ mkrmap("f", "Rf")
334-- SSE registers (oword sized, but qword and dword accessible). 365-- SSE registers (oword sized, but qword and dword accessible).
335mkrmap("o", "xmm") 366mkrmap("o", "xmm")
336 367
368-- AVX registers (yword sized, but oword, qword and dword accessible).
369mkrmap("y", "ymm")
370
337-- Operand size prefixes to codes. 371-- Operand size prefixes to codes.
338local map_opsize = { 372local map_opsize = {
339 byte = "b", word = "w", dword = "d", qword = "q", oword = "o", tword = "t", 373 byte = "b", word = "w", dword = "d", qword = "q", oword = "o", yword = "y",
340 aword = addrsize, 374 tword = "t", aword = addrsize,
341} 375}
342 376
343-- Operand size code to number. 377-- Operand size code to number.
344local map_opsizenum = { 378local map_opsizenum = {
345 b = 1, w = 2, d = 4, q = 8, o = 16, t = 10, 379 b = 1, w = 2, d = 4, q = 8, o = 16, y = 32, t = 10,
346} 380}
347 381
348-- Operand size code to name. 382-- Operand size code to name.
349local map_opsizename = { 383local map_opsizename = {
350 b = "byte", w = "word", d = "dword", q = "qword", o = "oword", t = "tword", 384 b = "byte", w = "word", d = "dword", q = "qword", o = "oword", y = "yword",
351 f = "fpword", 385 t = "tword", f = "fpword",
352} 386}
353 387
354-- Valid index register scale factors. 388-- Valid index register scale factors.
@@ -450,6 +484,22 @@ local function wputdarg(n)
450 end 484 end
451end 485end
452 486
487-- Put signed or unsigned qword or arg.
488local function wputqarg(n)
489 local tn = type(n)
490 if tn == "number" then -- This is only used for numbers from -2^31..2^32-1.
491 wputb(band(n, 255))
492 wputb(band(shr(n, 8), 255))
493 wputb(band(shr(n, 16), 255))
494 wputb(shr(n, 24))
495 local sign = n < 0 and 255 or 0
496 wputb(sign); wputb(sign); wputb(sign); wputb(sign)
497 else
498 waction("IMM_D", format("(unsigned int)(%s)", n))
499 waction("IMM_D", format("(unsigned int)((unsigned long long)(%s)>>32)", n))
500 end
501end
502
453-- Put operand-size dependent number or arg (defaults to dword). 503-- Put operand-size dependent number or arg (defaults to dword).
454local function wputszarg(sz, n) 504local function wputszarg(sz, n)
455 if not sz or sz == "d" or sz == "q" then wputdarg(n) 505 if not sz or sz == "d" or sz == "q" then wputdarg(n)
@@ -460,9 +510,45 @@ local function wputszarg(sz, n)
460end 510end
461 511
462-- Put multi-byte opcode with operand-size dependent modifications. 512-- Put multi-byte opcode with operand-size dependent modifications.
463local function wputop(sz, op, rex) 513local function wputop(sz, op, rex, vex, vregr, vregxb)
514 local psz, sk = 0, nil
515 if vex then
516 local tail
517 if vex.m == 1 and band(rex, 11) == 0 then
518 if x64 and vregxb then
519 sk = map_vreg["modrm.reg"]
520 else
521 wputb(0xc5)
522 tail = shl(bxor(band(rex, 4), 4), 5)
523 psz = 3
524 end
525 end
526 if not tail then
527 wputb(0xc4)
528 wputb(shl(bxor(band(rex, 7), 7), 5) + vex.m)
529 tail = shl(band(rex, 8), 4)
530 psz = 4
531 end
532 local reg, vreg = 0, nil
533 if vex.v then
534 reg = vex.v.reg
535 if not reg then werror("bad vex operand") end
536 if reg < 0 then reg = 0; vreg = vex.v.vreg end
537 end
538 if sz == "y" or vex.l then tail = tail + 4 end
539 wputb(tail + shl(bxor(reg, 15), 3) + vex.p)
540 wvreg("vex.v", vreg)
541 rex = 0
542 if op >= 256 then werror("bad vex opcode") end
543 else
544 if rex ~= 0 then
545 if not x64 then werror("bad operand size") end
546 elseif (vregr or vregxb) and x64 then
547 rex = 0x10
548 sk = map_vreg["vex.v"]
549 end
550 end
464 local r 551 local r
465 if rex ~= 0 and not x64 then werror("bad operand size") end
466 if sz == "w" then wputb(102) end 552 if sz == "w" then wputb(102) end
467 -- Needs >32 bit numbers, but only for crc32 eax, word [ebx] 553 -- Needs >32 bit numbers, but only for crc32 eax, word [ebx]
468 if op >= 4294967296 then r = op%4294967296 wputb((op-r)/4294967296) op = r end 554 if op >= 4294967296 then r = op%4294967296 wputb((op-r)/4294967296) op = r end
@@ -471,20 +557,20 @@ local function wputop(sz, op, rex)
471 if rex ~= 0 then 557 if rex ~= 0 then
472 local opc3 = band(op, 0xffff00) 558 local opc3 = band(op, 0xffff00)
473 if opc3 == 0x0f3a00 or opc3 == 0x0f3800 then 559 if opc3 == 0x0f3a00 or opc3 == 0x0f3800 then
474 wputb(64 + band(rex, 15)); rex = 0 560 wputb(64 + band(rex, 15)); rex = 0; psz = 2
475 end 561 end
476 end 562 end
477 wputb(shr(op, 16)); op = band(op, 0xffff) 563 wputb(shr(op, 16)); op = band(op, 0xffff); psz = psz + 1
478 end 564 end
479 if op >= 256 then 565 if op >= 256 then
480 local b = shr(op, 8) 566 local b = shr(op, 8)
481 if b == 15 and rex ~= 0 then wputb(64 + band(rex, 15)); rex = 0 end 567 if b == 15 and rex ~= 0 then wputb(64 + band(rex, 15)); rex = 0; psz = 2 end
482 wputb(b) 568 wputb(b); op = band(op, 255); psz = psz + 1
483 op = band(op, 255)
484 end 569 end
485 if rex ~= 0 then wputb(64 + band(rex, 15)) end 570 if rex ~= 0 then wputb(64 + band(rex, 15)); psz = 2 end
486 if sz == "b" then op = op - 1 end 571 if sz == "b" then op = op - 1 end
487 wputb(op) 572 wputb(op)
573 return psz, sk
488end 574end
489 575
490-- Put ModRM or SIB formatted byte. 576-- Put ModRM or SIB formatted byte.
@@ -494,7 +580,7 @@ local function wputmodrm(m, s, rm, vs, vrm)
494end 580end
495 581
496-- Put ModRM/SIB plus optional displacement. 582-- Put ModRM/SIB plus optional displacement.
497local function wputmrmsib(t, imark, s, vsreg) 583local function wputmrmsib(t, imark, s, vsreg, psz, sk)
498 local vreg, vxreg 584 local vreg, vxreg
499 local reg, xreg = t.reg, t.xreg 585 local reg, xreg = t.reg, t.xreg
500 if reg and reg < 0 then reg = 0; vreg = t.vreg end 586 if reg and reg < 0 then reg = 0; vreg = t.vreg end
@@ -504,8 +590,8 @@ local function wputmrmsib(t, imark, s, vsreg)
504 -- Register mode. 590 -- Register mode.
505 if sub(t.mode, 1, 1) == "r" then 591 if sub(t.mode, 1, 1) == "r" then
506 wputmodrm(3, s, reg) 592 wputmodrm(3, s, reg)
507 if vsreg then waction("VREG", vsreg); wputxb(2) end 593 wvreg("modrm.reg", vsreg, psz+1, sk, vreg)
508 if vreg then waction("VREG", vreg); wputxb(0) end 594 wvreg("modrm.rm.r", vreg, psz+1, sk)
509 return 595 return
510 end 596 end
511 597
@@ -519,21 +605,22 @@ local function wputmrmsib(t, imark, s, vsreg)
519 -- [xreg*xsc+disp] -> (0, s, esp) (xsc, xreg, ebp) 605 -- [xreg*xsc+disp] -> (0, s, esp) (xsc, xreg, ebp)
520 wputmodrm(0, s, 4) 606 wputmodrm(0, s, 4)
521 if imark == "I" then waction("MARK") end 607 if imark == "I" then waction("MARK") end
522 if vsreg then waction("VREG", vsreg); wputxb(2) end 608 wvreg("modrm.reg", vsreg, psz+1, sk, vxreg)
523 wputmodrm(t.xsc, xreg, 5) 609 wputmodrm(t.xsc, xreg, 5)
524 if vxreg then waction("VREG", vxreg); wputxb(3) end 610 wvreg("sib.index", vxreg, psz+2, sk)
525 else 611 else
526 -- Pure 32 bit displacement. 612 -- Pure 32 bit displacement.
527 if x64 and tdisp ~= "table" then 613 if x64 and tdisp ~= "table" then
528 wputmodrm(0, s, 4) -- [disp] -> (0, s, esp) (0, esp, ebp) 614 wputmodrm(0, s, 4) -- [disp] -> (0, s, esp) (0, esp, ebp)
615 wvreg("modrm.reg", vsreg, psz+1, sk)
529 if imark == "I" then waction("MARK") end 616 if imark == "I" then waction("MARK") end
530 wputmodrm(0, 4, 5) 617 wputmodrm(0, 4, 5)
531 else 618 else
532 riprel = x64 619 riprel = x64
533 wputmodrm(0, s, 5) -- [disp|rip-label] -> (0, s, ebp) 620 wputmodrm(0, s, 5) -- [disp|rip-label] -> (0, s, ebp)
621 wvreg("modrm.reg", vsreg, psz+1, sk)
534 if imark == "I" then waction("MARK") end 622 if imark == "I" then waction("MARK") end
535 end 623 end
536 if vsreg then waction("VREG", vsreg); wputxb(2) end
537 end 624 end
538 if riprel then -- Emit rip-relative displacement. 625 if riprel then -- Emit rip-relative displacement.
539 if match("UWSiI", imark) then 626 if match("UWSiI", imark) then
@@ -561,16 +648,16 @@ local function wputmrmsib(t, imark, s, vsreg)
561 if xreg or band(reg, 7) == 4 then 648 if xreg or band(reg, 7) == 4 then
562 wputmodrm(m or 2, s, 4) -- ModRM. 649 wputmodrm(m or 2, s, 4) -- ModRM.
563 if m == nil or imark == "I" then waction("MARK") end 650 if m == nil or imark == "I" then waction("MARK") end
564 if vsreg then waction("VREG", vsreg); wputxb(2) end 651 wvreg("modrm.reg", vsreg, psz+1, sk, vxreg or vreg)
565 wputmodrm(t.xsc or 0, xreg or 4, reg) -- SIB. 652 wputmodrm(t.xsc or 0, xreg or 4, reg) -- SIB.
566 if vxreg then waction("VREG", vxreg); wputxb(3) end 653 wvreg("sib.index", vxreg, psz+2, sk, vreg)
567 if vreg then waction("VREG", vreg); wputxb(1) end 654 wvreg("sib.base", vreg, psz+2, sk)
568 else 655 else
569 wputmodrm(m or 2, s, reg) -- ModRM. 656 wputmodrm(m or 2, s, reg) -- ModRM.
570 if (imark == "I" and (m == 1 or m == 2)) or 657 if (imark == "I" and (m == 1 or m == 2)) or
571 (m == nil and (vsreg or vreg)) then waction("MARK") end 658 (m == nil and (vsreg or vreg)) then waction("MARK") end
572 if vsreg then waction("VREG", vsreg); wputxb(2) end 659 wvreg("modrm.reg", vsreg, psz+1, sk, vreg)
573 if vreg then waction("VREG", vreg); wputxb(1) end 660 wvreg("modrm.rm.m", vreg, psz+1, sk)
574 end 661 end
575 662
576 -- Put displacement. 663 -- Put displacement.
@@ -592,10 +679,16 @@ local function opmodestr(op, args)
592end 679end
593 680
594-- Convert number to valid integer or nil. 681-- Convert number to valid integer or nil.
595local function toint(expr) 682local function toint(expr, isqword)
596 local n = tonumber(expr) 683 local n = tonumber(expr)
597 if n then 684 if n then
598 if n % 1 ~= 0 or n < -2147483648 or n > 4294967295 then 685 if n % 1 ~= 0 then
686 werror("not an integer number `"..expr.."'")
687 elseif isqword then
688 if n < -2147483648 or n > 2147483647 then
689 n = nil -- Handle it as an expression to avoid precision loss.
690 end
691 elseif n < -2147483648 or n > 4294967295 then
599 werror("bad integer number `"..expr.."'") 692 werror("bad integer number `"..expr.."'")
600 end 693 end
601 return n 694 return n
@@ -678,7 +771,7 @@ local function rtexpr(expr)
678end 771end
679 772
680-- Parse operand and return { mode, opsize, reg, xreg, xsc, disp, imm }. 773-- Parse operand and return { mode, opsize, reg, xreg, xsc, disp, imm }.
681local function parseoperand(param) 774local function parseoperand(param, isqword)
682 local t = {} 775 local t = {}
683 776
684 local expr = param 777 local expr = param
@@ -766,7 +859,7 @@ local function parseoperand(param)
766 t.disp = dispexpr(tailx) 859 t.disp = dispexpr(tailx)
767 else 860 else
768 -- imm or opsize*imm 861 -- imm or opsize*imm
769 local imm = toint(expr) 862 local imm = toint(expr, isqword)
770 if not imm and sub(expr, 1, 1) == "*" and t.opsize then 863 if not imm and sub(expr, 1, 1) == "*" and t.opsize then
771 imm = toint(sub(expr, 2)) 864 imm = toint(sub(expr, 2))
772 if imm then 865 if imm then
@@ -881,9 +974,16 @@ end
881-- "m"/"M" generates ModRM/SIB from the 1st/2nd operand. 974-- "m"/"M" generates ModRM/SIB from the 1st/2nd operand.
882-- The spare 3 bits are either filled with the last hex digit or 975-- The spare 3 bits are either filled with the last hex digit or
883-- the result from a previous "r"/"R". The opcode is restored. 976-- the result from a previous "r"/"R". The opcode is restored.
977-- "u" Use VEX encoding, vvvv unused.
978-- "v"/"V" Use VEX encoding, vvvv from 1st/2nd operand (the operand is
979-- removed from the list used by future characters).
980-- "w" Use VEX encoding, vvvv from 3rd operand.
981-- "L" Force VEX.L
884-- 982--
885-- All of the following characters force a flush of the opcode: 983-- All of the following characters force a flush of the opcode:
886-- "o"/"O" stores a pure 32 bit disp (offset) from the 1st/2nd operand. 984-- "o"/"O" stores a pure 32 bit disp (offset) from the 1st/2nd operand.
985-- "s" stores a 4 bit immediate from the last register operand,
986-- followed by 4 zero bits.
887-- "S" stores a signed 8 bit immediate from the last operand. 987-- "S" stores a signed 8 bit immediate from the last operand.
888-- "U" stores an unsigned 8 bit immediate from the last operand. 988-- "U" stores an unsigned 8 bit immediate from the last operand.
889-- "W" stores an unsigned 16 bit immediate from the last operand. 989-- "W" stores an unsigned 16 bit immediate from the last operand.
@@ -1226,46 +1326,14 @@ local map_op = {
1226 movups_2 = "rmo:0F10rM|mro:0F11Rm", 1326 movups_2 = "rmo:0F10rM|mro:0F11Rm",
1227 orpd_2 = "rmo:660F56rM", 1327 orpd_2 = "rmo:660F56rM",
1228 orps_2 = "rmo:0F56rM", 1328 orps_2 = "rmo:0F56rM",
1229 packssdw_2 = "rmo:660F6BrM",
1230 packsswb_2 = "rmo:660F63rM",
1231 packuswb_2 = "rmo:660F67rM",
1232 paddb_2 = "rmo:660FFCrM",
1233 paddd_2 = "rmo:660FFErM",
1234 paddq_2 = "rmo:660FD4rM",
1235 paddsb_2 = "rmo:660FECrM",
1236 paddsw_2 = "rmo:660FEDrM",
1237 paddusb_2 = "rmo:660FDCrM",
1238 paddusw_2 = "rmo:660FDDrM",
1239 paddw_2 = "rmo:660FFDrM",
1240 pand_2 = "rmo:660FDBrM",
1241 pandn_2 = "rmo:660FDFrM",
1242 pause_0 = "F390", 1329 pause_0 = "F390",
1243 pavgb_2 = "rmo:660FE0rM",
1244 pavgw_2 = "rmo:660FE3rM",
1245 pcmpeqb_2 = "rmo:660F74rM",
1246 pcmpeqd_2 = "rmo:660F76rM",
1247 pcmpeqw_2 = "rmo:660F75rM",
1248 pcmpgtb_2 = "rmo:660F64rM",
1249 pcmpgtd_2 = "rmo:660F66rM",
1250 pcmpgtw_2 = "rmo:660F65rM",
1251 pextrw_3 = "rri/do:660FC5rMU|xri/wo:660F3A15nRmU", -- Mem op: SSE4.1 only. 1330 pextrw_3 = "rri/do:660FC5rMU|xri/wo:660F3A15nRmU", -- Mem op: SSE4.1 only.
1252 pinsrw_3 = "rri/od:660FC4rMU|rxi/ow:", 1331 pinsrw_3 = "rri/od:660FC4rMU|rxi/ow:",
1253 pmaddwd_2 = "rmo:660FF5rM",
1254 pmaxsw_2 = "rmo:660FEErM",
1255 pmaxub_2 = "rmo:660FDErM",
1256 pminsw_2 = "rmo:660FEArM",
1257 pminub_2 = "rmo:660FDArM",
1258 pmovmskb_2 = "rr/do:660FD7rM", 1332 pmovmskb_2 = "rr/do:660FD7rM",
1259 pmulhuw_2 = "rmo:660FE4rM",
1260 pmulhw_2 = "rmo:660FE5rM",
1261 pmullw_2 = "rmo:660FD5rM",
1262 pmuludq_2 = "rmo:660FF4rM",
1263 por_2 = "rmo:660FEBrM",
1264 prefetchnta_1 = "xb:n0F180m", 1333 prefetchnta_1 = "xb:n0F180m",
1265 prefetcht0_1 = "xb:n0F181m", 1334 prefetcht0_1 = "xb:n0F181m",
1266 prefetcht1_1 = "xb:n0F182m", 1335 prefetcht1_1 = "xb:n0F182m",
1267 prefetcht2_1 = "xb:n0F183m", 1336 prefetcht2_1 = "xb:n0F183m",
1268 psadbw_2 = "rmo:660FF6rM",
1269 pshufd_3 = "rmio:660F70rMU", 1337 pshufd_3 = "rmio:660F70rMU",
1270 pshufhw_3 = "rmio:F30F70rMU", 1338 pshufhw_3 = "rmio:F30F70rMU",
1271 pshuflw_3 = "rmio:F20F70rMU", 1339 pshuflw_3 = "rmio:F20F70rMU",
@@ -1279,23 +1347,6 @@ local map_op = {
1279 psrldq_2 = "rio:660F733mU", 1347 psrldq_2 = "rio:660F733mU",
1280 psrlq_2 = "rmo:660FD3rM|rio:660F732mU", 1348 psrlq_2 = "rmo:660FD3rM|rio:660F732mU",
1281 psrlw_2 = "rmo:660FD1rM|rio:660F712mU", 1349 psrlw_2 = "rmo:660FD1rM|rio:660F712mU",
1282 psubb_2 = "rmo:660FF8rM",
1283 psubd_2 = "rmo:660FFArM",
1284 psubq_2 = "rmo:660FFBrM",
1285 psubsb_2 = "rmo:660FE8rM",
1286 psubsw_2 = "rmo:660FE9rM",
1287 psubusb_2 = "rmo:660FD8rM",
1288 psubusw_2 = "rmo:660FD9rM",
1289 psubw_2 = "rmo:660FF9rM",
1290 punpckhbw_2 = "rmo:660F68rM",
1291 punpckhdq_2 = "rmo:660F6ArM",
1292 punpckhqdq_2 = "rmo:660F6DrM",
1293 punpckhwd_2 = "rmo:660F69rM",
1294 punpcklbw_2 = "rmo:660F60rM",
1295 punpckldq_2 = "rmo:660F62rM",
1296 punpcklqdq_2 = "rmo:660F6CrM",
1297 punpcklwd_2 = "rmo:660F61rM",
1298 pxor_2 = "rmo:660FEFrM",
1299 rcpps_2 = "rmo:0F53rM", 1350 rcpps_2 = "rmo:0F53rM",
1300 rcpss_2 = "rro:F30F53rM|rx/od:", 1351 rcpss_2 = "rro:F30F53rM|rx/od:",
1301 rsqrtps_2 = "rmo:0F52rM", 1352 rsqrtps_2 = "rmo:0F52rM",
@@ -1413,6 +1464,327 @@ local map_op = {
1413 movntsd_2 = "xr/qo:nF20F2BRm", 1464 movntsd_2 = "xr/qo:nF20F2BRm",
1414 movntss_2 = "xr/do:F30F2BRm", 1465 movntss_2 = "xr/do:F30F2BRm",
1415 -- popcnt is also in SSE4.2 1466 -- popcnt is also in SSE4.2
1467
1468 -- AES-NI
1469 aesdec_2 = "rmo:660F38DErM",
1470 aesdeclast_2 = "rmo:660F38DFrM",
1471 aesenc_2 = "rmo:660F38DCrM",
1472 aesenclast_2 = "rmo:660F38DDrM",
1473 aesimc_2 = "rmo:660F38DBrM",
1474 aeskeygenassist_3 = "rmio:660F3ADFrMU",
1475 pclmulqdq_3 = "rmio:660F3A44rMU",
1476
1477 -- AVX FP ops
1478 vaddsubpd_3 = "rrmoy:660FVD0rM",
1479 vaddsubps_3 = "rrmoy:F20FVD0rM",
1480 vandpd_3 = "rrmoy:660FV54rM",
1481 vandps_3 = "rrmoy:0FV54rM",
1482 vandnpd_3 = "rrmoy:660FV55rM",
1483 vandnps_3 = "rrmoy:0FV55rM",
1484 vblendpd_4 = "rrmioy:660F3AV0DrMU",
1485 vblendps_4 = "rrmioy:660F3AV0CrMU",
1486 vblendvpd_4 = "rrmroy:660F3AV4BrMs",
1487 vblendvps_4 = "rrmroy:660F3AV4ArMs",
1488 vbroadcastf128_2 = "rx/yo:660F38u1ArM",
1489 vcmppd_4 = "rrmioy:660FVC2rMU",
1490 vcmpps_4 = "rrmioy:0FVC2rMU",
1491 vcmpsd_4 = "rrrio:F20FVC2rMU|rrxi/ooq:",
1492 vcmpss_4 = "rrrio:F30FVC2rMU|rrxi/ood:",
1493 vcomisd_2 = "rro:660Fu2FrM|rx/oq:",
1494 vcomiss_2 = "rro:0Fu2FrM|rx/od:",
1495 vcvtdq2pd_2 = "rro:F30FuE6rM|rx/oq:|rm/yo:",
1496 vcvtdq2ps_2 = "rmoy:0Fu5BrM",
1497 vcvtpd2dq_2 = "rmoy:F20FuE6rM",
1498 vcvtpd2ps_2 = "rmoy:660Fu5ArM",
1499 vcvtps2dq_2 = "rmoy:660Fu5BrM",
1500 vcvtps2pd_2 = "rro:0Fu5ArM|rx/oq:|rm/yo:",
1501 vcvtsd2si_2 = "rr/do:F20Fu2DrM|rx/dq:|rr/qo:|rxq:",
1502 vcvtsd2ss_3 = "rrro:F20FV5ArM|rrx/ooq:",
1503 vcvtsi2sd_3 = "rrm/ood:F20FV2ArM|rrm/ooq:F20FVX2ArM",
1504 vcvtsi2ss_3 = "rrm/ood:F30FV2ArM|rrm/ooq:F30FVX2ArM",
1505 vcvtss2sd_3 = "rrro:F30FV5ArM|rrx/ood:",
1506 vcvtss2si_2 = "rr/do:F30Fu2DrM|rxd:|rr/qo:|rx/qd:",
1507 vcvttpd2dq_2 = "rmo:660FuE6rM|rm/oy:660FuLE6rM",
1508 vcvttps2dq_2 = "rmoy:F30Fu5BrM",
1509 vcvttsd2si_2 = "rr/do:F20Fu2CrM|rx/dq:|rr/qo:|rxq:",
1510 vcvttss2si_2 = "rr/do:F30Fu2CrM|rxd:|rr/qo:|rx/qd:",
1511 vdppd_4 = "rrmio:660F3AV41rMU",
1512 vdpps_4 = "rrmioy:660F3AV40rMU",
1513 vextractf128_3 = "mri/oy:660F3AuL19RmU",
1514 vextractps_3 = "mri/do:660F3Au17RmU",
1515 vhaddpd_3 = "rrmoy:660FV7CrM",
1516 vhaddps_3 = "rrmoy:F20FV7CrM",
1517 vhsubpd_3 = "rrmoy:660FV7DrM",
1518 vhsubps_3 = "rrmoy:F20FV7DrM",
1519 vinsertf128_4 = "rrmi/yyo:660F3AV18rMU",
1520 vinsertps_4 = "rrrio:660F3AV21rMU|rrxi/ood:",
1521 vldmxcsr_1 = "xd:0FuAE2m",
1522 vmaskmovps_3 = "rrxoy:660F38V2CrM|xrroy:660F38V2ERm",
1523 vmaskmovpd_3 = "rrxoy:660F38V2DrM|xrroy:660F38V2FRm",
1524 vmovapd_2 = "rmoy:660Fu28rM|mroy:660Fu29Rm",
1525 vmovaps_2 = "rmoy:0Fu28rM|mroy:0Fu29Rm",
1526 vmovd_2 = "rm/od:660Fu6ErM|rm/oq:660FuX6ErM|mr/do:660Fu7ERm|mr/qo:",
1527 vmovq_2 = "rro:F30Fu7ErM|rx/oq:|xr/qo:660FuD6Rm",
1528 vmovddup_2 = "rmy:F20Fu12rM|rro:|rx/oq:",
1529 vmovhlps_3 = "rrro:0FV12rM",
1530 vmovhpd_2 = "xr/qo:660Fu17Rm",
1531 vmovhpd_3 = "rrx/ooq:660FV16rM",
1532 vmovhps_2 = "xr/qo:0Fu17Rm",
1533 vmovhps_3 = "rrx/ooq:0FV16rM",
1534 vmovlhps_3 = "rrro:0FV16rM",
1535 vmovlpd_2 = "xr/qo:660Fu13Rm",
1536 vmovlpd_3 = "rrx/ooq:660FV12rM",
1537 vmovlps_2 = "xr/qo:0Fu13Rm",
1538 vmovlps_3 = "rrx/ooq:0FV12rM",
1539 vmovmskpd_2 = "rr/do:660Fu50rM|rr/dy:660FuL50rM",
1540 vmovmskps_2 = "rr/do:0Fu50rM|rr/dy:0FuL50rM",
1541 vmovntpd_2 = "xroy:660Fu2BRm",
1542 vmovntps_2 = "xroy:0Fu2BRm",
1543 vmovsd_2 = "rx/oq:F20Fu10rM|xr/qo:F20Fu11Rm",
1544 vmovsd_3 = "rrro:F20FV10rM",
1545 vmovshdup_2 = "rmoy:F30Fu16rM",
1546 vmovsldup_2 = "rmoy:F30Fu12rM",
1547 vmovss_2 = "rx/od:F30Fu10rM|xr/do:F30Fu11Rm",
1548 vmovss_3 = "rrro:F30FV10rM",
1549 vmovupd_2 = "rmoy:660Fu10rM|mroy:660Fu11Rm",
1550 vmovups_2 = "rmoy:0Fu10rM|mroy:0Fu11Rm",
1551 vorpd_3 = "rrmoy:660FV56rM",
1552 vorps_3 = "rrmoy:0FV56rM",
1553 vpermilpd_3 = "rrmoy:660F38V0DrM|rmioy:660F3Au05rMU",
1554 vpermilps_3 = "rrmoy:660F38V0CrM|rmioy:660F3Au04rMU",
1555 vperm2f128_4 = "rrmiy:660F3AV06rMU",
1556 vptestpd_2 = "rmoy:660F38u0FrM",
1557 vptestps_2 = "rmoy:660F38u0ErM",
1558 vrcpps_2 = "rmoy:0Fu53rM",
1559 vrcpss_3 = "rrro:F30FV53rM|rrx/ood:",
1560 vrsqrtps_2 = "rmoy:0Fu52rM",
1561 vrsqrtss_3 = "rrro:F30FV52rM|rrx/ood:",
1562 vroundpd_3 = "rmioy:660F3Au09rMU",
1563 vroundps_3 = "rmioy:660F3Au08rMU",
1564 vroundsd_4 = "rrrio:660F3AV0BrMU|rrxi/ooq:",
1565 vroundss_4 = "rrrio:660F3AV0ArMU|rrxi/ood:",
1566 vshufpd_4 = "rrmioy:660FVC6rMU",
1567 vshufps_4 = "rrmioy:0FVC6rMU",
1568 vsqrtps_2 = "rmoy:0Fu51rM",
1569 vsqrtss_2 = "rro:F30Fu51rM|rx/od:",
1570 vsqrtpd_2 = "rmoy:660Fu51rM",
1571 vsqrtsd_2 = "rro:F20Fu51rM|rx/oq:",
1572 vstmxcsr_1 = "xd:0FuAE3m",
1573 vucomisd_2 = "rro:660Fu2ErM|rx/oq:",
1574 vucomiss_2 = "rro:0Fu2ErM|rx/od:",
1575 vunpckhpd_3 = "rrmoy:660FV15rM",
1576 vunpckhps_3 = "rrmoy:0FV15rM",
1577 vunpcklpd_3 = "rrmoy:660FV14rM",
1578 vunpcklps_3 = "rrmoy:0FV14rM",
1579 vxorpd_3 = "rrmoy:660FV57rM",
1580 vxorps_3 = "rrmoy:0FV57rM",
1581 vzeroall_0 = "0FuL77",
1582 vzeroupper_0 = "0Fu77",
1583
1584 -- AVX2 FP ops
1585 vbroadcastss_2 = "rx/od:660F38u18rM|rx/yd:|rro:|rr/yo:",
1586 vbroadcastsd_2 = "rx/yq:660F38u19rM|rr/yo:",
1587 -- *vgather* (!vsib)
1588 vpermpd_3 = "rmiy:660F3AuX01rMU",
1589 vpermps_3 = "rrmy:660F38V16rM",
1590
1591 -- AVX, AVX2 integer ops
1592 -- In general, xmm requires AVX, ymm requires AVX2.
1593 vaesdec_3 = "rrmo:660F38VDErM",
1594 vaesdeclast_3 = "rrmo:660F38VDFrM",
1595 vaesenc_3 = "rrmo:660F38VDCrM",
1596 vaesenclast_3 = "rrmo:660F38VDDrM",
1597 vaesimc_2 = "rmo:660F38uDBrM",
1598 vaeskeygenassist_3 = "rmio:660F3AuDFrMU",
1599 vlddqu_2 = "rxoy:F20FuF0rM",
1600 vmaskmovdqu_2 = "rro:660FuF7rM",
1601 vmovdqa_2 = "rmoy:660Fu6FrM|mroy:660Fu7FRm",
1602 vmovdqu_2 = "rmoy:F30Fu6FrM|mroy:F30Fu7FRm",
1603 vmovntdq_2 = "xroy:660FuE7Rm",
1604 vmovntdqa_2 = "rxoy:660F38u2ArM",
1605 vmpsadbw_4 = "rrmioy:660F3AV42rMU",
1606 vpabsb_2 = "rmoy:660F38u1CrM",
1607 vpabsd_2 = "rmoy:660F38u1ErM",
1608 vpabsw_2 = "rmoy:660F38u1DrM",
1609 vpackusdw_3 = "rrmoy:660F38V2BrM",
1610 vpalignr_4 = "rrmioy:660F3AV0FrMU",
1611 vpblendvb_4 = "rrmroy:660F3AV4CrMs",
1612 vpblendw_4 = "rrmioy:660F3AV0ErMU",
1613 vpclmulqdq_4 = "rrmio:660F3AV44rMU",
1614 vpcmpeqq_3 = "rrmoy:660F38V29rM",
1615 vpcmpestri_3 = "rmio:660F3Au61rMU",
1616 vpcmpestrm_3 = "rmio:660F3Au60rMU",
1617 vpcmpgtq_3 = "rrmoy:660F38V37rM",
1618 vpcmpistri_3 = "rmio:660F3Au63rMU",
1619 vpcmpistrm_3 = "rmio:660F3Au62rMU",
1620 vpextrb_3 = "rri/do:660F3Au14nRmU|rri/qo:|xri/bo:",
1621 vpextrw_3 = "rri/do:660FuC5rMU|xri/wo:660F3Au15nRmU",
1622 vpextrd_3 = "mri/do:660F3Au16RmU",
1623 vpextrq_3 = "mri/qo:660F3Au16RmU",
1624 vphaddw_3 = "rrmoy:660F38V01rM",
1625 vphaddd_3 = "rrmoy:660F38V02rM",
1626 vphaddsw_3 = "rrmoy:660F38V03rM",
1627 vphminposuw_2 = "rmo:660F38u41rM",
1628 vphsubw_3 = "rrmoy:660F38V05rM",
1629 vphsubd_3 = "rrmoy:660F38V06rM",
1630 vphsubsw_3 = "rrmoy:660F38V07rM",
1631 vpinsrb_4 = "rrri/ood:660F3AV20rMU|rrxi/oob:",
1632 vpinsrw_4 = "rrri/ood:660FVC4rMU|rrxi/oow:",
1633 vpinsrd_4 = "rrmi/ood:660F3AV22rMU",
1634 vpinsrq_4 = "rrmi/ooq:660F3AVX22rMU",
1635 vpmaddubsw_3 = "rrmoy:660F38V04rM",
1636 vpmaxsb_3 = "rrmoy:660F38V3CrM",
1637 vpmaxsd_3 = "rrmoy:660F38V3DrM",
1638 vpmaxuw_3 = "rrmoy:660F38V3ErM",
1639 vpmaxud_3 = "rrmoy:660F38V3FrM",
1640 vpminsb_3 = "rrmoy:660F38V38rM",
1641 vpminsd_3 = "rrmoy:660F38V39rM",
1642 vpminuw_3 = "rrmoy:660F38V3ArM",
1643 vpminud_3 = "rrmoy:660F38V3BrM",
1644 vpmovmskb_2 = "rr/do:660FuD7rM|rr/dy:660FuLD7rM",
1645 vpmovsxbw_2 = "rroy:660F38u20rM|rx/oq:|rx/yo:",
1646 vpmovsxbd_2 = "rroy:660F38u21rM|rx/od:|rx/yq:",
1647 vpmovsxbq_2 = "rroy:660F38u22rM|rx/ow:|rx/yd:",
1648 vpmovsxwd_2 = "rroy:660F38u23rM|rx/oq:|rx/yo:",
1649 vpmovsxwq_2 = "rroy:660F38u24rM|rx/od:|rx/yq:",
1650 vpmovsxdq_2 = "rroy:660F38u25rM|rx/oq:|rx/yo:",
1651 vpmovzxbw_2 = "rroy:660F38u30rM|rx/oq:|rx/yo:",
1652 vpmovzxbd_2 = "rroy:660F38u31rM|rx/od:|rx/yq:",
1653 vpmovzxbq_2 = "rroy:660F38u32rM|rx/ow:|rx/yd:",
1654 vpmovzxwd_2 = "rroy:660F38u33rM|rx/oq:|rx/yo:",
1655 vpmovzxwq_2 = "rroy:660F38u34rM|rx/od:|rx/yq:",
1656 vpmovzxdq_2 = "rroy:660F38u35rM|rx/oq:|rx/yo:",
1657 vpmuldq_3 = "rrmoy:660F38V28rM",
1658 vpmulhrsw_3 = "rrmoy:660F38V0BrM",
1659 vpmulld_3 = "rrmoy:660F38V40rM",
1660 vpshufb_3 = "rrmoy:660F38V00rM",
1661 vpshufd_3 = "rmioy:660Fu70rMU",
1662 vpshufhw_3 = "rmioy:F30Fu70rMU",
1663 vpshuflw_3 = "rmioy:F20Fu70rMU",
1664 vpsignb_3 = "rrmoy:660F38V08rM",
1665 vpsignw_3 = "rrmoy:660F38V09rM",
1666 vpsignd_3 = "rrmoy:660F38V0ArM",
1667 vpslldq_3 = "rrioy:660Fv737mU",
1668 vpsllw_3 = "rrmoy:660FVF1rM|rrioy:660Fv716mU",
1669 vpslld_3 = "rrmoy:660FVF2rM|rrioy:660Fv726mU",
1670 vpsllq_3 = "rrmoy:660FVF3rM|rrioy:660Fv736mU",
1671 vpsraw_3 = "rrmoy:660FVE1rM|rrioy:660Fv714mU",
1672 vpsrad_3 = "rrmoy:660FVE2rM|rrioy:660Fv724mU",
1673 vpsrldq_3 = "rrioy:660Fv733mU",
1674 vpsrlw_3 = "rrmoy:660FVD1rM|rrioy:660Fv712mU",
1675 vpsrld_3 = "rrmoy:660FVD2rM|rrioy:660Fv722mU",
1676 vpsrlq_3 = "rrmoy:660FVD3rM|rrioy:660Fv732mU",
1677 vptest_2 = "rmoy:660F38u17rM",
1678
1679 -- AVX2 integer ops
1680 vbroadcasti128_2 = "rx/yo:660F38u5ArM",
1681 vinserti128_4 = "rrmi/yyo:660F3AV38rMU",
1682 vextracti128_3 = "mri/oy:660F3AuL39RmU",
1683 vpblendd_4 = "rrmioy:660F3AV02rMU",
1684 vpbroadcastb_2 = "rro:660F38u78rM|rx/ob:|rr/yo:|rx/yb:",
1685 vpbroadcastw_2 = "rro:660F38u79rM|rx/ow:|rr/yo:|rx/yw:",
1686 vpbroadcastd_2 = "rro:660F38u58rM|rx/od:|rr/yo:|rx/yd:",
1687 vpbroadcastq_2 = "rro:660F38u59rM|rx/oq:|rr/yo:|rx/yq:",
1688 vpermd_3 = "rrmy:660F38V36rM",
1689 vpermq_3 = "rmiy:660F3AuX00rMU",
1690 -- *vpgather* (!vsib)
1691 vperm2i128_4 = "rrmiy:660F3AV46rMU",
1692 vpmaskmovd_3 = "rrxoy:660F38V8CrM|xrroy:660F38V8ERm",
1693 vpmaskmovq_3 = "rrxoy:660F38VX8CrM|xrroy:660F38VX8ERm",
1694 vpsllvd_3 = "rrmoy:660F38V47rM",
1695 vpsllvq_3 = "rrmoy:660F38VX47rM",
1696 vpsravd_3 = "rrmoy:660F38V46rM",
1697 vpsrlvd_3 = "rrmoy:660F38V45rM",
1698 vpsrlvq_3 = "rrmoy:660F38VX45rM",
1699
1700 -- Intel ADX
1701 adcx_2 = "rmqd:660F38F6rM",
1702 adox_2 = "rmqd:F30F38F6rM",
1703
1704 -- BMI1
1705 andn_3 = "rrmqd:0F38VF2rM",
1706 bextr_3 = "rmrqd:0F38wF7rM",
1707 blsi_2 = "rmqd:0F38vF33m",
1708 blsmsk_2 = "rmqd:0F38vF32m",
1709 blsr_2 = "rmqd:0F38vF31m",
1710 tzcnt_2 = "rmqdw:F30FBCrM",
1711
1712 -- BMI2
1713 bzhi_3 = "rmrqd:0F38wF5rM",
1714 mulx_3 = "rrmqd:F20F38VF6rM",
1715 pdep_3 = "rrmqd:F20F38VF5rM",
1716 pext_3 = "rrmqd:F30F38VF5rM",
1717 rorx_3 = "rmSqd:F20F3AuF0rMS",
1718 sarx_3 = "rmrqd:F30F38wF7rM",
1719 shrx_3 = "rmrqd:F20F38wF7rM",
1720 shlx_3 = "rmrqd:660F38wF7rM",
1721
1722 -- FMA3
1723 vfmaddsub132pd_3 = "rrmoy:660F38VX96rM",
1724 vfmaddsub132ps_3 = "rrmoy:660F38V96rM",
1725 vfmaddsub213pd_3 = "rrmoy:660F38VXA6rM",
1726 vfmaddsub213ps_3 = "rrmoy:660F38VA6rM",
1727 vfmaddsub231pd_3 = "rrmoy:660F38VXB6rM",
1728 vfmaddsub231ps_3 = "rrmoy:660F38VB6rM",
1729
1730 vfmsubadd132pd_3 = "rrmoy:660F38VX97rM",
1731 vfmsubadd132ps_3 = "rrmoy:660F38V97rM",
1732 vfmsubadd213pd_3 = "rrmoy:660F38VXA7rM",
1733 vfmsubadd213ps_3 = "rrmoy:660F38VA7rM",
1734 vfmsubadd231pd_3 = "rrmoy:660F38VXB7rM",
1735 vfmsubadd231ps_3 = "rrmoy:660F38VB7rM",
1736
1737 vfmadd132pd_3 = "rrmoy:660F38VX98rM",
1738 vfmadd132ps_3 = "rrmoy:660F38V98rM",
1739 vfmadd132sd_3 = "rrro:660F38VX99rM|rrx/ooq:",
1740 vfmadd132ss_3 = "rrro:660F38V99rM|rrx/ood:",
1741 vfmadd213pd_3 = "rrmoy:660F38VXA8rM",
1742 vfmadd213ps_3 = "rrmoy:660F38VA8rM",
1743 vfmadd213sd_3 = "rrro:660F38VXA9rM|rrx/ooq:",
1744 vfmadd213ss_3 = "rrro:660F38VA9rM|rrx/ood:",
1745 vfmadd231pd_3 = "rrmoy:660F38VXB8rM",
1746 vfmadd231ps_3 = "rrmoy:660F38VB8rM",
1747 vfmadd231sd_3 = "rrro:660F38VXB9rM|rrx/ooq:",
1748 vfmadd231ss_3 = "rrro:660F38VB9rM|rrx/ood:",
1749
1750 vfmsub132pd_3 = "rrmoy:660F38VX9ArM",
1751 vfmsub132ps_3 = "rrmoy:660F38V9ArM",
1752 vfmsub132sd_3 = "rrro:660F38VX9BrM|rrx/ooq:",
1753 vfmsub132ss_3 = "rrro:660F38V9BrM|rrx/ood:",
1754 vfmsub213pd_3 = "rrmoy:660F38VXAArM",
1755 vfmsub213ps_3 = "rrmoy:660F38VAArM",
1756 vfmsub213sd_3 = "rrro:660F38VXABrM|rrx/ooq:",
1757 vfmsub213ss_3 = "rrro:660F38VABrM|rrx/ood:",
1758 vfmsub231pd_3 = "rrmoy:660F38VXBArM",
1759 vfmsub231ps_3 = "rrmoy:660F38VBArM",
1760 vfmsub231sd_3 = "rrro:660F38VXBBrM|rrx/ooq:",
1761 vfmsub231ss_3 = "rrro:660F38VBBrM|rrx/ood:",
1762
1763 vfnmadd132pd_3 = "rrmoy:660F38VX9CrM",
1764 vfnmadd132ps_3 = "rrmoy:660F38V9CrM",
1765 vfnmadd132sd_3 = "rrro:660F38VX9DrM|rrx/ooq:",
1766 vfnmadd132ss_3 = "rrro:660F38V9DrM|rrx/ood:",
1767 vfnmadd213pd_3 = "rrmoy:660F38VXACrM",
1768 vfnmadd213ps_3 = "rrmoy:660F38VACrM",
1769 vfnmadd213sd_3 = "rrro:660F38VXADrM|rrx/ooq:",
1770 vfnmadd213ss_3 = "rrro:660F38VADrM|rrx/ood:",
1771 vfnmadd231pd_3 = "rrmoy:660F38VXBCrM",
1772 vfnmadd231ps_3 = "rrmoy:660F38VBCrM",
1773 vfnmadd231sd_3 = "rrro:660F38VXBDrM|rrx/ooq:",
1774 vfnmadd231ss_3 = "rrro:660F38VBDrM|rrx/ood:",
1775
1776 vfnmsub132pd_3 = "rrmoy:660F38VX9ErM",
1777 vfnmsub132ps_3 = "rrmoy:660F38V9ErM",
1778 vfnmsub132sd_3 = "rrro:660F38VX9FrM|rrx/ooq:",
1779 vfnmsub132ss_3 = "rrro:660F38V9FrM|rrx/ood:",
1780 vfnmsub213pd_3 = "rrmoy:660F38VXAErM",
1781 vfnmsub213ps_3 = "rrmoy:660F38VAErM",
1782 vfnmsub213sd_3 = "rrro:660F38VXAFrM|rrx/ooq:",
1783 vfnmsub213ss_3 = "rrro:660F38VAFrM|rrx/ood:",
1784 vfnmsub231pd_3 = "rrmoy:660F38VXBErM",
1785 vfnmsub231ps_3 = "rrmoy:660F38VBErM",
1786 vfnmsub231sd_3 = "rrro:660F38VXBFrM|rrx/ooq:",
1787 vfnmsub231ss_3 = "rrro:660F38VBFrM|rrx/ood:",
1416} 1788}
1417 1789
1418------------------------------------------------------------------------------ 1790------------------------------------------------------------------------------
@@ -1463,28 +1835,58 @@ for cc,n in pairs{ b=0, e=1, be=2, u=3, nb=4, ne=5, nbe=6, nu=7 } do
1463 map_op["fcmov"..cc.."_2"] = format("Fff:%04XR", nc) -- P6+ 1835 map_op["fcmov"..cc.."_2"] = format("Fff:%04XR", nc) -- P6+
1464end 1836end
1465 1837
1466-- SSE FP arithmetic ops. 1838-- SSE / AVX FP arithmetic ops.
1467for name,n in pairs{ sqrt = 1, add = 8, mul = 9, 1839for name,n in pairs{ sqrt = 1, add = 8, mul = 9,
1468 sub = 12, min = 13, div = 14, max = 15 } do 1840 sub = 12, min = 13, div = 14, max = 15 } do
1469 map_op[name.."ps_2"] = format("rmo:0F5%XrM", n) 1841 map_op[name.."ps_2"] = format("rmo:0F5%XrM", n)
1470 map_op[name.."ss_2"] = format("rro:F30F5%XrM|rx/od:", n) 1842 map_op[name.."ss_2"] = format("rro:F30F5%XrM|rx/od:", n)
1471 map_op[name.."pd_2"] = format("rmo:660F5%XrM", n) 1843 map_op[name.."pd_2"] = format("rmo:660F5%XrM", n)
1472 map_op[name.."sd_2"] = format("rro:F20F5%XrM|rx/oq:", n) 1844 map_op[name.."sd_2"] = format("rro:F20F5%XrM|rx/oq:", n)
1845 if n ~= 1 then
1846 map_op["v"..name.."ps_3"] = format("rrmoy:0FV5%XrM", n)
1847 map_op["v"..name.."ss_3"] = format("rrro:F30FV5%XrM|rrx/ood:", n)
1848 map_op["v"..name.."pd_3"] = format("rrmoy:660FV5%XrM", n)
1849 map_op["v"..name.."sd_3"] = format("rrro:F20FV5%XrM|rrx/ooq:", n)
1850 end
1851end
1852
1853-- SSE2 / AVX / AVX2 integer arithmetic ops (66 0F leaf).
1854for name,n in pairs{
1855 paddb = 0xFC, paddw = 0xFD, paddd = 0xFE, paddq = 0xD4,
1856 paddsb = 0xEC, paddsw = 0xED, packssdw = 0x6B,
1857 packsswb = 0x63, packuswb = 0x67, paddusb = 0xDC,
1858 paddusw = 0xDD, pand = 0xDB, pandn = 0xDF, pavgb = 0xE0,
1859 pavgw = 0xE3, pcmpeqb = 0x74, pcmpeqd = 0x76,
1860 pcmpeqw = 0x75, pcmpgtb = 0x64, pcmpgtd = 0x66,
1861 pcmpgtw = 0x65, pmaddwd = 0xF5, pmaxsw = 0xEE,
1862 pmaxub = 0xDE, pminsw = 0xEA, pminub = 0xDA,
1863 pmulhuw = 0xE4, pmulhw = 0xE5, pmullw = 0xD5,
1864 pmuludq = 0xF4, por = 0xEB, psadbw = 0xF6, psubb = 0xF8,
1865 psubw = 0xF9, psubd = 0xFA, psubq = 0xFB, psubsb = 0xE8,
1866 psubsw = 0xE9, psubusb = 0xD8, psubusw = 0xD9,
1867 punpckhbw = 0x68, punpckhwd = 0x69, punpckhdq = 0x6A,
1868 punpckhqdq = 0x6D, punpcklbw = 0x60, punpcklwd = 0x61,
1869 punpckldq = 0x62, punpcklqdq = 0x6C, pxor = 0xEF
1870} do
1871 map_op[name.."_2"] = format("rmo:660F%02XrM", n)
1872 map_op["v"..name.."_3"] = format("rrmoy:660FV%02XrM", n)
1473end 1873end
1474 1874
1475------------------------------------------------------------------------------ 1875------------------------------------------------------------------------------
1476 1876
1877local map_vexarg = { u = false, v = 1, V = 2, w = 3 }
1878
1477-- Process pattern string. 1879-- Process pattern string.
1478local function dopattern(pat, args, sz, op, needrex) 1880local function dopattern(pat, args, sz, op, needrex)
1479 local digit, addin 1881 local digit, addin, vex
1480 local opcode = 0 1882 local opcode = 0
1481 local szov = sz 1883 local szov = sz
1482 local narg = 1 1884 local narg = 1
1483 local rex = 0 1885 local rex = 0
1484 1886
1485 -- Limit number of section buffer positions used by a single dasm_put(). 1887 -- Limit number of section buffer positions used by a single dasm_put().
1486 -- A single opcode needs a maximum of 5 positions. 1888 -- A single opcode needs a maximum of 6 positions.
1487 if secpos+5 > maxsecpos then wflush() end 1889 if secpos+6 > maxsecpos then wflush() end
1488 1890
1489 -- Process each character. 1891 -- Process each character.
1490 for c in gmatch(pat.."|", ".") do 1892 for c in gmatch(pat.."|", ".") do
@@ -1498,6 +1900,8 @@ local function dopattern(pat, args, sz, op, needrex)
1498 szov = nil 1900 szov = nil
1499 elseif c == "X" then -- Force REX.W. 1901 elseif c == "X" then -- Force REX.W.
1500 rex = 8 1902 rex = 8
1903 elseif c == "L" then -- Force VEX.L.
1904 vex.l = true
1501 elseif c == "r" then -- Merge 1st operand regno. into opcode. 1905 elseif c == "r" then -- Merge 1st operand regno. into opcode.
1502 addin = args[1]; opcode = opcode + (addin.reg % 8) 1906 addin = args[1]; opcode = opcode + (addin.reg % 8)
1503 if narg < 2 then narg = 2 end 1907 if narg < 2 then narg = 2 end
@@ -1521,21 +1925,42 @@ local function dopattern(pat, args, sz, op, needrex)
1521 if t.xreg and t.xreg > 7 then rex = rex + 2 end 1925 if t.xreg and t.xreg > 7 then rex = rex + 2 end
1522 if s > 7 then rex = rex + 4 end 1926 if s > 7 then rex = rex + 4 end
1523 if needrex then rex = rex + 16 end 1927 if needrex then rex = rex + 16 end
1524 wputop(szov, opcode, rex); opcode = nil 1928 local psz, sk = wputop(szov, opcode, rex, vex, s < 0, t.vreg or t.vxreg)
1929 opcode = nil
1525 local imark = sub(pat, -1) -- Force a mark (ugly). 1930 local imark = sub(pat, -1) -- Force a mark (ugly).
1526 -- Put ModRM/SIB with regno/last digit as spare. 1931 -- Put ModRM/SIB with regno/last digit as spare.
1527 wputmrmsib(t, imark, s, addin and addin.vreg) 1932 wputmrmsib(t, imark, s, addin and addin.vreg, psz, sk)
1528 addin = nil 1933 addin = nil
1934 elseif map_vexarg[c] ~= nil then -- Encode using VEX prefix
1935 local b = band(opcode, 255); opcode = shr(opcode, 8)
1936 local m = 1
1937 if b == 0x38 then m = 2
1938 elseif b == 0x3a then m = 3 end
1939 if m ~= 1 then b = band(opcode, 255); opcode = shr(opcode, 8) end
1940 if b ~= 0x0f then
1941 werror("expected `0F', `0F38', or `0F3A' to precede `"..c..
1942 "' in pattern `"..pat.."' for `"..op.."'")
1943 end
1944 local v = map_vexarg[c]
1945 if v then v = remove(args, v) end
1946 b = band(opcode, 255)
1947 local p = 0
1948 if b == 0x66 then p = 1
1949 elseif b == 0xf3 then p = 2
1950 elseif b == 0xf2 then p = 3 end
1951 if p ~= 0 then opcode = shr(opcode, 8) end
1952 if opcode ~= 0 then wputop(nil, opcode, 0); opcode = 0 end
1953 vex = { m = m, p = p, v = v }
1529 else 1954 else
1530 if opcode then -- Flush opcode. 1955 if opcode then -- Flush opcode.
1531 if szov == "q" and rex == 0 then rex = rex + 8 end 1956 if szov == "q" and rex == 0 then rex = rex + 8 end
1532 if needrex then rex = rex + 16 end 1957 if needrex then rex = rex + 16 end
1533 if addin and addin.reg == -1 then 1958 if addin and addin.reg == -1 then
1534 wputop(szov, opcode - 7, rex) 1959 local psz, sk = wputop(szov, opcode - 7, rex, vex, true)
1535 waction("VREG", addin.vreg); wputxb(0) 1960 wvreg("opcode", addin.vreg, psz, sk)
1536 else 1961 else
1537 if addin and addin.reg > 7 then rex = rex + 1 end 1962 if addin and addin.reg > 7 then rex = rex + 1 end
1538 wputop(szov, opcode, rex) 1963 wputop(szov, opcode, rex, vex)
1539 end 1964 end
1540 opcode = nil 1965 opcode = nil
1541 end 1966 end
@@ -1549,7 +1974,7 @@ local function dopattern(pat, args, sz, op, needrex)
1549 local a = args[narg] 1974 local a = args[narg]
1550 narg = narg + 1 1975 narg = narg + 1
1551 local mode, imm = a.mode, a.imm 1976 local mode, imm = a.mode, a.imm
1552 if mode == "iJ" and not match("iIJ", c) then 1977 if mode == "iJ" and not match(x64 and "J" or "iIJ", c) then
1553 werror("bad operand size for label") 1978 werror("bad operand size for label")
1554 end 1979 end
1555 if c == "S" then 1980 if c == "S" then
@@ -1572,6 +1997,14 @@ local function dopattern(pat, args, sz, op, needrex)
1572 else 1997 else
1573 wputlabel("REL_", imm, 2) 1998 wputlabel("REL_", imm, 2)
1574 end 1999 end
2000 elseif c == "s" then
2001 local reg = a.reg
2002 if reg < 0 then
2003 wputb(0)
2004 wvreg("imm.hi", a.vreg)
2005 else
2006 wputb(shl(reg, 4))
2007 end
1575 else 2008 else
1576 werror("bad char `"..c.."' in pattern `"..pat.."' for `"..op.."'") 2009 werror("bad char `"..c.."' in pattern `"..pat.."' for `"..op.."'")
1577 end 2010 end
@@ -1648,11 +2081,14 @@ map_op[".template__"] = function(params, template, nparams)
1648 if pat == "" then pat = lastpat else lastpat = pat end 2081 if pat == "" then pat = lastpat else lastpat = pat end
1649 if matchtm(tm, args) then 2082 if matchtm(tm, args) then
1650 local prefix = sub(szm, 1, 1) 2083 local prefix = sub(szm, 1, 1)
1651 if prefix == "/" then -- Match both operand sizes. 2084 if prefix == "/" then -- Exactly match leading operand sizes.
1652 if args[1].opsize == sub(szm, 2, 2) and 2085 for i = #szm,1,-1 do
1653 args[2].opsize == sub(szm, 3, 3) then 2086 if i == 1 then
1654 dopattern(pat, args, sz, params.op, needrex) -- Process pattern. 2087 dopattern(pat, args, sz, params.op, needrex) -- Process pattern.
1655 return 2088 return
2089 elseif args[i-1].opsize ~= sub(szm, i, i) then
2090 break
2091 end
1656 end 2092 end
1657 else -- Match common operand size. 2093 else -- Match common operand size.
1658 local szp = sz 2094 local szp = sz
@@ -1717,8 +2153,8 @@ if x64 then
1717 rex = a.reg > 7 and 9 or 8 2153 rex = a.reg > 7 and 9 or 8
1718 end 2154 end
1719 end 2155 end
1720 wputop(sz, opcode, rex) 2156 local psz, sk = wputop(sz, opcode, rex, nil, vreg)
1721 if vreg then waction("VREG", vreg); wputxb(0) end 2157 wvreg("opcode", vreg, psz, sk)
1722 waction("IMM_D", format("(unsigned int)(%s)", op64)) 2158 waction("IMM_D", format("(unsigned int)(%s)", op64))
1723 waction("IMM_D", format("(unsigned int)((%s)>>32)", op64)) 2159 waction("IMM_D", format("(unsigned int)((%s)>>32)", op64))
1724 end 2160 end
@@ -1730,14 +2166,16 @@ end
1730local function op_data(params) 2166local function op_data(params)
1731 if not params then return "imm..." end 2167 if not params then return "imm..." end
1732 local sz = sub(params.op, 2, 2) 2168 local sz = sub(params.op, 2, 2)
1733 if sz == "a" then sz = addrsize end 2169 if sz == "l" then sz = "d" elseif sz == "a" then sz = addrsize end
1734 for _,p in ipairs(params) do 2170 for _,p in ipairs(params) do
1735 local a = parseoperand(p) 2171 local a = parseoperand(p, sz == "q")
1736 if sub(a.mode, 1, 1) ~= "i" or (a.opsize and a.opsize ~= sz) then 2172 if sub(a.mode, 1, 1) ~= "i" or (a.opsize and a.opsize ~= sz) then
1737 werror("bad mode or size in `"..p.."'") 2173 werror("bad mode or size in `"..p.."'")
1738 end 2174 end
1739 if a.mode == "iJ" then 2175 if a.mode == "iJ" then
1740 wputlabel("IMM_", a.imm, 1) 2176 wputlabel("IMM_", a.imm, 1)
2177 elseif sz == "q" then
2178 wputqarg(a.imm)
1741 else 2179 else
1742 wputszarg(sz, a.imm) 2180 wputszarg(sz, a.imm)
1743 end 2181 end
@@ -1749,7 +2187,11 @@ map_op[".byte_*"] = op_data
1749map_op[".sbyte_*"] = op_data 2187map_op[".sbyte_*"] = op_data
1750map_op[".word_*"] = op_data 2188map_op[".word_*"] = op_data
1751map_op[".dword_*"] = op_data 2189map_op[".dword_*"] = op_data
2190map_op[".qword_*"] = op_data
1752map_op[".aword_*"] = op_data 2191map_op[".aword_*"] = op_data
2192map_op[".long_*"] = op_data
2193map_op[".quad_*"] = op_data
2194map_op[".addr_*"] = op_data
1753 2195
1754------------------------------------------------------------------------------ 2196------------------------------------------------------------------------------
1755 2197