diff options
Diffstat (limited to 'dynasm/dasm_x86.lua')
-rw-r--r-- | dynasm/dasm_x86.lua | 606 |
1 files changed, 510 insertions, 96 deletions
diff --git a/dynasm/dasm_x86.lua b/dynasm/dasm_x86.lua index 13aa68ff..c5c8c17b 100644 --- a/dynasm/dasm_x86.lua +++ b/dynasm/dasm_x86.lua | |||
@@ -11,9 +11,9 @@ local x64 = x64 | |||
11 | local _info = { | 11 | local _info = { |
12 | arch = x64 and "x64" or "x86", | 12 | arch = x64 and "x64" or "x86", |
13 | description = "DynASM x86/x64 module", | 13 | description = "DynASM x86/x64 module", |
14 | version = "1.3.0", | 14 | version = "1.4.0", |
15 | vernum = 10300, | 15 | vernum = 10400, |
16 | release = "2011-05-05", | 16 | release = "2015-10-18", |
17 | author = "Mike Pall", | 17 | author = "Mike Pall", |
18 | license = "MIT", | 18 | license = "MIT", |
19 | } | 19 | } |
@@ -27,9 +27,9 @@ local assert, unpack, setmetatable = assert, unpack or table.unpack, setmetatabl | |||
27 | local _s = string | 27 | local _s = string |
28 | local sub, format, byte, char = _s.sub, _s.format, _s.byte, _s.char | 28 | local sub, format, byte, char = _s.sub, _s.format, _s.byte, _s.char |
29 | local find, match, gmatch, gsub = _s.find, _s.match, _s.gmatch, _s.gsub | 29 | local find, match, gmatch, gsub = _s.find, _s.match, _s.gmatch, _s.gsub |
30 | local concat, sort = table.concat, table.sort | 30 | local concat, sort, remove = table.concat, table.sort, table.remove |
31 | local bit = bit or require("bit") | 31 | local bit = bit or require("bit") |
32 | local band, shl, shr = bit.band, bit.lshift, bit.rshift | 32 | local band, bxor, shl, shr = bit.band, bit.bxor, bit.lshift, bit.rshift |
33 | 33 | ||
34 | -- Inherited tables and callbacks. | 34 | -- Inherited tables and callbacks. |
35 | local g_opt, g_arch | 35 | local g_opt, g_arch |
@@ -41,7 +41,7 @@ local action_names = { | |||
41 | -- int arg, 1 buffer pos: | 41 | -- int arg, 1 buffer pos: |
42 | "DISP", "IMM_S", "IMM_B", "IMM_W", "IMM_D", "IMM_WB", "IMM_DB", | 42 | "DISP", "IMM_S", "IMM_B", "IMM_W", "IMM_D", "IMM_WB", "IMM_DB", |
43 | -- action arg (1 byte), int arg, 1 buffer pos (reg/num): | 43 | -- action arg (1 byte), int arg, 1 buffer pos (reg/num): |
44 | "VREG", "SPACE", -- !x64: VREG support NYI. | 44 | "VREG", "SPACE", |
45 | -- ptrdiff_t arg, 1 buffer pos (address): !x64 | 45 | -- ptrdiff_t arg, 1 buffer pos (address): !x64 |
46 | "SETLABEL", "REL_A", | 46 | "SETLABEL", "REL_A", |
47 | -- action arg (1 byte) or int arg, 2 buffer pos (link, offset): | 47 | -- action arg (1 byte) or int arg, 2 buffer pos (link, offset): |
@@ -83,6 +83,21 @@ local actargs = { 0 } | |||
83 | -- Current number of section buffer positions for dasm_put(). | 83 | -- Current number of section buffer positions for dasm_put(). |
84 | local secpos = 1 | 84 | local secpos = 1 |
85 | 85 | ||
86 | -- VREG kind encodings, pre-shifted by 5 bits. | ||
87 | local map_vreg = { | ||
88 | ["modrm.rm.m"] = 0x00, | ||
89 | ["modrm.rm.r"] = 0x20, | ||
90 | ["opcode"] = 0x20, | ||
91 | ["sib.base"] = 0x20, | ||
92 | ["sib.index"] = 0x40, | ||
93 | ["modrm.reg"] = 0x80, | ||
94 | ["vex.v"] = 0xa0, | ||
95 | ["imm.hi"] = 0xc0, | ||
96 | } | ||
97 | |||
98 | -- Current number of VREG actions contributing to REX/VEX shrinkage. | ||
99 | local vreg_shrink_count = 0 | ||
100 | |||
86 | ------------------------------------------------------------------------------ | 101 | ------------------------------------------------------------------------------ |
87 | 102 | ||
88 | -- Compute action numbers for action names. | 103 | -- Compute action numbers for action names. |
@@ -134,6 +149,21 @@ local function waction(action, a, num) | |||
134 | if a or num then secpos = secpos + (num or 1) end | 149 | if a or num then secpos = secpos + (num or 1) end |
135 | end | 150 | end |
136 | 151 | ||
-- Optionally add a VREG action.
--   kind:  key into map_vreg selecting the encoding slot to patch
--          (e.g. "modrm.reg", "sib.index", "vex.v").
--   vreg:  variable register expression; when nil/false nothing is emitted.
--   psz:   value added to the emitted action argument byte (defaults to 0).
--   sk:    threshold; a kind whose map_vreg value is below it increments
--          vreg_shrink_count (defaults to 0, i.e. never increments).
--   defer: when truthy, the accumulated shrink count is NOT folded into this
--          action's argument byte and is left for a later wvreg() call.
-- Raises an error if `kind` is not a key of map_vreg.
local function wvreg(kind, vreg, psz, sk, defer)
  if not vreg then return end
  waction("VREG", vreg)
  local b = map_vreg[kind]
  if not b then
    -- Report the offending *kind* (the original message printed the vreg
    -- expression instead) and only build the message on the failure path.
    assert(false, "bad vreg kind `"..tostring(kind).."'")
  end
  if b < (sk or 0) then
    vreg_shrink_count = vreg_shrink_count + 1
  end
  if not defer then
    -- Fold the pending shrink count into this action's byte and reset it.
    b = b + vreg_shrink_count * 8
    vreg_shrink_count = 0
  end
  wputxb(b + (psz or 0))
end
166 | |||
137 | -- Add call to embedded DynASM C code. | 167 | -- Add call to embedded DynASM C code. |
138 | local function wcall(func, args) | 168 | local function wcall(func, args) |
139 | wline(format("dasm_%s(Dst, %s);", func, concat(args, ", ")), true) | 169 | wline(format("dasm_%s(Dst, %s);", func, concat(args, ", ")), true) |
@@ -299,7 +329,7 @@ local function mkrmap(sz, cl, names) | |||
299 | local iname = format("@%s%x%s", sz, i, needrex and "R" or "") | 329 | local iname = format("@%s%x%s", sz, i, needrex and "R" or "") |
300 | if needrex then map_reg_needrex[iname] = true end | 330 | if needrex then map_reg_needrex[iname] = true end |
301 | local name | 331 | local name |
302 | if sz == "o" then name = format("xmm%d", i) | 332 | if sz == "o" or sz == "y" then name = format("%s%d", cl, i) |
303 | elseif sz == "f" then name = format("st%d", i) | 333 | elseif sz == "f" then name = format("st%d", i) |
304 | else name = format("r%d%s", i, sz == addrsize and "" or sz) end | 334 | else name = format("r%d%s", i, sz == addrsize and "" or sz) end |
305 | map_archdef[name] = iname | 335 | map_archdef[name] = iname |
@@ -326,6 +356,7 @@ mkrmap("w", "Rw", {"ax", "cx", "dx", "bx", "sp", "bp", "si", "di"}) | |||
326 | mkrmap("b", "Rb", {"al", "cl", "dl", "bl", "ah", "ch", "dh", "bh"}) | 356 | mkrmap("b", "Rb", {"al", "cl", "dl", "bl", "ah", "ch", "dh", "bh"}) |
327 | map_reg_valid_index[map_archdef.esp] = false | 357 | map_reg_valid_index[map_archdef.esp] = false |
328 | if x64 then map_reg_valid_index[map_archdef.rsp] = false end | 358 | if x64 then map_reg_valid_index[map_archdef.rsp] = false end |
359 | if x64 then map_reg_needrex[map_archdef.Rb] = true end | ||
329 | map_archdef["Ra"] = "@"..addrsize | 360 | map_archdef["Ra"] = "@"..addrsize |
330 | 361 | ||
331 | -- FP registers (internally tword sized, but use "f" as operand size). | 362 | -- FP registers (internally tword sized, but use "f" as operand size). |
@@ -334,21 +365,24 @@ mkrmap("f", "Rf") | |||
334 | -- SSE registers (oword sized, but qword and dword accessible). | 365 | -- SSE registers (oword sized, but qword and dword accessible). |
335 | mkrmap("o", "xmm") | 366 | mkrmap("o", "xmm") |
336 | 367 | ||
368 | -- AVX registers (yword sized, but oword, qword and dword accessible). | ||
369 | mkrmap("y", "ymm") | ||
370 | |||
337 | -- Operand size prefixes to codes. | 371 | -- Operand size prefixes to codes. |
338 | local map_opsize = { | 372 | local map_opsize = { |
339 | byte = "b", word = "w", dword = "d", qword = "q", oword = "o", tword = "t", | 373 | byte = "b", word = "w", dword = "d", qword = "q", oword = "o", yword = "y", |
340 | aword = addrsize, | 374 | tword = "t", aword = addrsize, |
341 | } | 375 | } |
342 | 376 | ||
343 | -- Operand size code to number. | 377 | -- Operand size code to number. |
344 | local map_opsizenum = { | 378 | local map_opsizenum = { |
345 | b = 1, w = 2, d = 4, q = 8, o = 16, t = 10, | 379 | b = 1, w = 2, d = 4, q = 8, o = 16, y = 32, t = 10, |
346 | } | 380 | } |
347 | 381 | ||
348 | -- Operand size code to name. | 382 | -- Operand size code to name. |
349 | local map_opsizename = { | 383 | local map_opsizename = { |
350 | b = "byte", w = "word", d = "dword", q = "qword", o = "oword", t = "tword", | 384 | b = "byte", w = "word", d = "dword", q = "qword", o = "oword", y = "yword", |
351 | f = "fpword", | 385 | t = "tword", f = "fpword", |
352 | } | 386 | } |
353 | 387 | ||
354 | -- Valid index register scale factors. | 388 | -- Valid index register scale factors. |
@@ -460,9 +494,45 @@ local function wputszarg(sz, n) | |||
460 | end | 494 | end |
461 | 495 | ||
462 | -- Put multi-byte opcode with operand-size dependent modifications. | 496 | -- Put multi-byte opcode with operand-size dependent modifications. |
463 | local function wputop(sz, op, rex) | 497 | local function wputop(sz, op, rex, vex, vregr, vregxb) |
498 | local psz, sk = 0, nil | ||
499 | if vex then | ||
500 | local tail | ||
501 | if vex.m == 1 and band(rex, 11) == 0 then | ||
502 | if x64 and vregxb then | ||
503 | sk = map_vreg["modrm.reg"] | ||
504 | else | ||
505 | wputb(0xc5) | ||
506 | tail = shl(bxor(band(rex, 4), 4), 5) | ||
507 | psz = 3 | ||
508 | end | ||
509 | end | ||
510 | if not tail then | ||
511 | wputb(0xc4) | ||
512 | wputb(shl(bxor(band(rex, 7), 7), 5) + vex.m) | ||
513 | tail = shl(band(rex, 8), 4) | ||
514 | psz = 4 | ||
515 | end | ||
516 | local reg, vreg = 0, nil | ||
517 | if vex.v then | ||
518 | reg = vex.v.reg | ||
519 | if not reg then werror("bad vex operand") end | ||
520 | if reg < 0 then reg = 0; vreg = vex.v.vreg end | ||
521 | end | ||
522 | if sz == "y" or vex.l then tail = tail + 4 end | ||
523 | wputb(tail + shl(bxor(reg, 15), 3) + vex.p) | ||
524 | wvreg("vex.v", vreg) | ||
525 | rex = 0 | ||
526 | if op >= 256 then werror("bad vex opcode") end | ||
527 | else | ||
528 | if rex ~= 0 then | ||
529 | if not x64 then werror("bad operand size") end | ||
530 | elseif (vregr or vregxb) and x64 then | ||
531 | rex = 0x10 | ||
532 | sk = map_vreg["vex.v"] | ||
533 | end | ||
534 | end | ||
464 | local r | 535 | local r |
465 | if rex ~= 0 and not x64 then werror("bad operand size") end | ||
466 | if sz == "w" then wputb(102) end | 536 | if sz == "w" then wputb(102) end |
467 | -- Needs >32 bit numbers, but only for crc32 eax, word [ebx] | 537 | -- Needs >32 bit numbers, but only for crc32 eax, word [ebx] |
468 | if op >= 4294967296 then r = op%4294967296 wputb((op-r)/4294967296) op = r end | 538 | if op >= 4294967296 then r = op%4294967296 wputb((op-r)/4294967296) op = r end |
@@ -471,20 +541,20 @@ local function wputop(sz, op, rex) | |||
471 | if rex ~= 0 then | 541 | if rex ~= 0 then |
472 | local opc3 = band(op, 0xffff00) | 542 | local opc3 = band(op, 0xffff00) |
473 | if opc3 == 0x0f3a00 or opc3 == 0x0f3800 then | 543 | if opc3 == 0x0f3a00 or opc3 == 0x0f3800 then |
474 | wputb(64 + band(rex, 15)); rex = 0 | 544 | wputb(64 + band(rex, 15)); rex = 0; psz = 2 |
475 | end | 545 | end |
476 | end | 546 | end |
477 | wputb(shr(op, 16)); op = band(op, 0xffff) | 547 | wputb(shr(op, 16)); op = band(op, 0xffff); psz = psz + 1 |
478 | end | 548 | end |
479 | if op >= 256 then | 549 | if op >= 256 then |
480 | local b = shr(op, 8) | 550 | local b = shr(op, 8) |
481 | if b == 15 and rex ~= 0 then wputb(64 + band(rex, 15)); rex = 0 end | 551 | if b == 15 and rex ~= 0 then wputb(64 + band(rex, 15)); rex = 0; psz = 2 end |
482 | wputb(b) | 552 | wputb(b); op = band(op, 255); psz = psz + 1 |
483 | op = band(op, 255) | ||
484 | end | 553 | end |
485 | if rex ~= 0 then wputb(64 + band(rex, 15)) end | 554 | if rex ~= 0 then wputb(64 + band(rex, 15)); psz = 2 end |
486 | if sz == "b" then op = op - 1 end | 555 | if sz == "b" then op = op - 1 end |
487 | wputb(op) | 556 | wputb(op) |
557 | return psz, sk | ||
488 | end | 558 | end |
489 | 559 | ||
490 | -- Put ModRM or SIB formatted byte. | 560 | -- Put ModRM or SIB formatted byte. |
@@ -494,7 +564,7 @@ local function wputmodrm(m, s, rm, vs, vrm) | |||
494 | end | 564 | end |
495 | 565 | ||
496 | -- Put ModRM/SIB plus optional displacement. | 566 | -- Put ModRM/SIB plus optional displacement. |
497 | local function wputmrmsib(t, imark, s, vsreg) | 567 | local function wputmrmsib(t, imark, s, vsreg, psz, sk) |
498 | local vreg, vxreg | 568 | local vreg, vxreg |
499 | local reg, xreg = t.reg, t.xreg | 569 | local reg, xreg = t.reg, t.xreg |
500 | if reg and reg < 0 then reg = 0; vreg = t.vreg end | 570 | if reg and reg < 0 then reg = 0; vreg = t.vreg end |
@@ -504,8 +574,8 @@ local function wputmrmsib(t, imark, s, vsreg) | |||
504 | -- Register mode. | 574 | -- Register mode. |
505 | if sub(t.mode, 1, 1) == "r" then | 575 | if sub(t.mode, 1, 1) == "r" then |
506 | wputmodrm(3, s, reg) | 576 | wputmodrm(3, s, reg) |
507 | if vsreg then waction("VREG", vsreg); wputxb(2) end | 577 | wvreg("modrm.reg", vsreg, psz+1, sk, vreg) |
508 | if vreg then waction("VREG", vreg); wputxb(0) end | 578 | wvreg("modrm.rm.r", vreg, psz+1, sk) |
509 | return | 579 | return |
510 | end | 580 | end |
511 | 581 | ||
@@ -519,21 +589,22 @@ local function wputmrmsib(t, imark, s, vsreg) | |||
519 | -- [xreg*xsc+disp] -> (0, s, esp) (xsc, xreg, ebp) | 589 | -- [xreg*xsc+disp] -> (0, s, esp) (xsc, xreg, ebp) |
520 | wputmodrm(0, s, 4) | 590 | wputmodrm(0, s, 4) |
521 | if imark == "I" then waction("MARK") end | 591 | if imark == "I" then waction("MARK") end |
522 | if vsreg then waction("VREG", vsreg); wputxb(2) end | 592 | wvreg("modrm.reg", vsreg, psz+1, sk, vxreg) |
523 | wputmodrm(t.xsc, xreg, 5) | 593 | wputmodrm(t.xsc, xreg, 5) |
524 | if vxreg then waction("VREG", vxreg); wputxb(3) end | 594 | wvreg("sib.index", vxreg, psz+2, sk) |
525 | else | 595 | else |
526 | -- Pure 32 bit displacement. | 596 | -- Pure 32 bit displacement. |
527 | if x64 and tdisp ~= "table" then | 597 | if x64 and tdisp ~= "table" then |
528 | wputmodrm(0, s, 4) -- [disp] -> (0, s, esp) (0, esp, ebp) | 598 | wputmodrm(0, s, 4) -- [disp] -> (0, s, esp) (0, esp, ebp) |
599 | wvreg("modrm.reg", vsreg, psz+1, sk) | ||
529 | if imark == "I" then waction("MARK") end | 600 | if imark == "I" then waction("MARK") end |
530 | wputmodrm(0, 4, 5) | 601 | wputmodrm(0, 4, 5) |
531 | else | 602 | else |
532 | riprel = x64 | 603 | riprel = x64 |
533 | wputmodrm(0, s, 5) -- [disp|rip-label] -> (0, s, ebp) | 604 | wputmodrm(0, s, 5) -- [disp|rip-label] -> (0, s, ebp) |
605 | wvreg("modrm.reg", vsreg, psz+1, sk) | ||
534 | if imark == "I" then waction("MARK") end | 606 | if imark == "I" then waction("MARK") end |
535 | end | 607 | end |
536 | if vsreg then waction("VREG", vsreg); wputxb(2) end | ||
537 | end | 608 | end |
538 | if riprel then -- Emit rip-relative displacement. | 609 | if riprel then -- Emit rip-relative displacement. |
539 | if match("UWSiI", imark) then | 610 | if match("UWSiI", imark) then |
@@ -561,16 +632,16 @@ local function wputmrmsib(t, imark, s, vsreg) | |||
561 | if xreg or band(reg, 7) == 4 then | 632 | if xreg or band(reg, 7) == 4 then |
562 | wputmodrm(m or 2, s, 4) -- ModRM. | 633 | wputmodrm(m or 2, s, 4) -- ModRM. |
563 | if m == nil or imark == "I" then waction("MARK") end | 634 | if m == nil or imark == "I" then waction("MARK") end |
564 | if vsreg then waction("VREG", vsreg); wputxb(2) end | 635 | wvreg("modrm.reg", vsreg, psz+1, sk, vxreg or vreg) |
565 | wputmodrm(t.xsc or 0, xreg or 4, reg) -- SIB. | 636 | wputmodrm(t.xsc or 0, xreg or 4, reg) -- SIB. |
566 | if vxreg then waction("VREG", vxreg); wputxb(3) end | 637 | wvreg("sib.index", vxreg, psz+2, sk, vreg) |
567 | if vreg then waction("VREG", vreg); wputxb(1) end | 638 | wvreg("sib.base", vreg, psz+2, sk) |
568 | else | 639 | else |
569 | wputmodrm(m or 2, s, reg) -- ModRM. | 640 | wputmodrm(m or 2, s, reg) -- ModRM. |
570 | if (imark == "I" and (m == 1 or m == 2)) or | 641 | if (imark == "I" and (m == 1 or m == 2)) or |
571 | (m == nil and (vsreg or vreg)) then waction("MARK") end | 642 | (m == nil and (vsreg or vreg)) then waction("MARK") end |
572 | if vsreg then waction("VREG", vsreg); wputxb(2) end | 643 | wvreg("modrm.reg", vsreg, psz+1, sk, vreg) |
573 | if vreg then waction("VREG", vreg); wputxb(1) end | 644 | wvreg("modrm.rm.m", vreg, psz+1, sk) |
574 | end | 645 | end |
575 | 646 | ||
576 | -- Put displacement. | 647 | -- Put displacement. |
@@ -881,9 +952,16 @@ end | |||
881 | -- "m"/"M" generates ModRM/SIB from the 1st/2nd operand. | 952 | -- "m"/"M" generates ModRM/SIB from the 1st/2nd operand. |
882 | -- The spare 3 bits are either filled with the last hex digit or | 953 | -- The spare 3 bits are either filled with the last hex digit or |
883 | -- the result from a previous "r"/"R". The opcode is restored. | 954 | -- the result from a previous "r"/"R". The opcode is restored. |
955 | -- "u" Use VEX encoding, vvvv unused. | ||
956 | -- "v"/"V" Use VEX encoding, vvvv from 1st/2nd operand (the operand is | ||
957 | -- removed from the list used by future characters). | ||
958 | -- "w" Use VEX encoding, vvvv from 3rd operand. | ||
959 | -- "L" Force VEX.L | ||
884 | -- | 960 | -- |
885 | -- All of the following characters force a flush of the opcode: | 961 | -- All of the following characters force a flush of the opcode: |
886 | -- "o"/"O" stores a pure 32 bit disp (offset) from the 1st/2nd operand. | 962 | -- "o"/"O" stores a pure 32 bit disp (offset) from the 1st/2nd operand. |
963 | -- "s" stores a 4 bit immediate from the last register operand, | ||
964 | -- followed by 4 zero bits. | ||
887 | -- "S" stores a signed 8 bit immediate from the last operand. | 965 | -- "S" stores a signed 8 bit immediate from the last operand. |
888 | -- "U" stores an unsigned 8 bit immediate from the last operand. | 966 | -- "U" stores an unsigned 8 bit immediate from the last operand. |
889 | -- "W" stores an unsigned 16 bit immediate from the last operand. | 967 | -- "W" stores an unsigned 16 bit immediate from the last operand. |
@@ -1226,46 +1304,14 @@ local map_op = { | |||
1226 | movups_2 = "rmo:0F10rM|mro:0F11Rm", | 1304 | movups_2 = "rmo:0F10rM|mro:0F11Rm", |
1227 | orpd_2 = "rmo:660F56rM", | 1305 | orpd_2 = "rmo:660F56rM", |
1228 | orps_2 = "rmo:0F56rM", | 1306 | orps_2 = "rmo:0F56rM", |
1229 | packssdw_2 = "rmo:660F6BrM", | ||
1230 | packsswb_2 = "rmo:660F63rM", | ||
1231 | packuswb_2 = "rmo:660F67rM", | ||
1232 | paddb_2 = "rmo:660FFCrM", | ||
1233 | paddd_2 = "rmo:660FFErM", | ||
1234 | paddq_2 = "rmo:660FD4rM", | ||
1235 | paddsb_2 = "rmo:660FECrM", | ||
1236 | paddsw_2 = "rmo:660FEDrM", | ||
1237 | paddusb_2 = "rmo:660FDCrM", | ||
1238 | paddusw_2 = "rmo:660FDDrM", | ||
1239 | paddw_2 = "rmo:660FFDrM", | ||
1240 | pand_2 = "rmo:660FDBrM", | ||
1241 | pandn_2 = "rmo:660FDFrM", | ||
1242 | pause_0 = "F390", | 1307 | pause_0 = "F390", |
1243 | pavgb_2 = "rmo:660FE0rM", | ||
1244 | pavgw_2 = "rmo:660FE3rM", | ||
1245 | pcmpeqb_2 = "rmo:660F74rM", | ||
1246 | pcmpeqd_2 = "rmo:660F76rM", | ||
1247 | pcmpeqw_2 = "rmo:660F75rM", | ||
1248 | pcmpgtb_2 = "rmo:660F64rM", | ||
1249 | pcmpgtd_2 = "rmo:660F66rM", | ||
1250 | pcmpgtw_2 = "rmo:660F65rM", | ||
1251 | pextrw_3 = "rri/do:660FC5rMU|xri/wo:660F3A15nRmU", -- Mem op: SSE4.1 only. | 1308 | pextrw_3 = "rri/do:660FC5rMU|xri/wo:660F3A15nRmU", -- Mem op: SSE4.1 only. |
1252 | pinsrw_3 = "rri/od:660FC4rMU|rxi/ow:", | 1309 | pinsrw_3 = "rri/od:660FC4rMU|rxi/ow:", |
1253 | pmaddwd_2 = "rmo:660FF5rM", | ||
1254 | pmaxsw_2 = "rmo:660FEErM", | ||
1255 | pmaxub_2 = "rmo:660FDErM", | ||
1256 | pminsw_2 = "rmo:660FEArM", | ||
1257 | pminub_2 = "rmo:660FDArM", | ||
1258 | pmovmskb_2 = "rr/do:660FD7rM", | 1310 | pmovmskb_2 = "rr/do:660FD7rM", |
1259 | pmulhuw_2 = "rmo:660FE4rM", | ||
1260 | pmulhw_2 = "rmo:660FE5rM", | ||
1261 | pmullw_2 = "rmo:660FD5rM", | ||
1262 | pmuludq_2 = "rmo:660FF4rM", | ||
1263 | por_2 = "rmo:660FEBrM", | ||
1264 | prefetchnta_1 = "xb:n0F180m", | 1311 | prefetchnta_1 = "xb:n0F180m", |
1265 | prefetcht0_1 = "xb:n0F181m", | 1312 | prefetcht0_1 = "xb:n0F181m", |
1266 | prefetcht1_1 = "xb:n0F182m", | 1313 | prefetcht1_1 = "xb:n0F182m", |
1267 | prefetcht2_1 = "xb:n0F183m", | 1314 | prefetcht2_1 = "xb:n0F183m", |
1268 | psadbw_2 = "rmo:660FF6rM", | ||
1269 | pshufd_3 = "rmio:660F70rMU", | 1315 | pshufd_3 = "rmio:660F70rMU", |
1270 | pshufhw_3 = "rmio:F30F70rMU", | 1316 | pshufhw_3 = "rmio:F30F70rMU", |
1271 | pshuflw_3 = "rmio:F20F70rMU", | 1317 | pshuflw_3 = "rmio:F20F70rMU", |
@@ -1279,23 +1325,6 @@ local map_op = { | |||
1279 | psrldq_2 = "rio:660F733mU", | 1325 | psrldq_2 = "rio:660F733mU", |
1280 | psrlq_2 = "rmo:660FD3rM|rio:660F732mU", | 1326 | psrlq_2 = "rmo:660FD3rM|rio:660F732mU", |
1281 | psrlw_2 = "rmo:660FD1rM|rio:660F712mU", | 1327 | psrlw_2 = "rmo:660FD1rM|rio:660F712mU", |
1282 | psubb_2 = "rmo:660FF8rM", | ||
1283 | psubd_2 = "rmo:660FFArM", | ||
1284 | psubq_2 = "rmo:660FFBrM", | ||
1285 | psubsb_2 = "rmo:660FE8rM", | ||
1286 | psubsw_2 = "rmo:660FE9rM", | ||
1287 | psubusb_2 = "rmo:660FD8rM", | ||
1288 | psubusw_2 = "rmo:660FD9rM", | ||
1289 | psubw_2 = "rmo:660FF9rM", | ||
1290 | punpckhbw_2 = "rmo:660F68rM", | ||
1291 | punpckhdq_2 = "rmo:660F6ArM", | ||
1292 | punpckhqdq_2 = "rmo:660F6DrM", | ||
1293 | punpckhwd_2 = "rmo:660F69rM", | ||
1294 | punpcklbw_2 = "rmo:660F60rM", | ||
1295 | punpckldq_2 = "rmo:660F62rM", | ||
1296 | punpcklqdq_2 = "rmo:660F6CrM", | ||
1297 | punpcklwd_2 = "rmo:660F61rM", | ||
1298 | pxor_2 = "rmo:660FEFrM", | ||
1299 | rcpps_2 = "rmo:0F53rM", | 1328 | rcpps_2 = "rmo:0F53rM", |
1300 | rcpss_2 = "rro:F30F53rM|rx/od:", | 1329 | rcpss_2 = "rro:F30F53rM|rx/od:", |
1301 | rsqrtps_2 = "rmo:0F52rM", | 1330 | rsqrtps_2 = "rmo:0F52rM", |
@@ -1413,6 +1442,327 @@ local map_op = { | |||
1413 | movntsd_2 = "xr/qo:nF20F2BRm", | 1442 | movntsd_2 = "xr/qo:nF20F2BRm", |
1414 | movntss_2 = "xr/do:F30F2BRm", | 1443 | movntss_2 = "xr/do:F30F2BRm", |
1415 | -- popcnt is also in SSE4.2 | 1444 | -- popcnt is also in SSE4.2 |
1445 | |||
1446 | -- AES-NI | ||
1447 | aesdec_2 = "rmo:660F38DErM", | ||
1448 | aesdeclast_2 = "rmo:660F38DFrM", | ||
1449 | aesenc_2 = "rmo:660F38DCrM", | ||
1450 | aesenclast_2 = "rmo:660F38DDrM", | ||
1451 | aesimc_2 = "rmo:660F38DBrM", | ||
1452 | aeskeygenassist_3 = "rmio:660F3ADFrMU", | ||
1453 | pclmulqdq_3 = "rmio:660F3A44rMU", | ||
1454 | |||
1455 | -- AVX FP ops | ||
1456 | vaddsubpd_3 = "rrmoy:660FVD0rM", | ||
1457 | vaddsubps_3 = "rrmoy:F20FVD0rM", | ||
1458 | vandpd_3 = "rrmoy:660FV54rM", | ||
1459 | vandps_3 = "rrmoy:0FV54rM", | ||
1460 | vandnpd_3 = "rrmoy:660FV55rM", | ||
1461 | vandnps_3 = "rrmoy:0FV55rM", | ||
1462 | vblendpd_4 = "rrmioy:660F3AV0DrMU", | ||
1463 | vblendps_4 = "rrmioy:660F3AV0CrMU", | ||
1464 | vblendvpd_4 = "rrmroy:660F3AV4BrMs", | ||
1465 | vblendvps_4 = "rrmroy:660F3AV4ArMs", | ||
1466 | vbroadcastf128_2 = "rx/yo:660F38u1ArM", | ||
1467 | vcmppd_4 = "rrmioy:660FVC2rMU", | ||
1468 | vcmpps_4 = "rrmioy:0FVC2rMU", | ||
1469 | vcmpsd_4 = "rrrio:F20FVC2rMU|rrxi/ooq:", | ||
1470 | vcmpss_4 = "rrrio:F30FVC2rMU|rrxi/ood:", | ||
1471 | vcomisd_2 = "rro:660Fu2FrM|rx/oq:", | ||
1472 | vcomiss_2 = "rro:0Fu2FrM|rx/od:", | ||
1473 | vcvtdq2pd_2 = "rro:F30FuE6rM|rx/oq:|rm/yo:", | ||
1474 | vcvtdq2ps_2 = "rmoy:0Fu5BrM", | ||
1475 | vcvtpd2dq_2 = "rmoy:F20FuE6rM", | ||
1476 | vcvtpd2ps_2 = "rmoy:660Fu5ArM", | ||
1477 | vcvtps2dq_2 = "rmoy:660Fu5BrM", | ||
1478 | vcvtps2pd_2 = "rro:0Fu5ArM|rx/oq:|rm/yo:", | ||
1479 | vcvtsd2si_2 = "rr/do:F20Fu2DrM|rx/dq:|rr/qo:|rxq:", | ||
1480 | vcvtsd2ss_3 = "rrro:F20FV5ArM|rrx/ooq:", | ||
1481 | vcvtsi2sd_3 = "rrm/ood:F20FV2ArM|rrm/ooq:F20FVX2ArM", | ||
1482 | vcvtsi2ss_3 = "rrm/ood:F30FV2ArM|rrm/ooq:F30FVX2ArM", | ||
1483 | vcvtss2sd_3 = "rrro:F30FV5ArM|rrx/ood:", | ||
1484 | vcvtss2si_2 = "rr/do:F30Fu2DrM|rxd:|rr/qo:|rx/qd:", | ||
1485 | vcvttpd2dq_2 = "rmo:660FuE6rM|rm/oy:660FuLE6rM", | ||
1486 | vcvttps2dq_2 = "rmoy:F30Fu5BrM", | ||
1487 | vcvttsd2si_2 = "rr/do:F20Fu2CrM|rx/dq:|rr/qo:|rxq:", | ||
1488 | vcvttss2si_2 = "rr/do:F30Fu2CrM|rxd:|rr/qo:|rx/qd:", | ||
1489 | vdppd_4 = "rrmio:660F3AV41rMU", | ||
1490 | vdpps_4 = "rrmioy:660F3AV40rMU", | ||
1491 | vextractf128_3 = "mri/oy:660F3AuL19RmU", | ||
1492 | vextractps_3 = "mri/do:660F3Au17RmU", | ||
1493 | vhaddpd_3 = "rrmoy:660FV7CrM", | ||
1494 | vhaddps_3 = "rrmoy:F20FV7CrM", | ||
1495 | vhsubpd_3 = "rrmoy:660FV7DrM", | ||
1496 | vhsubps_3 = "rrmoy:F20FV7DrM", | ||
1497 | vinsertf128_4 = "rrmi/yyo:660F3AV18rMU", | ||
1498 | vinsertps_4 = "rrrio:660F3AV21rMU|rrxi/ood:", | ||
1499 | vldmxcsr_1 = "xd:0FuAE2m", | ||
1500 | vmaskmovps_3 = "rrxoy:660F38V2CrM|xrroy:660F38V2ERm", | ||
1501 | vmaskmovpd_3 = "rrxoy:660F38V2DrM|xrroy:660F38V2FRm", | ||
1502 | vmovapd_2 = "rmoy:660Fu28rM|mroy:660Fu29Rm", | ||
1503 | vmovaps_2 = "rmoy:0Fu28rM|mroy:0Fu29Rm", | ||
1504 | vmovd_2 = "rm/od:660Fu6ErM|rm/oq:660FuX6ErM|mr/do:660Fu7ERm|mr/qo:", | ||
1505 | vmovq_2 = "rro:F30Fu7ErM|rx/oq:|xr/qo:660FuD6Rm", | ||
1506 | vmovddup_2 = "rmy:F20Fu12rM|rro:|rx/oq:", | ||
1507 | vmovhlps_3 = "rrro:0FV12rM", | ||
1508 | vmovhpd_2 = "xr/qo:660Fu17Rm", | ||
1509 | vmovhpd_3 = "rrx/ooq:660FV16rM", | ||
1510 | vmovhps_2 = "xr/qo:0Fu17Rm", | ||
1511 | vmovhps_3 = "rrx/ooq:0FV16rM", | ||
1512 | vmovlhps_3 = "rrro:0FV16rM", | ||
1513 | vmovlpd_2 = "xr/qo:660Fu13Rm", | ||
1514 | vmovlpd_3 = "rrx/ooq:660FV12rM", | ||
1515 | vmovlps_2 = "xr/qo:0Fu13Rm", | ||
1516 | vmovlps_3 = "rrx/ooq:0FV12rM", | ||
1517 | vmovmskpd_2 = "rr/do:660Fu50rM|rr/dy:660FuL50rM", | ||
1518 | vmovmskps_2 = "rr/do:0Fu50rM|rr/dy:0FuL50rM", | ||
1519 | vmovntpd_2 = "xroy:660Fu2BRm", | ||
1520 | vmovntps_2 = "xroy:0Fu2BRm", | ||
1521 | vmovsd_2 = "rx/oq:F20Fu10rM|xr/qo:F20Fu11Rm", | ||
1522 | vmovsd_3 = "rrro:F20FV10rM", | ||
1523 | vmovshdup_2 = "rmoy:F30Fu16rM", | ||
1524 | vmovsldup_2 = "rmoy:F30Fu12rM", | ||
1525 | vmovss_2 = "rx/od:F30Fu10rM|xr/do:F30Fu11Rm", | ||
1526 | vmovss_3 = "rrro:F30FV10rM", | ||
1527 | vmovupd_2 = "rmoy:660Fu10rM|mroy:660Fu11Rm", | ||
1528 | vmovups_2 = "rmoy:0Fu10rM|mroy:0Fu11Rm", | ||
1529 | vorpd_3 = "rrmoy:660FV56rM", | ||
1530 | vorps_3 = "rrmoy:0FV56rM", | ||
1531 | vpermilpd_3 = "rrmoy:660F38V0DrM|rmioy:660F3Au05rMU", | ||
1532 | vpermilps_3 = "rrmoy:660F38V0CrM|rmioy:660F3Au04rMU", | ||
1533 | vperm2f128_4 = "rrmiy:660F3AV06rMU", | ||
1534 | vptestpd_2 = "rmoy:660F38u0FrM", | ||
1535 | vptestps_2 = "rmoy:660F38u0ErM", | ||
1536 | vrcpps_2 = "rmoy:0Fu53rM", | ||
1537 | vrcpss_3 = "rrro:F30FV53rM|rrx/ood:", | ||
1538 | vrsqrtps_2 = "rmoy:0Fu52rM", | ||
1539 | vrsqrtss_3 = "rrro:F30FV52rM|rrx/ood:", | ||
1540 | vroundpd_3 = "rmioy:660F3Au09rMU", | ||
1541 | vroundps_3 = "rmioy:660F3Au08rMU", | ||
1542 | vroundsd_4 = "rrrio:660F3AV0BrMU|rrxi/ooq:", | ||
1543 | vroundss_4 = "rrrio:660F3AV0ArMU|rrxi/ood:", | ||
1544 | vshufpd_4 = "rrmioy:660FVC6rMU", | ||
1545 | vshufps_4 = "rrmioy:0FVC6rMU", | ||
1546 | vsqrtps_2 = "rmoy:0Fu51rM", | ||
1547 | vsqrtss_2 = "rro:F30Fu51rM|rx/od:", | ||
1548 | vsqrtpd_2 = "rmoy:660Fu51rM", | ||
1549 | vsqrtsd_2 = "rro:F20Fu51rM|rx/oq:", | ||
1550 | vstmxcsr_1 = "xd:0FuAE3m", | ||
1551 | vucomisd_2 = "rro:660Fu2ErM|rx/oq:", | ||
1552 | vucomiss_2 = "rro:0Fu2ErM|rx/od:", | ||
1553 | vunpckhpd_3 = "rrmoy:660FV15rM", | ||
1554 | vunpckhps_3 = "rrmoy:0FV15rM", | ||
1555 | vunpcklpd_3 = "rrmoy:660FV14rM", | ||
1556 | vunpcklps_3 = "rrmoy:0FV14rM", | ||
1557 | vxorpd_3 = "rrmoy:660FV57rM", | ||
1558 | vxorps_3 = "rrmoy:0FV57rM", | ||
1559 | vzeroall_0 = "0FuL77", | ||
1560 | vzeroupper_0 = "0Fu77", | ||
1561 | |||
1562 | -- AVX2 FP ops | ||
1563 | vbroadcastss_2 = "rx/od:660F38u18rM|rx/yd:|rro:|rr/yo:", | ||
1564 | vbroadcastsd_2 = "rx/yq:660F38u19rM|rr/yo:", | ||
1565 | -- *vgather* (!vsib) | ||
1566 | vpermpd_3 = "rmiy:660F3AuX01rMU", | ||
1567 | vpermps_3 = "rrmy:660F38V16rM", | ||
1568 | |||
1569 | -- AVX, AVX2 integer ops | ||
1570 | -- In general, xmm requires AVX, ymm requires AVX2. | ||
1571 | vaesdec_3 = "rrmo:660F38VDErM", | ||
1572 | vaesdeclast_3 = "rrmo:660F38VDFrM", | ||
1573 | vaesenc_3 = "rrmo:660F38VDCrM", | ||
1574 | vaesenclast_3 = "rrmo:660F38VDDrM", | ||
1575 | vaesimc_2 = "rmo:660F38uDBrM", | ||
1576 | vaeskeygenassist_3 = "rmio:660F3AuDFrMU", | ||
1577 | vlddqu_2 = "rxoy:F20FuF0rM", | ||
1578 | vmaskmovdqu_2 = "rro:660FuF7rM", | ||
1579 | vmovdqa_2 = "rmoy:660Fu6FrM|mroy:660Fu7FRm", | ||
1580 | vmovdqu_2 = "rmoy:F30Fu6FrM|mroy:F30Fu7FRm", | ||
1581 | vmovntdq_2 = "xroy:660FuE7Rm", | ||
1582 | vmovntdqa_2 = "rxoy:660F38u2ArM", | ||
1583 | vmpsadbw_4 = "rrmioy:660F3AV42rMU", | ||
1584 | vpabsb_2 = "rmoy:660F38u1CrM", | ||
1585 | vpabsd_2 = "rmoy:660F38u1ErM", | ||
1586 | vpabsw_2 = "rmoy:660F38u1DrM", | ||
1587 | vpackusdw_3 = "rrmoy:660F38V2BrM", | ||
1588 | vpalignr_4 = "rrmioy:660F3AV0FrMU", | ||
1589 | vpblendvb_4 = "rrmroy:660F3AV4CrMs", | ||
1590 | vpblendw_4 = "rrmioy:660F3AV0ErMU", | ||
1591 | vpclmulqdq_4 = "rrmio:660F3AV44rMU", | ||
1592 | vpcmpeqq_3 = "rrmoy:660F38V29rM", | ||
1593 | vpcmpestri_3 = "rmio:660F3Au61rMU", | ||
1594 | vpcmpestrm_3 = "rmio:660F3Au60rMU", | ||
1595 | vpcmpgtq_3 = "rrmoy:660F38V37rM", | ||
1596 | vpcmpistri_3 = "rmio:660F3Au63rMU", | ||
1597 | vpcmpistrm_3 = "rmio:660F3Au62rMU", | ||
1598 | vpextrb_3 = "rri/do:660F3Au14nRmU|rri/qo:|xri/bo:", | ||
1599 | vpextrw_3 = "rri/do:660FuC5rMU|xri/wo:660F3Au15nRmU", | ||
1600 | vpextrd_3 = "mri/do:660F3Au16RmU", | ||
1601 | vpextrq_3 = "mri/qo:660F3Au16RmU", | ||
1602 | vphaddw_3 = "rrmoy:660F38V01rM", | ||
1603 | vphaddd_3 = "rrmoy:660F38V02rM", | ||
1604 | vphaddsw_3 = "rrmoy:660F38V03rM", | ||
1605 | vphminposuw_2 = "rmo:660F38u41rM", | ||
1606 | vphsubw_3 = "rrmoy:660F38V05rM", | ||
1607 | vphsubd_3 = "rrmoy:660F38V06rM", | ||
1608 | vphsubsw_3 = "rrmoy:660F38V07rM", | ||
1609 | vpinsrb_4 = "rrri/ood:660F3AV20rMU|rrxi/oob:", | ||
1610 | vpinsrw_4 = "rrri/ood:660FVC4rMU|rrxi/oow:", | ||
1611 | vpinsrd_4 = "rrmi/ood:660F3AV22rMU", | ||
1612 | vpinsrq_4 = "rrmi/ooq:660F3AVX22rMU", | ||
1613 | vpmaddubsw_3 = "rrmoy:660F38V04rM", | ||
1614 | vpmaxsb_3 = "rrmoy:660F38V3CrM", | ||
1615 | vpmaxsd_3 = "rrmoy:660F38V3DrM", | ||
1616 | vpmaxuw_3 = "rrmoy:660F38V3ErM", | ||
1617 | vpmaxud_3 = "rrmoy:660F38V3FrM", | ||
1618 | vpminsb_3 = "rrmoy:660F38V38rM", | ||
1619 | vpminsd_3 = "rrmoy:660F38V39rM", | ||
1620 | vpminuw_3 = "rrmoy:660F38V3ArM", | ||
1621 | vpminud_3 = "rrmoy:660F38V3BrM", | ||
1622 | vpmovmskb_2 = "rr/do:660FuD7rM|rr/dy:660FuLD7rM", | ||
1623 | vpmovsxbw_2 = "rroy:660F38u20rM|rx/oq:|rx/yo:", | ||
1624 | vpmovsxbd_2 = "rroy:660F38u21rM|rx/od:|rx/yq:", | ||
1625 | vpmovsxbq_2 = "rroy:660F38u22rM|rx/ow:|rx/yd:", | ||
1626 | vpmovsxwd_2 = "rroy:660F38u23rM|rx/oq:|rx/yo:", | ||
1627 | vpmovsxwq_2 = "rroy:660F38u24rM|rx/od:|rx/yq:", | ||
1628 | vpmovsxdq_2 = "rroy:660F38u25rM|rx/oq:|rx/yo:", | ||
1629 | vpmovzxbw_2 = "rroy:660F38u30rM|rx/oq:|rx/yo:", | ||
1630 | vpmovzxbd_2 = "rroy:660F38u31rM|rx/od:|rx/yq:", | ||
1631 | vpmovzxbq_2 = "rroy:660F38u32rM|rx/ow:|rx/yd:", | ||
1632 | vpmovzxwd_2 = "rroy:660F38u33rM|rx/oq:|rx/yo:", | ||
1633 | vpmovzxwq_2 = "rroy:660F38u34rM|rx/od:|rx/yq:", | ||
1634 | vpmovzxdq_2 = "rroy:660F38u35rM|rx/oq:|rx/yo:", | ||
1635 | vpmuldq_3 = "rrmoy:660F38V28rM", | ||
1636 | vpmulhrsw_3 = "rrmoy:660F38V0BrM", | ||
1637 | vpmulld_3 = "rrmoy:660F38V40rM", | ||
1638 | vpshufb_3 = "rrmoy:660F38V00rM", | ||
1639 | vpshufd_3 = "rmioy:660Fu70rMU", | ||
1640 | vpshufhw_3 = "rmioy:F30Fu70rMU", | ||
1641 | vpshuflw_3 = "rmioy:F20Fu70rMU", | ||
1642 | vpsignb_3 = "rrmoy:660F38V08rM", | ||
1643 | vpsignw_3 = "rrmoy:660F38V09rM", | ||
1644 | vpsignd_3 = "rrmoy:660F38V0ArM", | ||
1645 | vpslldq_3 = "rrioy:660Fv737mU", | ||
1646 | vpsllw_3 = "rrmoy:660FVF1rM|rrioy:660Fv716mU", | ||
1647 | vpslld_3 = "rrmoy:660FVF2rM|rrioy:660Fv726mU", | ||
1648 | vpsllq_3 = "rrmoy:660FVF3rM|rrioy:660Fv736mU", | ||
1649 | vpsraw_3 = "rrmoy:660FVE1rM|rrioy:660Fv714mU", | ||
1650 | vpsrad_3 = "rrmoy:660FVE2rM|rrioy:660Fv724mU", | ||
1651 | vpsrldq_3 = "rrioy:660Fv733mU", | ||
1652 | vpsrlw_3 = "rrmoy:660FVD1rM|rrioy:660Fv712mU", | ||
1653 | vpsrld_3 = "rrmoy:660FVD2rM|rrioy:660Fv722mU", | ||
1654 | vpsrlq_3 = "rrmoy:660FVD3rM|rrioy:660Fv732mU", | ||
1655 | vptest_2 = "rmoy:660F38u17rM", | ||
1656 | |||
1657 | -- AVX2 integer ops | ||
1658 | vbroadcasti128_2 = "rx/yo:660F38u5ArM", | ||
1659 | vinserti128_4 = "rrmi/yyo:660F3AV38rMU", | ||
1660 | vextracti128_3 = "mri/oy:660F3AuL39RmU", | ||
1661 | vpblendd_4 = "rrmioy:660F3AV02rMU", | ||
1662 | vpbroadcastb_2 = "rro:660F38u78rM|rx/ob:|rr/yo:|rx/yb:", | ||
1663 | vpbroadcastw_2 = "rro:660F38u79rM|rx/ow:|rr/yo:|rx/yw:", | ||
1664 | vpbroadcastd_2 = "rro:660F38u58rM|rx/od:|rr/yo:|rx/yd:", | ||
1665 | vpbroadcastq_2 = "rro:660F38u59rM|rx/oq:|rr/yo:|rx/yq:", | ||
1666 | vpermd_3 = "rrmy:660F38V36rM", | ||
1667 | vpermq_3 = "rmiy:660F3AuX00rMU", | ||
1668 | -- *vpgather* (!vsib) | ||
1669 | vperm2i128_4 = "rrmiy:660F3AV46rMU", | ||
1670 | vpmaskmovd_3 = "rrxoy:660F38V8CrM|xrroy:660F38V8ERm", | ||
1671 | vpmaskmovq_3 = "rrxoy:660F38VX8CrM|xrroy:660F38VX8ERm", | ||
1672 | vpsllvd_3 = "rrmoy:660F38V47rM", | ||
1673 | vpsllvq_3 = "rrmoy:660F38VX47rM", | ||
1674 | vpsravd_3 = "rrmoy:660F38V46rM", | ||
1675 | vpsrlvd_3 = "rrmoy:660F38V45rM", | ||
1676 | vpsrlvq_3 = "rrmoy:660F38VX45rM", | ||
1677 | |||
1678 | -- Intel ADX | ||
1679 | adcx_2 = "rmqd:660F38F6rM", | ||
1680 | adox_2 = "rmqd:F30F38F6rM", | ||
1681 | |||
1682 | -- BMI1 | ||
1683 | andn_3 = "rrmqd:0F38VF2rM", | ||
1684 | bextr_3 = "rmrqd:0F38wF7rM", | ||
1685 | blsi_2 = "rmqd:0F38vF33m", | ||
1686 | blsmsk_2 = "rmqd:0F38vF32m", | ||
1687 | blsr_2 = "rmqd:0F38vF31m", | ||
1688 | tzcnt_2 = "rmqdw:F30FBCrM", | ||
1689 | |||
1690 | -- BMI2 | ||
1691 | bzhi_3 = "rmrqd:0F38wF5rM", | ||
1692 | mulx_3 = "rrmqd:F20F38VF6rM", | ||
1693 | pdep_3 = "rrmqd:F20F38VF5rM", | ||
1694 | pext_3 = "rrmqd:F30F38VF5rM", | ||
1695 | rorx_3 = "rmSqd:F20F3AuF0rMS", | ||
1696 | sarx_3 = "rmrqd:F30F38wF7rM", | ||
1697 | shrx_3 = "rmrqd:F20F38wF7rM", | ||
1698 | shlx_3 = "rmrqd:660F38wF7rM", | ||
1699 | |||
1700 | -- FMA3 | ||
1701 | vfmaddsub132pd_3 = "rrmoy:660F38VX96rM", | ||
1702 | vfmaddsub132ps_3 = "rrmoy:660F38V96rM", | ||
1703 | vfmaddsub213pd_3 = "rrmoy:660F38VXA6rM", | ||
1704 | vfmaddsub213ps_3 = "rrmoy:660F38VA6rM", | ||
1705 | vfmaddsub231pd_3 = "rrmoy:660F38VXB6rM", | ||
1706 | vfmaddsub231ps_3 = "rrmoy:660F38VB6rM", | ||
1707 | |||
1708 | vfmsubadd132pd_3 = "rrmoy:660F38VX97rM", | ||
1709 | vfmsubadd132ps_3 = "rrmoy:660F38V97rM", | ||
1710 | vfmsubadd213pd_3 = "rrmoy:660F38VXA7rM", | ||
1711 | vfmsubadd213ps_3 = "rrmoy:660F38VA7rM", | ||
1712 | vfmsubadd231pd_3 = "rrmoy:660F38VXB7rM", | ||
1713 | vfmsubadd231ps_3 = "rrmoy:660F38VB7rM", | ||
1714 | |||
1715 | vfmadd132pd_3 = "rrmoy:660F38VX98rM", | ||
1716 | vfmadd132ps_3 = "rrmoy:660F38V98rM", | ||
1717 | vfmadd132sd_3 = "rrro:660F38VX99rM|rrx/ooq:", | ||
1718 | vfmadd132ss_3 = "rrro:660F38V99rM|rrx/ood:", | ||
1719 | vfmadd213pd_3 = "rrmoy:660F38VXA8rM", | ||
1720 | vfmadd213ps_3 = "rrmoy:660F38VA8rM", | ||
1721 | vfmadd213sd_3 = "rrro:660F38VXA9rM|rrx/ooq:", | ||
1722 | vfmadd213ss_3 = "rrro:660F38VA9rM|rrx/ood:", | ||
1723 | vfmadd231pd_3 = "rrmoy:660F38VXB8rM", | ||
1724 | vfmadd231ps_3 = "rrmoy:660F38VB8rM", | ||
1725 | vfmadd231sd_3 = "rrro:660F38VXB9rM|rrx/ooq:", | ||
1726 | vfmadd231ss_3 = "rrro:660F38VB9rM|rrx/ood:", | ||
1727 | |||
1728 | vfmsub132pd_3 = "rrmoy:660F38VX9ArM", | ||
1729 | vfmsub132ps_3 = "rrmoy:660F38V9ArM", | ||
1730 | vfmsub132sd_3 = "rrro:660F38VX9BrM|rrx/ooq:", | ||
1731 | vfmsub132ss_3 = "rrro:660F38V9BrM|rrx/ood:", | ||
1732 | vfmsub213pd_3 = "rrmoy:660F38VXAArM", | ||
1733 | vfmsub213ps_3 = "rrmoy:660F38VAArM", | ||
1734 | vfmsub213sd_3 = "rrro:660F38VXABrM|rrx/ooq:", | ||
1735 | vfmsub213ss_3 = "rrro:660F38VABrM|rrx/ood:", | ||
1736 | vfmsub231pd_3 = "rrmoy:660F38VXBArM", | ||
1737 | vfmsub231ps_3 = "rrmoy:660F38VBArM", | ||
1738 | vfmsub231sd_3 = "rrro:660F38VXBBrM|rrx/ooq:", | ||
1739 | vfmsub231ss_3 = "rrro:660F38VBBrM|rrx/ood:", | ||
1740 | |||
1741 | vfnmadd132pd_3 = "rrmoy:660F38VX9CrM", | ||
1742 | vfnmadd132ps_3 = "rrmoy:660F38V9CrM", | ||
1743 | vfnmadd132sd_3 = "rrro:660F38VX9DrM|rrx/ooq:", | ||
1744 | vfnmadd132ss_3 = "rrro:660F38V9DrM|rrx/ood:", | ||
1745 | vfnmadd213pd_3 = "rrmoy:660F38VXACrM", | ||
1746 | vfnmadd213ps_3 = "rrmoy:660F38VACrM", | ||
1747 | vfnmadd213sd_3 = "rrro:660F38VXADrM|rrx/ooq:", | ||
1748 | vfnmadd213ss_3 = "rrro:660F38VADrM|rrx/ood:", | ||
1749 | vfnmadd231pd_3 = "rrmoy:660F38VXBCrM", | ||
1750 | vfnmadd231ps_3 = "rrmoy:660F38VBCrM", | ||
1751 | vfnmadd231sd_3 = "rrro:660F38VXBDrM|rrx/ooq:", | ||
1752 | vfnmadd231ss_3 = "rrro:660F38VBDrM|rrx/ood:", | ||
1753 | |||
1754 | vfnmsub132pd_3 = "rrmoy:660F38VX9ErM", | ||
1755 | vfnmsub132ps_3 = "rrmoy:660F38V9ErM", | ||
1756 | vfnmsub132sd_3 = "rrro:660F38VX9FrM|rrx/ooq:", | ||
1757 | vfnmsub132ss_3 = "rrro:660F38V9FrM|rrx/ood:", | ||
1758 | vfnmsub213pd_3 = "rrmoy:660F38VXAErM", | ||
1759 | vfnmsub213ps_3 = "rrmoy:660F38VAErM", | ||
1760 | vfnmsub213sd_3 = "rrro:660F38VXAFrM|rrx/ooq:", | ||
1761 | vfnmsub213ss_3 = "rrro:660F38VAFrM|rrx/ood:", | ||
1762 | vfnmsub231pd_3 = "rrmoy:660F38VXBErM", | ||
1763 | vfnmsub231ps_3 = "rrmoy:660F38VBErM", | ||
1764 | vfnmsub231sd_3 = "rrro:660F38VXBFrM|rrx/ooq:", | ||
1765 | vfnmsub231ss_3 = "rrro:660F38VBFrM|rrx/ood:", | ||
1416 | } | 1766 | } |
1417 | 1767 | ||
1418 | ------------------------------------------------------------------------------ | 1768 | ------------------------------------------------------------------------------ |
@@ -1463,28 +1813,58 @@ for cc,n in pairs{ b=0, e=1, be=2, u=3, nb=4, ne=5, nbe=6, nu=7 } do | |||
1463 | map_op["fcmov"..cc.."_2"] = format("Fff:%04XR", nc) -- P6+ | 1813 | map_op["fcmov"..cc.."_2"] = format("Fff:%04XR", nc) -- P6+ |
1464 | end | 1814 | end |
1465 | 1815 | ||
1466 | -- SSE FP arithmetic ops. | 1816 | -- SSE / AVX FP arithmetic ops. |
1467 | for name,n in pairs{ sqrt = 1, add = 8, mul = 9, | 1817 | for name,n in pairs{ sqrt = 1, add = 8, mul = 9, |
1468 | sub = 12, min = 13, div = 14, max = 15 } do | 1818 | sub = 12, min = 13, div = 14, max = 15 } do |
1469 | map_op[name.."ps_2"] = format("rmo:0F5%XrM", n) | 1819 | map_op[name.."ps_2"] = format("rmo:0F5%XrM", n) |
1470 | map_op[name.."ss_2"] = format("rro:F30F5%XrM|rx/od:", n) | 1820 | map_op[name.."ss_2"] = format("rro:F30F5%XrM|rx/od:", n) |
1471 | map_op[name.."pd_2"] = format("rmo:660F5%XrM", n) | 1821 | map_op[name.."pd_2"] = format("rmo:660F5%XrM", n) |
1472 | map_op[name.."sd_2"] = format("rro:F20F5%XrM|rx/oq:", n) | 1822 | map_op[name.."sd_2"] = format("rro:F20F5%XrM|rx/oq:", n) |
1823 | if n ~= 1 then | ||
1824 | map_op["v"..name.."ps_3"] = format("rrmoy:0FV5%XrM", n) | ||
1825 | map_op["v"..name.."ss_3"] = format("rrro:F30FV5%XrM|rrx/ood:", n) | ||
1826 | map_op["v"..name.."pd_3"] = format("rrmoy:660FV5%XrM", n) | ||
1827 | map_op["v"..name.."sd_3"] = format("rrro:F20FV5%XrM|rrx/ooq:", n) | ||
1828 | end | ||
1829 | end | ||
1830 | |||
1831 | -- SSE2 / AVX / AVX2 integer arithmetic ops (66 0F leaf). | ||
1832 | for name,n in pairs{ | ||
1833 | paddb = 0xFC, paddw = 0xFD, paddd = 0xFE, paddq = 0xD4, | ||
1834 | paddsb = 0xEC, paddsw = 0xED, packssdw = 0x6B, | ||
1835 | packsswb = 0x63, packuswb = 0x67, paddusb = 0xDC, | ||
1836 | paddusw = 0xDD, pand = 0xDB, pandn = 0xDF, pavgb = 0xE0, | ||
1837 | pavgw = 0xE3, pcmpeqb = 0x74, pcmpeqd = 0x76, | ||
1838 | pcmpeqw = 0x75, pcmpgtb = 0x64, pcmpgtd = 0x66, | ||
1839 | pcmpgtw = 0x65, pmaddwd = 0xF5, pmaxsw = 0xEE, | ||
1840 | pmaxub = 0xDE, pminsw = 0xEA, pminub = 0xDA, | ||
1841 | pmulhuw = 0xE4, pmulhw = 0xE5, pmullw = 0xD5, | ||
1842 | pmuludq = 0xF4, por = 0xEB, psadbw = 0xF6, psubb = 0xF8, | ||
1843 | psubw = 0xF9, psubd = 0xFA, psubq = 0xFB, psubsb = 0xE8, | ||
1844 | psubsw = 0xE9, psubusb = 0xD8, psubusw = 0xD9, | ||
1845 | punpckhbw = 0x68, punpckhwd = 0x69, punpckhdq = 0x6A, | ||
1846 | punpckhqdq = 0x6D, punpcklbw = 0x60, punpcklwd = 0x61, | ||
1847 | punpckldq = 0x62, punpcklqdq = 0x6C, pxor = 0xEF | ||
1848 | } do | ||
1849 | map_op[name.."_2"] = format("rmo:660F%02XrM", n) | ||
1850 | map_op["v"..name.."_3"] = format("rrmoy:660FV%02XrM", n) | ||
1473 | end | 1851 | end |
1474 | 1852 | ||
1475 | ------------------------------------------------------------------------------ | 1853 | ------------------------------------------------------------------------------ |
1476 | 1854 | ||
1855 | local map_vexarg = { u = false, v = 1, V = 2, w = 3 } | ||
1856 | |||
1477 | -- Process pattern string. | 1857 | -- Process pattern string. |
1478 | local function dopattern(pat, args, sz, op, needrex) | 1858 | local function dopattern(pat, args, sz, op, needrex) |
1479 | local digit, addin | 1859 | local digit, addin, vex |
1480 | local opcode = 0 | 1860 | local opcode = 0 |
1481 | local szov = sz | 1861 | local szov = sz |
1482 | local narg = 1 | 1862 | local narg = 1 |
1483 | local rex = 0 | 1863 | local rex = 0 |
1484 | 1864 | ||
1485 | -- Limit number of section buffer positions used by a single dasm_put(). | 1865 | -- Limit number of section buffer positions used by a single dasm_put(). |
1486 | -- A single opcode needs a maximum of 5 positions. | 1866 | -- A single opcode needs a maximum of 6 positions. |
1487 | if secpos+5 > maxsecpos then wflush() end | 1867 | if secpos+6 > maxsecpos then wflush() end |
1488 | 1868 | ||
1489 | -- Process each character. | 1869 | -- Process each character. |
1490 | for c in gmatch(pat.."|", ".") do | 1870 | for c in gmatch(pat.."|", ".") do |
@@ -1498,6 +1878,8 @@ local function dopattern(pat, args, sz, op, needrex) | |||
1498 | szov = nil | 1878 | szov = nil |
1499 | elseif c == "X" then -- Force REX.W. | 1879 | elseif c == "X" then -- Force REX.W. |
1500 | rex = 8 | 1880 | rex = 8 |
1881 | elseif c == "L" then -- Force VEX.L. | ||
1882 | vex.l = true | ||
1501 | elseif c == "r" then -- Merge 1st operand regno. into opcode. | 1883 | elseif c == "r" then -- Merge 1st operand regno. into opcode. |
1502 | addin = args[1]; opcode = opcode + (addin.reg % 8) | 1884 | addin = args[1]; opcode = opcode + (addin.reg % 8) |
1503 | if narg < 2 then narg = 2 end | 1885 | if narg < 2 then narg = 2 end |
@@ -1521,21 +1903,42 @@ local function dopattern(pat, args, sz, op, needrex) | |||
1521 | if t.xreg and t.xreg > 7 then rex = rex + 2 end | 1903 | if t.xreg and t.xreg > 7 then rex = rex + 2 end |
1522 | if s > 7 then rex = rex + 4 end | 1904 | if s > 7 then rex = rex + 4 end |
1523 | if needrex then rex = rex + 16 end | 1905 | if needrex then rex = rex + 16 end |
1524 | wputop(szov, opcode, rex); opcode = nil | 1906 | local psz, sk = wputop(szov, opcode, rex, vex, s < 0, t.vreg or t.vxreg) |
1907 | opcode = nil | ||
1525 | local imark = sub(pat, -1) -- Force a mark (ugly). | 1908 | local imark = sub(pat, -1) -- Force a mark (ugly). |
1526 | -- Put ModRM/SIB with regno/last digit as spare. | 1909 | -- Put ModRM/SIB with regno/last digit as spare. |
1527 | wputmrmsib(t, imark, s, addin and addin.vreg) | 1910 | wputmrmsib(t, imark, s, addin and addin.vreg, psz, sk) |
1528 | addin = nil | 1911 | addin = nil |
1912 | elseif map_vexarg[c] ~= nil then -- Encode using VEX prefix | ||
1913 | local b = band(opcode, 255); opcode = shr(opcode, 8) | ||
1914 | local m = 1 | ||
1915 | if b == 0x38 then m = 2 | ||
1916 | elseif b == 0x3a then m = 3 end | ||
1917 | if m ~= 1 then b = band(opcode, 255); opcode = shr(opcode, 8) end | ||
1918 | if b ~= 0x0f then | ||
1919 | werror("expected `0F', `0F38', or `0F3A' to precede `"..c.. | ||
1920 | "' in pattern `"..pat.."' for `"..op.."'") | ||
1921 | end | ||
1922 | local v = map_vexarg[c] | ||
1923 | if v then v = remove(args, v) end | ||
1924 | b = band(opcode, 255) | ||
1925 | local p = 0 | ||
1926 | if b == 0x66 then p = 1 | ||
1927 | elseif b == 0xf3 then p = 2 | ||
1928 | elseif b == 0xf2 then p = 3 end | ||
1929 | if p ~= 0 then opcode = shr(opcode, 8) end | ||
1930 | if opcode ~= 0 then wputop(nil, opcode, 0); opcode = 0 end | ||
1931 | vex = { m = m, p = p, v = v } | ||
1529 | else | 1932 | else |
1530 | if opcode then -- Flush opcode. | 1933 | if opcode then -- Flush opcode. |
1531 | if szov == "q" and rex == 0 then rex = rex + 8 end | 1934 | if szov == "q" and rex == 0 then rex = rex + 8 end |
1532 | if needrex then rex = rex + 16 end | 1935 | if needrex then rex = rex + 16 end |
1533 | if addin and addin.reg == -1 then | 1936 | if addin and addin.reg == -1 then |
1534 | wputop(szov, opcode - 7, rex) | 1937 | local psz, sk = wputop(szov, opcode - 7, rex, vex, true) |
1535 | waction("VREG", addin.vreg); wputxb(0) | 1938 | wvreg("opcode", addin.vreg, psz, sk) |
1536 | else | 1939 | else |
1537 | if addin and addin.reg > 7 then rex = rex + 1 end | 1940 | if addin and addin.reg > 7 then rex = rex + 1 end |
1538 | wputop(szov, opcode, rex) | 1941 | wputop(szov, opcode, rex, vex) |
1539 | end | 1942 | end |
1540 | opcode = nil | 1943 | opcode = nil |
1541 | end | 1944 | end |
@@ -1572,6 +1975,14 @@ local function dopattern(pat, args, sz, op, needrex) | |||
1572 | else | 1975 | else |
1573 | wputlabel("REL_", imm, 2) | 1976 | wputlabel("REL_", imm, 2) |
1574 | end | 1977 | end |
1978 | elseif c == "s" then | ||
1979 | local reg = a.reg | ||
1980 | if reg < 0 then | ||
1981 | wputb(0) | ||
1982 | wvreg("imm.hi", a.vreg) | ||
1983 | else | ||
1984 | wputb(shl(reg, 4)) | ||
1985 | end | ||
1575 | else | 1986 | else |
1576 | werror("bad char `"..c.."' in pattern `"..pat.."' for `"..op.."'") | 1987 | werror("bad char `"..c.."' in pattern `"..pat.."' for `"..op.."'") |
1577 | end | 1988 | end |
@@ -1648,11 +2059,14 @@ map_op[".template__"] = function(params, template, nparams) | |||
1648 | if pat == "" then pat = lastpat else lastpat = pat end | 2059 | if pat == "" then pat = lastpat else lastpat = pat end |
1649 | if matchtm(tm, args) then | 2060 | if matchtm(tm, args) then |
1650 | local prefix = sub(szm, 1, 1) | 2061 | local prefix = sub(szm, 1, 1) |
1651 | if prefix == "/" then -- Match both operand sizes. | 2062 | if prefix == "/" then -- Exactly match leading operand sizes. |
1652 | if args[1].opsize == sub(szm, 2, 2) and | 2063 | for i = #szm,1,-1 do |
1653 | args[2].opsize == sub(szm, 3, 3) then | 2064 | if i == 1 then |
1654 | dopattern(pat, args, sz, params.op, needrex) -- Process pattern. | 2065 | dopattern(pat, args, sz, params.op, needrex) -- Process pattern. |
1655 | return | 2066 | return |
2067 | elseif args[i-1].opsize ~= sub(szm, i, i) then | ||
2068 | break | ||
2069 | end | ||
1656 | end | 2070 | end |
1657 | else -- Match common operand size. | 2071 | else -- Match common operand size. |
1658 | local szp = sz | 2072 | local szp = sz |
@@ -1717,8 +2131,8 @@ if x64 then | |||
1717 | rex = a.reg > 7 and 9 or 8 | 2131 | rex = a.reg > 7 and 9 or 8 |
1718 | end | 2132 | end |
1719 | end | 2133 | end |
1720 | wputop(sz, opcode, rex) | 2134 | local psz, sk = wputop(sz, opcode, rex, nil, vreg) |
1721 | if vreg then waction("VREG", vreg); wputxb(0) end | 2135 | wvreg("opcode", vreg, psz, sk) |
1722 | waction("IMM_D", format("(unsigned int)(%s)", op64)) | 2136 | waction("IMM_D", format("(unsigned int)(%s)", op64)) |
1723 | waction("IMM_D", format("(unsigned int)((%s)>>32)", op64)) | 2137 | waction("IMM_D", format("(unsigned int)((%s)>>32)", op64)) |
1724 | end | 2138 | end |