From 04b60707d7d117da22b40736a353e2a10179108a Mon Sep 17 00:00:00 2001
From: Mike Pall <mike>
Date: Sun, 20 Nov 2016 22:16:08 +0100
Subject: ARM64: Add JIT compiler backend.

Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
Sponsored by Cisco Systems, Inc.
---
 src/jit/dis_arm64.lua | 1215 ++++++++++++++++++++++++++++++++
 src/lj_arch.h         |    1 -
 src/lj_asm.c          |    4 +
 src/lj_asm_arm64.h    | 1823 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/lj_ccall.c        |    2 +-
 src/lj_dispatch.h     |    1 +
 src/lj_emit_arm64.h   |  397 +++++++++++
 src/lj_gdbjit.c       |   12 +
 src/lj_target.h       |    4 +-
 src/lj_target_arm64.h |  221 +++++-
 src/vm_arm64.dasc     |  227 +++++-
 11 files changed, 3885 insertions(+), 22 deletions(-)
 create mode 100644 src/jit/dis_arm64.lua
 create mode 100644 src/lj_asm_arm64.h
 create mode 100644 src/lj_emit_arm64.h

diff --git a/src/jit/dis_arm64.lua b/src/jit/dis_arm64.lua
new file mode 100644
index 00000000..909b33bc
--- /dev/null
+++ b/src/jit/dis_arm64.lua
@@ -0,0 +1,1215 @@
+----------------------------------------------------------------------------
+-- LuaJIT ARM64 disassembler module.
+--
+-- Copyright (C) 2005-2016 Mike Pall. All rights reserved.
+-- Released under the MIT license. See Copyright Notice in luajit.h
+--
+-- Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
+-- Sponsored by Cisco Systems, Inc.
+----------------------------------------------------------------------------
+-- This is a helper module used by the LuaJIT machine code dumper module.
+--
+-- It disassembles most user-mode AArch64 instructions.
+-- NYI: Advanced SIMD and VFP instructions.
+------------------------------------------------------------------------------
+
+local type, tonumber = type, tonumber
+local sub, byte, format = string.sub, string.byte, string.format
+local match, gmatch, gsub = string.match, string.gmatch, string.gsub
+local rep = string.rep
+local concat = table.concat
+local bit = require("bit")
+local band, bor, bxor, tohex = bit.band, bit.bor, bit.bxor, bit.tohex
+local lshift, rshift, arshift = bit.lshift, bit.rshift, bit.arshift
+local ror = bit.ror
+
+------------------------------------------------------------------------------
+-- Opcode maps
+------------------------------------------------------------------------------
+
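+-- Each map node indexes ((op >> shift) & mask) to select either a nested
+-- node or a leaf string "name|altnames" followed by one-letter operand
+-- format codes (D/N/M/A = register fields, B = branch target, g = size
+-- from bit 31, ...), which disass_ins() below interprets. A minimal
+-- sketch of the table walk (illustrative only, error handling omitted):
+--
+--   local op = 0x8b020020  -- add x0, x1, x2
+--   local node = map_init[band(rshift(op, 25), 15)]
+--   while type(node) ~= "string" do
+--     node = node[band(rshift(op, node.shift), node.mask)]
+--   end  -- node == "addDNMSg"
+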
+local map_adr = { -- PC-relative addressing.
+  shift = 31, mask = 1,
+  [0] = "adrDBx", "adrpDBx"
+}
+
+local map_addsubi = { -- Add/subtract immediate.
+  shift = 29, mask = 3,
+  [0] = "add|movDNIg", "adds|cmnD0NIg", "subDNIg", "subs|cmpD0NIg",
+}
+
+local map_logi = { -- Logical immediate.
+  shift = 31, mask = 1,
+  [0] = {
+    shift = 22, mask = 1,
+    [0] = {
+      shift = 29, mask = 3,
+      [0] = "andDNig", "orr|movDN0ig", "eorDNig", "ands|tstD0Nig"
+    },
+    false -- unallocated
+  },
+  {
+    shift = 29, mask = 3,
+    [0] = "andDNig", "orr|movDN0ig", "eorDNig", "ands|tstD0Nig"
+  }
+}
+
+local map_movwi = { -- Move wide immediate.
+  shift = 31, mask = 1,
+  [0] = {
+    shift = 22, mask = 1,
+    [0] = {
+      shift = 29, mask = 3,
+      [0] = "movnDWRg", false, "movz|movDYRg", "movkDWRg"
+    }, false -- unallocated
+  },
+  {
+    shift = 29, mask = 3,
+    [0] = "movnDWRg", false, "movz|movDYRg", "movkDWRg"
+  },
+}
+
+local map_bitf = { -- Bitfield.
+  shift = 31, mask = 1,
+  [0] = {
+    shift = 22, mask = 1,
+    [0] = {
+      shift = 29, mask = 3,
+      [0] = "sbfm|sbfiz|sbfx|asr|sxtw|sxth|sxtbDN12w",
+      "bfm|bfi|bfxilDN13w",
+      "ubfm|ubfiz|ubfx|lsr|lsl|uxth|uxtbDN12w"
+    }
+  },
+  {
+    shift = 22, mask = 1,
+    {
+      shift = 29, mask = 3,
+      [0] = "sbfm|sbfiz|sbfx|asr|sxtw|sxth|sxtbDN12x",
+      "bfm|bfi|bfxilDN13x",
+      "ubfm|ubfiz|ubfx|lsr|lsl|uxth|uxtbDN12x"
+    }
+  }
+}
+
+local map_datai = { -- Data processing - immediate.
+  shift = 23, mask = 7,
+  [0] = map_adr, map_adr, map_addsubi, false,
+  map_logi, map_movwi, map_bitf,
+  {
+    shift = 15, mask = 0x1c0c1,
+    [0] = "extr|rorDNM4w", [0x10080] = "extr|rorDNM4x",
+    [0x10081] = "extr|rorDNM4x"
+  }
+}
+
+local map_logsr = { -- Logical, shifted register.
+  shift = 31, mask = 1,
+  [0] = {
+    shift = 15, mask = 1,
+    [0] = {
+      shift = 29, mask = 3,
+      [0] = {
+	shift = 21, mask = 7,
+	[0] = "andDNMSg", "bicDNMSg", "andDNMSg", "bicDNMSg",
+	"andDNMSg", "bicDNMSg", "andDNMg", "bicDNMg"
+      },
+      {
+	shift = 21, mask = 7,
+	[0] ="orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0MSg", "orn|mvnDN0MSg",
+	     "orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0Mg", "orn|mvnDN0Mg"
+      },
+      {
+	shift = 21, mask = 7,
+	[0] = "eorDNMSg", "eonDNMSg", "eorDNMSg", "eonDNMSg",
+	"eorDNMSg", "eonDNMSg", "eorDNMg", "eonDNMg"
+      },
+      {
+	shift = 21, mask = 7,
+	[0] = "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMSg", "bicsDNMSg",
+	"ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMg", "bicsDNMg"
+      }
+    },
+    false -- unallocated
+  },
+  {
+    shift = 29, mask = 3,
+    [0] = {
+      shift = 21, mask = 7,
+      [0] = "andDNMSg", "bicDNMSg", "andDNMSg", "bicDNMSg",
+      "andDNMSg", "bicDNMSg", "andDNMg", "bicDNMg"
+    },
+    {
+      shift = 21, mask = 7,
+      [0] = "orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0MSg", "orn|mvnDN0MSg",
+      "orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0Mg", "orn|mvnDN0Mg"
+    },
+    {
+      shift = 21, mask = 7,
+      [0] = "eorDNMSg", "eonDNMSg", "eorDNMSg", "eonDNMSg",
+      "eorDNMSg", "eonDNMSg", "eorDNMg", "eonDNMg"
+    },
+    {
+      shift = 21, mask = 7,
+      [0] = "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMSg", "bicsDNMSg",
+      "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMg", "bicsDNMg"
+    }
+  }
+}
+
+local map_assh = {
+  shift = 31, mask = 1,
+  [0] = {
+    shift = 15, mask = 1,
+    [0] = {
+      shift = 29, mask = 3,
+      [0] = {
+	shift = 22, mask = 3,
+	[0] = "addDNMSg", "addDNMSg", "addDNMSg", "addDNMg"
+      },
+      {
+	shift = 22, mask = 3,
+	[0] = "adds|cmnD0NMSg", "adds|cmnD0NMSg",
+	      "adds|cmnD0NMSg", "adds|cmnD0NMg"
+      },
+      {
+	shift = 22, mask = 3,
+	[0] = "sub|negDN0MSg", "sub|negDN0MSg", "sub|negDN0MSg", "sub|negDN0Mg"
+      },
+      {
+	shift = 22, mask = 3,
+	[0] = "subs|cmp|negsD0N0MzSg", "subs|cmp|negsD0N0MzSg",
+	      "subs|cmp|negsD0N0MzSg", "subs|cmp|negsD0N0Mzg"
+      },
+    },
+    false -- unallocated
+  },
+  {
+    shift = 29, mask = 3,
+    [0] = {
+      shift = 22, mask = 3,
+      [0] = "addDNMSg", "addDNMSg", "addDNMSg", "addDNMg"
+    },
+    {
+      shift = 22, mask = 3,
+      [0] = "adds|cmnD0NMSg", "adds|cmnD0NMSg", "adds|cmnD0NMSg",
+	    "adds|cmnD0NMg"
+    },
+    {
+      shift = 22, mask = 3,
+      [0] = "sub|negDN0MSg", "sub|negDN0MSg", "sub|negDN0MSg", "sub|negDN0Mg"
+    },
+    {
+      shift = 22, mask = 3,
+      [0] = "subs|cmp|negsD0N0MzSg", "subs|cmp|negsD0N0MzSg",
+	    "subs|cmp|negsD0N0MzSg", "subs|cmp|negsD0N0Mzg"
+    }
+  }
+}
+
+local map_addsubsh = { -- Add/subtract, shifted register.
+  shift = 22, mask = 3,
+  [0] = map_assh, map_assh, map_assh
+}
+
+local map_addsubex = { -- Add/subtract, extended register.
+  shift = 22, mask = 3,
+  [0] = {
+    shift = 29, mask = 3,
+    [0] = "addDNMXg", "adds|cmnD0NMXg", "subDNMXg", "subs|cmpD0NMzXg",
+  }
+}
+
+local map_addsubc = { -- Add/subtract, with carry.
+  shift = 10, mask = 63,
+  [0] = {
+    shift = 29, mask = 3,
+    [0] = "adcDNMg", "adcsDNMg", "sbc|ngcDN0Mg", "sbcs|ngcsDN0Mg",
+  }
+}
+
+local map_ccomp = {
+  shift = 4, mask = 1,
+  [0] = {
+    shift = 10, mask = 3,
+    [0] = { -- Conditional compare register.
+      shift = 29, mask = 3,
+      "ccmnNMVCg", false, "ccmpNMVCg",
+    },
+    [2] = {  -- Conditional compare immediate.
+      shift = 29, mask = 3,
+      "ccmnN5VCg", false, "ccmpN5VCg",
+    }
+  }
+}
+
+local map_csel = { -- Conditional select.
+  shift = 11, mask = 1,
+  [0] = {
+    shift = 10, mask = 1,
+    [0] = {
+      shift = 29, mask = 3,
+      [0] = "cselDNMzCg", false, "csinv|cinv|csetmDNMcg", false,
+    },
+    {
+      shift = 29, mask = 3,
+      [0] = "csinc|cinc|csetDNMcg", false, "csneg|cnegDNMcg", false,
+    }
+  }
+}
+
+local map_data1s = { -- Data processing, 1 source.
+  shift = 29, mask = 1,
+  [0] = {
+    shift = 31, mask = 1,
+    [0] = {
+      shift = 10, mask = 0x7ff,
+      [0] = "rbitDNg", "rev16DNg", "revDNw", false, "clzDNg", "clsDNg"
+    },
+    {
+      shift = 10, mask = 0x7ff,
+      [0] = "rbitDNg", "rev16DNg", "rev32DNx", "revDNx", "clzDNg", "clsDNg"
+    }
+  }
+}
+
+local map_data2s = { -- Data processing, 2 sources.
+  shift = 29, mask = 1,
+  [0] = {
+    shift = 10, mask = 63,
+    false, "udivDNMg", "sdivDNMg", false, false, false, false, "lslDNMg",
+    "lsrDNMg", "asrDNMg", "rorDNMg"
+  }
+}
+
+local map_data3s = { -- Data processing, 3 sources.
+  shift = 29, mask = 7,
+  [0] = {
+    shift = 21, mask = 7,
+    [0] = {
+      shift = 15, mask = 1,
+      [0] = "madd|mulDNMA0g", "msub|mnegDNMA0g"
+    }
+  }, false, false, false,
+  {
+    shift = 15, mask = 1,
+    [0] = {
+      shift = 21, mask = 7,
+      [0] = "madd|mulDNMA0g", "smaddl|smullDxNMwA0x", "smulhDNMx", false,
+      false, "umaddl|umullDxNMwA0x", "umulhDNMx"
+    },
+    {
+      shift = 21, mask = 7,
+      [0] = "msub|mnegDNMA0g", "smsubl|smneglDxNMwA0x", false, false,
+      false, "umsubl|umneglDxNMwA0x"
+    }
+  }
+}
+
+local map_datar = { -- Data processing, register.
+  shift = 28, mask = 1,
+  [0] = {
+    shift = 24, mask = 1,
+    [0] = map_logsr,
+    {
+      shift = 21, mask = 1,
+      [0] = map_addsubsh, map_addsubex
+    }
+  },
+  {
+    shift = 21, mask = 15,
+    [0] = map_addsubc, false, map_ccomp, false, map_csel, false,
+    {
+      shift = 30, mask = 1,
+      [0] = map_data2s, map_data1s
+    },
+    false, map_data3s, map_data3s, map_data3s, map_data3s, map_data3s,
+    map_data3s, map_data3s, map_data3s
+  }
+}
+
+local map_lrl = { -- Load register, literal.
+  shift = 26, mask = 1,
+  [0] = {
+    shift = 30, mask = 3,
+    [0] = "ldrDwB", "ldrDxB", "ldrswDxB"
+  },
+  {
+    shift = 30, mask = 3,
+    [0] = "ldrDsB", "ldrDdB"
+  }
+}
+
+local map_lsriind = { -- Load/store register, immediate pre/post-indexed.
+  shift = 30, mask = 3,
+  [0] = {
+    shift = 26, mask = 1,
+    [0] = {
+      shift = 22, mask = 3,
+      [0] = "strbDwzL", "ldrbDwzL", "ldrsbDxzL", "ldrsbDwzL"
+    }
+  },
+  {
+    shift = 26, mask = 1,
+    [0] = {
+      shift = 22, mask = 3,
+      [0] = "strhDwzL", "ldrhDwzL", "ldrshDxzL", "ldrshDwzL"
+    }
+  },
+  {
+    shift = 26, mask = 1,
+    [0] = {
+      shift = 22, mask = 3,
+      [0] = "strDwzL", "ldrDwzL", "ldrswDxzL"
+    },
+    {
+      shift = 22, mask = 3,
+      [0] = "strDszL", "ldrDszL"
+    }
+  },
+  {
+    shift = 26, mask = 1,
+    [0] = {
+      shift = 22, mask = 3,
+      [0] = "strDxzL", "ldrDxzL"
+    },
+    {
+      shift = 22, mask = 3,
+      [0] = "strDdzL", "ldrDdzL"
+    }
+  }
+}
+
+local map_lsriro = {
+  shift = 21, mask = 1,
+  [0] = {  -- Load/store register immediate.
+    shift = 10, mask = 3,
+    [0] = { -- Unscaled immediate.
+      shift = 26, mask = 1,
+      [0] = {
+	shift = 30, mask = 3,
+	[0] = {
+	  shift = 22, mask = 3,
+	  [0] = "sturbDwK", "ldurbDwK"
+	},
+	{
+	  shift = 22, mask = 3,
+	  [0] = "sturhDwK", "ldurhDwK"
+	},
+	{
+	  shift = 22, mask = 3,
+	  [0] = "sturDwK", "ldurDwK"
+	},
+	{
+	  shift = 22, mask = 3,
+	  [0] = "sturDxK", "ldurDxK"
+	}
+      }
+    }, map_lsriind, false, map_lsriind
+  },
+  {  -- Load/store register, register offset.
+    shift = 10, mask = 3,
+    [2] = {
+      shift = 26, mask = 1,
+      [0] = {
+	shift = 30, mask = 3,
+	[1] = {
+	  shift = 22, mask = 3,
+	  [0] = "strhDwO", "ldrhDwO", "ldrshDwO", "ldrshDxO"
+	},
+	[2] = {
+	  shift = 22, mask = 3,
+	  [0] = "strDwO", "ldrDwO", "ldrswDxO"
+	},
+	[3] = {
+	  shift = 22, mask = 3,
+	  [0] = "strDxO", "ldrDxO"
+	}
+      },
+      {
+	shift = 30, mask = 3,
+	[2] = {
+	  shift = 22, mask = 3,
+	  [0] = "strDsO", "ldrDsO"
+	},
+	[3] = {
+	  shift = 22, mask = 3,
+	  [0] = "strDdO", "ldrDdO"
+	}
+      }
+    }
+  }
+}
+
+local map_lsp = { -- Load/store register pair, offset.
+  shift = 22, mask = 1,
+  [0] = {
+    shift = 30, mask = 3,
+    [0] = {
+      shift = 26, mask = 1,
+      [0] = "stpDzAzwP", "stpDzAzsP",
+    },
+    {
+      shift = 26, mask = 1,
+      "stpDzAzdP"
+    },
+    {
+      shift = 26, mask = 1,
+      [0] = "stpDzAzxP"
+    }
+  },
+  {
+    shift = 30, mask = 3,
+    [0] = {
+      shift = 26, mask = 1,
+      [0] = "ldpDzAzwP", "ldpDzAzsP",
+    },
+    {
+      shift = 26, mask = 1,
+      [0] = "ldpswDAxP", "ldpDzAzdP"
+    },
+    {
+      shift = 26, mask = 1,
+      [0] = "ldpDzAzxP"
+    }
+  }
+}
+
+local map_ls = { -- Loads and stores.
+  shift = 24, mask = 0x31,
+  [0x10] = map_lrl, [0x30] = map_lsriro,
+  [0x20] = {
+    shift = 23, mask = 3,
+    map_lsp, map_lsp, map_lsp
+  },
+  [0x21] = {
+    shift = 23, mask = 3,
+    map_lsp, map_lsp, map_lsp
+  },
+  [0x31] = {
+    shift = 26, mask = 1,
+    [0] = {
+      shift = 30, mask = 3,
+      [0] = {
+	shift = 22, mask = 3,
+	[0] = "strbDwzU", "ldrbDwzU"
+      },
+      {
+	shift = 22, mask = 3,
+	[0] = "strhDwzU", "ldrhDwzU"
+      },
+      {
+	shift = 22, mask = 3,
+	[0] = "strDwzU", "ldrDwzU"
+      },
+      {
+	shift = 22, mask = 3,
+	[0] = "strDxzU", "ldrDxzU"
+      }
+    },
+    {
+      shift = 30, mask = 3,
+      [2] = {
+	shift = 22, mask = 3,
+	[0] = "strDszU", "ldrDszU"
+      },
+      [3] = {
+	shift = 22, mask = 3,
+	[0] = "strDdzU", "ldrDdzU"
+      }
+    }
+  },
+}
+
+local map_datafp = { -- Data processing, SIMD and FP.
+  shift = 28, mask = 7,
+  { -- 001
+    shift = 24, mask = 1,
+    [0] = {
+      shift = 21, mask = 1,
+      {
+	shift = 10, mask = 3,
+	[0] = {
+	  shift = 12, mask = 1,
+	  [0] = {
+	    shift = 13, mask = 1,
+	    [0] = {
+	      shift = 14, mask = 1,
+	      [0] = {
+		shift = 15, mask = 1,
+		[0] = { -- FP/int conversion.
+		  shift = 31, mask = 1,
+		  [0] = {
+		    shift = 16, mask = 0xff,
+		    [0x20] = "fcvtnsDwNs", [0x21] = "fcvtnuDwNs",
+		    [0x22] = "scvtfDsNw", [0x23] = "ucvtfDsNw",
+		    [0x24] = "fcvtasDwNs", [0x25] = "fcvtauDwNs",
+		    [0x26] = "fmovDwNs", [0x27] = "fmovDsNw",
+		    [0x28] = "fcvtpsDwNs", [0x29] = "fcvtpuDwNs",
+		    [0x30] = "fcvtmsDwNs", [0x31] = "fcvtmuDwNs",
+		    [0x38] = "fcvtzsDwNs", [0x39] = "fcvtzuDwNs",
+		    [0x60] = "fcvtnsDwNd", [0x61] = "fcvtnuDwNd",
+		    [0x62] = "scvtfDdNw", [0x63] = "ucvtfDdNw",
+		    [0x64] = "fcvtasDwNd", [0x65] = "fcvtauDwNd",
+		    [0x68] = "fcvtpsDwNd", [0x69] = "fcvtpuDwNd",
+		    [0x70] = "fcvtmsDwNd", [0x71] = "fcvtmuDwNd",
+		    [0x78] = "fcvtzsDwNd", [0x79] = "fcvtzuDwNd"
+		  },
+		  {
+		    shift = 16, mask = 0xff,
+		    [0x20] = "fcvtnsDxNs", [0x21] = "fcvtnuDxNs",
+		    [0x22] = "scvtfDsNx", [0x23] = "ucvtfDsNx",
+		    [0x24] = "fcvtasDxNs", [0x25] = "fcvtauDxNs",
+		    [0x28] = "fcvtpsDxNs", [0x29] = "fcvtpuDxNs",
+		    [0x30] = "fcvtmsDxNs", [0x31] = "fcvtmuDxNs",
+		    [0x38] = "fcvtzsDxNs", [0x39] = "fcvtzuDxNs",
+		    [0x60] = "fcvtnsDxNd", [0x61] = "fcvtnuDxNd",
+		    [0x62] = "scvtfDdNx", [0x63] = "ucvtfDdNx",
+		    [0x64] = "fcvtasDxNd", [0x65] = "fcvtauDxNd",
+		    [0x66] = "fmovDxNd", [0x67] = "fmovDdNx",
+		    [0x68] = "fcvtpsDxNd", [0x69] = "fcvtpuDxNd",
+		    [0x70] = "fcvtmsDxNd", [0x71] = "fcvtmuDxNd",
+		    [0x78] = "fcvtzsDxNd", [0x79] = "fcvtzuDxNd"
+		  }
+		}
+	      },
+	      { -- FP data-processing, 1 source.
+		shift = 31, mask = 1,
+		[0] = {
+		  shift = 22, mask = 3,
+		  [0] = {
+		    shift = 15, mask = 63,
+		    [0] = "fmovDNf", "fabsDNf", "fnegDNf",
+		    "fsqrtDNf", false, "fcvtDdNs", false, false,
+		    "frintnDNf", "frintpDNf", "frintmDNf", "frintzDNf",
+		    "frintaDNf", false, "frintxDNf", "frintiDNf",
+		  },
+		  {
+		    shift = 15, mask = 63,
+		    [0] = "fmovDNf", "fabsDNf", "fnegDNf",
+		    "fsqrtDNf", "fcvtDsNd", false, false, false,
+		    "frintnDNf", "frintpDNf", "frintmDNf", "frintzDNf",
+		    "frintaDNf", false, "frintxDNf", "frintiDNf",
+		  }
+		}
+	      }
+	    },
+	    { -- FP compare.
+	      shift = 31, mask = 1,
+	      [0] = {
+		shift = 14, mask = 3,
+		[0] = {
+		  shift = 23, mask = 1,
+		  [0] = {
+		    shift = 0, mask = 31,
+		    [0] = "fcmpNMf", [8] = "fcmpNZf",
+		    [16] = "fcmpeNMf", [24] = "fcmpeNZf",
+		  }
+		}
+	      }
+	    }
+	  },
+	  { -- FP immediate.
+	    shift = 31, mask = 1,
+	    [0] = {
+	      shift = 5, mask = 31,
+	      [0] = {
+		shift = 23, mask = 1,
+		[0] = "fmovDFf"
+	      }
+	    }
+	  }
+	},
+	{ -- FP conditional compare.
+	  shift = 31, mask = 1,
+	  [0] = {
+	    shift = 23, mask = 1,
+	    [0] = {
+	      shift = 4, mask = 1,
+	      [0] = "fccmpNMVCf", "fccmpeNMVCf"
+	    }
+	  }
+	},
+	{ -- FP data-processing, 2 sources.
+	  shift = 31, mask = 1,
+	  [0] = {
+	    shift = 23, mask = 1,
+	    [0] = {
+	      shift = 12, mask = 15,
+	      [0] = "fmulDNMf", "fdivDNMf", "faddDNMf", "fsubDNMf",
+	      "fmaxDNMf", "fminDNMf", "fmaxnmDNMf", "fminnmDNMf",
+	      "fnmulDNMf"
+	    }
+	  }
+	},
+	{ -- FP conditional select.
+	  shift = 31, mask = 1,
+	  [0] = {
+	    shift = 23, mask = 1,
+	    [0] = "fcselDNMCf"
+	  }
+	}
+      }
+    },
+    { -- FP data-processing, 3 sources.
+      shift = 31, mask = 1,
+      [0] = {
+	shift = 15, mask = 1,
+	[0] = {
+	  shift = 21, mask = 5,
+	  [0] = "fmaddDNMAf", "fnmaddDNMAf"
+	},
+	{
+	  shift = 21, mask = 5,
+	  [0] = "fmsubDNMAf", "fnmsubDNMAf"
+	}
+      }
+    }
+  }
+}
+
+local map_br = { -- Branches, exception generating and system instructions.
+  shift = 29, mask = 7,
+  [0] = "bB",
+  { -- Compare & branch and test & branch, immediate.
+    shift = 24, mask = 3,
+    [0] = "cbzDBg", "cbnzDBg", "tbzDTBw", "tbnzDTBw"
+  },
+  { -- Conditional branch, immediate.
+    shift = 24, mask = 3,
+    [0] = {
+      shift = 4, mask = 1,
+      [0] = {
+	shift = 0, mask = 15,
+	[0] = "beqB", "bneB", "bhsB", "bloB", "bmiB", "bplB", "bvsB", "bvcB",
+	"bhiB", "blsB", "bgeB", "bltB", "bgtB", "bleB", "balB"
+      }
+    }
+  }, false, "blB",
+  { -- Compare & branch and test & branch, immediate.
+    shift = 24, mask = 3,
+    [0] = "cbzDBg", "cbnzDBg", "tbzDTBx", "tbnzDTBx"
+  },
+  {
+    shift = 24, mask = 3,
+    [0] = { -- Exception generation.
+      shift = 0, mask = 0xe0001f,
+      [0x200000] = "brkW"
+    },
+    { -- System instructions.
+      shift = 0, mask = 0x3fffff,
+      [0x03201f] = "nop"
+    },
+    { -- Unconditional branch, register.
+      shift = 0, mask = 0xfffc1f,
+      [0x1f0000] = "brNx", [0x3f0000] = "blrNx",
+      [0x5f0000] = "retNx"
+    },
+  }
+}
+
+local map_init = {
+  shift = 25, mask = 15,
+  [0] = false, false, false, false, map_ls, map_datar, map_ls, map_datafp,
+  map_datai, map_datai, map_br, map_br, map_ls, map_datar, map_ls, map_datafp
+}
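+-- The top-level dispatch keys on instruction bits 28-25, mirroring the
+-- A64 major encoding groups: x1x0 loads/stores, x101 data processing
+-- (register), 100x data processing (immediate), 101x branches and system,
+-- x111 scalar FP (Advanced SIMD is NYI, see the header above).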
+
+------------------------------------------------------------------------------
+
+local map_regs = { x = {}, w = {}, d = {}, s = {} }
+
+for i=0,30 do
+  map_regs.x[i] = "x"..i
+  map_regs.w[i] = "w"..i
+  map_regs.d[i] = "d"..i
+  map_regs.s[i] = "s"..i
+end
+map_regs.x[31] = "sp"
+map_regs.w[31] = "wsp"
+map_regs.d[31] = "d31"
+map_regs.s[31] = "s31"
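+-- Register number 31 decodes as sp/wsp by default; the "z" format code in
+-- the patterns rewrites it to xzr/wzr for operands that read the zero
+-- register (handled in disass_ins() below).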
+
+local map_cond = {
+  [0] = "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
+  "hi", "ls", "ge", "lt", "gt", "le", "al",
+}
+
+local map_shift = { [0] = "lsl", "lsr", "asr", }
+
+local map_extend = {
+  [0] = "uxtb", "uxth", "uxtw", "uxtx", "sxtb", "sxth", "sxtw", "sxtx",
+}
+
+------------------------------------------------------------------------------
+
+-- Output a nicely formatted line with an opcode and operands.
+local function putop(ctx, text, operands)
+  local pos = ctx.pos
+  local extra = ""
+  if ctx.rel then
+    local sym = ctx.symtab[ctx.rel]
+    if sym then
+      extra = "\t->"..sym
+    end
+  end
+  if ctx.hexdump > 0 then
+    ctx.out(format("%08x  %s  %-5s %s%s\n",
+      ctx.addr+pos, tohex(ctx.op), text, concat(operands, ", "), extra))
+  else
+    ctx.out(format("%08x  %-5s %s%s\n",
+      ctx.addr+pos, text, concat(operands, ", "), extra))
+  end
+  ctx.pos = pos + 4
+end
+
+-- Fallback for unknown opcodes.
+local function unknown(ctx)
+  return putop(ctx, ".long", { "0x"..tohex(ctx.op) })
+end
+
+local function match_reg(p, pat, regnum)
+  return map_regs[match(pat, p.."%w-([xwds])")][regnum]
+end
+
+local function fmt_hex32(x)
+  if x < 0 then
+    return tohex(x)
+  else
+    return format("%x", x)
+  end
+end
+
+local imm13_rep = { 0x55555555, 0x11111111, 0x01010101, 0x00010001, 0x00000001 }
+
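+-- ARM64 "bitmask" logical immediates: imms/immr describe a run of ones in
+-- a 2/4/8/16/32/64-bit element, rotated right by immr and replicated to
+-- register width. E.g. immr=0, imms=7 in a 32-bit element yields eight
+-- ones, the #0xff of "and w0, w1, #0xff".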
+local function decode_imm13(op)
+  local imms = band(rshift(op, 10), 63)
+  local immr = band(rshift(op, 16), 63)
+  if band(op, 0x00400000) == 0 then
+    local len = 5
+    if imms >= 56 then
+      if imms >= 60 then len = 1 else len = 2 end
+    elseif imms >= 48 then len = 3 elseif imms >= 32 then len = 4 end
+    local l = lshift(1, len)-1
+    local s = band(imms, l)
+    local r = band(immr, l)
+    local imm = ror(rshift(-1, 31-s), r)
+    if len ~= 5 then imm = band(imm, lshift(1, l)-1) + rshift(imm, 31-l) end
+    imm = imm * imm13_rep[len]
+    local ix = fmt_hex32(imm)
+    if rshift(op, 31) ~= 0 then
+      return ix..tohex(imm)
+    else
+      return ix
+    end
+  else
+    local lo, hi = -1, 0
+    if imms < 32 then lo = rshift(-1, 31-imms) else hi = rshift(-1, 63-imms) end
+    if immr ~= 0 then
+      lo, hi = ror(lo, immr), ror(hi, immr)
+      local x = immr == 32 and 0 or band(bxor(lo, hi), lshift(-1, 32-immr))
+      lo, hi = bxor(lo, x), bxor(hi, x)
+      if immr >= 32 then lo, hi = hi, lo end
+    end
+    if hi ~= 0 then
+      return fmt_hex32(hi)..tohex(lo)
+    else
+      return fmt_hex32(lo)
+    end
+  end
+end
+
+local function parse_immpc(op, name)
+  if name == "b" or name == "bl" then
+    return arshift(lshift(op, 6), 4)
+  elseif name == "adr" or name == "adrp" then
+    local immlo = band(rshift(op, 29), 3)
+    local immhi = lshift(arshift(lshift(op, 8), 13), 2)
+    return bor(immhi, immlo)
+  elseif name == "tbz" or name == "tbnz" then
+    return lshift(arshift(lshift(op, 13), 18), 2)
+  else
+    return lshift(arshift(lshift(op, 8), 13), 2)
+  end
+end
+
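+-- The 8-bit FP immediate (op bits 13-20) packs a sign, a 3-bit biased
+-- exponent and a 4-bit fraction, i.e. +/-(16+frac)*2^exp, covering
+-- +/-0.125..31.0; e.g. imm8 0x70 decodes to 1.0 ("fmov d0, #1.0").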
+local function parse_fpimm8(op)
+  local sign = band(op, 0x100000) == 0 and 1 or -1
+  local exp = bxor(rshift(arshift(lshift(op, 12), 5), 24), 0x80) - 131
+  local frac = 16+band(rshift(op, 13), 15)
+  return sign * frac * 2^exp
+end
+
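+-- Follow the architecture's preferred-alias rules: decide whether a
+-- ubfm/sbfm reads better as the ubfx/sbfx alias than as a shift or
+-- extend alias.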
+local function prefer_bfx(sf, uns, imms, immr)
+  if imms < immr or imms == 31 or imms == 63 then
+    return false
+  end
+  if immr == 0 then
+    if sf == 0 and (imms == 7 or imms == 15) then
+      return false
+    end
+    if sf ~= 0 and uns == 0 and (imms == 7 or imms == 15 or imms == 31) then
+      return false
+    end
+  end
+  return true
+end
+
+-- Disassemble a single instruction.
+local function disass_ins(ctx)
+  local pos = ctx.pos
+  local b0, b1, b2, b3 = byte(ctx.code, pos+1, pos+4)
+  local op = bor(lshift(b3, 24), lshift(b2, 16), lshift(b1, 8), b0)
+  local operands = {}
+  local suffix = ""
+  local last, name, pat
+  local vr
+  local map_reg
+  ctx.op = op
+  ctx.rel = nil
+  last = nil
+  local opat
+  opat = map_init[band(rshift(op, 25), 15)]
+  while type(opat) ~= "string" do
+    if not opat then return unknown(ctx) end
+    opat = opat[band(rshift(op, opat.shift), opat.mask)] or opat._
+  end
+  name, pat = match(opat, "^([a-z0-9]*)(.*)")
+  local altname, pat2 = match(pat, "|([a-z0-9_.|]*)(.*)")
+  if altname then pat = pat2 end
+  if sub(pat, 1, 1) == "." then
+    local s2, p2 = match(pat, "^([a-z0-9.]*)(.*)")
+    suffix = suffix..s2
+    pat = p2
+  end
+
+  local rt = match(pat, "[gf]")
+  if rt then
+    if rt == "g" then
+      map_reg = band(op, 0x80000000) ~= 0 and map_regs.x or map_regs.w
+    else
+      map_reg = band(op, 0x400000) ~= 0 and map_regs.d or map_regs.s
+    end
+  end
+
+  local second0, immr
+
+  for p in gmatch(pat, ".") do
+    local x = nil
+    if p == "D" then
+      local regnum = band(op, 31)
+      x = rt and map_reg[regnum] or match_reg(p, pat, regnum)
+    elseif p == "N" then
+      local regnum = band(rshift(op, 5), 31)
+      x = rt and map_reg[regnum] or match_reg(p, pat, regnum)
+    elseif p == "M" then
+      local regnum = band(rshift(op, 16), 31)
+      x = rt and map_reg[regnum] or match_reg(p, pat, regnum)
+    elseif p == "A" then
+      local regnum = band(rshift(op, 10), 31)
+      x = rt and map_reg[regnum] or match_reg(p, pat, regnum)
+    elseif p == "B" then
+      local addr = ctx.addr + pos + parse_immpc(op, name)
+      ctx.rel = addr
+      x = "0x"..tohex(addr)
+    elseif p == "T" then
+      x = bor(band(rshift(op, 26), 32), band(rshift(op, 19), 31))
+    elseif p == "V" then
+      x = band(op, 15)
+    elseif p == "C" then
+      x = map_cond[band(rshift(op, 12), 15)]
+    elseif p == "c" then
+      local rn = band(rshift(op, 5), 31)
+      local rm = band(rshift(op, 16), 31)
+      local cond = band(rshift(op, 12), 15)
+      local invc = bxor(cond, 1)
+      x = map_cond[cond]
+      if altname and cond ~= 14 and cond ~= 15 then
+	local a1, a2 = match(altname, "([^|]*)|(.*)")
+	if rn == rm then
+	  local n = #operands
+	  operands[n] = nil
+	  x = map_cond[invc]
+	  if rn ~= 31 then
+	    if a1 then name = a1 else name = altname end
+	  else
+	    operands[n-1] = nil
+	    name = a2
+	  end
+	end
+      end
+    elseif p == "W" then
+      x = band(rshift(op, 5), 0xffff)
+    elseif p == "Y" then
+      x = band(rshift(op, 5), 0xffff)
+      local hw = band(rshift(op, 21), 3)
+      if altname and (hw == 0 or x ~= 0) then
+	name = altname
+      end
+    elseif p == "L" then
+      local rn = map_regs.x[band(rshift(op, 5), 31)]
+      local imm9 = arshift(lshift(op, 11), 23)
+      if band(op, 0x800) ~= 0 then
+	x = "["..rn..", #"..imm9.."]!"
+      else
+	x = "["..rn.."], #"..imm9
+      end
+    elseif p == "U" then
+      local rn = map_regs.x[band(rshift(op, 5), 31)]
+      local sz = band(rshift(op, 30), 3)
+      local imm12 = lshift(arshift(lshift(op, 10), 20), sz)
+      if imm12 ~= 0 then
+	x = "["..rn..", #"..imm12.."]"
+      else
+	x = "["..rn.."]"
+      end
+    elseif p == "K" then
+      local rn = map_regs.x[band(rshift(op, 5), 31)]
+      local imm9 = arshift(lshift(op, 11), 23)
+      if imm9 ~= 0 then
+	x = "["..rn..", #"..imm9.."]"
+      else
+	x = "["..rn.."]"
+      end
+    elseif p == "O" then
+      local rn, rm = map_regs.x[band(rshift(op, 5), 31)]
+      local m = band(rshift(op, 13), 1)
+      if m == 0 then
+	rm = map_regs.w[band(rshift(op, 16), 31)]
+      else
+	rm = map_regs.x[band(rshift(op, 16), 31)]
+      end
+      x = "["..rn..", "..rm
+      local opt = band(rshift(op, 13), 7)
+      local s = band(rshift(op, 12), 1)
+      local sz = band(rshift(op, 30), 3)
+      -- Extension/shift applied to the offset register.
+      if opt == 3 then
+	if s == 0 then x = nil
+	else x = x..", lsl #"..sz.."]" end
+      elseif opt == 2 or opt == 6 or opt == 7 then
+	if s == 0 then x = x..", "..map_extend[opt].."]"
+	else x = x..", "..map_extend[opt].." #"..sz.."]" end
+      else
+	x = x.."]"
+      end
+    elseif p == "P" then
+      local opcv, sh = rshift(op, 26), 2
+      if opcv >= 0x2a then sh = 4 elseif opcv >= 0x1b then sh = 3 end
+      local imm7 = lshift(arshift(lshift(op, 10), 25), sh)
+      local rn = map_regs.x[band(rshift(op, 5), 31)]
+      local ind = band(rshift(op, 23), 3)
+      if ind == 1 then
+	x = "["..rn.."], #"..imm7
+      elseif ind == 2 then
+	if imm7 == 0 then
+	  x = "["..rn.."]"
+	else
+	  x = "["..rn..", #"..imm7.."]"
+	end
+      elseif ind == 3 then
+	x = "["..rn..", #"..imm7.."]!"
+      end
+    elseif p == "I" then
+      local shf = band(rshift(op, 22), 3)
+      local imm12 = band(rshift(op, 10), 0x0fff)
+      local n = #operands
+      local rn, rd = band(rshift(op, 5), 31), band(op, 31)
+      if altname == "mov" and shf == 0 and imm12 == 0 and (rn == 31 or rd == 31) then
+	name = altname
+	x = nil
+      elseif shf == 0 then
+	x = imm12
+      elseif shf == 1 then
+	x = imm12..", lsl #12"
+      end
+    elseif p == "i" then
+      x = "#0x"..decode_imm13(op)
+    elseif p == "1" then
+      immr = band(rshift(op, 16), 63)
+      x = immr
+    elseif p == "2" then
+      x = band(rshift(op, 10), 63)
+      if altname then
+	local a1, a2, a3, a4, a5, a6 =
+	  match(altname, "([^|]*)|([^|]*)|([^|]*)|([^|]*)|([^|]*)|(.*)")
+	local sf = band(rshift(op, 26), 32)
+	local uns = band(rshift(op, 30), 1)
+	if prefer_bfx(sf, uns, x, immr) then
+	  name = a2
+	  x = x - immr + 1
+	elseif immr == 0 and x == 7 then
+	  local n = #operands
+	  operands[n] = nil
+	  if sf ~= 0 then
+	    operands[n-1] = gsub(operands[n-1], "x", "w")
+	  end
+	  last = operands[n-1]
+	  name = a6
+	  x = nil
+	elseif immr == 0 and x == 15 then
+	  local n = #operands
+	  operands[n] = nil
+	  if sf ~= 0 then
+	    operands[n-1] = gsub(operands[n-1], "x", "w")
+	  end
+	  last = operands[n-1]
+	  name = a5
+	  x = nil
+	elseif x == 31 or x == 63 then
+	  if x == 31 and immr == 0 and name == "sbfm" then
+	    name = a4
+	    local n = #operands
+	    operands[n] = nil
+	    if sf ~= 0 then
+	      operands[n-1] = gsub(operands[n-1], "x", "w")
+	    end
+	    last = operands[n-1]
+	  else
+	    name = a3
+	  end
+	  x = nil
+	elseif band(x, 31) ~= 31 and immr == x+1 and name == "ubfm" then
+	  name = a4
+	  last = "#"..(sf+32 - immr)
+	  operands[#operands] = last
+	  x = nil
+	elseif x < immr then
+	  name = a1
+	  last = "#"..(sf+32 - immr)
+	  operands[#operands] = last
+	  x = x + 1
+	end
+      end
+    elseif p == "3" then
+      x = band(rshift(op, 10), 63)
+      if altname then
+	local a1, a2 = match(altname, "([^|]*)|(.*)")
+	if x < immr then
+	  name = a1
+	  local sf = band(rshift(op, 26), 32)
+	  last = "#"..(sf+32 - immr)
+	  operands[#operands] = last
+	  x = x + 1
+	elseif x >= immr then
+	  name = a2
+	  x = x - immr + 1
+	end
+      end
+    elseif p == "4" then
+      x = band(rshift(op, 10), 63)
+      local rn = band(rshift(op, 5), 31)
+      local rm = band(rshift(op, 16), 31)
+      if altname and rn == rm then
+	local n = #operands
+	operands[n] = nil
+	last = operands[n-1]
+	name = altname
+      end
+    elseif p == "5" then
+      x = band(rshift(op, 16), 31)
+    elseif p == "S" then
+      x = band(rshift(op, 10), 63)
+      if x == 0 then x = nil
+      else x = map_shift[band(rshift(op, 22), 3)].." #"..x end
+    elseif p == "X" then
+      local opt = band(rshift(op, 13), 7)
+      -- Width specifier <R>.
+      if opt ~= 3 and opt ~= 7 then
+	last = map_regs.w[band(rshift(op, 16), 31)]
+	operands[#operands] = last
+      end
+      x = band(rshift(op, 10), 7)
+      -- Extension.
+      if opt == 2 + band(rshift(op, 31), 1) and
+	 band(rshift(op, second0 and 5 or 0), 31) == 31 then
+	if x == 0 then x = nil
+	else x = "lsl #"..x end
+      else
+	if x == 0 then x = map_extend[band(rshift(op, 13), 7)]
+	else x = map_extend[band(rshift(op, 13), 7)].." #"..x end
+      end
+    elseif p == "R" then
+      x = band(rshift(op,21), 3)
+      if x == 0 then x = nil
+      else x = "lsl #"..x*16 end
+    elseif p == "z" then
+      local n = #operands
+      if operands[n] == "sp" then operands[n] = "xzr"
+      elseif operands[n] == "wsp" then operands[n] = "wzr"
+      end
+    elseif p == "Z" then
+      x = 0
+    elseif p == "F" then
+      x = parse_fpimm8(op)
+    elseif p == "g" or p == "f" or p == "x" or p == "w" or
+	   p == "d" or p == "s" then
+      -- These are handled in D/N/M/A.
+    elseif p == "0" then
+      if last == "sp" or last == "wsp" then
+	local n = #operands
+	operands[n] = nil
+	last = operands[n-1]
+	if altname then
+	  local a1, a2 = match(altname, "([^|]*)|(.*)")
+	  if not a1 then
+	    name = altname
+	  elseif second0 then
+	    name, altname = a2, a1
+	  else
+	    name, altname = a1, a2
+	  end
+	end
+      end
+      second0 = true
+    else
+      assert(false)
+    end
+    if x then
+      last = x
+      if type(x) == "number" then x = "#"..x end
+      operands[#operands+1] = x
+    end
+  end
+
+  return putop(ctx, name..suffix, operands)
+end
+
+------------------------------------------------------------------------------
+
+-- Disassemble a block of code.
+local function disass_block(ctx, ofs, len)
+  if not ofs then ofs = 0 end
+  local stop = len and ofs+len or #ctx.code
+  ctx.pos = ofs
+  ctx.rel = nil
+  while ctx.pos < stop do disass_ins(ctx) end
+end
+
+-- Extended API: create a disassembler context. Then call ctx:disass(ofs, len).
+local function create(code, addr, out)
+  local ctx = {}
+  ctx.code = code
+  ctx.addr = addr or 0
+  ctx.out = out or io.write
+  ctx.symtab = {}
+  ctx.disass = disass_block
+  ctx.hexdump = 8
+  return ctx
+end
+
+-- Simple API: disassemble code (a string) at address and output via out.
+local function disass(code, addr, out)
+  create(code, addr, out):disass()
+end
+
+-- Return register name for RID.
+local function regname(r)
+  if r < 32 then return map_regs.x[r] end
+  return map_regs.d[r-32]
+end
+
+-- Public module functions.
+return {
+  create = create,
+  disass = disass,
+  regname = regname
+}
+
diff --git a/src/lj_arch.h b/src/lj_arch.h
index cc5a0a66..3df602e3 100644
--- a/src/lj_arch.h
+++ b/src/lj_arch.h
@@ -226,7 +226,6 @@
 #define LJ_TARGET_UNIFYROT	2	/* Want only IR_BROR. */
 #define LJ_TARGET_GC64		1
 #define LJ_ARCH_NUMMODE		LJ_NUMMODE_DUAL
-#define LJ_ARCH_NOJIT		1	/* NYI */
 
 #define LJ_ARCH_VERSION		80
 
diff --git a/src/lj_asm.c b/src/lj_asm.c
index 7ce58924..2cb5abea 100644
--- a/src/lj_asm.c
+++ b/src/lj_asm.c
@@ -171,6 +171,8 @@ IRFLDEF(FLOFS)
 #include "lj_emit_x86.h"
 #elif LJ_TARGET_ARM
 #include "lj_emit_arm.h"
+#elif LJ_TARGET_ARM64
+#include "lj_emit_arm64.h"
 #elif LJ_TARGET_PPC
 #include "lj_emit_ppc.h"
 #elif LJ_TARGET_MIPS
@@ -1563,6 +1565,8 @@ static void asm_loop(ASMState *as)
 #include "lj_asm_x86.h"
 #elif LJ_TARGET_ARM
 #include "lj_asm_arm.h"
+#elif LJ_TARGET_ARM64
+#include "lj_asm_arm64.h"
 #elif LJ_TARGET_PPC
 #include "lj_asm_ppc.h"
 #elif LJ_TARGET_MIPS
diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h
new file mode 100644
index 00000000..0a2f5306
--- /dev/null
+++ b/src/lj_asm_arm64.h
@@ -0,0 +1,1823 @@
+/*
+** ARM64 IR assembler (SSA IR -> machine code).
+** Copyright (C) 2005-2016 Mike Pall. See Copyright Notice in luajit.h
+**
+** Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
+** Sponsored by Cisco Systems, Inc.
+*/
+
+/* -- Register allocator extensions --------------------------------------- */
+
+/* Allocate a register with a hint. */
+static Reg ra_hintalloc(ASMState *as, IRRef ref, Reg hint, RegSet allow)
+{
+  Reg r = IR(ref)->r;
+  if (ra_noreg(r)) {
+    if (!ra_hashint(r) && !iscrossref(as, ref))
+      ra_sethint(IR(ref)->r, hint);  /* Propagate register hint. */
+    r = ra_allocref(as, ref, allow);
+  }
+  ra_noweak(as, r);
+  return r;
+}
+
+/* Allocate two source registers for three-operand instructions. */
+static Reg ra_alloc2(ASMState *as, IRIns *ir, RegSet allow)
+{
+  IRIns *irl = IR(ir->op1), *irr = IR(ir->op2);
+  Reg left = irl->r, right = irr->r;
+  if (ra_hasreg(left)) {
+    ra_noweak(as, left);
+    if (ra_noreg(right))
+      right = ra_allocref(as, ir->op2, rset_exclude(allow, left));
+    else
+      ra_noweak(as, right);
+  } else if (ra_hasreg(right)) {
+    ra_noweak(as, right);
+    left = ra_allocref(as, ir->op1, rset_exclude(allow, right));
+  } else if (ra_hashint(right)) {
+    right = ra_allocref(as, ir->op2, allow);
+    left = ra_alloc1(as, ir->op1, rset_exclude(allow, right));
+  } else {
+    left = ra_allocref(as, ir->op1, allow);
+    right = ra_alloc1(as, ir->op2, rset_exclude(allow, left));
+  }
+  return left | (right << 8);
+}
+
+/* -- Guard handling ------------------------------------------------------ */
+
+/* Generate an exit stub group at the bottom of the reserved MCode memory. */
+static MCode *asm_exitstub_gen(ASMState *as, ExitNo group)
+{
+  MCode *mxp = as->mcbot;
+  int i;
+  if (mxp + 3*4+4*EXITSTUBS_PER_GROUP >= as->mctop)
+    asm_mclimit(as);
+  /* str lr, [sp]; bl ->vm_exit_handler; .long group. */
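+  /* Traces exit with a bl to one of the per-exit stubs below. Each stub
+  ** branches back here (-3-i words), so the lr saved by the str identifies
+  ** the taken stub, while the group word supplies the base exit number.
+  */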
+  *mxp++ = A64I_STRx | A64F_D(RID_LR) | A64F_N(RID_SP);
+  *mxp = A64I_BL | (((MCode *)(void *)lj_vm_exit_handler-mxp)&0x03ffffffu);
+  mxp++;
+  *mxp++ = group*EXITSTUBS_PER_GROUP;
+  for (i = 0; i < EXITSTUBS_PER_GROUP; i++)
+    *mxp++ = A64I_B | ((-3-i)&0x03ffffffu);
+  lj_mcode_sync(as->mcbot, mxp);
+  lj_mcode_commitbot(as->J, mxp);
+  as->mcbot = mxp;
+  as->mclim = as->mcbot + MCLIM_REDZONE;
+  return mxp - EXITSTUBS_PER_GROUP;
+}
+
+/* Setup all needed exit stubs. */
+static void asm_exitstub_setup(ASMState *as, ExitNo nexits)
+{
+  ExitNo i;
+  if (nexits >= EXITSTUBS_PER_GROUP*LJ_MAX_EXITSTUBGR)
+    lj_trace_err(as->J, LJ_TRERR_SNAPOV);
+  for (i = 0; i < (nexits+EXITSTUBS_PER_GROUP-1)/EXITSTUBS_PER_GROUP; i++)
+    if (as->J->exitstubgroup[i] == NULL)
+      as->J->exitstubgroup[i] = asm_exitstub_gen(as, i);
+}
+
+/* Emit conditional branch to exit for guard. */
+static void asm_guardcc(ASMState *as, A64CC cc)
+{
+  MCode *target = exitstub_addr(as->J, as->snapno);
+  MCode *p = as->mcp;
+  if (LJ_UNLIKELY(p == as->invmcp)) {
+    as->loopinv = 1;
+    *p = A64I_BL | ((target-p) & 0x03ffffffu);
+    emit_cond_branch(as, cc^1, p-1);
+    return;
+  }
+  /* No conditional calls. Emit b.cc/bl instead. */
+  /* That's a bad idea. NYI: emit per-trace exit stubs instead, see PPC. */
+  emit_branch(as, A64I_BL, target);
+  emit_cond_branch(as, cc^1, p);
+}
+
+/* -- Operand fusion ------------------------------------------------------ */
+
+/* Limit linear search to this distance. Avoids O(n^2) behavior. */
+#define CONFLICT_SEARCH_LIM	31
+
+static int asm_isk32(ASMState *as, IRRef ref, int32_t *k)
+{
+  if (irref_isk(ref)) {
+    IRIns *ir = IR(ref);
+    if (ir->o == IR_KNULL || !irt_is64(ir->t)) {
+      *k = ir->i;
+      return 1;
+    } else if (checki32((int64_t)ir_k64(ir)->u64)) {
+      *k = (int32_t)ir_k64(ir)->u64;
+      return 1;
+    }
+  }
+  return 0;
+}
+
+/* Check if there's no conflicting instruction between curins and ref. */
+static int noconflict(ASMState *as, IRRef ref, IROp conflict)
+{
+  IRIns *ir = as->ir;
+  IRRef i = as->curins;
+  if (i > ref + CONFLICT_SEARCH_LIM)
+    return 0;  /* Give up, ref is too far away. */
+  while (--i > ref)
+    if (ir[i].o == conflict)
+      return 0;  /* Conflict found. */
+  return 1;  /* Ok, no conflict. */
+}
+
+/* Fuse the array base of colocated arrays. */
+static int32_t asm_fuseabase(ASMState *as, IRRef ref)
+{
+  IRIns *ir = IR(ref);
+  if (ir->o == IR_TNEW && ir->op1 <= LJ_MAX_COLOSIZE &&
+      !neverfuse(as) && noconflict(as, ref, IR_NEWREF))
+    return (int32_t)sizeof(GCtab);
+  return 0;
+}
+
+#define FUSE_REG	0x40000000
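+/* Flag in *ofsp: the low bits hold a fused index register, not an offset. */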
+
+/* Fuse array/hash/upvalue reference into register+offset operand. */
+static Reg asm_fuseahuref(ASMState *as, IRRef ref, int32_t *ofsp, RegSet allow,
+			  A64Ins ins)
+{
+  IRIns *ir = IR(ref);
+  if (ra_noreg(ir->r)) {
+    if (ir->o == IR_AREF) {
+      if (mayfuse(as, ref)) {
+	if (irref_isk(ir->op2)) {
+	  IRRef tab = IR(ir->op1)->op1;
+	  int32_t ofs = asm_fuseabase(as, tab);
+	  IRRef refa = ofs ? tab : ir->op1;
+	  ofs += 8*IR(ir->op2)->i;
+	  if (emit_checkofs(ins, ofs)) {
+	    *ofsp = ofs;
+	    return ra_alloc1(as, refa, allow);
+	  }
+	} else {
+	  Reg base = ra_alloc1(as, ir->op1, allow);
+	  *ofsp = FUSE_REG|ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, base));
+	  return base;
+	}
+      }
+    } else if (ir->o == IR_HREFK) {
+      if (mayfuse(as, ref)) {
+	int32_t ofs = (int32_t)(IR(ir->op2)->op2 * sizeof(Node));
+	if (emit_checkofs(ins, ofs)) {
+	  *ofsp = ofs;
+	  return ra_alloc1(as, ir->op1, allow);
+	}
+      }
+    } else if (ir->o == IR_UREFC) {
+      if (irref_isk(ir->op1)) {
+	GCfunc *fn = ir_kfunc(IR(ir->op1));
+	GCupval *uv = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv;
+	int64_t ofs = glofs(as, &uv->tv);
+	if (emit_checkofs(ins, ofs)) {
+	  *ofsp = (int32_t)ofs;
+	  return RID_GL;
+	}
+      }
+    }
+  }
+  *ofsp = 0;
+  return ra_alloc1(as, ref, allow);
+}
+
+/* Fuse m operand into arithmetic/logic instructions. */
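+/* The m operand may be a register, a 12-bit (arithmetic) or 13-bit bitmask
+** (logical) immediate, or a shifted/extended register; the opcode test
+** below selects the immediate form, and e.g. an IR shift feeding an ADD
+** fuses into a single "add x0, x1, x2, lsl #3".
+*/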
+static uint32_t asm_fuseopm(ASMState *as, A64Ins ai, IRRef ref, RegSet allow)
+{
+  IRIns *ir = IR(ref);
+  if (ra_hasreg(ir->r)) {
+    ra_noweak(as, ir->r);
+    return A64F_M(ir->r);
+  } else if (irref_isk(ref)) {
+    uint32_t m;
+    int64_t k = get_k64val(ir);
+    if ((ai & 0x1f000000) == 0x0a000000)
+      m = emit_isk13(k, irt_is64(ir->t));
+    else
+      m = emit_isk12(k);
+    if (m)
+      return m;
+  } else if (mayfuse(as, ref)) {
+    if ((ir->o >= IR_BSHL && ir->o <= IR_BSAR && irref_isk(ir->op2)) ||
+	(ir->o == IR_ADD && ir->op1 == ir->op2)) {
+      A64Shift sh = ir->o == IR_BSHR ? A64SH_LSR :
+		    ir->o == IR_BSAR ? A64SH_ASR : A64SH_LSL;
+      int shift = ir->o == IR_ADD ? 1 :
+		    (IR(ir->op2)->i & (irt_is64(ir->t) ? 63 : 31));
+      IRIns *irl = IR(ir->op1);
+      if (sh == A64SH_LSL &&
+	  irl->o == IR_CONV &&
+	  irl->op2 == ((IRT_I64<<IRCONV_DSH)|IRT_INT|IRCONV_SEXT) &&
+	  shift <= 4 &&
+	  mayfuse(as, ir->op1)) {
+	Reg m = ra_alloc1(as, irl->op1, allow);
+	return A64F_M(m) | A64F_EXSH(A64EX_SXTW, shift);
+      } else {
+	Reg m = ra_alloc1(as, ir->op1, allow);
+	return A64F_M(m) | A64F_SH(sh, shift);
+      }
+    } else if (ir->o == IR_CONV &&
+	       ir->op2 == ((IRT_I64<<IRCONV_DSH)|IRT_INT|IRCONV_SEXT)) {
+      Reg m = ra_alloc1(as, ir->op1, allow);
+      return A64F_M(m) | A64F_EX(A64EX_SXTW);
+    }
+  }
+  return A64F_M(ra_allocref(as, ref, allow));
+}
+
+/* Fuse XLOAD/XSTORE reference into load/store operand. */
+static void asm_fusexref(ASMState *as, A64Ins ai, Reg rd, IRRef ref,
+			 RegSet allow)
+{
+  IRIns *ir = IR(ref);
+  Reg base;
+  int32_t ofs = 0;
+  if (ra_noreg(ir->r) && canfuse(as, ir)) {
+    if (ir->o == IR_ADD) {
+      if (asm_isk32(as, ir->op2, &ofs) && emit_checkofs(ai, ofs))
+	ref = ir->op1;
+      /* NYI: Fuse add with two registers. */
+    } else if (ir->o == IR_STRREF) {
+      if (asm_isk32(as, ir->op2, &ofs)) {
+	ref = ir->op1;
+      } else if (asm_isk32(as, ir->op1, &ofs)) {
+	ref = ir->op2;
+      } else {
+	/* NYI: Fuse ADD with constant. */
+	Reg rn = ra_alloc1(as, ir->op1, allow);
+	uint32_t m = asm_fuseopm(as, 0, ir->op2, rset_exclude(allow, rn));
+	emit_lso(as, ai, rd, rd, sizeof(GCstr));
+	emit_dn(as, A64I_ADDx^m, rd, rn);
+	return;
+      }
+      ofs += sizeof(GCstr);
+      if (!emit_checkofs(ai, ofs)) {
+	Reg rn = ra_alloc1(as, ref, allow);
+	Reg rm = ra_allock(as, ofs, rset_exclude(allow, rn));
+	emit_dnm(as, (ai ^ 0x01204800), rd, rn, rm);
+	return;
+      }
+    }
+  }
+  base = ra_alloc1(as, ref, allow);
+  emit_lso(as, ai, (rd & 31), base, ofs);
+}
+
+/* -- Calls --------------------------------------------------------------- */
+
+/* Generate a call to a C function. */
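+/* AAPCS64: the first 8 integer args go in x0-x7 and the first 8 FP args
+** in d0-d7; any further args are passed on the stack in 8-byte slots.
+*/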
+static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
+{
+  uint32_t n, nargs = CCI_XNARGS(ci);
+  int32_t ofs = 0;
+  Reg gpr, fpr = REGARG_FIRSTFPR;
+  if ((void *)ci->func)
+    emit_call(as, (void *)ci->func);
+  for (gpr = REGARG_FIRSTGPR; gpr <= REGARG_LASTGPR; gpr++)
+    as->cost[gpr] = REGCOST(~0u, ASMREF_L);
+  gpr = REGARG_FIRSTGPR;
+  for (n = 0; n < nargs; n++) { /* Setup args. */
+    IRRef ref = args[n];
+    IRIns *ir = IR(ref);
+    if (ref) {
+      if (irt_isfp(ir->t)) {
+	if (fpr <= REGARG_LASTFPR) {
+	  lua_assert(rset_test(as->freeset, fpr)); /* Must have been evicted. */
+	  ra_leftov(as, fpr, ref);
+	  fpr++;
+	} else {
+	  Reg r = ra_alloc1(as, ref, RSET_FPR);
+	  emit_spstore(as, ir, r, ofs);
+	  ofs += 8;
+	}
+      } else {
+	if (gpr <= REGARG_LASTGPR) {
+	  lua_assert(rset_test(as->freeset, gpr)); /* Must have been evicted. */
+	  ra_leftov(as, gpr, ref);
+	  gpr++;
+	} else {
+	  Reg r = ra_alloc1(as, ref, RSET_GPR);
+	  emit_spstore(as, ir, r, ofs);
+	  ofs += 8;
+	}
+      }
+    }
+  }
+}
+
+/* Setup result reg/sp for call. Evict scratch regs. */
+static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci)
+{
+  RegSet drop = RSET_SCRATCH;
+  if (ra_hasreg(ir->r))
+    rset_clear(drop, ir->r); /* Dest reg handled below. */
+  ra_evictset(as, drop); /* Evictions must be performed first. */
+  if (ra_used(ir)) {
+    lua_assert(!irt_ispri(ir->t));
+    if (irt_isfp(ir->t)) {
+      if (ci->flags & CCI_CASTU64) {
+	Reg dest = ra_dest(as, ir, RSET_FPR) & 31;
+	emit_dn(as, irt_isnum(ir->t) ? A64I_FMOV_D_R : A64I_FMOV_S_R,
+		dest, RID_RET);
+      } else {
+	ra_destreg(as, ir, RID_FPRET);
+      }
+    } else {
+      ra_destreg(as, ir, RID_RET);
+    }
+  }
+  UNUSED(ci);
+}
+
+static void asm_callx(ASMState *as, IRIns *ir)
+{
+  IRRef args[CCI_NARGS_MAX*2];
+  CCallInfo ci;
+  IRRef func;
+  IRIns *irf;
+  ci.flags = asm_callx_flags(as, ir);
+  asm_collectargs(as, ir, &ci, args);
+  asm_setupresult(as, ir, &ci);
+  func = ir->op2; irf = IR(func);
+  if (irf->o == IR_CARG) { func = irf->op1; irf = IR(func); }
+  if (irref_isk(func)) {  /* Call to constant address. */
+    ci.func = (ASMFunction)(ir_k64(irf)->u64);
+  } else {  /* Need a non-argument register for indirect calls. */
+    Reg freg = ra_alloc1(as, func, RSET_RANGE(RID_X8, RID_MAX_GPR)-RSET_FIXED);
+    emit_n(as, A64I_BLR, freg);
+    ci.func = (ASMFunction)(void *)0;
+  }
+  asm_gencall(as, &ci, args);
+}
+
+/* -- Returns ------------------------------------------------------------- */
+
+/* Return to lower frame. Guard that it goes to the right spot. */
+static void asm_retf(ASMState *as, IRIns *ir)
+{
+  Reg base = ra_alloc1(as, REF_BASE, RSET_GPR);
+  void *pc = ir_kptr(IR(ir->op2));
+  int32_t delta = 1+LJ_FR2+bc_a(*((const BCIns *)pc - 1));
+  as->topslot -= (BCReg)delta;
+  if ((int32_t)as->topslot < 0) as->topslot = 0;
+  irt_setmark(IR(REF_BASE)->t);  /* Children must not coalesce with BASE reg. */
+  /* Need to force a spill on REF_BASE now to update the stack slot. */
+  emit_lso(as, A64I_STRx, base, RID_SP, ra_spill(as, IR(REF_BASE)));
+  emit_setgl(as, base, jit_base);
+  emit_addptr(as, base, -8*delta);
+  asm_guardcc(as, CC_NE);
+  emit_nm(as, A64I_CMPx, RID_TMP,
+	  ra_allock(as, i64ptr(pc), rset_exclude(RSET_GPR, base)));
+  emit_lso(as, A64I_LDRx, RID_TMP, base, -8);
+}
+
+/* -- Type conversions ---------------------------------------------------- */
+
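+/* Checked number-to-int conversion: convert to a 32-bit integer, convert
+** back and fcmp with the original value; the guard exits on any rounding.
+*/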
+static void asm_tointg(ASMState *as, IRIns *ir, Reg left)
+{
+  Reg tmp = ra_scratch(as, rset_exclude(RSET_FPR, left));
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  asm_guardcc(as, CC_NE);
+  emit_nm(as, A64I_FCMPd, (tmp & 31), (left & 31));
+  emit_dn(as, A64I_FCVT_F64_S32, (tmp & 31), dest);
+  emit_dn(as, A64I_FCVT_S32_F64, dest, (left & 31));
+}
+
+static void asm_tobit(ASMState *as, IRIns *ir)
+{
+  RegSet allow = RSET_FPR;
+  Reg left = ra_alloc1(as, ir->op1, allow);
+  Reg right = ra_alloc1(as, ir->op2, rset_clear(allow, left));
+  Reg tmp = ra_scratch(as, rset_clear(allow, right));
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  emit_dn(as, A64I_FMOV_R_S, dest, (tmp & 31));
+  emit_dnm(as, A64I_FADDd, (tmp & 31), (left & 31), (right & 31));
+}
+
+static void asm_conv(ASMState *as, IRIns *ir)
+{
+  IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK);
+  int st64 = (st == IRT_I64 || st == IRT_U64 || st == IRT_P64);
+  int stfp = (st == IRT_NUM || st == IRT_FLOAT);
+  IRRef lref = ir->op1;
+  lua_assert(irt_type(ir->t) != st);
+  if (irt_isfp(ir->t)) {
+    Reg dest = ra_dest(as, ir, RSET_FPR);
+    if (stfp) {  /* FP to FP conversion. */
+      emit_dn(as, st == IRT_NUM ? A64I_FCVT_F32_F64 : A64I_FCVT_F64_F32,
+	      (dest & 31), (ra_alloc1(as, lref, RSET_FPR) & 31));
+    } else {  /* Integer to FP conversion. */
+      Reg left = ra_alloc1(as, lref, RSET_GPR);
+      A64Ins ai = irt_isfloat(ir->t) ?
+	(((IRT_IS64 >> st) & 1) ?
+	 (st == IRT_I64 ? A64I_FCVT_F32_S64 : A64I_FCVT_F32_U64) :
+	 (st == IRT_INT ? A64I_FCVT_F32_S32 : A64I_FCVT_F32_U32)) :
+	(((IRT_IS64 >> st) & 1) ?
+	 (st == IRT_I64 ? A64I_FCVT_F64_S64 : A64I_FCVT_F64_U64) :
+	 (st == IRT_INT ? A64I_FCVT_F64_S32 : A64I_FCVT_F64_U32));
+      emit_dn(as, ai, (dest & 31), left);
+    }
+  } else if (stfp) {  /* FP to integer conversion. */
+    if (irt_isguard(ir->t)) {
+      /* Checked conversions are only supported from number to int. */
+      lua_assert(irt_isint(ir->t) && st == IRT_NUM);
+      asm_tointg(as, ir, ra_alloc1(as, lref, RSET_FPR));
+    } else {
+      Reg left = ra_alloc1(as, lref, RSET_FPR);
+      Reg dest = ra_dest(as, ir, RSET_GPR);
+      A64Ins ai = irt_is64(ir->t) ?
+	(st == IRT_NUM ?
+	 (irt_isi64(ir->t) ? A64I_FCVT_S64_F64 : A64I_FCVT_U64_F64) :
+	 (irt_isi64(ir->t) ? A64I_FCVT_S64_F32 : A64I_FCVT_U64_F32)) :
+	(st == IRT_NUM ?
+	 (irt_isint(ir->t) ? A64I_FCVT_S32_F64 : A64I_FCVT_U32_F64) :
+	 (irt_isint(ir->t) ? A64I_FCVT_S32_F32 : A64I_FCVT_U32_F32));
+      emit_dn(as, ai, dest, (left & 31));
+    }
+  } else if (st >= IRT_I8 && st <= IRT_U16) { /* Extend to 32 bit integer. */
+    Reg dest = ra_dest(as, ir, RSET_GPR);
+    Reg left = ra_alloc1(as, lref, RSET_GPR);
+    A64Ins ai = st == IRT_I8 ? A64I_SXTBw :
+		st == IRT_U8 ? A64I_UXTBw :
+		st == IRT_I16 ? A64I_SXTHw : A64I_UXTHw;
+    lua_assert(irt_isint(ir->t) || irt_isu32(ir->t));
+    emit_dn(as, ai, dest, left);
+  } else {
+    Reg dest = ra_dest(as, ir, RSET_GPR);
+    if (irt_is64(ir->t)) {
+      if (st64 || !(ir->op2 & IRCONV_SEXT)) {
+	/* 64/64 bit no-op (cast) or 32 to 64 bit zero extension. */
+	ra_leftov(as, dest, lref);  /* Do nothing, but may need to move regs. */
+      } else {  /* 32 to 64 bit sign extension. */
+	Reg left = ra_alloc1(as, lref, RSET_GPR);
+	emit_dn(as, A64I_SXTW, dest, left);
+      }
+    } else {
+      if (st64) {
+	/* This is either a 32 bit reg/reg mov which zeroes the hiword
+	** or a load of the loword from a 64 bit address.
+	*/
+	Reg left = ra_alloc1(as, lref, RSET_GPR);
+	emit_dm(as, A64I_MOVw, dest, left);
+      } else {  /* 32/32 bit no-op (cast). */
+	ra_leftov(as, dest, lref);  /* Do nothing, but may need to move regs. */
+      }
+    }
+  }
+}
+
+static void asm_strto(ASMState *as, IRIns *ir)
+{
+  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_strscan_num];
+  IRRef args[2];
+  Reg dest = 0, tmp;
+  int destused = ra_used(ir);
+  int32_t ofs = 0;
+  ra_evictset(as, RSET_SCRATCH);
+  if (destused) {
+    if (ra_hasspill(ir->s)) {
+      ofs = sps_scale(ir->s);
+      destused = 0;
+      if (ra_hasreg(ir->r)) {
+	ra_free(as, ir->r);
+	ra_modified(as, ir->r);
+	emit_spload(as, ir, ir->r, ofs);
+      }
+    } else {
+      dest = ra_dest(as, ir, RSET_FPR);
+    }
+  }
+  asm_guardcc(as, CC_EQ);
+  if (destused)
+    emit_lso(as, A64I_LDRd, (dest & 31), RID_SP, 0);
+  emit_n(as, (A64I_CMPw^A64I_K12)|A64F_U12(0), RID_RET);
+  args[0] = ir->op1; /* GCstr *str */
+  args[1] = ASMREF_TMP1; /* TValue *n  */
+  asm_gencall(as, ci, args);
+  tmp = ra_releasetmp(as, ASMREF_TMP1);
+  emit_opk(as, A64I_ADDx, tmp, RID_SP, ofs, RSET_GPR);
+}
+
+/* -- Memory references --------------------------------------------------- */
+
+/* Get pointer to TValue. */
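+/* LJ_GC64 TValues keep the type code above a 47-bit payload: primitives
+** are materialized as ~((int64_t)~itype << 47), and integer/GC values get
+** (itype << 47) added in below (see lj_obj.h for the canonical layout).
+*/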
+static void asm_tvptr(ASMState *as, Reg dest, IRRef ref)
+{
+  IRIns *ir = IR(ref);
+  if (irt_isnum(ir->t)) {
+    if (irref_isk(ref)) {
+      /* Use the number constant itself as a TValue. */
+      ra_allockreg(as, i64ptr(ir_knum(ir)), dest);
+    } else {
+      /* Otherwise force a spill and use the spill slot. */
+      emit_opk(as, A64I_ADDx, dest, RID_SP, ra_spill(as, ir), RSET_GPR);
+    }
+  } else {
+    /* Otherwise use g->tmptv to hold the TValue. */
+    RegSet allow = rset_exclude(RSET_GPR, dest);
+    Reg src;
+    if (irref_isk(ref)) {
+      TValue k;
+      lj_ir_kvalue(as->J->L, &k, ir);
+      src = ra_allock(as, k.u64, allow);
+      emit_lso(as, A64I_STRx, src, dest, 0);
+    } else {
+      Reg type;
+      if (irt_ispri(ir->t)) {
+	src = ra_allock(as, ~((int64_t)~irt_toitype(ir->t) << 47), allow);
+	emit_lso(as, A64I_STRx, src, dest, 0);
+      } else if (irt_isint(ir->t)) {
+	src = ra_alloc1(as, ref, allow);
+	type = ra_allock(as, (int64_t)irt_toitype(ir->t) << 47, allow);
+	emit_lso(as, A64I_STRx, RID_TMP, dest, 0);
+	emit_dnm(as, A64I_ADDx | A64F_EX(A64EX_UXTW), RID_TMP, type, src);
+      } else {
+	src = ra_alloc1(as, ref, allow);
+	type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow);
+	emit_lso(as, A64I_STRx, RID_TMP, dest, 0);
+	emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 47), RID_TMP, src, type);
+      }
+    }
+    ra_allockreg(as, i64ptr(&J2G(as->J)->tmptv), dest);
+  }
+}
+
+static void asm_aref(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  Reg idx, base;
+  if (irref_isk(ir->op2)) {
+    IRRef tab = IR(ir->op1)->op1;
+    int32_t ofs = asm_fuseabase(as, tab);
+    IRRef refa = ofs ? tab : ir->op1;
+    uint32_t k = emit_isk12(ofs + 8*IR(ir->op2)->i);
+    if (k) {
+      base = ra_alloc1(as, refa, RSET_GPR);
+      emit_dn(as, A64I_ADDx^k, dest, base);
+      return;
+    }
+  }
+  base = ra_alloc1(as, ir->op1, RSET_GPR);
+  idx = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, base));
+  emit_dnm(as, A64I_ADDx | A64F_EXSH(A64EX_UXTW, 3), dest, base, idx);
+}
+
+/* Inlined hash lookup. Specialized for key type and for const keys.
+** The equivalent C code is:
+**   Node *n = hashkey(t, key);
+**   do {
+**     if (lj_obj_equal(&n->key, key)) return &n->val;
+**   } while ((n = nextnode(n)));
+**   return niltv(L);
+*/
+static void asm_href(ASMState *as, IRIns *ir, IROp merge)
+{
+  RegSet allow = RSET_GPR;
+  int destused = ra_used(ir);
+  Reg dest = ra_dest(as, ir, allow);
+  Reg tab = ra_alloc1(as, ir->op1, rset_clear(allow, dest));
+  Reg key = 0, tmp = RID_TMP;
+  IRRef refkey = ir->op2;
+  IRIns *irkey = IR(refkey);
+  int isk = irref_isk(ir->op2);
+  IRType1 kt = irkey->t;
+  uint32_t k = 0;
+  uint32_t khash;
+  MCLabel l_end, l_loop, l_next;
+  rset_clear(allow, tab);
+
+  if (!isk) {
+    key = ra_alloc1(as, ir->op2, irt_isnum(kt) ? RSET_FPR : allow);
+    rset_clear(allow, key);
+    if (!irt_isstr(kt)) {
+      tmp = ra_scratch(as, allow);
+      rset_clear(allow, tmp);
+    }
+  } else if (irt_isnum(kt)) {
+    int64_t val = (int64_t)ir_knum(irkey)->u64;
+    if (!(k = emit_isk12(val))) {
+      key = ra_allock(as, val, allow);
+      rset_clear(allow, key);
+    }
+  } else if (!irt_ispri(kt)) {
+    if (!(k = emit_isk12(irkey->i))) {
+      key = ra_alloc1(as, refkey, allow);
+      rset_clear(allow, key);
+    }
+  }
+
+  /* Key not found in chain: jump to exit (if merged) or load niltv. */
+  l_end = emit_label(as);
+  as->invmcp = NULL;
+  if (merge == IR_NE)
+    asm_guardcc(as, CC_AL);
+  else if (destused)
+    emit_loada(as, dest, niltvg(J2G(as->J)));
+
+  /* Follow hash chain until the end. */
+  l_loop = --as->mcp;
+  emit_n(as, A64I_CMPx^A64I_K12^0, dest);
+  emit_lso(as, A64I_LDRx, dest, dest, offsetof(Node, next));
+  l_next = emit_label(as);
+
+  /* Type and value comparison. */
+  if (merge == IR_EQ)
+    asm_guardcc(as, CC_EQ);
+  else
+    emit_cond_branch(as, CC_EQ, l_end);
+
+  if (irt_isnum(kt)) {
+    if (isk) {
+      /* Assumes -0.0 is already canonicalized to +0.0. */
+      if (k)
+	emit_n(as, A64I_CMPx^k, tmp);
+      else
+	emit_nm(as, A64I_CMPx, key, tmp);
+      emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key.u64));
+    } else {
+      Reg tisnum = ra_allock(as, LJ_TISNUM << 15, allow);
+      Reg ftmp = ra_scratch(as, rset_exclude(RSET_FPR, key));
+      rset_clear(allow, tisnum);
+      emit_nm(as, A64I_FCMPd, key, ftmp);
+      emit_dn(as, A64I_FMOV_D_R, (ftmp & 31), (tmp & 31));
+      emit_cond_branch(as, CC_LO, l_next);
+      emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32), tisnum, tmp);
+      emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key.n));
+    }
+  } else if (irt_isaddr(kt)) {
+    Reg scr;
+    if (isk) {
+      int64_t kk = ((int64_t)irt_toitype(irkey->t) << 47) | irkey[1].tv.u64;
+      scr = ra_allock(as, kk, allow);
+      emit_nm(as, A64I_CMPx, scr, tmp);
+      emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key.u64));
+    } else {
+      scr = ra_scratch(as, allow);
+      emit_nm(as, A64I_CMPx, tmp, scr);
+      emit_lso(as, A64I_LDRx, scr, dest, offsetof(Node, key.u64));
+    }
+    rset_clear(allow, scr);
+  } else {
+    Reg type, scr;
+    lua_assert(irt_ispri(kt) && !irt_isnil(kt));
+    type = ra_allock(as, ~((int64_t)~irt_toitype(ir->t) << 47), allow);
+    scr = ra_scratch(as, rset_clear(allow, type));
+    rset_clear(allow, scr);
+    emit_nm(as, A64I_CMPw, scr, type);
+    emit_lso(as, A64I_LDRx, scr, dest, offsetof(Node, key));
+  }
+
+  *l_loop = A64I_BCC | A64F_S19((as->mcp-l_loop) & 0x0007ffffu) | CC_NE;
+  if (!isk && irt_isaddr(kt)) {
+    Reg type = ra_allock(as, (int32_t)irt_toitype(kt), allow);
+    emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 47), tmp, key, type);
+    rset_clear(allow, type);
+  }
+  /* Load main position relative to tab->node into dest. */
+  khash = isk ? ir_khash(irkey) : 1;
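+  /* A constant key with hash 0 always maps to the first node, so loading
+  ** tab->node suffices. Variable keys get a nonzero placeholder to force
+  ** the full hash computation below.
+  */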
+  if (khash == 0) {
+    emit_lso(as, A64I_LDRx, dest, tab, offsetof(GCtab, node));
+  } else {
+    emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 3), dest, tmp, dest);
+    emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 1), dest, dest, dest);
+    emit_lso(as, A64I_LDRx, tmp, tab, offsetof(GCtab, node));
+    if (isk) {
+      Reg tmphash = ra_allock(as, khash, allow);
+      emit_dnm(as, A64I_ANDw, dest, dest, tmphash);
+      emit_lso(as, A64I_LDRw, dest, tab, offsetof(GCtab, hmask));
+    } else if (irt_isstr(kt)) {
+      /* Fetch of str->hash is cheaper than ra_allock. */
+      emit_dnm(as, A64I_ANDw, dest, dest, tmp);
+      emit_lso(as, A64I_LDRw, tmp, key, offsetof(GCstr, hash));
+      emit_lso(as, A64I_LDRw, dest, tab, offsetof(GCtab, hmask));
+    } else {  /* Must match with hash*() in lj_tab.c. */
+      emit_dnm(as, A64I_ANDw, dest, dest, tmp);
+      emit_lso(as, A64I_LDRw, tmp, tab, offsetof(GCtab, hmask));
+      emit_dnm(as, A64I_SUBw, dest, dest, tmp);
+      emit_dnm(as, A64I_EXTRw | (A64F_IMMS(32-HASH_ROT3)), tmp, tmp, tmp);
+      emit_dnm(as, A64I_EORw, dest, dest, tmp);
+      emit_dnm(as, A64I_EXTRw | (A64F_IMMS(32-HASH_ROT2)), dest, dest, dest);
+      emit_dnm(as, A64I_SUBw, tmp, tmp, dest);
+      emit_dnm(as, A64I_EXTRw | (A64F_IMMS(32-HASH_ROT1)), dest, dest, dest);
+      emit_dnm(as, A64I_EORw, tmp, tmp, dest);
+      if (irt_isnum(kt)) {
+	emit_dnm(as, A64I_ADDw, dest, dest, dest);
+	emit_dn(as, A64I_LSRx | A64F_IMMR(32)|A64F_IMMS(32), dest, dest);
+	emit_dm(as, A64I_MOVw, tmp, dest);
+	emit_dn(as, A64I_FMOV_R_D, dest, (key & 31));
+      } else {
+	checkmclim(as);
+	emit_dm(as, A64I_MOVw, tmp, key);
+	emit_dnm(as, A64I_EORw, dest, dest,
+		 ra_allock(as, irt_toitype(kt) << 15, allow));
+	emit_dn(as, A64I_LSRx | A64F_IMMR(32)|A64F_IMMS(32), dest, dest);
+	emit_dm(as, A64I_MOVx, dest, key);
+      }
+    }
+  }
+}
+
+static void asm_hrefk(ASMState *as, IRIns *ir)
+{
+  IRIns *kslot = IR(ir->op2);
+  IRIns *irkey = IR(kslot->op1);
+  int32_t ofs = (int32_t)(kslot->op2 * sizeof(Node));
+  int32_t kofs = ofs + (int32_t)offsetof(Node, key);
+  int bigofs = !emit_checkofs(A64I_LDRx, ofs);
+  RegSet allow = RSET_GPR;
+  Reg dest = (ra_used(ir) || bigofs) ? ra_dest(as, ir, RSET_GPR) : RID_NONE;
+  Reg node = ra_alloc1(as, ir->op1, allow);
+  Reg key = ra_scratch(as, rset_clear(allow, node));
+  Reg idx = node;
+  uint64_t k;
+  lua_assert(ofs % sizeof(Node) == 0);
+  rset_clear(allow, key);
+  if (bigofs) {
+    idx = dest;
+    rset_clear(allow, dest);
+    kofs = (int32_t)offsetof(Node, key);
+  } else if (ra_hasreg(dest)) {
+    emit_opk(as, A64I_ADDx, dest, node, ofs, allow);
+  }
+  asm_guardcc(as, CC_NE);
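+  /* Compose the canonical 64 bit tagged representation of the key. */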
+  if (irt_ispri(irkey->t)) {
+    k = ~((int64_t)~irt_toitype(irkey->t) << 47);
+  } else if (irt_isnum(irkey->t)) {
+    k = ir_knum(irkey)->u64;
+  } else {
+    k = ((uint64_t)irt_toitype(irkey->t) << 47) | (uint64_t)ir_kgc(irkey);
+  }
+  emit_nm(as, A64I_CMPx, key, ra_allock(as, k, allow));
+  emit_lso(as, A64I_LDRx, key, idx, kofs);
+  if (bigofs)
+    emit_opk(as, A64I_ADDx, dest, node, ofs, RSET_GPR);
+}
+
+static void asm_uref(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  if (irref_isk(ir->op1)) {
+    GCfunc *fn = ir_kfunc(IR(ir->op1));
+    MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v;
+    emit_lsptr(as, A64I_LDRx, dest, v);
+  } else {
+    Reg uv = ra_scratch(as, RSET_GPR);
+    Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
+    if (ir->o == IR_UREFC) {
+      asm_guardcc(as, CC_NE);
+      emit_n(as, (A64I_CMPx^A64I_K12) | A64F_U12(1), RID_TMP);
+      emit_opk(as, A64I_ADDx, dest, uv,
+	       (int32_t)offsetof(GCupval, tv), RSET_GPR);
+      emit_lso(as, A64I_LDRB, RID_TMP, uv, (int32_t)offsetof(GCupval, closed));
+    } else {
+      emit_lso(as, A64I_LDRx, dest, uv, (int32_t)offsetof(GCupval, v));
+    }
+    emit_lso(as, A64I_LDRx, uv, func,
+	     (int32_t)offsetof(GCfuncL, uvptr) + 8*(int32_t)(ir->op2 >> 8));
+  }
+}
+
+static void asm_fref(ASMState *as, IRIns *ir)
+{
+  UNUSED(as); UNUSED(ir);
+  lua_assert(!ra_used(ir));
+}
+
+static void asm_strref(ASMState *as, IRIns *ir)
+{
+  RegSet allow = RSET_GPR;
+  Reg dest = ra_dest(as, ir, allow);
+  Reg base = ra_alloc1(as, ir->op1, allow);
+  IRIns *irr = IR(ir->op2);
+  int32_t ofs = sizeof(GCstr);
+  uint32_t m;
+  rset_clear(allow, base);
+  if (irref_isk(ir->op2) && (m = emit_isk12(ofs + irr->i))) {
+    emit_dn(as, A64I_ADDx^m, dest, base);
+  } else {
+    emit_dn(as, (A64I_ADDx^A64I_K12) | A64F_U12(ofs), dest, dest);
+    emit_dnm(as, A64I_ADDx, dest, base, ra_alloc1(as, ir->op2, allow));
+  }
+}
+
+/* -- Loads and stores ---------------------------------------------------- */
+
+static A64Ins asm_fxloadins(IRIns *ir)
+{
+  switch (irt_type(ir->t)) {
+  case IRT_I8: return A64I_LDRB ^ A64I_LS_S;
+  case IRT_U8: return A64I_LDRB;
+  case IRT_I16: return A64I_LDRH ^ A64I_LS_S;
+  case IRT_U16: return A64I_LDRH;
+  case IRT_NUM: return A64I_LDRd;
+  case IRT_FLOAT: return A64I_LDRs;
+  default: return irt_is64(ir->t) ? A64I_LDRx : A64I_LDRw;
+  }
+}
+
+static A64Ins asm_fxstoreins(IRIns *ir)
+{
+  switch (irt_type(ir->t)) {
+  case IRT_I8: case IRT_U8: return A64I_STRB;
+  case IRT_I16: case IRT_U16: return A64I_STRH;
+  case IRT_NUM: return A64I_STRd;
+  case IRT_FLOAT: return A64I_STRs;
+  default: return irt_is64(ir->t) ? A64I_STRx : A64I_STRw;
+  }
+}
+
+static void asm_fload(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  Reg idx;
+  A64Ins ai = asm_fxloadins(ir);
+  int32_t ofs;
+  if (ir->op1 == REF_NIL) {
+    idx = RID_GL;
+    ofs = (ir->op2 << 2) - GG_OFS(g);
+  } else {
+    idx = ra_alloc1(as, ir->op1, RSET_GPR);
+    if (ir->op2 == IRFL_TAB_ARRAY) {
+      ofs = asm_fuseabase(as, ir->op1);
+      if (ofs) {  /* Turn the t->array load into an add for colocated arrays. */
+	emit_dn(as, (A64I_ADDx^A64I_K12) | A64F_U12(ofs), dest, idx);
+	return;
+      }
+    }
+    ofs = field_ofs[ir->op2];
+  }
+  emit_lso(as, ai, (dest & 31), idx, ofs);
+}
+
+static void asm_fstore(ASMState *as, IRIns *ir)
+{
+  if (ir->r != RID_SINK) {
+    Reg src = ra_alloc1(as, ir->op2, RSET_GPR);
+    IRIns *irf = IR(ir->op1);
+    Reg idx = ra_alloc1(as, irf->op1, rset_exclude(RSET_GPR, src));
+    int32_t ofs = field_ofs[irf->op2];
+    emit_lso(as, asm_fxstoreins(ir), (src & 31), idx, ofs);
+  }
+}
+
+static void asm_xload(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, irt_isfp(ir->t) ? RSET_FPR : RSET_GPR);
+  lua_assert(!(ir->op2 & IRXLOAD_UNALIGNED));
+  asm_fusexref(as, asm_fxloadins(ir), dest, ir->op1, RSET_GPR);
+}
+
+static void asm_xstore(ASMState *as, IRIns *ir)
+{
+  if (ir->r != RID_SINK) {
+    Reg src = ra_alloc1(as, ir->op2, irt_isfp(ir->t) ? RSET_FPR : RSET_GPR);
+    asm_fusexref(as, asm_fxstoreins(ir), src, ir->op1,
+		 rset_exclude(RSET_GPR, src));
+  }
+}
+
+static void asm_ahuvload(ASMState *as, IRIns *ir)
+{
+  Reg idx, tmp, type;
+  int32_t ofs = 0;
+  RegSet gpr = RSET_GPR, allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR;
+  lua_assert(irt_isnum(ir->t) || irt_ispri(ir->t) || irt_isaddr(ir->t) ||
+	     irt_isint(ir->t));
+  if (ra_used(ir)) {
+    Reg dest = ra_dest(as, ir, allow);
+    tmp = irt_isnum(ir->t) ? ra_scratch(as, rset_clear(gpr, dest)) : dest;
+    if (irt_isaddr(ir->t)) {
+      emit_dn(as, A64I_ANDx^emit_isk13(LJ_GCVMASK, 1), dest, dest);
+    } else if (irt_isnum(ir->t)) {
+      emit_dn(as, A64I_FMOV_D_R, (dest & 31), tmp);
+    } else if (irt_isint(ir->t)) {
+      emit_dm(as, A64I_MOVw, dest, dest);
+    }
+  } else {
+    tmp = ra_scratch(as, gpr);
+  }
+  type = ra_scratch(as, rset_clear(gpr, tmp));
+  idx = asm_fuseahuref(as, ir->op1, &ofs, rset_clear(gpr, type), A64I_LDRx);
+  /* Always do the type check, even if the load result is unused. */
+  asm_guardcc(as, irt_isnum(ir->t) ? CC_LS : CC_NE);
+  if (irt_type(ir->t) >= IRT_NUM) {
+    lua_assert(irt_isinteger(ir->t) || irt_isnum(ir->t));
+    emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32),
+	    ra_allock(as, LJ_TISNUM << 15, rset_exclude(gpr, idx)), tmp);
+  } else if (irt_isaddr(ir->t)) {
+    emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(-irt_toitype(ir->t)), type);
+    emit_dn(as, A64I_ASRx | A64F_IMMR(47), type, tmp);
+  } else if (irt_isnil(ir->t)) {
+    emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(1), tmp);
+  } else {
+    emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32),
+	    ra_allock(as, (irt_toitype(ir->t) << 15) | 0x7fff, allow), tmp);
+  }
+  if (ofs & FUSE_REG)
+    emit_dnm(as, (A64I_LDRx^A64I_LS_R)|A64I_LS_UXTWx, tmp, idx, (ofs & 31));
+  else
+    emit_lso(as, A64I_LDRx, tmp, idx, ofs);
+}
+
+static void asm_ahustore(ASMState *as, IRIns *ir)
+{
+  if (ir->r != RID_SINK) {
+    RegSet allow = RSET_GPR;
+    Reg idx, src = RID_NONE, tmp = RID_TMP, type = RID_NONE;
+    int32_t ofs = 0;
+    if (irt_isnum(ir->t)) {
+      src = ra_alloc1(as, ir->op2, RSET_FPR);
+      idx = asm_fuseahuref(as, ir->op1, &ofs, allow, A64I_STRd);
+      if (ofs & FUSE_REG)
+	emit_dnm(as, (A64I_STRd^A64I_LS_R)|A64I_LS_UXTWx, (src & 31), idx,
+		     (ofs & 31));
+      else
+	emit_lso(as, A64I_STRd, (src & 31), idx, ofs);
+    } else {
+      if (!irt_ispri(ir->t)) {
+	src = ra_alloc1(as, ir->op2, allow);
+	rset_clear(allow, src);
+	if (irt_isinteger(ir->t))
+	  type = ra_allock(as, (int64_t)LJ_TISNUM << 47, allow);
+	else
+	  type = ra_allock(as, irt_toitype(ir->t), allow);
+      } else {
+	tmp = type = ra_allock(as, ~((int64_t)~irt_toitype(ir->t)<<47), allow);
+      }
+      idx = asm_fuseahuref(as, ir->op1, &ofs, rset_exclude(allow, type),
+			   A64I_STRx);
+      if (ofs & FUSE_REG)
+	emit_dnm(as, (A64I_STRx^A64I_LS_R)|A64I_LS_UXTWx, tmp, idx, (ofs & 31));
+      else
+	emit_lso(as, A64I_STRx, tmp, idx, ofs);
+      if (ra_hasreg(src)) {
+	if (irt_isinteger(ir->t)) {
+	  emit_dnm(as, A64I_ADDx | A64F_EX(A64EX_UXTW), tmp, type, src);
+	} else {
+	  emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 47), tmp, src, type);
+	}
+      }
+    }
+  }
+}
+
+static void asm_sload(ASMState *as, IRIns *ir)
+{
+  int32_t ofs = 8*((int32_t)ir->op1-2);
+  IRType1 t = ir->t;
+  Reg dest = RID_NONE, base;
+  RegSet allow = RSET_GPR;
+  lua_assert(!(ir->op2 & IRSLOAD_PARENT));  /* Handled by asm_head_side(). */
+  lua_assert(irt_isguard(t) || !(ir->op2 & IRSLOAD_TYPECHECK));
+  if ((ir->op2 & IRSLOAD_CONVERT) && irt_isguard(t) && irt_isint(t)) {
+    dest = ra_scratch(as, RSET_FPR);
+    asm_tointg(as, ir, dest);
+    t.irt = IRT_NUM;  /* Continue with a regular number type check. */
+  } else if (ra_used(ir)) {
+    Reg tmp = RID_NONE;
+    if ((ir->op2 & IRSLOAD_CONVERT))
+      tmp = ra_scratch(as, irt_isint(t) ? RSET_FPR : RSET_GPR);
+    lua_assert((irt_isnum(t)) || irt_isint(t) || irt_isaddr(t));
+    dest = ra_dest(as, ir, irt_isnum(t) ? RSET_FPR : allow);
+    base = ra_alloc1(as, REF_BASE, rset_clear(allow, dest));
+    if (irt_isaddr(t)) {
+      emit_dn(as, A64I_ANDx^emit_isk13(LJ_GCVMASK, 1), dest, dest);
+    } else if ((ir->op2 & IRSLOAD_CONVERT)) {
+      if (irt_isint(t)) {
+	emit_dn(as, A64I_FCVT_S32_F64, dest, (tmp & 31));
+	/* If value is already loaded for type check, move it to FPR. */
+	if ((ir->op2 & IRSLOAD_TYPECHECK))
+	  emit_dn(as, A64I_FMOV_D_R, (tmp & 31), dest);
+	else
+	  dest = tmp;
+	t.irt = IRT_NUM;  /* Check for original type. */
+      } else {
+	emit_dn(as, A64I_FCVT_F64_S32, (dest & 31), tmp);
+	dest = tmp;
+	t.irt = IRT_INT;  /* Check for original type. */
+      }
+    } else if (irt_isint(t) && (ir->op2 & IRSLOAD_TYPECHECK)) {
+      emit_dm(as, A64I_MOVw, dest, dest);
+    }
+    goto dotypecheck;
+  }
+  base = ra_alloc1(as, REF_BASE, allow);
+dotypecheck:
+  rset_clear(allow, base);
+  if ((ir->op2 & IRSLOAD_TYPECHECK)) {
+    Reg tmp;
+    if (ra_hasreg(dest) && rset_test(RSET_GPR, dest)) {
+      tmp = dest;
+    } else {
+      tmp = ra_scratch(as, allow);
+      rset_clear(allow, tmp);
+    }
+    if (irt_isnum(t) && !(ir->op2 & IRSLOAD_CONVERT))
+      emit_dn(as, A64I_FMOV_D_R, (dest & 31), tmp);
+    /* Need type check, even if the load result is unused. */
+    asm_guardcc(as, irt_isnum(t) ? CC_LS : CC_NE);
+    if (irt_type(t) >= IRT_NUM) {
+      lua_assert(irt_isinteger(t) || irt_isnum(t));
+      emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32),
+	      ra_allock(as, LJ_TISNUM << 15, allow), tmp);
+    } else if (irt_isnil(t)) {
+      emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(1), tmp);
+    } else if (irt_ispri(t)) {
+      emit_nm(as, A64I_CMPx,
+	      ra_allock(as, ~((int64_t)~irt_toitype(t) << 47), allow), tmp);
+    } else {
+      Reg type = ra_scratch(as, allow);
+      emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(-irt_toitype(t)), type);
+      emit_dn(as, A64I_ASRx | A64F_IMMR(47), type, tmp);
+    }
+    emit_lso(as, A64I_LDRx, tmp, base, ofs);
+    return;
+  }
+  if (ra_hasreg(dest)) {
+    emit_lso(as, irt_isnum(t) ? A64I_LDRd :
+	     (irt_isint(t) ? A64I_LDRw : A64I_LDRx), (dest & 31), base, ofs);
+  }
+}
+
+/* -- Allocations --------------------------------------------------------- */
+
+#if LJ_HASFFI
+static void asm_cnew(ASMState *as, IRIns *ir)
+{
+  CTState *cts = ctype_ctsG(J2G(as->J));
+  CTypeID id = (CTypeID)IR(ir->op1)->i;
+  CTSize sz;
+  CTInfo info = lj_ctype_info(cts, id, &sz);
+  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_mem_newgco];
+  IRRef args[4];
+  RegSet allow = (RSET_GPR & ~RSET_SCRATCH);
+  lua_assert(sz != CTSIZE_INVALID || (ir->o == IR_CNEW && ir->op2 != REF_NIL));
+
+  as->gcsteps++;
+  asm_setupresult(as, ir, ci);  /* GCcdata * */
+  /* Initialize immutable cdata object. */
+  if (ir->o == IR_CNEWI) {
+    int32_t ofs = sizeof(GCcdata);
+    Reg r = ra_alloc1(as, ir->op2, allow);
+    lua_assert(sz == 4 || sz == 8);
+    emit_lso(as, sz == 8 ? A64I_STRx : A64I_STRw, r, RID_RET, ofs);
+  } else if (ir->op2 != REF_NIL) {  /* Create VLA/VLS/aligned cdata. */
+    ci = &lj_ir_callinfo[IRCALL_lj_cdata_newv];
+    args[0] = ASMREF_L;     /* lua_State *L */
+    args[1] = ir->op1;      /* CTypeID id   */
+    args[2] = ir->op2;      /* CTSize sz    */
+    args[3] = ASMREF_TMP1;  /* CTSize align */
+    asm_gencall(as, ci, args);
+    emit_loadi(as, ra_releasetmp(as, ASMREF_TMP1), (int32_t)ctype_align(info));
+    return;
+  }
+
+  /* Initialize gct and ctypeid. lj_mem_newgco() already sets marked. */
+  {
+    Reg r = (id < 65536) ? RID_X1 : ra_allock(as, id, allow);
+    emit_lso(as, A64I_STRB, RID_TMP, RID_RET, offsetof(GCcdata, gct));
+    emit_lso(as, A64I_STRH, r, RID_RET, offsetof(GCcdata, ctypeid));
+    emit_d(as, A64I_MOVZw | A64F_U16(~LJ_TCDATA), RID_TMP);
+    if (id < 65536) emit_d(as, A64I_MOVZw | A64F_U16(id), RID_X1);
+  }
+  args[0] = ASMREF_L;     /* lua_State *L */
+  args[1] = ASMREF_TMP1;  /* MSize size   */
+  asm_gencall(as, ci, args);
+  ra_allockreg(as, (int32_t)(sz+sizeof(GCcdata)),
+	       ra_releasetmp(as, ASMREF_TMP1));
+}
+#else
+#define asm_cnew(as, ir)	((void)0)
+#endif
+
+/* -- Write barriers ------------------------------------------------------ */
+
+static void asm_tbar(ASMState *as, IRIns *ir)
+{
+  Reg tab = ra_alloc1(as, ir->op1, RSET_GPR);
+  Reg link = ra_scratch(as, rset_exclude(RSET_GPR, tab));
+  Reg gr = ra_allock(as, i64ptr(J2G(as->J)),
+		     rset_exclude(rset_exclude(RSET_GPR, tab), link));
+  Reg mark = RID_TMP;
+  MCLabel l_end = emit_label(as);
+  emit_lso(as, A64I_STRx, link, tab, (int32_t)offsetof(GCtab, gclist));
+  emit_lso(as, A64I_STRB, mark, tab, (int32_t)offsetof(GCtab, marked));
+  emit_lso(as, A64I_STRx, tab, gr,
+	   (int32_t)offsetof(global_State, gc.grayagain));
+  emit_dn(as, A64I_ANDw^emit_isk13(~LJ_GC_BLACK, 0), mark, mark);
+  emit_lso(as, A64I_LDRx, link, gr,
+	   (int32_t)offsetof(global_State, gc.grayagain));
+  emit_cond_branch(as, CC_EQ, l_end);
+  emit_n(as, A64I_TSTw^emit_isk13(LJ_GC_BLACK, 0), mark);
+  emit_lso(as, A64I_LDRB, mark, tab, (int32_t)offsetof(GCtab, marked));
+}
+
+static void asm_obar(ASMState *as, IRIns *ir)
+{
+  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_barrieruv];
+  IRRef args[2];
+  MCLabel l_end;
+  RegSet allow = RSET_GPR;
+  Reg obj, val, tmp;
+  /* No need for other object barriers (yet). */
+  lua_assert(IR(ir->op1)->o == IR_UREFC);
+  ra_evictset(as, RSET_SCRATCH);
+  l_end = emit_label(as);
+  args[0] = ASMREF_TMP1;  /* global_State *g */
+  args[1] = ir->op1;      /* TValue *tv      */
+  asm_gencall(as, ci, args);
+  ra_allockreg(as, i64ptr(J2G(as->J)), ra_releasetmp(as, ASMREF_TMP1));
+  obj = IR(ir->op1)->r;
+  tmp = ra_scratch(as, rset_exclude(allow, obj));
+  emit_cond_branch(as, CC_EQ, l_end);
+  emit_n(as, A64I_TSTw^emit_isk13(LJ_GC_BLACK, 0), tmp);
+  emit_cond_branch(as, CC_EQ, l_end);
+  emit_n(as, A64I_TSTw^emit_isk13(LJ_GC_WHITES, 0), RID_TMP);
+  val = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, obj));
+  emit_lso(as, A64I_LDRB, tmp, obj,
+	   (int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv));
+  emit_lso(as, A64I_LDRB, RID_TMP, val, (int32_t)offsetof(GChead, marked));
+}
+
+/* -- Arithmetic and logic operations ------------------------------------- */
+
+static void asm_fparith(ASMState *as, IRIns *ir, A64Ins ai)
+{
+  Reg dest = ra_dest(as, ir, RSET_FPR);
+  Reg right, left = ra_alloc2(as, ir, RSET_FPR);
+  right = (left >> 8); left &= 255;
+  emit_dnm(as, ai, (dest & 31), (left & 31), (right & 31));
+}
+
+static void asm_fpunary(ASMState *as, IRIns *ir, A64Ins ai)
+{
+  Reg dest = ra_dest(as, ir, RSET_FPR);
+  Reg left = ra_hintalloc(as, ir->op1, dest, RSET_FPR);
+  emit_dn(as, ai, (dest & 31), (left & 31));
+}
+
+static void asm_fpmath(ASMState *as, IRIns *ir)
+{
+  IRFPMathOp fpm = (IRFPMathOp)ir->op2;
+  if (fpm == IRFPM_SQRT) {
+    asm_fpunary(as, ir, A64I_FSQRTd);
+  } else if (fpm <= IRFPM_TRUNC) {
+    asm_fpunary(as, ir, fpm == IRFPM_FLOOR ? A64I_FRINTMd :
+			fpm == IRFPM_CEIL ? A64I_FRINTPd : A64I_FRINTZd);
+  } else if (fpm == IRFPM_EXP2 && asm_fpjoin_pow(as, ir)) {
+    return;
+  } else {
+    asm_callid(as, ir, IRCALL_lj_vm_floor + fpm);
+  }
+}
+
+static int asm_swapops(ASMState *as, IRRef lref, IRRef rref)
+{
+  IRIns *ir;
+  if (irref_isk(rref))
+    return 0;  /* Don't swap constants to the left. */
+  if (irref_isk(lref))
+    return 1;  /* But swap constants to the right. */
+  ir = IR(rref);
+  if ((ir->o >= IR_BSHL && ir->o <= IR_BSAR) ||
+      (ir->o == IR_ADD && ir->op1 == ir->op2) ||
+      (ir->o == IR_CONV && ir->op2 == ((IRT_I64<<IRCONV_DSH)|IRT_INT|IRCONV_SEXT)))
+    return 0;  /* Don't swap fusable operands to the left. */
+  ir = IR(lref);
+  if ((ir->o >= IR_BSHL && ir->o <= IR_BSAR) ||
+      (ir->o == IR_ADD && ir->op1 == ir->op2) ||
+      (ir->o == IR_CONV && ir->op2 == ((IRT_I64<<IRCONV_DSH)|IRT_INT|IRCONV_SEXT)))
+    return 1;  /* But swap fusable operands to the right. */
+  return 0;  /* Otherwise don't swap. */
+}
+
+static void asm_intop(ASMState *as, IRIns *ir, A64Ins ai)
+{
+  IRRef lref = ir->op1, rref = ir->op2;
+  Reg left, dest = ra_dest(as, ir, RSET_GPR);
+  uint32_t m;
+  if ((ai & ~A64I_S) != A64I_SUBw && asm_swapops(as, lref, rref)) {
+    IRRef tmp = lref; lref = rref; rref = tmp;
+  }
+  left = ra_hintalloc(as, lref, dest, RSET_GPR);
+  if (irt_is64(ir->t)) ai |= A64I_X;
+  m = asm_fuseopm(as, ai, rref, rset_exclude(RSET_GPR, left));
+  if (irt_isguard(ir->t)) {  /* For IR_ADDOV etc. */
+    asm_guardcc(as, CC_VS);
+    ai |= A64I_S;
+  }
+  emit_dn(as, ai^m, dest, left);
+}
+
+static void asm_intop_s(ASMState *as, IRIns *ir, A64Ins ai)
+{
+  if (as->flagmcp == as->mcp) {  /* Drop cmp r, #0. */
+    as->flagmcp = NULL;
+    as->mcp++;
+    ai |= A64I_S;
+  }
+  asm_intop(as, ir, ai);
+}
+
+static void asm_intneg(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  Reg left = ra_hintalloc(as, ir->op1, dest, RSET_GPR);
+  emit_dm(as, irt_is64(ir->t) ? A64I_NEGx : A64I_NEGw, dest, left);
+}
+
+/* NYI: use add/shift for MUL(OV) with constants. FOLD only does 2^k. */
+static void asm_intmul(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  Reg left = ra_alloc1(as, ir->op1, rset_exclude(RSET_GPR, dest));
+  Reg right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left));
+  if (irt_isguard(ir->t)) {  /* IR_MULOV */
+    asm_guardcc(as, CC_NE);
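+    /* The 64 bit product's high word must match the sign extension of its
+    ** low word, or the 32 bit multiply overflowed.
+    */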
+    emit_dm(as, A64I_MOVw, dest, dest);  /* Zero-extend. */
+    emit_nm(as, A64I_CMPw | A64F_SH(A64SH_ASR, 31), RID_TMP, dest);
+    emit_dn(as, A64I_ASRx | A64F_IMMR(32), RID_TMP, dest);
+    emit_dnm(as, A64I_SMULL, dest, right, left);
+  } else {
+    emit_dnm(as, irt_is64(ir->t) ? A64I_MULx : A64I_MULw, dest, left, right);
+  }
+}
+
+static void asm_add(ASMState *as, IRIns *ir)
+{
+  if (irt_isnum(ir->t)) {
+    asm_fparith(as, ir, A64I_FADDd);
+    return;
+  }
+  asm_intop_s(as, ir, A64I_ADDw);
+}
+
+static void asm_sub(ASMState *as, IRIns *ir)
+{
+  if (irt_isnum(ir->t)) {
+    asm_fparith(as, ir, A64I_FSUBd);
+    return;
+  }
+  asm_intop_s(as, ir, A64I_SUBw);
+}
+
+static void asm_mul(ASMState *as, IRIns *ir)
+{
+  if (irt_isnum(ir->t)) {
+    asm_fparith(as, ir, A64I_FMULd);
+    return;
+  }
+  asm_intmul(as, ir);
+}
+
+static void asm_div(ASMState *as, IRIns *ir)
+{
+#if LJ_HASFFI
+  if (!irt_isnum(ir->t))
+    asm_callid(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_divi64 :
+					  IRCALL_lj_carith_divu64);
+  else
+#endif
+    asm_fparith(as, ir, A64I_FDIVd);
+}
+
+static void asm_pow(ASMState *as, IRIns *ir)
+{
+#if LJ_HASFFI
+  if (!irt_isnum(ir->t))
+    asm_callid(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 :
+					  IRCALL_lj_carith_powu64);
+  else
+#endif
+    asm_callid(as, ir, IRCALL_lj_vm_powi);
+}
+
+#define asm_addov(as, ir)	asm_add(as, ir)
+#define asm_subov(as, ir)	asm_sub(as, ir)
+#define asm_mulov(as, ir)	asm_mul(as, ir)
+
+#define asm_abs(as, ir)		asm_fpunary(as, ir, A64I_FABS)
+#define asm_atan2(as, ir)	asm_callid(as, ir, IRCALL_atan2)
+#define asm_ldexp(as, ir)	asm_callid(as, ir, IRCALL_ldexp)
+
+static void asm_mod(ASMState *as, IRIns *ir)
+{
+#if LJ_HASFFI
+  if (!irt_isint(ir->t))
+    asm_callid(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_modi64 :
+					  IRCALL_lj_carith_modu64);
+  else
+#endif
+    asm_callid(as, ir, IRCALL_lj_vm_modi);
+}
+
+static void asm_neg(ASMState *as, IRIns *ir)
+{
+  if (irt_isnum(ir->t)) {
+    asm_fpunary(as, ir, A64I_FNEGd);
+    return;
+  }
+  asm_intneg(as, ir);
+}
+
+static void asm_bitop(ASMState *as, IRIns *ir, A64Ins ai)
+{
+  if (as->flagmcp == as->mcp && ai == A64I_ANDw) {
+    /* Try to drop cmp r, #0. */
+    as->flagmcp = NULL;
+    as->mcp++;
+    ai += A64I_ANDSw - A64I_ANDw;
+  }
+  if (ir->op2 == 0) {
+    Reg dest = ra_dest(as, ir, RSET_GPR);
+    uint32_t m = asm_fuseopm(as, ai, ir->op1, RSET_GPR);
+    if (irt_is64(ir->t)) ai |= A64I_X;
+    emit_d(as, ai^m, dest);
+  } else {
+    asm_intop(as, ir, ai);
+  }
+}
+
+#define asm_bnot(as, ir)	asm_bitop(as, ir, A64I_MVNw)
+#define asm_band(as, ir)	asm_bitop(as, ir, A64I_ANDw)
+#define asm_bor(as, ir)		asm_bitop(as, ir, A64I_ORRw)
+#define asm_bxor(as, ir)	asm_bitop(as, ir, A64I_EORw)
+
+static void asm_bswap(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  Reg left = ra_alloc1(as, ir->op1, RSET_GPR);
+  emit_dn(as, irt_is64(ir->t) ? A64I_REVx : A64I_REVw, dest, left);
+}
+
+static void asm_bitshift(ASMState *as, IRIns *ir, A64Ins ai, A64Shift sh)
+{
+  int shmask = irt_is64(ir->t) ? 63 : 31;
+  if (irref_isk(ir->op2)) {  /* Constant shifts. */
+    Reg dest = ra_dest(as, ir, RSET_GPR);
+    Reg left = ra_alloc1(as, ir->op1, RSET_GPR);
+    int32_t shift = (IR(ir->op2)->i & shmask);
+
+    if (shmask == 63) ai += A64I_UBFMx - A64I_UBFMw;
+    switch (sh) {
+    case A64SH_LSL:
+      emit_dn(as, ai | A64F_IMMS(shmask-shift) |
+		  A64F_IMMR(shmask-shift+1), dest, left);
+      break;
+    case A64SH_LSR: case A64SH_ASR:
+      emit_dn(as, ai | A64F_IMMS(shmask) | A64F_IMMR(shift), dest, left);
+      break;
+    case A64SH_ROR:
+      emit_dnm(as, ai | A64F_IMMS(shift), dest, left, left);
+      break;
+    }
+  } else {  /* Variable shift amounts. */
+    Reg dest = ra_dest(as, ir, RSET_GPR);
+    Reg left = ra_alloc1(as, ir->op1, RSET_GPR);
+    Reg right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left));
+    emit_dnm(as, (shmask == 63 ? A64I_SHRx : A64I_SHRw) | A64F_BSH(sh),
+		 dest, left, right);
+  }
+}
+
+#define asm_bshl(as, ir)	asm_bitshift(as, ir, A64I_UBFMw, A64SH_LSL)
+#define asm_bshr(as, ir)	asm_bitshift(as, ir, A64I_UBFMw, A64SH_LSR)
+#define asm_bsar(as, ir)	asm_bitshift(as, ir, A64I_SBFMw, A64SH_ASR)
+#define asm_bror(as, ir)	asm_bitshift(as, ir, A64I_EXTRw, A64SH_ROR)
+#define asm_brol(as, ir)	lua_assert(0)
+
+static void asm_intmin_max(ASMState *as, IRIns *ir, A64CC cc)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  Reg left = ra_hintalloc(as, ir->op1, dest, RSET_GPR);
+  Reg right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left));
+  emit_dnm(as, A64I_CSELw|A64F_CC(cc), dest, left, right);
+  emit_nm(as, A64I_CMPw, left, right);
+}
+
+static void asm_fpmin_max(ASMState *as, IRIns *ir, A64CC fcc)
+{
+  Reg dest = (ra_dest(as, ir, RSET_FPR) & 31);
+  Reg right, left = ra_alloc2(as, ir, RSET_FPR);
+  right = ((left >> 8) & 31); left &= 31;
+  emit_dnm(as, A64I_FCSELd | A64F_CC(fcc), dest, left, right);
+  emit_nm(as, A64I_FCMPd, left, right);
+}
+
+static void asm_min_max(ASMState *as, IRIns *ir, A64CC cc, A64CC fcc)
+{
+  if (irt_isnum(ir->t))
+    asm_fpmin_max(as, ir, fcc);
+  else
+    asm_intmin_max(as, ir, cc);
+}
+
+#define asm_max(as, ir)		asm_min_max(as, ir, CC_GT, CC_HI)
+#define asm_min(as, ir)		asm_min_max(as, ir, CC_LT, CC_LO)
+
+/* -- Comparisons --------------------------------------------------------- */
+
+/* Map of comparisons to flags. ORDER IR. */
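+/* Each entry holds the negated condition for the guard branch: low nibble
+** for the integer compare, high nibble for the FP compare (NaN exits, too).
+*/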
+static const uint8_t asm_compmap[IR_ABC+1] = {
+  /* op  FP swp  int cc   FP cc */
+  /* LT       */ CC_GE + (CC_HS << 4),
+  /* GE    x  */ CC_LT + (CC_HI << 4),
+  /* LE       */ CC_GT + (CC_HI << 4),
+  /* GT    x  */ CC_LE + (CC_HS << 4),
+  /* ULT   x  */ CC_HS + (CC_LS << 4),
+  /* UGE      */ CC_LO + (CC_LO << 4),
+  /* ULE   x  */ CC_HI + (CC_LO << 4),
+  /* UGT      */ CC_LS + (CC_LS << 4),
+  /* EQ       */ CC_NE + (CC_NE << 4),
+  /* NE       */ CC_EQ + (CC_EQ << 4),
+  /* ABC      */ CC_LS + (CC_LS << 4)  /* Same as UGT. */
+};
+
+/* FP comparisons. */
+static void asm_fpcomp(ASMState *as, IRIns *ir)
+{
+  Reg left, right;
+  A64Ins ai;
+  int swp = ((ir->o ^ (ir->o >> 2)) & ~(ir->o >> 3) & 1);
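+  /* The bit trick recovers the "FP swp" column of asm_compmap from ir->o. */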
+  if (!swp && irref_isk(ir->op2) && ir_knum(IR(ir->op2))->u64 == 0) {
+    left = (ra_alloc1(as, ir->op1, RSET_FPR) & 31);
+    right = 0;
+    ai = A64I_FCMPZd;
+  } else {
+    left = ra_alloc2(as, ir, RSET_FPR);
+    if (swp) {
+      right = (left & 31); left = ((left >> 8) & 31);
+    } else {
+      right = ((left >> 8) & 31); left &= 31;
+    }
+    ai = A64I_FCMPd;
+  }
+  asm_guardcc(as, (asm_compmap[ir->o] >> 4));
+  emit_nm(as, ai, left, right);
+}
+
+/* Integer comparisons. */
+static void asm_intcomp(ASMState *as, IRIns *ir)
+{
+  A64CC oldcc, cc = (asm_compmap[ir->o] & 15);
+  A64Ins ai = irt_is64(ir->t) ? A64I_CMPx : A64I_CMPw;
+  IRRef lref = ir->op1, rref = ir->op2;
+  Reg left;
+  uint32_t m;
+  int cmpprev0 = 0;
+  lua_assert(irt_is64(ir->t) || irt_isint(ir->t) ||
+	     irt_isu32(ir->t) || irt_isaddr(ir->t) || irt_isu8(ir->t));
+  if (asm_swapops(as, lref, rref)) {
+    IRRef tmp = lref; lref = rref; rref = tmp;
+    if (cc >= CC_GE) cc ^= 7;  /* LT <-> GT, LE <-> GE */
+    else if (cc > CC_NE) cc ^= 11;  /* LO <-> HI, LS <-> HS */
+  }
+  oldcc = cc;
+  if (irref_isk(rref) && IR(rref)->i == 0) {
+    IRIns *irl = IR(lref);
+    if (cc == CC_GE) cc = CC_PL;
+    else if (cc == CC_LT) cc = CC_MI;
+    else if (cc > CC_NE) goto notst;  /* Other conds don't work with tst. */
+    cmpprev0 = (irl+1 == ir);
+    /* Combine comp(BAND(left, right), 0) into tst left, right. */
+    if (cmpprev0 && irl->o == IR_BAND && !ra_used(irl)) {
+      IRRef blref = irl->op1, brref = irl->op2;
+      uint32_t m2 = 0;
+      Reg bleft;
+      if (asm_swapops(as, blref, brref)) {
+	IRRef tmp = blref; blref = brref; brref = tmp;
+      }
+      if (irref_isk(brref)) {
+	/* NYI: use tbz/tbnz, if applicable. */
+	m2 = emit_isk13(IR(brref)->i, irt_is64(irl->t));
+	if (!m2)
+	  goto notst;  /* Not beneficial if we miss a constant operand. */
+      }
+      bleft = ra_alloc1(as, blref, RSET_GPR);
+      ai = (irt_is64(irl->t) ? A64I_TSTx : A64I_TSTw);
+      if (!m2)
+	m2 = asm_fuseopm(as, ai, brref, rset_exclude(RSET_GPR, bleft));
+      asm_guardcc(as, cc);
+      emit_n(as, ai^m2, bleft);
+      return;
+    }
+    /* NYI: use cbz/cbnz for EQ/NE 0. */
+  }
+notst:
+  left = ra_alloc1(as, lref, RSET_GPR);
+  m = asm_fuseopm(as, ai, rref, rset_exclude(RSET_GPR, left));
+  asm_guardcc(as, cc);
+  emit_n(as, ai^m, left);
+  /* Signed comparison with zero and referencing previous ins? */
+  if (cmpprev0 && (oldcc <= CC_NE || oldcc >= CC_GE))
+    as->flagmcp = as->mcp;  /* Allow elimination of the compare. */
+}
+
+static void asm_comp(ASMState *as, IRIns *ir)
+{
+  if (irt_isnum(ir->t))
+    asm_fpcomp(as, ir);
+  else
+    asm_intcomp(as, ir);
+}
+
+#define asm_equal(as, ir)	asm_comp(as, ir)
+
+/* -- Support for 64 bit ops in 32 bit mode ------------------------------- */
+
+/* Hiword op of a split 64 bit op. Previous op must be the loword op. */
+static void asm_hiop(ASMState *as, IRIns *ir)
+{
+  UNUSED(as); UNUSED(ir); lua_assert(0);  /* Unused on 64 bit. */
+}
+
+/* -- Profiling ----------------------------------------------------------- */
+
+static void asm_prof(ASMState *as, IRIns *ir)
+{
+  uint32_t k = emit_isk13(HOOK_PROFILE, 0);
+  lua_assert(k != 0);
+  UNUSED(ir);
+  asm_guardcc(as, CC_NE);
+  emit_n(as, A64I_TSTw^k, RID_TMP);
+  emit_lsptr(as, A64I_LDRB, RID_TMP, (void *)&J2G(as->J)->hookmask);
+}
+
+/* -- Stack handling ------------------------------------------------------ */
+
+/* Check Lua stack size for overflow. Use exit handler as fallback. */
+static void asm_stack_check(ASMState *as, BCReg topslot,
+			    IRIns *irp, RegSet allow, ExitNo exitno)
+{
+  Reg pbase;
+  uint32_t k;
+  if (irp) {
+    if (!ra_hasspill(irp->s)) {
+      pbase = irp->r;
+      lua_assert(ra_hasreg(pbase));
+    } else if (allow) {
+      pbase = rset_pickbot(allow);
+    } else {
+      pbase = RID_RET;
+      emit_lso(as, A64I_LDRx, RID_RET, RID_SP, 0);  /* Restore temp register. */
+    }
+  } else {
+    pbase = RID_BASE;
+  }
+  emit_branch(as, A64I_BL, exitstub_addr(as->J, exitno));
+  emit_cond_branch(as, CC_LS^1, as->mcp+1);
+  k = emit_isk12(8*topslot);
+  lua_assert(k);
+  emit_n(as, A64I_CMPx^k, RID_TMP);
+  emit_dnm(as, A64I_SUBx, RID_TMP, RID_TMP, pbase);
+  emit_lso(as, A64I_LDRx, RID_TMP, RID_TMP,
+	   (int32_t)offsetof(lua_State, maxstack));
+  if (irp) {  /* Must not spill arbitrary registers in head of side trace. */
+    if (ra_hasspill(irp->s))
+      emit_lso(as, A64I_LDRx, pbase, RID_SP, sps_scale(irp->s));
+    emit_lso(as, A64I_LDRx, RID_TMP, RID_GL, glofs(as, &J2G(as->J)->cur_L));
+    if (ra_hasspill(irp->s) && !allow)
+      emit_lso(as, A64I_STRx, RID_RET, RID_SP, 0);  /* Save temp register. */
+  } else {
+    emit_getgl(as, RID_TMP, cur_L);
+  }
+}
+
+/* Restore Lua stack from on-trace state. */
+static void asm_stack_restore(ASMState *as, SnapShot *snap)
+{
+  SnapEntry *map = &as->T->snapmap[snap->mapofs];
+#ifdef LUA_USE_ASSERT
+  SnapEntry *flinks = &as->T->snapmap[snap_nextofs(as->T, snap)-1-LJ_FR2];
+#endif
+  MSize n, nent = snap->nent;
+  /* Store the value of all modified slots to the Lua stack. */
+  for (n = 0; n < nent; n++) {
+    SnapEntry sn = map[n];
+    BCReg s = snap_slot(sn);
+    int32_t ofs = 8*((int32_t)s-1-LJ_FR2);
+    IRRef ref = snap_ref(sn);
+    IRIns *ir = IR(ref);
+    if ((sn & SNAP_NORESTORE))
+      continue;
+    if (irt_isnum(ir->t)) {
+      Reg src = ra_alloc1(as, ref, RSET_FPR);
+      emit_lso(as, A64I_STRd, (src & 31), RID_BASE, ofs);
+    } else {
+      RegSet allow = rset_exclude(RSET_GPR, RID_BASE);
+      lua_assert(irt_ispri(ir->t) || irt_isaddr(ir->t) || irt_isinteger(ir->t));
+      if (!irref_isk(ref)) {
+	Reg type, src;
+	if (irt_is64(ir->t)) {
+	  type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow);
+	  src = ra_alloc1(as, ref, rset_exclude(allow, type));
+	  emit_lso(as, A64I_STRx, RID_TMP, RID_BASE, ofs);
+	  emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 47), RID_TMP, src, type);
+	} else if (irt_isinteger(ir->t)) {
+	  type = ra_allock(as, (int64_t)LJ_TISNUM << 47, allow);
+	  src = ra_alloc1(as, ref, rset_exclude(allow, type));
+	  emit_lso(as, A64I_STRx, RID_TMP, RID_BASE, ofs);
+	  emit_dnm(as, A64I_ADDx | A64F_EX(A64EX_UXTW), RID_TMP, type, src);
+	} else {
+	  type = ra_allock(as, ~((int64_t)~irt_toitype(ir->t) << 47), allow);
+	  emit_lso(as, A64I_STRx, type, RID_BASE, ofs);
+	}
+      } else {
+	TValue k;
+	lj_ir_kvalue(as->J->L, &k, ir);
+	emit_lso(as, A64I_STRx,
+		 ra_allock(as, tvisnil(&k) ? -1 : (int64_t)k.u64, allow),
+		 RID_BASE, ofs);
+      }
+    }
+    checkmclim(as);
+  }
+  lua_assert(map + nent == flinks);
+}
+
+/* -- GC handling --------------------------------------------------------- */
+
+/* Check GC threshold and do one or more GC steps. */
+static void asm_gc_check(ASMState *as)
+{
+  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_step_jit];
+  IRRef args[2];
+  MCLabel l_end;
+  Reg tmp1, tmp2;
+  ra_evictset(as, RSET_SCRATCH);
+  l_end = emit_label(as);
+  /* Exit trace if in GCSatomic or GCSfinalize. Avoids syncing GC objects. */
+  asm_guardcc(as, CC_NE);  /* Assumes asm_snap_prep() already done. */
+  emit_n(as, A64I_CMPx^A64I_K12, RID_RET);
+  args[0] = ASMREF_TMP1;  /* global_State *g */
+  args[1] = ASMREF_TMP2;  /* MSize steps     */
+  asm_gencall(as, ci, args);
+  tmp1 = ra_releasetmp(as, ASMREF_TMP1);
+  tmp2 = ra_releasetmp(as, ASMREF_TMP2);
+  emit_loadi(as, tmp2, as->gcsteps);
+  /* Jump around GC step if GC total < GC threshold. */
+  emit_cond_branch(as, CC_LS, l_end);
+  emit_nm(as, A64I_CMPx, RID_TMP, tmp2);
+  emit_lso(as, A64I_LDRx, tmp2, tmp1,
+	   (int32_t)offsetof(global_State, gc.threshold));
+  emit_lso(as, A64I_LDRx, RID_TMP, tmp1,
+	   (int32_t)offsetof(global_State, gc.total));
+  ra_allockreg(as, i64ptr(J2G(as->J)), tmp1);
+  as->gcsteps = 0;
+  checkmclim(as);
+}
+
+/* -- Loop handling ------------------------------------------------------- */
+
+/* Fixup the loop branch. */
+static void asm_loop_fixup(ASMState *as)
+{
+  MCode *p = as->mctop;
+  MCode *target = as->mcp;
+  if (as->loopinv) {  /* Inverted loop branch? */
+    ptrdiff_t delta = target - (p - 2);
+    lua_assert(((delta + 0x40000) >> 19) == 0);
+    /* asm_guardcc already inverted the b.cc and patched the final bl. */
+    p[-2] |= ((uint32_t)delta & 0x7ffff) << 5;
+  } else {
+    ptrdiff_t delta = target - (p - 1);
+    p[-1] = A64I_B | ((uint32_t)(delta) & 0x03ffffffu);
+  }
+}
+
+/* -- Head of trace ------------------------------------------------------- */
+
+/* Reload L register from g->cur_L. */
+static void asm_head_lreg(ASMState *as)
+{
+  IRIns *ir = IR(ASMREF_L);
+  if (ra_used(ir)) {
+    Reg r = ra_dest(as, ir, RSET_GPR);
+    emit_getgl(as, r, cur_L);
+    ra_evictk(as);
+  }
+}
+
+/* Coalesce BASE register for a root trace. */
+static void asm_head_root_base(ASMState *as)
+{
+  IRIns *ir;
+  asm_head_lreg(as);
+  ir = IR(REF_BASE);
+  if (ra_hasreg(ir->r) && (rset_test(as->modset, ir->r) || irt_ismarked(ir->t)))
+    ra_spill(as, ir);
+  ra_destreg(as, ir, RID_BASE);
+}
+
+/* Coalesce BASE register for a side trace. */
+static RegSet asm_head_side_base(ASMState *as, IRIns *irp, RegSet allow)
+{
+  IRIns *ir;
+  asm_head_lreg(as);
+  ir = IR(REF_BASE);
+  if (ra_hasreg(ir->r) && (rset_test(as->modset, ir->r) || irt_ismarked(ir->t)))
+    ra_spill(as, ir);
+  if (ra_hasspill(irp->s)) {
+    rset_clear(allow, ra_dest(as, ir, allow));
+  } else {
+    Reg r = irp->r;
+    lua_assert(ra_hasreg(r));
+    rset_clear(allow, r);
+    if (r != ir->r && !rset_test(as->freeset, r))
+      ra_restore(as, regcost_ref(as->cost[r]));
+    ra_destreg(as, ir, r);
+  }
+  return allow;
+}
+
+/* -- Tail of trace ------------------------------------------------------- */
+
+/* Fixup the tail code. */
+static void asm_tail_fixup(ASMState *as, TraceNo lnk)
+{
+  MCode *p = as->mctop;
+  MCode *target;
+  /* Undo the sp adjustment in BC_JLOOP when exiting to the interpreter. */
+  int32_t spadj = as->T->spadjust + (lnk ? 0 : sps_scale(SPS_FIXED));
+  if (spadj == 0) {
+    as->mctop = --p;
+  } else {
+    /* Patch stack adjustment. */
+    uint32_t k = emit_isk12(spadj);
+    lua_assert(k);
+    p[-2] = (A64I_ADDx^k) | A64F_D(RID_SP) | A64F_N(RID_SP);
+  }
+  /* Patch exit branch. */
+  target = lnk ? traceref(as->J, lnk)->mcode : (MCode *)lj_vm_exit_interp;
+  p[-1] = A64I_B | (((target-p)+1)&0x03ffffffu);
+}
+
+/* Prepare tail of code. */
+static void asm_tail_prep(ASMState *as)
+{
+  MCode *p = as->mctop - 1;  /* Leave room for exit branch. */
+  if (as->loopref) {
+    as->invmcp = as->mcp = p;
+  } else {
+    as->mcp = p-1;  /* Leave room for stack pointer adjustment. */
+    as->invmcp = NULL;
+  }
+  *p = 0;  /* Prevent load/store merging. */
+}
+
+/* -- Trace setup --------------------------------------------------------- */
+
+/* Ensure there are enough stack slots for call arguments. */
+static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci)
+{
+  IRRef args[CCI_NARGS_MAX*2];
+  uint32_t i, nargs = CCI_XNARGS(ci);
+  int nslots = 0, ngpr = REGARG_NUMGPR, nfpr = REGARG_NUMFPR;
+  asm_collectargs(as, ir, ci, args);
+  for (i = 0; i < nargs; i++) {
+    if (args[i] && irt_isfp(IR(args[i])->t)) {
+      if (nfpr > 0) nfpr--; else nslots += 2;
+    } else {
+      if (ngpr > 0) ngpr--; else nslots += 2;
+    }
+  }
+  if (nslots > as->evenspill)  /* Leave room for args in stack slots. */
+    as->evenspill = nslots;
+  return REGSP_HINT(RID_RET);
+}
+
+static void asm_setup_target(ASMState *as)
+{
+  /* May need extra exit for asm_stack_check on side traces. */
+  asm_exitstub_setup(as, as->T->nsnap + (as->parent ? 1 : 0));
+}
+
+/* -- Trace patching ------------------------------------------------------ */
+
+/* Patch exit jumps of existing machine code to a new target. */
+void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target)
+{
+  MCode *p = T->mcode;
+  MCode *pe = (MCode *)((char *)p + T->szmcode);
+  MCode *cstart = NULL, *cend = p;
+  MCode *mcarea = lj_mcode_patch(J, p, 0);
+  MCode *px = exitstub_addr(J, exitno);
+  for (; p < pe; p++) {
+    /* Look for bl exitstub, replace with b target. */
+    uint32_t ins = *p;
+    if ((ins & 0xfc000000u) == 0x94000000u &&
+	((ins ^ (px-p)) & 0x03ffffffu) == 0) {
+      *p = (ins & 0x7c000000u) | ((target-p) & 0x03ffffffu);
+      cend = p+1;
+      if (!cstart) cstart = p;
+    }
+  }
+  lua_assert(cstart != NULL);
+  lj_mcode_sync(cstart, cend);
+  lj_mcode_patch(J, mcarea, 1);
+}
+
diff --git a/src/lj_ccall.c b/src/lj_ccall.c
index b599be33..a3ae8b05 100644
--- a/src/lj_ccall.c
+++ b/src/lj_ccall.c
@@ -331,7 +331,7 @@
 
 #define CCALL_HANDLE_COMPLEXARG \
   /* Pass complex by value in separate (!) FPRs or on stack. */ \
-  isfp = ctr->size == 2*sizeof(float) ? 2 : 1;
+  isfp = sz == 2*sizeof(float) ? 2 : 1;
 
 #define CCALL_HANDLE_REGARG \
   if (LJ_TARGET_IOS && isva) { \
diff --git a/src/lj_dispatch.h b/src/lj_dispatch.h
index 82708077..362d6202 100644
--- a/src/lj_dispatch.h
+++ b/src/lj_dispatch.h
@@ -107,6 +107,7 @@ typedef struct GG_State {
 #define J2G(J)		(&J2GG(J)->g)
 #define G2J(gl)		(&G2GG(gl)->J)
 #define L2J(L)		(&L2GG(L)->J)
+#define GG_G2J		(GG_OFS(J) - GG_OFS(g))
 #define GG_G2DISP	(GG_OFS(dispatch) - GG_OFS(g))
 #define GG_DISP2G	(GG_OFS(g) - GG_OFS(dispatch))
 #define GG_DISP2J	(GG_OFS(J) - GG_OFS(dispatch))
diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h
new file mode 100644
index 00000000..eb8f7fc7
--- /dev/null
+++ b/src/lj_emit_arm64.h
@@ -0,0 +1,397 @@
+/*
+** ARM64 instruction emitter.
+** Copyright (C) 2005-2016 Mike Pall. See Copyright Notice in luajit.h
+**
+** Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
+** Sponsored by Cisco Systems, Inc.
+*/
+
+/* -- Constant encoding --------------------------------------------------- */
+
+static uint64_t get_k64val(IRIns *ir)
+{
+  if (ir->o == IR_KINT64) {
+    return ir_kint64(ir)->u64;
+  } else if (ir->o == IR_KGC) {
+    return (uint64_t)ir_kgc(ir);
+  } else if (ir->o == IR_KPTR || ir->o == IR_KKPTR) {
+    return (uint64_t)ir_kptr(ir);
+  } else {
+    lua_assert(ir->o == IR_KINT || ir->o == IR_KNULL);
+    return ir->i;  /* Sign-extended. */
+  }
+}
+
+/* Encode constant in K12 format for data processing instructions. */
+static uint32_t emit_isk12(int64_t n)
+{
+  uint64_t k = (n < 0) ? -n : n;
+  uint32_t m = (n < 0) ? 0x40000000 : 0;
+  if (k < 0x1000) {
+    return A64I_K12|m|A64F_U12(k);
+  } else if ((k & 0xfff000) == k) {
+    return A64I_K12|m|0x400000|A64F_U12(k>>12);
+  }
+  return 0;
+}
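+/* Example: 0x123000 is encodable with the LSL #12 form, while 0x123456
+** spans both halves, returns 0 and must be loaded into a register instead.
+*/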
+
+#define emit_clz64(n)	__builtin_clzll(n)
+#define emit_ctz64(n)	__builtin_ctzll(n)
+
+/* Encode constant in K13 format for logical data processing instructions. */
+static uint32_t emit_isk13(uint64_t n, int is64)
+{
+  int inv = 0, w = 128, lz, tz;
+  if (n & 1) { n = ~n; w = 64; inv = 1; }  /* Avoid wrap-around of ones. */
+  if (!n) return 0;  /* Neither all-zero nor all-ones are allowed. */
+  do {  /* Find the repeat width. */
+    if (is64 && (uint32_t)(n^(n>>32))) break;
+    n = (uint32_t)n; w = 32; if ((n^(n>>16)) & 0xffff) break;
+    n = n & 0xffff; w = 16; if ((n^(n>>8)) & 0xff) break;
+    n = n & 0xff; w = 8; if ((n^(n>>4)) & 0xf) break;
+    n = n & 0xf; w = 4; if ((n^(n>>2)) & 0x3) break;
+    n = n & 0x3; w = 2;
+  } while (0);
+  lz = emit_clz64(n);
+  tz = emit_ctz64(n);
+  if ((int64_t)(n << lz) >> (lz+tz) != -1ll) return 0; /* Non-contiguous? */
+  if (inv)
+    return A64I_K13 | (((lz-w) & 127) << 16) | (((lz+tz-w-1) & 63) << 10);
+  else
+    return A64I_K13 | ((w-tz) << 16) | (((63-lz-tz-w-w) & 63) << 10);
+}
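+/* Example: 0x00ff00ff00ff00ff is a repeating 16 bit pattern of contiguous
+** ones and is encodable; 0x1234 has scattered ones and is not.
+*/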
+
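+/* Check for the 8 bit immediate format of FMOV: +-m/16 * 2^e with a 4 bit
+** mantissa and a 3 bit exponent, e.g. 1.0, 2.5 or -0.5.
+*/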
+static uint32_t emit_isfpk64(uint64_t n)
+{
+  uint64_t etop9 = ((n >> 54) & 0x1ff);
+  if ((n << 16) == 0 && (etop9 == 0x100 || etop9 == 0x0ff)) {
+    return (uint32_t)(((n >> 48) & 0x7f) | ((n >> 56) & 0x80));
+  }
+  return ~0u;
+}
+
+/* -- Emit basic instructions --------------------------------------------- */
+
+static void emit_dnm(ASMState *as, A64Ins ai, Reg rd, Reg rn, Reg rm)
+{
+  *--as->mcp = ai | A64F_D(rd) | A64F_N(rn) | A64F_M(rm);
+}
+
+static void emit_dm(ASMState *as, A64Ins ai, Reg rd, Reg rm)
+{
+  *--as->mcp = ai | A64F_D(rd) | A64F_M(rm);
+}
+
+static void emit_dn(ASMState *as, A64Ins ai, Reg rd, Reg rn)
+{
+  *--as->mcp = ai | A64F_D(rd) | A64F_N(rn);
+}
+
+static void emit_nm(ASMState *as, A64Ins ai, Reg rn, Reg rm)
+{
+  *--as->mcp = ai | A64F_N(rn) | A64F_M(rm);
+}
+
+static void emit_d(ASMState *as, A64Ins ai, Reg rd)
+{
+  *--as->mcp = ai | A64F_D(rd);
+}
+
+static void emit_n(ASMState *as, A64Ins ai, Reg rn)
+{
+  *--as->mcp = ai | A64F_N(rn);
+}
+
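+/* Classify a load/store offset: 1 = scaled unsigned 12 bit, -1 = unscaled
+** signed 9 bit, 0 = not encodable.
+*/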
+static int emit_checkofs(A64Ins ai, int64_t ofs)
+{
+  int scale = (ai >> 30) & 3;
+  if (ofs < 0 || (ofs & ((1<<scale)-1))) {
+    return (ofs >= -256 && ofs <= 255) ? -1 : 0;
+  } else {
+    return (ofs < (4096<<scale)) ? 1 : 0;
+  }
+}
+
+static void emit_lso(ASMState *as, A64Ins ai, Reg rd, Reg rn, int64_t ofs)
+{
+  int ot = emit_checkofs(ai, ofs), sc = (ai >> 30) & 3;
+  lua_assert(ot);
+  /* Combine LDR/STR pairs to LDP/STP. */
+  if ((sc == 2 || sc == 3) &&
+      (!(ai & 0x400000) || rd != rn) &&
+      as->mcp != as->mcloop) {
+    uint32_t prev = *as->mcp & ~A64F_D(31);
+    int ofsm = ofs - (1<<sc), ofsp = ofs + (1<<sc);
+    A64Ins aip;
+    if (prev == (ai | A64F_N(rn) | A64F_U12(ofsm>>sc)) ||
+	prev == ((ai^A64I_LS_U) | A64F_N(rn) | A64F_S9(ofsm&0x1ff))) {
+      aip = (A64F_A(rd) | A64F_D(*as->mcp & 31));
+    } else if (prev == (ai | A64F_N(rn) | A64F_U12(ofsp>>sc)) ||
+	       prev == ((ai^A64I_LS_U) | A64F_N(rn) | A64F_S9(ofsp&0x1ff))) {
+      aip = (A64F_D(rd) | A64F_A(*as->mcp & 31));
+      ofsm = ofs;
+    } else {
+      goto nopair;
+    }
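+    /* The LDP/STP immediate is a signed, scaled 7 bit field. */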
+    if (ofsm >= (-64<<sc) && ofsm <= (63<<sc)) {
+      *as->mcp = aip | A64F_N(rn) | ((ofsm >> sc) << 15) |
+	(ai ^ ((ai == A64I_LDRx || ai == A64I_STRx) ? 0x50000000 : 0x90000000));
+      return;
+    }
+  }
+nopair:
+  if (ot == 1)
+    *--as->mcp = ai | A64F_D(rd) | A64F_N(rn) | A64F_U12(ofs >> sc);
+  else
+    *--as->mcp = (ai^A64I_LS_U) | A64F_D(rd) | A64F_N(rn) | A64F_S9(ofs & 0x1ff);
+}
+
+/* -- Emit loads/stores --------------------------------------------------- */
+
+/* Prefer rematerialization of BASE/L from global_State over spills. */
+#define emit_canremat(ref)	((ref) <= ASMREF_L)
+
+/* Try to find an N-step delta relative to other consts with N < lim. */
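+/* Example: a constant that differs from an already-allocated constant by
+** a K12 immediate is materialized with one ADD/SUB instead of a full
+** MOVZ/MOVK sequence.
+*/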
+static int emit_kdelta(ASMState *as, Reg rd, uint64_t k, int lim)
+{
+  RegSet work = ~as->freeset & RSET_GPR;
+  if (lim <= 1) return 0;  /* Can't beat that. */
+  while (work) {
+    Reg r = rset_picktop(work);
+    IRRef ref = regcost_ref(as->cost[r]);
+    lua_assert(r != rd);
+    if (ref < REF_TRUE) {
+      uint64_t kx = ra_iskref(ref) ? (uint64_t)ra_krefk(as, ref) :
+				     get_k64val(IR(ref));
+      int64_t delta = (int64_t)(k - kx);
+      if (delta == 0) {
+	emit_dm(as, A64I_MOVx, rd, r);
+	return 1;
+      } else {
+	uint32_t k12 = emit_isk12(delta < 0 ? -delta : delta);
+	if (k12) {
+	  emit_dn(as, (delta < 0 ? A64I_SUBx : A64I_ADDx)^k12, rd, r);
+	  return 1;
+	}
+	/* Do other ops or multi-step deltas pay off? Probably not.
+	** E.g. XOR rarely helps with pointer consts.
+	*/
+      }
+    }
+    rset_clear(work, r);
+  }
+  return 0;  /* Failed. */
+}
+
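+/* Load a constant via ORR with a bitmask immediate, a delta from another
+** constant, or the cheapest MOVZ/MOVN/MOVK combination. Example:
+** ~0x1234ull becomes a single MOVN and 0xffff00000000ffff becomes
+** MOVZ plus one MOVK at LSL #48.
+*/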
+static void emit_loadk(ASMState *as, Reg rd, uint64_t u64, int is64)
+{
+  uint32_t k13 = emit_isk13(u64, is64);
+  if (k13) {  /* Can the constant be represented as a bitmask immediate? */
+    emit_dn(as, (is64|A64I_ORRw)^k13, rd, RID_ZERO);
+  } else {
+    int i, zeros = 0, ones = 0, neg;
+    if (!is64) u64 = (int64_t)(int32_t)u64;  /* Sign-extend. */
+    /* Count homogeneous 16 bit fragments. */
+    for (i = 0; i < 4; i++) {
+      uint64_t frag = (u64 >> i*16) & 0xffff;
+      zeros += (frag == 0);
+      ones += (frag == 0xffff);
+    }
+    neg = ones > zeros;  /* Use MOVN if it pays off. */
+    if (!emit_kdelta(as, rd, u64, 4 - (neg ? ones : zeros))) {
+      int shift = 0, lshift = 0;
+      uint64_t n64 = neg ? ~u64 : u64;
+      if (n64 != 0) {
+	/* Find first/last fragment to be filled. */
+	shift = (63-emit_clz64(n64)) & ~15;
+	lshift = emit_ctz64(n64) & ~15;
+      }
+      /* MOVK requires the original value (u64). */
+      while (shift > lshift) {
+	uint32_t u16 = (u64 >> shift) & 0xffff;
+	/* Skip fragments that are correctly filled by MOVN/MOVZ. */
+	if (u16 != (neg ? 0xffff : 0))
+	  emit_d(as, is64 | A64I_MOVKw | A64F_U16(u16) | A64F_LSL16(shift), rd);
+	shift -= 16;
+      }
+      /* But MOVN needs an inverted value (n64). */
+      emit_d(as, (neg ? A64I_MOVNx : A64I_MOVZx) |
+		 A64F_U16((n64 >> lshift) & 0xffff) | A64F_LSL16(lshift), rd);
+    }
+  }
+}
+
+/* Load a 32 bit constant into a GPR. */
+#define emit_loadi(as, rd, i)	emit_loadk(as, rd, i, 0)
+
+/* Load a 64 bit constant into a GPR. */
+#define emit_loadu64(as, rd, i)	emit_loadk(as, rd, i, A64I_X)
+
+#define emit_loada(as, r, addr)	emit_loadu64(as, (r), (uintptr_t)(addr))
+
+#define glofs(as, k) \
+  ((intptr_t)((uintptr_t)(k) - (uintptr_t)&J2GG(as->J)->g))
+#define mcpofs(as, k) \
+  ((intptr_t)((uintptr_t)(k) - (uintptr_t)as->mcp))
+#define checkmcpofs(as, k) \
+  ((((mcpofs(as, k)>>2) + 0x00040000) >> 19) == 0)
+
+static Reg ra_allock(ASMState *as, intptr_t k, RegSet allow);
+
+/* Get/set from constant pointer. */
+static void emit_lsptr(ASMState *as, A64Ins ai, Reg r, void *p)
+{
+  /* First, check if ip + offset is in range. */
+  if ((ai & 0x00400000) && checkmcpofs(as, p)) {
+    emit_d(as, A64I_LDRLx | A64F_S19(mcpofs(as, p)>>2), r);
+  } else {
+    Reg base = RID_GL;  /* Next, try GL + offset. */
+    int64_t ofs = glofs(as, p);
+    if (!emit_checkofs(ai, ofs)) {  /* Else split up into base reg + offset. */
+      int64_t i64 = i64ptr(p);
+      base = ra_allock(as, (i64 & ~0x7fffull), rset_exclude(RSET_GPR, r));
+      ofs = i64 & 0x7fffull;
+    }
+    emit_lso(as, ai, r, base, ofs);
+  }
+}
+
+/* Load 64 bit IR constant into register. */
+static void emit_loadk64(ASMState *as, Reg r, IRIns *ir)
+{
+  const uint64_t *k = &ir_k64(ir)->u64;
+  int64_t ofs;
+  if (r >= RID_MAX_GPR) {
+    uint32_t fpk = emit_isfpk64(*k);
+    if (fpk != ~0u) {
+      emit_d(as, A64I_FMOV_DI | A64F_FP8(fpk), (r & 31));
+      return;
+    }
+  }
+  ofs = glofs(as, k);
+  if (emit_checkofs(A64I_LDRx, ofs)) {
+    emit_lso(as, r >= RID_MAX_GPR ? A64I_LDRd : A64I_LDRx,
+	     (r & 31), RID_GL, ofs);
+  } else {
+    if (r >= RID_MAX_GPR) {
+      emit_dn(as, A64I_FMOV_D_R, (r & 31), RID_TMP);
+      r = RID_TMP;
+    }
+    if (checkmcpofs(as, k))
+      emit_d(as, A64I_LDRLx | A64F_S19(mcpofs(as, k)>>2), r);
+    else
+      emit_loadu64(as, r, *k);
+  }
+}
+
+/* Get/set global_State fields. */
+#define emit_getgl(as, r, field) \
+  emit_lsptr(as, A64I_LDRx, (r), (void *)&J2G(as->J)->field)
+#define emit_setgl(as, r, field) \
+  emit_lsptr(as, A64I_STRx, (r), (void *)&J2G(as->J)->field)
+
+/* Trace number is determined from pc of exit instruction. */
+#define emit_setvmstate(as, i)	UNUSED(i)
+
+/* -- Emit control-flow instructions -------------------------------------- */
+
+/* Label for internal jumps. */
+typedef MCode *MCLabel;
+
+/* Return label pointing to current PC. */
+#define emit_label(as)		((as)->mcp)
+
+static void emit_cond_branch(ASMState *as, A64CC cond, MCode *target)
+{
+  MCode *p = as->mcp;
+  ptrdiff_t delta = target - (p - 1);
+  lua_assert(((delta + 0x40000) >> 19) == 0);
+  *--p = A64I_BCC | A64F_S19((uint32_t)delta & 0x7ffff) | cond;
+  as->mcp = p;
+}
+
+static void emit_branch(ASMState *as, A64Ins ai, MCode *target)
+{
+  MCode *p = as->mcp;
+  ptrdiff_t delta = target - (p - 1);
+  lua_assert(((delta + 0x02000000) >> 26) == 0);
+  *--p = ai | ((uint32_t)delta & 0x03ffffffu);
+  as->mcp = p;
+}
+
+#define emit_jmp(as, target)	emit_branch(as, A64I_B, (target))
+
+static void emit_call(ASMState *as, void *target)
+{
+  MCode *p = --as->mcp;
+  ptrdiff_t delta = (char *)target - (char *)p;
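+  /* A direct BL reaches +-128 MB from the call site. */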
+  if ((((delta>>2) + 0x02000000) >> 26) == 0) {
+    *p = A64I_BL | ((uint32_t)(delta>>2) & 0x03ffffffu);
+  } else {  /* Target out of range: need indirect call. But don't use R0-R7. */
+    Reg r = ra_allock(as, i64ptr(target),
+		      RSET_RANGE(RID_X8, RID_MAX_GPR)-RSET_FIXED);
+    *p = A64I_BLR | A64F_N(r);
+  }
+}
+
+/* -- Emit generic operations --------------------------------------------- */
+
+/* Generic move between two regs. */
+static void emit_movrr(ASMState *as, IRIns *ir, Reg dst, Reg src)
+{
+  if (dst >= RID_MAX_GPR) {
+    emit_dn(as, irt_isnum(ir->t) ? A64I_FMOV_D : A64I_FMOV_S,
+	    (dst & 31), (src & 31));
+    return;
+  }
+  if (as->mcp != as->mcloop) {  /* Swap early registers for loads/stores. */
+    MCode ins = *as->mcp, swp = (src^dst);
+    if ((ins & 0xbf800000) == 0xb9000000) {
+      if (!((ins ^ (dst << 5)) & 0x000003e0))
+	*as->mcp = ins ^ (swp << 5);  /* Swap N in load/store. */
+      if (!(ins & 0x00400000) && !((ins ^ dst) & 0x0000001f))
+	*as->mcp = ins ^ swp;  /* Swap D in store. */
+    }
+  }
+  emit_dm(as, A64I_MOVx, dst, src);
+}
+
+/* Generic load of register with base and (small) offset address. */
+static void emit_loadofs(ASMState *as, IRIns *ir, Reg r, Reg base, int32_t ofs)
+{
+  if (r >= RID_MAX_GPR)
+    emit_lso(as, irt_isnum(ir->t) ? A64I_LDRd : A64I_LDRs, (r & 31), base, ofs);
+  else
+    emit_lso(as, irt_is64(ir->t) ? A64I_LDRx : A64I_LDRw, r, base, ofs);
+}
+
+/* Generic store of register with base and (small) offset address. */
+static void emit_storeofs(ASMState *as, IRIns *ir, Reg r, Reg base, int32_t ofs)
+{
+  if (r >= RID_MAX_GPR)
+    emit_lso(as, irt_isnum(ir->t) ? A64I_STRd : A64I_STRs, (r & 31), base, ofs);
+  else
+    emit_lso(as, irt_is64(ir->t) ? A64I_STRx : A64I_STRw, r, base, ofs);
+}
+
+/* Emit an arithmetic operation with a constant operand. */
+static void emit_opk(ASMState *as, A64Ins ai, Reg dest, Reg src,
+		     int32_t i, RegSet allow)
+{
+  uint32_t k = emit_isk12(i);
+  if (k)
+    emit_dn(as, ai^k, dest, src);
+  else
+    emit_dnm(as, ai, dest, src, ra_allock(as, i, allow));
+}
+
+/* Add offset to pointer. */
+static void emit_addptr(ASMState *as, Reg r, int32_t ofs)
+{
+  if (ofs)
+    emit_opk(as, ofs < 0 ? A64I_SUBx : A64I_ADDx, r, r,
+		 ofs < 0 ? -ofs : ofs, rset_exclude(RSET_GPR, r));
+}
+
+#define emit_spsub(as, ofs)	emit_addptr(as, RID_SP, -(ofs))
+
diff --git a/src/lj_gdbjit.c b/src/lj_gdbjit.c
index 8b72be7d..8bc2474c 100644
--- a/src/lj_gdbjit.c
+++ b/src/lj_gdbjit.c
@@ -296,6 +296,9 @@ enum {
 #elif LJ_TARGET_ARM
   DW_REG_SP = 13,
   DW_REG_RA = 14,
+#elif LJ_TARGET_ARM64
+  DW_REG_SP = 31,
+  DW_REG_RA = 30,
 #elif LJ_TARGET_PPC
   DW_REG_SP = 1,
   DW_REG_RA = 65,
@@ -374,6 +377,8 @@ static const ELFheader elfhdr_template = {
   .machine = 62,
 #elif LJ_TARGET_ARM
   .machine = 40,
+#elif LJ_TARGET_ARM64
+  .machine = 183,
 #elif LJ_TARGET_PPC
   .machine = 20,
 #elif LJ_TARGET_MIPS
@@ -563,6 +568,13 @@ static void LJ_FASTCALL gdbjit_ehframe(GDBJITctx *ctx)
       int i;
       for (i = 11; i >= 4; i--) { DB(DW_CFA_offset|i); DUV(2+(11-i)); }
     }
+#elif LJ_TARGET_ARM64
+    {
+      int i;
+      DB(DW_CFA_offset|31); DUV(2);
+      for (i = 28; i >= 19; i--) { DB(DW_CFA_offset|i); DUV(3+(28-i)); }
+      for (i = 15; i >= 8; i--) { DB(DW_CFA_offset|32|i); DUV(28-i); }
+    }
 #elif LJ_TARGET_PPC
     {
       int i;
diff --git a/src/lj_target.h b/src/lj_target.h
index abea8d5b..c069eb95 100644
--- a/src/lj_target.h
+++ b/src/lj_target.h
@@ -55,7 +55,7 @@ typedef uint32_t RegSP;
 /* Bitset for registers. 32 registers suffice for most architectures.
 ** Note that one set holds bits for both GPRs and FPRs.
 */
-#if LJ_TARGET_PPC || LJ_TARGET_MIPS
+#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64
 typedef uint64_t RegSet;
 #else
 typedef uint32_t RegSet;
@@ -69,7 +69,7 @@ typedef uint32_t RegSet;
 #define rset_set(rs, r)		(rs |= RID2RSET(r))
 #define rset_clear(rs, r)	(rs &= ~RID2RSET(r))
 #define rset_exclude(rs, r)	(rs & ~RID2RSET(r))
-#if LJ_TARGET_PPC || LJ_TARGET_MIPS
+#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64
 #define rset_picktop(rs)	((Reg)(__builtin_clzll(rs)^63))
 #define rset_pickbot(rs)	((Reg)__builtin_ctzll(rs))
 #else
diff --git a/src/lj_target_arm64.h b/src/lj_target_arm64.h
index 57ab134f..0cef06d5 100644
--- a/src/lj_target_arm64.h
+++ b/src/lj_target_arm64.h
@@ -55,7 +55,8 @@ enum {
 
 /* Make use of all registers, except for x18, fp, lr and sp. */
 #define RSET_FIXED \
-  (RID2RSET(RID_X18)|RID2RSET(RID_FP)|RID2RSET(RID_LR)|RID2RSET(RID_SP))
+  (RID2RSET(RID_X18)|RID2RSET(RID_FP)|RID2RSET(RID_LR)|RID2RSET(RID_SP)|\
+   RID2RSET(RID_GL))
 #define RSET_GPR	(RSET_RANGE(RID_MIN_GPR, RID_MAX_GPR) - RSET_FIXED)
 #define RSET_FPR	RSET_RANGE(RID_MIN_FPR, RID_MAX_FPR)
 #define RSET_ALL	(RSET_GPR|RSET_FPR)
@@ -73,25 +74,235 @@ enum {
 #define REGARG_LASTFPR		RID_D7
 #define REGARG_NUMFPR		8
 
+/* -- Spill slots --------------------------------------------------------- */
+
+/* Spill slots are 32 bit wide. An even/odd pair is used for FPRs.
+**
+** SPS_FIXED: Available fixed spill slots in interpreter frame.
+** This definition must match with the vm_arm64.dasc file.
+** Pre-allocate some slots to avoid sp adjust in every root trace.
+**
+** SPS_FIRST: First spill slot for general use. Reserve min. two 32 bit slots.
+*/
+#define SPS_FIXED	4
+#define SPS_FIRST	2
+
+#define SPOFS_TMP	0
+
+#define sps_scale(slot)		(4 * (int32_t)(slot))
+#define sps_align(slot)		(((slot) - SPS_FIXED + 3) & ~3)
+
+/* -- Exit state ---------------------------------------------------------- */
+
+/* This definition must match with the *.dasc file(s). */
+typedef struct {
+  lua_Number fpr[RID_NUM_FPR];	/* Floating-point registers. */
+  intptr_t gpr[RID_NUM_GPR];	/* General-purpose registers. */
+  int32_t spill[256];		/* Spill slots. */
+} ExitState;
+
+/* PC after instruction that caused an exit. Used to find the trace number. */
+#define EXITSTATE_PCREG		RID_LR
+/* Highest exit + 1 indicates stack check. */
+#define EXITSTATE_CHECKEXIT	1
+
+#define EXITSTUB_SPACING	4
+#define EXITSTUBS_PER_GROUP	32
+
 /* -- Instructions -------------------------------------------------------- */
 
 /* Instruction fields. */
 #define A64F_D(r)	(r)
-#define A64F_N(r)       ((r) << 5)
-#define A64F_A(r)       ((r) << 10)
-#define A64F_M(r)       ((r) << 16)
+#define A64F_N(r)	((r) << 5)
+#define A64F_A(r)	((r) << 10)
+#define A64F_M(r)	((r) << 16)
+#define A64F_IMMS(x)	((x) << 10)
+#define A64F_IMMR(x)	((x) << 16)
 #define A64F_U16(x)	((x) << 5)
+#define A64F_U12(x)	((x) << 10)
 #define A64F_S26(x)	(x)
 #define A64F_S19(x)	((x) << 5)
+#define A64F_S9(x)	((x) << 12)
+#define A64F_SH(sh, x)	(((sh) << 22) | ((x) << 10))
+#define A64F_EX(ex)	(A64I_EX | ((ex) << 13))
+#define A64F_EXSH(ex,x)	(A64I_EX | ((ex) << 13) | ((x) << 10))
+#define A64F_FP8(x)	((x) << 13)
+#define A64F_CC(cc)	((cc) << 12)
+#define A64F_LSL16(x)	(((x) / 16) << 21)
+#define A64F_BSH(sh)	((sh) << 10)
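+
+The field macros above OR register numbers and immediates into the fixed base
+opcodes of the A64Ins enum below. For example, combining A64I_ADDx
+(0x8b000000) with the D/N/M fields yields the encoding of add x0, x1, x2:
+
+  #include <stdint.h>
+  #include <assert.h>
+
+  #define A64F_D(r)  (r)          /* Destination register, bits 0-4. */
+  #define A64F_N(r)  ((r) << 5)   /* First source register, bits 5-9. */
+  #define A64F_M(r)  ((r) << 16)  /* Second source register, bits 16-20. */
+
+  int main(void)
+  {
+    uint32_t ins = 0x8b000000u | A64F_D(0) | A64F_N(1) | A64F_M(2);
+    assert(ins == 0x8b020020u);   /* add x0, x1, x2 */
+    return 0;
+  }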
 
 typedef enum A64Ins {
+  A64I_S = 0x20000000,
+  A64I_X = 0x80000000,
+  A64I_EX = 0x00200000,
+  A64I_K12 = 0x1a000000,
+  A64I_K13 = 0x18000000,
+  A64I_LS_U = 0x01000000,
+  A64I_LS_S = 0x00800000,
+  A64I_LS_R = 0x01200800,
+  A64I_LS_UXTWx = 0x00005000,
+  A64I_LS_LSLx = 0x00007000,
+
+  A64I_ADDw = 0x0b000000,
+  A64I_ADDx = 0x8b000000,
+  A64I_ADDSw = 0x2b000000,
+  A64I_ADDSx = 0xab000000,
+  A64I_NEGw = 0x4b0003e0,
+  A64I_NEGx = 0xcb0003e0,
+  A64I_SUBw = 0x4b000000,
+  A64I_SUBx = 0xcb000000,
+  A64I_SUBSw = 0x6b000000,
+  A64I_SUBSx = 0xeb000000,
+
+  A64I_MULw = 0x1b007c00,
+  A64I_MULx = 0x9b007c00,
+  A64I_SMULL = 0x9b207c00,
+
+  A64I_ANDw = 0x0a000000,
+  A64I_ANDx = 0x8a000000,
+  A64I_ANDSw = 0x6a000000,
+  A64I_ANDSx = 0xea000000,
+  A64I_EORw = 0x4a000000,
+  A64I_EORx = 0xca000000,
+  A64I_ORRw = 0x2a000000,
+  A64I_ORRx = 0xaa000000,
+  A64I_TSTw = 0x6a00001f,
+  A64I_TSTx = 0xea00001f,
+
+  A64I_CMPw = 0x6b00001f,
+  A64I_CMPx = 0xeb00001f,
+  A64I_CMNw = 0x2b00001f,
+  A64I_CMNx = 0xab00001f,
+  A64I_CCMPw = 0x7a400000,
+  A64I_CCMPx = 0xfa400000,
+  A64I_CSELw = 0x1a800000,
+  A64I_CSELx = 0x9a800000,
+
+  A64I_ASRw = 0x13007c00,
+  A64I_ASRx = 0x9340fc00,
+  A64I_LSLx = 0xd3400000,
+  A64I_LSRx = 0xd340fc00,
+  A64I_SHRw = 0x1ac02000,
+  A64I_SHRx = 0x9ac02000,	/* lsl/lsr/asr/ror x0, x0, x0 */
+  A64I_REVw = 0x5ac00800,
+  A64I_REVx = 0xdac00c00,
+
+  A64I_EXTRw = 0x13800000,
+  A64I_EXTRx = 0x93c00000,
+  A64I_SBFMw = 0x13000000,
+  A64I_SBFMx = 0x93400000,
+  A64I_SXTBw = 0x13001c00,
+  A64I_SXTHw = 0x13003c00,
+  A64I_SXTW = 0x93407c00,
+  A64I_UBFMw = 0x53000000,
+  A64I_UBFMx = 0xd3400000,
+  A64I_UXTBw = 0x53001c00,
+  A64I_UXTHw = 0x53003c00,
+
+  A64I_MOVw = 0x2a0003e0,
+  A64I_MOVx = 0xaa0003e0,
+  A64I_MVNw = 0x2a2003e0,
+  A64I_MVNx = 0xaa2003e0,
+  A64I_MOVKw = 0x72800000,
+  A64I_MOVKx = 0xf2800000,
   A64I_MOVZw = 0x52800000,
   A64I_MOVZx = 0xd2800000,
+  A64I_MOVNw = 0x12800000,
+  A64I_MOVNx = 0x92800000,
+
+  A64I_LDRB = 0x39400000,
+  A64I_LDRH = 0x79400000,
+  A64I_LDRw = 0xb9400000,
+  A64I_LDRx = 0xf9400000,
   A64I_LDRLw = 0x18000000,
   A64I_LDRLx = 0x58000000,
-  A64I_NOP = 0xd503201f,
+  A64I_STRB = 0x39000000,
+  A64I_STRH = 0x79000000,
+  A64I_STRw = 0xb9000000,
+  A64I_STRx = 0xf9000000,
+  A64I_STPw = 0x29000000,
+  A64I_STPx = 0xa9000000,
+  A64I_LDPw = 0x29400000,
+  A64I_LDPx = 0xa9400000,
+
   A64I_B = 0x14000000,
+  A64I_BCC = 0x54000000,
+  A64I_BL = 0x94000000,
   A64I_BR = 0xd61f0000,
+  A64I_BLR = 0xd63f0000,
+
+  A64I_NOP = 0xd503201f,
+
+  /* FP */
+  A64I_FADDd = 0x1e602800,
+  A64I_FSUBd = 0x1e603800,
+  A64I_FMADDd = 0x1f400000,
+  A64I_FMSUBd = 0x1f408000,
+  A64I_FNMADDd = 0x1f600000,
+  A64I_FNMSUBd = 0x1f608000,
+  A64I_FMULd = 0x1e600800,
+  A64I_FDIVd = 0x1e601800,
+  A64I_FNEGd = 0x1e614000,
+  A64I_FABS = 0x1e60c000,
+  A64I_FSQRTd = 0x1e61c000,
+  A64I_LDRs = 0xbd400000,
+  A64I_LDRd = 0xfd400000,
+  A64I_STRs = 0xbd000000,
+  A64I_STRd = 0xfd000000,
+  A64I_LDPs = 0x2d400000,
+  A64I_LDPd = 0x6d400000,
+  A64I_STPs = 0x2d000000,
+  A64I_STPd = 0x6d000000,
+  A64I_FCMPd = 0x1e602000,
+  A64I_FCMPZd = 0x1e602008,
+  A64I_FCSELd = 0x1e600c00,
+  A64I_FRINTMd = 0x1e654000,
+  A64I_FRINTPd = 0x1e64c000,
+  A64I_FRINTZd = 0x1e65c000,
+
+  A64I_FCVT_F32_F64 = 0x1e624000,
+  A64I_FCVT_F64_F32 = 0x1e22c000,
+  A64I_FCVT_F32_S32 = 0x1e220000,
+  A64I_FCVT_F64_S32 = 0x1e620000,
+  A64I_FCVT_F32_U32 = 0x1e230000,
+  A64I_FCVT_F64_U32 = 0x1e630000,
+  A64I_FCVT_F32_S64 = 0x9e220000,
+  A64I_FCVT_F64_S64 = 0x9e620000,
+  A64I_FCVT_F32_U64 = 0x9e230000,
+  A64I_FCVT_F64_U64 = 0x9e630000,
+  A64I_FCVT_S32_F64 = 0x1e780000,
+  A64I_FCVT_S32_F32 = 0x1e380000,
+  A64I_FCVT_U32_F64 = 0x1e790000,
+  A64I_FCVT_U32_F32 = 0x1e390000,
+  A64I_FCVT_S64_F64 = 0x9e780000,
+  A64I_FCVT_S64_F32 = 0x9e380000,
+  A64I_FCVT_U64_F64 = 0x9e790000,
+  A64I_FCVT_U64_F32 = 0x9e390000,
+
+  A64I_FMOV_S = 0x1e204000,
+  A64I_FMOV_D = 0x1e604000,
+  A64I_FMOV_R_S = 0x1e260000,
+  A64I_FMOV_S_R = 0x1e270000,
+  A64I_FMOV_R_D = 0x9e660000,
+  A64I_FMOV_D_R = 0x9e670000,
+  A64I_FMOV_DI = 0x1e601000,
 } A64Ins;
 
+typedef enum A64Shift {
+  A64SH_LSL, A64SH_LSR, A64SH_ASR, A64SH_ROR
+} A64Shift;
+
+typedef enum A64Extend {
+  A64EX_UXTB, A64EX_UXTH, A64EX_UXTW, A64EX_UXTX,
+  A64EX_SXTB, A64EX_SXTH, A64EX_SXTW, A64EX_SXTX,
+} A64Extend;
+
+/* ARM condition codes. */
+typedef enum A64CC {
+  CC_EQ, CC_NE, CC_CS, CC_CC, CC_MI, CC_PL, CC_VS, CC_VC,
+  CC_HI, CC_LS, CC_GE, CC_LT, CC_GT, CC_LE, CC_AL,
+  CC_HS = CC_CS, CC_LO = CC_CC
+} A64CC;
+
 #endif
diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc
index 7a881bdd..a6227bf7 100644
--- a/src/vm_arm64.dasc
+++ b/src/vm_arm64.dasc
@@ -236,12 +236,17 @@
 |.macro mov_false, reg; movn reg, #0x8000, lsl #32; .endmacro
 |.macro mov_true, reg; movn reg, #0x0001, lsl #48; .endmacro
 |
-#define GL_J(field)	(GG_OFS(J) + (int)offsetof(jit_State, field))
+#define GL_J(field)	(GG_G2J + (int)offsetof(jit_State, field))
 |
 #define PC2PROTO(field)  ((int)offsetof(GCproto, field)-(int)sizeof(GCproto))
 |
 |.macro hotcheck, delta
-|  NYI
+|  lsr CARG1, PC, #1
+|  and CARG1, CARG1, #126
+|  add CARG1, CARG1, #GG_G2DISP+GG_DISP2HOT
+|  ldrh CARG2w, [GL, CARG1]
+|  subs CARG2, CARG2, #delta
+|  strh CARG2w, [GL, CARG1]
 |.endmacro
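
The rewritten hotcheck hashes the bytecode PC into the 64-entry table of
16-bit hot counters in the dispatch area: lsr #1 followed by and #126 computes
((PC >> 2) & 63) * 2, the byte offset of the counter, which is then
decremented by delta. In C, mirroring the hotcount indexing in lj_dispatch.h:

  #include <stdint.h>

  /* Byte offset of the 16-bit hot counter for a given bytecode pc. */
  static uint32_t hotcount_byteofs(uintptr_t pc)
  {
    return (uint32_t)((pc >> 1) & 126);  /* == ((pc >> 2) & 63) * 2 */
  }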
 |
 |.macro hotloop
@@ -869,7 +874,7 @@ static void build_subroutines(BuildCtx *ctx)
   |  bl extern lj_meta_for	// (lua_State *L, TValue *base)
   |  ldr INSw, [PC, #-4]
   |.if JIT
-  |   uxtb TMP0, INS
+  |   uxtb TMP0w, INSw
   |.endif
   |  decode_RA RA, INS
   |  decode_RD RC, INS
@@ -1732,7 +1737,20 @@ static void build_subroutines(BuildCtx *ctx)
   |//-----------------------------------------------------------------------
   |
   |->vm_record:				// Dispatch target for recording phase.
-  |  NYI
+  |.if JIT
+  |  ldrb CARG1w, GL->hookmask
+  |  tst CARG1, #HOOK_VMEVENT		// No recording while in vmevent.
+  |  bne >5
+  |  // Decrement the hookcount for consistency, but always do the call.
+  |   ldr CARG2w, GL->hookcount
+  |  tst CARG1, #HOOK_ACTIVE
+  |  bne >1
+  |   sub CARG2w, CARG2w, #1
+  |  tst CARG1, #LUA_MASKLINE|LUA_MASKCOUNT
+  |  beq >1
+  |   str CARG2w, GL->hookcount
+  |  b >1
+  |.endif
   |
   |->vm_rethook:			// Dispatch target for return hooks.
   |  ldrb TMP2w, GL->hookmask
@@ -1774,7 +1792,21 @@ static void build_subroutines(BuildCtx *ctx)
   |  b <4
   |
   |->vm_hotloop:			// Hot loop counter underflow.
-  |  NYI
+  |.if JIT
+  |  ldr LFUNC:CARG3, [BASE, FRAME_FUNC]  // Same as curr_topL(L).
+  |   add CARG1, GL, #GG_G2DISP+GG_DISP2J
+  |  and LFUNC:CARG3, CARG3, #LJ_GCVMASK
+  |   str PC, SAVE_PC
+  |  ldr CARG3, LFUNC:CARG3->pc
+  |   mov CARG2, PC
+  |   str L, [GL, #GL_J(L)]
+  |  ldrb CARG3w, [CARG3, #PC2PROTO(framesize)]
+  |   str BASE, L->base
+  |  add CARG3, BASE, CARG3, lsl #3
+  |  str CARG3, L->top
+  |  bl extern lj_trace_hot		// (jit_State *J, const BCIns *pc)
+  |  b <3
+  |.endif
   |
   |->vm_callhook:			// Dispatch target for call hooks.
   |  mov CARG2, PC
@@ -1804,7 +1836,54 @@ static void build_subroutines(BuildCtx *ctx)
   |  br CRET1
   |
   |->cont_stitch:			// Trace stitching.
-  |  NYI
+  |.if JIT
+  |  // RA = resultptr, CARG4 = meta base
+  |   ldr RB, SAVE_MULTRES
+  |  ldr INSw, [PC, #-4]
+  |    ldr TRACE:CARG3, [CARG4, #-40]	// Save previous trace.
+  |   subs RB, RB, #8
+  |  decode_RA RC, INS			// Call base.
+  |    and CARG3, CARG3, #LJ_GCVMASK
+  |   beq >2
+  |1:  // Move results down.
+  |  ldr CARG1, [RA]
+  |    add RA, RA, #8
+  |   subs RB, RB, #8
+  |  str CARG1, [BASE, RC, lsl #3]
+  |    add RC, RC, #1
+  |   bne <1
+  |2:
+  |   decode_RA RA, INS
+  |   decode_RB RB, INS
+  |   add RA, RA, RB
+  |3:
+  |   cmp RA, RC
+  |   bhi >9				// More results wanted?
+  |
+  |  ldrh RAw, TRACE:CARG3->traceno
+  |  ldrh RCw, TRACE:CARG3->link
+  |  cmp RCw, RAw
+  |  beq ->cont_nop			// Blacklisted.
+  |  cmp RCw, #0
+  |  bne =>BC_JLOOP			// Jump to stitched trace.
+  |
+  |  // Stitch a new trace to the previous trace.
+  |  mov CARG1, #GL_J(exitno)
+  |  str RA, [GL, CARG1]
+  |  mov CARG1, #GL_J(L)
+  |  str L, [GL, CARG1]
+  |  str BASE, L->base
+  |  add CARG1, GL, #GG_G2J
+  |  mov CARG2, PC
+  |  bl extern lj_dispatch_stitch	// (jit_State *J, const BCIns *pc)
+  |  ldr BASE, L->base
+  |  b ->cont_nop
+  |
+  |9:  // Fill up results with nil.
+  |  str TISNIL, [BASE, RC, lsl #3]
+  |  add RC, RC, #1
+  |  b <3
+  |.endif
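+
+The cont_stitch code above copies MULTRES-1 results down to the call base,
+then either fills missing result slots with nil, dispatches to an already
+stitched trace via BC_JLOOP, or calls lj_dispatch_stitch to record a new one.
+An illustrative C rendering of just the copy-down and nil-fill loops; TV and
+NIL stand in for LuaJIT's TValue and its all-ones nil encoding (see the
+movn TISNIL, #0 in vm_exit_interp below), and multres is the unscaled result
+count plus one:
+
+  typedef unsigned long long TV;  /* Stand-in for LuaJIT's TValue. */
+  #define NIL (~0ULL)             /* nil is the all-ones pattern. */
+
+  static void stitch_results(TV *base, const TV *res, unsigned multres,
+                             unsigned rc, unsigned ra_end)
+  {
+    unsigned n;
+    for (n = multres - 1; n--; )  /* 1: Move results down to the call base. */
+      base[rc++] = *res++;
+    while (ra_end > rc)           /* 9: More slots wanted? Fill with nil. */
+      base[rc++] = NIL;
+  }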
   |
   |->vm_profhook:			// Dispatch target for profiler hook.
 #if LJ_HASPROFILE
@@ -1822,10 +1901,120 @@ static void build_subroutines(BuildCtx *ctx)
   |//-- Trace exit handler -------------------------------------------------
   |//-----------------------------------------------------------------------
   |
+  |.macro savex_, a, b
+  |  stp d..a, d..b, [sp, #a*8]
+  |  stp x..a, x..b, [sp, #32*8+a*8]
+  |.endmacro
+  |
   |->vm_exit_handler:
-  |  NYI
+  |.if JIT
+  |  sub     sp, sp, #(64*8)
+  |  savex_, 0, 1
+  |  savex_, 2, 3
+  |  savex_, 4, 5
+  |  savex_, 6, 7
+  |  savex_, 8, 9
+  |  savex_, 10, 11
+  |  savex_, 12, 13
+  |  savex_, 14, 15
+  |  savex_, 16, 17
+  |  savex_, 18, 19
+  |  savex_, 20, 21
+  |  savex_, 22, 23
+  |  savex_, 24, 25
+  |  savex_, 26, 27
+  |  savex_, 28, 29
+  |  stp d30, d31, [sp, #30*8]
+  |  ldr CARG1, [sp, #64*8]	// Load original value of lr.
+  |   add CARG3, sp, #64*8	// Recompute original value of sp.
+  |   mv_vmstate CARG4, EXIT
+  |  ldr CARG2w, [CARG1, #-4]!	// Get exit instruction.
+  |   stp CARG1, CARG3, [sp, #62*8]	// Store exit pc/sp in RID_LR/RID_SP.
+  |  lsl CARG2, CARG2, #38
+  |  add CARG1, CARG1, CARG2, asr #36
+  |   ldr CARG2w, [lr]		// Load exit stub group offset.
+  |   sub CARG1, CARG1, lr
+  |   sub CARG1, CARG1, #4
+  |  ldr L, GL->cur_L
+  |   add CARG1, CARG2, CARG1, lsr #2	// Compute exit number.
+  |    ldr BASE, GL->jit_base
+  |   st_vmstate CARG4
+  |   str CARG1w, [GL, #GL_J(exitno)]
+  |    str BASE, L->base
+  |  str L, [GL, #GL_J(L)]
+  |   str xzr, GL->jit_base
+  |  add CARG1, GL, #GG_G2J
+  |  mov CARG2, sp
+  |  bl extern lj_trace_exit		// (jit_State *J, ExitState *ex)
+  |  // Returns MULTRES (unscaled) or negated error code.
+  |  ldr CARG2, L->cframe
+  |   ldr BASE, L->base
+  |  and sp, CARG2, #CFRAME_RAWMASK
+  |   ldr PC, SAVE_PC			// Get SAVE_PC.
+  |  str L, SAVE_L			// Set SAVE_L (on-trace resume/yield).
+  |  b >1
+  |.endif
+  |
   |->vm_exit_interp:
-  |  NYI
+  |  // CARG1 = MULTRES or negated error code, BASE, PC and GL set.
+  |.if JIT
+  |  ldr L, SAVE_L
+  |1:
+  |  cmp CARG1w, #0
+  |  blt >9				// Check for error from exit.
+  |   lsl RC, CARG1, #3
+  |  ldr LFUNC:CARG2, [BASE, FRAME_FUNC]
+  |    movz TISNUM, #(LJ_TISNUM>>1)&0xffff, lsl #48
+  |    movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16
+  |    movn TISNIL, #0
+  |  and LFUNC:CARG2, CARG2, #LJ_GCVMASK
+  |   str RC, SAVE_MULTRES
+  |   str BASE, L->base
+  |  ldr CARG2, LFUNC:CARG2->pc
+  |   str xzr, GL->jit_base
+  |    mv_vmstate CARG4, INTERP
+  |  ldr KBASE, [CARG2, #PC2PROTO(k)]
+  |  // Modified copy of ins_next which handles function header dispatch, too.
+  |  ldrb RBw, [PC]
+  |   ldr INSw, [PC], #4
+  |    st_vmstate CARG4
+  |  cmp RBw, #BC_FUNCC+2		// Fast function?
+  |   add TMP1, GL, INS, uxtb #3
+  |  bhs >4
+  |2:
+  |  cmp RBw, #BC_FUNCF			// Function header?
+  |  add TMP0, GL, RB, uxtb #3
+  |  ldr RB, [TMP0, #GG_G2DISP]
+  |   decode_RA RA, INS
+  |   lsr TMP0, INS, #16
+  |   csel RC, TMP0, RC, lo
+  |   blo >5
+  |   ldr CARG3, [BASE, FRAME_FUNC]
+  |   sub RC, RC, #8
+  |   add RA, BASE, RA, lsl #3	// Yes: RA = BASE+framesize*8, RC = nargs*8
+  |   and LFUNC:CARG3, CARG3, #LJ_GCVMASK
+  |5:
+  |  br RB
+  |
+  |4:  // Check frame below fast function.
+  |  ldr CARG1, [BASE, FRAME_PC]
+  |  ands CARG2, CARG1, #FRAME_TYPE
+  |  bne <2			// Trace stitching continuation?
+  |  // Otherwise set KBASE for Lua function below fast function.
+  |  ldr CARG3, [CARG1, #-4]
+  |  decode_RA CARG1, CARG3
+  |  sub CARG2, BASE, CARG1, lsl #3
+  |  ldr LFUNC:CARG3, [CARG2, #-32]
+  |  and LFUNC:CARG3, CARG3, #LJ_GCVMASK
+  |  ldr CARG3, LFUNC:CARG3->pc
+  |  ldr KBASE, [CARG3, #PC2PROTO(k)]
+  |  b <2
+  |
+  |9:  // Rethrow error from the right C frame.
+  |  neg CARG2, CARG1
+  |  mov CARG1, L
+  |  bl extern lj_err_throw		// (lua_State *L, int errcode)
+  |.endif
   |
   |//-----------------------------------------------------------------------
   |//-- Math helper functions ----------------------------------------------
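
Two details of the exit handler above are worth spelling out. The savex_
stores lay registers out exactly as the ExitState struct in lj_target_arm64.h
expects (fpr[32] at sp, gpr[32] at sp+32*8, with the exit pc and original sp
dropped into the RID_LR and RID_SP slots), and the exit branch target is
recovered by sign-extending the branch's 26-bit offset. A hedged sketch of
both, with illustrative names:

  #include <stdint.h>

  typedef double lua_Number;
  typedef struct {            /* Matches the ExitState declared above. */
    lua_Number fpr[32];       /* d0..d31, stored at sp+0 by savex_. */
    intptr_t gpr[32];         /* x0..x30 plus sp, stored at sp+32*8. */
    int32_t spill[256];
  } ExitState;

  /* Exit pc/sp were stored into the RID_LR/RID_SP slots (x30/sp). */
  static intptr_t exit_pc(const ExitState *ex) { return ex->gpr[30]; }

  /* (ins << 38) >> 36 sign-extends a B instruction's imm26 and scales it
  ** by 4; pc is the address of the branch itself.
  */
  static intptr_t branch_target(intptr_t pc, uint32_t ins)
  {
    return pc + (intptr_t)(((int64_t)ins << 38) >> 36);
  }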
@@ -3387,6 +3576,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     if (op == BC_FORI) {
       |  csel PC, RC, PC, gt
     } else if (op == BC_JFORI) {
+      |  mov PC, RC
       |  ldrh RCw, [RC, #-2]
     } else if (op == BC_IFORL) {
       |  csel PC, RC, PC, le
@@ -3488,7 +3678,16 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
 
   case BC_JLOOP:
     |.if JIT
-    |  NYI
+    |  // RA = base (ignored), RC = traceno
+    |  ldr CARG1, [GL, #GL_J(trace)]
+    |   mov CARG2, #0  // Traces on ARM64 don't store the trace #, so use 0.
+    |  ldr TRACE:RC, [CARG1, RC, lsl #3]
+    |   st_vmstate CARG2
+    |  ldr RA, TRACE:RC->mcode
+    |   str BASE, GL->jit_base
+    |   str L, GL->tmpbuf.L
+    |  sub sp, sp, #16	// See SPS_FIXED. Avoids sp adjust in every root trace.
+    |  br RA
     |.endif
     break;
 
@@ -3546,10 +3745,12 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
   case BC_IFUNCV:
     |  // BASE = new base, RA = BASE+framesize*8, CARG3 = LFUNC, RC = nargs*8
     |  ldr CARG1, L->maxstack
+    |   movn TMP0, #~LJ_TFUNC
     |   add TMP2, BASE, RC
+    |   add LFUNC:CARG3, CARG3, TMP0, lsl #47
     |  add RA, RA, RC
     |   add TMP0, RC, #16+FRAME_VARG
-    |   str LFUNC:CARG3, [TMP2], #8	// Store (untagged) copy of LFUNC.
+    |   str LFUNC:CARG3, [TMP2], #8	// Store (tagged) copy of LFUNC.
     |    ldr KBASE, [PC, #-4+PC2PROTO(k)]
     |  cmp RA, CARG1
     |   str TMP0, [TMP2], #8		// Store delta + FRAME_VARG.
@@ -3736,8 +3937,8 @@ static void emit_asm_debug(BuildCtx *ctx)
 	"\t.uleb128 0x1\n"
 	"\t.sleb128 -8\n"
 	"\t.byte 30\n"				/* Return address is in lr. */
-	"\t.uleb128 1\n"                        /* augmentation length */
-	"\t.byte 0x1b\n"                        /* pcrel|sdata4 */
+	"\t.uleb128 1\n"			/* augmentation length */
+	"\t.byte 0x1b\n"			/* pcrel|sdata4 */
 	"\t.byte 0xc\n\t.uleb128 31\n\t.uleb128 0\n"	/* def_cfa sp */
 	"\t.align 3\n"
 	".LECIE2:\n\n");
@@ -3748,7 +3949,7 @@ static void emit_asm_debug(BuildCtx *ctx)
 	"\t.long .LASFDE3-.Lframe2\n"
 	"\t.long lj_vm_ffi_call-.\n"
 	"\t.long %d\n"
-	"\t.uleb128 0\n"                        /* augmentation length */
+	"\t.uleb128 0\n"			/* augmentation length */
 	"\t.byte 0xe\n\t.uleb128 32\n"		/* def_cfa_offset */
 	"\t.byte 0x9d\n\t.uleb128 4\n"		/* offset fp */
 	"\t.byte 0x9e\n\t.uleb128 3\n"		/* offset lr */
-- 