author     Mike Pall <mike>  2016-11-20 22:16:08 +0100
committer  Mike Pall <mike>  2016-11-20 22:18:14 +0100
commit     04b60707d7d117da22b40736a353e2a10179108a (patch)
tree       d11f50b00a8589108f5ebeeb005a12071fe6fcdf
parent     13642b75ac37957d9e2a37b35ebec69d6d4b3bc1 (diff)
ARM64: Add JIT compiler backend.
Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. Sponsored by Cisco Systems, Inc.
 Makefile              |    4
 src/jit/dis_arm64.lua | 1215
 src/lj_arch.h         |    1
 src/lj_asm.c          |    4
 src/lj_asm_arm64.h    | 1823
 src/lj_ccall.c        |    2
 src/lj_dispatch.h     |    1
 src/lj_emit_arm64.h   |  397
 src/lj_gdbjit.c       |   12
 src/lj_target.h       |    4
 src/lj_target_arm64.h |  221
 src/vm_arm64.dasc     |  227
 12 files changed, 3887 insertions(+), 24 deletions(-)
diff --git a/Makefile b/Makefile
index 6dfbbde4..5e640d94 100644
--- a/Makefile
+++ b/Makefile
@@ -86,8 +86,8 @@ FILE_MAN= luajit.1
 FILE_PC= luajit.pc
 FILES_INC= lua.h lualib.h lauxlib.h luaconf.h lua.hpp luajit.h
 FILES_JITLIB= bc.lua bcsave.lua dump.lua p.lua v.lua zone.lua \
-	      dis_x86.lua dis_x64.lua dis_arm.lua dis_ppc.lua \
-	      dis_mips.lua dis_mipsel.lua vmdef.lua
+	      dis_x86.lua dis_x64.lua dis_arm.lua dis_arm64.lua \
+	      dis_ppc.lua dis_mips.lua dis_mipsel.lua vmdef.lua
 
 ifeq (,$(findstring Windows,$(OS)))
   HOST_SYS:= $(shell uname -s)
diff --git a/src/jit/dis_arm64.lua b/src/jit/dis_arm64.lua
new file mode 100644
index 00000000..909b33bc
--- /dev/null
+++ b/src/jit/dis_arm64.lua
@@ -0,0 +1,1215 @@
1----------------------------------------------------------------------------
2-- LuaJIT ARM64 disassembler module.
3--
4-- Copyright (C) 2005-2016 Mike Pall. All rights reserved.
5-- Released under the MIT license. See Copyright Notice in luajit.h
6--
7-- Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
8-- Sponsored by Cisco Systems, Inc.
9----------------------------------------------------------------------------
10-- This is a helper module used by the LuaJIT machine code dumper module.
11--
12-- It disassembles most user-mode AArch64 instructions.
13-- NYI: Advanced SIMD and VFP instructions.
14------------------------------------------------------------------------------
15
16local type, tonumber = type, tonumber
17local sub, byte, format = string.sub, string.byte, string.format
18local match, gmatch, gsub = string.match, string.gmatch, string.gsub
19local rep = string.rep
20local concat = table.concat
21local bit = require("bit")
22local band, bor, bxor, tohex = bit.band, bit.bor, bit.bxor, bit.tohex
23local lshift, rshift, arshift = bit.lshift, bit.rshift, bit.arshift
24local ror = bit.ror
25
26------------------------------------------------------------------------------
27-- Opcode maps
28------------------------------------------------------------------------------
29
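-- Annotation (not part of the commit): each map below selects on the
-- bit field (op >> shift) & mask; an entry is either a nested map or a
-- pattern string. A pattern is the mnemonic, optional "|alias" names,
-- then operand letters decoded by disass_ins() further down. E.g. in
-- "adds|cmnD0NIg": D = Rd, "0" switches to the cmn alias and drops Rd
-- when it is the zero register, N = Rn, I = 12-bit immediate, and "g"
-- picks w/x register names from the sf bit (bit 31).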
30local map_adr = { -- PC-relative addressing.
31 shift = 31, mask = 1,
32 [0] = "adrDBx", "adrpDBx"
33}
34
35local map_addsubi = { -- Add/subtract immediate.
36 shift = 29, mask = 3,
37 [0] = "add|movDNIg", "adds|cmnD0NIg", "subDNIg", "subs|cmpD0NIg",
38}
39
40local map_logi = { -- Logical immediate.
41 shift = 31, mask = 1,
42 [0] = {
43 shift = 22, mask = 1,
44 [0] = {
45 shift = 29, mask = 3,
46 [0] = "andDNig", "orr|movDN0ig", "eorDNig", "ands|tstD0Nig"
47 },
48 false -- unallocated
49 },
50 {
51 shift = 29, mask = 3,
52 [0] = "andDNig", "orr|movDN0ig", "eorDNig", "ands|tstD0Nig"
53 }
54}
55
56local map_movwi = { -- Move wide immediate.
57 shift = 31, mask = 1,
58 [0] = {
59 shift = 22, mask = 1,
60 [0] = {
61 shift = 29, mask = 3,
62 [0] = "movnDWRg", false, "movz|movDYRg", "movkDWRg"
63 }, false -- unallocated
64 },
65 {
66 shift = 29, mask = 3,
67 [0] = "movnDWRg", false, "movz|movDYRg", "movkDWRg"
68 },
69}
70
71local map_bitf = { -- Bitfield.
72 shift = 31, mask = 1,
73 [0] = {
74 shift = 22, mask = 1,
75 [0] = {
76 shift = 29, mask = 3,
77 [0] = "sbfm|sbfiz|sbfx|asr|sxtw|sxth|sxtbDN12w",
78 "bfm|bfi|bfxilDN13w",
79 "ubfm|ubfiz|ubfx|lsr|lsl|uxth|uxtbDN12w"
80 }
81 },
82 {
83 shift = 22, mask = 1,
84 {
85 shift = 29, mask = 3,
86 [0] = "sbfm|sbfiz|sbfx|asr|sxtw|sxth|sxtbDN12x",
87 "bfm|bfi|bfxilDN13x",
88 "ubfm|ubfiz|ubfx|lsr|lsl|uxth|uxtbDN12x"
89 }
90 }
91}
92
93local map_datai = { -- Data processing - immediate.
94 shift = 23, mask = 7,
95 [0] = map_adr, map_adr, map_addsubi, false,
96 map_logi, map_movwi, map_bitf,
97 {
98 shift = 15, mask = 0x1c0c1,
99 [0] = "extr|rorDNM4w", [0x10080] = "extr|rorDNM4x",
100 [0x10081] = "extr|rorDNM4x"
101 }
102}
103
104local map_logsr = { -- Logical, shifted register.
105 shift = 31, mask = 1,
106 [0] = {
107 shift = 15, mask = 1,
108 [0] = {
109 shift = 29, mask = 3,
110 [0] = {
111 shift = 21, mask = 7,
112 [0] = "andDNMSg", "bicDNMSg", "andDNMSg", "bicDNMSg",
113 "andDNMSg", "bicDNMSg", "andDNMg", "bicDNMg"
114 },
115 {
116 shift = 21, mask = 7,
117 [0] ="orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0MSg", "orn|mvnDN0MSg",
118 "orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0Mg", "orn|mvnDN0Mg"
119 },
120 {
121 shift = 21, mask = 7,
122 [0] = "eorDNMSg", "eonDNMSg", "eorDNMSg", "eonDNMSg",
123 "eorDNMSg", "eonDNMSg", "eorDNMg", "eonDNMg"
124 },
125 {
126 shift = 21, mask = 7,
127 [0] = "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMSg", "bicsDNMSg",
128 "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMg", "bicsDNMg"
129 }
130 },
131 false -- unallocated
132 },
133 {
134 shift = 29, mask = 3,
135 [0] = {
136 shift = 21, mask = 7,
137 [0] = "andDNMSg", "bicDNMSg", "andDNMSg", "bicDNMSg",
138 "andDNMSg", "bicDNMSg", "andDNMg", "bicDNMg"
139 },
140 {
141 shift = 21, mask = 7,
142 [0] = "orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0MSg", "orn|mvnDN0MSg",
143 "orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0Mg", "orn|mvnDN0Mg"
144 },
145 {
146 shift = 21, mask = 7,
147 [0] = "eorDNMSg", "eonDNMSg", "eorDNMSg", "eonDNMSg",
148 "eorDNMSg", "eonDNMSg", "eorDNMg", "eonDNMg"
149 },
150 {
151 shift = 21, mask = 7,
152 [0] = "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMSg", "bicsDNMSg",
153 "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMg", "bicsDNMg"
154 }
155 }
156}
157
158local map_assh = {
159 shift = 31, mask = 1,
160 [0] = {
161 shift = 15, mask = 1,
162 [0] = {
163 shift = 29, mask = 3,
164 [0] = {
165 shift = 22, mask = 3,
166 [0] = "addDNMSg", "addDNMSg", "addDNMSg", "addDNMg"
167 },
168 {
169 shift = 22, mask = 3,
170 [0] = "adds|cmnD0NMSg", "adds|cmnD0NMSg",
171 "adds|cmnD0NMSg", "adds|cmnD0NMg"
172 },
173 {
174 shift = 22, mask = 3,
175 [0] = "sub|negDN0MSg", "sub|negDN0MSg", "sub|negDN0MSg", "sub|negDN0Mg"
176 },
177 {
178 shift = 22, mask = 3,
179 [0] = "subs|cmp|negsD0N0MzSg", "subs|cmp|negsD0N0MzSg",
180 "subs|cmp|negsD0N0MzSg", "subs|cmp|negsD0N0Mzg"
181 },
182 },
183 false -- unallocated
184 },
185 {
186 shift = 29, mask = 3,
187 [0] = {
188 shift = 22, mask = 3,
189 [0] = "addDNMSg", "addDNMSg", "addDNMSg", "addDNMg"
190 },
191 {
192 shift = 22, mask = 3,
193 [0] = "adds|cmnD0NMSg", "adds|cmnD0NMSg", "adds|cmnD0NMSg",
194 "adds|cmnD0NMg"
195 },
196 {
197 shift = 22, mask = 3,
198 [0] = "sub|negDN0MSg", "sub|negDN0MSg", "sub|negDN0MSg", "sub|negDN0Mg"
199 },
200 {
201 shift = 22, mask = 3,
202 [0] = "subs|cmp|negsD0N0MzSg", "subs|cmp|negsD0N0MzSg",
203 "subs|cmp|negsD0N0MzSg", "subs|cmp|negsD0N0Mzg"
204 }
205 }
206}
207
208local map_addsubsh = { -- Add/subtract, shifted register.
209 shift = 22, mask = 3,
210 [0] = map_assh, map_assh, map_assh
211}
212
213local map_addsubex = { -- Add/subtract, extended register.
214 shift = 22, mask = 3,
215 [0] = {
216 shift = 29, mask = 3,
217 [0] = "addDNMXg", "adds|cmnD0NMXg", "subDNMXg", "subs|cmpD0NMzXg",
218 }
219}
220
221local map_addsubc = { -- Add/subtract, with carry.
222 shift = 10, mask = 63,
223 [0] = {
224 shift = 29, mask = 3,
225 [0] = "adcDNMg", "adcsDNMg", "sbc|ngcDN0Mg", "sbcs|ngcsDN0Mg",
226 }
227}
228
229local map_ccomp = {
230 shift = 4, mask = 1,
231 [0] = {
232 shift = 10, mask = 3,
233 [0] = { -- Conditional compare register.
234 shift = 29, mask = 3,
235 "ccmnNMVCg", false, "ccmpNMVCg",
236 },
237 [2] = { -- Conditional compare immediate.
238 shift = 29, mask = 3,
239 "ccmnN5VCg", false, "ccmpN5VCg",
240 }
241 }
242}
243
244local map_csel = { -- Conditional select.
245 shift = 11, mask = 1,
246 [0] = {
247 shift = 10, mask = 1,
248 [0] = {
249 shift = 29, mask = 3,
250 [0] = "cselDNMzCg", false, "csinv|cinv|csetmDNMcg", false,
251 },
252 {
253 shift = 29, mask = 3,
254 [0] = "csinc|cinc|csetDNMcg", false, "csneg|cnegDNMcg", false,
255 }
256 }
257}
258
259local map_data1s = { -- Data processing, 1 source.
260 shift = 29, mask = 1,
261 [0] = {
262 shift = 31, mask = 1,
263 [0] = {
264 shift = 10, mask = 0x7ff,
265 [0] = "rbitDNg", "rev16DNg", "revDNw", false, "clzDNg", "clsDNg"
266 },
267 {
268 shift = 10, mask = 0x7ff,
269 [0] = "rbitDNg", "rev16DNg", "rev32DNx", "revDNx", "clzDNg", "clsDNg"
270 }
271 }
272}
273
274local map_data2s = { -- Data processing, 2 sources.
275 shift = 29, mask = 1,
276 [0] = {
277 shift = 10, mask = 63,
278 false, "udivDNMg", "sdivDNMg", false, false, false, false, "lslDNMg",
279 "lsrDNMg", "asrDNMg", "rorDNMg"
280 }
281}
282
283local map_data3s = { -- Data processing, 3 sources.
284 shift = 29, mask = 7,
285 [0] = {
286 shift = 21, mask = 7,
287 [0] = {
288 shift = 15, mask = 1,
289 [0] = "madd|mulDNMA0g", "msub|mnegDNMA0g"
290 }
291 }, false, false, false,
292 {
293 shift = 15, mask = 1,
294 [0] = {
295 shift = 21, mask = 7,
296 [0] = "madd|mulDNMA0g", "smaddl|smullDxNMwA0x", "smulhDNMx", false,
297 false, "umaddl|umullDxNMwA0x", "umulhDNMx"
298 },
299 {
300 shift = 21, mask = 7,
301 [0] = "msub|mnegDNMA0g", "smsubl|smneglDxNMwA0x", false, false,
302 false, "umsubl|umneglDxNMwA0x"
303 }
304 }
305}
306
307local map_datar = { -- Data processing, register.
308 shift = 28, mask = 1,
309 [0] = {
310 shift = 24, mask = 1,
311 [0] = map_logsr,
312 {
313 shift = 21, mask = 1,
314 [0] = map_addsubsh, map_addsubex
315 }
316 },
317 {
318 shift = 21, mask = 15,
319 [0] = map_addsubc, false, map_ccomp, false, map_csel, false,
320 {
321 shift = 30, mask = 1,
322 [0] = map_data2s, map_data1s
323 },
324 false, map_data3s, map_data3s, map_data3s, map_data3s, map_data3s,
325 map_data3s, map_data3s, map_data3s
326 }
327}
328
329local map_lrl = { -- Load register, literal.
330 shift = 26, mask = 1,
331 [0] = {
332 shift = 30, mask = 3,
333 [0] = "ldrDwB", "ldrDxB", "ldrswDxB"
334 },
335 {
336 shift = 30, mask = 3,
337 [0] = "ldrDsB", "ldrDdB"
338 }
339}
340
341local map_lsriind = { -- Load/store register, immediate pre/post-indexed.
342 shift = 30, mask = 3,
343 [0] = {
344 shift = 26, mask = 1,
345 [0] = {
346 shift = 22, mask = 3,
347 [0] = "strbDwzL", "ldrbDwzL", "ldrsbDxzL", "ldrsbDwzL"
348 }
349 },
350 {
351 shift = 26, mask = 1,
352 [0] = {
353 shift = 22, mask = 3,
354 [0] = "strhDwzL", "ldrhDwzL", "ldrshDxzL", "ldrshDwzL"
355 }
356 },
357 {
358 shift = 26, mask = 1,
359 [0] = {
360 shift = 22, mask = 3,
361 [0] = "strDwzL", "ldrDwzL", "ldrswDxzL"
362 },
363 {
364 shift = 22, mask = 3,
365 [0] = "strDszL", "ldrDszL"
366 }
367 },
368 {
369 shift = 26, mask = 1,
370 [0] = {
371 shift = 22, mask = 3,
372 [0] = "strDxzL", "ldrDxzL"
373 },
374 {
375 shift = 22, mask = 3,
376 [0] = "strDdzL", "ldrDdzL"
377 }
378 }
379}
380
381local map_lsriro = {
382 shift = 21, mask = 1,
383 [0] = { -- Load/store register immediate.
384 shift = 10, mask = 3,
385 [0] = { -- Unscaled immediate.
386 shift = 26, mask = 1,
387 [0] = {
388 shift = 30, mask = 3,
389 [0] = {
390 shift = 22, mask = 3,
391 [0] = "sturbDwK", "ldurbDwK"
392 },
393 {
394 shift = 22, mask = 3,
395 [0] = "sturhDwK", "ldurhDwK"
396 },
397 {
398 shift = 22, mask = 3,
399 [0] = "sturDwK", "ldurDwK"
400 },
401 {
402 shift = 22, mask = 3,
403 [0] = "sturDxK", "ldurDxK"
404 }
405 }
406 }, map_lsriind, false, map_lsriind
407 },
408 { -- Load/store register, register offset.
409 shift = 10, mask = 3,
410 [2] = {
411 shift = 26, mask = 1,
412 [0] = {
413 shift = 30, mask = 3,
414 [1] = {
415 shift = 22, mask = 3,
416 [0] = "strhDwO", "ldrhDwO", "ldrshDwO", "ldrshDxO"
417 },
418 [2] = {
419 shift = 22, mask = 3,
420 [0] = "strDwO", "ldrDwO", "ldrswDxO"
421 },
422 [3] = {
423 shift = 22, mask = 3,
424 [0] = "strDxO", "ldrDxO"
425 }
426 },
427 {
428 shift = 30, mask = 3,
429 [2] = {
430 shift = 22, mask = 3,
431 [0] = "strDsO", "ldrDsO"
432 },
433 [3] = {
434 shift = 22, mask = 3,
435 [0] = "strDdO", "ldrDdO"
436 }
437 }
438 }
439 }
440}
441
442local map_lsp = { -- Load/store register pair, offset.
443 shift = 22, mask = 1,
444 [0] = {
445 shift = 30, mask = 3,
446 [0] = {
447 shift = 26, mask = 1,
448 [0] = "stpDzAzwP", "stpDzAzsP",
449 },
450 {
451 shift = 26, mask = 1,
452 "stpDzAzdP"
453 },
454 {
455 shift = 26, mask = 1,
456 [0] = "stpDzAzxP"
457 }
458 },
459 {
460 shift = 30, mask = 3,
461 [0] = {
462 shift = 26, mask = 1,
463 [0] = "ldpDzAzwP", "ldpDzAzsP",
464 },
465 {
466 shift = 26, mask = 1,
467 [0] = "ldpswDAxP", "ldpDzAzdP"
468 },
469 {
470 shift = 26, mask = 1,
471 [0] = "ldpDzAzxP"
472 }
473 }
474}
475
476local map_ls = { -- Loads and stores.
477 shift = 24, mask = 0x31,
478 [0x10] = map_lrl, [0x30] = map_lsriro,
479 [0x20] = {
480 shift = 23, mask = 3,
481 map_lsp, map_lsp, map_lsp
482 },
483 [0x21] = {
484 shift = 23, mask = 3,
485 map_lsp, map_lsp, map_lsp
486 },
487 [0x31] = {
488 shift = 26, mask = 1,
489 [0] = {
490 shift = 30, mask = 3,
491 [0] = {
492 shift = 22, mask = 3,
493 [0] = "strbDwzU", "ldrbDwzU"
494 },
495 {
496 shift = 22, mask = 3,
497 [0] = "strhDwzU", "ldrhDwzU"
498 },
499 {
500 shift = 22, mask = 3,
501 [0] = "strDwzU", "ldrDwzU"
502 },
503 {
504 shift = 22, mask = 3,
505 [0] = "strDxzU", "ldrDxzU"
506 }
507 },
508 {
509 shift = 30, mask = 3,
510 [2] = {
511 shift = 22, mask = 3,
512 [0] = "strDszU", "ldrDszU"
513 },
514 [3] = {
515 shift = 22, mask = 3,
516 [0] = "strDdzU", "ldrDdzU"
517 }
518 }
519 },
520}
521
522local map_datafp = { -- Data processing, SIMD and FP.
523 shift = 28, mask = 7,
524 { -- 001
525 shift = 24, mask = 1,
526 [0] = {
527 shift = 21, mask = 1,
528 {
529 shift = 10, mask = 3,
530 [0] = {
531 shift = 12, mask = 1,
532 [0] = {
533 shift = 13, mask = 1,
534 [0] = {
535 shift = 14, mask = 1,
536 [0] = {
537 shift = 15, mask = 1,
538 [0] = { -- FP/int conversion.
539 shift = 31, mask = 1,
540 [0] = {
541 shift = 16, mask = 0xff,
542 [0x20] = "fcvtnsDwNs", [0x21] = "fcvtnuDwNs",
543 [0x22] = "scvtfDsNw", [0x23] = "ucvtfDsNw",
544 [0x24] = "fcvtasDwNs", [0x25] = "fcvtauDwNs",
545 [0x26] = "fmovDwNs", [0x27] = "fmovDsNw",
546 [0x28] = "fcvtpsDwNs", [0x29] = "fcvtpuDwNs",
547 [0x30] = "fcvtmsDwNs", [0x31] = "fcvtmuDwNs",
548 [0x38] = "fcvtzsDwNs", [0x39] = "fcvtzuDwNs",
549 [0x60] = "fcvtnsDwNd", [0x61] = "fcvtnuDwNd",
550 [0x62] = "scvtfDdNw", [0x63] = "ucvtfDdNw",
551 [0x64] = "fcvtasDwNd", [0x65] = "fcvtauDwNd",
552 [0x68] = "fcvtpsDwNd", [0x69] = "fcvtpuDwNd",
553 [0x70] = "fcvtmsDwNd", [0x71] = "fcvtmuDwNd",
554 [0x78] = "fcvtzsDwNd", [0x79] = "fcvtzuDwNd"
555 },
556 {
557 shift = 16, mask = 0xff,
558 [0x20] = "fcvtnsDxNs", [0x21] = "fcvtnuDxNs",
559 [0x22] = "scvtfDsNx", [0x23] = "ucvtfDsNx",
560 [0x24] = "fcvtasDxNs", [0x25] = "fcvtauDxNs",
561 [0x28] = "fcvtpsDxNs", [0x29] = "fcvtpuDxNs",
562 [0x30] = "fcvtmsDxNs", [0x31] = "fcvtmuDxNs",
563 [0x38] = "fcvtzsDxNs", [0x39] = "fcvtzuDxNs",
564 [0x60] = "fcvtnsDxNd", [0x61] = "fcvtnuDxNd",
565 [0x62] = "scvtfDdNx", [0x63] = "ucvtfDdNx",
566 [0x64] = "fcvtasDxNd", [0x65] = "fcvtauDxNd",
567 [0x66] = "fmovDxNd", [0x67] = "fmovDdNx",
568 [0x68] = "fcvtpsDxNd", [0x69] = "fcvtpuDxNd",
569 [0x70] = "fcvtmsDxNd", [0x71] = "fcvtmuDxNd",
570 [0x78] = "fcvtzsDxNd", [0x79] = "fcvtzuDxNd"
571 }
572 }
573 },
574 { -- FP data-processing, 1 source.
575 shift = 31, mask = 1,
576 [0] = {
577 shift = 22, mask = 3,
578 [0] = {
579 shift = 15, mask = 63,
580 [0] = "fmovDNf", "fabsDNf", "fnegDNf",
581 "fsqrtDNf", false, "fcvtDdNs", false, false,
582 "frintnDNf", "frintpDNf", "frintmDNf", "frintzDNf",
583 "frintaDNf", false, "frintxDNf", "frintiDNf",
584 },
585 {
586 shift = 15, mask = 63,
587 [0] = "fmovDNf", "fabsDNf", "fnegDNf",
588 "fsqrtDNf", "fcvtDsNd", false, false, false,
589 "frintnDNf", "frintpDNf", "frintmDNf", "frintzDNf",
590 "frintaDNf", false, "frintxDNf", "frintiDNf",
591 }
592 }
593 }
594 },
595 { -- FP compare.
596 shift = 31, mask = 1,
597 [0] = {
598 shift = 14, mask = 3,
599 [0] = {
600 shift = 23, mask = 1,
601 [0] = {
602 shift = 0, mask = 31,
603 [0] = "fcmpNMf", [8] = "fcmpNZf",
604 [16] = "fcmpeNMf", [24] = "fcmpeNZf",
605 }
606 }
607 }
608 }
609 },
610 { -- FP immediate.
611 shift = 31, mask = 1,
612 [0] = {
613 shift = 5, mask = 31,
614 [0] = {
615 shift = 23, mask = 1,
616 [0] = "fmovDFf"
617 }
618 }
619 }
620 },
621 { -- FP conditional compare.
622 shift = 31, mask = 1,
623 [0] = {
624 shift = 23, mask = 1,
625 [0] = {
626 shift = 4, mask = 1,
627 [0] = "fccmpNMVCf", "fccmpeNMVCf"
628 }
629 }
630 },
631 { -- FP data-processing, 2 sources.
632 shift = 31, mask = 1,
633 [0] = {
634 shift = 23, mask = 1,
635 [0] = {
636 shift = 12, mask = 15,
637 [0] = "fmulDNMf", "fdivDNMf", "faddDNMf", "fsubDNMf",
638 "fmaxDNMf", "fminDNMf", "fmaxnmDNMf", "fminnmDNMf",
639 "fnmulDNMf"
640 }
641 }
642 },
643 { -- FP conditional select.
644 shift = 31, mask = 1,
645 [0] = {
646 shift = 23, mask = 1,
647 [0] = "fcselDNMCf"
648 }
649 }
650 }
651 },
652 { -- FP data-processing, 3 sources.
653 shift = 31, mask = 1,
654 [0] = {
655 shift = 15, mask = 1,
656 [0] = {
657 shift = 21, mask = 5,
658 [0] = "fmaddDNMAf", "fnmaddDNMAf"
659 },
660 {
661 shift = 21, mask = 5,
662 [0] = "fmsubDNMAf", "fnmsubDNMAf"
663 }
664 }
665 }
666 }
667}
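-- Annotation: in the conversion tables above, the letter after "fcvt"
-- is the rounding mode (n = nearest even, a = nearest away, p = +inf,
-- m = -inf, z = zero) and the final s/u the signedness; scvtf/ucvtf
-- are the reverse direction, integer to FP.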
668
669local map_br = { -- Branches, exception generating and system instructions.
670 shift = 29, mask = 7,
671 [0] = "bB",
672 { -- Compare & branch, immediate.
673 shift = 24, mask = 3,
674 [0] = "cbzDBg", "cbnzDBg", "tbzDTBw", "tbnzDTBw"
675 },
676 { -- Conditional branch, immediate.
677 shift = 24, mask = 3,
678 [0] = {
679 shift = 4, mask = 1,
680 [0] = {
681 shift = 0, mask = 15,
682 [0] = "beqB", "bneB", "bhsB", "bloB", "bmiB", "bplB", "bvsB", "bvcB",
683 "bhiB", "blsB", "bgeB", "bltB", "bgtB", "bleB", "balB"
684 }
685 }
686 }, false, "blB",
687 { -- Compare & branch, immediate.
688 shift = 24, mask = 3,
689 [0] = "cbzDBg", "cbnzDBg", "tbzDTBx", "tbnzDTBx"
690 },
691 {
692 shift = 24, mask = 3,
693 [0] = { -- Exception generation.
694 shift = 0, mask = 0xe0001f,
695 [0x200000] = "brkW"
696 },
697 { -- System instructions.
698 shift = 0, mask = 0x3fffff,
699 [0x03201f] = "nop"
700 },
701 { -- Unconditional branch, register.
702 shift = 0, mask = 0xfffc1f,
703 [0x1f0000] = "brNx", [0x3f0000] = "blrNx",
704 [0x5f0000] = "retNx"
705 },
706 }
707}
708
709local map_init = {
710 shift = 25, mask = 15,
711 [0] = false, false, false, false, map_ls, map_datar, map_ls, map_datafp,
712 map_datai, map_datai, map_br, map_br, map_ls, map_datar, map_ls, map_datafp
713}
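-- Annotation: decoding starts from map_init on bits 28-25 of the
-- instruction word and walks nested maps until it reaches a pattern
-- string, exactly as disass_ins() does below:
--
--   local opat = map_init[band(rshift(op, 25), 15)]
--   while type(opat) ~= "string" do
--     if not opat then return unknown(ctx) end
--     opat = opat[band(rshift(op, opat.shift), opat.mask)] or opat._
--   end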
714
715------------------------------------------------------------------------------
716
717local map_regs = { x = {}, w = {}, d = {}, s = {} }
718
719for i=0,30 do
720 map_regs.x[i] = "x"..i
721 map_regs.w[i] = "w"..i
722 map_regs.d[i] = "d"..i
723 map_regs.s[i] = "s"..i
724end
725map_regs.x[31] = "sp"
726map_regs.w[31] = "wsp"
727map_regs.d[31] = "d31"
728map_regs.s[31] = "s31"
729
730local map_cond = {
731 [0] = "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
732 "hi", "ls", "ge", "lt", "gt", "le", "al",
733}
734
735local map_shift = { [0] = "lsl", "lsr", "asr", }
736
737local map_extend = {
738 [0] = "uxtb", "uxth", "uxtw", "uxtx", "sxtb", "sxth", "sxtw", "sxtx",
739}
740
741------------------------------------------------------------------------------
742
743-- Output a nicely formatted line with an opcode and operands.
744local function putop(ctx, text, operands)
745 local pos = ctx.pos
746 local extra = ""
747 if ctx.rel then
748 local sym = ctx.symtab[ctx.rel]
749 if sym then
750 extra = "\t->"..sym
751 end
752 end
753 if ctx.hexdump > 0 then
754 ctx.out(format("%08x %s %-5s %s%s\n",
755 ctx.addr+pos, tohex(ctx.op), text, concat(operands, ", "), extra))
756 else
757 ctx.out(format("%08x %-5s %s%s\n",
758 ctx.addr+pos, text, concat(operands, ", "), extra))
759 end
760 ctx.pos = pos + 4
761end
762
763-- Fallback for unknown opcodes.
764local function unknown(ctx)
765 return putop(ctx, ".long", { "0x"..tohex(ctx.op) })
766end
767
768local function match_reg(p, pat, regnum)
769 return map_regs[match(pat, p.."%w-([xwds])")][regnum]
770end
771
772local function fmt_hex32(x)
773 if x < 0 then
774 return tohex(x)
775 else
776 return format("%x", x)
777 end
778end
779
780local imm13_rep = { 0x55555555, 0x11111111, 0x01010101, 0x00010001, 0x00000001 }
781
782local function decode_imm13(op)
783 local imms = band(rshift(op, 10), 63)
784 local immr = band(rshift(op, 16), 63)
785 if band(op, 0x00400000) == 0 then
786 local len = 5
787 if imms >= 56 then
788 if imms >= 60 then len = 1 else len = 2 end
789 elseif imms >= 48 then len = 3 elseif imms >= 32 then len = 4 end
790 local l = lshift(1, len)-1
791 local s = band(imms, l)
792 local r = band(immr, l)
793 local imm = ror(rshift(-1, 31-s), r)
794 if len ~= 5 then imm = band(imm, lshift(1, l)-1) + rshift(imm, 31-l) end
795 imm = imm * imm13_rep[len]
796 local ix = fmt_hex32(imm)
797 if rshift(op, 31) ~= 0 then
798 return ix..tohex(imm)
799 else
800 return ix
801 end
802 else
803 local lo, hi = -1, 0
804 if imms < 32 then lo = rshift(-1, 31-imms) else hi = rshift(-1, 63-imms) end
805 if immr ~= 0 then
806 lo, hi = ror(lo, immr), ror(hi, immr)
807 local x = immr == 32 and 0 or band(bxor(lo, hi), lshift(-1, 32-immr))
808 lo, hi = bxor(lo, x), bxor(hi, x)
809 if immr >= 32 then lo, hi = hi, lo end
810 end
811 if hi ~= 0 then
812 return fmt_hex32(hi)..tohex(lo)
813 else
814 return fmt_hex32(lo)
815 end
816 end
817end
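-- Annotation, a worked example: a 64-bit bitmask immediate with N = 1
-- (bit 22 set), immr = 0, imms = 7 takes the else-branch above, giving
-- lo = rshift(-1, 31-7) = 0xff and hi = 0, so the operand prints as
-- #0xff. The N = 0 branch instead decodes sub-64-bit elements and
-- replicates them across the word via imm13_rep.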
818
819local function parse_immpc(op, name)
820 if name == "b" or name == "bl" then
821 return arshift(lshift(op, 6), 4)
822 elseif name == "adr" or name == "adrp" then
823 local immlo = band(rshift(op, 29), 3)
824 local immhi = lshift(arshift(lshift(op, 8), 13), 2)
825 return bor(immhi, immlo)
826 elseif name == "tbz" or name == "tbnz" then
827 return lshift(arshift(lshift(op, 13), 18), 2)
828 else
829 return lshift(arshift(lshift(op, 8), 13), 2)
830 end
831end
832
833local function parse_fpimm8(op)
834 local sign = band(op, 0x100000) == 0 and 1 or -1
835 local exp = bxor(rshift(arshift(lshift(op, 12), 5), 24), 0x80) - 131
836 local frac = 16+band(rshift(op, 13), 15)
837 return sign * frac * 2^exp
838end
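-- Annotation, a worked example: imm8 = 0x70 (the encoding of 1.0, held
-- in bits 20-13 of the instruction) gives sign +, exp = 0x7f - 131 = -4
-- and frac = 16 + 0, i.e. 16 * 2^-4 = 1.0.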
839
840local function prefer_bfx(sf, uns, imms, immr)
841 if imms < immr or imms == 31 or imms == 63 then
842 return false
843 end
844 if immr == 0 then
845 if sf == 0 and (imms == 7 or imms == 15) then
846 return false
847 end
848 if sf ~= 0 and uns == 0 and (imms == 7 or imms == 15 or imms == 31) then
849 return false
850 end
851 end
852 return true
853end
854
855-- Disassemble a single instruction.
856local function disass_ins(ctx)
857 local pos = ctx.pos
858 local b0, b1, b2, b3 = byte(ctx.code, pos+1, pos+4)
859 local op = bor(lshift(b3, 24), lshift(b2, 16), lshift(b1, 8), b0)
860 local operands = {}
861 local suffix = ""
862 local last, name, pat
863 local vr
864 local map_reg
865 ctx.op = op
866 ctx.rel = nil
867 last = nil
868 local opat
869 opat = map_init[band(rshift(op, 25), 15)]
870 while type(opat) ~= "string" do
871 if not opat then return unknown(ctx) end
872 opat = opat[band(rshift(op, opat.shift), opat.mask)] or opat._
873 end
874 name, pat = match(opat, "^([a-z0-9]*)(.*)")
875 local altname, pat2 = match(pat, "|([a-z0-9_.|]*)(.*)")
876 if altname then pat = pat2 end
877 if sub(pat, 1, 1) == "." then
878 local s2, p2 = match(pat, "^([a-z0-9.]*)(.*)")
879 suffix = suffix..s2
880 pat = p2
881 end
882
883 local rt = match(pat, "[gf]")
884 if rt then
885 if rt == "g" then
886 map_reg = band(op, 0x80000000) ~= 0 and map_regs.x or map_regs.w
887 else
888 map_reg = band(op, 0x400000) ~= 0 and map_regs.d or map_regs.s
889 end
890 end
891
892 local second0, immr
893
894 for p in gmatch(pat, ".") do
895 local x = nil
896 if p == "D" then
897 local regnum = band(op, 31)
898 x = rt and map_reg[regnum] or match_reg(p, pat, regnum)
899 elseif p == "N" then
900 local regnum = band(rshift(op, 5), 31)
901 x = rt and map_reg[regnum] or match_reg(p, pat, regnum)
902 elseif p == "M" then
903 local regnum = band(rshift(op, 16), 31)
904 x = rt and map_reg[regnum] or match_reg(p, pat, regnum)
905 elseif p == "A" then
906 local regnum = band(rshift(op, 10), 31)
907 x = rt and map_reg[regnum] or match_reg(p, pat, regnum)
908 elseif p == "B" then
909 local addr = ctx.addr + pos + parse_immpc(op, name)
910 ctx.rel = addr
911 x = "0x"..tohex(addr)
912 elseif p == "T" then
913 x = bor(band(rshift(op, 26), 32), band(rshift(op, 19), 31))
914 elseif p == "V" then
915 x = band(op, 15)
916 elseif p == "C" then
917 x = map_cond[band(rshift(op, 12), 15)]
918 elseif p == "c" then
919 local rn = band(rshift(op, 5), 31)
920 local rm = band(rshift(op, 16), 31)
921 local cond = band(rshift(op, 12), 15)
922 local invc = bxor(cond, 1)
923 x = map_cond[cond]
924 if altname and cond ~= 14 and cond ~= 15 then
925 local a1, a2 = match(altname, "([^|]*)|(.*)")
926 if rn == rm then
927 local n = #operands
928 operands[n] = nil
929 x = map_cond[invc]
930 if rn ~= 31 then
931 if a1 then name = a1 else name = altname end
932 else
933 operands[n-1] = nil
934 name = a2
935 end
936 end
937 end
938 elseif p == "W" then
939 x = band(rshift(op, 5), 0xffff)
940 elseif p == "Y" then
941 x = band(rshift(op, 5), 0xffff)
942 local hw = band(rshift(op, 21), 3)
943 if altname and (hw == 0 or x ~= 0) then
944 name = altname
945 end
946 elseif p == "L" then
947 local rn = map_regs.x[band(rshift(op, 5), 31)]
948 local imm9 = arshift(lshift(op, 11), 23)
949 if band(op, 0x800) ~= 0 then
950 x = "["..rn..", #"..imm9.."]!"
951 else
952 x = "["..rn.."], #"..imm9
953 end
954 elseif p == "U" then
955 local rn = map_regs.x[band(rshift(op, 5), 31)]
956 local sz = band(rshift(op, 30), 3)
957 local imm12 = lshift(arshift(lshift(op, 10), 20), sz)
958 if imm12 ~= 0 then
959 x = "["..rn..", #"..imm12.."]"
960 else
961 x = "["..rn.."]"
962 end
963 elseif p == "K" then
964 local rn = map_regs.x[band(rshift(op, 5), 31)]
965 local imm9 = arshift(lshift(op, 11), 23)
966 if imm9 ~= 0 then
967 x = "["..rn..", #"..imm9.."]"
968 else
969 x = "["..rn.."]"
970 end
971 elseif p == "O" then
972 local rn, rm = map_regs.x[band(rshift(op, 5), 31)]
973 local m = band(rshift(op, 13), 1)
974 if m == 0 then
975 rm = map_regs.w[band(rshift(op, 16), 31)]
976 else
977 rm = map_regs.x[band(rshift(op, 16), 31)]
978 end
979 x = "["..rn..", "..rm
980 local opt = band(rshift(op, 13), 7)
981 local s = band(rshift(op, 12), 1)
982 local sz = band(rshift(op, 30), 3)
983 -- extension to be applied
984 if opt == 3 then
985 if s == 0 then x = nil
986 else x = x..", lsl #"..sz.."]" end
987 elseif opt == 2 or opt == 6 or opt == 7 then
988 if s == 0 then x = x..", "..map_extend[opt].."]"
989 else x = x..", "..map_extend[opt].." #"..sz.."]" end
990 else
991 x = x.."]"
992 end
993 elseif p == "P" then
994 local opcv, sh = rshift(op, 26), 2
995 if opcv >= 0x2a then sh = 4 elseif opcv >= 0x1b then sh = 3 end
996 local imm7 = lshift(arshift(lshift(op, 10), 25), sh)
997 local rn = map_regs.x[band(rshift(op, 5), 31)]
998 local ind = band(rshift(op, 23), 3)
999 if ind == 1 then
1000 x = "["..rn.."], #"..imm7
1001 elseif ind == 2 then
1002 if imm7 == 0 then
1003 x = "["..rn.."]"
1004 else
1005 x = "["..rn..", #"..imm7.."]"
1006 end
1007 elseif ind == 3 then
1008 x = "["..rn..", #"..imm7.."]!"
1009 end
1010 elseif p == "I" then
1011 local shf = band(rshift(op, 22), 3)
1012 local imm12 = band(rshift(op, 10), 0x0fff)
1013 local n = #operands
1014 local rn, rd = band(rshift(op, 5), 31), band(op, 31)
1015 if altname == "mov" and shf == 0 and imm12 == 0 and (rn == 31 or rd == 31) then
1016 name = altname
1017 x = nil
1018 elseif shf == 0 then
1019 x = imm12
1020 elseif shf == 1 then
1021 x = imm12..", lsl #12"
1022 end
1023 elseif p == "i" then
1024 x = "#0x"..decode_imm13(op)
1025 elseif p == "1" then
1026 immr = band(rshift(op, 16), 63)
1027 x = immr
1028 elseif p == "2" then
1029 x = band(rshift(op, 10), 63)
1030 if altname then
1031 local a1, a2, a3, a4, a5, a6 =
1032 match(altname, "([^|]*)|([^|]*)|([^|]*)|([^|]*)|([^|]*)|(.*)")
1033 local sf = band(rshift(op, 26), 32)
1034 local uns = band(rshift(op, 30), 1)
1035 if prefer_bfx(sf, uns, x, immr) then
1036 name = a2
1037 x = x - immr + 1
1038 elseif immr == 0 and x == 7 then
1039 local n = #operands
1040 operands[n] = nil
1041 if sf ~= 0 then
1042 operands[n-1] = gsub(operands[n-1], "x", "w")
1043 end
1044 last = operands[n-1]
1045 name = a6
1046 x = nil
1047 elseif immr == 0 and x == 15 then
1048 local n = #operands
1049 operands[n] = nil
1050 if sf ~= 0 then
1051 operands[n-1] = gsub(operands[n-1], "x", "w")
1052 end
1053 last = operands[n-1]
1054 name = a5
1055 x = nil
1056 elseif x == 31 or x == 63 then
1057 if x == 31 and immr == 0 and name == "sbfm" then
1058 name = a4
1059 local n = #operands
1060 operands[n] = nil
1061 if sf ~= 0 then
1062 operands[n-1] = gsub(operands[n-1], "x", "w")
1063 end
1064 last = operands[n-1]
1065 else
1066 name = a3
1067 end
1068 x = nil
1069 elseif band(x, 31) ~= 31 and immr == x+1 and name == "ubfm" then
1070 name = a4
1071 last = "#"..(sf+32 - immr)
1072 operands[#operands] = last
1073 x = nil
1074 elseif x < immr then
1075 name = a1
1076 last = "#"..(sf+32 - immr)
1077 operands[#operands] = last
1078 x = x + 1
1079 end
1080 end
1081 elseif p == "3" then
1082 x = band(rshift(op, 10), 63)
1083 if altname then
1084 local a1, a2 = match(altname, "([^|]*)|(.*)")
1085 if x < immr then
1086 name = a1
1087 local sf = band(rshift(op, 26), 32)
1088 last = "#"..(sf+32 - immr)
1089 operands[#operands] = last
1090 x = x + 1
1091 elseif x >= immr then
1092 name = a2
1093 x = x - immr + 1
1094 end
1095 end
1096 elseif p == "4" then
1097 x = band(rshift(op, 10), 63)
1098 local rn = band(rshift(op, 5), 31)
1099 local rm = band(rshift(op, 16), 31)
1100 if altname and rn == rm then
1101 local n = #operands
1102 operands[n] = nil
1103 last = operands[n-1]
1104 name = altname
1105 end
1106 elseif p == "5" then
1107 x = band(rshift(op, 16), 31)
1108 elseif p == "S" then
1109 x = band(rshift(op, 10), 63)
1110 if x == 0 then x = nil
1111 else x = map_shift[band(rshift(op, 22), 3)].." #"..x end
1112 elseif p == "X" then
1113 local opt = band(rshift(op, 13), 7)
1114 -- Width specifier <R>.
1115 if opt ~= 3 and opt ~= 7 then
1116 last = map_regs.w[band(rshift(op, 16), 31)]
1117 operands[#operands] = last
1118 end
1119 x = band(rshift(op, 10), 7)
1120 -- Extension.
1121 if opt == 2 + band(rshift(op, 31), 1) and
1122 band(rshift(op, second0 and 5 or 0), 31) == 31 then
1123 if x == 0 then x = nil
1124 else x = "lsl #"..x end
1125 else
1126 if x == 0 then x = map_extend[band(rshift(op, 13), 7)]
1127 else x = map_extend[band(rshift(op, 13), 7)].." #"..x end
1128 end
1129 elseif p == "R" then
1130 x = band(rshift(op,21), 3)
1131 if x == 0 then x = nil
1132 else x = "lsl #"..x*16 end
1133 elseif p == "z" then
1134 local n = #operands
1135 if operands[n] == "sp" then operands[n] = "xzr"
1136 elseif operands[n] == "wsp" then operands[n] = "wzr"
1137 end
1138 elseif p == "Z" then
1139 x = 0
1140 elseif p == "F" then
1141 x = parse_fpimm8(op)
1142 elseif p == "g" or p == "f" or p == "x" or p == "w" or
1143 p == "d" or p == "s" then
1144 -- These are handled in D/N/M/A.
1145 elseif p == "0" then
1146 if last == "sp" or last == "wsp" then
1147 local n = #operands
1148 operands[n] = nil
1149 last = operands[n-1]
1150 if altname then
1151 local a1, a2 = match(altname, "([^|]*)|(.*)")
1152 if not a1 then
1153 name = altname
1154 elseif second0 then
1155 name, altname = a2, a1
1156 else
1157 name, altname = a1, a2
1158 end
1159 end
1160 end
1161 second0 = true
1162 else
1163 assert(false)
1164 end
1165 if x then
1166 last = x
1167 if type(x) == "number" then x = "#"..x end
1168 operands[#operands+1] = x
1169 end
1170 end
1171
1172 return putop(ctx, name..suffix, operands)
1173end
1174
1175------------------------------------------------------------------------------
1176
1177-- Disassemble a block of code.
1178local function disass_block(ctx, ofs, len)
1179 if not ofs then ofs = 0 end
1180 local stop = len and ofs+len or #ctx.code
1181 ctx.pos = ofs
1182 ctx.rel = nil
1183 while ctx.pos < stop do disass_ins(ctx) end
1184end
1185
1186-- Extended API: create a disassembler context. Then call ctx:disass(ofs, len).
1187local function create(code, addr, out)
1188 local ctx = {}
1189 ctx.code = code
1190 ctx.addr = addr or 0
1191 ctx.out = out or io.write
1192 ctx.symtab = {}
1193 ctx.disass = disass_block
1194 ctx.hexdump = 8
1195 return ctx
1196end
1197
1198-- Simple API: disassemble code (a string) at address and output via out.
1199local function disass(code, addr, out)
1200 create(code, addr, out):disass()
1201end
1202
1203-- Return register name for RID.
1204local function regname(r)
1205 if r < 32 then return map_regs.x[r] end
1206 return map_regs.d[r-32]
1207end
1208
1209-- Public module functions.
1210return {
1211 create = create,
1212 disass = disass,
1213 regname = regname
1214}
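[Annotation, not part of the commit: a minimal check of the new module,
assuming a build that installs it as jit.dis_arm64.]

local dis = require("jit.dis_arm64")
-- "ret" encodes as 0xd65f03c0; code strings are little-endian bytes.
dis.disass("\192\3\95\214", 0)
--> 00000000 d65f03c0 ret   x30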
1215
diff --git a/src/lj_arch.h b/src/lj_arch.h
index cc5a0a66..3df602e3 100644
--- a/src/lj_arch.h
+++ b/src/lj_arch.h
@@ -226,7 +226,6 @@
 #define LJ_TARGET_UNIFYROT	2	/* Want only IR_BROR. */
 #define LJ_TARGET_GC64		1
 #define LJ_ARCH_NUMMODE		LJ_NUMMODE_DUAL
-#define LJ_ARCH_NOJIT		1	/* NYI */
 
 #define LJ_ARCH_VERSION		80
 
diff --git a/src/lj_asm.c b/src/lj_asm.c
index 7ce58924..2cb5abea 100644
--- a/src/lj_asm.c
+++ b/src/lj_asm.c
@@ -171,6 +171,8 @@ IRFLDEF(FLOFS)
 #include "lj_emit_x86.h"
 #elif LJ_TARGET_ARM
 #include "lj_emit_arm.h"
+#elif LJ_TARGET_ARM64
+#include "lj_emit_arm64.h"
 #elif LJ_TARGET_PPC
 #include "lj_emit_ppc.h"
 #elif LJ_TARGET_MIPS
@@ -1563,6 +1565,8 @@ static void asm_loop(ASMState *as)
 #include "lj_asm_x86.h"
 #elif LJ_TARGET_ARM
 #include "lj_asm_arm.h"
+#elif LJ_TARGET_ARM64
+#include "lj_asm_arm64.h"
 #elif LJ_TARGET_PPC
 #include "lj_asm_ppc.h"
 #elif LJ_TARGET_MIPS
diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h
new file mode 100644
index 00000000..0a2f5306
--- /dev/null
+++ b/src/lj_asm_arm64.h
@@ -0,0 +1,1823 @@
1/*
2** ARM64 IR assembler (SSA IR -> machine code).
3** Copyright (C) 2005-2016 Mike Pall. See Copyright Notice in luajit.h
4**
5** Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
6** Sponsored by Cisco Systems, Inc.
7*/
8
9/* -- Register allocator extensions --------------------------------------- */
10
11/* Allocate a register with a hint. */
12static Reg ra_hintalloc(ASMState *as, IRRef ref, Reg hint, RegSet allow)
13{
14 Reg r = IR(ref)->r;
15 if (ra_noreg(r)) {
16 if (!ra_hashint(r) && !iscrossref(as, ref))
17 ra_sethint(IR(ref)->r, hint); /* Propagate register hint. */
18 r = ra_allocref(as, ref, allow);
19 }
20 ra_noweak(as, r);
21 return r;
22}
23
24/* Allocate two source registers for three-operand instructions. */
25static Reg ra_alloc2(ASMState *as, IRIns *ir, RegSet allow)
26{
27 IRIns *irl = IR(ir->op1), *irr = IR(ir->op2);
28 Reg left = irl->r, right = irr->r;
29 if (ra_hasreg(left)) {
30 ra_noweak(as, left);
31 if (ra_noreg(right))
32 right = ra_allocref(as, ir->op2, rset_exclude(allow, left));
33 else
34 ra_noweak(as, right);
35 } else if (ra_hasreg(right)) {
36 ra_noweak(as, right);
37 left = ra_allocref(as, ir->op1, rset_exclude(allow, right));
38 } else if (ra_hashint(right)) {
39 right = ra_allocref(as, ir->op2, allow);
40 left = ra_alloc1(as, ir->op1, rset_exclude(allow, right));
41 } else {
42 left = ra_allocref(as, ir->op1, allow);
43 right = ra_alloc1(as, ir->op2, rset_exclude(allow, left));
44 }
45 return left | (right << 8);
46}
47
48/* -- Guard handling ------------------------------------------------------ */
49
50/* Generate an exit stub group at the bottom of the reserved MCode memory. */
51static MCode *asm_exitstub_gen(ASMState *as, ExitNo group)
52{
53 MCode *mxp = as->mcbot;
54 int i;
55 if (mxp + 3*4+4*EXITSTUBS_PER_GROUP >= as->mctop)
56 asm_mclimit(as);
57 /* str lr, [sp]; bl ->vm_exit_handler; .long group. */
58 *mxp++ = A64I_STRx | A64F_D(RID_LR) | A64F_N(RID_SP);
59 *mxp = A64I_BL | (((MCode *)(void *)lj_vm_exit_handler-mxp)&0x03ffffffu);
60 mxp++;
61 *mxp++ = group*EXITSTUBS_PER_GROUP;
62 for (i = 0; i < EXITSTUBS_PER_GROUP; i++)
63 *mxp++ = A64I_B | ((-3-i)&0x03ffffffu);
64 lj_mcode_sync(as->mcbot, mxp);
65 lj_mcode_commitbot(as->J, mxp);
66 as->mcbot = mxp;
67 as->mclim = as->mcbot + MCLIM_REDZONE;
68 return mxp - EXITSTUBS_PER_GROUP;
69}
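/* Annotation (not part of the commit): the resulting group layout is a
** common head ("str lr, [sp]; bl ->vm_exit_handler" plus a .long with
** the group's first exit number) followed by EXITSTUBS_PER_GROUP
** one-instruction stubs; stub #i is a "b" back to the head, 3+i words
** before it, so stub #i sits at the returned base + 4*i bytes.
*/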
70
71/* Setup all needed exit stubs. */
72static void asm_exitstub_setup(ASMState *as, ExitNo nexits)
73{
74 ExitNo i;
75 if (nexits >= EXITSTUBS_PER_GROUP*LJ_MAX_EXITSTUBGR)
76 lj_trace_err(as->J, LJ_TRERR_SNAPOV);
77 for (i = 0; i < (nexits+EXITSTUBS_PER_GROUP-1)/EXITSTUBS_PER_GROUP; i++)
78 if (as->J->exitstubgroup[i] == NULL)
79 as->J->exitstubgroup[i] = asm_exitstub_gen(as, i);
80}
81
82/* Emit conditional branch to exit for guard. */
83static void asm_guardcc(ASMState *as, A64CC cc)
84{
85 MCode *target = exitstub_addr(as->J, as->snapno);
86 MCode *p = as->mcp;
87 if (LJ_UNLIKELY(p == as->invmcp)) {
88 as->loopinv = 1;
89 *p = A64I_BL | ((target-p) & 0x03ffffffu);
90 emit_cond_branch(as, cc^1, p-1);
91 return;
92 }
93 /* No conditional calls. Emit b.cc/bl instead. */
94 /* That's a bad idea. NYI: emit per-trace exit stubs instead, see PPC. */
95 emit_branch(as, A64I_BL, target);
96 emit_cond_branch(as, cc^1, p);
97}
98
99/* -- Operand fusion ------------------------------------------------------ */
100
101/* Limit linear search to this distance. Avoids O(n^2) behavior. */
102#define CONFLICT_SEARCH_LIM 31
103
104static int asm_isk32(ASMState *as, IRRef ref, int32_t *k)
105{
106 if (irref_isk(ref)) {
107 IRIns *ir = IR(ref);
108 if (ir->o == IR_KNULL || !irt_is64(ir->t)) {
109 *k = ir->i;
110 return 1;
111 } else if (checki32((int64_t)ir_k64(ir)->u64)) {
112 *k = (int32_t)ir_k64(ir)->u64;
113 return 1;
114 }
115 }
116 return 0;
117}
118
119/* Check if there's no conflicting instruction between curins and ref. */
120static int noconflict(ASMState *as, IRRef ref, IROp conflict)
121{
122 IRIns *ir = as->ir;
123 IRRef i = as->curins;
124 if (i > ref + CONFLICT_SEARCH_LIM)
125 return 0; /* Give up, ref is too far away. */
126 while (--i > ref)
127 if (ir[i].o == conflict)
128 return 0; /* Conflict found. */
129 return 1; /* Ok, no conflict. */
130}
131
132/* Fuse the array base of colocated arrays. */
133static int32_t asm_fuseabase(ASMState *as, IRRef ref)
134{
135 IRIns *ir = IR(ref);
136 if (ir->o == IR_TNEW && ir->op1 <= LJ_MAX_COLOSIZE &&
137 !neverfuse(as) && noconflict(as, ref, IR_NEWREF))
138 return (int32_t)sizeof(GCtab);
139 return 0;
140}
141
142#define FUSE_REG 0x40000000
143
144/* Fuse array/hash/upvalue reference into register+offset operand. */
145static Reg asm_fuseahuref(ASMState *as, IRRef ref, int32_t *ofsp, RegSet allow,
146 A64Ins ins)
147{
148 IRIns *ir = IR(ref);
149 if (ra_noreg(ir->r)) {
150 if (ir->o == IR_AREF) {
151 if (mayfuse(as, ref)) {
152 if (irref_isk(ir->op2)) {
153 IRRef tab = IR(ir->op1)->op1;
154 int32_t ofs = asm_fuseabase(as, tab);
155 IRRef refa = ofs ? tab : ir->op1;
156 ofs += 8*IR(ir->op2)->i;
157 if (emit_checkofs(ins, ofs)) {
158 *ofsp = ofs;
159 return ra_alloc1(as, refa, allow);
160 }
161 } else {
162 Reg base = ra_alloc1(as, ir->op1, allow);
163 *ofsp = FUSE_REG|ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, base));
164 return base;
165 }
166 }
167 } else if (ir->o == IR_HREFK) {
168 if (mayfuse(as, ref)) {
169 int32_t ofs = (int32_t)(IR(ir->op2)->op2 * sizeof(Node));
170 if (emit_checkofs(ins, ofs)) {
171 *ofsp = ofs;
172 return ra_alloc1(as, ir->op1, allow);
173 }
174 }
175 } else if (ir->o == IR_UREFC) {
176 if (irref_isk(ir->op1)) {
177 GCfunc *fn = ir_kfunc(IR(ir->op1));
178 GCupval *uv = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv;
179 int64_t ofs = glofs(as, &uv->tv);
180 if (emit_checkofs(ins, ofs)) {
181 *ofsp = (int32_t)ofs;
182 return RID_GL;
183 }
184 }
185 }
186 }
187 *ofsp = 0;
188 return ra_alloc1(as, ref, allow);
189}
190
191/* Fuse m operand into arithmetic/logic instructions. */
192static uint32_t asm_fuseopm(ASMState *as, A64Ins ai, IRRef ref, RegSet allow)
193{
194 IRIns *ir = IR(ref);
195 if (ra_hasreg(ir->r)) {
196 ra_noweak(as, ir->r);
197 return A64F_M(ir->r);
198 } else if (irref_isk(ref)) {
199 uint32_t m;
200 int64_t k = get_k64val(ir);
201 if ((ai & 0x1f000000) == 0x0a000000)
202 m = emit_isk13(k, irt_is64(ir->t));
203 else
204 m = emit_isk12(k);
205 if (m)
206 return m;
207 } else if (mayfuse(as, ref)) {
208 if ((ir->o >= IR_BSHL && ir->o <= IR_BSAR && irref_isk(ir->op2)) ||
209 (ir->o == IR_ADD && ir->op1 == ir->op2)) {
210 A64Shift sh = ir->o == IR_BSHR ? A64SH_LSR :
211 ir->o == IR_BSAR ? A64SH_ASR : A64SH_LSL;
212 int shift = ir->o == IR_ADD ? 1 :
213 (IR(ir->op2)->i & (irt_is64(ir->t) ? 63 : 31));
214 IRIns *irl = IR(ir->op1);
215 if (sh == A64SH_LSL &&
216 irl->o == IR_CONV &&
217 irl->op2 == ((IRT_I64<<IRCONV_DSH)|IRT_INT|IRCONV_SEXT) &&
218 shift <= 4 &&
219 mayfuse(as, ir->op1)) {
220 Reg m = ra_alloc1(as, irl->op1, allow);
221 return A64F_M(m) | A64F_EXSH(A64EX_SXTW, shift);
222 } else {
223 Reg m = ra_alloc1(as, ir->op1, allow);
224 return A64F_M(m) | A64F_SH(sh, shift);
225 }
226 } else if (ir->o == IR_CONV &&
227 ir->op2 == ((IRT_I64<<IRCONV_DSH)|IRT_INT|IRCONV_SEXT)) {
228 Reg m = ra_alloc1(as, ir->op1, allow);
229 return A64F_M(m) | A64F_EX(A64EX_SXTW);
230 }
231 }
232 return A64F_M(ra_allocref(as, ref, allow));
233}
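/* Annotation: the 0x0a000000 test above picks out the logical-op
** encodings, which take a K13 bitmask immediate; add/sub take a K12
** immediate instead: 12 bits, optionally shifted left by 12. A sketch
** of the K12 range (fits_k12 is a hypothetical helper, illustration
** only):
**
**   static int fits_k12(uint64_t k)
**   {
**     return k < 0x1000 || ((k & 0xfff) == 0 && k < 0x1000000);
**   }
*/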
234
235/* Fuse XLOAD/XSTORE reference into load/store operand. */
236static void asm_fusexref(ASMState *as, A64Ins ai, Reg rd, IRRef ref,
237 RegSet allow)
238{
239 IRIns *ir = IR(ref);
240 Reg base;
241 int32_t ofs = 0;
242 if (ra_noreg(ir->r) && canfuse(as, ir)) {
243 if (ir->o == IR_ADD) {
244 if (asm_isk32(as, ir->op2, &ofs) && emit_checkofs(ai, ofs))
245 ref = ir->op1;
246 /* NYI: Fuse add with two registers. */
247 } else if (ir->o == IR_STRREF) {
248 if (asm_isk32(as, ir->op2, &ofs)) {
249 ref = ir->op1;
250 } else if (asm_isk32(as, ir->op1, &ofs)) {
251 ref = ir->op2;
252 } else {
253 /* NYI: Fuse ADD with constant. */
254 Reg rn = ra_alloc1(as, ir->op1, allow);
255 uint32_t m = asm_fuseopm(as, 0, ir->op2, rset_exclude(allow, rn));
256 emit_lso(as, ai, rd, rd, sizeof(GCstr));
257 emit_dn(as, A64I_ADDx^m, rd, rn);
258 return;
259 }
260 ofs += sizeof(GCstr);
261 if (!emit_checkofs(ai, ofs)) {
262 Reg rn = ra_alloc1(as, ref, allow);
263 Reg rm = ra_allock(as, ofs, rset_exclude(allow, rn));
264 emit_dnm(as, (ai ^ 0x01204800), rd, rn, rm);
265 return;
266 }
267 }
268 }
269 base = ra_alloc1(as, ref, allow);
270 emit_lso(as, ai, (rd & 31), base, ofs);
271}
272
273/* -- Calls --------------------------------------------------------------- */
274
275/* Generate a call to a C function. */
276static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
277{
278 uint32_t n, nargs = CCI_XNARGS(ci);
279 int32_t ofs = 0;
280 Reg gpr, fpr = REGARG_FIRSTFPR;
281 if ((void *)ci->func)
282 emit_call(as, (void *)ci->func);
283 for (gpr = REGARG_FIRSTGPR; gpr <= REGARG_LASTGPR; gpr++)
284 as->cost[gpr] = REGCOST(~0u, ASMREF_L);
285 gpr = REGARG_FIRSTGPR;
286 for (n = 0; n < nargs; n++) { /* Setup args. */
287 IRRef ref = args[n];
288 IRIns *ir = IR(ref);
289 if (ref) {
290 if (irt_isfp(ir->t)) {
291 if (fpr <= REGARG_LASTFPR) {
292 lua_assert(rset_test(as->freeset, fpr)); /* Must have been evicted. */
293 ra_leftov(as, fpr, ref);
294 fpr++;
295 } else {
296 Reg r = ra_alloc1(as, ref, RSET_FPR);
297 emit_spstore(as, ir, r, ofs);
298 ofs += 8;
299 }
300 } else {
301 if (gpr <= REGARG_LASTGPR) {
302 lua_assert(rset_test(as->freeset, gpr)); /* Must have been evicted. */
303 ra_leftov(as, gpr, ref);
304 gpr++;
305 } else {
306 Reg r = ra_alloc1(as, ref, RSET_GPR);
307 emit_spstore(as, ir, r, ofs);
308 ofs += 8;
309 }
310 }
311 }
312 }
313}
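/* Annotation: this follows the AAPCS64 convention as expressed by the
** REGARG_* constants: FP arguments go to d0-d7, integer/pointer
** arguments to x0-x7, and any overflow to consecutive 8-byte stack
** slots at ofs. */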
314
315/* Setup result reg/sp for call. Evict scratch regs. */
316static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci)
317{
318 RegSet drop = RSET_SCRATCH;
319 if (ra_hasreg(ir->r))
320 rset_clear(drop, ir->r); /* Dest reg handled below. */
321 ra_evictset(as, drop); /* Evictions must be performed first. */
322 if (ra_used(ir)) {
323 lua_assert(!irt_ispri(ir->t));
324 if (irt_isfp(ir->t)) {
325 if (ci->flags & CCI_CASTU64) {
326 Reg dest = ra_dest(as, ir, RSET_FPR) & 31;
327 emit_dn(as, irt_isnum(ir->t) ? A64I_FMOV_D_R : A64I_FMOV_S_R,
328 dest, RID_RET);
329 } else {
330 ra_destreg(as, ir, RID_FPRET);
331 }
332 } else {
333 ra_destreg(as, ir, RID_RET);
334 }
335 }
336 UNUSED(ci);
337}
338
339static void asm_callx(ASMState *as, IRIns *ir)
340{
341 IRRef args[CCI_NARGS_MAX*2];
342 CCallInfo ci;
343 IRRef func;
344 IRIns *irf;
345 ci.flags = asm_callx_flags(as, ir);
346 asm_collectargs(as, ir, &ci, args);
347 asm_setupresult(as, ir, &ci);
348 func = ir->op2; irf = IR(func);
349 if (irf->o == IR_CARG) { func = irf->op1; irf = IR(func); }
350 if (irref_isk(func)) { /* Call to constant address. */
351 ci.func = (ASMFunction)(ir_k64(irf)->u64);
352 } else { /* Need a non-argument register for indirect calls. */
353 Reg freg = ra_alloc1(as, func, RSET_RANGE(RID_X8, RID_MAX_GPR)-RSET_FIXED);
354 emit_n(as, A64I_BLR, freg);
355 ci.func = (ASMFunction)(void *)0;
356 }
357 asm_gencall(as, &ci, args);
358}
359
360/* -- Returns ------------------------------------------------------------- */
361
362/* Return to lower frame. Guard that it goes to the right spot. */
363static void asm_retf(ASMState *as, IRIns *ir)
364{
365 Reg base = ra_alloc1(as, REF_BASE, RSET_GPR);
366 void *pc = ir_kptr(IR(ir->op2));
367 int32_t delta = 1+LJ_FR2+bc_a(*((const BCIns *)pc - 1));
368 as->topslot -= (BCReg)delta;
369 if ((int32_t)as->topslot < 0) as->topslot = 0;
370 irt_setmark(IR(REF_BASE)->t); /* Children must not coalesce with BASE reg. */
371 /* Need to force a spill on REF_BASE now to update the stack slot. */
372 emit_lso(as, A64I_STRx, base, RID_SP, ra_spill(as, IR(REF_BASE)));
373 emit_setgl(as, base, jit_base);
374 emit_addptr(as, base, -8*delta);
375 asm_guardcc(as, CC_NE);
376 emit_nm(as, A64I_CMPx, RID_TMP,
377 ra_allock(as, i64ptr(pc), rset_exclude(RSET_GPR, base)));
378 emit_lso(as, A64I_LDRx, RID_TMP, base, -8);
379}
380
381/* -- Type conversions ---------------------------------------------------- */
382
383static void asm_tointg(ASMState *as, IRIns *ir, Reg left)
384{
385 Reg tmp = ra_scratch(as, rset_exclude(RSET_FPR, left));
386 Reg dest = ra_dest(as, ir, RSET_GPR);
387 asm_guardcc(as, CC_NE);
388 emit_nm(as, A64I_FCMPd, (tmp & 31), (left & 31));
389 emit_dn(as, A64I_FCVT_F64_S32, (tmp & 31), dest);
390 emit_dn(as, A64I_FCVT_S32_F64, dest, (left & 31));
391}
392
393static void asm_tobit(ASMState *as, IRIns *ir)
394{
395 RegSet allow = RSET_FPR;
396 Reg left = ra_alloc1(as, ir->op1, allow);
397 Reg right = ra_alloc1(as, ir->op2, rset_clear(allow, left));
398 Reg tmp = ra_scratch(as, rset_clear(allow, right));
399 Reg dest = ra_dest(as, ir, RSET_GPR);
400 emit_dn(as, A64I_FMOV_R_S, dest, (tmp & 31));
401 emit_dnm(as, A64I_FADDd, (tmp & 31), (left & 31), (right & 31));
402}
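/* Annotation: ir->op2 of TOBIT is the bias constant 2^52+2^51; adding
** it shifts the integer part of the double into the low mantissa bits,
** which the fmov w,s then extracts. A C model of the trick (sketch,
** illustration only):
**
**   union { double d; uint64_t u; } u;
**   u.d = n + 6755399441055744.0;   (2^52 + 2^51)
**   return (int32_t)u.u;            (low 32 bits of the mantissa)
*/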
403
404static void asm_conv(ASMState *as, IRIns *ir)
405{
406 IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK);
407 int st64 = (st == IRT_I64 || st == IRT_U64 || st == IRT_P64);
408 int stfp = (st == IRT_NUM || st == IRT_FLOAT);
409 IRRef lref = ir->op1;
410 lua_assert(irt_type(ir->t) != st);
411 if (irt_isfp(ir->t)) {
412 Reg dest = ra_dest(as, ir, RSET_FPR);
413 if (stfp) { /* FP to FP conversion. */
414 emit_dn(as, st == IRT_NUM ? A64I_FCVT_F32_F64 : A64I_FCVT_F64_F32,
415 (dest & 31), (ra_alloc1(as, lref, RSET_FPR) & 31));
416 } else { /* Integer to FP conversion. */
417 Reg left = ra_alloc1(as, lref, RSET_GPR);
418 A64Ins ai = irt_isfloat(ir->t) ?
419 (((IRT_IS64 >> st) & 1) ?
420 (st == IRT_I64 ? A64I_FCVT_F32_S64 : A64I_FCVT_F32_U64) :
421 (st == IRT_INT ? A64I_FCVT_F32_S32 : A64I_FCVT_F32_U32)) :
422 (((IRT_IS64 >> st) & 1) ?
423 (st == IRT_I64 ? A64I_FCVT_F64_S64 : A64I_FCVT_F64_U64) :
424 (st == IRT_INT ? A64I_FCVT_F64_S32 : A64I_FCVT_F64_U32));
425 emit_dn(as, ai, (dest & 31), left);
426 }
427 } else if (stfp) { /* FP to integer conversion. */
428 if (irt_isguard(ir->t)) {
429 /* Checked conversions are only supported from number to int. */
430 lua_assert(irt_isint(ir->t) && st == IRT_NUM);
431 asm_tointg(as, ir, ra_alloc1(as, lref, RSET_FPR));
432 } else {
433 Reg left = ra_alloc1(as, lref, RSET_FPR);
434 Reg dest = ra_dest(as, ir, RSET_GPR);
435 A64Ins ai = irt_is64(ir->t) ?
436 (st == IRT_NUM ?
437 (irt_isi64(ir->t) ? A64I_FCVT_S64_F64 : A64I_FCVT_U64_F64) :
438 (irt_isi64(ir->t) ? A64I_FCVT_S64_F32 : A64I_FCVT_U64_F32)) :
439 (st == IRT_NUM ?
440 (irt_isint(ir->t) ? A64I_FCVT_S32_F64 : A64I_FCVT_U32_F64) :
441 (irt_isint(ir->t) ? A64I_FCVT_S32_F32 : A64I_FCVT_U32_F32));
442 emit_dn(as, ai, dest, (left & 31));
443 }
444 } else if (st >= IRT_I8 && st <= IRT_U16) { /* Extend to 32 bit integer. */
445 Reg dest = ra_dest(as, ir, RSET_GPR);
446 Reg left = ra_alloc1(as, lref, RSET_GPR);
447 A64Ins ai = st == IRT_I8 ? A64I_SXTBw :
448 st == IRT_U8 ? A64I_UXTBw :
449 st == IRT_I16 ? A64I_SXTHw : A64I_UXTHw;
450 lua_assert(irt_isint(ir->t) || irt_isu32(ir->t));
451 emit_dn(as, ai, dest, left);
452 } else {
453 Reg dest = ra_dest(as, ir, RSET_GPR);
454 if (irt_is64(ir->t)) {
455 if (st64 || !(ir->op2 & IRCONV_SEXT)) {
456 /* 64/64 bit no-op (cast) or 32 to 64 bit zero extension. */
457 ra_leftov(as, dest, lref); /* Do nothing, but may need to move regs. */
458 } else { /* 32 to 64 bit sign extension. */
459 Reg left = ra_alloc1(as, lref, RSET_GPR);
460 emit_dn(as, A64I_SXTW, dest, left);
461 }
462 } else {
463 if (st64) {
464 /* This is either a 32 bit reg/reg mov which zeroes the hiword
465 ** or a load of the loword from a 64 bit address.
466 */
467 Reg left = ra_alloc1(as, lref, RSET_GPR);
468 emit_dm(as, A64I_MOVw, dest, left);
469 } else { /* 32/32 bit no-op (cast). */
470 ra_leftov(as, dest, lref); /* Do nothing, but may need to move regs. */
471 }
472 }
473 }
474}
475
476static void asm_strto(ASMState *as, IRIns *ir)
477{
478 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_strscan_num];
479 IRRef args[2];
480 Reg dest = 0, tmp;
481 int destused = ra_used(ir);
482 int32_t ofs = 0;
483 ra_evictset(as, RSET_SCRATCH);
484 if (destused) {
485 if (ra_hasspill(ir->s)) {
486 ofs = sps_scale(ir->s);
487 destused = 0;
488 if (ra_hasreg(ir->r)) {
489 ra_free(as, ir->r);
490 ra_modified(as, ir->r);
491 emit_spload(as, ir, ir->r, ofs);
492 }
493 } else {
494 dest = ra_dest(as, ir, RSET_FPR);
495 }
496 }
497 asm_guardcc(as, CC_EQ);
498 if (destused)
499 emit_lso(as, A64I_LDRd, (dest & 31), RID_SP, 0);
500 emit_n(as, (A64I_CMPw^A64I_K12)|A64F_U12(0), RID_RET);
501 args[0] = ir->op1; /* GCstr *str */
502 args[1] = ASMREF_TMP1; /* TValue *n */
503 asm_gencall(as, ci, args);
504 tmp = ra_releasetmp(as, ASMREF_TMP1);
505 emit_opk(as, A64I_ADDx, tmp, RID_SP, ofs, RSET_GPR);
506}
507
508/* -- Memory references --------------------------------------------------- */
509
510/* Get pointer to TValue. */
511static void asm_tvptr(ASMState *as, Reg dest, IRRef ref)
512{
513 IRIns *ir = IR(ref);
514 if (irt_isnum(ir->t)) {
515 if (irref_isk(ref)) {
516 /* Use the number constant itself as a TValue. */
517 ra_allockreg(as, i64ptr(ir_knum(ir)), dest);
518 } else {
519 /* Otherwise force a spill and use the spill slot. */
520 emit_opk(as, A64I_ADDx, dest, RID_SP, ra_spill(as, ir), RSET_GPR);
521 }
522 } else {
523 /* Otherwise use g->tmptv to hold the TValue. */
524 RegSet allow = rset_exclude(RSET_GPR, dest);
525 Reg src;
526 if (irref_isk(ref)) {
527 TValue k;
528 lj_ir_kvalue(as->J->L, &k, ir);
529 src = ra_allock(as, k.u64, allow);
530 emit_lso(as, A64I_STRx, src, dest, 0);
531 } else {
532 Reg type;
533 if (irt_ispri(ir->t)) {
534 src = ra_allock(as, ~((int64_t)~irt_toitype(ir->t) << 47), allow);
535 emit_lso(as, A64I_STRx, src, dest, 0);
536 } else if (irt_isint(ir->t)) {
537 src = ra_alloc1(as, ref, allow);
538 type = ra_allock(as, (int64_t)irt_toitype(ir->t) << 47, allow);
539 emit_lso(as, A64I_STRx, RID_TMP, dest, 0);
540 emit_dnm(as, A64I_ADDx | A64F_EX(A64EX_UXTW), RID_TMP, type, src);
541 } else {
542 src = ra_alloc1(as, ref, allow);
543 type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow);
544 emit_lso(as, A64I_STRx, RID_TMP, dest, 0);
545 emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 47), RID_TMP, src, type);
546 }
547 }
548 ra_allockreg(as, i64ptr(&J2G(as->J)->tmptv), dest);
549 }
550}
551
552static void asm_aref(ASMState *as, IRIns *ir)
553{
554 Reg dest = ra_dest(as, ir, RSET_GPR);
555 Reg idx, base;
556 if (irref_isk(ir->op2)) {
557 IRRef tab = IR(ir->op1)->op1;
558 int32_t ofs = asm_fuseabase(as, tab);
559 IRRef refa = ofs ? tab : ir->op1;
560 uint32_t k = emit_isk12(ofs + 8*IR(ir->op2)->i);
561 if (k) {
562 base = ra_alloc1(as, refa, RSET_GPR);
563 emit_dn(as, A64I_ADDx^k, dest, base);
564 return;
565 }
566 }
567 base = ra_alloc1(as, ir->op1, RSET_GPR);
568 idx = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, base));
569 emit_dnm(as, A64I_ADDx | A64F_EXSH(A64EX_UXTW, 3), dest, base, idx);
570}
571
572/* Inlined hash lookup. Specialized for key type and for const keys.
573** The equivalent C code is:
574** Node *n = hashkey(t, key);
575** do {
576** if (lj_obj_equal(&n->key, key)) return &n->val;
577** } while ((n = nextnode(n)));
578** return niltv(L);
579*/
580static void asm_href(ASMState *as, IRIns *ir, IROp merge)
581{
582 RegSet allow = RSET_GPR;
583 int destused = ra_used(ir);
584 Reg dest = ra_dest(as, ir, allow);
585 Reg tab = ra_alloc1(as, ir->op1, rset_clear(allow, dest));
586 Reg key = 0, tmp = RID_TMP;
587 IRRef refkey = ir->op2;
588 IRIns *irkey = IR(refkey);
589 int isk = irref_isk(ir->op2);
590 IRType1 kt = irkey->t;
591 uint32_t k = 0;
592 uint32_t khash;
593 MCLabel l_end, l_loop, l_next;
594 rset_clear(allow, tab);
595
596 if (!isk) {
597 key = ra_alloc1(as, ir->op2, irt_isnum(kt) ? RSET_FPR : allow);
598 rset_clear(allow, key);
599 if (!irt_isstr(kt)) {
600 tmp = ra_scratch(as, allow);
601 rset_clear(allow, tmp);
602 }
603 } else if (irt_isnum(kt)) {
604 int64_t val = (int64_t)ir_knum(irkey)->u64;
605 if (!(k = emit_isk12(val))) {
606 key = ra_allock(as, val, allow);
607 rset_clear(allow, key);
608 }
609 } else if (!irt_ispri(kt)) {
610 if (!(k = emit_isk12(irkey->i))) {
611 key = ra_alloc1(as, refkey, allow);
612 rset_clear(allow, key);
613 }
614 }
615
616 /* Key not found in chain: jump to exit (if merged) or load niltv. */
617 l_end = emit_label(as);
618 as->invmcp = NULL;
619 if (merge == IR_NE)
620 asm_guardcc(as, CC_AL);
621 else if (destused)
622 emit_loada(as, dest, niltvg(J2G(as->J)));
623
624 /* Follow hash chain until the end. */
625 l_loop = --as->mcp;
626 emit_n(as, A64I_CMPx^A64I_K12^0, dest);
627 emit_lso(as, A64I_LDRx, dest, dest, offsetof(Node, next));
628 l_next = emit_label(as);
629
630 /* Type and value comparison. */
631 if (merge == IR_EQ)
632 asm_guardcc(as, CC_EQ);
633 else
634 emit_cond_branch(as, CC_EQ, l_end);
635
636 if (irt_isnum(kt)) {
637 if (isk) {
638 /* Assumes -0.0 is already canonicalized to +0.0. */
639 if (k)
640 emit_n(as, A64I_CMPx^k, tmp);
641 else
642 emit_nm(as, A64I_CMPx, key, tmp);
643 emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key.u64));
644 } else {
645 Reg tisnum = ra_allock(as, LJ_TISNUM << 15, allow);
646 Reg ftmp = ra_scratch(as, rset_exclude(RSET_FPR, key));
647 rset_clear(allow, tisnum);
648 emit_nm(as, A64I_FCMPd, key, ftmp);
649 emit_dn(as, A64I_FMOV_D_R, (ftmp & 31), (tmp & 31));
650 emit_cond_branch(as, CC_LO, l_next);
651 emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32), tisnum, tmp);
652 emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key.n));
653 }
654 } else if (irt_isaddr(kt)) {
655 Reg scr;
656 if (isk) {
657 int64_t kk = ((int64_t)irt_toitype(irkey->t) << 47) | irkey[1].tv.u64;
658 scr = ra_allock(as, kk, allow);
659 emit_nm(as, A64I_CMPx, scr, tmp);
660 emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key.u64));
661 } else {
662 scr = ra_scratch(as, allow);
663 emit_nm(as, A64I_CMPx, tmp, scr);
664 emit_lso(as, A64I_LDRx, scr, dest, offsetof(Node, key.u64));
665 }
666 rset_clear(allow, scr);
667 } else {
668 Reg type, scr;
669 lua_assert(irt_ispri(kt) && !irt_isnil(kt));
670 type = ra_allock(as, ~((int64_t)~irt_toitype(kt) << 47), allow);
671 scr = ra_scratch(as, rset_clear(allow, type));
672 rset_clear(allow, scr);
673 emit_nm(as, A64I_CMPx, scr, type);
674 emit_lso(as, A64I_LDRx, scr, dest, offsetof(Node, key));
675 }
676
677 *l_loop = A64I_BCC | A64F_S19((as->mcp-l_loop) & 0x0007ffffu) | CC_NE;
678 if (!isk && irt_isaddr(kt)) {
679 Reg type = ra_allock(as, (int32_t)irt_toitype(kt), allow);
680 emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 47), tmp, key, type);
681 rset_clear(allow, type);
682 }
683 /* Load main position relative to tab->node into dest. */
684 khash = isk ? ir_khash(irkey) : 1;
685 if (khash == 0) {
686 emit_lso(as, A64I_LDRx, dest, tab, offsetof(GCtab, node));
687 } else {
688 emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 3), dest, tmp, dest);
689 emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 1), dest, dest, dest);
690 emit_lso(as, A64I_LDRx, tmp, tab, offsetof(GCtab, node));
691 if (isk) {
692 Reg tmphash = ra_allock(as, khash, allow);
693 emit_dnm(as, A64I_ANDw, dest, dest, tmphash);
694 emit_lso(as, A64I_LDRw, dest, tab, offsetof(GCtab, hmask));
695 } else if (irt_isstr(kt)) {
696 /* Fetch of str->hash is cheaper than ra_allock. */
697 emit_dnm(as, A64I_ANDw, dest, dest, tmp);
698 emit_lso(as, A64I_LDRw, tmp, key, offsetof(GCstr, hash));
699 emit_lso(as, A64I_LDRw, dest, tab, offsetof(GCtab, hmask));
700 } else { /* Must match with hash*() in lj_tab.c. */
701 emit_dnm(as, A64I_ANDw, dest, dest, tmp);
702 emit_lso(as, A64I_LDRw, tmp, tab, offsetof(GCtab, hmask));
703 emit_dnm(as, A64I_SUBw, dest, dest, tmp);
704 emit_dnm(as, A64I_EXTRw | (A64F_IMMS(32-HASH_ROT3)), tmp, tmp, tmp);
705 emit_dnm(as, A64I_EORw, dest, dest, tmp);
706 emit_dnm(as, A64I_EXTRw | (A64F_IMMS(32-HASH_ROT2)), dest, dest, dest);
707 emit_dnm(as, A64I_SUBw, tmp, tmp, dest);
708 emit_dnm(as, A64I_EXTRw | (A64F_IMMS(32-HASH_ROT1)), dest, dest, dest);
709 emit_dnm(as, A64I_EORw, tmp, tmp, dest);
710 if (irt_isnum(kt)) {
711 emit_dnm(as, A64I_ADDw, dest, dest, dest);
712 emit_dn(as, A64I_LSRx | A64F_IMMR(32)|A64F_IMMS(32), dest, dest);
713 emit_dm(as, A64I_MOVw, tmp, dest);
714 emit_dn(as, A64I_FMOV_R_D, dest, (key & 31));
715 } else {
716 checkmclim(as);
717 emit_dm(as, A64I_MOVw, tmp, key);
718 emit_dnm(as, A64I_EORw, dest, dest,
719 ra_allock(as, irt_toitype(kt) << 15, allow));
720 emit_dn(as, A64I_LSRx | A64F_IMMR(32)|A64F_IMMS(32), dest, dest);
721 emit_dm(as, A64I_MOVx, dest, key);
722 }
723 }
724 }
725}
726
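/* Editor's note: a minimal C sketch (not part of the commit) of the hash
** mixing the generic branch above emits, mirroring hashrot() in lj_tab.c
** (lj_rol and the HASH_ROT* constants are assumed from lj_def.h/lj_tab.c).
** The backend expresses each rotate-left by k as EXTR with imms = 32-k.
*/
static uint32_t sketch_hashrot(uint32_t lo, uint32_t hi)
{
  lo ^= hi; hi = lj_rol(hi, HASH_ROT1);
  lo -= hi; hi = lj_rol(hi, HASH_ROT2);
  hi ^= lo; hi -= lj_rol(lo, HASH_ROT3);
  return hi;  /* Masked with t->hmask to index the node array. */
}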
727static void asm_hrefk(ASMState *as, IRIns *ir)
728{
729 IRIns *kslot = IR(ir->op2);
730 IRIns *irkey = IR(kslot->op1);
731 int32_t ofs = (int32_t)(kslot->op2 * sizeof(Node));
732 int32_t kofs = ofs + (int32_t)offsetof(Node, key);
733 int bigofs = !emit_checkofs(A64I_LDRx, ofs);
734 RegSet allow = RSET_GPR;
735 Reg dest = (ra_used(ir) || bigofs) ? ra_dest(as, ir, RSET_GPR) : RID_NONE;
736 Reg node = ra_alloc1(as, ir->op1, allow);
737 Reg key = ra_scratch(as, rset_clear(allow, node));
738 Reg idx = node;
739 uint64_t k;
740 lua_assert(ofs % sizeof(Node) == 0);
741 rset_clear(allow, key);
742 if (bigofs) {
743 idx = dest;
744 rset_clear(allow, dest);
745 kofs = (int32_t)offsetof(Node, key);
746 } else if (ra_hasreg(dest)) {
747 emit_opk(as, A64I_ADDx, dest, node, ofs, allow);
748 }
749 asm_guardcc(as, CC_NE);
750 if (irt_ispri(irkey->t)) {
751 k = ~((int64_t)~irt_toitype(irkey->t) << 47);
752 } else if (irt_isnum(irkey->t)) {
753 k = ir_knum(irkey)->u64;
754 } else {
755 k = ((uint64_t)irt_toitype(irkey->t) << 47) | (uint64_t)ir_kgc(irkey);
756 }
757 emit_nm(as, A64I_CMPx, key, ra_allock(as, k, allow));
758 emit_lso(as, A64I_LDRx, key, idx, kofs);
759 if (bigofs)
760 emit_opk(as, A64I_ADDx, dest, node, ofs, RSET_GPR);
761}
762
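/* Editor's note: a minimal sketch (not part of the commit) of the LJ_GC64
** value tagging these compares rely on. itype is a small negative number
** (e.g. ~LJ_TSTR), so shifting it into the top 17 bits sign-fills them;
** ASR #47 recovers it and AND with LJ_GCVMASK keeps the low 47 bits:
**
**   uint64_t tv = ((uint64_t)(int64_t)itype << 47) | gcptr;
**   int32_t ty  = (int32_t)((int64_t)tv >> 47);   /* cf. A64I_ASRx above */
**   uint64_t p  = tv & LJ_GCVMASK;                /* low 47 bits */
**
** Primitive keys instead store ~((int64_t)~itype << 47), the constant
** asm_hrefk materializes above.
*/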
763static void asm_uref(ASMState *as, IRIns *ir)
764{
765 Reg dest = ra_dest(as, ir, RSET_GPR);
766 if (irref_isk(ir->op1)) {
767 GCfunc *fn = ir_kfunc(IR(ir->op1));
768 MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v;
769 emit_lsptr(as, A64I_LDRx, dest, v);
770 } else {
771 Reg uv = ra_scratch(as, RSET_GPR);
772 Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
773 if (ir->o == IR_UREFC) {
774 asm_guardcc(as, CC_NE);
775 emit_n(as, (A64I_CMPx^A64I_K12) | A64F_U12(1), RID_TMP);
776 emit_opk(as, A64I_ADDx, dest, uv,
777 (int32_t)offsetof(GCupval, tv), RSET_GPR);
778 emit_lso(as, A64I_LDRB, RID_TMP, uv, (int32_t)offsetof(GCupval, closed));
779 } else {
780 emit_lso(as, A64I_LDRx, dest, uv, (int32_t)offsetof(GCupval, v));
781 }
782 emit_lso(as, A64I_LDRx, uv, func,
783 (int32_t)offsetof(GCfuncL, uvptr) + 8*(int32_t)(ir->op2 >> 8));
784 }
785}
786
787static void asm_fref(ASMState *as, IRIns *ir)
788{
789 UNUSED(as); UNUSED(ir);
790 lua_assert(!ra_used(ir));
791}
792
793static void asm_strref(ASMState *as, IRIns *ir)
794{
795 RegSet allow = RSET_GPR;
796 Reg dest = ra_dest(as, ir, allow);
797 Reg base = ra_alloc1(as, ir->op1, allow);
798 IRIns *irr = IR(ir->op2);
799 int32_t ofs = sizeof(GCstr);
800 uint32_t m;
801 rset_clear(allow, base);
802 if (irref_isk(ir->op2) && (m = emit_isk12(ofs + irr->i))) {
803 emit_dn(as, A64I_ADDx^m, dest, base);
804 } else {
805 emit_dn(as, (A64I_ADDx^A64I_K12) | A64F_U12(ofs), dest, dest);
806 emit_dnm(as, A64I_ADDx, dest, base, ra_alloc1(as, ir->op2, allow));
807 }
808}
809
810/* -- Loads and stores ---------------------------------------------------- */
811
812static A64Ins asm_fxloadins(IRIns *ir)
813{
814 switch (irt_type(ir->t)) {
815 case IRT_I8: return A64I_LDRB ^ A64I_LS_S;
816 case IRT_U8: return A64I_LDRB;
817 case IRT_I16: return A64I_LDRH ^ A64I_LS_S;
818 case IRT_U16: return A64I_LDRH;
819 case IRT_NUM: return A64I_LDRd;
820 case IRT_FLOAT: return A64I_LDRs;
821 default: return irt_is64(ir->t) ? A64I_LDRx : A64I_LDRw;
822 }
823}
824
825static A64Ins asm_fxstoreins(IRIns *ir)
826{
827 switch (irt_type(ir->t)) {
828 case IRT_I8: case IRT_U8: return A64I_STRB;
829 case IRT_I16: case IRT_U16: return A64I_STRH;
830 case IRT_NUM: return A64I_STRd;
831 case IRT_FLOAT: return A64I_STRs;
832 default: return irt_is64(ir->t) ? A64I_STRx : A64I_STRw;
833 }
834}
835
836static void asm_fload(ASMState *as, IRIns *ir)
837{
838 Reg dest = ra_dest(as, ir, RSET_GPR);
839 Reg idx;
840 A64Ins ai = asm_fxloadins(ir);
841 int32_t ofs;
842 if (ir->op1 == REF_NIL) {
843 idx = RID_GL;
844 ofs = (ir->op2 << 2) - GG_OFS(g);
845 } else {
846 idx = ra_alloc1(as, ir->op1, RSET_GPR);
847 if (ir->op2 == IRFL_TAB_ARRAY) {
848 ofs = asm_fuseabase(as, ir->op1);
849 if (ofs) { /* Turn the t->array load into an add for colocated arrays. */
850 emit_dn(as, (A64I_ADDx^A64I_K12) | A64F_U12(ofs), dest, idx);
851 return;
852 }
853 }
854 ofs = field_ofs[ir->op2];
855 }
856 emit_lso(as, ai, (dest & 31), idx, ofs);
857}
858
859static void asm_fstore(ASMState *as, IRIns *ir)
860{
861 if (ir->r != RID_SINK) {
862 Reg src = ra_alloc1(as, ir->op2, RSET_GPR);
863 IRIns *irf = IR(ir->op1);
864 Reg idx = ra_alloc1(as, irf->op1, rset_exclude(RSET_GPR, src));
865 int32_t ofs = field_ofs[irf->op2];
866 emit_lso(as, asm_fxstoreins(ir), (src & 31), idx, ofs);
867 }
868}
869
870static void asm_xload(ASMState *as, IRIns *ir)
871{
872 Reg dest = ra_dest(as, ir, irt_isfp(ir->t) ? RSET_FPR : RSET_GPR);
873 lua_assert(!(ir->op2 & IRXLOAD_UNALIGNED));
874 asm_fusexref(as, asm_fxloadins(ir), dest, ir->op1, RSET_GPR);
875}
876
877static void asm_xstore(ASMState *as, IRIns *ir)
878{
879 if (ir->r != RID_SINK) {
880 Reg src = ra_alloc1(as, ir->op2, irt_isfp(ir->t) ? RSET_FPR : RSET_GPR);
881 asm_fusexref(as, asm_fxstoreins(ir), src, ir->op1,
882 rset_exclude(RSET_GPR, src));
883 }
884}
885
886static void asm_ahuvload(ASMState *as, IRIns *ir)
887{
888 Reg idx, tmp, type;
889 int32_t ofs = 0;
890 RegSet gpr = RSET_GPR, allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR;
891 lua_assert(irt_isnum(ir->t) || irt_ispri(ir->t) || irt_isaddr(ir->t) ||
892 irt_isint(ir->t));
893 if (ra_used(ir)) {
894 Reg dest = ra_dest(as, ir, allow);
895 tmp = irt_isnum(ir->t) ? ra_scratch(as, rset_clear(gpr, dest)) : dest;
896 if (irt_isaddr(ir->t)) {
897 emit_dn(as, A64I_ANDx^emit_isk13(LJ_GCVMASK, 1), dest, dest);
898 } else if (irt_isnum(ir->t)) {
899 emit_dn(as, A64I_FMOV_D_R, (dest & 31), tmp);
900 } else if (irt_isint(ir->t)) {
901 emit_dm(as, A64I_MOVw, dest, dest);
902 }
903 } else {
904 tmp = ra_scratch(as, gpr);
905 }
906 type = ra_scratch(as, rset_clear(gpr, tmp));
907 idx = asm_fuseahuref(as, ir->op1, &ofs, rset_clear(gpr, type), A64I_LDRx);
908 /* Always do the type check, even if the load result is unused. */
909 asm_guardcc(as, irt_isnum(ir->t) ? CC_LS : CC_NE);
910 if (irt_type(ir->t) >= IRT_NUM) {
911 lua_assert(irt_isinteger(ir->t) || irt_isnum(ir->t));
912 emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32),
913 ra_allock(as, LJ_TISNUM << 15, rset_exclude(gpr, idx)), tmp);
914 } else if (irt_isaddr(ir->t)) {
915 emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(-irt_toitype(ir->t)), type);
916 emit_dn(as, A64I_ASRx | A64F_IMMR(47), type, tmp);
917 } else if (irt_isnil(ir->t)) {
918 emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(1), tmp);
919 } else {
920 emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32),
921 ra_allock(as, (irt_toitype(ir->t) << 15) | 0x7fff, allow), tmp);
922 }
923 if (ofs & FUSE_REG)
924 emit_dnm(as, (A64I_LDRx^A64I_LS_R)|A64I_LS_UXTWx, tmp, idx, (ofs & 31));
925 else
926 emit_lso(as, A64I_LDRx, tmp, idx, ofs);
927}
928
929static void asm_ahustore(ASMState *as, IRIns *ir)
930{
931 if (ir->r != RID_SINK) {
932 RegSet allow = RSET_GPR;
933 Reg idx, src = RID_NONE, tmp = RID_TMP, type = RID_NONE;
934 int32_t ofs = 0;
935 if (irt_isnum(ir->t)) {
936 src = ra_alloc1(as, ir->op2, RSET_FPR);
937 idx = asm_fuseahuref(as, ir->op1, &ofs, allow, A64I_STRd);
938 if (ofs & FUSE_REG)
939 emit_dnm(as, (A64I_STRd^A64I_LS_R)|A64I_LS_UXTWx, (src & 31), idx, (ofs & 31));
940 else
941 emit_lso(as, A64I_STRd, (src & 31), idx, ofs);
942 } else {
943 if (!irt_ispri(ir->t)) {
944 src = ra_alloc1(as, ir->op2, allow);
945 rset_clear(allow, src);
946 if (irt_isinteger(ir->t))
947 type = ra_allock(as, (int64_t)LJ_TISNUM << 47, allow);
948 else
949 type = ra_allock(as, irt_toitype(ir->t), allow);
950 } else {
951 tmp = type = ra_allock(as, ~((int64_t)~irt_toitype(ir->t) << 47), allow);
952 }
953 idx = asm_fuseahuref(as, ir->op1, &ofs, rset_exclude(allow, type),
954 A64I_STRx);
955 if (ofs & FUSE_REG)
956 emit_dnm(as, (A64I_STRx^A64I_LS_R)|A64I_LS_UXTWx, tmp, idx, (ofs & 31));
957 else
958 emit_lso(as, A64I_STRx, tmp, idx, ofs);
959 if (ra_hasreg(src)) {
960 if (irt_isinteger(ir->t)) {
961 emit_dnm(as, A64I_ADDx | A64F_EX(A64EX_UXTW), tmp, type, src);
962 } else {
963 emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 47), tmp, src, type);
964 }
965 }
966 }
967 }
968}
969
970static void asm_sload(ASMState *as, IRIns *ir)
971{
972 int32_t ofs = 8*((int32_t)ir->op1-2);
973 IRType1 t = ir->t;
974 Reg dest = RID_NONE, base;
975 RegSet allow = RSET_GPR;
976 lua_assert(!(ir->op2 & IRSLOAD_PARENT)); /* Handled by asm_head_side(). */
977 lua_assert(irt_isguard(t) || !(ir->op2 & IRSLOAD_TYPECHECK));
978 if ((ir->op2 & IRSLOAD_CONVERT) && irt_isguard(t) && irt_isint(t)) {
979 dest = ra_scratch(as, RSET_FPR);
980 asm_tointg(as, ir, dest);
981 t.irt = IRT_NUM; /* Continue with a regular number type check. */
982 } else if (ra_used(ir)) {
983 Reg tmp = RID_NONE;
984 if ((ir->op2 & IRSLOAD_CONVERT))
985 tmp = ra_scratch(as, irt_isint(t) ? RSET_FPR : RSET_GPR);
986 lua_assert(irt_isnum(t) || irt_isint(t) || irt_isaddr(t));
987 dest = ra_dest(as, ir, irt_isnum(t) ? RSET_FPR : allow);
988 base = ra_alloc1(as, REF_BASE, rset_clear(allow, dest));
989 if (irt_isaddr(t)) {
990 emit_dn(as, A64I_ANDx^emit_isk13(LJ_GCVMASK, 1), dest, dest);
991 } else if ((ir->op2 & IRSLOAD_CONVERT)) {
992 if (irt_isint(t)) {
993 emit_dn(as, A64I_FCVT_S32_F64, dest, (tmp & 31));
994 /* If value is already loaded for type check, move it to FPR. */
995 if ((ir->op2 & IRSLOAD_TYPECHECK))
996 emit_dn(as, A64I_FMOV_D_R, (tmp & 31), dest);
997 else
998 dest = tmp;
999 t.irt = IRT_NUM; /* Check for original type. */
1000 } else {
1001 emit_dn(as, A64I_FCVT_F64_S32, (dest & 31), tmp);
1002 dest = tmp;
1003 t.irt = IRT_INT; /* Check for original type. */
1004 }
1005 } else if (irt_isint(t) && (ir->op2 & IRSLOAD_TYPECHECK)) {
1006 emit_dm(as, A64I_MOVw, dest, dest);
1007 }
1008 goto dotypecheck;
1009 }
1010 base = ra_alloc1(as, REF_BASE, allow);
1011dotypecheck:
1012 rset_clear(allow, base);
1013 if ((ir->op2 & IRSLOAD_TYPECHECK)) {
1014 Reg tmp;
1015 if (ra_hasreg(dest) && rset_test(RSET_GPR, dest)) {
1016 tmp = dest;
1017 } else {
1018 tmp = ra_scratch(as, allow);
1019 rset_clear(allow, tmp);
1020 }
1021 if (irt_isnum(t) && !(ir->op2 & IRSLOAD_CONVERT))
1022 emit_dn(as, A64I_FMOV_D_R, (dest & 31), tmp);
1023 /* Need type check, even if the load result is unused. */
1024 asm_guardcc(as, irt_isnum(t) ? CC_LS : CC_NE);
1025 if (irt_type(t) >= IRT_NUM) {
1026 lua_assert(irt_isinteger(t) || irt_isnum(t));
1027 emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32),
1028 ra_allock(as, LJ_TISNUM << 15, allow), tmp);
1029 } else if (irt_isnil(t)) {
1030 emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(1), tmp);
1031 } else if (irt_ispri(t)) {
1032 emit_nm(as, A64I_CMPx,
1033 ra_allock(as, ~((int64_t)~irt_toitype(t) << 47) , allow), tmp);
1034 } else {
1035 Reg type = ra_scratch(as, allow);
1036 emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(-irt_toitype(t)), type);
1037 emit_dn(as, A64I_ASRx | A64F_IMMR(47), type, tmp);
1038 }
1039 emit_lso(as, A64I_LDRx, tmp, base, ofs);
1040 return;
1041 }
1042 if (ra_hasreg(dest)) {
1043 emit_lso(as, irt_isnum(t) ? A64I_LDRd :
1044 (irt_isint(t) ? A64I_LDRw : A64I_LDRx), (dest & 31), base, ofs);
1045 }
1046}
1047
1048/* -- Allocations --------------------------------------------------------- */
1049
1050#if LJ_HASFFI
1051static void asm_cnew(ASMState *as, IRIns *ir)
1052{
1053 CTState *cts = ctype_ctsG(J2G(as->J));
1054 CTypeID id = (CTypeID)IR(ir->op1)->i;
1055 CTSize sz;
1056 CTInfo info = lj_ctype_info(cts, id, &sz);
1057 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_mem_newgco];
1058 IRRef args[4];
1059 RegSet allow = (RSET_GPR & ~RSET_SCRATCH);
1060 lua_assert(sz != CTSIZE_INVALID || (ir->o == IR_CNEW && ir->op2 != REF_NIL));
1061
1062 as->gcsteps++;
1063 asm_setupresult(as, ir, ci); /* GCcdata * */
1064 /* Initialize immutable cdata object. */
1065 if (ir->o == IR_CNEWI) {
1066 int32_t ofs = sizeof(GCcdata);
1067 Reg r = ra_alloc1(as, ir->op2, allow);
1068 lua_assert(sz == 4 || sz == 8);
1069 emit_lso(as, sz == 8 ? A64I_STRx : A64I_STRw, r, RID_RET, ofs);
1070 } else if (ir->op2 != REF_NIL) { /* Create VLA/VLS/aligned cdata. */
1071 ci = &lj_ir_callinfo[IRCALL_lj_cdata_newv];
1072 args[0] = ASMREF_L; /* lua_State *L */
1073 args[1] = ir->op1; /* CTypeID id */
1074 args[2] = ir->op2; /* CTSize sz */
1075 args[3] = ASMREF_TMP1; /* CTSize align */
1076 asm_gencall(as, ci, args);
1077 emit_loadi(as, ra_releasetmp(as, ASMREF_TMP1), (int32_t)ctype_align(info));
1078 return;
1079 }
1080
1081 /* Initialize gct and ctypeid. lj_mem_newgco() already sets marked. */
1082 {
1083 Reg r = (id < 65536) ? RID_X1 : ra_allock(as, id, allow);
1084 emit_lso(as, A64I_STRB, RID_TMP, RID_RET, offsetof(GCcdata, gct));
1085 emit_lso(as, A64I_STRH, r, RID_RET, offsetof(GCcdata, ctypeid));
1086 emit_d(as, A64I_MOVZw | A64F_U16(~LJ_TCDATA), RID_TMP);
1087 if (id < 65536) emit_d(as, A64I_MOVZw | A64F_U16(id), RID_X1);
1088 }
1089 args[0] = ASMREF_L; /* lua_State *L */
1090 args[1] = ASMREF_TMP1; /* MSize size */
1091 asm_gencall(as, ci, args);
1092 ra_allockreg(as, (int32_t)(sz+sizeof(GCcdata)),
1093 ra_releasetmp(as, ASMREF_TMP1));
1094}
1095#else
1096#define asm_cnew(as, ir) ((void)0)
1097#endif
1098
1099/* -- Write barriers ------------------------------------------------------ */
1100
1101static void asm_tbar(ASMState *as, IRIns *ir)
1102{
1103 Reg tab = ra_alloc1(as, ir->op1, RSET_GPR);
1104 Reg link = ra_scratch(as, rset_exclude(RSET_GPR, tab));
1105 Reg gr = ra_allock(as, i64ptr(J2G(as->J)),
1106 rset_exclude(rset_exclude(RSET_GPR, tab), link));
1107 Reg mark = RID_TMP;
1108 MCLabel l_end = emit_label(as);
1109 emit_lso(as, A64I_STRx, link, tab, (int32_t)offsetof(GCtab, gclist));
1110 emit_lso(as, A64I_STRB, mark, tab, (int32_t)offsetof(GCtab, marked));
1111 emit_lso(as, A64I_STRx, tab, gr,
1112 (int32_t)offsetof(global_State, gc.grayagain));
1113 emit_dn(as, A64I_ANDw^emit_isk13(~LJ_GC_BLACK, 0), mark, mark);
1114 emit_lso(as, A64I_LDRx, link, gr,
1115 (int32_t)offsetof(global_State, gc.grayagain));
1116 emit_cond_branch(as, CC_EQ, l_end);
1117 emit_n(as, A64I_TSTw^emit_isk13(LJ_GC_BLACK, 0), mark);
1118 emit_lso(as, A64I_LDRB, mark, tab, (int32_t)offsetof(GCtab, marked));
1119}
1120
1121static void asm_obar(ASMState *as, IRIns *ir)
1122{
1123 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_barrieruv];
1124 IRRef args[2];
1125 MCLabel l_end;
1126 RegSet allow = RSET_GPR;
1127 Reg obj, val, tmp;
1128 /* No need for other object barriers (yet). */
1129 lua_assert(IR(ir->op1)->o == IR_UREFC);
1130 ra_evictset(as, RSET_SCRATCH);
1131 l_end = emit_label(as);
1132 args[0] = ASMREF_TMP1; /* global_State *g */
1133 args[1] = ir->op1; /* TValue *tv */
1134 asm_gencall(as, ci, args);
1135 ra_allockreg(as, i64ptr(J2G(as->J)), ra_releasetmp(as, ASMREF_TMP1));
1136 obj = IR(ir->op1)->r;
1137 tmp = ra_scratch(as, rset_exclude(allow, obj));
1138 emit_cond_branch(as, CC_EQ, l_end);
1139 emit_n(as, A64I_TSTw^emit_isk13(LJ_GC_BLACK, 0), tmp);
1140 emit_cond_branch(as, CC_EQ, l_end);
1141 emit_n(as, A64I_TSTw^emit_isk13(LJ_GC_WHITES, 0), RID_TMP);
1142 val = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, obj));
1143 emit_lso(as, A64I_LDRB, tmp, obj,
1144 (int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv));
1145 emit_lso(as, A64I_LDRB, RID_TMP, val, (int32_t)offsetof(GChead, marked));
1146}
1147
1148/* -- Arithmetic and logic operations ------------------------------------- */
1149
1150static void asm_fparith(ASMState *as, IRIns *ir, A64Ins ai)
1151{
1152 Reg dest = ra_dest(as, ir, RSET_FPR);
1153 Reg right, left = ra_alloc2(as, ir, RSET_FPR);
1154 right = (left >> 8); left &= 255;
1155 emit_dnm(as, ai, (dest & 31), (left & 31), (right & 31));
1156}
1157
1158static void asm_fpunary(ASMState *as, IRIns *ir, A64Ins ai)
1159{
1160 Reg dest = ra_dest(as, ir, RSET_FPR);
1161 Reg left = ra_hintalloc(as, ir->op1, dest, RSET_FPR);
1162 emit_dn(as, ai, (dest & 31), (left & 31));
1163}
1164
1165static void asm_fpmath(ASMState *as, IRIns *ir)
1166{
1167 IRFPMathOp fpm = (IRFPMathOp)ir->op2;
1168 if (fpm == IRFPM_SQRT) {
1169 asm_fpunary(as, ir, A64I_FSQRTd);
1170 } else if (fpm <= IRFPM_TRUNC) {
1171 asm_fpunary(as, ir, fpm == IRFPM_FLOOR ? A64I_FRINTMd :
1172 fpm == IRFPM_CEIL ? A64I_FRINTPd : A64I_FRINTZd);
1173 } else if (fpm == IRFPM_EXP2 && asm_fpjoin_pow(as, ir)) {
1174 return;
1175 } else {
1176 asm_callid(as, ir, IRCALL_lj_vm_floor + fpm);
1177 }
1178}
1179
1180static int asm_swapops(ASMState *as, IRRef lref, IRRef rref)
1181{
1182 IRIns *ir;
1183 if (irref_isk(rref))
1184 return 0; /* Don't swap constants to the left. */
1185 if (irref_isk(lref))
1186 return 1; /* But swap constants to the right. */
1187 ir = IR(rref);
1188 if ((ir->o >= IR_BSHL && ir->o <= IR_BSAR) ||
1189 (ir->o == IR_ADD && ir->op1 == ir->op2) ||
1190 (ir->o == IR_CONV && ir->op2 == ((IRT_I64<<IRCONV_DSH)|IRT_INT|IRCONV_SEXT)))
1191 return 0; /* Don't swap fusable operands to the left. */
1192 ir = IR(lref);
1193 if ((ir->o >= IR_BSHL && ir->o <= IR_BSAR) ||
1194 (ir->o == IR_ADD && ir->op1 == ir->op2) ||
1195 (ir->o == IR_CONV && ir->op2 == ((IRT_I64<<IRCONV_DSH)|IRT_INT|IRCONV_SEXT)))
1196 return 1; /* But swap fusable operands to the right. */
1197 return 0; /* Otherwise don't swap. */
1198}
1199
1200static void asm_intop(ASMState *as, IRIns *ir, A64Ins ai)
1201{
1202 IRRef lref = ir->op1, rref = ir->op2;
1203 Reg left, dest = ra_dest(as, ir, RSET_GPR);
1204 uint32_t m;
1205 if ((ai & ~A64I_S) != A64I_SUBw && asm_swapops(as, lref, rref)) {
1206 IRRef tmp = lref; lref = rref; rref = tmp;
1207 }
1208 left = ra_hintalloc(as, lref, dest, RSET_GPR);
1209 if (irt_is64(ir->t)) ai |= A64I_X;
1210 m = asm_fuseopm(as, ai, rref, rset_exclude(RSET_GPR, left));
1211 if (irt_isguard(ir->t)) { /* For IR_ADDOV etc. */
1212 asm_guardcc(as, CC_VS);
1213 ai |= A64I_S;
1214 }
1215 emit_dn(as, ai^m, dest, left);
1216}
1217
1218static void asm_intop_s(ASMState *as, IRIns *ir, A64Ins ai)
1219{
1220 if (as->flagmcp == as->mcp) { /* Drop cmp r, #0. */
1221 as->flagmcp = NULL;
1222 as->mcp++;
1223 ai |= A64I_S;
1224 }
1225 asm_intop(as, ir, ai);
1226}
1227
1228static void asm_intneg(ASMState *as, IRIns *ir)
1229{
1230 Reg dest = ra_dest(as, ir, RSET_GPR);
1231 Reg left = ra_hintalloc(as, ir->op1, dest, RSET_GPR);
1232 emit_dm(as, irt_is64(ir->t) ? A64I_NEGx : A64I_NEGw, dest, left);
1233}
1234
1235/* NYI: use add/shift for MUL(OV) with constants. FOLD only does 2^k. */
1236static void asm_intmul(ASMState *as, IRIns *ir)
1237{
1238 Reg dest = ra_dest(as, ir, RSET_GPR);
1239 Reg left = ra_alloc1(as, ir->op1, rset_exclude(RSET_GPR, dest));
1240 Reg right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left));
1241 if (irt_isguard(ir->t)) { /* IR_MULOV */
1242 asm_guardcc(as, CC_NE);
1243 emit_dm(as, A64I_MOVw, dest, dest); /* Zero-extend. */
1244 emit_nm(as, A64I_CMPw | A64F_SH(A64SH_ASR, 31), RID_TMP, dest);
1245 emit_dn(as, A64I_ASRx | A64F_IMMR(32), RID_TMP, dest);
1246 emit_dnm(as, A64I_SMULL, dest, right, left);
1247 } else {
1248 emit_dnm(as, irt_is64(ir->t) ? A64I_MULx : A64I_MULw, dest, left, right);
1249 }
1250}
1251
1252static void asm_add(ASMState *as, IRIns *ir)
1253{
1254 if (irt_isnum(ir->t)) {
1255 asm_fparith(as, ir, A64I_FADDd);
1256 return;
1257 }
1258 asm_intop_s(as, ir, A64I_ADDw);
1259}
1260
1261static void asm_sub(ASMState *as, IRIns *ir)
1262{
1263 if (irt_isnum(ir->t)) {
1264 asm_fparith(as, ir, A64I_FSUBd);
1265 return;
1266 }
1267 asm_intop_s(as, ir, A64I_SUBw);
1268}
1269
1270static void asm_mul(ASMState *as, IRIns *ir)
1271{
1272 if (irt_isnum(ir->t)) {
1273 asm_fparith(as, ir, A64I_FMULd);
1274 return;
1275 }
1276 asm_intmul(as, ir);
1277}
1278
1279static void asm_div(ASMState *as, IRIns *ir)
1280{
1281#if LJ_HASFFI
1282 if (!irt_isnum(ir->t))
1283 asm_callid(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_divi64 :
1284 IRCALL_lj_carith_divu64);
1285 else
1286#endif
1287 asm_fparith(as, ir, A64I_FDIVd);
1288}
1289
1290static void asm_pow(ASMState *as, IRIns *ir)
1291{
1292#if LJ_HASFFI
1293 if (!irt_isnum(ir->t))
1294 asm_callid(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 :
1295 IRCALL_lj_carith_powu64);
1296 else
1297#endif
1298 asm_callid(as, ir, IRCALL_lj_vm_powi);
1299}
1300
1301#define asm_addov(as, ir) asm_add(as, ir)
1302#define asm_subov(as, ir) asm_sub(as, ir)
1303#define asm_mulov(as, ir) asm_mul(as, ir)
1304
1305#define asm_abs(as, ir) asm_fpunary(as, ir, A64I_FABS)
1306#define asm_atan2(as, ir) asm_callid(as, ir, IRCALL_atan2)
1307#define asm_ldexp(as, ir) asm_callid(as, ir, IRCALL_ldexp)
1308
1309static void asm_mod(ASMState *as, IRIns *ir)
1310{
1311#if LJ_HASFFI
1312 if (!irt_isint(ir->t))
1313 asm_callid(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_modi64 :
1314 IRCALL_lj_carith_modu64);
1315 else
1316#endif
1317 asm_callid(as, ir, IRCALL_lj_vm_modi);
1318}
1319
1320static void asm_neg(ASMState *as, IRIns *ir)
1321{
1322 if (irt_isnum(ir->t)) {
1323 asm_fpunary(as, ir, A64I_FNEGd);
1324 return;
1325 }
1326 asm_intneg(as, ir);
1327}
1328
1329static void asm_bitop(ASMState *as, IRIns *ir, A64Ins ai)
1330{
1331 if (as->flagmcp == as->mcp && ai == A64I_ANDw) {
1332 /* Try to drop cmp r, #0. */
1333 as->flagmcp = NULL;
1334 as->mcp++;
1335 ai += A64I_ANDSw - A64I_ANDw;
1336 }
1337 if (ir->op2 == 0) {
1338 Reg dest = ra_dest(as, ir, RSET_GPR);
1339 uint32_t m = asm_fuseopm(as, ai, ir->op1, RSET_GPR);
1340 if (irt_is64(ir->t)) ai |= A64I_X;
1341 emit_d(as, ai^m, dest);
1342 } else {
1343 asm_intop(as, ir, ai);
1344 }
1345}
1346
1347#define asm_bnot(as, ir) asm_bitop(as, ir, A64I_MVNw)
1348#define asm_band(as, ir) asm_bitop(as, ir, A64I_ANDw)
1349#define asm_bor(as, ir) asm_bitop(as, ir, A64I_ORRw)
1350#define asm_bxor(as, ir) asm_bitop(as, ir, A64I_EORw)
1351
1352static void asm_bswap(ASMState *as, IRIns *ir)
1353{
1354 Reg dest = ra_dest(as, ir, RSET_GPR);
1355 Reg left = ra_alloc1(as, ir->op1, RSET_GPR);
1356 emit_dn(as, irt_is64(ir->t) ? A64I_REVx : A64I_REVw, dest, left);
1357}
1358
1359static void asm_bitshift(ASMState *as, IRIns *ir, A64Ins ai, A64Shift sh)
1360{
1361 int shmask = irt_is64(ir->t) ? 63 : 31;
1362 if (irref_isk(ir->op2)) { /* Constant shifts. */
1363 Reg dest = ra_dest(as, ir, RSET_GPR);
1364 Reg left = ra_alloc1(as, ir->op1, RSET_GPR);
1365 int32_t shift = (IR(ir->op2)->i & shmask);
1366
1367 if (shmask == 63) ai += A64I_UBFMx - A64I_UBFMw;
1368 switch (sh) {
1369 case A64SH_LSL:
1370 emit_dn(as, ai | A64F_IMMS(shmask-shift) | A64F_IMMR((shmask-shift+1)&shmask), dest, left);
1371 break;
1372 case A64SH_LSR: case A64SH_ASR:
1373 emit_dn(as, ai | A64F_IMMS(shmask) | A64F_IMMR(shift), dest, left);
1374 break;
1375 case A64SH_ROR:
1376 emit_dnm(as, ai | A64F_IMMS(shift), dest, left, left);
1377 break;
1378 }
1379 } else { /* Variable-length shifts. */
1380 Reg dest = ra_dest(as, ir, RSET_GPR);
1381 Reg left = ra_alloc1(as, ir->op1, RSET_GPR);
1382 Reg right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left));
1383 emit_dnm(as, (shmask == 63 ? A64I_SHRx : A64I_SHRw) | A64F_BSH(sh), dest, left, right);
1384 }
1385}
1386
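/* Editor's note: the constant-shift encodings above are the standard A64
** aliases (64 bit case shown):
**
**   lsr rd, rn, #n  ==  ubfm rd, rn, #n, #63
**   asr rd, rn, #n  ==  sbfm rd, rn, #n, #63
**   lsl rd, rn, #n  ==  ubfm rd, rn, #((64-n)&63), #(63-n)
**   ror rd, rn, #n  ==  extr rd, rn, rn, #n
*/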
1387#define asm_bshl(as, ir) asm_bitshift(as, ir, A64I_UBFMw, A64SH_LSL)
1388#define asm_bshr(as, ir) asm_bitshift(as, ir, A64I_UBFMw, A64SH_LSR)
1389#define asm_bsar(as, ir) asm_bitshift(as, ir, A64I_SBFMw, A64SH_ASR)
1390#define asm_bror(as, ir) asm_bitshift(as, ir, A64I_EXTRw, A64SH_ROR)
1391#define asm_brol(as, ir) lua_assert(0)
1392
1393static void asm_intmin_max(ASMState *as, IRIns *ir, A64CC cc)
1394{
1395 Reg dest = ra_dest(as, ir, RSET_GPR);
1396 Reg left = ra_hintalloc(as, ir->op1, dest, RSET_GPR);
1397 Reg right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left));
1398 emit_dnm(as, A64I_CSELw|A64F_CC(cc), dest, left, right);
1399 emit_nm(as, A64I_CMPw, left, right);
1400}
1401
1402static void asm_fpmin_max(ASMState *as, IRIns *ir, A64CC fcc)
1403{
1404 Reg dest = (ra_dest(as, ir, RSET_FPR) & 31);
1405 Reg right, left = ra_alloc2(as, ir, RSET_FPR);
1406 right = ((left >> 8) & 31); left &= 31;
1407 emit_dnm(as, A64I_FCSELd | A64F_CC(fcc), dest, left, right);
1408 emit_nm(as, A64I_FCMPd, left, right);
1409}
1410
1411static void asm_min_max(ASMState *as, IRIns *ir, A64CC cc, A64CC fcc)
1412{
1413 if (irt_isnum(ir->t))
1414 asm_fpmin_max(as, ir, fcc);
1415 else
1416 asm_intmin_max(as, ir, cc);
1417}
1418
1419#define asm_max(as, ir) asm_min_max(as, ir, CC_GT, CC_HI)
1420#define asm_min(as, ir) asm_min_max(as, ir, CC_LT, CC_LO)
1421
1422/* -- Comparisons --------------------------------------------------------- */
1423
1424/* Map of comparisons to flags. ORDER IR. */
1425static const uint8_t asm_compmap[IR_ABC+1] = {
1426 /* op FP swp int cc FP cc */
1427 /* LT */ CC_GE + (CC_HS << 4),
1428 /* GE x */ CC_LT + (CC_HI << 4),
1429 /* LE */ CC_GT + (CC_HI << 4),
1430 /* GT x */ CC_LE + (CC_HS << 4),
1431 /* ULT x */ CC_HS + (CC_LS << 4),
1432 /* UGE */ CC_LO + (CC_LO << 4),
1433 /* ULE x */ CC_HI + (CC_LO << 4),
1434 /* UGT */ CC_LS + (CC_LS << 4),
1435 /* EQ */ CC_NE + (CC_NE << 4),
1436 /* NE */ CC_EQ + (CC_EQ << 4),
1437 /* ABC */ CC_LS + (CC_LS << 4) /* Same as UGT. */
1438};
1439
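/* Editor's note: each entry packs two A64 condition codes. The low nibble
** is the *inverted* integer condition (guards branch out when the
** comparison fails), the high nibble is the FP exit condition, which also
** sends unordered (NaN) operands to the exit, e.g. CC_HS for IR_LT.
*/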
1440/* FP comparisons. */
1441static void asm_fpcomp(ASMState *as, IRIns *ir)
1442{
1443 Reg left, right;
1444 A64Ins ai;
1445 int swp = ((ir->o ^ (ir->o >> 2)) & ~(ir->o >> 3) & 1);
1446 if (!swp && irref_isk(ir->op2) && ir_knum(IR(ir->op2))->u64 == 0) {
1447 left = (ra_alloc1(as, ir->op1, RSET_FPR) & 31);
1448 right = 0;
1449 ai = A64I_FCMPZd;
1450 } else {
1451 left = ra_alloc2(as, ir, RSET_FPR);
1452 if (swp) {
1453 right = (left & 31); left = ((left >> 8) & 31);
1454 } else {
1455 right = ((left >> 8) & 31); left &= 31;
1456 }
1457 ai = A64I_FCMPd;
1458 }
1459 asm_guardcc(as, (asm_compmap[ir->o] >> 4));
1460 emit_nm(as, ai, left, right);
1461}
1462
1463/* Integer comparisons. */
1464static void asm_intcomp(ASMState *as, IRIns *ir)
1465{
1466 A64CC oldcc, cc = (asm_compmap[ir->o] & 15);
1467 A64Ins ai = irt_is64(ir->t) ? A64I_CMPx : A64I_CMPw;
1468 IRRef lref = ir->op1, rref = ir->op2;
1469 Reg left;
1470 uint32_t m;
1471 int cmpprev0 = 0;
1472 lua_assert(irt_is64(ir->t) || irt_isint(ir->t) ||
1473 irt_isu32(ir->t) || irt_isaddr(ir->t) || irt_isu8(ir->t));
1474 if (asm_swapops(as, lref, rref)) {
1475 IRRef tmp = lref; lref = rref; rref = tmp;
1476 if (cc >= CC_GE) cc ^= 7; /* LT <-> GT, LE <-> GE */
1477 else if (cc > CC_NE) cc ^= 11; /* LO <-> HI, LS <-> HS */
1478 }
1479 oldcc = cc;
1480 if (irref_isk(rref) && IR(rref)->i == 0) {
1481 IRIns *irl = IR(lref);
1482 if (cc == CC_GE) cc = CC_PL;
1483 else if (cc == CC_LT) cc = CC_MI;
1484 else if (cc > CC_NE) goto notst; /* Other conds don't work with tst. */
1485 cmpprev0 = (irl+1 == ir);
1486 /* Combine comp(BAND(left, right), 0) into tst left, right. */
1487 if (cmpprev0 && irl->o == IR_BAND && !ra_used(irl)) {
1488 IRRef blref = irl->op1, brref = irl->op2;
1489 uint32_t m2 = 0;
1490 Reg bleft;
1491 if (asm_swapops(as, blref, brref)) {
1492 IRRef tmp = blref; blref = brref; brref = tmp;
1493 }
1494 if (irref_isk(brref)) {
1495 /* NYI: use tbz/tbnz, if applicable. */
1496 m2 = emit_isk13(IR(brref)->i, irt_is64(irl->t));
1497 if (!m2)
1498 goto notst; /* Not beneficial if we miss a constant operand. */
1499 }
1500 bleft = ra_alloc1(as, blref, RSET_GPR);
1501 ai = (irt_is64(irl->t) ? A64I_TSTx : A64I_TSTw);
1502 if (!m2)
1503 m2 = asm_fuseopm(as, ai, brref, rset_exclude(RSET_GPR, bleft));
1504 asm_guardcc(as, cc);
1505 emit_n(as, ai^m2, bleft);
1506 return;
1507 }
1508 /* NYI: use cbz/cbnz for EQ/NE 0. */
1509 }
1510notst:
1511 left = ra_alloc1(as, lref, RSET_GPR);
1512 m = asm_fuseopm(as, ai, rref, rset_exclude(RSET_GPR, left));
1513 asm_guardcc(as, cc);
1514 emit_n(as, ai^m, left);
1515 /* Signed comparison with zero and referencing previous ins? */
1516 if (cmpprev0 && (oldcc <= CC_NE || oldcc >= CC_GE))
1517 as->flagmcp = as->mcp; /* Allow elimination of the compare. */
1518}
1519
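/* Editor's note: the cc ^= 7 and cc ^= 11 tricks above exploit the A64
** condition numbering (HS=2 LO=3 HI=8 LS=9 GE=10 LT=11 GT=12 LE=13):
** xor 7 swaps GE<->LE and LT<->GT, xor 11 swaps HS<->LS and LO<->HI,
** exactly the remapping required when the comparison operands are
** exchanged.
*/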
1520static void asm_comp(ASMState *as, IRIns *ir)
1521{
1522 if (irt_isnum(ir->t))
1523 asm_fpcomp(as, ir);
1524 else
1525 asm_intcomp(as, ir);
1526}
1527
1528#define asm_equal(as, ir) asm_comp(as, ir)
1529
1530/* -- Support for 64 bit ops in 32 bit mode ------------------------------- */
1531
1532/* Hiword op of a split 64 bit op. Previous op must be the loword op. */
1533static void asm_hiop(ASMState *as, IRIns *ir)
1534{
1535 UNUSED(as); UNUSED(ir); lua_assert(0); /* Unused on 64 bit. */
1536}
1537
1538/* -- Profiling ----------------------------------------------------------- */
1539
1540static void asm_prof(ASMState *as, IRIns *ir)
1541{
1542 uint32_t k = emit_isk13(HOOK_PROFILE, 0);
1543 lua_assert(k != 0);
1544 UNUSED(ir);
1545 asm_guardcc(as, CC_NE);
1546 emit_n(as, A64I_TSTw^k, RID_TMP);
1547 emit_lsptr(as, A64I_LDRB, RID_TMP, (void *)&J2G(as->J)->hookmask);
1548}
1549
1550/* -- Stack handling ------------------------------------------------------ */
1551
1552/* Check Lua stack size for overflow. Use exit handler as fallback. */
1553static void asm_stack_check(ASMState *as, BCReg topslot,
1554 IRIns *irp, RegSet allow, ExitNo exitno)
1555{
1556 Reg pbase;
1557 uint32_t k;
1558 if (irp) {
1559 if (!ra_hasspill(irp->s)) {
1560 pbase = irp->r;
1561 lua_assert(ra_hasreg(pbase));
1562 } else if (allow) {
1563 pbase = rset_pickbot(allow);
1564 } else {
1565 pbase = RID_RET;
1566 emit_lso(as, A64I_LDRx, RID_RET, RID_SP, 0); /* Restore temp register. */
1567 }
1568 } else {
1569 pbase = RID_BASE;
1570 }
1571 emit_branch(as, A64I_BL, exitstub_addr(as->J, exitno));
1572 emit_cond_branch(as, CC_LS^1, as->mcp+1);
1573 k = emit_isk12((8*topslot));
1574 lua_assert(k);
1575 emit_n(as, A64I_CMPx^k, RID_TMP);
1576 emit_dnm(as, A64I_SUBx, RID_TMP, RID_TMP, pbase);
1577 emit_lso(as, A64I_LDRx, RID_TMP, RID_TMP,
1578 (int32_t)offsetof(lua_State, maxstack));
1579 if (irp) { /* Must not spill arbitrary registers in head of side trace. */
1580 if (ra_hasspill(irp->s))
1581 emit_lso(as, A64I_LDRx, pbase, RID_SP, sps_scale(irp->s));
1582 emit_lso(as, A64I_LDRx, RID_TMP, RID_GL, glofs(as, &J2G(as->J)->cur_L));
1583 if (ra_hasspill(irp->s) && !allow)
1584 emit_lso(as, A64I_STRx, RID_RET, RID_SP, 0); /* Save temp register. */
1585 } else {
1586 emit_getgl(as, RID_TMP, cur_L);
1587 }
1588}
1589
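/* Editor's note: for the common root-trace case (irp == NULL) the
** emitted code reads forwards roughly as:
**
**   ldr  tmp, [gl, #cur_L]            ; current lua_State
**   ldr  tmp, [tmp, #maxstack]
**   sub  tmp, tmp, base               ; free stack space in bytes
**   cmp  tmp, #(8*topslot)
**   b.hi 1f                           ; enough room: skip the exit
**   bl   exitstub
** 1:
*/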
1590/* Restore Lua stack from on-trace state. */
1591static void asm_stack_restore(ASMState *as, SnapShot *snap)
1592{
1593 SnapEntry *map = &as->T->snapmap[snap->mapofs];
1594#ifdef LUA_USE_ASSERT
1595 SnapEntry *flinks = &as->T->snapmap[snap_nextofs(as->T, snap)-1-LJ_FR2];
1596#endif
1597 MSize n, nent = snap->nent;
1598 /* Store the value of all modified slots to the Lua stack. */
1599 for (n = 0; n < nent; n++) {
1600 SnapEntry sn = map[n];
1601 BCReg s = snap_slot(sn);
1602 int32_t ofs = 8*((int32_t)s-1-LJ_FR2);
1603 IRRef ref = snap_ref(sn);
1604 IRIns *ir = IR(ref);
1605 if ((sn & SNAP_NORESTORE))
1606 continue;
1607 if (irt_isnum(ir->t)) {
1608 Reg src = ra_alloc1(as, ref, RSET_FPR);
1609 emit_lso(as, A64I_STRd, (src & 31), RID_BASE, ofs);
1610 } else {
1611 RegSet allow = rset_exclude(RSET_GPR, RID_BASE);
1612 lua_assert(irt_ispri(ir->t) || irt_isaddr(ir->t) || irt_isinteger(ir->t));
1613 if (!irref_isk(ref)) {
1614 Reg type, src;
1615 if (irt_is64(ir->t)) {
1616 type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow);
1617 src = ra_alloc1(as, ref, rset_exclude(allow, type));
1618 emit_lso(as, A64I_STRx, RID_TMP, RID_BASE, ofs);
1619 emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 47), RID_TMP, src, type);
1620 } else if (irt_isinteger(ir->t)) {
1621 type = ra_allock(as, (int64_t)LJ_TISNUM << 47, allow);
1622 src = ra_alloc1(as, ref, rset_exclude(allow, type));
1623 emit_lso(as, A64I_STRx, RID_TMP, RID_BASE, ofs);
1624 emit_dnm(as, A64I_ADDx | A64F_EX(A64EX_UXTW), RID_TMP, type, src);
1625 } else {
1626 type = ra_allock(as, ~((int64_t)~irt_toitype(ir->t) << 47), allow);
1627 emit_lso(as, A64I_STRx, type, RID_BASE, ofs);
1628 }
1629 } else {
1630 TValue k;
1631 lj_ir_kvalue(as->J->L, &k, ir);
1632 emit_lso(as, A64I_STRx,
1633 ra_allock(as, tvisnil(&k) ? -1 : (int64_t)k.u64, allow),
1634 RID_BASE, ofs);
1635 }
1636 }
1637 checkmclim(as);
1638 }
1639 lua_assert(map + nent == flinks);
1640}
1641
1642/* -- GC handling --------------------------------------------------------- */
1643
1644/* Check GC threshold and do one or more GC steps. */
1645static void asm_gc_check(ASMState *as)
1646{
1647 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_step_jit];
1648 IRRef args[2];
1649 MCLabel l_end;
1650 Reg tmp1, tmp2;
1651 ra_evictset(as, RSET_SCRATCH);
1652 l_end = emit_label(as);
1653 /* Exit trace if in GCSatomic or GCSfinalize. Avoids syncing GC objects. */
1654 asm_guardcc(as, CC_NE); /* Assumes asm_snap_prep() already done. */
1655 emit_n(as, A64I_CMPx^A64I_K12, RID_RET);
1656 args[0] = ASMREF_TMP1; /* global_State *g */
1657 args[1] = ASMREF_TMP2; /* MSize steps */
1658 asm_gencall(as, ci, args);
1659 tmp1 = ra_releasetmp(as, ASMREF_TMP1);
1660 tmp2 = ra_releasetmp(as, ASMREF_TMP2);
1661 emit_loadi(as, tmp2, as->gcsteps);
1662 /* Jump around GC step if GC total < GC threshold. */
1663 emit_cond_branch(as, CC_LS, l_end);
1664 emit_nm(as, A64I_CMPx, RID_TMP, tmp2);
1665 emit_lso(as, A64I_LDRx, tmp2, tmp1,
1666 (int32_t)offsetof(global_State, gc.threshold));
1667 emit_lso(as, A64I_LDRx, RID_TMP, tmp1,
1668 (int32_t)offsetof(global_State, gc.total));
1669 ra_allockreg(as, i64ptr(J2G(as->J)), tmp1);
1670 as->gcsteps = 0;
1671 checkmclim(as);
1672}
1673
1674/* -- Loop handling ------------------------------------------------------- */
1675
1676/* Fixup the loop branch. */
1677static void asm_loop_fixup(ASMState *as)
1678{
1679 MCode *p = as->mctop;
1680 MCode *target = as->mcp;
1681 if (as->loopinv) { /* Inverted loop branch? */
1682 ptrdiff_t delta = target - (p - 2);
1683 lua_assert(((delta + 0x40000) >> 19) == 0);
1684 /* asm_guardcc already inverted the b.cc and patched the final bl. */
1685 p[-2] |= ((uint32_t)delta & 0x7ffff) << 5;
1686 } else {
1687 ptrdiff_t delta = target - (p - 1);
1688 p[-1] = A64I_B | ((uint32_t)(delta) & 0x03ffffffu);
1689 }
1690}
1691
1692/* -- Head of trace ------------------------------------------------------- */
1693
1694/* Reload L register from g->cur_L. */
1695static void asm_head_lreg(ASMState *as)
1696{
1697 IRIns *ir = IR(ASMREF_L);
1698 if (ra_used(ir)) {
1699 Reg r = ra_dest(as, ir, RSET_GPR);
1700 emit_getgl(as, r, cur_L);
1701 ra_evictk(as);
1702 }
1703}
1704
1705/* Coalesce BASE register for a root trace. */
1706static void asm_head_root_base(ASMState *as)
1707{
1708 IRIns *ir;
1709 asm_head_lreg(as);
1710 ir = IR(REF_BASE);
1711 if (ra_hasreg(ir->r) && (rset_test(as->modset, ir->r) || irt_ismarked(ir->t)))
1712 ra_spill(as, ir);
1713 ra_destreg(as, ir, RID_BASE);
1714}
1715
1716/* Coalesce BASE register for a side trace. */
1717static RegSet asm_head_side_base(ASMState *as, IRIns *irp, RegSet allow)
1718{
1719 IRIns *ir;
1720 asm_head_lreg(as);
1721 ir = IR(REF_BASE);
1722 if (ra_hasreg(ir->r) && (rset_test(as->modset, ir->r) || irt_ismarked(ir->t)))
1723 ra_spill(as, ir);
1724 if (ra_hasspill(irp->s)) {
1725 rset_clear(allow, ra_dest(as, ir, allow));
1726 } else {
1727 Reg r = irp->r;
1728 lua_assert(ra_hasreg(r));
1729 rset_clear(allow, r);
1730 if (r != ir->r && !rset_test(as->freeset, r))
1731 ra_restore(as, regcost_ref(as->cost[r]));
1732 ra_destreg(as, ir, r);
1733 }
1734 return allow;
1735}
1736
1737/* -- Tail of trace ------------------------------------------------------- */
1738
1739/* Fixup the tail code. */
1740static void asm_tail_fixup(ASMState *as, TraceNo lnk)
1741{
1742 MCode *p = as->mctop;
1743 MCode *target;
1744 /* Undo the sp adjustment in BC_JLOOP when exiting to the interpreter. */
1745 int32_t spadj = as->T->spadjust + (lnk ? 0 : sps_scale(SPS_FIXED));
1746 if (spadj == 0) {
1747 as->mctop = --p;
1748 } else {
1749 /* Patch stack adjustment. */
1750 uint32_t k = emit_isk12(spadj);
1751 lua_assert(k);
1752 p[-2] = (A64I_ADDx^k) | A64F_D(RID_SP) | A64F_N(RID_SP);
1753 }
1754 /* Patch exit branch. */
1755 target = lnk ? traceref(as->J, lnk)->mcode : (MCode *)lj_vm_exit_interp;
1756 p[-1] = A64I_B | (((target-p)+1)&0x03ffffffu);
1757}
1758
1759/* Prepare tail of code. */
1760static void asm_tail_prep(ASMState *as)
1761{
1762 MCode *p = as->mctop - 1; /* Leave room for exit branch. */
1763 if (as->loopref) {
1764 as->invmcp = as->mcp = p;
1765 } else {
1766 as->mcp = p-1; /* Leave room for stack pointer adjustment. */
1767 as->invmcp = NULL;
1768 }
1769 *p = 0; /* Prevent load/store merging. */
1770}
1771
1772/* -- Trace setup --------------------------------------------------------- */
1773
1774/* Ensure there are enough stack slots for call arguments. */
1775static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci)
1776{
1777 IRRef args[CCI_NARGS_MAX*2];
1778 uint32_t i, nargs = CCI_XNARGS(ci);
1779 int nslots = 0, ngpr = REGARG_NUMGPR, nfpr = REGARG_NUMFPR;
1780 asm_collectargs(as, ir, ci, args);
1781 for (i = 0; i < nargs; i++) {
1782 if (args[i] && irt_isfp(IR(args[i])->t)) {
1783 if (nfpr > 0) nfpr--; else nslots += 2;
1784 } else {
1785 if (ngpr > 0) ngpr--; else nslots += 2;
1786 }
1787 }
1788 if (nslots > as->evenspill) /* Leave room for args in stack slots. */
1789 as->evenspill = nslots;
1790 return REGSP_HINT(RID_RET);
1791}
1792
1793static void asm_setup_target(ASMState *as)
1794{
1795 /* May need extra exit for asm_stack_check on side traces. */
1796 asm_exitstub_setup(as, as->T->nsnap + (as->parent ? 1 : 0));
1797}
1798
1799/* -- Trace patching ------------------------------------------------------ */
1800
1801/* Patch exit jumps of existing machine code to a new target. */
1802void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target)
1803{
1804 MCode *p = T->mcode;
1805 MCode *pe = (MCode *)((char *)p + T->szmcode);
1806 MCode *cstart = NULL, *cend = p;
1807 MCode *mcarea = lj_mcode_patch(J, p, 0);
1808 MCode *px = exitstub_addr(J, exitno);
1809 for (; p < pe; p++) {
1810 /* Look for bl exitstub, replace with b target. */
1811 uint32_t ins = *p;
1812 if ((ins & 0xfc000000u) == 0x94000000u &&
1813 ((ins ^ (px-p)) & 0x03ffffffu) == 0) {
1814 *p = (ins & 0x7c000000u) | ((target-p) & 0x03ffffffu);
1815 cend = p+1;
1816 if (!cstart) cstart = p;
1817 }
1818 }
1819 lua_assert(cstart != NULL);
1820 lj_mcode_sync(cstart, cend);
1821 lj_mcode_patch(J, mcarea, 1);
1822}
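/* Editor's note: the mask trick above turns BL into B in place. BL is
** 0x94000000 | imm26 and B is 0x14000000 | imm26, so "ins & 0x7c000000"
** clears both the link bit and the old delta before the new 26 bit
** delta to target is ORed in.
*/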
1823
diff --git a/src/lj_ccall.c b/src/lj_ccall.c
index b599be33..a3ae8b05 100644
--- a/src/lj_ccall.c
+++ b/src/lj_ccall.c
@@ -331,7 +331,7 @@
331 331
332#define CCALL_HANDLE_COMPLEXARG \ 332#define CCALL_HANDLE_COMPLEXARG \
333 /* Pass complex by value in separate (!) FPRs or on stack. */ \ 333 /* Pass complex by value in separate (!) FPRs or on stack. */ \
334 isfp = ctr->size == 2*sizeof(float) ? 2 : 1; 334 isfp = sz == 2*sizeof(float) ? 2 : 1;
335 335
336#define CCALL_HANDLE_REGARG \ 336#define CCALL_HANDLE_REGARG \
337 if (LJ_TARGET_IOS && isva) { \ 337 if (LJ_TARGET_IOS && isva) { \
diff --git a/src/lj_dispatch.h b/src/lj_dispatch.h
index 82708077..362d6202 100644
--- a/src/lj_dispatch.h
+++ b/src/lj_dispatch.h
@@ -107,6 +107,7 @@ typedef struct GG_State {
107#define J2G(J) (&J2GG(J)->g) 107#define J2G(J) (&J2GG(J)->g)
108#define G2J(gl) (&G2GG(gl)->J) 108#define G2J(gl) (&G2GG(gl)->J)
109#define L2J(L) (&L2GG(L)->J) 109#define L2J(L) (&L2GG(L)->J)
110#define GG_G2J (GG_OFS(J) - GG_OFS(g))
110#define GG_G2DISP (GG_OFS(dispatch) - GG_OFS(g)) 111#define GG_G2DISP (GG_OFS(dispatch) - GG_OFS(g))
111#define GG_DISP2G (GG_OFS(g) - GG_OFS(dispatch)) 112#define GG_DISP2G (GG_OFS(g) - GG_OFS(dispatch))
112#define GG_DISP2J (GG_OFS(J) - GG_OFS(dispatch)) 113#define GG_DISP2J (GG_OFS(J) - GG_OFS(dispatch))
diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h
new file mode 100644
index 00000000..eb8f7fc7
--- /dev/null
+++ b/src/lj_emit_arm64.h
@@ -0,0 +1,397 @@
1/*
2** ARM64 instruction emitter.
3** Copyright (C) 2005-2016 Mike Pall. See Copyright Notice in luajit.h
4**
5** Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
6** Sponsored by Cisco Systems, Inc.
7*/
8
9/* -- Constant encoding --------------------------------------------------- */
10
11static uint64_t get_k64val(IRIns *ir)
12{
13 if (ir->o == IR_KINT64) {
14 return ir_kint64(ir)->u64;
15 } else if (ir->o == IR_KGC) {
16 return (uint64_t)ir_kgc(ir);
17 } else if (ir->o == IR_KPTR || ir->o == IR_KKPTR) {
18 return (uint64_t)ir_kptr(ir);
19 } else {
20 lua_assert(ir->o == IR_KINT || ir->o == IR_KNULL);
21 return ir->i; /* Sign-extended. */
22 }
23}
24
25/* Encode constant in K12 format for data processing instructions. */
26static uint32_t emit_isk12(int64_t n)
27{
28 uint64_t k = (n < 0) ? -n : n;
29 uint32_t m = (n < 0) ? 0x40000000 : 0;
30 if (k < 0x1000) {
31 return A64I_K12|m|A64F_U12(k);
32 } else if ((k & 0xfff000) == k) {
33 return A64I_K12|m|0x400000|A64F_U12(k>>12);
34 }
35 return 0;
36}
37
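/* Editor's note: illustrative results (assuming the standard A64 add/sub
** immediate format: 12 bit unsigned, optionally LSL #12; a negative n
** flips ADD<->SUB via the 0x40000000 bit when XORed into the opcode):
**
**   emit_isk12(0xfff)   -> encodable, imm12 = 0xfff
**   emit_isk12(0x1000)  -> encodable, imm12 = 1 with LSL #12
**   emit_isk12(-1)      -> encodable as the inverse op with imm12 = 1
**   emit_isk12(0x1001)  -> 0, must be materialized in a register
*/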
38#define emit_clz64(n) __builtin_clzll(n)
39#define emit_ctz64(n) __builtin_ctzll(n)
40
41/* Encode constant in K13 format for logical data processing instructions. */
42static uint32_t emit_isk13(uint64_t n, int is64)
43{
44 int inv = 0, w = 128, lz, tz;
45 if (n & 1) { n = ~n; w = 64; inv = 1; } /* Avoid wrap-around of ones. */
46 if (!n) return 0; /* Neither all-zero nor all-ones are allowed. */
47 do { /* Find the repeat width. */
48 if (is64 && (uint32_t)(n^(n>>32))) break;
49 n = (uint32_t)n; w = 32; if ((n^(n>>16)) & 0xffff) break;
50 n = n & 0xffff; w = 16; if ((n^(n>>8)) & 0xff) break;
51 n = n & 0xff; w = 8; if ((n^(n>>4)) & 0xf) break;
52 n = n & 0xf; w = 4; if ((n^(n>>2)) & 0x3) break;
53 n = n & 0x3; w = 2;
54 } while (0);
55 lz = emit_clz64(n);
56 tz = emit_ctz64(n);
57 if ((int64_t)(n << lz) >> (lz+tz) != -1ll) return 0; /* Non-contiguous? */
58 if (inv)
59 return A64I_K13 | (((lz-w) & 127) << 16) | (((lz+tz-w-1) & 63) << 10);
60 else
61 return A64I_K13 | ((w-tz) << 16) | (((63-lz-tz-w-w) & 63) << 10);
62}
63
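/* Editor's note: K13 is the A64 "bitmask immediate" class: a contiguous
** run of ones, rotated, replicated across 2/4/8/16/32/64 bit elements.
** Illustrative results:
**
**   emit_isk13(0x00000000ffffffffull, 1) -> encodable (32 ones)
**   emit_isk13((1ull << 47) - 1, 1)      -> encodable (LJ_GCVMASK, above)
**   emit_isk13(0x5555555555555555ull, 1) -> encodable (01 pattern, width 2)
**   emit_isk13(0x12345, 1)               -> 0, not a run/replication
*/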
64static uint32_t emit_isfpk64(uint64_t n)
65{
66 uint64_t etop9 = ((n >> 54) & 0x1ff);
67 if ((n << 16) == 0 && (etop9 == 0x100 || etop9 == 0x0ff)) {
68 return (uint32_t)(((n >> 48) & 0x7f) | ((n >> 56) & 0x80));
69 }
70 return ~0u;
71}
72
73/* -- Emit basic instructions --------------------------------------------- */
74
75static void emit_dnm(ASMState *as, A64Ins ai, Reg rd, Reg rn, Reg rm)
76{
77 *--as->mcp = ai | A64F_D(rd) | A64F_N(rn) | A64F_M(rm);
78}
79
80static void emit_dm(ASMState *as, A64Ins ai, Reg rd, Reg rm)
81{
82 *--as->mcp = ai | A64F_D(rd) | A64F_M(rm);
83}
84
85static void emit_dn(ASMState *as, A64Ins ai, Reg rd, Reg rn)
86{
87 *--as->mcp = ai | A64F_D(rd) | A64F_N(rn);
88}
89
90static void emit_nm(ASMState *as, A64Ins ai, Reg rn, Reg rm)
91{
92 *--as->mcp = ai | A64F_N(rn) | A64F_M(rm);
93}
94
95static void emit_d(ASMState *as, A64Ins ai, Reg rd)
96{
97 *--as->mcp = ai | A64F_D(rd);
98}
99
100static void emit_n(ASMState *as, A64Ins ai, Reg rn)
101{
102 *--as->mcp = ai | A64F_N(rn);
103}
104
105static int emit_checkofs(A64Ins ai, int64_t ofs)
106{
107 int scale = (ai >> 30) & 3;
108 if (ofs < 0 || (ofs & ((1<<scale)-1))) {
109 return (ofs >= -256 && ofs <= 255) ? -1 : 0;
110 } else {
111 return (ofs < (4096<<scale)) ? 1 : 0;
112 }
113}
114
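/* Editor's note: worked examples for the three return values, with the
** scale taken from bits 30-31 of the opcode (3 for LDRx/STRx):
**
**   emit_checkofs(A64I_LDRx, 32760) ->  1  scaled u12 form (4095*8)
**   emit_checkofs(A64I_LDRx, -8)    -> -1  unscaled s9 (LDUR/STUR) form
**   emit_checkofs(A64I_LDRx, 4)     -> -1  misaligned for the 8 byte scale
**   emit_checkofs(A64I_LDRx, 40000) ->  0  out of range for both forms
*/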
115static void emit_lso(ASMState *as, A64Ins ai, Reg rd, Reg rn, int64_t ofs)
116{
117 int ot = emit_checkofs(ai, ofs), sc = (ai >> 30) & 3;
118 lua_assert(ot);
119 /* Combine LDR/STR pairs to LDP/STP. */
120 if ((sc == 2 || sc == 3) &&
121 (!(ai & 0x400000) || rd != rn) &&
122 as->mcp != as->mcloop) {
123 uint32_t prev = *as->mcp & ~A64F_D(31);
124 int ofsm = ofs - (1<<sc), ofsp = ofs + (1<<sc);
125 A64Ins aip;
126 if (prev == (ai | A64F_N(rn) | A64F_U12(ofsm>>sc)) ||
127 prev == ((ai^A64I_LS_U) | A64F_N(rn) | A64F_S9(ofsm&0x1ff))) {
128 aip = (A64F_A(rd) | A64F_D(*as->mcp & 31));
129 } else if (prev == (ai | A64F_N(rn) | A64F_U12(ofsp>>sc)) ||
130 prev == ((ai^A64I_LS_U) | A64F_N(rn) | A64F_S9(ofsp&0x1ff))) {
131 aip = (A64F_D(rd) | A64F_A(*as->mcp & 31));
132 ofsm = ofs;
133 } else {
134 goto nopair;
135 }
136 if (ofsm >= (-64<<sc) && ofsm <= (63<<sc)) {
137 *as->mcp = aip | A64F_N(rn) | ((ofsm >> sc) << 15) |
138 (ai ^ ((ai == A64I_LDRx || ai == A64I_STRx) ? 0x50000000 : 0x90000000));
139 return;
140 }
141 }
142nopair:
143 if (ot == 1)
144 *--as->mcp = ai | A64F_D(rd) | A64F_N(rn) | A64F_U12(ofs >> sc);
145 else
146 *--as->mcp = (ai^A64I_LS_U) | A64F_D(rd) | A64F_N(rn) | A64F_S9(ofs & 0x1ff);
147}
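/* Editor's note: example of the pairing above. Since the assembler emits
** backwards, storing two adjacent 8 byte slots, e.g.
**
**   str x1, [sp, #8]   <- already in the buffer
**   str x0, [sp]       <- being emitted now
**
** collapses into a single "stp x0, x1, [sp]", provided the offset fits
** the signed 7 bit scaled LDP/STP range (-64..63 elements) and, for
** loads, rd != rn (both checked above).
*/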
148
149/* -- Emit loads/stores --------------------------------------------------- */
150
151/* Prefer rematerialization of BASE/L from global_State over spills. */
152#define emit_canremat(ref) ((ref) <= ASMREF_L)
153
154/* Try to find an N-step delta relative to other consts with N < lim. */
155static int emit_kdelta(ASMState *as, Reg rd, uint64_t k, int lim)
156{
157 RegSet work = ~as->freeset & RSET_GPR;
158 if (lim <= 1) return 0; /* Can't beat that. */
159 while (work) {
160 Reg r = rset_picktop(work);
161 IRRef ref = regcost_ref(as->cost[r]);
162 lua_assert(r != rd);
163 if (ref < REF_TRUE) {
164 uint64_t kx = ra_iskref(ref) ? (uint64_t)ra_krefk(as, ref) :
165 get_k64val(IR(ref));
166 int64_t delta = (int64_t)(k - kx);
167 if (delta == 0) {
168 emit_dm(as, A64I_MOVx, rd, r);
169 return 1;
170 } else {
171 uint32_t k12 = emit_isk12(delta < 0 ? -delta : delta);
172 if (k12) {
173 emit_dn(as, (delta < 0 ? A64I_SUBx : A64I_ADDx)^k12, rd, r);
174 return 1;
175 }
176 /* Do other ops or multi-step deltas pay off? Probably not.
177 ** E.g. XOR rarely helps with pointer consts.
178 */
179 }
180 }
181 rset_clear(work, r);
182 }
183 return 0; /* Failed. */
184}
185
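/* Editor's note: example of a 1-step delta. If some allocated register r
** already holds the constant 0x7f0000000000, then materializing the
** constant 0x7f0000000008 costs one instruction, "add rd, r, #8",
** instead of a full MOVZ/MOVK sequence.
*/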
186static void emit_loadk(ASMState *as, Reg rd, uint64_t u64, int is64)
187{
188 uint32_t k13 = emit_isk13(u64, is64);
189 if (k13) { /* Can the constant be represented as a bitmask immediate? */
190 emit_dn(as, (is64|A64I_ORRw)^k13, rd, RID_ZERO);
191 } else {
192 int i, zeros = 0, ones = 0, neg;
193 if (!is64) u64 = (int64_t)(int32_t)u64; /* Sign-extend. */
194 /* Count homogeneous 16 bit fragments. */
195 for (i = 0; i < 4; i++) {
196 uint64_t frag = (u64 >> i*16) & 0xffff;
197 zeros += (frag == 0);
198 ones += (frag == 0xffff);
199 }
200 neg = ones > zeros; /* Use MOVN if it pays off. */
201 if (!emit_kdelta(as, rd, u64, 4 - (neg ? ones : zeros))) {
202 int shift = 0, lshift = 0;
203 uint64_t n64 = neg ? ~u64 : u64;
204 if (n64 != 0) {
205 /* Find first/last fragment to be filled. */
206 shift = (63-emit_clz64(n64)) & ~15;
207 lshift = emit_ctz64(n64) & ~15;
208 }
209 /* MOVK requires the original value (u64). */
210 while (shift > lshift) {
211 uint32_t u16 = (u64 >> shift) & 0xffff;
212 /* Skip fragments that are correctly filled by MOVN/MOVZ. */
213 if (u16 != (neg ? 0xffff : 0))
214 emit_d(as, is64 | A64I_MOVKw | A64F_U16(u16) | A64F_LSL16(shift), rd);
215 shift -= 16;
216 }
217 /* But MOVN needs an inverted value (n64). */
218 emit_d(as, (neg ? A64I_MOVNx : A64I_MOVZx) |
219 A64F_U16((n64 >> lshift) & 0xffff) | A64F_LSL16(lshift), rd);
220 }
221 }
222}
223
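/* Editor's note: illustrative expansions (standard MOVZ/MOVN/MOVK
** semantics; the emitter works backwards, so the MOVZ/MOVN below is the
** first instruction executed):
**
**   0x00000000deadbeef -> movz rd, #0xbeef; movk rd, #0xdead, lsl #16
**   0xffffffffffff1234 -> movn rd, #0xedcb  (ones win: one instruction)
**   0x0000ffff00000000 -> movz rd, #0xffff, lsl #32
*/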
224/* Load a 32 bit constant into a GPR. */
225#define emit_loadi(as, rd, i) emit_loadk(as, rd, i, 0)
226
227/* Load a 64 bit constant into a GPR. */
228#define emit_loadu64(as, rd, i) emit_loadk(as, rd, i, A64I_X)
229
230#define emit_loada(as, r, addr) emit_loadu64(as, (r), (uintptr_t)(addr))
231
232#define glofs(as, k) \
233 ((intptr_t)((uintptr_t)(k) - (uintptr_t)&J2GG(as->J)->g))
234#define mcpofs(as, k) \
235 ((intptr_t)((uintptr_t)(k) - (uintptr_t)as->mcp))
236#define checkmcpofs(as, k) \
237 ((((mcpofs(as, k)>>2) + 0x00040000) >> 19) == 0)
238
239static Reg ra_allock(ASMState *as, intptr_t k, RegSet allow);
240
241/* Get/set from constant pointer. */
242static void emit_lsptr(ASMState *as, A64Ins ai, Reg r, void *p)
243{
244 /* First, check if pc + offset is in range. */
245 if ((ai & 0x00400000) && checkmcpofs(as, p)) {
246 emit_d(as, A64I_LDRLx | A64F_S19(mcpofs(as, p)>>2), r);
247 } else {
248 Reg base = RID_GL; /* Next, try GL + offset. */
249 int64_t ofs = glofs(as, p);
250 if (!emit_checkofs(ai, ofs)) { /* Else split up into base reg + offset. */
251 int64_t i64 = i64ptr(p);
252 base = ra_allock(as, (i64 & ~0x7fffull), rset_exclude(RSET_GPR, r));
253 ofs = i64 & 0x7fffull;
254 }
255 emit_lso(as, ai, r, base, ofs);
256 }
257}
258
259/* Load 64 bit IR constant into register. */
260static void emit_loadk64(ASMState *as, Reg r, IRIns *ir)
261{
262 const uint64_t *k = &ir_k64(ir)->u64;
263 int64_t ofs;
264 if (r >= RID_MAX_GPR) {
265 uint32_t fpk = emit_isfpk64(*k);
266 if (fpk != ~0u) {
267 emit_d(as, A64I_FMOV_DI | A64F_FP8(fpk), (r & 31));
268 return;
269 }
270 }
271 ofs = glofs(as, k);
272 if (emit_checkofs(A64I_LDRx, ofs)) {
273 emit_lso(as, r >= RID_MAX_GPR ? A64I_LDRd : A64I_LDRx,
274 (r & 31), RID_GL, ofs);
275 } else {
276 if (r >= RID_MAX_GPR) {
277 emit_dn(as, A64I_FMOV_D_R, (r & 31), RID_TMP);
278 r = RID_TMP;
279 }
280 if (checkmcpofs(as, k))
281 emit_d(as, A64I_LDRLx | A64F_S19(mcpofs(as, k)>>2), r);
282 else
283 emit_loadu64(as, r, *k);
284 }
285}
286
287/* Get/set global_State fields. */
288#define emit_getgl(as, r, field) \
289 emit_lsptr(as, A64I_LDRx, (r), (void *)&J2G(as->J)->field)
290#define emit_setgl(as, r, field) \
291 emit_lsptr(as, A64I_STRx, (r), (void *)&J2G(as->J)->field)
292
293/* Trace number is determined from pc of exit instruction. */
294#define emit_setvmstate(as, i) UNUSED(i)
295
296/* -- Emit control-flow instructions -------------------------------------- */
297
298/* Label for internal jumps. */
299typedef MCode *MCLabel;
300
301/* Return label pointing to current PC. */
302#define emit_label(as) ((as)->mcp)
303
304static void emit_cond_branch(ASMState *as, A64CC cond, MCode *target)
305{
306 MCode *p = as->mcp;
307 ptrdiff_t delta = target - (p - 1);
308 lua_assert(((delta + 0x40000) >> 19) == 0);
309 *--p = A64I_BCC | A64F_S19((uint32_t)delta & 0x7ffff) | cond;
310 as->mcp = p;
311}
312
313static void emit_branch(ASMState *as, A64Ins ai, MCode *target)
314{
315 MCode *p = as->mcp;
316 ptrdiff_t delta = target - (p - 1);
317 lua_assert(((delta + 0x02000000) >> 26) == 0);
318 *--p = ai | ((uint32_t)delta & 0x03ffffffu);
319 as->mcp = p;
320}
321
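/* Editor's note: the asserts above imply the A64 branch ranges: B.cond
** has a 19 bit signed offset (+-1 MB), B/BL a 26 bit signed offset
** (+-128 MB). Deltas are counted in 32 bit instruction words, since
** MCode* is a uint32_t pointer.
*/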
322#define emit_jmp(as, target) emit_branch(as, A64I_B, (target))
323
324static void emit_call(ASMState *as, void *target)
325{
326 MCode *p = --as->mcp;
327 ptrdiff_t delta = (char *)target - (char *)p;
328 if ((((delta>>2) + 0x02000000) >> 26) == 0) {
329 *p = A64I_BL | ((uint32_t)(delta>>2) & 0x03ffffffu);
330 } else { /* Target out of range: need indirect call. But don't use R0-R7. */
331 Reg r = ra_allock(as, i64ptr(target),
332 RSET_RANGE(RID_X8, RID_MAX_GPR)-RSET_FIXED);
333 *p = A64I_BLR | A64F_N(r);
334 }
335}
336
337/* -- Emit generic operations --------------------------------------------- */
338
339/* Generic move between two regs. */
340static void emit_movrr(ASMState *as, IRIns *ir, Reg dst, Reg src)
341{
342 if (dst >= RID_MAX_GPR) {
343 emit_dn(as, irt_isnum(ir->t) ? A64I_FMOV_D : A64I_FMOV_S,
344 (dst & 31), (src & 31));
345 return;
346 }
347 if (as->mcp != as->mcloop) { /* Swap early registers for loads/stores. */
348 MCode ins = *as->mcp, swp = (src^dst);
349 if ((ins & 0xbf800000) == 0xb9000000) {
350 if (!((ins ^ (dst << 5)) & 0x000003e0))
351 *as->mcp = ins ^ (swp << 5); /* Swap N in load/store. */
352 if (!(ins & 0x00400000) && !((ins ^ dst) & 0x0000001f))
353 *as->mcp = ins ^ swp; /* Swap D in store. */
354 }
355 }
356 emit_dm(as, A64I_MOVx, dst, src);
357}
358
359/* Generic load of register with base and (small) offset address. */
360static void emit_loadofs(ASMState *as, IRIns *ir, Reg r, Reg base, int32_t ofs)
361{
362 if (r >= RID_MAX_GPR)
363 emit_lso(as, irt_isnum(ir->t) ? A64I_LDRd : A64I_LDRs, (r & 31), base, ofs);
364 else
365 emit_lso(as, irt_is64(ir->t) ? A64I_LDRx : A64I_LDRw, r, base, ofs);
366}
367
368/* Generic store of register with base and (small) offset address. */
369static void emit_storeofs(ASMState *as, IRIns *ir, Reg r, Reg base, int32_t ofs)
370{
371 if (r >= RID_MAX_GPR)
372 emit_lso(as, irt_isnum(ir->t) ? A64I_STRd : A64I_STRs, (r & 31), base, ofs);
373 else
374 emit_lso(as, irt_is64(ir->t) ? A64I_STRx : A64I_STRw, r, base, ofs);
375}
376
377/* Emit an arithmetic operation with a constant operand. */
378static void emit_opk(ASMState *as, A64Ins ai, Reg dest, Reg src,
379 int32_t i, RegSet allow)
380{
381 uint32_t k = emit_isk12(i);
382 if (k)
383 emit_dn(as, ai^k, dest, src);
384 else
385 emit_dnm(as, ai, dest, src, ra_allock(as, i, allow));
386}
387
388/* Add offset to pointer. */
389static void emit_addptr(ASMState *as, Reg r, int32_t ofs)
390{
391 if (ofs)
392 emit_opk(as, ofs < 0 ? A64I_SUBx : A64I_ADDx, r, r,
393 ofs < 0 ? -ofs : ofs, rset_exclude(RSET_GPR, r));
394}
395
396#define emit_spsub(as, ofs) emit_addptr(as, RID_SP, -(ofs))
397
diff --git a/src/lj_gdbjit.c b/src/lj_gdbjit.c
index 8b72be7d..8bc2474c 100644
--- a/src/lj_gdbjit.c
+++ b/src/lj_gdbjit.c
@@ -296,6 +296,9 @@ enum {
296#elif LJ_TARGET_ARM 296#elif LJ_TARGET_ARM
297 DW_REG_SP = 13, 297 DW_REG_SP = 13,
298 DW_REG_RA = 14, 298 DW_REG_RA = 14,
299#elif LJ_TARGET_ARM64
300 DW_REG_SP = 31,
301 DW_REG_RA = 30,
299#elif LJ_TARGET_PPC 302#elif LJ_TARGET_PPC
300 DW_REG_SP = 1, 303 DW_REG_SP = 1,
301 DW_REG_RA = 65, 304 DW_REG_RA = 65,
@@ -374,6 +377,8 @@ static const ELFheader elfhdr_template = {
374 .machine = 62, 377 .machine = 62,
375#elif LJ_TARGET_ARM 378#elif LJ_TARGET_ARM
376 .machine = 40, 379 .machine = 40,
380#elif LJ_TARGET_ARM64
381 .machine = 183,
377#elif LJ_TARGET_PPC 382#elif LJ_TARGET_PPC
378 .machine = 20, 383 .machine = 20,
379#elif LJ_TARGET_MIPS 384#elif LJ_TARGET_MIPS
@@ -563,6 +568,13 @@ static void LJ_FASTCALL gdbjit_ehframe(GDBJITctx *ctx)
563 int i; 568 int i;
564 for (i = 11; i >= 4; i--) { DB(DW_CFA_offset|i); DUV(2+(11-i)); } 569 for (i = 11; i >= 4; i--) { DB(DW_CFA_offset|i); DUV(2+(11-i)); }
565 } 570 }
571#elif LJ_TARGET_ARM64
572 {
573 int i;
574 DB(DW_CFA_offset|31); DUV(2);
575 for (i = 28; i >= 19; i--) { DB(DW_CFA_offset|i); DUV(3+(28-i)); }
576 for (i = 15; i >= 8; i--) { DB(DW_CFA_offset|32|i); DUV(28-i); }
577 }
566#elif LJ_TARGET_PPC 578#elif LJ_TARGET_PPC
567 { 579 {
568 int i; 580 int i;
diff --git a/src/lj_target.h b/src/lj_target.h
index abea8d5b..c069eb95 100644
--- a/src/lj_target.h
+++ b/src/lj_target.h
@@ -55,7 +55,7 @@ typedef uint32_t RegSP;
55/* Bitset for registers. 32 registers suffice for most architectures. 55/* Bitset for registers. 32 registers suffice for most architectures.
56** Note that one set holds bits for both GPRs and FPRs. 56** Note that one set holds bits for both GPRs and FPRs.
57*/ 57*/
58#if LJ_TARGET_PPC || LJ_TARGET_MIPS 58#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64
59typedef uint64_t RegSet; 59typedef uint64_t RegSet;
60#else 60#else
61typedef uint32_t RegSet; 61typedef uint32_t RegSet;
@@ -69,7 +69,7 @@ typedef uint32_t RegSet;
69#define rset_set(rs, r) (rs |= RID2RSET(r)) 69#define rset_set(rs, r) (rs |= RID2RSET(r))
70#define rset_clear(rs, r) (rs &= ~RID2RSET(r)) 70#define rset_clear(rs, r) (rs &= ~RID2RSET(r))
71#define rset_exclude(rs, r) (rs & ~RID2RSET(r)) 71#define rset_exclude(rs, r) (rs & ~RID2RSET(r))
72#if LJ_TARGET_PPC || LJ_TARGET_MIPS 72#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64
73#define rset_picktop(rs) ((Reg)(__builtin_clzll(rs)^63)) 73#define rset_picktop(rs) ((Reg)(__builtin_clzll(rs)^63))
74#define rset_pickbot(rs) ((Reg)__builtin_ctzll(rs)) 74#define rset_pickbot(rs) ((Reg)__builtin_ctzll(rs))
75#else 75#else
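
ARM64 joins PPC and MIPS here because its allocatable register file no longer fits a 32-bit set: 32 GPRs plus 32 FPRs share one RegSet. The existing macros then work unchanged on the wider type:

  /* One bit per register id: GPRs in bits 0-31, FPRs in bits 32-63. */
  typedef uint64_t RegSet;
  #define RID2RSET(r)       (((RegSet)1) << (r))
  #define rset_test(rs, r)  ((int)((rs) >> (r)) & 1)
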
diff --git a/src/lj_target_arm64.h b/src/lj_target_arm64.h
index 57ab134f..0cef06d5 100644
--- a/src/lj_target_arm64.h
+++ b/src/lj_target_arm64.h
@@ -55,7 +55,8 @@ enum {
55 55
56/* Make use of all registers, except for x18, fp, lr and sp. */ 56/* Make use of all registers, except for x18, fp, lr and sp. */
57#define RSET_FIXED \ 57#define RSET_FIXED \
58 (RID2RSET(RID_X18)|RID2RSET(RID_FP)|RID2RSET(RID_LR)|RID2RSET(RID_SP)) 58 (RID2RSET(RID_X18)|RID2RSET(RID_FP)|RID2RSET(RID_LR)|RID2RSET(RID_SP)|\
59 RID2RSET(RID_GL))
59#define RSET_GPR (RSET_RANGE(RID_MIN_GPR, RID_MAX_GPR) - RSET_FIXED) 60#define RSET_GPR (RSET_RANGE(RID_MIN_GPR, RID_MAX_GPR) - RSET_FIXED)
60#define RSET_FPR RSET_RANGE(RID_MIN_FPR, RID_MAX_FPR) 61#define RSET_FPR RSET_RANGE(RID_MIN_FPR, RID_MAX_FPR)
61#define RSET_ALL (RSET_GPR|RSET_FPR) 62#define RSET_ALL (RSET_GPR|RSET_FPR)
@@ -73,25 +74,235 @@ enum {
73#define REGARG_LASTFPR RID_D7 74#define REGARG_LASTFPR RID_D7
74#define REGARG_NUMFPR 8 75#define REGARG_NUMFPR 8
75 76
77/* -- Spill slots --------------------------------------------------------- */
78
79/* Spill slots are 32 bits wide. An even/odd pair is used for FPRs.
80**
81** SPS_FIXED: Available fixed spill slots in interpreter frame.
82** This definition must match the vm_arm64.dasc file.
83** Pre-allocate some slots to avoid an sp adjustment in every root trace.
84**
85** SPS_FIRST: First spill slot for general use. Reserve at least two 32-bit slots.
86*/
87#define SPS_FIXED 4
88#define SPS_FIRST 2
89
90#define SPOFS_TMP 0
91
92#define sps_scale(slot) (4 * (int32_t)(slot))
93#define sps_align(slot) (((slot) - SPS_FIXED + 3) & ~3)
94
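
A worked example of the two macros (illustrative numbers), for a trace whose highest used slot is 6:

  /* sps_scale(6) = 4*6 = 24: byte offset of slot 6 from sp.
  ** sps_align(6) = (6 - 4 + 3) & ~3 = 4: extra slots beyond SPS_FIXED,
  ** rounded up to a multiple of 4 (4 slots * 4 bytes = 16 bytes), so sp
  ** stays 16-byte aligned as AArch64 requires. SPS_FIXED itself accounts
  ** for the 'sub sp, sp, #16' done by BC_JLOOP below. */
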
95/* -- Exit state ---------------------------------------------------------- */
96
97/* This definition must match the *.dasc file(s). */
98typedef struct {
99 lua_Number fpr[RID_NUM_FPR]; /* Floating-point registers. */
100 intptr_t gpr[RID_NUM_GPR]; /* General-purpose registers. */
101 int32_t spill[256]; /* Spill slots. */
102} ExitState;
103
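
The struct mirrors the frame built by the savex_ macro in vm_arm64.dasc: 32 FPRs, then 32 GPRs, 8 bytes each, with the spill slots above. An illustrative consistency check, not part of the patch:

  #include <stddef.h>
  LJ_STATIC_ASSERT(offsetof(ExitState, gpr) == 32*8);    /* After 32 doubles. */
  LJ_STATIC_ASSERT(offsetof(ExitState, spill) == 64*8);  /* After 32 GPRs. */
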
104/* PC after instruction that caused an exit. Used to find the trace number. */
105#define EXITSTATE_PCREG RID_LR
106/* Highest exit + 1 indicates stack check. */
107#define EXITSTATE_CHECKEXIT 1
108
109#define EXITSTUB_SPACING 4
110#define EXITSTUBS_PER_GROUP 32
111
112
76/* -- Instructions -------------------------------------------------------- */ 113/* -- Instructions -------------------------------------------------------- */
77 114
78/* Instruction fields. */ 115/* Instruction fields. */
79#define A64F_D(r) (r) 116#define A64F_D(r) (r)
80#define A64F_N(r) ((r) << 5) 117#define A64F_N(r) ((r) << 5)
81#define A64F_A(r) ((r) << 10) 118#define A64F_A(r) ((r) << 10)
82#define A64F_M(r) ((r) << 16) 119#define A64F_M(r) ((r) << 16)
120#define A64F_IMMS(x) ((x) << 10)
121#define A64F_IMMR(x) ((x) << 16)
83#define A64F_U16(x) ((x) << 5) 122#define A64F_U16(x) ((x) << 5)
123#define A64F_U12(x) ((x) << 10)
84#define A64F_S26(x) (x) 124#define A64F_S26(x) (x)
85#define A64F_S19(x) ((x) << 5) 125#define A64F_S19(x) ((x) << 5)
126#define A64F_S9(x) ((x) << 12)
127#define A64F_SH(sh, x) (((sh) << 22) | ((x) << 10))
128#define A64F_EX(ex) (A64I_EX | ((ex) << 13))
129#define A64F_EXSH(ex,x) (A64I_EX | ((ex) << 13) | ((x) << 10))
130#define A64F_FP8(x) ((x) << 13)
131#define A64F_CC(cc) ((cc) << 12)
132#define A64F_LSL16(x) (((x) / 16) << 21)
133#define A64F_BSH(sh) ((sh) << 10)
86 134
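
Two worked encodings built from these fields (values verifiable against any AArch64 reference):

  /* add x0, x1, x2  (shifted-register form):
  **   A64I_ADDx | A64F_M(2) | A64F_N(1) | A64F_D(0)
  **   = 0x8b000000 | 0x00020000 | 0x00000020 = 0x8b020020
  ** add x0, x1, #16 (immediate form, via emit_isk12/emit_opk):
  **   (A64I_ADDx ^ A64I_K12) | A64F_U12(16) | A64F_N(1) | A64F_D(0)
  **   = 0x91000000 | 0x00004000 | 0x00000020 = 0x91004020 */
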
87typedef enum A64Ins { 135typedef enum A64Ins {
136 A64I_S = 0x20000000,
137 A64I_X = 0x80000000,
138 A64I_EX = 0x00200000,
139 A64I_K12 = 0x1a000000,
140 A64I_K13 = 0x18000000,
141 A64I_LS_U = 0x01000000,
142 A64I_LS_S = 0x00800000,
143 A64I_LS_R = 0x01200800,
144 A64I_LS_UXTWx = 0x00005000,
145 A64I_LS_LSLx = 0x00007000,
146
147 A64I_ADDw = 0x0b000000,
148 A64I_ADDx = 0x8b000000,
149 A64I_ADDSw = 0x2b000000,
150 A64I_ADDSx = 0xab000000,
151 A64I_NEGw = 0x4b0003e0,
152 A64I_NEGx = 0xcb0003e0,
153 A64I_SUBw = 0x4b000000,
154 A64I_SUBx = 0xcb000000,
155 A64I_SUBSw = 0x6b000000,
156 A64I_SUBSx = 0xeb000000,
157
158 A64I_MULw = 0x1b007c00,
159 A64I_MULx = 0x9b007c00,
160 A64I_SMULL = 0x9b207c00,
161
162 A64I_ANDw = 0x0a000000,
163 A64I_ANDx = 0x8a000000,
164 A64I_ANDSw = 0x6a000000,
165 A64I_ANDSx = 0xea000000,
166 A64I_EORw = 0x4a000000,
167 A64I_EORx = 0xca000000,
168 A64I_ORRw = 0x2a000000,
169 A64I_ORRx = 0xaa000000,
170 A64I_TSTw = 0x6a00001f,
171 A64I_TSTx = 0xea00001f,
172
173 A64I_CMPw = 0x6b00001f,
174 A64I_CMPx = 0xeb00001f,
175 A64I_CMNw = 0x2b00001f,
176 A64I_CMNx = 0xab00001f,
177 A64I_CCMPw = 0x7a400000,
178 A64I_CCMPx = 0xfa400000,
179 A64I_CSELw = 0x1a800000,
180 A64I_CSELx = 0x9a800000,
181
182 A64I_ASRw = 0x13007c00,
183 A64I_ASRx = 0x9340fc00,
184 A64I_LSLx = 0xd3400000,
185 A64I_LSRx = 0xd340fc00,
186 A64I_SHRw = 0x1ac02000,
187 A64I_SHRx = 0x9ac02000, /* lsl/lsr/asr/ror x0, x0, x0 */
188 A64I_REVw = 0x5ac00800,
189 A64I_REVx = 0xdac00c00,
190
191 A64I_EXTRw = 0x13800000,
192 A64I_EXTRx = 0x93c00000,
193 A64I_SBFMw = 0x13000000,
194 A64I_SBFMx = 0x93400000,
195 A64I_SXTBw = 0x13001c00,
196 A64I_SXTHw = 0x13003c00,
197 A64I_SXTW = 0x93407c00,
198 A64I_UBFMw = 0x53000000,
199 A64I_UBFMx = 0xd3400000,
200 A64I_UXTBw = 0x53001c00,
201 A64I_UXTHw = 0x53003c00,
202
203 A64I_MOVw = 0x2a0003e0,
204 A64I_MOVx = 0xaa0003e0,
205 A64I_MVNw = 0x2a2003e0,
206 A64I_MVNx = 0xaa2003e0,
207 A64I_MOVKw = 0x72800000,
208 A64I_MOVKx = 0xf2800000,
88 A64I_MOVZw = 0x52800000, 209 A64I_MOVZw = 0x52800000,
89 A64I_MOVZx = 0xd2800000, 210 A64I_MOVZx = 0xd2800000,
211 A64I_MOVNw = 0x12800000,
212 A64I_MOVNx = 0x92800000,
213
214 A64I_LDRB = 0x39400000,
215 A64I_LDRH = 0x79400000,
216 A64I_LDRw = 0xb9400000,
217 A64I_LDRx = 0xf9400000,
90 A64I_LDRLw = 0x18000000, 218 A64I_LDRLw = 0x18000000,
91 A64I_LDRLx = 0x58000000, 219 A64I_LDRLx = 0x58000000,
92 A64I_NOP = 0xd503201f, 220 A64I_STRB = 0x39000000,
221 A64I_STRH = 0x79000000,
222 A64I_STRw = 0xb9000000,
223 A64I_STRx = 0xf9000000,
224 A64I_STPw = 0x29000000,
225 A64I_STPx = 0xa9000000,
226 A64I_LDPw = 0x29400000,
227 A64I_LDPx = 0xa9400000,
228
93 A64I_B = 0x14000000, 229 A64I_B = 0x14000000,
230 A64I_BCC = 0x54000000,
231 A64I_BL = 0x94000000,
94 A64I_BR = 0xd61f0000, 232 A64I_BR = 0xd61f0000,
233 A64I_BLR = 0xd63f0000,
234
235 A64I_NOP = 0xd503201f,
236
237 /* FP */
238 A64I_FADDd = 0x1e602800,
239 A64I_FSUBd = 0x1e603800,
240 A64I_FMADDd = 0x1f400000,
241 A64I_FMSUBd = 0x1f408000,
242 A64I_FNMADDd = 0x1f600000,
243 A64I_FNMSUBd = 0x1f608000,
244 A64I_FMULd = 0x1e600800,
245 A64I_FDIVd = 0x1e601800,
246 A64I_FNEGd = 0x1e614000,
247 A64I_FABS = 0x1e60c000,
248 A64I_FSQRTd = 0x1e61c000,
249 A64I_LDRs = 0xbd400000,
250 A64I_LDRd = 0xfd400000,
251 A64I_STRs = 0xbd000000,
252 A64I_STRd = 0xfd000000,
253 A64I_LDPs = 0x2d400000,
254 A64I_LDPd = 0x6d400000,
255 A64I_STPs = 0x2d000000,
256 A64I_STPd = 0x6d000000,
257 A64I_FCMPd = 0x1e602000,
258 A64I_FCMPZd = 0x1e602008,
259 A64I_FCSELd = 0x1e600c00,
260 A64I_FRINTMd = 0x1e654000,
261 A64I_FRINTPd = 0x1e64c000,
262 A64I_FRINTZd = 0x1e65c000,
263
264 A64I_FCVT_F32_F64 = 0x1e624000,
265 A64I_FCVT_F64_F32 = 0x1e22c000,
266 A64I_FCVT_F32_S32 = 0x1e220000,
267 A64I_FCVT_F64_S32 = 0x1e620000,
268 A64I_FCVT_F32_U32 = 0x1e230000,
269 A64I_FCVT_F64_U32 = 0x1e630000,
270 A64I_FCVT_F32_S64 = 0x9e220000,
271 A64I_FCVT_F64_S64 = 0x9e620000,
272 A64I_FCVT_F32_U64 = 0x9e230000,
273 A64I_FCVT_F64_U64 = 0x9e630000,
274 A64I_FCVT_S32_F64 = 0x1e780000,
275 A64I_FCVT_S32_F32 = 0x1e380000,
276 A64I_FCVT_U32_F64 = 0x1e790000,
277 A64I_FCVT_U32_F32 = 0x1e390000,
278 A64I_FCVT_S64_F64 = 0x9e780000,
279 A64I_FCVT_S64_F32 = 0x9e380000,
280 A64I_FCVT_U64_F64 = 0x9e790000,
281 A64I_FCVT_U64_F32 = 0x9e390000,
282
283 A64I_FMOV_S = 0x1e204000,
284 A64I_FMOV_D = 0x1e604000,
285 A64I_FMOV_R_S = 0x1e260000,
286 A64I_FMOV_S_R = 0x1e270000,
287 A64I_FMOV_R_D = 0x9e660000,
288 A64I_FMOV_D_R = 0x9e670000,
289 A64I_FMOV_DI = 0x1e601000,
95} A64Ins; 290} A64Ins;
96 291
292typedef enum A64Shift {
293 A64SH_LSL, A64SH_LSR, A64SH_ASR, A64SH_ROR
294} A64Shift;
295
296typedef enum A64Extend {
297 A64EX_UXTB, A64EX_UXTH, A64EX_UXTW, A64EX_UXTX,
298 A64EX_SXTB, A64EX_SXTH, A64EX_SXTW, A64EX_SXTX,
299} A64Extend;
300
301/* ARM condition codes. */
302typedef enum A64CC {
303 CC_EQ, CC_NE, CC_CS, CC_CC, CC_MI, CC_PL, CC_VS, CC_VC,
304 CC_HI, CC_LS, CC_GE, CC_LT, CC_GT, CC_LE, CC_AL,
305 CC_HS = CC_CS, CC_LO = CC_CC
306} A64CC;
307
97#endif 308#endif
diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc
index 7a881bdd..a6227bf7 100644
--- a/src/vm_arm64.dasc
+++ b/src/vm_arm64.dasc
@@ -236,12 +236,17 @@
236|.macro mov_false, reg; movn reg, #0x8000, lsl #32; .endmacro 236|.macro mov_false, reg; movn reg, #0x8000, lsl #32; .endmacro
237|.macro mov_true, reg; movn reg, #0x0001, lsl #48; .endmacro 237|.macro mov_true, reg; movn reg, #0x0001, lsl #48; .endmacro
238| 238|
239#define GL_J(field) (GG_OFS(J) + (int)offsetof(jit_State, field)) 239#define GL_J(field) (GG_G2J + (int)offsetof(jit_State, field))
240| 240|
241#define PC2PROTO(field) ((int)offsetof(GCproto, field)-(int)sizeof(GCproto)) 241#define PC2PROTO(field) ((int)offsetof(GCproto, field)-(int)sizeof(GCproto))
242| 242|
243|.macro hotcheck, delta 243|.macro hotcheck, delta
244| NYI 244| lsr CARG1, PC, #1
245| and CARG1, CARG1, #126
246| add CARG1, CARG1, #GG_G2DISP+GG_DISP2HOT
247| ldrh CARG2w, [GL, CARG1]
248| subs CARG2, CARG2, #delta
249| strh CARG2w, [GL, CARG1]
245|.endmacro 250|.endmacro
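
In C terms, hotcheck decrements one of 64 16-bit hot counters kept in the dispatch area reached from GL; compare the hotcount_get()/hotcount_set() macros in lj_dispatch.h. A sketch with illustrative names (gg, pc):

  /* (PC>>1)&126 is the byte offset of a 16-bit entry, i.e. 2*((PC>>2)&63): */
  uint16_t *hc = &gg->hotcount[(pc >> 2) & (HOTCOUNT_SIZE-1)];  /* Size 64. */
  *hc -= delta;              /* hotloop branches to vm_hotloop on underflow. */
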
246| 251|
247|.macro hotloop 252|.macro hotloop
@@ -869,7 +874,7 @@ static void build_subroutines(BuildCtx *ctx)
869 | bl extern lj_meta_for // (lua_State *L, TValue *base) 874 | bl extern lj_meta_for // (lua_State *L, TValue *base)
870 | ldr INSw, [PC, #-4] 875 | ldr INSw, [PC, #-4]
871 |.if JIT 876 |.if JIT
872 | uxtb TMP0, INS 877 | uxtb TMP0w, INSw
873 |.endif 878 |.endif
874 | decode_RA RA, INS 879 | decode_RA RA, INS
875 | decode_RD RC, INS 880 | decode_RD RC, INS
@@ -1732,7 +1737,20 @@ static void build_subroutines(BuildCtx *ctx)
1732 |//----------------------------------------------------------------------- 1737 |//-----------------------------------------------------------------------
1733 | 1738 |
1734 |->vm_record: // Dispatch target for recording phase. 1739 |->vm_record: // Dispatch target for recording phase.
1735 | NYI 1740 |.if JIT
1741 | ldrb CARG1w, GL->hookmask
1742 | tst CARG1, #HOOK_VMEVENT // No recording while in vmevent.
1743 | bne >5
1744 | // Decrement the hookcount for consistency, but always do the call.
1745 | ldr CARG2w, GL->hookcount
1746 | tst CARG1, #HOOK_ACTIVE
1747 | bne >1
1748 | sub CARG2w, CARG2w, #1
1749 | tst CARG1, #LUA_MASKLINE|LUA_MASKCOUNT
1750 | beq >1
1751 | str CARG2w, GL->hookcount
1752 | b >1
1753 |.endif
1736 | 1754 |
1737 |->vm_rethook: // Dispatch target for return hooks. 1755 |->vm_rethook: // Dispatch target for return hooks.
1738 | ldrb TMP2w, GL->hookmask 1756 | ldrb TMP2w, GL->hookmask
@@ -1774,7 +1792,21 @@ static void build_subroutines(BuildCtx *ctx)
1774 | b <4 1792 | b <4
1775 | 1793 |
1776 |->vm_hotloop: // Hot loop counter underflow. 1794 |->vm_hotloop: // Hot loop counter underflow.
1777 | NYI 1795 |.if JIT
1796 | ldr LFUNC:CARG3, [BASE, FRAME_FUNC] // Same as curr_topL(L).
1797 | add CARG1, GL, #GG_G2DISP+GG_DISP2J
1798 | and LFUNC:CARG3, CARG3, #LJ_GCVMASK
1799 | str PC, SAVE_PC
1800 | ldr CARG3, LFUNC:CARG3->pc
1801 | mov CARG2, PC
1802 | str L, [GL, #GL_J(L)]
1803 | ldrb CARG3w, [CARG3, #PC2PROTO(framesize)]
1804 | str BASE, L->base
1805 | add CARG3, BASE, CARG3, lsl #3
1806 | str CARG3, L->top
1807 | bl extern lj_trace_hot // (jit_State *J, const BCIns *pc)
1808 | b <3
1809 |.endif
1778 | 1810 |
1779 |->vm_callhook: // Dispatch target for call hooks. 1811 |->vm_callhook: // Dispatch target for call hooks.
1780 | mov CARG2, PC 1812 | mov CARG2, PC
@@ -1804,7 +1836,54 @@ static void build_subroutines(BuildCtx *ctx)
1804 | br CRET1 1836 | br CRET1
1805 | 1837 |
1806 |->cont_stitch: // Trace stitching. 1838 |->cont_stitch: // Trace stitching.
1807 | NYI 1839 |.if JIT
1840 | // RA = resultptr, CARG4 = meta base
1841 | ldr RB, SAVE_MULTRES
1842 | ldr INSw, [PC, #-4]
1843 | ldr TRACE:CARG3, [CARG4, #-40] // Load previous trace.
1844 | subs RB, RB, #8
1845 | decode_RA RC, INS // Call base.
1846 | and CARG3, CARG3, #LJ_GCVMASK
1847 | beq >2
1848 |1: // Move results down.
1849 | ldr CARG1, [RA]
1850 | add RA, RA, #8
1851 | subs RB, RB, #8
1852 | str CARG1, [BASE, RC, lsl #3]
1853 | add RC, RC, #1
1854 | bne <1
1855 |2:
1856 | decode_RA RA, INS
1857 | decode_RB RB, INS
1858 | add RA, RA, RB
1859 |3:
1860 | cmp RA, RC
1861 | bhi >9 // More results wanted?
1862 |
1863 | ldrh RAw, TRACE:CARG3->traceno
1864 | ldrh RCw, TRACE:CARG3->link
1865 | cmp RCw, RAw
1866 | beq ->cont_nop // Blacklisted.
1867 | cmp RCw, #0
1868 | bne =>BC_JLOOP // Jump to stitched trace.
1869 |
1870 | // Stitch a new trace to the previous trace.
1871 | mov CARG1, #GL_J(exitno)
1872 | str RA, [GL, CARG1]
1873 | mov CARG1, #GL_J(L)
1874 | str L, [GL, CARG1]
1875 | str BASE, L->base
1876 | add CARG1, GL, #GG_G2J
1877 | mov CARG2, PC
1878 | bl extern lj_dispatch_stitch // (jit_State *J, const BCIns *pc)
1879 | ldr BASE, L->base
1880 | b ->cont_nop
1881 |
1882 |9: // Fill up results with nil.
1883 | str TISNIL, [BASE, RC, lsl #3]
1884 | add RC, RC, #1
1885 | b <3
1886 |.endif
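
The stitching logic above, roughly, in illustrative C pseudocode (prev is the trace loaded from [CARG4, #-40]; trace numbers and links are 16 bit):

  /* After moving MULTRES results down to the call base (labels 1/2) and
  ** padding missing results with nil (label 9): */
  if (prev->link == prev->traceno) {
    /* Linked to itself: blacklisted, continue in the interpreter. */
  } else if (prev->link != 0) {
    /* Already stitched: jump to the linked trace via BC_JLOOP. */
  } else {
    lj_dispatch_stitch(J, pc);  /* Ask the JIT to record a stitching trace. */
  }
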
1808 | 1887 |
1809 |->vm_profhook: // Dispatch target for profiler hook. 1888 |->vm_profhook: // Dispatch target for profiler hook.
1810#if LJ_HASPROFILE 1889#if LJ_HASPROFILE
@@ -1822,10 +1901,120 @@ static void build_subroutines(BuildCtx *ctx)
1822 |//-- Trace exit handler ------------------------------------------------- 1901 |//-- Trace exit handler -------------------------------------------------
1823 |//----------------------------------------------------------------------- 1902 |//-----------------------------------------------------------------------
1824 | 1903 |
1904 |.macro savex_, a, b
1905 | stp d..a, d..b, [sp, #a*8]
1906 | stp x..a, x..b, [sp, #32*8+a*8]
1907 |.endmacro
1908 |
1825 |->vm_exit_handler: 1909 |->vm_exit_handler:
1826 | NYI 1910 |.if JIT
1911 | sub sp, sp, #(64*8)
1912 | savex_, 0, 1
1913 | savex_, 2, 3
1914 | savex_, 4, 5
1915 | savex_, 6, 7
1916 | savex_, 8, 9
1917 | savex_, 10, 11
1918 | savex_, 12, 13
1919 | savex_, 14, 15
1920 | savex_, 16, 17
1921 | savex_, 18, 19
1922 | savex_, 20, 21
1923 | savex_, 22, 23
1924 | savex_, 24, 25
1925 | savex_, 26, 27
1926 | savex_, 28, 29
1927 | stp d30, d31, [sp, #30*8]
1928 | ldr CARG1, [sp, #64*8] // Load original value of lr.
1929 | add CARG3, sp, #64*8 // Recompute original value of sp.
1930 | mv_vmstate CARG4, EXIT
1931 | ldr CARG2w, [CARG1, #-4]! // Get exit instruction.
1932 | stp CARG1, CARG3, [sp, #62*8] // Store exit pc/sp in RID_LR/RID_SP.
1933 | lsl CARG2, CARG2, #38
1934 | add CARG1, CARG1, CARG2, asr #36
1935 | ldr CARG2w, [lr] // Load exit stub group offset.
1936 | sub CARG1, CARG1, lr
1937 | sub CARG1, CARG1, #4
1938 | ldr L, GL->cur_L
1939 | add CARG1, CARG2, CARG1, lsr #2 // Compute exit number.
1940 | ldr BASE, GL->jit_base
1941 | st_vmstate CARG4
1942 | str CARG1w, [GL, #GL_J(exitno)]
1943 | str BASE, L->base
1944 | str L, [GL, #GL_J(L)]
1945 | str xzr, GL->jit_base
1946 | add CARG1, GL, #GG_G2J
1947 | mov CARG2, sp
1948 | bl extern lj_trace_exit // (jit_State *J, ExitState *ex)
1949 | // Returns MULTRES (unscaled) or negated error code.
1950 | ldr CARG2, L->cframe
1951 | ldr BASE, L->base
1952 | and sp, CARG2, #CFRAME_RAWMASK
1953 | ldr PC, SAVE_PC // Get SAVE_PC.
1954 | str L, SAVE_L // Set SAVE_L (on-trace resume/yield).
1955 | b >1
1956 |.endif
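
Decoding the exit branch: the lsl #38 / asr #36 pair above extracts the signed 26-bit immediate of a B/BL instruction and scales it to bytes (offset = imm26 * 4). Equivalent C, with an invented name:

  #include <stdint.h>

  static int64_t b_imm_bytes(uint32_t ins)
  {
    /* Move imm26 bit 25 up to bit 63, then shift back arithmetically; the
    ** net left shift of 2 applies the *4 scaling of AArch64 branch offsets. */
    return (int64_t)((uint64_t)ins << 38) >> 36;
  }
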
1957 |
1827 |->vm_exit_interp: 1958 |->vm_exit_interp:
1828 | NYI 1959 | // CARG1 = MULTRES or negated error code, BASE, PC and GL set.
1960 |.if JIT
1961 | ldr L, SAVE_L
1962 |1:
1963 | cmp CARG1w, #0
1964 | blt >9 // Check for error from exit.
1965 | lsl RC, CARG1, #3
1966 | ldr LFUNC:CARG2, [BASE, FRAME_FUNC]
1967 | movz TISNUM, #(LJ_TISNUM>>1)&0xffff, lsl #48
1968 | movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16
1969 | movn TISNIL, #0
1970 | and LFUNC:CARG2, CARG2, #LJ_GCVMASK
1971 | str RC, SAVE_MULTRES
1972 | str BASE, L->base
1973 | ldr CARG2, LFUNC:CARG2->pc
1974 | str xzr, GL->jit_base
1975 | mv_vmstate CARG4, INTERP
1976 | ldr KBASE, [CARG2, #PC2PROTO(k)]
1977 | // Modified copy of ins_next which handles function header dispatch, too.
1978 | ldrb RBw, [PC]
1979 | ldr INSw, [PC], #4
1980 | st_vmstate CARG4
1981 | cmp RBw, #BC_FUNCC+2 // Fast function?
1982 | add TMP1, GL, INS, uxtb #3
1983 | bhs >4
1984 |2:
1985 | cmp RBw, #BC_FUNCF // Function header?
1986 | add TMP0, GL, RB, uxtb #3
1987 | ldr RB, [TMP0, #GG_G2DISP]
1988 | decode_RA RA, INS
1989 | lsr TMP0, INS, #16
1990 | csel RC, TMP0, RC, lo
1991 | blo >5
1992 | ldr CARG3, [BASE, FRAME_FUNC]
1993 | sub RC, RC, #8
1994 | add RA, BASE, RA, lsl #3 // Yes: RA = BASE+framesize*8, RC = nargs*8
1995 | and LFUNC:CARG3, CARG3, #LJ_GCVMASK
1996 |5:
1997 | br RB
1998 |
1999 |4: // Check frame below fast function.
2000 | ldr CARG1, [BASE, FRAME_PC]
2001 | ands CARG2, CARG1, #FRAME_TYPE
2002 | bne <2 // Trace stitching continuation?
2003 | // Otherwise set KBASE for Lua function below fast function.
2004 | ldr CARG3, [CARG1, #-4]
2005 | decode_RA CARG1, CARG3
2006 | sub CARG2, BASE, CARG1, lsl #3
2007 | ldr LFUNC:CARG3, [CARG2, #-32]
2008 | and LFUNC:CARG3, CARG3, #LJ_GCVMASK
2009 | ldr CARG3, LFUNC:CARG3->pc
2010 | ldr KBASE, [CARG3, #PC2PROTO(k)]
2011 | b <2
2012 |
2013 |9: // Rethrow error from the right C frame.
2014 | neg CARG2, CARG1
2015 | mov CARG1, L
2016 | bl extern lj_err_throw // (lua_State *L, int errcode)
2017 |.endif
1829 | 2018 |
1830 |//----------------------------------------------------------------------- 2019 |//-----------------------------------------------------------------------
1831 |//-- Math helper functions ---------------------------------------------- 2020 |//-- Math helper functions ----------------------------------------------
@@ -3387,6 +3576,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
3387 if (op == BC_FORI) { 3576 if (op == BC_FORI) {
3388 | csel PC, RC, PC, gt 3577 | csel PC, RC, PC, gt
3389 } else if (op == BC_JFORI) { 3578 } else if (op == BC_JFORI) {
3579 | mov PC, RC
3390 | ldrh RCw, [RC, #-2] 3580 | ldrh RCw, [RC, #-2]
3391 } else if (op == BC_IFORL) { 3581 } else if (op == BC_IFORL) {
3392 | csel PC, RC, PC, le 3582 | csel PC, RC, PC, le
@@ -3488,7 +3678,16 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
3488 3678
3489 case BC_JLOOP: 3679 case BC_JLOOP:
3490 |.if JIT 3680 |.if JIT
3491 | NYI 3681 | // RA = base (ignored), RC = traceno
3682 | ldr CARG1, [GL, #GL_J(trace)]
3683 | mov CARG2, #0 // Traces on ARM64 don't store the trace number, so use 0.
3684 | ldr TRACE:RC, [CARG1, RC, lsl #3]
3685 | st_vmstate CARG2
3686 | ldr RA, TRACE:RC->mcode
3687 | str BASE, GL->jit_base
3688 | str L, GL->tmpbuf.L
3689 | sub sp, sp, #16 // See SPS_FIXED. Avoids sp adjust in every root trace.
3690 | br RA
3492 |.endif 3691 |.endif
3493 break; 3692 break;
3494 3693
@@ -3546,10 +3745,12 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
3546 case BC_IFUNCV: 3745 case BC_IFUNCV:
3547 | // BASE = new base, RA = BASE+framesize*8, CARG3 = LFUNC, RC = nargs*8 3746 | // BASE = new base, RA = BASE+framesize*8, CARG3 = LFUNC, RC = nargs*8
3548 | ldr CARG1, L->maxstack 3747 | ldr CARG1, L->maxstack
3748 | movn TMP0, #~LJ_TFUNC
3549 | add TMP2, BASE, RC 3749 | add TMP2, BASE, RC
3750 | add LFUNC:CARG3, CARG3, TMP0, lsl #47
3550 | add RA, RA, RC 3751 | add RA, RA, RC
3551 | add TMP0, RC, #16+FRAME_VARG 3752 | add TMP0, RC, #16+FRAME_VARG
3552 | str LFUNC:CARG3, [TMP2], #8 // Store (untagged) copy of LFUNC. 3753 | str LFUNC:CARG3, [TMP2], #8 // Store (tagged) copy of LFUNC.
3553 | ldr KBASE, [PC, #-4+PC2PROTO(k)] 3754 | ldr KBASE, [PC, #-4+PC2PROTO(k)]
3554 | cmp RA, CARG1 3755 | cmp RA, CARG1
3555 | str TMP0, [TMP2], #8 // Store delta + FRAME_VARG. 3756 | str TMP0, [TMP2], #8 // Store delta + FRAME_VARG.
@@ -3736,8 +3937,8 @@ static void emit_asm_debug(BuildCtx *ctx)
3736 "\t.uleb128 0x1\n" 3937 "\t.uleb128 0x1\n"
3737 "\t.sleb128 -8\n" 3938 "\t.sleb128 -8\n"
3738 "\t.byte 30\n" /* Return address is in lr. */ 3939 "\t.byte 30\n" /* Return address is in lr. */
3739 "\t.uleb128 1\n" /* augmentation length */ 3940 "\t.uleb128 1\n" /* augmentation length */
3740 "\t.byte 0x1b\n" /* pcrel|sdata4 */ 3941 "\t.byte 0x1b\n" /* pcrel|sdata4 */
3741 "\t.byte 0xc\n\t.uleb128 31\n\t.uleb128 0\n" /* def_cfa sp */ 3942 "\t.byte 0xc\n\t.uleb128 31\n\t.uleb128 0\n" /* def_cfa sp */
3742 "\t.align 3\n" 3943 "\t.align 3\n"
3743 ".LECIE2:\n\n"); 3944 ".LECIE2:\n\n");
@@ -3748,7 +3949,7 @@ static void emit_asm_debug(BuildCtx *ctx)
3748 "\t.long .LASFDE3-.Lframe2\n" 3949 "\t.long .LASFDE3-.Lframe2\n"
3749 "\t.long lj_vm_ffi_call-.\n" 3950 "\t.long lj_vm_ffi_call-.\n"
3750 "\t.long %d\n" 3951 "\t.long %d\n"
3751 "\t.uleb128 0\n" /* augmentation length */ 3952 "\t.uleb128 0\n" /* augmentation length */
3752 "\t.byte 0xe\n\t.uleb128 32\n" /* def_cfa_offset */ 3953 "\t.byte 0xe\n\t.uleb128 32\n" /* def_cfa_offset */
3753 "\t.byte 0x9d\n\t.uleb128 4\n" /* offset fp */ 3954 "\t.byte 0x9d\n\t.uleb128 4\n" /* offset fp */
3754 "\t.byte 0x9e\n\t.uleb128 3\n" /* offset lr */ 3955 "\t.byte 0x9e\n\t.uleb128 3\n" /* offset lr */