 src/lj_crecord.c | 287 +++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 261 insertions(+), 26 deletions(-)
diff --git a/src/lj_crecord.c b/src/lj_crecord.c
index fbb5d79a..47ae65b4 100644
--- a/src/lj_crecord.c
+++ b/src/lj_crecord.c
@@ -91,25 +91,7 @@ static CTypeID argv2ctype(jit_State *J, TRef tr, cTValue *o)
   }
 }
 
-/* -- Convert C type to C type -------------------------------------------- */
-
-/*
-** This code mirrors the code in lj_cconv.c. It performs the same steps
-** for the trace recorder that lj_cconv.c does for the interpreter.
-**
-** One major difference is that we can get away with much fewer checks
-** here. E.g. checks for casts, constness or correct types can often be
-** omitted, even if they might fail. The interpreter subsequently throws
-** an error, which aborts the trace.
-**
-** All operations are specialized to their C types, so the on-trace
-** outcome must be the same as the outcome in the interpreter. If the
-** interpreter doesn't throw an error, then the trace is correct, too.
-** Care must be taken not to generate invalid (temporary) IR or to
-** trigger asserts.
-*/
-
-/* Convert CType to IRType. */
+/* Convert CType to IRType (if possible). */
 static IRType crec_ct2irt(CTState *cts, CType *ct)
 {
   if (ctype_isenum(ct->info)) ct = ctype_child(cts, ct);
@@ -135,6 +117,253 @@ static IRType crec_ct2irt(CTState *cts, CType *ct)
   return IRT_CDATA;
 }
 
+/* -- Optimized memory fill and copy -------------------------------------- */
+
+/* Maximum length and unroll of inlined copy/fill. */
+#define CREC_COPY_MAXUNROLL 16
+#define CREC_COPY_MAXLEN 128
+
+#define CREC_FILL_MAXUNROLL 16
+#if LJ_TARGET_UNALIGNED
+#define CREC_FILL_MAXLEN (CTSIZE_PTR * CREC_FILL_MAXUNROLL)
+#else
+#define CREC_FILL_MAXLEN CREC_FILL_MAXUNROLL
+#endif
+
+/* Number of windowed registers used for optimized memory copy. */
+#if LJ_TARGET_X86
+#define CREC_COPY_REGWIN 2
+#elif LJ_TARGET_PPC || LJ_TARGET_MIPS
+#define CREC_COPY_REGWIN 8
+#else
+#define CREC_COPY_REGWIN 4
+#endif
+
+/* List of memory offsets for copy/fill. */
+typedef struct CRecMemList {
+  CTSize ofs;  /* Offset in bytes. */
+  IRType tp;  /* Type of load/store. */
+  TRef trofs;  /* TRef of interned offset. */
+  TRef trval;  /* TRef of load value. */
+} CRecMemList;
+
+/* Generate copy list for element-wise struct copy. */
+static MSize crec_copy_struct(CRecMemList *ml, CTState *cts, CType *ct)
+{
+  CTypeID fid = ct->sib;
+  MSize mlp = 0;
+  while (fid) {
+    CType *df = ctype_get(cts, fid);
+    fid = df->sib;
+    if (ctype_isfield(df->info)) {
+      CType *cct;
+      IRType tp;
+      if (!gcref(df->name)) continue;  /* Ignore unnamed fields. */
+      cct = ctype_rawchild(cts, df);  /* Field type. */
+      tp = crec_ct2irt(cts, cct);
+      if (tp == IRT_CDATA) return 0;  /* NYI: aggregates. */
+      if (mlp >= CREC_COPY_MAXUNROLL) return 0;
+      ml[mlp].ofs = df->size;
+      ml[mlp].tp = tp;
+      mlp++;
+      if (ctype_iscomplex(cct->info)) {
+        if (mlp >= CREC_COPY_MAXUNROLL) return 0;
+        ml[mlp].ofs = df->size + (cct->size >> 1);
+        ml[mlp].tp = tp;
+        mlp++;
+      }
+    } else if (!ctype_isconstval(df->info)) {
+      /* NYI: bitfields and sub-structures. */
+      return 0;
+    }
+  }
+  return mlp;
+}
+
+/* Generate unrolled copy list, from highest to lowest step size/alignment. */
+static MSize crec_copy_unroll(CRecMemList *ml, CTSize len, CTSize step,
+                              IRType tp)
+{
+  CTSize ofs = 0;
+  MSize mlp = 0;
+  if (tp == IRT_CDATA) tp = IRT_U8 + 2*lj_fls(step);
+  do {
+    while (ofs + step <= len) {
+      if (mlp >= CREC_COPY_MAXUNROLL) return 0;
+      ml[mlp].ofs = ofs;
+      ml[mlp].tp = tp;
+      mlp++;
+      ofs += step;
+    }
+    step >>= 1;
+    tp -= 2;
+  } while (ofs < len);
+  return mlp;
+}
+
+/*
+** Emit copy list with windowed loads/stores.
+** LJ_TARGET_UNALIGNED: may emit unaligned loads/stores (not marked as such).
+*/
+static void crec_copy_emit(jit_State *J, CRecMemList *ml, MSize mlp,
+                           TRef trdst, TRef trsrc)
+{
+  MSize i, j, rwin = 0;
+  for (i = 0, j = 0; i < mlp; ) {
+    TRef trofs = lj_ir_kintp(J, ml[i].ofs);
+    TRef trsptr = emitir(IRT(IR_ADD, IRT_PTR), trsrc, trofs);
+    ml[i].trval = emitir(IRT(IR_XLOAD, ml[i].tp), trsptr, 0);
+    ml[i].trofs = trofs;
+    i++;
+    rwin += (LJ_SOFTFP && ml[i].tp == IRT_NUM) ? 2 : 1;
+    if (rwin >= CREC_COPY_REGWIN || i >= mlp) {  /* Flush buffered stores. */
+      rwin = 0;
+      for ( ; j < i; j++) {
+        TRef trdptr = emitir(IRT(IR_ADD, IRT_PTR), trdst, ml[j].trofs);
+        emitir(IRT(IR_XSTORE, ml[j].tp), trdptr, ml[j].trval);
+      }
+    }
+  }
+}
+
+/* Optimized memory copy. */
+static void crec_copy(jit_State *J, TRef trdst, TRef trsrc, TRef trlen,
+                      CType *ct)
+{
+  if (tref_isk(trlen)) {  /* Length must be constant. */
+    CRecMemList ml[CREC_COPY_MAXUNROLL];
+    MSize mlp = 0;
+    CTSize step = 1, len = (CTSize)IR(tref_ref(trlen))->i;
+    IRType tp = IRT_CDATA;
+    int needxbar = 0;
+    if (len == 0) return;  /* Shortcut. */
+    if (len > CREC_COPY_MAXLEN) goto fallback;
+    if (ct) {
+      CTState *cts = ctype_ctsG(J2G(J));
+      lua_assert(ctype_isarray(ct->info) || ctype_isstruct(ct->info));
+      if (ctype_isarray(ct->info)) {
+        CType *cct = ctype_rawchild(cts, ct);
+        tp = crec_ct2irt(cts, cct);
+        if (tp == IRT_CDATA) goto rawcopy;
+        step = lj_ir_type_size[tp];
+        lua_assert((len & (step-1)) == 0);
+      } else if ((ct->info & CTF_UNION)) {
+        step = (1u << ctype_align(ct->info));
+        goto rawcopy;
+      } else {
+        mlp = crec_copy_struct(ml, cts, ct);
+        goto emitcopy;
+      }
+    } else {
+    rawcopy:
+      needxbar = 1;
+      if (LJ_TARGET_UNALIGNED || step >= CTSIZE_PTR)
+        step = CTSIZE_PTR;
+    }
+    mlp = crec_copy_unroll(ml, len, step, tp);
+  emitcopy:
+    if (mlp) {
+      crec_copy_emit(J, ml, mlp, trdst, trsrc);
+      if (needxbar)
+        emitir(IRT(IR_XBAR, IRT_NIL), 0, 0);
+      return;
+    }
+  }
+fallback:
+  /* Call memcpy. Always needs a barrier to disable alias analysis. */
+  lj_ir_call(J, IRCALL_memcpy, trdst, trsrc, trlen);
+  emitir(IRT(IR_XBAR, IRT_NIL), 0, 0);
+}
+
+/* Generate unrolled fill list, from highest to lowest step size/alignment. */
+static MSize crec_fill_unroll(CRecMemList *ml, CTSize len, CTSize step)
+{
+  CTSize ofs = 0;
+  MSize mlp = 0;
+  IRType tp = IRT_U8 + 2*lj_fls(step);
+  do {
+    while (ofs + step <= len) {
+      if (mlp >= CREC_COPY_MAXUNROLL) return 0;
+      ml[mlp].ofs = ofs;
+      ml[mlp].tp = tp;
+      mlp++;
+      ofs += step;
+    }
+    step >>= 1;
+    tp -= 2;
+  } while (ofs < len);
+  return mlp;
+}
+
+/*
+** Emit stores for fill list.
+** LJ_TARGET_UNALIGNED: may emit unaligned stores (not marked as such).
+*/
+static void crec_fill_emit(jit_State *J, CRecMemList *ml, MSize mlp,
+                           TRef trdst, TRef trfill)
+{
+  MSize i;
+  for (i = 0; i < mlp; i++) {
+    TRef trofs = lj_ir_kintp(J, ml[i].ofs);
+    TRef trdptr = emitir(IRT(IR_ADD, IRT_PTR), trdst, trofs);
+    emitir(IRT(IR_XSTORE, ml[i].tp), trdptr, trfill);
+  }
+}
+
+/* Optimized memory fill. */
+static void crec_fill(jit_State *J, TRef trdst, TRef trlen, TRef trfill,
+                      CTSize step)
+{
+  if (tref_isk(trlen)) {  /* Length must be constant. */
+    CRecMemList ml[CREC_FILL_MAXUNROLL];
+    MSize mlp;
+    CTSize len = (CTSize)IR(tref_ref(trlen))->i;
+    if (len == 0) return;  /* Shortcut. */
+    if (len > CREC_FILL_MAXLEN) goto fallback;
+    if (LJ_TARGET_UNALIGNED || step >= CTSIZE_PTR)
+      step = CTSIZE_PTR;
+    mlp = crec_fill_unroll(ml, len, step);
+    if (!mlp) goto fallback;
+    if (tref_isk(trfill) || ml[0].tp != IRT_U8)
+      trfill = emitconv(trfill, IRT_INT, IRT_U8, 0);
+    if (ml[0].tp != IRT_U8) {  /* Scatter U8 to U16/U32/U64. */
+      if (CTSIZE_PTR == 8 && ml[0].tp == IRT_U64) {
+        if (tref_isk(trfill))  /* Pointless on x64 with zero-extended regs. */
+          trfill = emitconv(trfill, IRT_U64, IRT_U32, 0);
+        trfill = emitir(IRT(IR_MUL, IRT_U64), trfill,
+                        lj_ir_kint64(J, U64x(01010101,01010101)));
+      } else {
+        trfill = emitir(IRTI(IR_MUL), trfill,
+                        lj_ir_kint(J, ml[0].tp == IRT_U16 ? 0x0101 : 0x01010101));
+      }
+    }
+    crec_fill_emit(J, ml, mlp, trdst, trfill);
+  } else {
+fallback:
+    /* Call memset. Always needs a barrier to disable alias analysis. */
+    lj_ir_call(J, IRCALL_memset, trdst, trfill, trlen);  /* Note: arg order! */
+  }
+  emitir(IRT(IR_XBAR, IRT_NIL), 0, 0);
+}
+
+/* -- Convert C type to C type -------------------------------------------- */
+
+/*
+** This code mirrors the code in lj_cconv.c. It performs the same steps
+** for the trace recorder that lj_cconv.c does for the interpreter.
+**
+** One major difference is that we can get away with much fewer checks
+** here. E.g. checks for casts, constness or correct types can often be
+** omitted, even if they might fail. The interpreter subsequently throws
+** an error, which aborts the trace.
+**
+** All operations are specialized to their C types, so the on-trace
+** outcome must be the same as the outcome in the interpreter. If the
+** interpreter doesn't throw an error, then the trace is correct, too.
+** Care must be taken not to generate invalid (temporary) IR or to
+** trigger asserts.
+*/
+
 /* Determine whether a passed number or cdata number is non-zero. */
 static int crec_isnonzero(CType *s, void *p)
 {
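
Some notes on the new copy/fill machinery follow; the sketches are editor's illustrations under stated assumptions, not part of the patch.

crec_copy_struct turns a struct copy into per-field loads/stores: each named field contributes one list entry holding its byte offset (kept in df->size for field CTypes) and its IRType, and a complex field contributes two entries, the second shifted by half the field size for the imaginary part. For a hypothetical struct:

    /* Hypothetical input type (no padding on common ABIs): */
    struct ex { int32_t a; float b; double c; };

    /* Expected copy list from crec_copy_struct(ml, cts, ct):
    **   ml[0] = { .ofs = 0, .tp = IRT_INT   }    field a
    **   ml[1] = { .ofs = 4, .tp = IRT_FLOAT }    field b
    **   ml[2] = { .ofs = 8, .tp = IRT_NUM   }    field c
    ** A 'complex double' field at offset 16 would add two IRT_NUM
    ** entries, at 16 and 16 + (cct->size >> 1) == 24.
    ** Bitfields, sub-structures and aggregate fields make the function
    ** return 0, which routes crec_copy to the memcpy fallback.
    */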
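
crec_copy_unroll leans on the IRType enum layout: the unsigned integer types sit two slots apart (IRT_U8, IRT_U16, IRT_U32, IRT_U64, interleaved with their signed counterparts), so IRT_U8 + 2*lj_fls(step) maps step 1/2/4/8 to IRT_U8/U16/U32/U64, and tp -= 2 narrows by one width per round. A worked example, assuming an unaligned-capable 64-bit target:

    /* crec_copy_unroll(ml, 28, 8, IRT_CDATA) yields:
    **   ml[0] = { .ofs =  0, .tp = IRT_U64 }    step 8
    **   ml[1] = { .ofs =  8, .tp = IRT_U64 }    step 8
    **   ml[2] = { .ofs = 16, .tp = IRT_U64 }    step 8
    **   ml[3] = { .ofs = 24, .tp = IRT_U32 }    step halved to 4
    ** Four accesses cover all 28 bytes. By contrast, len = 127 with
    ** step 8 would need 15+1+1+1 = 18 entries, which exceeds
    ** CREC_COPY_MAXUNROLL (16); the function returns 0 and crec_copy
    ** takes the memcpy fallback.
    */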
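
crec_copy_emit limits register pressure by working in windows: it buffers up to CREC_COPY_REGWIN loaded values, then flushes the matching stores before loading more (an IRT_NUM value counts double on LJ_SOFTFP targets, since it occupies a register pair). A sketch of the resulting IR order for a 16-byte raw copy on x86, where the window is 2 and the step is CTSIZE_PTR == 4:

    /* Window 1:                        Window 2:
    **   p0 = src+0 ; v0 = XLOAD p0       p2 = src+8  ; v2 = XLOAD p2
    **   p1 = src+4 ; v1 = XLOAD p1       p3 = src+12 ; v3 = XLOAD p3
    **   q0 = dst+0 ; XSTORE q0, v0       q2 = dst+8  ; XSTORE q2, v2
    **   q1 = dst+4 ; XSTORE q1, v1       q3 = dst+12 ; XSTORE q3, v3
    */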
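
Typed copies stay visible to alias analysis, but raw copies (needxbar) and the memcpy/memset fallbacks store through types unrelated to what the memory actually holds, so an IR_XBAR barrier follows them to keep load forwarding from moving a later typed load across the clobber. A minimal sketch of the hazard, assuming a pointer p whose target gets copied over:

    /* int a = *p;           XLOAD int                              */
    /* ffi.copy(p, q, 64)    raw U64/U32 stores into *p             */
    /* int b = *p;           must reload; XBAR prevents forwarding  */
    /*                       the stale 'a' into 'b'.                */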
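
crec_fill widens the fill byte by multiplying with a lane-replicating constant: after the conversion to IRT_U8 the value fits in one byte lane, so v * 0x01010101 copies it into all four lanes of a U32 (0x0101 for U16, U64x(01010101,01010101) for U64) with no carries between lanes. A self-contained sanity check of the arithmetic:

    #include <stdint.h>
    #include <assert.h>

    int main(void)
    {
      uint8_t v = 0xAB;
      /* The byte-scatter trick used by crec_fill. */
      uint32_t w32 = (uint32_t)v * 0x01010101u;
      uint64_t w64 = (uint64_t)v * 0x0101010101010101ull;
      assert(w32 == 0xABABABABu);
      assert(w64 == 0xABABABABABABABABull);
      return 0;
    }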
@@ -1298,26 +1527,32 @@ void LJ_FASTCALL recff_ffi_copy(jit_State *J, RecordFFData *rd)
     trlen = emitir(IRTI(IR_FLOAD), J->base[1], IRFL_STR_LEN);
     trlen = emitir(IRTI(IR_ADD), trlen, lj_ir_kint(J, 1));
   }
-    lj_ir_call(J, IRCALL_memcpy, trdst, trsrc, trlen);
-    emitir(IRT(IR_XBAR, IRT_NIL), 0, 0);
     rd->nres = 0;
+    crec_copy(J, trdst, trsrc, trlen, NULL);
   }  /* else: interpreter will throw. */
 }
 
 void LJ_FASTCALL recff_ffi_fill(jit_State *J, RecordFFData *rd)
 {
   CTState *cts = ctype_ctsG(J2G(J));
-  TRef tr = J->base[0], trlen = J->base[1], trfill = J->base[2];
-  if (tr && trlen) {
-    tr = crec_ct_tv(J, ctype_get(cts, CTID_P_VOID), 0, tr, &rd->argv[0]);
+  TRef trdst = J->base[0], trlen = J->base[1], trfill = J->base[2];
+  if (trdst && trlen) {
+    CTSize step = 1;
+    if (tviscdata(&rd->argv[0])) {  /* Get alignment of original destination. */
+      CTSize sz;
+      CType *ct = ctype_raw(cts, cdataV(&rd->argv[0])->ctypeid);
+      if (ctype_isptr(ct->info))
+        ct = ctype_rawchild(cts, ct);
+      step = (1u<<ctype_align(lj_ctype_info(cts, ctype_typeid(cts, ct), &sz)));
+    }
+    trdst = crec_ct_tv(J, ctype_get(cts, CTID_P_VOID), 0, trdst, &rd->argv[0]);
     trlen = crec_toint(J, cts, trlen, &rd->argv[1]);
     if (trfill)
       trfill = crec_toint(J, cts, trfill, &rd->argv[2]);
     else
       trfill = lj_ir_kint(J, 0);
-    lj_ir_call(J, IRCALL_memset, tr, trfill, trlen);
-    emitir(IRT(IR_XBAR, IRT_NIL), 0, 0);
     rd->nres = 0;
+    crec_fill(J, trdst, trlen, trfill, step);
   }  /* else: interpreter will throw. */
 }
 
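
recff_ffi_fill now derives the store width from the declared alignment of the original destination cdata, captured before the argument is erased to void *, so crec_fill can use wide stores even on strict-alignment targets. A worked example (editor's illustration), assuming a cdata array of type int32_t[4]:

    /* ffi.fill(arr, 16) with arr of ctype int32_t[4]:
    **   ctype_align(...) == 2, so step = 1u << 2 = 4
    **   crec_fill(J, trdst, trlen = 16, trfill = 0, step = 4)
    ** On LJ_TARGET_UNALIGNED step is promoted to CTSIZE_PTR, giving two
    ** U64 stores on x64; a strict-alignment 32-bit target keeps step 4
    ** and emits four U32 stores of the scattered fill value.
    */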