diff options
-rw-r--r-- | src/lj_crecord.c | 287 |
1 file changed, 261 insertions(+), 26 deletions(-)
diff --git a/src/lj_crecord.c b/src/lj_crecord.c index fbb5d79a..47ae65b4 100644 --- a/src/lj_crecord.c +++ b/src/lj_crecord.c | |||
@@ -91,25 +91,7 @@ static CTypeID argv2ctype(jit_State *J, TRef tr, cTValue *o) | |||
91 | } | 91 | } |
92 | } | 92 | } |
93 | 93 | ||
94 | /* -- Convert C type to C type -------------------------------------------- */ | 94 | /* Convert CType to IRType (if possible). */ |
95 | |||
96 | /* | ||
97 | ** This code mirrors the code in lj_cconv.c. It performs the same steps | ||
98 | ** for the trace recorder that lj_cconv.c does for the interpreter. | ||
99 | ** | ||
100 | ** One major difference is that we can get away with much fewer checks | ||
101 | ** here. E.g. checks for casts, constness or correct types can often be | ||
102 | ** omitted, even if they might fail. The interpreter subsequently throws | ||
103 | ** an error, which aborts the trace. | ||
104 | ** | ||
105 | ** All operations are specialized to their C types, so the on-trace | ||
106 | ** outcome must be the same as the outcome in the interpreter. If the | ||
107 | ** interpreter doesn't throw an error, then the trace is correct, too. | ||
108 | ** Care must be taken not to generate invalid (temporary) IR or to | ||
109 | ** trigger asserts. | ||
110 | */ | ||
111 | |||
112 | /* Convert CType to IRType. */ | ||
113 | static IRType crec_ct2irt(CTState *cts, CType *ct) | 95 | static IRType crec_ct2irt(CTState *cts, CType *ct) |
114 | { | 96 | { |
115 | if (ctype_isenum(ct->info)) ct = ctype_child(cts, ct); | 97 | if (ctype_isenum(ct->info)) ct = ctype_child(cts, ct); |
@@ -135,6 +117,253 @@ static IRType crec_ct2irt(CTState *cts, CType *ct) | |||
135 | return IRT_CDATA; | 117 | return IRT_CDATA; |
136 | } | 118 | } |
137 | 119 | ||
/* -- Optimized memory fill and copy -------------------------------------- */

/* Maximum length and unroll of inlined copy/fill. */
#define CREC_COPY_MAXUNROLL 16
#define CREC_COPY_MAXLEN 128

#define CREC_FILL_MAXUNROLL 16
/* Targets with cheap unaligned access can fill CTSIZE_PTR bytes per store,
** so the inlinable length scales with the pointer size; other targets may
** degrade to byte stores in the worst case, so the length limit equals the
** unroll limit there.
*/
#if LJ_TARGET_UNALIGNED
#define CREC_FILL_MAXLEN (CTSIZE_PTR * CREC_FILL_MAXUNROLL)
#else
#define CREC_FILL_MAXLEN CREC_FILL_MAXUNROLL
#endif

/* Number of windowed registers used for optimized memory copy.
** Loads are buffered and stores flushed in groups of this size, bounding
** the number of live values per target register pressure.
*/
#if LJ_TARGET_X86
#define CREC_COPY_REGWIN 2
#elif LJ_TARGET_PPC || LJ_TARGET_MIPS
#define CREC_COPY_REGWIN 8
#else
#define CREC_COPY_REGWIN 4
#endif

/* List of memory offsets for copy/fill. */
typedef struct CRecMemList {
  CTSize ofs;    /* Offset in bytes. */
  IRType tp;     /* Type of load/store. */
  TRef trofs;    /* TRef of interned offset. */
  TRef trval;    /* TRef of load value. */
} CRecMemList;
150 | /* Generate copy list for element-wise struct copy. */ | ||
151 | static MSize crec_copy_struct(CRecMemList *ml, CTState *cts, CType *ct) | ||
152 | { | ||
153 | CTypeID fid = ct->sib; | ||
154 | MSize mlp = 0; | ||
155 | while (fid) { | ||
156 | CType *df = ctype_get(cts, fid); | ||
157 | fid = df->sib; | ||
158 | if (ctype_isfield(df->info)) { | ||
159 | CType *cct; | ||
160 | IRType tp; | ||
161 | if (!gcref(df->name)) continue; /* Ignore unnamed fields. */ | ||
162 | cct = ctype_rawchild(cts, df); /* Field type. */ | ||
163 | tp = crec_ct2irt(cts, cct); | ||
164 | if (tp == IRT_CDATA) return 0; /* NYI: aggregates. */ | ||
165 | if (mlp >= CREC_COPY_MAXUNROLL) return 0; | ||
166 | ml[mlp].ofs = df->size; | ||
167 | ml[mlp].tp = tp; | ||
168 | mlp++; | ||
169 | if (ctype_iscomplex(cct->info)) { | ||
170 | if (mlp >= CREC_COPY_MAXUNROLL) return 0; | ||
171 | ml[mlp].ofs = df->size + (cct->size >> 1); | ||
172 | ml[mlp].tp = tp; | ||
173 | mlp++; | ||
174 | } | ||
175 | } else if (!ctype_isconstval(df->info)) { | ||
176 | /* NYI: bitfields and sub-structures. */ | ||
177 | return 0; | ||
178 | } | ||
179 | } | ||
180 | return mlp; | ||
181 | } | ||
182 | |||
183 | /* Generate unrolled copy list, from highest to lowest step size/alignment. */ | ||
184 | static MSize crec_copy_unroll(CRecMemList *ml, CTSize len, CTSize step, | ||
185 | IRType tp) | ||
186 | { | ||
187 | CTSize ofs = 0; | ||
188 | MSize mlp = 0; | ||
189 | if (tp == IRT_CDATA) tp = IRT_U8 + 2*lj_fls(step); | ||
190 | do { | ||
191 | while (ofs + step <= len) { | ||
192 | if (mlp >= CREC_COPY_MAXUNROLL) return 0; | ||
193 | ml[mlp].ofs = ofs; | ||
194 | ml[mlp].tp = tp; | ||
195 | mlp++; | ||
196 | ofs += step; | ||
197 | } | ||
198 | step >>= 1; | ||
199 | tp -= 2; | ||
200 | } while (ofs < len); | ||
201 | return mlp; | ||
202 | } | ||
203 | |||
204 | /* | ||
205 | ** Emit copy list with windowed loads/stores. | ||
206 | ** LJ_TARGET_UNALIGNED: may emit unaligned loads/stores (not marked as such). | ||
207 | */ | ||
208 | static void crec_copy_emit(jit_State *J, CRecMemList *ml, MSize mlp, | ||
209 | TRef trdst, TRef trsrc) | ||
210 | { | ||
211 | MSize i, j, rwin = 0; | ||
212 | for (i = 0, j = 0; i < mlp; ) { | ||
213 | TRef trofs = lj_ir_kintp(J, ml[i].ofs); | ||
214 | TRef trsptr = emitir(IRT(IR_ADD, IRT_PTR), trsrc, trofs); | ||
215 | ml[i].trval = emitir(IRT(IR_XLOAD, ml[i].tp), trsptr, 0); | ||
216 | ml[i].trofs = trofs; | ||
217 | i++; | ||
218 | rwin += (LJ_SOFTFP && ml[i].tp == IRT_NUM) ? 2 : 1; | ||
219 | if (rwin >= CREC_COPY_REGWIN || i >= mlp) { /* Flush buffered stores. */ | ||
220 | rwin = 0; | ||
221 | for ( ; j < i; j++) { | ||
222 | TRef trdptr = emitir(IRT(IR_ADD, IRT_PTR), trdst, ml[j].trofs); | ||
223 | emitir(IRT(IR_XSTORE, ml[j].tp), trdptr, ml[j].trval); | ||
224 | } | ||
225 | } | ||
226 | } | ||
227 | } | ||
228 | |||
/* Optimized memory copy.
** Turns a copy of constant length into a short unrolled load/store sequence
** where possible; otherwise falls back to a memcpy call. ct is the raw type
** of the copied object (array or struct), or NULL for an untyped copy
** (then a raw pointer-word-wise copy plus a memory barrier is used).
*/
static void crec_copy(jit_State *J, TRef trdst, TRef trsrc, TRef trlen,
		      CType *ct)
{
  if (tref_isk(trlen)) {  /* Length must be constant. */
    CRecMemList ml[CREC_COPY_MAXUNROLL];
    MSize mlp = 0;
    CTSize step = 1, len = (CTSize)IR(tref_ref(trlen))->i;
    IRType tp = IRT_CDATA;
    int needxbar = 0;
    if (len == 0) return;  /* Shortcut. */
    if (len > CREC_COPY_MAXLEN) goto fallback;
    if (ct) {
      CTState *cts = ctype_ctsG(J2G(J));
      lua_assert(ctype_isarray(ct->info) || ctype_isstruct(ct->info));
      if (ctype_isarray(ct->info)) {
	/* Arrays: copy element-wise with the element's IRType. */
	CType *cct = ctype_rawchild(cts, ct);
	tp = crec_ct2irt(cts, cct);
	if (tp == IRT_CDATA) goto rawcopy;  /* Unhandled element type. */
	step = lj_ir_type_size[tp];
	lua_assert((len & (step-1)) == 0);  /* Multiple of element size. */
      } else if ((ct->info & CTF_UNION)) {
	/* Unions: raw copy at the union's alignment. */
	step = (1u << ctype_align(ct->info));
	goto rawcopy;
      } else {
	/* Structs: copy named fields individually (may yield mlp == 0). */
	mlp = crec_copy_struct(ml, cts, ct);
	goto emitcopy;
      }
    } else {
    rawcopy:
      /* Untyped copy: loads/stores alias anything, so a barrier is needed. */
      needxbar = 1;
      if (LJ_TARGET_UNALIGNED || step >= CTSIZE_PTR)
	step = CTSIZE_PTR;  /* Widen to pointer-sized accesses. */
    }
    mlp = crec_copy_unroll(ml, len, step, tp);
  emitcopy:
    if (mlp) {
      crec_copy_emit(J, ml, mlp, trdst, trsrc);
      if (needxbar)
	emitir(IRT(IR_XBAR, IRT_NIL), 0, 0);
      return;
    }
    /* mlp == 0: unrolling failed, fall through to the generic call. */
  }
fallback:
  /* Call memcpy. Always needs a barrier to disable alias analysis. */
  lj_ir_call(J, IRCALL_memcpy, trdst, trsrc, trlen);
  emitir(IRT(IR_XBAR, IRT_NIL), 0, 0);
}
277 | |||
278 | /* Generate unrolled fill list, from highest to lowest step size/alignment. */ | ||
279 | static MSize crec_fill_unroll(CRecMemList *ml, CTSize len, CTSize step) | ||
280 | { | ||
281 | CTSize ofs = 0; | ||
282 | MSize mlp = 0; | ||
283 | IRType tp = IRT_U8 + 2*lj_fls(step); | ||
284 | do { | ||
285 | while (ofs + step <= len) { | ||
286 | if (mlp >= CREC_COPY_MAXUNROLL) return 0; | ||
287 | ml[mlp].ofs = ofs; | ||
288 | ml[mlp].tp = tp; | ||
289 | mlp++; | ||
290 | ofs += step; | ||
291 | } | ||
292 | step >>= 1; | ||
293 | tp -= 2; | ||
294 | } while (ofs < len); | ||
295 | return mlp; | ||
296 | } | ||
297 | |||
298 | /* | ||
299 | ** Emit stores for fill list. | ||
300 | ** LJ_TARGET_UNALIGNED: may emit unaligned stores (not marked as such). | ||
301 | */ | ||
302 | static void crec_fill_emit(jit_State *J, CRecMemList *ml, MSize mlp, | ||
303 | TRef trdst, TRef trfill) | ||
304 | { | ||
305 | MSize i; | ||
306 | for (i = 0; i < mlp; i++) { | ||
307 | TRef trofs = lj_ir_kintp(J, ml[i].ofs); | ||
308 | TRef trdptr = emitir(IRT(IR_ADD, IRT_PTR), trdst, trofs); | ||
309 | emitir(IRT(IR_XSTORE, ml[i].tp), trdptr, trfill); | ||
310 | } | ||
311 | } | ||
312 | |||
/* Optimized memory fill.
** Turns a fill of constant length into a short unrolled store sequence
** where possible; otherwise falls back to a memset call. step is the
** alignment of the destination object (1 when unknown). Both paths end
** with a memory barrier to disable alias analysis across the fill.
*/
static void crec_fill(jit_State *J, TRef trdst, TRef trlen, TRef trfill,
		      CTSize step)
{
  if (tref_isk(trlen)) {  /* Length must be constant. */
    CRecMemList ml[CREC_FILL_MAXUNROLL];
    MSize mlp;
    CTSize len = (CTSize)IR(tref_ref(trlen))->i;
    if (len == 0) return;  /* Shortcut. */
    if (len > CREC_FILL_MAXLEN) goto fallback;
    if (LJ_TARGET_UNALIGNED || step >= CTSIZE_PTR)
      step = CTSIZE_PTR;  /* Widen to pointer-sized stores. */
    mlp = crec_fill_unroll(ml, len, step);
    if (!mlp) goto fallback;
    /* NOTE(review): for a non-constant fill value stored as IRT_U8 this
    ** skips the explicit truncation and relies on the U8 store using only
    ** the low byte — confirm against the target XSTORE semantics.
    */
    if (tref_isk(trfill) || ml[0].tp != IRT_U8)
      trfill = emitconv(trfill, IRT_INT, IRT_U8, 0);
    if (ml[0].tp != IRT_U8) {  /* Scatter U8 to U16/U32/U64. */
      if (CTSIZE_PTR == 8 && ml[0].tp == IRT_U64) {
	if (tref_isk(trfill))  /* Pointless on x64 with zero-extended regs. */
	  trfill = emitconv(trfill, IRT_U64, IRT_U32, 0);
	/* Replicate the byte into all 8 lanes: x * 0x0101010101010101. */
	trfill = emitir(IRT(IR_MUL, IRT_U64), trfill,
			lj_ir_kint64(J, U64x(01010101,01010101)));
      } else {
	/* Replicate the byte into 2 or 4 lanes. */
	trfill = emitir(IRTI(IR_MUL), trfill,
		 lj_ir_kint(J, ml[0].tp == IRT_U16 ? 0x0101 : 0x01010101));
      }
    }
    crec_fill_emit(J, ml, mlp, trdst, trfill);
  } else {
fallback:
    /* Call memset. Always needs a barrier to disable alias analysis. */
    lj_ir_call(J, IRCALL_memset, trdst, trfill, trlen);  /* Note: arg order! */
  }
  emitir(IRT(IR_XBAR, IRT_NIL), 0, 0);
}
348 | |||
349 | /* -- Convert C type to C type -------------------------------------------- */ | ||
350 | |||
351 | /* | ||
352 | ** This code mirrors the code in lj_cconv.c. It performs the same steps | ||
353 | ** for the trace recorder that lj_cconv.c does for the interpreter. | ||
354 | ** | ||
355 | ** One major difference is that we can get away with much fewer checks | ||
356 | ** here. E.g. checks for casts, constness or correct types can often be | ||
357 | ** omitted, even if they might fail. The interpreter subsequently throws | ||
358 | ** an error, which aborts the trace. | ||
359 | ** | ||
360 | ** All operations are specialized to their C types, so the on-trace | ||
361 | ** outcome must be the same as the outcome in the interpreter. If the | ||
362 | ** interpreter doesn't throw an error, then the trace is correct, too. | ||
363 | ** Care must be taken not to generate invalid (temporary) IR or to | ||
364 | ** trigger asserts. | ||
365 | */ | ||
366 | |||
138 | /* Determine whether a passed number or cdata number is non-zero. */ | 367 | /* Determine whether a passed number or cdata number is non-zero. */ |
139 | static int crec_isnonzero(CType *s, void *p) | 368 | static int crec_isnonzero(CType *s, void *p) |
140 | { | 369 | { |
@@ -1298,26 +1527,32 @@ void LJ_FASTCALL recff_ffi_copy(jit_State *J, RecordFFData *rd) | |||
1298 | trlen = emitir(IRTI(IR_FLOAD), J->base[1], IRFL_STR_LEN); | 1527 | trlen = emitir(IRTI(IR_FLOAD), J->base[1], IRFL_STR_LEN); |
1299 | trlen = emitir(IRTI(IR_ADD), trlen, lj_ir_kint(J, 1)); | 1528 | trlen = emitir(IRTI(IR_ADD), trlen, lj_ir_kint(J, 1)); |
1300 | } | 1529 | } |
1301 | lj_ir_call(J, IRCALL_memcpy, trdst, trsrc, trlen); | ||
1302 | emitir(IRT(IR_XBAR, IRT_NIL), 0, 0); | ||
1303 | rd->nres = 0; | 1530 | rd->nres = 0; |
1531 | crec_copy(J, trdst, trsrc, trlen, NULL); | ||
1304 | } /* else: interpreter will throw. */ | 1532 | } /* else: interpreter will throw. */ |
1305 | } | 1533 | } |
1306 | 1534 | ||
/* Record ffi.fill(dst, len [,fill]). */
void LJ_FASTCALL recff_ffi_fill(jit_State *J, RecordFFData *rd)
{
  CTState *cts = ctype_ctsG(J2G(J));
  TRef trdst = J->base[0], trlen = J->base[1], trfill = J->base[2];
  if (trdst && trlen) {
    CTSize step = 1;
    if (tviscdata(&rd->argv[0])) {  /* Get alignment of original destination. */
      CTSize sz;
      CType *ct = ctype_raw(cts, cdataV(&rd->argv[0])->ctypeid);
      if (ctype_isptr(ct->info))
	ct = ctype_rawchild(cts, ct);  /* Use the pointed-to type's alignment. */
      step = (1u<<ctype_align(lj_ctype_info(cts, ctype_typeid(cts, ct), &sz)));
    }
    /* Must read the alignment above BEFORE this conversion erases the type. */
    trdst = crec_ct_tv(J, ctype_get(cts, CTID_P_VOID), 0, trdst, &rd->argv[0]);
    trlen = crec_toint(J, cts, trlen, &rd->argv[1]);
    if (trfill)
      trfill = crec_toint(J, cts, trfill, &rd->argv[2]);
    else
      trfill = lj_ir_kint(J, 0);  /* Omitted fill value defaults to zero. */
    rd->nres = 0;  /* ffi.fill() returns no results. */
    crec_fill(J, trdst, trlen, trfill, step);
  }  /* else: interpreter will throw. */
}
1323 | 1558 | ||