diff options
| author | Mike Pall <mike> | 2012-10-09 21:49:08 +0200 |
|---|---|---|
| committer | Mike Pall <mike> | 2012-10-09 21:49:57 +0200 |
| commit | a7d1dbacb19c3e5d0c4888cdf59913693e4648b6 (patch) | |
| tree | bc549e9263100e40c4ed60bbc5efa21bd1fb9e97 /src | |
| parent | 5ebe4990baa2a0fcbd8c22cf813a18082a7eb961 (diff) | |
| download | luajit-a7d1dbacb19c3e5d0c4888cdf59913693e4648b6.tar.gz luajit-a7d1dbacb19c3e5d0c4888cdf59913693e4648b6.tar.bz2 luajit-a7d1dbacb19c3e5d0c4888cdf59913693e4648b6.zip | |
FFI: Optimize ffi.copy() and ffi.fill().
Diffstat (limited to 'src')
| -rw-r--r-- | src/lj_crecord.c | 287 |
1 files changed, 261 insertions, 26 deletions
diff --git a/src/lj_crecord.c b/src/lj_crecord.c index fbb5d79a..47ae65b4 100644 --- a/src/lj_crecord.c +++ b/src/lj_crecord.c | |||
| @@ -91,25 +91,7 @@ static CTypeID argv2ctype(jit_State *J, TRef tr, cTValue *o) | |||
| 91 | } | 91 | } |
| 92 | } | 92 | } |
| 93 | 93 | ||
| 94 | /* -- Convert C type to C type -------------------------------------------- */ | 94 | /* Convert CType to IRType (if possible). */ |
| 95 | |||
| 96 | /* | ||
| 97 | ** This code mirrors the code in lj_cconv.c. It performs the same steps | ||
| 98 | ** for the trace recorder that lj_cconv.c does for the interpreter. | ||
| 99 | ** | ||
| 100 | ** One major difference is that we can get away with much fewer checks | ||
| 101 | ** here. E.g. checks for casts, constness or correct types can often be | ||
| 102 | ** omitted, even if they might fail. The interpreter subsequently throws | ||
| 103 | ** an error, which aborts the trace. | ||
| 104 | ** | ||
| 105 | ** All operations are specialized to their C types, so the on-trace | ||
| 106 | ** outcome must be the same as the outcome in the interpreter. If the | ||
| 107 | ** interpreter doesn't throw an error, then the trace is correct, too. | ||
| 108 | ** Care must be taken not to generate invalid (temporary) IR or to | ||
| 109 | ** trigger asserts. | ||
| 110 | */ | ||
| 111 | |||
| 112 | /* Convert CType to IRType. */ | ||
| 113 | static IRType crec_ct2irt(CTState *cts, CType *ct) | 95 | static IRType crec_ct2irt(CTState *cts, CType *ct) |
| 114 | { | 96 | { |
| 115 | if (ctype_isenum(ct->info)) ct = ctype_child(cts, ct); | 97 | if (ctype_isenum(ct->info)) ct = ctype_child(cts, ct); |
| @@ -135,6 +117,253 @@ static IRType crec_ct2irt(CTState *cts, CType *ct) | |||
| 135 | return IRT_CDATA; | 117 | return IRT_CDATA; |
| 136 | } | 118 | } |
| 137 | 119 | ||
| 120 | /* -- Optimized memory fill and copy -------------------------------------- */ | ||
| 121 | |||
/*
** Limits for inlined copy/fill: the maximum number of unrolled
** loads/stores and the maximum length in bytes.
*/
#define CREC_COPY_MAXUNROLL	16
#define CREC_COPY_MAXLEN	128

#define CREC_FILL_MAXUNROLL	16
#if !LJ_TARGET_UNALIGNED
/* Length limit equals the unroll count. */
#define CREC_FILL_MAXLEN	CREC_FILL_MAXUNROLL
#else
/* Length limit is the unroll count times the pointer size. */
#define CREC_FILL_MAXLEN	(CTSIZE_PTR * CREC_FILL_MAXUNROLL)
#endif

/*
** Size of the register window for the optimized memory copy: the number
** of loads that are buffered before their stores are flushed.
*/
#if LJ_TARGET_X86
#define CREC_COPY_REGWIN	2
#elif LJ_TARGET_PPC || LJ_TARGET_MIPS
#define CREC_COPY_REGWIN	8
#else
#define CREC_COPY_REGWIN	4
#endif
| 141 | |||
| 142 | /* List of memory offsets for copy/fill. */ | ||
| 143 | typedef struct CRecMemList { | ||
| 144 | CTSize ofs; /* Offset in bytes. */ | ||
| 145 | IRType tp; /* Type of load/store. */ | ||
| 146 | TRef trofs; /* TRef of interned offset. */ | ||
| 147 | TRef trval; /* TRef of load value. */ | ||
| 148 | } CRecMemList; | ||
| 149 | |||
| 150 | /* Generate copy list for element-wise struct copy. */ | ||
| 151 | static MSize crec_copy_struct(CRecMemList *ml, CTState *cts, CType *ct) | ||
| 152 | { | ||
| 153 | CTypeID fid = ct->sib; | ||
| 154 | MSize mlp = 0; | ||
| 155 | while (fid) { | ||
| 156 | CType *df = ctype_get(cts, fid); | ||
| 157 | fid = df->sib; | ||
| 158 | if (ctype_isfield(df->info)) { | ||
| 159 | CType *cct; | ||
| 160 | IRType tp; | ||
| 161 | if (!gcref(df->name)) continue; /* Ignore unnamed fields. */ | ||
| 162 | cct = ctype_rawchild(cts, df); /* Field type. */ | ||
| 163 | tp = crec_ct2irt(cts, cct); | ||
| 164 | if (tp == IRT_CDATA) return 0; /* NYI: aggregates. */ | ||
| 165 | if (mlp >= CREC_COPY_MAXUNROLL) return 0; | ||
| 166 | ml[mlp].ofs = df->size; | ||
| 167 | ml[mlp].tp = tp; | ||
| 168 | mlp++; | ||
| 169 | if (ctype_iscomplex(cct->info)) { | ||
| 170 | if (mlp >= CREC_COPY_MAXUNROLL) return 0; | ||
| 171 | ml[mlp].ofs = df->size + (cct->size >> 1); | ||
| 172 | ml[mlp].tp = tp; | ||
| 173 | mlp++; | ||
| 174 | } | ||
| 175 | } else if (!ctype_isconstval(df->info)) { | ||
| 176 | /* NYI: bitfields and sub-structures. */ | ||
| 177 | return 0; | ||
| 178 | } | ||
| 179 | } | ||
| 180 | return mlp; | ||
| 181 | } | ||
| 182 | |||
| 183 | /* Generate unrolled copy list, from highest to lowest step size/alignment. */ | ||
| 184 | static MSize crec_copy_unroll(CRecMemList *ml, CTSize len, CTSize step, | ||
| 185 | IRType tp) | ||
| 186 | { | ||
| 187 | CTSize ofs = 0; | ||
| 188 | MSize mlp = 0; | ||
| 189 | if (tp == IRT_CDATA) tp = IRT_U8 + 2*lj_fls(step); | ||
| 190 | do { | ||
| 191 | while (ofs + step <= len) { | ||
| 192 | if (mlp >= CREC_COPY_MAXUNROLL) return 0; | ||
| 193 | ml[mlp].ofs = ofs; | ||
| 194 | ml[mlp].tp = tp; | ||
| 195 | mlp++; | ||
| 196 | ofs += step; | ||
| 197 | } | ||
| 198 | step >>= 1; | ||
| 199 | tp -= 2; | ||
| 200 | } while (ofs < len); | ||
| 201 | return mlp; | ||
| 202 | } | ||
| 203 | |||
| 204 | /* | ||
| 205 | ** Emit copy list with windowed loads/stores. | ||
| 206 | ** LJ_TARGET_UNALIGNED: may emit unaligned loads/stores (not marked as such). | ||
| 207 | */ | ||
| 208 | static void crec_copy_emit(jit_State *J, CRecMemList *ml, MSize mlp, | ||
| 209 | TRef trdst, TRef trsrc) | ||
| 210 | { | ||
| 211 | MSize i, j, rwin = 0; | ||
| 212 | for (i = 0, j = 0; i < mlp; ) { | ||
| 213 | TRef trofs = lj_ir_kintp(J, ml[i].ofs); | ||
| 214 | TRef trsptr = emitir(IRT(IR_ADD, IRT_PTR), trsrc, trofs); | ||
| 215 | ml[i].trval = emitir(IRT(IR_XLOAD, ml[i].tp), trsptr, 0); | ||
| 216 | ml[i].trofs = trofs; | ||
| 217 | i++; | ||
| 218 | rwin += (LJ_SOFTFP && ml[i].tp == IRT_NUM) ? 2 : 1; | ||
| 219 | if (rwin >= CREC_COPY_REGWIN || i >= mlp) { /* Flush buffered stores. */ | ||
| 220 | rwin = 0; | ||
| 221 | for ( ; j < i; j++) { | ||
| 222 | TRef trdptr = emitir(IRT(IR_ADD, IRT_PTR), trdst, ml[j].trofs); | ||
| 223 | emitir(IRT(IR_XSTORE, ml[j].tp), trdptr, ml[j].trval); | ||
| 224 | } | ||
| 225 | } | ||
| 226 | } | ||
| 227 | } | ||
| 228 | |||
| 229 | /* Optimized memory copy. */ | ||
| 230 | static void crec_copy(jit_State *J, TRef trdst, TRef trsrc, TRef trlen, | ||
| 231 | CType *ct) | ||
| 232 | { | ||
| 233 | if (tref_isk(trlen)) { /* Length must be constant. */ | ||
| 234 | CRecMemList ml[CREC_COPY_MAXUNROLL]; | ||
| 235 | MSize mlp = 0; | ||
| 236 | CTSize step = 1, len = (CTSize)IR(tref_ref(trlen))->i; | ||
| 237 | IRType tp = IRT_CDATA; | ||
| 238 | int needxbar = 0; | ||
| 239 | if (len == 0) return; /* Shortcut. */ | ||
| 240 | if (len > CREC_COPY_MAXLEN) goto fallback; | ||
| 241 | if (ct) { | ||
| 242 | CTState *cts = ctype_ctsG(J2G(J)); | ||
| 243 | lua_assert(ctype_isarray(ct->info) || ctype_isstruct(ct->info)); | ||
| 244 | if (ctype_isarray(ct->info)) { | ||
| 245 | CType *cct = ctype_rawchild(cts, ct); | ||
| 246 | tp = crec_ct2irt(cts, cct); | ||
| 247 | if (tp == IRT_CDATA) goto rawcopy; | ||
| 248 | step = lj_ir_type_size[tp]; | ||
| 249 | lua_assert((len & (step-1)) == 0); | ||
| 250 | } else if ((ct->info & CTF_UNION)) { | ||
| 251 | step = (1u << ctype_align(ct->info)); | ||
| 252 | goto rawcopy; | ||
| 253 | } else { | ||
| 254 | mlp = crec_copy_struct(ml, cts, ct); | ||
| 255 | goto emitcopy; | ||
| 256 | } | ||
| 257 | } else { | ||
| 258 | rawcopy: | ||
| 259 | needxbar = 1; | ||
| 260 | if (LJ_TARGET_UNALIGNED || step >= CTSIZE_PTR) | ||
| 261 | step = CTSIZE_PTR; | ||
| 262 | } | ||
| 263 | mlp = crec_copy_unroll(ml, len, step, tp); | ||
| 264 | emitcopy: | ||
| 265 | if (mlp) { | ||
| 266 | crec_copy_emit(J, ml, mlp, trdst, trsrc); | ||
| 267 | if (needxbar) | ||
| 268 | emitir(IRT(IR_XBAR, IRT_NIL), 0, 0); | ||
| 269 | return; | ||
| 270 | } | ||
| 271 | } | ||
| 272 | fallback: | ||
| 273 | /* Call memcpy. Always needs a barrier to disable alias analysis. */ | ||
| 274 | lj_ir_call(J, IRCALL_memcpy, trdst, trsrc, trlen); | ||
| 275 | emitir(IRT(IR_XBAR, IRT_NIL), 0, 0); | ||
| 276 | } | ||
| 277 | |||
| 278 | /* Generate unrolled fill list, from highest to lowest step size/alignment. */ | ||
| 279 | static MSize crec_fill_unroll(CRecMemList *ml, CTSize len, CTSize step) | ||
| 280 | { | ||
| 281 | CTSize ofs = 0; | ||
| 282 | MSize mlp = 0; | ||
| 283 | IRType tp = IRT_U8 + 2*lj_fls(step); | ||
| 284 | do { | ||
| 285 | while (ofs + step <= len) { | ||
| 286 | if (mlp >= CREC_COPY_MAXUNROLL) return 0; | ||
| 287 | ml[mlp].ofs = ofs; | ||
| 288 | ml[mlp].tp = tp; | ||
| 289 | mlp++; | ||
| 290 | ofs += step; | ||
| 291 | } | ||
| 292 | step >>= 1; | ||
| 293 | tp -= 2; | ||
| 294 | } while (ofs < len); | ||
| 295 | return mlp; | ||
| 296 | } | ||
| 297 | |||
| 298 | /* | ||
| 299 | ** Emit stores for fill list. | ||
| 300 | ** LJ_TARGET_UNALIGNED: may emit unaligned stores (not marked as such). | ||
| 301 | */ | ||
| 302 | static void crec_fill_emit(jit_State *J, CRecMemList *ml, MSize mlp, | ||
| 303 | TRef trdst, TRef trfill) | ||
| 304 | { | ||
| 305 | MSize i; | ||
| 306 | for (i = 0; i < mlp; i++) { | ||
| 307 | TRef trofs = lj_ir_kintp(J, ml[i].ofs); | ||
| 308 | TRef trdptr = emitir(IRT(IR_ADD, IRT_PTR), trdst, trofs); | ||
| 309 | emitir(IRT(IR_XSTORE, ml[i].tp), trdptr, trfill); | ||
| 310 | } | ||
| 311 | } | ||
| 312 | |||
| 313 | /* Optimized memory fill. */ | ||
| 314 | static void crec_fill(jit_State *J, TRef trdst, TRef trlen, TRef trfill, | ||
| 315 | CTSize step) | ||
| 316 | { | ||
| 317 | if (tref_isk(trlen)) { /* Length must be constant. */ | ||
| 318 | CRecMemList ml[CREC_FILL_MAXUNROLL]; | ||
| 319 | MSize mlp; | ||
| 320 | CTSize len = (CTSize)IR(tref_ref(trlen))->i; | ||
| 321 | if (len == 0) return; /* Shortcut. */ | ||
| 322 | if (len > CREC_FILL_MAXLEN) goto fallback; | ||
| 323 | if (LJ_TARGET_UNALIGNED || step >= CTSIZE_PTR) | ||
| 324 | step = CTSIZE_PTR; | ||
| 325 | mlp = crec_fill_unroll(ml, len, step); | ||
| 326 | if (!mlp) goto fallback; | ||
| 327 | if (tref_isk(trfill) || ml[0].tp != IRT_U8) | ||
| 328 | trfill = emitconv(trfill, IRT_INT, IRT_U8, 0); | ||
| 329 | if (ml[0].tp != IRT_U8) { /* Scatter U8 to U16/U32/U64. */ | ||
| 330 | if (CTSIZE_PTR == 8 && ml[0].tp == IRT_U64) { | ||
| 331 | if (tref_isk(trfill)) /* Pointless on x64 with zero-extended regs. */ | ||
| 332 | trfill = emitconv(trfill, IRT_U64, IRT_U32, 0); | ||
| 333 | trfill = emitir(IRT(IR_MUL, IRT_U64), trfill, | ||
| 334 | lj_ir_kint64(J, U64x(01010101,01010101))); | ||
| 335 | } else { | ||
| 336 | trfill = emitir(IRTI(IR_MUL), trfill, | ||
| 337 | lj_ir_kint(J, ml[0].tp == IRT_U16 ? 0x0101 : 0x01010101)); | ||
| 338 | } | ||
| 339 | } | ||
| 340 | crec_fill_emit(J, ml, mlp, trdst, trfill); | ||
| 341 | } else { | ||
| 342 | fallback: | ||
| 343 | /* Call memset. Always needs a barrier to disable alias analysis. */ | ||
| 344 | lj_ir_call(J, IRCALL_memset, trdst, trfill, trlen); /* Note: arg order! */ | ||
| 345 | } | ||
| 346 | emitir(IRT(IR_XBAR, IRT_NIL), 0, 0); | ||
| 347 | } | ||
| 348 | |||
| 349 | /* -- Convert C type to C type -------------------------------------------- */ | ||
| 350 | |||
| 351 | /* | ||
| 352 | ** This code mirrors the code in lj_cconv.c. It performs the same steps | ||
| 353 | ** for the trace recorder that lj_cconv.c does for the interpreter. | ||
| 354 | ** | ||
| 355 | ** One major difference is that we can get away with much fewer checks | ||
| 356 | ** here. E.g. checks for casts, constness or correct types can often be | ||
| 357 | ** omitted, even if they might fail. The interpreter subsequently throws | ||
| 358 | ** an error, which aborts the trace. | ||
| 359 | ** | ||
| 360 | ** All operations are specialized to their C types, so the on-trace | ||
| 361 | ** outcome must be the same as the outcome in the interpreter. If the | ||
| 362 | ** interpreter doesn't throw an error, then the trace is correct, too. | ||
| 363 | ** Care must be taken not to generate invalid (temporary) IR or to | ||
| 364 | ** trigger asserts. | ||
| 365 | */ | ||
| 366 | |||
| 138 | /* Determine whether a passed number or cdata number is non-zero. */ | 367 | /* Determine whether a passed number or cdata number is non-zero. */ |
| 139 | static int crec_isnonzero(CType *s, void *p) | 368 | static int crec_isnonzero(CType *s, void *p) |
| 140 | { | 369 | { |
| @@ -1298,26 +1527,32 @@ void LJ_FASTCALL recff_ffi_copy(jit_State *J, RecordFFData *rd) | |||
| 1298 | trlen = emitir(IRTI(IR_FLOAD), J->base[1], IRFL_STR_LEN); | 1527 | trlen = emitir(IRTI(IR_FLOAD), J->base[1], IRFL_STR_LEN); |
| 1299 | trlen = emitir(IRTI(IR_ADD), trlen, lj_ir_kint(J, 1)); | 1528 | trlen = emitir(IRTI(IR_ADD), trlen, lj_ir_kint(J, 1)); |
| 1300 | } | 1529 | } |
| 1301 | lj_ir_call(J, IRCALL_memcpy, trdst, trsrc, trlen); | ||
| 1302 | emitir(IRT(IR_XBAR, IRT_NIL), 0, 0); | ||
| 1303 | rd->nres = 0; | 1530 | rd->nres = 0; |
| 1531 | crec_copy(J, trdst, trsrc, trlen, NULL); | ||
| 1304 | } /* else: interpreter will throw. */ | 1532 | } /* else: interpreter will throw. */ |
| 1305 | } | 1533 | } |
| 1306 | 1534 | ||
| 1307 | void LJ_FASTCALL recff_ffi_fill(jit_State *J, RecordFFData *rd) | 1535 | void LJ_FASTCALL recff_ffi_fill(jit_State *J, RecordFFData *rd) |
| 1308 | { | 1536 | { |
| 1309 | CTState *cts = ctype_ctsG(J2G(J)); | 1537 | CTState *cts = ctype_ctsG(J2G(J)); |
| 1310 | TRef tr = J->base[0], trlen = J->base[1], trfill = J->base[2]; | 1538 | TRef trdst = J->base[0], trlen = J->base[1], trfill = J->base[2]; |
| 1311 | if (tr && trlen) { | 1539 | if (trdst && trlen) { |
| 1312 | tr = crec_ct_tv(J, ctype_get(cts, CTID_P_VOID), 0, tr, &rd->argv[0]); | 1540 | CTSize step = 1; |
| 1541 | if (tviscdata(&rd->argv[0])) { /* Get alignment of original destination. */ | ||
| 1542 | CTSize sz; | ||
| 1543 | CType *ct = ctype_raw(cts, cdataV(&rd->argv[0])->ctypeid); | ||
| 1544 | if (ctype_isptr(ct->info)) | ||
| 1545 | ct = ctype_rawchild(cts, ct); | ||
| 1546 | step = (1u<<ctype_align(lj_ctype_info(cts, ctype_typeid(cts, ct), &sz))); | ||
| 1547 | } | ||
| 1548 | trdst = crec_ct_tv(J, ctype_get(cts, CTID_P_VOID), 0, trdst, &rd->argv[0]); | ||
| 1313 | trlen = crec_toint(J, cts, trlen, &rd->argv[1]); | 1549 | trlen = crec_toint(J, cts, trlen, &rd->argv[1]); |
| 1314 | if (trfill) | 1550 | if (trfill) |
| 1315 | trfill = crec_toint(J, cts, trfill, &rd->argv[2]); | 1551 | trfill = crec_toint(J, cts, trfill, &rd->argv[2]); |
| 1316 | else | 1552 | else |
| 1317 | trfill = lj_ir_kint(J, 0); | 1553 | trfill = lj_ir_kint(J, 0); |
| 1318 | lj_ir_call(J, IRCALL_memset, tr, trfill, trlen); | ||
| 1319 | emitir(IRT(IR_XBAR, IRT_NIL), 0, 0); | ||
| 1320 | rd->nres = 0; | 1554 | rd->nres = 0; |
| 1555 | crec_fill(J, trdst, trlen, trfill, step); | ||
| 1321 | } /* else: interpreter will throw. */ | 1556 | } /* else: interpreter will throw. */ |
| 1322 | } | 1557 | } |
| 1323 | 1558 | ||
