diff options
Diffstat (limited to 'lpcode.c')
-rw-r--r-- | lpcode.c | 1016 |
1 files changed, 1016 insertions, 0 deletions
diff --git a/lpcode.c b/lpcode.c new file mode 100644 index 0000000..4a49e7d --- /dev/null +++ b/lpcode.c | |||
@@ -0,0 +1,1016 @@ | |||
1 | /* | ||
2 | ** $Id: lpcode.c,v 1.18 2013/04/12 16:30:33 roberto Exp $ | ||
3 | ** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license) | ||
4 | */ | ||
5 | |||
6 | #include <limits.h> | ||
7 | |||
8 | |||
9 | #include "lua.h" | ||
10 | #include "lauxlib.h" | ||
11 | |||
12 | #include "lptypes.h" | ||
13 | #include "lpcode.h" | ||
14 | |||
15 | |||
16 | /* signals a "no-instruction */ | ||
17 | #define NOINST -1 | ||
18 | |||
19 | |||
20 | |||
21 | static const Charset fullset_ = | ||
22 | {{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, | ||
23 | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, | ||
24 | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, | ||
25 | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}}; | ||
26 | |||
27 | static const Charset *fullset = &fullset_; | ||
28 | |||
29 | /* | ||
30 | ** {====================================================== | ||
31 | ** Analysis and some optimizations | ||
32 | ** ======================================================= | ||
33 | */ | ||
34 | |||
35 | /* | ||
36 | ** Check whether a charset is empty (IFail), singleton (IChar), | ||
37 | ** full (IAny), or none of those (ISet). | ||
38 | */ | ||
39 | static Opcode charsettype (const byte *cs, int *c) { | ||
40 | int count = 0; | ||
41 | int i; | ||
42 | int candidate = -1; /* candidate position for a char */ | ||
43 | for (i = 0; i < CHARSETSIZE; i++) { | ||
44 | int b = cs[i]; | ||
45 | if (b == 0) { | ||
46 | if (count > 1) return ISet; /* else set is still empty */ | ||
47 | } | ||
48 | else if (b == 0xFF) { | ||
49 | if (count < (i * BITSPERCHAR)) | ||
50 | return ISet; | ||
51 | else count += BITSPERCHAR; /* set is still full */ | ||
52 | } | ||
53 | else if ((b & (b - 1)) == 0) { /* byte has only one bit? */ | ||
54 | if (count > 0) | ||
55 | return ISet; /* set is neither full nor empty */ | ||
56 | else { /* set has only one char till now; track it */ | ||
57 | count++; | ||
58 | candidate = i; | ||
59 | } | ||
60 | } | ||
61 | else return ISet; /* byte is neither empty, full, nor singleton */ | ||
62 | } | ||
63 | switch (count) { | ||
64 | case 0: return IFail; /* empty set */ | ||
65 | case 1: { /* singleton; find character bit inside byte */ | ||
66 | int b = cs[candidate]; | ||
67 | *c = candidate * BITSPERCHAR; | ||
68 | if ((b & 0xF0) != 0) { *c += 4; b >>= 4; } | ||
69 | if ((b & 0x0C) != 0) { *c += 2; b >>= 2; } | ||
70 | if ((b & 0x02) != 0) { *c += 1; } | ||
71 | return IChar; | ||
72 | } | ||
73 | default: { | ||
74 | assert(count == CHARSETSIZE * BITSPERCHAR); /* full set */ | ||
75 | return IAny; | ||
76 | } | ||
77 | } | ||
78 | } | ||
79 | |||
80 | /* | ||
81 | ** A few basic operations on Charsets | ||
82 | */ | ||
83 | static void cs_complement (Charset *cs) { | ||
84 | loopset(i, cs->cs[i] = ~cs->cs[i]); | ||
85 | } | ||
86 | |||
87 | |||
88 | static int cs_equal (const byte *cs1, const byte *cs2) { | ||
89 | loopset(i, if (cs1[i] != cs2[i]) return 0); | ||
90 | return 1; | ||
91 | } | ||
92 | |||
93 | |||
94 | /* | ||
95 | ** computes whether sets cs1 and cs2 are disjoint | ||
96 | */ | ||
97 | static int cs_disjoint (const Charset *cs1, const Charset *cs2) { | ||
98 | loopset(i, if ((cs1->cs[i] & cs2->cs[i]) != 0) return 0;) | ||
99 | return 1; | ||
100 | } | ||
101 | |||
102 | |||
103 | /* | ||
104 | ** Convert a 'char' pattern (TSet, TChar, TAny) to a charset | ||
105 | */ | ||
106 | int tocharset (TTree *tree, Charset *cs) { | ||
107 | switch (tree->tag) { | ||
108 | case TSet: { /* copy set */ | ||
109 | loopset(i, cs->cs[i] = treebuffer(tree)[i]); | ||
110 | return 1; | ||
111 | } | ||
112 | case TChar: { /* only one char */ | ||
113 | assert(0 <= tree->u.n && tree->u.n <= UCHAR_MAX); | ||
114 | loopset(i, cs->cs[i] = 0); /* erase all chars */ | ||
115 | setchar(cs->cs, tree->u.n); /* add that one */ | ||
116 | return 1; | ||
117 | } | ||
118 | case TAny: { | ||
119 | loopset(i, cs->cs[i] = 0xFF); /* add all to the set */ | ||
120 | return 1; | ||
121 | } | ||
122 | default: return 0; | ||
123 | } | ||
124 | } | ||
125 | |||
126 | |||
127 | /* | ||
128 | ** Checks whether a pattern has captures | ||
129 | */ | ||
130 | int hascaptures (TTree *tree) { | ||
131 | tailcall: | ||
132 | switch (tree->tag) { | ||
133 | case TCapture: case TRunTime: | ||
134 | return 1; | ||
135 | default: { | ||
136 | switch (numsiblings[tree->tag]) { | ||
137 | case 1: /* return hascaptures(sib1(tree)); */ | ||
138 | tree = sib1(tree); goto tailcall; | ||
139 | case 2: | ||
140 | if (hascaptures(sib1(tree))) return 1; | ||
141 | /* else return hascaptures(sib2(tree)); */ | ||
142 | tree = sib2(tree); goto tailcall; | ||
143 | default: assert(numsiblings[tree->tag] == 0); return 0; | ||
144 | } | ||
145 | } | ||
146 | } | ||
147 | } | ||
148 | |||
149 | |||
150 | /* | ||
151 | ** Checks how a pattern behaves regarding the empty string, | ||
152 | ** in one of two different ways: | ||
153 | ** A pattern is *nullable* if it can match without consuming any character; | ||
154 | ** A pattern is *nofail* if it never fails for any string | ||
155 | ** (including the empty string). | ||
156 | ** The difference is only for predicates and run-time captures; | ||
157 | ** for other patterns, the two properties are equivalent. | ||
158 | ** (With predicates, &'a' is nullable but not nofail. Of course, | ||
159 | ** nofail => nullable.) | ||
160 | ** These functions are all convervative in the following way: | ||
161 | ** p is nullable => nullable(p) | ||
162 | ** nofail(p) => p cannot fail | ||
163 | ** The function assumes that TOpenCall is not nullable; | ||
164 | ** this will be checked again when the grammar is fixed.) | ||
165 | ** Run-time captures can do whatever they want, so the result | ||
166 | ** is conservative. | ||
167 | */ | ||
168 | int checkaux (TTree *tree, int pred) { | ||
169 | tailcall: | ||
170 | switch (tree->tag) { | ||
171 | case TChar: case TSet: case TAny: | ||
172 | case TFalse: case TOpenCall: case TThrow: /* labeled failure */ | ||
173 | return 0; /* not nullable */ | ||
174 | case TRep: case TTrue: | ||
175 | return 1; /* no fail */ | ||
176 | case TNot: case TBehind: /* can match empty, but can fail */ | ||
177 | if (pred == PEnofail) return 0; | ||
178 | else return 1; /* PEnullable */ | ||
179 | case TAnd: /* can match empty; fail iff body does */ | ||
180 | if (pred == PEnullable) return 1; | ||
181 | /* else return checkaux(sib1(tree), pred); */ | ||
182 | tree = sib1(tree); goto tailcall; | ||
183 | case TRunTime: /* can fail; match empty iff body does */ | ||
184 | if (pred == PEnofail) return 0; | ||
185 | /* else return checkaux(sib1(tree), pred); */ | ||
186 | tree = sib1(tree); goto tailcall; | ||
187 | case TSeq: | ||
188 | if (!checkaux(sib1(tree), pred)) return 0; | ||
189 | /* else return checkaux(sib2(tree), pred); */ | ||
190 | tree = sib2(tree); goto tailcall; | ||
191 | case TChoice: case TLabChoice: /* labeled failure */ | ||
192 | if (checkaux(sib2(tree), pred)) return 1; | ||
193 | /* else return checkaux(sib1(tree), pred); */ | ||
194 | tree = sib1(tree); goto tailcall; | ||
195 | case TCapture: case TGrammar: case TRule: | ||
196 | /* return checkaux(sib1(tree), pred); */ | ||
197 | tree = sib1(tree); goto tailcall; | ||
198 | case TCall: /* return checkaux(sib2(tree), pred); */ | ||
199 | tree = sib2(tree); goto tailcall; | ||
200 | default: assert(0); return 0; | ||
201 | }; | ||
202 | } | ||
203 | |||
204 | |||
205 | /* | ||
206 | ** number of characters to match a pattern (or -1 if variable) | ||
207 | ** ('count' avoids infinite loops for grammars) | ||
208 | */ | ||
209 | int fixedlenx (TTree *tree, int count, int len) { | ||
210 | tailcall: | ||
211 | switch (tree->tag) { | ||
212 | case TChar: case TSet: case TAny: | ||
213 | return len + 1; | ||
214 | case TFalse: case TTrue: case TNot: case TAnd: case TBehind: | ||
215 | case TThrow: /* labeled failure */ | ||
216 | return len; | ||
217 | case TRep: case TRunTime: case TOpenCall: | ||
218 | return -1; | ||
219 | case TCapture: case TRule: case TGrammar: | ||
220 | /* return fixedlenx(sib1(tree), count); */ | ||
221 | tree = sib1(tree); goto tailcall; | ||
222 | case TCall: | ||
223 | if (count++ >= MAXRULES) | ||
224 | return -1; /* may be a loop */ | ||
225 | /* else return fixedlenx(sib2(tree), count); */ | ||
226 | tree = sib2(tree); goto tailcall; | ||
227 | case TSeq: { | ||
228 | len = fixedlenx(sib1(tree), count, len); | ||
229 | if (len < 0) return -1; | ||
230 | /* else return fixedlenx(sib2(tree), count, len); */ | ||
231 | tree = sib2(tree); goto tailcall; | ||
232 | } | ||
233 | case TChoice: case TLabChoice: { /* labeled failure */ | ||
234 | int n1, n2; | ||
235 | n1 = fixedlenx(sib1(tree), count, len); | ||
236 | if (n1 < 0) return -1; | ||
237 | n2 = fixedlenx(sib2(tree), count, len); | ||
238 | if (n1 == n2) return n1; | ||
239 | else return -1; | ||
240 | } | ||
241 | default: assert(0); return 0; | ||
242 | }; | ||
243 | } | ||
244 | |||
245 | |||
246 | /* | ||
247 | ** Computes the 'first set' of a pattern. | ||
248 | ** The result is a conservative aproximation: | ||
249 | ** match p ax -> x' for some x ==> a in first(p). | ||
250 | ** The set 'follow' is the first set of what follows the | ||
251 | ** pattern (full set if nothing follows it). | ||
252 | ** The function returns 0 when this set can be used for | ||
253 | ** tests that avoid the pattern altogether. | ||
254 | ** A non-zero return can happen for two reasons: | ||
255 | ** 1) match p '' -> '' ==> returns 1. | ||
256 | ** (tests cannot be used because they always fail for an empty input) | ||
257 | ** 2) there is a match-time capture ==> returns 2. | ||
258 | ** (match-time captures should not be avoided by optimizations) | ||
259 | */ | ||
260 | static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { | ||
261 | tailcall: | ||
262 | switch (tree->tag) { | ||
263 | case TChar: case TSet: case TAny: { | ||
264 | tocharset(tree, firstset); | ||
265 | return 0; | ||
266 | } | ||
267 | case TTrue: { | ||
268 | loopset(i, firstset->cs[i] = follow->cs[i]); | ||
269 | return 1; | ||
270 | } | ||
271 | case TFalse: { | ||
272 | loopset(i, firstset->cs[i] = 0); | ||
273 | return 0; | ||
274 | } | ||
275 | case TThrow: { /* (?)labeled failure: should always throw the label */ | ||
276 | loopset(i, firstset->cs[i] = follow->cs[i]); /* follow = fullset? */ | ||
277 | return 1; | ||
278 | } | ||
279 | case TChoice: case TLabChoice: { /*(?) labeled failure */ | ||
280 | Charset csaux; | ||
281 | int e1 = getfirst(sib1(tree), follow, firstset); | ||
282 | int e2 = getfirst(sib2(tree), follow, &csaux); | ||
283 | loopset(i, firstset->cs[i] |= csaux.cs[i]); | ||
284 | return e1 | e2; | ||
285 | } | ||
286 | case TSeq: { | ||
287 | if (!nullable(sib1(tree))) { | ||
288 | /* return getfirst(sib1(tree), fullset, firstset); */ | ||
289 | tree = sib1(tree); follow = fullset; goto tailcall; | ||
290 | } | ||
291 | else { /* FIRST(p1 p2, fl) = FIRST(p1, FIRST(p2, fl)) */ | ||
292 | Charset csaux; | ||
293 | int e2 = getfirst(sib2(tree), follow, &csaux); | ||
294 | int e1 = getfirst(sib1(tree), &csaux, firstset); | ||
295 | if (e1 == 0) return 0; /* 'e1' ensures that first can be used */ | ||
296 | else if ((e1 | e2) & 2) /* one of the children has a matchtime? */ | ||
297 | return 2; /* pattern has a matchtime capture */ | ||
298 | else return e2; /* else depends on 'e2' */ | ||
299 | } | ||
300 | } | ||
301 | case TRep: { | ||
302 | getfirst(sib1(tree), follow, firstset); | ||
303 | loopset(i, firstset->cs[i] |= follow->cs[i]); | ||
304 | return 1; /* accept the empty string */ | ||
305 | } | ||
306 | case TCapture: case TGrammar: case TRule: { | ||
307 | /* return getfirst(sib1(tree), follow, firstset); */ | ||
308 | tree = sib1(tree); goto tailcall; | ||
309 | } | ||
310 | case TRunTime: { /* function invalidates any follow info. */ | ||
311 | int e = getfirst(sib1(tree), fullset, firstset); | ||
312 | if (e) return 2; /* function is not "protected"? */ | ||
313 | else return 0; /* pattern inside capture ensures first can be used */ | ||
314 | } | ||
315 | case TCall: { | ||
316 | /* return getfirst(sib2(tree), follow, firstset); */ | ||
317 | tree = sib2(tree); goto tailcall; | ||
318 | } | ||
319 | case TAnd: { | ||
320 | int e = getfirst(sib1(tree), follow, firstset); | ||
321 | loopset(i, firstset->cs[i] &= follow->cs[i]); | ||
322 | return e; | ||
323 | } | ||
324 | case TNot: { | ||
325 | if (tocharset(sib1(tree), firstset)) { | ||
326 | cs_complement(firstset); | ||
327 | return 1; | ||
328 | } | ||
329 | /* else go through */ | ||
330 | } | ||
331 | case TBehind: { /* instruction gives no new information */ | ||
332 | /* call 'getfirst' to check for math-time captures */ | ||
333 | int e = getfirst(sib1(tree), follow, firstset); | ||
334 | loopset(i, firstset->cs[i] = follow->cs[i]); /* uses follow */ | ||
335 | return e | 1; /* always can accept the empty string */ | ||
336 | } | ||
337 | default: assert(0); return 0; | ||
338 | } | ||
339 | } | ||
340 | |||
341 | |||
342 | /* | ||
343 | ** If it returns true, then pattern can fail only depending on the next | ||
344 | ** character of the subject | ||
345 | */ | ||
346 | static int headfail (TTree *tree) { | ||
347 | tailcall: | ||
348 | switch (tree->tag) { | ||
349 | case TChar: case TSet: case TAny: case TFalse: | ||
350 | return 1; | ||
351 | case TTrue: case TRep: case TRunTime: case TNot: | ||
352 | case TBehind: | ||
353 | case TThrow: /* labeled failure: should always throw the label */ | ||
354 | return 0; | ||
355 | case TCapture: case TGrammar: case TRule: case TAnd: | ||
356 | tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */ | ||
357 | case TCall: | ||
358 | tree = sib2(tree); goto tailcall; /* return headfail(sib2(tree)); */ | ||
359 | case TSeq: | ||
360 | if (!nofail(sib2(tree))) return 0; | ||
361 | /* else return headfail(sib1(tree)); */ | ||
362 | tree = sib1(tree); goto tailcall; | ||
363 | case TChoice: case TLabChoice: /* labeled failure */ | ||
364 | if (!headfail(sib1(tree))) return 0; | ||
365 | /* else return headfail(sib2(tree)); */ | ||
366 | tree = sib2(tree); goto tailcall; | ||
367 | default: assert(0); return 0; | ||
368 | } | ||
369 | } | ||
370 | |||
371 | |||
372 | /* | ||
373 | ** Check whether the code generation for the given tree can benefit | ||
374 | ** from a follow set (to avoid computing the follow set when it is | ||
375 | ** not needed) | ||
376 | */ | ||
377 | static int needfollow (TTree *tree) { | ||
378 | tailcall: | ||
379 | switch (tree->tag) { | ||
380 | case TChar: case TSet: case TAny: | ||
381 | case TFalse: case TTrue: case TAnd: case TNot: | ||
382 | case TRunTime: case TGrammar: case TCall: case TBehind: | ||
383 | case TThrow: case TLabChoice: /* (?)labeled failure */ | ||
384 | return 0; | ||
385 | case TChoice: case TRep: | ||
386 | return 1; | ||
387 | case TCapture: | ||
388 | tree = sib1(tree); goto tailcall; | ||
389 | case TSeq: | ||
390 | tree = sib2(tree); goto tailcall; | ||
391 | default: assert(0); return 0; | ||
392 | } | ||
393 | } | ||
394 | |||
395 | /* }====================================================== */ | ||
396 | |||
397 | |||
398 | |||
399 | /* | ||
400 | ** {====================================================== | ||
401 | ** Code generation | ||
402 | ** ======================================================= | ||
403 | */ | ||
404 | |||
405 | |||
406 | /* | ||
407 | ** size of an instruction | ||
408 | */ | ||
409 | int sizei (const Instruction *i) { | ||
410 | switch((Opcode)i->i.code) { | ||
411 | case ISet: case ISpan: return CHARSETINSTSIZE; | ||
412 | case ITestSet: return CHARSETINSTSIZE + 1; | ||
413 | case ITestChar: case ITestAny: case IChoice: case IJmp: | ||
414 | case ICall: case IOpenCall: case ICommit: case IPartialCommit: | ||
415 | case IBackCommit: case IThrow: return 2; /* labeled failure */ | ||
416 | case ILabChoice: return 3; /* labeled failure */ | ||
417 | default: return 1; | ||
418 | } | ||
419 | } | ||
420 | |||
421 | |||
422 | /* | ||
423 | ** state for the compiler | ||
424 | */ | ||
425 | typedef struct CompileState { | ||
426 | Pattern *p; /* pattern being compiled */ | ||
427 | int ncode; /* next position in p->code to be filled */ | ||
428 | lua_State *L; | ||
429 | } CompileState; | ||
430 | |||
431 | |||
432 | /* | ||
433 | ** code generation is recursive; 'opt' indicates that the code is | ||
434 | ** being generated under a 'IChoice' operator jumping to its end. | ||
435 | ** 'tt' points to a previous test protecting this code. 'fl' is | ||
436 | ** the follow set of the pattern. | ||
437 | */ | ||
438 | static void codegen (CompileState *compst, TTree *tree, int opt, int tt, | ||
439 | const Charset *fl); | ||
440 | |||
441 | |||
442 | void reallocprog (lua_State *L, Pattern *p, int nsize) { | ||
443 | void *ud; | ||
444 | lua_Alloc f = lua_getallocf(L, &ud); | ||
445 | void *newblock = f(ud, p->code, p->codesize * sizeof(Instruction), | ||
446 | nsize * sizeof(Instruction)); | ||
447 | if (newblock == NULL && nsize > 0) | ||
448 | luaL_error(L, "not enough memory"); | ||
449 | p->code = (Instruction *)newblock; | ||
450 | p->codesize = nsize; | ||
451 | } | ||
452 | |||
453 | |||
454 | static int nextinstruction (CompileState *compst) { | ||
455 | int size = compst->p->codesize; | ||
456 | if (compst->ncode >= size) | ||
457 | reallocprog(compst->L, compst->p, size * 2); | ||
458 | return compst->ncode++; | ||
459 | } | ||
460 | |||
461 | |||
462 | #define getinstr(cs,i) ((cs)->p->code[i]) | ||
463 | |||
464 | |||
465 | static int addinstruction (CompileState *compst, Opcode op, int aux) { | ||
466 | int i = nextinstruction(compst); | ||
467 | getinstr(compst, i).i.code = op; | ||
468 | getinstr(compst, i).i.aux = aux; | ||
469 | return i; | ||
470 | } | ||
471 | |||
472 | |||
473 | static int addoffsetinst (CompileState *compst, Opcode op) { | ||
474 | int i = addinstruction(compst, op, 0); /* instruction */ | ||
475 | addinstruction(compst, (Opcode)0, 0); /* open space for offset */ | ||
476 | assert(op == ITestSet || sizei(&getinstr(compst, i)) == 2); | ||
477 | return i; | ||
478 | } | ||
479 | |||
480 | /* labeled failure begin */ | ||
481 | static int addthrowinstruction (CompileState *compst, Labelset ls) { | ||
482 | int i = nextinstruction(compst); | ||
483 | getinstr(compst, i).i.code = IThrow; | ||
484 | i = nextinstruction(compst); | ||
485 | getinstr(compst, i).labels = ls; | ||
486 | return i; | ||
487 | } | ||
488 | |||
489 | static int addoffsetlabinst (CompileState *compst, Opcode op, Labelset ls) { | ||
490 | int j; | ||
491 | int i = addinstruction(compst, op, 0); /* instruction */ | ||
492 | addinstruction(compst, (Opcode)0, 0); /* open space for offset */ | ||
493 | j = nextinstruction(compst); /* open space for labels */ | ||
494 | getinstr(compst, j).labels = ls; | ||
495 | assert(op == ILabChoice); | ||
496 | return i; | ||
497 | } | ||
498 | /* labeled failure end */ | ||
499 | |||
500 | |||
501 | static void setoffset (CompileState *compst, int instruction, int offset) { | ||
502 | getinstr(compst, instruction + 1).offset = offset; | ||
503 | } | ||
504 | |||
505 | |||
506 | /* | ||
507 | ** Add a capture instruction: | ||
508 | ** 'op' is the capture instruction; 'cap' the capture kind; | ||
509 | ** 'key' the key into ktable; 'aux' is optional offset | ||
510 | ** | ||
511 | */ | ||
512 | static int addinstcap (CompileState *compst, Opcode op, int cap, int key, | ||
513 | int aux) { | ||
514 | int i = addinstruction(compst, op, joinkindoff(cap, aux)); | ||
515 | getinstr(compst, i).i.key = key; | ||
516 | return i; | ||
517 | } | ||
518 | |||
519 | |||
520 | #define gethere(compst) ((compst)->ncode) | ||
521 | |||
522 | #define target(code,i) ((i) + code[i + 1].offset) | ||
523 | |||
524 | |||
525 | static void jumptothere (CompileState *compst, int instruction, int target) { | ||
526 | if (instruction >= 0) | ||
527 | setoffset(compst, instruction, target - instruction); | ||
528 | } | ||
529 | |||
530 | |||
531 | static void jumptohere (CompileState *compst, int instruction) { | ||
532 | jumptothere(compst, instruction, gethere(compst)); | ||
533 | } | ||
534 | |||
535 | |||
536 | /* | ||
537 | ** Code an IChar instruction, or IAny if there is an equivalent | ||
538 | ** test dominating it | ||
539 | */ | ||
540 | static void codechar (CompileState *compst, int c, int tt) { | ||
541 | if (tt >= 0 && getinstr(compst, tt).i.code == ITestChar && | ||
542 | getinstr(compst, tt).i.aux == c) | ||
543 | addinstruction(compst, IAny, 0); | ||
544 | else | ||
545 | addinstruction(compst, IChar, c); | ||
546 | } | ||
547 | |||
548 | |||
549 | /* | ||
550 | ** Add a charset posfix to an instruction | ||
551 | */ | ||
552 | static void addcharset (CompileState *compst, const byte *cs) { | ||
553 | int p = gethere(compst); | ||
554 | int i; | ||
555 | for (i = 0; i < (int)CHARSETINSTSIZE - 1; i++) | ||
556 | nextinstruction(compst); /* space for buffer */ | ||
557 | /* fill buffer with charset */ | ||
558 | loopset(j, getinstr(compst, p).buff[j] = cs[j]); | ||
559 | } | ||
560 | |||
561 | |||
562 | /* | ||
563 | ** code a char set, optimizing unit sets for IChar, "complete" | ||
564 | ** sets for IAny, and empty sets for IFail; also use an IAny | ||
565 | ** when instruction is dominated by an equivalent test. | ||
566 | */ | ||
567 | static void codecharset (CompileState *compst, const byte *cs, int tt) { | ||
568 | int c = 0; /* (=) to avoid warnings */ | ||
569 | Opcode op = charsettype(cs, &c); | ||
570 | switch (op) { | ||
571 | case IChar: codechar(compst, c, tt); break; | ||
572 | case ISet: { /* non-trivial set? */ | ||
573 | if (tt >= 0 && getinstr(compst, tt).i.code == ITestSet && | ||
574 | cs_equal(cs, getinstr(compst, tt + 2).buff)) | ||
575 | addinstruction(compst, IAny, 0); | ||
576 | else { | ||
577 | addinstruction(compst, ISet, 0); | ||
578 | addcharset(compst, cs); | ||
579 | } | ||
580 | break; | ||
581 | } | ||
582 | default: addinstruction(compst, op, c); break; | ||
583 | } | ||
584 | } | ||
585 | |||
586 | |||
587 | /* | ||
588 | ** code a test set, optimizing unit sets for ITestChar, "complete" | ||
589 | ** sets for ITestAny, and empty sets for IJmp (always fails). | ||
590 | ** 'e' is true iff test should accept the empty string. (Test | ||
591 | ** instructions in the current VM never accept the empty string.) | ||
592 | */ | ||
593 | static int codetestset (CompileState *compst, Charset *cs, int e) { | ||
594 | if (e) return NOINST; /* no test */ | ||
595 | else { | ||
596 | int c = 0; | ||
597 | Opcode op = charsettype(cs->cs, &c); | ||
598 | switch (op) { | ||
599 | case IFail: return addoffsetinst(compst, IJmp); /* always jump */ | ||
600 | case IAny: return addoffsetinst(compst, ITestAny); | ||
601 | case IChar: { | ||
602 | int i = addoffsetinst(compst, ITestChar); | ||
603 | getinstr(compst, i).i.aux = c; | ||
604 | return i; | ||
605 | } | ||
606 | case ISet: { | ||
607 | int i = addoffsetinst(compst, ITestSet); | ||
608 | addcharset(compst, cs->cs); | ||
609 | return i; | ||
610 | } | ||
611 | default: assert(0); return 0; | ||
612 | } | ||
613 | } | ||
614 | } | ||
615 | |||
616 | |||
617 | /* | ||
618 | ** Find the final destination of a sequence of jumps | ||
619 | */ | ||
620 | static int finaltarget (Instruction *code, int i) { | ||
621 | while (code[i].i.code == IJmp) | ||
622 | i = target(code, i); | ||
623 | return i; | ||
624 | } | ||
625 | |||
626 | |||
627 | /* | ||
628 | ** final label (after traversing any jumps) | ||
629 | */ | ||
630 | static int finallabel (Instruction *code, int i) { | ||
631 | return finaltarget(code, target(code, i)); | ||
632 | } | ||
633 | |||
634 | |||
635 | /* | ||
636 | ** <behind(p)> == behind n; <p> (where n = fixedlen(p)) | ||
637 | */ | ||
638 | static void codebehind (CompileState *compst, TTree *tree) { | ||
639 | if (tree->u.n > 0) | ||
640 | addinstruction(compst, IBehind, tree->u.n); | ||
641 | codegen(compst, sib1(tree), 0, NOINST, fullset); | ||
642 | } | ||
643 | |||
644 | |||
645 | /* | ||
646 | ** Choice; optimizations: | ||
647 | ** - when p1 is headfail | ||
648 | ** - when first(p1) and first(p2) are disjoint; than | ||
649 | ** a character not in first(p1) cannot go to p1, and a character | ||
650 | ** in first(p1) cannot go to p2 (at it is not in first(p2)). | ||
651 | ** (The optimization is not valid if p1 accepts the empty string, | ||
652 | ** as then there is no character at all...) | ||
653 | ** - when p2 is empty and opt is true; a IPartialCommit can resuse | ||
654 | ** the Choice already active in the stack. | ||
655 | */ | ||
656 | static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt, | ||
657 | const Charset *fl) { | ||
658 | int emptyp2 = (p2->tag == TTrue); | ||
659 | Charset cs1, cs2; | ||
660 | int e1 = getfirst(p1, fullset, &cs1); | ||
661 | if (headfail(p1) || | ||
662 | (!e1 && (getfirst(p2, fl, &cs2), cs_disjoint(&cs1, &cs2)))) { | ||
663 | /*if (0) {*/ | ||
664 | /* <p1 / p2> == test (fail(p1)) -> L1 ; p1 ; jmp L2; L1: p2; L2: */ | ||
665 | int test = codetestset(compst, &cs1, 0); | ||
666 | int jmp = NOINST; | ||
667 | codegen(compst, p1, 0, test, fl); | ||
668 | if (!emptyp2) | ||
669 | jmp = addoffsetinst(compst, IJmp); | ||
670 | jumptohere(compst, test); | ||
671 | codegen(compst, p2, opt, NOINST, fl); | ||
672 | jumptohere(compst, jmp); | ||
673 | } | ||
674 | else if (opt && emptyp2) { | ||
675 | /* p1? == IPartialCommit; p1 */ | ||
676 | jumptohere(compst, addoffsetinst(compst, IPartialCommit)); | ||
677 | codegen(compst, p1, 1, NOINST, fullset); | ||
678 | } | ||
679 | else { | ||
680 | /* <p1 / p2> == | ||
681 | test(fail(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: */ | ||
682 | int pcommit; | ||
683 | int test = codetestset(compst, &cs1, e1); | ||
684 | int pchoice = addoffsetinst(compst, IChoice); | ||
685 | codegen(compst, p1, emptyp2, test, fullset); | ||
686 | pcommit = addoffsetinst(compst, ICommit); | ||
687 | jumptohere(compst, pchoice); | ||
688 | jumptohere(compst, test); | ||
689 | codegen(compst, p2, opt, NOINST, fl); | ||
690 | jumptohere(compst, pcommit); | ||
691 | } | ||
692 | } | ||
693 | |||
694 | /* labeled failure begin */ | ||
695 | static void codelabchoice (CompileState *compst, TTree *p1, TTree *p2, int opt, | ||
696 | const Charset *fl, Labelset ls) { | ||
697 | int emptyp2 = (p2->tag == TTrue); | ||
698 | int pcommit; | ||
699 | int test = NOINST; | ||
700 | int pchoice = addoffsetlabinst(compst, ILabChoice, ls); | ||
701 | codegen(compst, p1, emptyp2, test, fullset); | ||
702 | pcommit = addoffsetinst(compst, ICommit); | ||
703 | jumptohere(compst, pchoice); | ||
704 | jumptohere(compst, test); | ||
705 | codegen(compst, p2, opt, NOINST, fl); | ||
706 | jumptohere(compst, pcommit); | ||
707 | |||
708 | } | ||
709 | /* labeled failure end */ | ||
710 | |||
711 | /* | ||
712 | ** And predicate | ||
713 | ** optimization: fixedlen(p) = n ==> <&p> == <p>; behind n | ||
714 | ** (valid only when 'p' has no captures) | ||
715 | */ | ||
716 | static void codeand (CompileState *compst, TTree *tree, int tt) { | ||
717 | int n = fixedlen(tree); | ||
718 | if (n >= 0 && n <= MAXBEHIND && !hascaptures(tree)) { | ||
719 | codegen(compst, tree, 0, tt, fullset); | ||
720 | if (n > 0) | ||
721 | addinstruction(compst, IBehind, n); | ||
722 | } | ||
723 | else { /* default: Choice L1; p1; BackCommit L2; L1: Fail; L2: */ | ||
724 | int pcommit; | ||
725 | int pchoice = addoffsetinst(compst, IChoice); | ||
726 | codegen(compst, tree, 0, tt, fullset); | ||
727 | pcommit = addoffsetinst(compst, IBackCommit); | ||
728 | jumptohere(compst, pchoice); | ||
729 | addinstruction(compst, IFail, 0); | ||
730 | jumptohere(compst, pcommit); | ||
731 | } | ||
732 | } | ||
733 | |||
734 | |||
735 | /* | ||
736 | ** Captures: if pattern has fixed (and not too big) length, use | ||
737 | ** a single IFullCapture instruction after the match; otherwise, | ||
738 | ** enclose the pattern with OpenCapture - CloseCapture. | ||
739 | */ | ||
740 | static void codecapture (CompileState *compst, TTree *tree, int tt, | ||
741 | const Charset *fl) { | ||
742 | int len = fixedlen(sib1(tree)); | ||
743 | if (len >= 0 && len <= MAXOFF && !hascaptures(sib1(tree))) { | ||
744 | codegen(compst, sib1(tree), 0, tt, fl); | ||
745 | addinstcap(compst, IFullCapture, tree->cap, tree->key, len); | ||
746 | } | ||
747 | else { | ||
748 | addinstcap(compst, IOpenCapture, tree->cap, tree->key, 0); | ||
749 | codegen(compst, sib1(tree), 0, tt, fl); | ||
750 | addinstcap(compst, ICloseCapture, Cclose, 0, 0); | ||
751 | } | ||
752 | } | ||
753 | |||
754 | |||
755 | static void coderuntime (CompileState *compst, TTree *tree, int tt) { | ||
756 | addinstcap(compst, IOpenCapture, Cgroup, tree->key, 0); | ||
757 | codegen(compst, sib1(tree), 0, tt, fullset); | ||
758 | addinstcap(compst, ICloseRunTime, Cclose, 0, 0); | ||
759 | } | ||
760 | |||
761 | |||
762 | /* | ||
763 | ** Repetion; optimizations: | ||
764 | ** When pattern is a charset, can use special instruction ISpan. | ||
765 | ** When pattern is head fail, or if it starts with characters that | ||
766 | ** are disjoint from what follows the repetions, a simple test | ||
767 | ** is enough (a fail inside the repetition would backtrack to fail | ||
768 | ** again in the following pattern, so there is no need for a choice). | ||
769 | ** When 'opt' is true, the repetion can reuse the Choice already | ||
770 | ** active in the stack. | ||
771 | */ | ||
772 | static void coderep (CompileState *compst, TTree *tree, int opt, | ||
773 | const Charset *fl) { | ||
774 | Charset st; | ||
775 | if (tocharset(tree, &st)) { | ||
776 | addinstruction(compst, ISpan, 0); | ||
777 | addcharset(compst, st.cs); | ||
778 | } | ||
779 | else { | ||
780 | int e1 = getfirst(tree, fullset, &st); | ||
781 | if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) { | ||
782 | /* L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: */ | ||
783 | int jmp; | ||
784 | int test = codetestset(compst, &st, 0); | ||
785 | codegen(compst, tree, opt, test, fullset); | ||
786 | jmp = addoffsetinst(compst, IJmp); | ||
787 | jumptohere(compst, test); | ||
788 | jumptothere(compst, jmp, test); | ||
789 | } | ||
790 | else { | ||
791 | /* test(fail(p1)) -> L2; choice L2; L1: <p>; partialcommit L1; L2: */ | ||
792 | /* or (if 'opt'): partialcommit L1; L1: <p>; partialcommit L1; */ | ||
793 | int commit, l2; | ||
794 | int test = codetestset(compst, &st, e1); | ||
795 | int pchoice = NOINST; | ||
796 | if (opt) | ||
797 | jumptohere(compst, addoffsetinst(compst, IPartialCommit)); | ||
798 | else | ||
799 | pchoice = addoffsetinst(compst, IChoice); | ||
800 | l2 = gethere(compst); | ||
801 | codegen(compst, tree, 0, NOINST, fullset); | ||
802 | commit = addoffsetinst(compst, IPartialCommit); | ||
803 | jumptothere(compst, commit, l2); | ||
804 | jumptohere(compst, pchoice); | ||
805 | jumptohere(compst, test); | ||
806 | } | ||
807 | } | ||
808 | } | ||
809 | |||
810 | |||
811 | /* | ||
812 | ** Not predicate; optimizations: | ||
813 | ** In any case, if first test fails, 'not' succeeds, so it can jump to | ||
814 | ** the end. If pattern is headfail, that is all (it cannot fail | ||
815 | ** in other parts); this case includes 'not' of simple sets. Otherwise, | ||
816 | ** use the default code (a choice plus a failtwice). | ||
817 | */ | ||
818 | static void codenot (CompileState *compst, TTree *tree) { | ||
819 | Charset st; | ||
820 | int e = getfirst(tree, fullset, &st); | ||
821 | int test = codetestset(compst, &st, e); | ||
822 | if (headfail(tree)) /* test (fail(p1)) -> L1; fail; L1: */ | ||
823 | addinstruction(compst, IFail, 0); | ||
824 | else { | ||
825 | /* test(fail(p))-> L1; choice L1; <p>; failtwice; L1: */ | ||
826 | int pchoice = addoffsetinst(compst, IChoice); | ||
827 | codegen(compst, tree, 0, NOINST, fullset); | ||
828 | addinstruction(compst, IFailTwice, 0); | ||
829 | jumptohere(compst, pchoice); | ||
830 | } | ||
831 | jumptohere(compst, test); | ||
832 | } | ||
833 | |||
834 | |||
835 | /* | ||
836 | ** change open calls to calls, using list 'positions' to find | ||
837 | ** correct offsets; also optimize tail calls | ||
838 | */ | ||
839 | static void correctcalls (CompileState *compst, int *positions, | ||
840 | int from, int to) { | ||
841 | int i; | ||
842 | Instruction *code = compst->p->code; | ||
843 | for (i = from; i < to; i += sizei(&code[i])) { | ||
844 | if (code[i].i.code == IOpenCall) { | ||
845 | int n = code[i].i.key; /* rule number */ | ||
846 | int rule = positions[n]; /* rule position */ | ||
847 | assert(rule == from || code[rule - 1].i.code == IRet); | ||
848 | if (code[finaltarget(code, i + 2)].i.code == IRet) /* call; ret ? */ | ||
849 | code[i].i.code = IJmp; /* tail call */ | ||
850 | else | ||
851 | code[i].i.code = ICall; | ||
852 | jumptothere(compst, i, rule); /* call jumps to respective rule */ | ||
853 | } | ||
854 | } | ||
855 | assert(i == to); | ||
856 | } | ||
857 | |||
858 | |||
859 | /* | ||
860 | ** Code for a grammar: | ||
861 | ** call L1; jmp L2; L1: rule 1; ret; rule 2; ret; ...; L2: | ||
862 | */ | ||
863 | static void codegrammar (CompileState *compst, TTree *grammar) { | ||
864 | int positions[MAXRULES]; | ||
865 | int rulenumber = 0; | ||
866 | TTree *rule; | ||
867 | int firstcall = addoffsetinst(compst, ICall); /* call initial rule */ | ||
868 | int jumptoend = addoffsetinst(compst, IJmp); /* jump to the end */ | ||
869 | int start = gethere(compst); /* here starts the initial rule */ | ||
870 | jumptohere(compst, firstcall); | ||
871 | for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) { | ||
872 | positions[rulenumber++] = gethere(compst); /* save rule position */ | ||
873 | codegen(compst, sib1(rule), 0, NOINST, fullset); /* code rule */ | ||
874 | addinstruction(compst, IRet, 0); | ||
875 | } | ||
876 | assert(rule->tag == TTrue); | ||
877 | jumptohere(compst, jumptoend); | ||
878 | correctcalls(compst, positions, start, gethere(compst)); | ||
879 | } | ||
880 | |||
881 | |||
882 | static void codecall (CompileState *compst, TTree *call) { | ||
883 | int c = addoffsetinst(compst, IOpenCall); /* to be corrected later */ | ||
884 | getinstr(compst, c).i.key = sib2(call)->cap; /* rule number */ | ||
885 | assert(sib2(call)->tag == TRule); | ||
886 | } | ||
887 | |||
888 | |||
889 | /* | ||
890 | ** Code first child of a sequence | ||
891 | ** (second child is called in-place to allow tail call) | ||
892 | ** Return 'tt' for second child | ||
893 | */ | ||
894 | static int codeseq1 (CompileState *compst, TTree *p1, TTree *p2, | ||
895 | int tt, const Charset *fl) { | ||
896 | if (needfollow(p1)) { | ||
897 | Charset fl1; | ||
898 | getfirst(p2, fl, &fl1); /* p1 follow is p2 first */ | ||
899 | codegen(compst, p1, 0, tt, &fl1); | ||
900 | } | ||
901 | else /* use 'fullset' as follow */ | ||
902 | codegen(compst, p1, 0, tt, fullset); | ||
903 | if (fixedlen(p1) != 0) /* can 'p1' consume anything? */ | ||
904 | return NOINST; /* invalidate test */ | ||
905 | else return tt; /* else 'tt' still protects sib2 */ | ||
906 | } | ||
907 | |||
908 | |||
909 | /* | ||
910 | ** Main code-generation function: dispatch to auxiliar functions | ||
911 | ** according to kind of tree | ||
912 | */ | ||
913 | static void codegen (CompileState *compst, TTree *tree, int opt, int tt, | ||
914 | const Charset *fl) { | ||
915 | tailcall: | ||
916 | switch (tree->tag) { | ||
917 | case TChar: codechar(compst, tree->u.n, tt); break; | ||
918 | case TAny: addinstruction(compst, IAny, 0); break; | ||
919 | case TSet: codecharset(compst, treebuffer(tree), tt); break; | ||
920 | case TTrue: break; | ||
921 | case TFalse: addinstruction(compst, IFail, 0); break; | ||
922 | case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break; | ||
923 | case TRep: coderep(compst, sib1(tree), opt, fl); break; | ||
924 | case TBehind: codebehind(compst, tree); break; | ||
925 | case TNot: codenot(compst, sib1(tree)); break; | ||
926 | case TAnd: codeand(compst, sib1(tree), tt); break; | ||
927 | case TCapture: codecapture(compst, tree, tt, fl); break; | ||
928 | case TRunTime: coderuntime(compst, tree, tt); break; | ||
929 | case TGrammar: codegrammar(compst, tree); break; | ||
930 | case TCall: codecall(compst, tree); break; | ||
931 | case TSeq: { | ||
932 | tt = codeseq1(compst, sib1(tree), sib2(tree), tt, fl); /* code 'p1' */ | ||
933 | /* codegen(compst, p2, opt, tt, fl); */ | ||
934 | tree = sib2(tree); goto tailcall; | ||
935 | } | ||
936 | case TThrow: { /* labeled failure */ | ||
937 | addthrowinstruction(compst, tree->labels); | ||
938 | break; | ||
939 | } | ||
940 | case TLabChoice: { /* labeled failure */ | ||
941 | codelabchoice(compst, sib1(tree), sib2(tree), opt, fl, tree->labels); | ||
942 | break; | ||
943 | } | ||
944 | default: assert(0); | ||
945 | } | ||
946 | } | ||
947 | |||
948 | |||
949 | /* | ||
950 | ** Optimize jumps and other jump-like instructions. | ||
951 | ** * Update labels of instructions with labels to their final | ||
952 | ** destinations (e.g., choice L1; ... L1: jmp L2: becomes | ||
953 | ** choice L2) | ||
954 | ** * Jumps to other instructions that do jumps become those | ||
955 | ** instructions (e.g., jump to return becomes a return; jump | ||
956 | ** to commit becomes a commit) | ||
957 | */ | ||
958 | static void peephole (CompileState *compst) { | ||
959 | Instruction *code = compst->p->code; | ||
960 | int i; | ||
961 | for (i = 0; i < compst->ncode; i += sizei(&code[i])) { | ||
962 | switch (code[i].i.code) { | ||
963 | case IChoice: case ICall: case ICommit: case IPartialCommit: | ||
964 | case IBackCommit: case ITestChar: case ITestSet: case ILabChoice: /* labeled failure */ | ||
965 | case ITestAny: { /* instructions with labels */ | ||
966 | jumptothere(compst, i, finallabel(code, i)); /* optimize label */ | ||
967 | break; | ||
968 | } | ||
969 | case IJmp: { | ||
970 | int ft = finaltarget(code, i); | ||
971 | switch (code[ft].i.code) { /* jumping to what? */ | ||
972 | case IRet: case IFail: case IFailTwice: | ||
973 | case IEnd: { /* instructions with unconditional implicit jumps */ | ||
974 | code[i] = code[ft]; /* jump becomes that instruction */ | ||
975 | code[i + 1].i.code = IAny; /* 'no-op' for target position */ | ||
976 | break; | ||
977 | } | ||
978 | case ICommit: case IPartialCommit: | ||
979 | case IBackCommit: { /* inst. with unconditional explicit jumps */ | ||
980 | int fft = finallabel(code, ft); | ||
981 | code[i] = code[ft]; /* jump becomes that instruction... */ | ||
982 | jumptothere(compst, i, fft); /* but must correct its offset */ | ||
983 | i--; /* reoptimize its label */ | ||
984 | break; | ||
985 | } | ||
986 | default: { | ||
987 | jumptothere(compst, i, ft); /* optimize label */ | ||
988 | break; | ||
989 | } | ||
990 | } | ||
991 | break; | ||
992 | } | ||
993 | default: break; | ||
994 | } | ||
995 | } | ||
996 | assert(code[i - 1].i.code == IEnd); | ||
997 | } | ||
998 | |||
999 | |||
1000 | /* | ||
1001 | ** Compile a pattern | ||
1002 | */ | ||
1003 | Instruction *compile (lua_State *L, Pattern *p) { | ||
1004 | CompileState compst; | ||
1005 | compst.p = p; compst.ncode = 0; compst.L = L; | ||
1006 | reallocprog(L, p, 2); /* minimum initial size */ | ||
1007 | codegen(&compst, p->tree, 0, NOINST, fullset); | ||
1008 | addinstruction(&compst, IEnd, 0); | ||
1009 | reallocprog(L, p, compst.ncode); /* set final size */ | ||
1010 | peephole(&compst); /* labeled failure */ | ||
1011 | return p->code; | ||
1012 | } | ||
1013 | |||
1014 | |||
1015 | /* }====================================================== */ | ||
1016 | |||