diff options
author | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2019-02-20 10:13:46 -0300 |
---|---|---|
committer | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2019-02-20 10:13:46 -0300 |
commit | e08e5df853560de6482d84066a7accc6a18de545 (patch) | |
tree | ee19686bb35da90709a32ed24bf7855de1a3946a /lpcode.c | |
download | lpeg-e08e5df853560de6482d84066a7accc6a18de545.tar.gz lpeg-e08e5df853560de6482d84066a7accc6a18de545.tar.bz2 lpeg-e08e5df853560de6482d84066a7accc6a18de545.zip |
Fist version of LPeg on GIT
LPeg repository is being moved to git. Past versions won't be moved;
they are still available in RCS.
Diffstat (limited to 'lpcode.c')
-rw-r--r-- | lpcode.c | 1014 |
1 files changed, 1014 insertions, 0 deletions
diff --git a/lpcode.c b/lpcode.c new file mode 100644 index 0000000..2722d71 --- /dev/null +++ b/lpcode.c | |||
@@ -0,0 +1,1014 @@ | |||
1 | /* | ||
2 | ** $Id: lpcode.c,v 1.24 2016/09/15 17:46:13 roberto Exp $ | ||
3 | ** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license) | ||
4 | */ | ||
5 | |||
6 | #include <limits.h> | ||
7 | |||
8 | |||
9 | #include "lua.h" | ||
10 | #include "lauxlib.h" | ||
11 | |||
12 | #include "lptypes.h" | ||
13 | #include "lpcode.h" | ||
14 | |||
15 | |||
16 | /* signals a "no-instruction */ | ||
17 | #define NOINST -1 | ||
18 | |||
19 | |||
20 | |||
21 | static const Charset fullset_ = | ||
22 | {{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, | ||
23 | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, | ||
24 | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, | ||
25 | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}}; | ||
26 | |||
27 | static const Charset *fullset = &fullset_; | ||
28 | |||
29 | /* | ||
30 | ** {====================================================== | ||
31 | ** Analysis and some optimizations | ||
32 | ** ======================================================= | ||
33 | */ | ||
34 | |||
35 | /* | ||
36 | ** Check whether a charset is empty (returns IFail), singleton (IChar), | ||
37 | ** full (IAny), or none of those (ISet). When singleton, '*c' returns | ||
38 | ** which character it is. (When generic set, the set was the input, | ||
39 | ** so there is no need to return it.) | ||
40 | */ | ||
41 | static Opcode charsettype (const byte *cs, int *c) { | ||
42 | int count = 0; /* number of characters in the set */ | ||
43 | int i; | ||
44 | int candidate = -1; /* candidate position for the singleton char */ | ||
45 | for (i = 0; i < CHARSETSIZE; i++) { /* for each byte */ | ||
46 | int b = cs[i]; | ||
47 | if (b == 0) { /* is byte empty? */ | ||
48 | if (count > 1) /* was set neither empty nor singleton? */ | ||
49 | return ISet; /* neither full nor empty nor singleton */ | ||
50 | /* else set is still empty or singleton */ | ||
51 | } | ||
52 | else if (b == 0xFF) { /* is byte full? */ | ||
53 | if (count < (i * BITSPERCHAR)) /* was set not full? */ | ||
54 | return ISet; /* neither full nor empty nor singleton */ | ||
55 | else count += BITSPERCHAR; /* set is still full */ | ||
56 | } | ||
57 | else if ((b & (b - 1)) == 0) { /* has byte only one bit? */ | ||
58 | if (count > 0) /* was set not empty? */ | ||
59 | return ISet; /* neither full nor empty nor singleton */ | ||
60 | else { /* set has only one char till now; track it */ | ||
61 | count++; | ||
62 | candidate = i; | ||
63 | } | ||
64 | } | ||
65 | else return ISet; /* byte is neither empty, full, nor singleton */ | ||
66 | } | ||
67 | switch (count) { | ||
68 | case 0: return IFail; /* empty set */ | ||
69 | case 1: { /* singleton; find character bit inside byte */ | ||
70 | int b = cs[candidate]; | ||
71 | *c = candidate * BITSPERCHAR; | ||
72 | if ((b & 0xF0) != 0) { *c += 4; b >>= 4; } | ||
73 | if ((b & 0x0C) != 0) { *c += 2; b >>= 2; } | ||
74 | if ((b & 0x02) != 0) { *c += 1; } | ||
75 | return IChar; | ||
76 | } | ||
77 | default: { | ||
78 | assert(count == CHARSETSIZE * BITSPERCHAR); /* full set */ | ||
79 | return IAny; | ||
80 | } | ||
81 | } | ||
82 | } | ||
83 | |||
84 | |||
85 | /* | ||
86 | ** A few basic operations on Charsets | ||
87 | */ | ||
88 | static void cs_complement (Charset *cs) { | ||
89 | loopset(i, cs->cs[i] = ~cs->cs[i]); | ||
90 | } | ||
91 | |||
92 | static int cs_equal (const byte *cs1, const byte *cs2) { | ||
93 | loopset(i, if (cs1[i] != cs2[i]) return 0); | ||
94 | return 1; | ||
95 | } | ||
96 | |||
97 | static int cs_disjoint (const Charset *cs1, const Charset *cs2) { | ||
98 | loopset(i, if ((cs1->cs[i] & cs2->cs[i]) != 0) return 0;) | ||
99 | return 1; | ||
100 | } | ||
101 | |||
102 | |||
103 | /* | ||
104 | ** If 'tree' is a 'char' pattern (TSet, TChar, TAny), convert it into a | ||
105 | ** charset and return 1; else return 0. | ||
106 | */ | ||
107 | int tocharset (TTree *tree, Charset *cs) { | ||
108 | switch (tree->tag) { | ||
109 | case TSet: { /* copy set */ | ||
110 | loopset(i, cs->cs[i] = treebuffer(tree)[i]); | ||
111 | return 1; | ||
112 | } | ||
113 | case TChar: { /* only one char */ | ||
114 | assert(0 <= tree->u.n && tree->u.n <= UCHAR_MAX); | ||
115 | loopset(i, cs->cs[i] = 0); /* erase all chars */ | ||
116 | setchar(cs->cs, tree->u.n); /* add that one */ | ||
117 | return 1; | ||
118 | } | ||
119 | case TAny: { | ||
120 | loopset(i, cs->cs[i] = 0xFF); /* add all characters to the set */ | ||
121 | return 1; | ||
122 | } | ||
123 | default: return 0; | ||
124 | } | ||
125 | } | ||
126 | |||
127 | |||
128 | /* | ||
129 | ** Visit a TCall node taking care to stop recursion. If node not yet | ||
130 | ** visited, return 'f(sib2(tree))', otherwise return 'def' (default | ||
131 | ** value) | ||
132 | */ | ||
133 | static int callrecursive (TTree *tree, int f (TTree *t), int def) { | ||
134 | int key = tree->key; | ||
135 | assert(tree->tag == TCall); | ||
136 | assert(sib2(tree)->tag == TRule); | ||
137 | if (key == 0) /* node already visited? */ | ||
138 | return def; /* return default value */ | ||
139 | else { /* first visit */ | ||
140 | int result; | ||
141 | tree->key = 0; /* mark call as already visited */ | ||
142 | result = f(sib2(tree)); /* go to called rule */ | ||
143 | tree->key = key; /* restore tree */ | ||
144 | return result; | ||
145 | } | ||
146 | } | ||
147 | |||
148 | |||
149 | /* | ||
150 | ** Check whether a pattern tree has captures | ||
151 | */ | ||
152 | int hascaptures (TTree *tree) { | ||
153 | tailcall: | ||
154 | switch (tree->tag) { | ||
155 | case TCapture: case TRunTime: | ||
156 | return 1; | ||
157 | case TCall: | ||
158 | return callrecursive(tree, hascaptures, 0); | ||
159 | case TRule: /* do not follow siblings */ | ||
160 | tree = sib1(tree); goto tailcall; | ||
161 | case TOpenCall: assert(0); | ||
162 | default: { | ||
163 | switch (numsiblings[tree->tag]) { | ||
164 | case 1: /* return hascaptures(sib1(tree)); */ | ||
165 | tree = sib1(tree); goto tailcall; | ||
166 | case 2: | ||
167 | if (hascaptures(sib1(tree))) | ||
168 | return 1; | ||
169 | /* else return hascaptures(sib2(tree)); */ | ||
170 | tree = sib2(tree); goto tailcall; | ||
171 | default: assert(numsiblings[tree->tag] == 0); return 0; | ||
172 | } | ||
173 | } | ||
174 | } | ||
175 | } | ||
176 | |||
177 | |||
178 | /* | ||
179 | ** Checks how a pattern behaves regarding the empty string, | ||
180 | ** in one of two different ways: | ||
181 | ** A pattern is *nullable* if it can match without consuming any character; | ||
182 | ** A pattern is *nofail* if it never fails for any string | ||
183 | ** (including the empty string). | ||
184 | ** The difference is only for predicates and run-time captures; | ||
185 | ** for other patterns, the two properties are equivalent. | ||
186 | ** (With predicates, &'a' is nullable but not nofail. Of course, | ||
187 | ** nofail => nullable.) | ||
188 | ** These functions are all convervative in the following way: | ||
189 | ** p is nullable => nullable(p) | ||
190 | ** nofail(p) => p cannot fail | ||
191 | ** The function assumes that TOpenCall is not nullable; | ||
192 | ** this will be checked again when the grammar is fixed. | ||
193 | ** Run-time captures can do whatever they want, so the result | ||
194 | ** is conservative. | ||
195 | */ | ||
196 | int checkaux (TTree *tree, int pred) { | ||
197 | tailcall: | ||
198 | switch (tree->tag) { | ||
199 | case TChar: case TSet: case TAny: | ||
200 | case TFalse: case TOpenCall: | ||
201 | return 0; /* not nullable */ | ||
202 | case TRep: case TTrue: | ||
203 | return 1; /* no fail */ | ||
204 | case TNot: case TBehind: /* can match empty, but can fail */ | ||
205 | if (pred == PEnofail) return 0; | ||
206 | else return 1; /* PEnullable */ | ||
207 | case TAnd: /* can match empty; fail iff body does */ | ||
208 | if (pred == PEnullable) return 1; | ||
209 | /* else return checkaux(sib1(tree), pred); */ | ||
210 | tree = sib1(tree); goto tailcall; | ||
211 | case TRunTime: /* can fail; match empty iff body does */ | ||
212 | if (pred == PEnofail) return 0; | ||
213 | /* else return checkaux(sib1(tree), pred); */ | ||
214 | tree = sib1(tree); goto tailcall; | ||
215 | case TSeq: | ||
216 | if (!checkaux(sib1(tree), pred)) return 0; | ||
217 | /* else return checkaux(sib2(tree), pred); */ | ||
218 | tree = sib2(tree); goto tailcall; | ||
219 | case TChoice: | ||
220 | if (checkaux(sib2(tree), pred)) return 1; | ||
221 | /* else return checkaux(sib1(tree), pred); */ | ||
222 | tree = sib1(tree); goto tailcall; | ||
223 | case TCapture: case TGrammar: case TRule: | ||
224 | /* return checkaux(sib1(tree), pred); */ | ||
225 | tree = sib1(tree); goto tailcall; | ||
226 | case TCall: /* return checkaux(sib2(tree), pred); */ | ||
227 | tree = sib2(tree); goto tailcall; | ||
228 | default: assert(0); return 0; | ||
229 | } | ||
230 | } | ||
231 | |||
232 | |||
233 | /* | ||
234 | ** number of characters to match a pattern (or -1 if variable) | ||
235 | */ | ||
236 | int fixedlen (TTree *tree) { | ||
237 | int len = 0; /* to accumulate in tail calls */ | ||
238 | tailcall: | ||
239 | switch (tree->tag) { | ||
240 | case TChar: case TSet: case TAny: | ||
241 | return len + 1; | ||
242 | case TFalse: case TTrue: case TNot: case TAnd: case TBehind: | ||
243 | return len; | ||
244 | case TRep: case TRunTime: case TOpenCall: | ||
245 | return -1; | ||
246 | case TCapture: case TRule: case TGrammar: | ||
247 | /* return fixedlen(sib1(tree)); */ | ||
248 | tree = sib1(tree); goto tailcall; | ||
249 | case TCall: { | ||
250 | int n1 = callrecursive(tree, fixedlen, -1); | ||
251 | if (n1 < 0) | ||
252 | return -1; | ||
253 | else | ||
254 | return len + n1; | ||
255 | } | ||
256 | case TSeq: { | ||
257 | int n1 = fixedlen(sib1(tree)); | ||
258 | if (n1 < 0) | ||
259 | return -1; | ||
260 | /* else return fixedlen(sib2(tree)) + len; */ | ||
261 | len += n1; tree = sib2(tree); goto tailcall; | ||
262 | } | ||
263 | case TChoice: { | ||
264 | int n1 = fixedlen(sib1(tree)); | ||
265 | int n2 = fixedlen(sib2(tree)); | ||
266 | if (n1 != n2 || n1 < 0) | ||
267 | return -1; | ||
268 | else | ||
269 | return len + n1; | ||
270 | } | ||
271 | default: assert(0); return 0; | ||
272 | }; | ||
273 | } | ||
274 | |||
275 | |||
276 | /* | ||
277 | ** Computes the 'first set' of a pattern. | ||
278 | ** The result is a conservative aproximation: | ||
279 | ** match p ax -> x (for some x) ==> a belongs to first(p) | ||
280 | ** or | ||
281 | ** a not in first(p) ==> match p ax -> fail (for all x) | ||
282 | ** | ||
283 | ** The set 'follow' is the first set of what follows the | ||
284 | ** pattern (full set if nothing follows it). | ||
285 | ** | ||
286 | ** The function returns 0 when this resulting set can be used for | ||
287 | ** test instructions that avoid the pattern altogether. | ||
288 | ** A non-zero return can happen for two reasons: | ||
289 | ** 1) match p '' -> '' ==> return has bit 1 set | ||
290 | ** (tests cannot be used because they would always fail for an empty input); | ||
291 | ** 2) there is a match-time capture ==> return has bit 2 set | ||
292 | ** (optimizations should not bypass match-time captures). | ||
293 | */ | ||
294 | static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { | ||
295 | tailcall: | ||
296 | switch (tree->tag) { | ||
297 | case TChar: case TSet: case TAny: { | ||
298 | tocharset(tree, firstset); | ||
299 | return 0; | ||
300 | } | ||
301 | case TTrue: { | ||
302 | loopset(i, firstset->cs[i] = follow->cs[i]); | ||
303 | return 1; /* accepts the empty string */ | ||
304 | } | ||
305 | case TFalse: { | ||
306 | loopset(i, firstset->cs[i] = 0); | ||
307 | return 0; | ||
308 | } | ||
309 | case TChoice: { | ||
310 | Charset csaux; | ||
311 | int e1 = getfirst(sib1(tree), follow, firstset); | ||
312 | int e2 = getfirst(sib2(tree), follow, &csaux); | ||
313 | loopset(i, firstset->cs[i] |= csaux.cs[i]); | ||
314 | return e1 | e2; | ||
315 | } | ||
316 | case TSeq: { | ||
317 | if (!nullable(sib1(tree))) { | ||
318 | /* when p1 is not nullable, p2 has nothing to contribute; | ||
319 | return getfirst(sib1(tree), fullset, firstset); */ | ||
320 | tree = sib1(tree); follow = fullset; goto tailcall; | ||
321 | } | ||
322 | else { /* FIRST(p1 p2, fl) = FIRST(p1, FIRST(p2, fl)) */ | ||
323 | Charset csaux; | ||
324 | int e2 = getfirst(sib2(tree), follow, &csaux); | ||
325 | int e1 = getfirst(sib1(tree), &csaux, firstset); | ||
326 | if (e1 == 0) return 0; /* 'e1' ensures that first can be used */ | ||
327 | else if ((e1 | e2) & 2) /* one of the children has a matchtime? */ | ||
328 | return 2; /* pattern has a matchtime capture */ | ||
329 | else return e2; /* else depends on 'e2' */ | ||
330 | } | ||
331 | } | ||
332 | case TRep: { | ||
333 | getfirst(sib1(tree), follow, firstset); | ||
334 | loopset(i, firstset->cs[i] |= follow->cs[i]); | ||
335 | return 1; /* accept the empty string */ | ||
336 | } | ||
337 | case TCapture: case TGrammar: case TRule: { | ||
338 | /* return getfirst(sib1(tree), follow, firstset); */ | ||
339 | tree = sib1(tree); goto tailcall; | ||
340 | } | ||
341 | case TRunTime: { /* function invalidates any follow info. */ | ||
342 | int e = getfirst(sib1(tree), fullset, firstset); | ||
343 | if (e) return 2; /* function is not "protected"? */ | ||
344 | else return 0; /* pattern inside capture ensures first can be used */ | ||
345 | } | ||
346 | case TCall: { | ||
347 | /* return getfirst(sib2(tree), follow, firstset); */ | ||
348 | tree = sib2(tree); goto tailcall; | ||
349 | } | ||
350 | case TAnd: { | ||
351 | int e = getfirst(sib1(tree), follow, firstset); | ||
352 | loopset(i, firstset->cs[i] &= follow->cs[i]); | ||
353 | return e; | ||
354 | } | ||
355 | case TNot: { | ||
356 | if (tocharset(sib1(tree), firstset)) { | ||
357 | cs_complement(firstset); | ||
358 | return 1; | ||
359 | } | ||
360 | /* else go through */ | ||
361 | } | ||
362 | case TBehind: { /* instruction gives no new information */ | ||
363 | /* call 'getfirst' only to check for math-time captures */ | ||
364 | int e = getfirst(sib1(tree), follow, firstset); | ||
365 | loopset(i, firstset->cs[i] = follow->cs[i]); /* uses follow */ | ||
366 | return e | 1; /* always can accept the empty string */ | ||
367 | } | ||
368 | default: assert(0); return 0; | ||
369 | } | ||
370 | } | ||
371 | |||
372 | |||
373 | /* | ||
374 | ** If 'headfail(tree)' true, then 'tree' can fail only depending on the | ||
375 | ** next character of the subject. | ||
376 | */ | ||
377 | static int headfail (TTree *tree) { | ||
378 | tailcall: | ||
379 | switch (tree->tag) { | ||
380 | case TChar: case TSet: case TAny: case TFalse: | ||
381 | return 1; | ||
382 | case TTrue: case TRep: case TRunTime: case TNot: | ||
383 | case TBehind: | ||
384 | return 0; | ||
385 | case TCapture: case TGrammar: case TRule: case TAnd: | ||
386 | tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */ | ||
387 | case TCall: | ||
388 | tree = sib2(tree); goto tailcall; /* return headfail(sib2(tree)); */ | ||
389 | case TSeq: | ||
390 | if (!nofail(sib2(tree))) return 0; | ||
391 | /* else return headfail(sib1(tree)); */ | ||
392 | tree = sib1(tree); goto tailcall; | ||
393 | case TChoice: | ||
394 | if (!headfail(sib1(tree))) return 0; | ||
395 | /* else return headfail(sib2(tree)); */ | ||
396 | tree = sib2(tree); goto tailcall; | ||
397 | default: assert(0); return 0; | ||
398 | } | ||
399 | } | ||
400 | |||
401 | |||
402 | /* | ||
403 | ** Check whether the code generation for the given tree can benefit | ||
404 | ** from a follow set (to avoid computing the follow set when it is | ||
405 | ** not needed) | ||
406 | */ | ||
407 | static int needfollow (TTree *tree) { | ||
408 | tailcall: | ||
409 | switch (tree->tag) { | ||
410 | case TChar: case TSet: case TAny: | ||
411 | case TFalse: case TTrue: case TAnd: case TNot: | ||
412 | case TRunTime: case TGrammar: case TCall: case TBehind: | ||
413 | return 0; | ||
414 | case TChoice: case TRep: | ||
415 | return 1; | ||
416 | case TCapture: | ||
417 | tree = sib1(tree); goto tailcall; | ||
418 | case TSeq: | ||
419 | tree = sib2(tree); goto tailcall; | ||
420 | default: assert(0); return 0; | ||
421 | } | ||
422 | } | ||
423 | |||
424 | /* }====================================================== */ | ||
425 | |||
426 | |||
427 | |||
428 | /* | ||
429 | ** {====================================================== | ||
430 | ** Code generation | ||
431 | ** ======================================================= | ||
432 | */ | ||
433 | |||
434 | |||
435 | /* | ||
436 | ** size of an instruction | ||
437 | */ | ||
438 | int sizei (const Instruction *i) { | ||
439 | switch((Opcode)i->i.code) { | ||
440 | case ISet: case ISpan: return CHARSETINSTSIZE; | ||
441 | case ITestSet: return CHARSETINSTSIZE + 1; | ||
442 | case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall: | ||
443 | case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit: | ||
444 | return 2; | ||
445 | default: return 1; | ||
446 | } | ||
447 | } | ||
448 | |||
449 | |||
450 | /* | ||
451 | ** state for the compiler | ||
452 | */ | ||
453 | typedef struct CompileState { | ||
454 | Pattern *p; /* pattern being compiled */ | ||
455 | int ncode; /* next position in p->code to be filled */ | ||
456 | lua_State *L; | ||
457 | } CompileState; | ||
458 | |||
459 | |||
460 | /* | ||
461 | ** code generation is recursive; 'opt' indicates that the code is being | ||
462 | ** generated as the last thing inside an optional pattern (so, if that | ||
463 | ** code is optional too, it can reuse the 'IChoice' already in place for | ||
464 | ** the outer pattern). 'tt' points to a previous test protecting this | ||
465 | ** code (or NOINST). 'fl' is the follow set of the pattern. | ||
466 | */ | ||
467 | static void codegen (CompileState *compst, TTree *tree, int opt, int tt, | ||
468 | const Charset *fl); | ||
469 | |||
470 | |||
471 | void realloccode (lua_State *L, Pattern *p, int nsize) { | ||
472 | void *ud; | ||
473 | lua_Alloc f = lua_getallocf(L, &ud); | ||
474 | void *newblock = f(ud, p->code, p->codesize * sizeof(Instruction), | ||
475 | nsize * sizeof(Instruction)); | ||
476 | if (newblock == NULL && nsize > 0) | ||
477 | luaL_error(L, "not enough memory"); | ||
478 | p->code = (Instruction *)newblock; | ||
479 | p->codesize = nsize; | ||
480 | } | ||
481 | |||
482 | |||
483 | static int nextinstruction (CompileState *compst) { | ||
484 | int size = compst->p->codesize; | ||
485 | if (compst->ncode >= size) | ||
486 | realloccode(compst->L, compst->p, size * 2); | ||
487 | return compst->ncode++; | ||
488 | } | ||
489 | |||
490 | |||
491 | #define getinstr(cs,i) ((cs)->p->code[i]) | ||
492 | |||
493 | |||
494 | static int addinstruction (CompileState *compst, Opcode op, int aux) { | ||
495 | int i = nextinstruction(compst); | ||
496 | getinstr(compst, i).i.code = op; | ||
497 | getinstr(compst, i).i.aux = aux; | ||
498 | return i; | ||
499 | } | ||
500 | |||
501 | |||
502 | /* | ||
503 | ** Add an instruction followed by space for an offset (to be set later) | ||
504 | */ | ||
505 | static int addoffsetinst (CompileState *compst, Opcode op) { | ||
506 | int i = addinstruction(compst, op, 0); /* instruction */ | ||
507 | addinstruction(compst, (Opcode)0, 0); /* open space for offset */ | ||
508 | assert(op == ITestSet || sizei(&getinstr(compst, i)) == 2); | ||
509 | return i; | ||
510 | } | ||
511 | |||
512 | |||
513 | /* | ||
514 | ** Set the offset of an instruction | ||
515 | */ | ||
516 | static void setoffset (CompileState *compst, int instruction, int offset) { | ||
517 | getinstr(compst, instruction + 1).offset = offset; | ||
518 | } | ||
519 | |||
520 | |||
521 | /* | ||
522 | ** Add a capture instruction: | ||
523 | ** 'op' is the capture instruction; 'cap' the capture kind; | ||
524 | ** 'key' the key into ktable; 'aux' is the optional capture offset | ||
525 | ** | ||
526 | */ | ||
527 | static int addinstcap (CompileState *compst, Opcode op, int cap, int key, | ||
528 | int aux) { | ||
529 | int i = addinstruction(compst, op, joinkindoff(cap, aux)); | ||
530 | getinstr(compst, i).i.key = key; | ||
531 | return i; | ||
532 | } | ||
533 | |||
534 | |||
535 | #define gethere(compst) ((compst)->ncode) | ||
536 | |||
537 | #define target(code,i) ((i) + code[i + 1].offset) | ||
538 | |||
539 | |||
540 | /* | ||
541 | ** Patch 'instruction' to jump to 'target' | ||
542 | */ | ||
543 | static void jumptothere (CompileState *compst, int instruction, int target) { | ||
544 | if (instruction >= 0) | ||
545 | setoffset(compst, instruction, target - instruction); | ||
546 | } | ||
547 | |||
548 | |||
549 | /* | ||
550 | ** Patch 'instruction' to jump to current position | ||
551 | */ | ||
552 | static void jumptohere (CompileState *compst, int instruction) { | ||
553 | jumptothere(compst, instruction, gethere(compst)); | ||
554 | } | ||
555 | |||
556 | |||
557 | /* | ||
558 | ** Code an IChar instruction, or IAny if there is an equivalent | ||
559 | ** test dominating it | ||
560 | */ | ||
561 | static void codechar (CompileState *compst, int c, int tt) { | ||
562 | if (tt >= 0 && getinstr(compst, tt).i.code == ITestChar && | ||
563 | getinstr(compst, tt).i.aux == c) | ||
564 | addinstruction(compst, IAny, 0); | ||
565 | else | ||
566 | addinstruction(compst, IChar, c); | ||
567 | } | ||
568 | |||
569 | |||
570 | /* | ||
571 | ** Add a charset posfix to an instruction | ||
572 | */ | ||
573 | static void addcharset (CompileState *compst, const byte *cs) { | ||
574 | int p = gethere(compst); | ||
575 | int i; | ||
576 | for (i = 0; i < (int)CHARSETINSTSIZE - 1; i++) | ||
577 | nextinstruction(compst); /* space for buffer */ | ||
578 | /* fill buffer with charset */ | ||
579 | loopset(j, getinstr(compst, p).buff[j] = cs[j]); | ||
580 | } | ||
581 | |||
582 | |||
583 | /* | ||
584 | ** code a char set, optimizing unit sets for IChar, "complete" | ||
585 | ** sets for IAny, and empty sets for IFail; also use an IAny | ||
586 | ** when instruction is dominated by an equivalent test. | ||
587 | */ | ||
588 | static void codecharset (CompileState *compst, const byte *cs, int tt) { | ||
589 | int c = 0; /* (=) to avoid warnings */ | ||
590 | Opcode op = charsettype(cs, &c); | ||
591 | switch (op) { | ||
592 | case IChar: codechar(compst, c, tt); break; | ||
593 | case ISet: { /* non-trivial set? */ | ||
594 | if (tt >= 0 && getinstr(compst, tt).i.code == ITestSet && | ||
595 | cs_equal(cs, getinstr(compst, tt + 2).buff)) | ||
596 | addinstruction(compst, IAny, 0); | ||
597 | else { | ||
598 | addinstruction(compst, ISet, 0); | ||
599 | addcharset(compst, cs); | ||
600 | } | ||
601 | break; | ||
602 | } | ||
603 | default: addinstruction(compst, op, c); break; | ||
604 | } | ||
605 | } | ||
606 | |||
607 | |||
608 | /* | ||
609 | ** code a test set, optimizing unit sets for ITestChar, "complete" | ||
610 | ** sets for ITestAny, and empty sets for IJmp (always fails). | ||
611 | ** 'e' is true iff test should accept the empty string. (Test | ||
612 | ** instructions in the current VM never accept the empty string.) | ||
613 | */ | ||
614 | static int codetestset (CompileState *compst, Charset *cs, int e) { | ||
615 | if (e) return NOINST; /* no test */ | ||
616 | else { | ||
617 | int c = 0; | ||
618 | Opcode op = charsettype(cs->cs, &c); | ||
619 | switch (op) { | ||
620 | case IFail: return addoffsetinst(compst, IJmp); /* always jump */ | ||
621 | case IAny: return addoffsetinst(compst, ITestAny); | ||
622 | case IChar: { | ||
623 | int i = addoffsetinst(compst, ITestChar); | ||
624 | getinstr(compst, i).i.aux = c; | ||
625 | return i; | ||
626 | } | ||
627 | case ISet: { | ||
628 | int i = addoffsetinst(compst, ITestSet); | ||
629 | addcharset(compst, cs->cs); | ||
630 | return i; | ||
631 | } | ||
632 | default: assert(0); return 0; | ||
633 | } | ||
634 | } | ||
635 | } | ||
636 | |||
637 | |||
638 | /* | ||
639 | ** Find the final destination of a sequence of jumps | ||
640 | */ | ||
641 | static int finaltarget (Instruction *code, int i) { | ||
642 | while (code[i].i.code == IJmp) | ||
643 | i = target(code, i); | ||
644 | return i; | ||
645 | } | ||
646 | |||
647 | |||
648 | /* | ||
649 | ** final label (after traversing any jumps) | ||
650 | */ | ||
651 | static int finallabel (Instruction *code, int i) { | ||
652 | return finaltarget(code, target(code, i)); | ||
653 | } | ||
654 | |||
655 | |||
656 | /* | ||
657 | ** <behind(p)> == behind n; <p> (where n = fixedlen(p)) | ||
658 | */ | ||
659 | static void codebehind (CompileState *compst, TTree *tree) { | ||
660 | if (tree->u.n > 0) | ||
661 | addinstruction(compst, IBehind, tree->u.n); | ||
662 | codegen(compst, sib1(tree), 0, NOINST, fullset); | ||
663 | } | ||
664 | |||
665 | |||
666 | /* | ||
667 | ** Choice; optimizations: | ||
668 | ** - when p1 is headfail or | ||
669 | ** when first(p1) and first(p2) are disjoint, than | ||
670 | ** a character not in first(p1) cannot go to p1, and a character | ||
671 | ** in first(p1) cannot go to p2 (at it is not in first(p2)). | ||
672 | ** (The optimization is not valid if p1 accepts the empty string, | ||
673 | ** as then there is no character at all...) | ||
674 | ** - when p2 is empty and opt is true; a IPartialCommit can reuse | ||
675 | ** the Choice already active in the stack. | ||
676 | */ | ||
677 | static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt, | ||
678 | const Charset *fl) { | ||
679 | int emptyp2 = (p2->tag == TTrue); | ||
680 | Charset cs1, cs2; | ||
681 | int e1 = getfirst(p1, fullset, &cs1); | ||
682 | if (headfail(p1) || | ||
683 | (!e1 && (getfirst(p2, fl, &cs2), cs_disjoint(&cs1, &cs2)))) { | ||
684 | /* <p1 / p2> == test (fail(p1)) -> L1 ; p1 ; jmp L2; L1: p2; L2: */ | ||
685 | int test = codetestset(compst, &cs1, 0); | ||
686 | int jmp = NOINST; | ||
687 | codegen(compst, p1, 0, test, fl); | ||
688 | if (!emptyp2) | ||
689 | jmp = addoffsetinst(compst, IJmp); | ||
690 | jumptohere(compst, test); | ||
691 | codegen(compst, p2, opt, NOINST, fl); | ||
692 | jumptohere(compst, jmp); | ||
693 | } | ||
694 | else if (opt && emptyp2) { | ||
695 | /* p1? == IPartialCommit; p1 */ | ||
696 | jumptohere(compst, addoffsetinst(compst, IPartialCommit)); | ||
697 | codegen(compst, p1, 1, NOINST, fullset); | ||
698 | } | ||
699 | else { | ||
700 | /* <p1 / p2> == | ||
701 | test(first(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: */ | ||
702 | int pcommit; | ||
703 | int test = codetestset(compst, &cs1, e1); | ||
704 | int pchoice = addoffsetinst(compst, IChoice); | ||
705 | codegen(compst, p1, emptyp2, test, fullset); | ||
706 | pcommit = addoffsetinst(compst, ICommit); | ||
707 | jumptohere(compst, pchoice); | ||
708 | jumptohere(compst, test); | ||
709 | codegen(compst, p2, opt, NOINST, fl); | ||
710 | jumptohere(compst, pcommit); | ||
711 | } | ||
712 | } | ||
713 | |||
714 | |||
715 | /* | ||
716 | ** And predicate | ||
717 | ** optimization: fixedlen(p) = n ==> <&p> == <p>; behind n | ||
718 | ** (valid only when 'p' has no captures) | ||
719 | */ | ||
720 | static void codeand (CompileState *compst, TTree *tree, int tt) { | ||
721 | int n = fixedlen(tree); | ||
722 | if (n >= 0 && n <= MAXBEHIND && !hascaptures(tree)) { | ||
723 | codegen(compst, tree, 0, tt, fullset); | ||
724 | if (n > 0) | ||
725 | addinstruction(compst, IBehind, n); | ||
726 | } | ||
727 | else { /* default: Choice L1; p1; BackCommit L2; L1: Fail; L2: */ | ||
728 | int pcommit; | ||
729 | int pchoice = addoffsetinst(compst, IChoice); | ||
730 | codegen(compst, tree, 0, tt, fullset); | ||
731 | pcommit = addoffsetinst(compst, IBackCommit); | ||
732 | jumptohere(compst, pchoice); | ||
733 | addinstruction(compst, IFail, 0); | ||
734 | jumptohere(compst, pcommit); | ||
735 | } | ||
736 | } | ||
737 | |||
738 | |||
739 | /* | ||
740 | ** Captures: if pattern has fixed (and not too big) length, and it | ||
741 | ** has no nested captures, use a single IFullCapture instruction | ||
742 | ** after the match; otherwise, enclose the pattern with OpenCapture - | ||
743 | ** CloseCapture. | ||
744 | */ | ||
745 | static void codecapture (CompileState *compst, TTree *tree, int tt, | ||
746 | const Charset *fl) { | ||
747 | int len = fixedlen(sib1(tree)); | ||
748 | if (len >= 0 && len <= MAXOFF && !hascaptures(sib1(tree))) { | ||
749 | codegen(compst, sib1(tree), 0, tt, fl); | ||
750 | addinstcap(compst, IFullCapture, tree->cap, tree->key, len); | ||
751 | } | ||
752 | else { | ||
753 | addinstcap(compst, IOpenCapture, tree->cap, tree->key, 0); | ||
754 | codegen(compst, sib1(tree), 0, tt, fl); | ||
755 | addinstcap(compst, ICloseCapture, Cclose, 0, 0); | ||
756 | } | ||
757 | } | ||
758 | |||
759 | |||
760 | static void coderuntime (CompileState *compst, TTree *tree, int tt) { | ||
761 | addinstcap(compst, IOpenCapture, Cgroup, tree->key, 0); | ||
762 | codegen(compst, sib1(tree), 0, tt, fullset); | ||
763 | addinstcap(compst, ICloseRunTime, Cclose, 0, 0); | ||
764 | } | ||
765 | |||
766 | |||
767 | /* | ||
768 | ** Repetion; optimizations: | ||
769 | ** When pattern is a charset, can use special instruction ISpan. | ||
770 | ** When pattern is head fail, or if it starts with characters that | ||
771 | ** are disjoint from what follows the repetions, a simple test | ||
772 | ** is enough (a fail inside the repetition would backtrack to fail | ||
773 | ** again in the following pattern, so there is no need for a choice). | ||
774 | ** When 'opt' is true, the repetion can reuse the Choice already | ||
775 | ** active in the stack. | ||
776 | */ | ||
777 | static void coderep (CompileState *compst, TTree *tree, int opt, | ||
778 | const Charset *fl) { | ||
779 | Charset st; | ||
780 | if (tocharset(tree, &st)) { | ||
781 | addinstruction(compst, ISpan, 0); | ||
782 | addcharset(compst, st.cs); | ||
783 | } | ||
784 | else { | ||
785 | int e1 = getfirst(tree, fullset, &st); | ||
786 | if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) { | ||
787 | /* L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: */ | ||
788 | int jmp; | ||
789 | int test = codetestset(compst, &st, 0); | ||
790 | codegen(compst, tree, 0, test, fullset); | ||
791 | jmp = addoffsetinst(compst, IJmp); | ||
792 | jumptohere(compst, test); | ||
793 | jumptothere(compst, jmp, test); | ||
794 | } | ||
795 | else { | ||
796 | /* test(fail(p1)) -> L2; choice L2; L1: <p>; partialcommit L1; L2: */ | ||
797 | /* or (if 'opt'): partialcommit L1; L1: <p>; partialcommit L1; */ | ||
798 | int commit, l2; | ||
799 | int test = codetestset(compst, &st, e1); | ||
800 | int pchoice = NOINST; | ||
801 | if (opt) | ||
802 | jumptohere(compst, addoffsetinst(compst, IPartialCommit)); | ||
803 | else | ||
804 | pchoice = addoffsetinst(compst, IChoice); | ||
805 | l2 = gethere(compst); | ||
806 | codegen(compst, tree, 0, NOINST, fullset); | ||
807 | commit = addoffsetinst(compst, IPartialCommit); | ||
808 | jumptothere(compst, commit, l2); | ||
809 | jumptohere(compst, pchoice); | ||
810 | jumptohere(compst, test); | ||
811 | } | ||
812 | } | ||
813 | } | ||
814 | |||
815 | |||
816 | /* | ||
817 | ** Not predicate; optimizations: | ||
818 | ** In any case, if first test fails, 'not' succeeds, so it can jump to | ||
819 | ** the end. If pattern is headfail, that is all (it cannot fail | ||
820 | ** in other parts); this case includes 'not' of simple sets. Otherwise, | ||
821 | ** use the default code (a choice plus a failtwice). | ||
822 | */ | ||
823 | static void codenot (CompileState *compst, TTree *tree) { | ||
824 | Charset st; | ||
825 | int e = getfirst(tree, fullset, &st); | ||
826 | int test = codetestset(compst, &st, e); | ||
827 | if (headfail(tree)) /* test (fail(p1)) -> L1; fail; L1: */ | ||
828 | addinstruction(compst, IFail, 0); | ||
829 | else { | ||
830 | /* test(fail(p))-> L1; choice L1; <p>; failtwice; L1: */ | ||
831 | int pchoice = addoffsetinst(compst, IChoice); | ||
832 | codegen(compst, tree, 0, NOINST, fullset); | ||
833 | addinstruction(compst, IFailTwice, 0); | ||
834 | jumptohere(compst, pchoice); | ||
835 | } | ||
836 | jumptohere(compst, test); | ||
837 | } | ||
838 | |||
839 | |||
840 | /* | ||
841 | ** change open calls to calls, using list 'positions' to find | ||
842 | ** correct offsets; also optimize tail calls | ||
843 | */ | ||
844 | static void correctcalls (CompileState *compst, int *positions, | ||
845 | int from, int to) { | ||
846 | int i; | ||
847 | Instruction *code = compst->p->code; | ||
848 | for (i = from; i < to; i += sizei(&code[i])) { | ||
849 | if (code[i].i.code == IOpenCall) { | ||
850 | int n = code[i].i.key; /* rule number */ | ||
851 | int rule = positions[n]; /* rule position */ | ||
852 | assert(rule == from || code[rule - 1].i.code == IRet); | ||
853 | if (code[finaltarget(code, i + 2)].i.code == IRet) /* call; ret ? */ | ||
854 | code[i].i.code = IJmp; /* tail call */ | ||
855 | else | ||
856 | code[i].i.code = ICall; | ||
857 | jumptothere(compst, i, rule); /* call jumps to respective rule */ | ||
858 | } | ||
859 | } | ||
860 | assert(i == to); | ||
861 | } | ||
862 | |||
863 | |||
864 | /* | ||
865 | ** Code for a grammar: | ||
866 | ** call L1; jmp L2; L1: rule 1; ret; rule 2; ret; ...; L2: | ||
867 | */ | ||
868 | static void codegrammar (CompileState *compst, TTree *grammar) { | ||
869 | int positions[MAXRULES]; | ||
870 | int rulenumber = 0; | ||
871 | TTree *rule; | ||
872 | int firstcall = addoffsetinst(compst, ICall); /* call initial rule */ | ||
873 | int jumptoend = addoffsetinst(compst, IJmp); /* jump to the end */ | ||
874 | int start = gethere(compst); /* here starts the initial rule */ | ||
875 | jumptohere(compst, firstcall); | ||
876 | for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) { | ||
877 | positions[rulenumber++] = gethere(compst); /* save rule position */ | ||
878 | codegen(compst, sib1(rule), 0, NOINST, fullset); /* code rule */ | ||
879 | addinstruction(compst, IRet, 0); | ||
880 | } | ||
881 | assert(rule->tag == TTrue); | ||
882 | jumptohere(compst, jumptoend); | ||
883 | correctcalls(compst, positions, start, gethere(compst)); | ||
884 | } | ||
885 | |||
886 | |||
887 | static void codecall (CompileState *compst, TTree *call) { | ||
888 | int c = addoffsetinst(compst, IOpenCall); /* to be corrected later */ | ||
889 | getinstr(compst, c).i.key = sib2(call)->cap; /* rule number */ | ||
890 | assert(sib2(call)->tag == TRule); | ||
891 | } | ||
892 | |||
893 | |||
894 | /* | ||
895 | ** Code first child of a sequence | ||
896 | ** (second child is called in-place to allow tail call) | ||
897 | ** Return 'tt' for second child | ||
898 | */ | ||
899 | static int codeseq1 (CompileState *compst, TTree *p1, TTree *p2, | ||
900 | int tt, const Charset *fl) { | ||
901 | if (needfollow(p1)) { | ||
902 | Charset fl1; | ||
903 | getfirst(p2, fl, &fl1); /* p1 follow is p2 first */ | ||
904 | codegen(compst, p1, 0, tt, &fl1); | ||
905 | } | ||
906 | else /* use 'fullset' as follow */ | ||
907 | codegen(compst, p1, 0, tt, fullset); | ||
908 | if (fixedlen(p1) != 0) /* can 'p1' consume anything? */ | ||
909 | return NOINST; /* invalidate test */ | ||
910 | else return tt; /* else 'tt' still protects sib2 */ | ||
911 | } | ||
912 | |||
913 | |||
914 | /* | ||
915 | ** Main code-generation function: dispatch to auxiliar functions | ||
916 | ** according to kind of tree. ('needfollow' should return true | ||
917 | ** only for consructions that use 'fl'.) | ||
918 | */ | ||
919 | static void codegen (CompileState *compst, TTree *tree, int opt, int tt, | ||
920 | const Charset *fl) { | ||
921 | tailcall: | ||
922 | switch (tree->tag) { | ||
923 | case TChar: codechar(compst, tree->u.n, tt); break; | ||
924 | case TAny: addinstruction(compst, IAny, 0); break; | ||
925 | case TSet: codecharset(compst, treebuffer(tree), tt); break; | ||
926 | case TTrue: break; | ||
927 | case TFalse: addinstruction(compst, IFail, 0); break; | ||
928 | case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break; | ||
929 | case TRep: coderep(compst, sib1(tree), opt, fl); break; | ||
930 | case TBehind: codebehind(compst, tree); break; | ||
931 | case TNot: codenot(compst, sib1(tree)); break; | ||
932 | case TAnd: codeand(compst, sib1(tree), tt); break; | ||
933 | case TCapture: codecapture(compst, tree, tt, fl); break; | ||
934 | case TRunTime: coderuntime(compst, tree, tt); break; | ||
935 | case TGrammar: codegrammar(compst, tree); break; | ||
936 | case TCall: codecall(compst, tree); break; | ||
937 | case TSeq: { | ||
938 | tt = codeseq1(compst, sib1(tree), sib2(tree), tt, fl); /* code 'p1' */ | ||
939 | /* codegen(compst, p2, opt, tt, fl); */ | ||
940 | tree = sib2(tree); goto tailcall; | ||
941 | } | ||
942 | default: assert(0); | ||
943 | } | ||
944 | } | ||
945 | |||
946 | |||
947 | /* | ||
948 | ** Optimize jumps and other jump-like instructions. | ||
949 | ** * Update labels of instructions with labels to their final | ||
950 | ** destinations (e.g., choice L1; ... L1: jmp L2: becomes | ||
951 | ** choice L2) | ||
952 | ** * Jumps to other instructions that do jumps become those | ||
953 | ** instructions (e.g., jump to return becomes a return; jump | ||
954 | ** to commit becomes a commit) | ||
955 | */ | ||
956 | static void peephole (CompileState *compst) { | ||
957 | Instruction *code = compst->p->code; | ||
958 | int i; | ||
959 | for (i = 0; i < compst->ncode; i += sizei(&code[i])) { | ||
960 | redo: | ||
961 | switch (code[i].i.code) { | ||
962 | case IChoice: case ICall: case ICommit: case IPartialCommit: | ||
963 | case IBackCommit: case ITestChar: case ITestSet: | ||
964 | case ITestAny: { /* instructions with labels */ | ||
965 | jumptothere(compst, i, finallabel(code, i)); /* optimize label */ | ||
966 | break; | ||
967 | } | ||
968 | case IJmp: { | ||
969 | int ft = finaltarget(code, i); | ||
970 | switch (code[ft].i.code) { /* jumping to what? */ | ||
971 | case IRet: case IFail: case IFailTwice: | ||
972 | case IEnd: { /* instructions with unconditional implicit jumps */ | ||
973 | code[i] = code[ft]; /* jump becomes that instruction */ | ||
974 | code[i + 1].i.code = IAny; /* 'no-op' for target position */ | ||
975 | break; | ||
976 | } | ||
977 | case ICommit: case IPartialCommit: | ||
978 | case IBackCommit: { /* inst. with unconditional explicit jumps */ | ||
979 | int fft = finallabel(code, ft); | ||
980 | code[i] = code[ft]; /* jump becomes that instruction... */ | ||
981 | jumptothere(compst, i, fft); /* but must correct its offset */ | ||
982 | goto redo; /* reoptimize its label */ | ||
983 | } | ||
984 | default: { | ||
985 | jumptothere(compst, i, ft); /* optimize label */ | ||
986 | break; | ||
987 | } | ||
988 | } | ||
989 | break; | ||
990 | } | ||
991 | default: break; | ||
992 | } | ||
993 | } | ||
994 | assert(code[i - 1].i.code == IEnd); | ||
995 | } | ||
996 | |||
997 | |||
998 | /* | ||
999 | ** Compile a pattern | ||
1000 | */ | ||
1001 | Instruction *compile (lua_State *L, Pattern *p) { | ||
1002 | CompileState compst; | ||
1003 | compst.p = p; compst.ncode = 0; compst.L = L; | ||
1004 | realloccode(L, p, 2); /* minimum initial size */ | ||
1005 | codegen(&compst, p->tree, 0, NOINST, fullset); | ||
1006 | addinstruction(&compst, IEnd, 0); | ||
1007 | realloccode(L, p, compst.ncode); /* set final size */ | ||
1008 | peephole(&compst); | ||
1009 | return p->code; | ||
1010 | } | ||
1011 | |||
1012 | |||
1013 | /* }====================================================== */ | ||
1014 | |||