diff options
author | Sergio Medeiros <sqmedeiros@gmail.com> | 2014-10-29 18:13:38 -0300 |
---|---|---|
committer | Sergio Medeiros <sqmedeiros@gmail.com> | 2014-10-29 18:13:38 -0300 |
commit | 8d30a0ff8a8584e225c03d878a9add439ea193a3 (patch) | |
tree | cdc03907837ebec1aed26be7290a5a40d00f3e2c | |
parent | 3af55803f0a261dd9b2ccf85a7532288c4a270ae (diff) | |
download | lpeglabel-8d30a0ff8a8584e225c03d878a9add439ea193a3.tar.gz lpeglabel-8d30a0ff8a8584e225c03d878a9add439ea193a3.tar.bz2 lpeglabel-8d30a0ff8a8584e225c03d878a9add439ea193a3.zip |
Creating the git repository with the current implementation.
-rw-r--r-- | lpcap.c | 537 | ||||
-rw-r--r-- | lpcap.h | 43 | ||||
-rw-r--r-- | lpcode.c | 1016 | ||||
-rw-r--r-- | lpcode.h | 34 | ||||
-rw-r--r-- | lpprint.c | 255 | ||||
-rw-r--r-- | lpprint.h | 37 | ||||
-rw-r--r-- | lptree.c | 1282 | ||||
-rw-r--r-- | lptree.h | 79 | ||||
-rw-r--r-- | lptypes.h | 158 | ||||
-rw-r--r-- | lpvm.c | 391 | ||||
-rw-r--r-- | lpvm.h | 66 | ||||
-rw-r--r-- | makefile | 55 | ||||
-rw-r--r-- | re.lua | 276 | ||||
-rwxr-xr-x | test.lua | 1386 | ||||
-rw-r--r-- | testlabel.lua | 422 |
15 files changed, 6037 insertions, 0 deletions
@@ -0,0 +1,537 @@ | |||
1 | /* | ||
2 | ** $Id: lpcap.c,v 1.4 2013/03/21 20:25:12 roberto Exp $ | ||
3 | ** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license) | ||
4 | */ | ||
5 | |||
6 | #include "lua.h" | ||
7 | #include "lauxlib.h" | ||
8 | |||
9 | #include "lpcap.h" | ||
10 | #include "lptypes.h" | ||
11 | |||
12 | |||
/* kind of a capture entry */
#define captype(cap)	((cap)->kind)

/* true when the entry closes a previously opened capture */
#define isclosecap(cap)	((cap)->kind == Cclose)

/* end address of a full capture ('siz' holds its size plus one) */
#define closeaddr(c)	((c)->s + (c)->siz - 1)

/* true when the entry is a full (self-contained) capture */
#define isfullcap(cap)	((cap)->siz != 0)

/* push onto the Lua stack the value at index 'v' of the pattern's ktable */
#define getfromktable(cs,v)	lua_rawgeti((cs)->L, ktableidx((cs)->ptop), v)

/* push the ktable value referenced by the current capture */
#define pushluaval(cs)		lua_rawgeti((cs)->L, ktableidx((cs)->ptop), (cs)->cap->idx)
24 | |||
25 | |||
26 | |||
27 | /* | ||
28 | ** Put at the cache for Lua values the value indexed by 'v' in ktable | ||
29 | ** of the running pattern (if it is not there yet); returns its index. | ||
30 | */ | ||
31 | static int updatecache (CapState *cs, int v) { | ||
32 | int idx = cs->ptop + 1; /* stack index of cache for Lua values */ | ||
33 | if (v != cs->valuecached) { /* not there? */ | ||
34 | getfromktable(cs, v); /* get value from 'ktable' */ | ||
35 | lua_replace(cs->L, idx); /* put it at reserved stack position */ | ||
36 | cs->valuecached = v; /* keep track of what is there */ | ||
37 | } | ||
38 | return idx; | ||
39 | } | ||
40 | |||
41 | |||
42 | static int pushcapture (CapState *cs); | ||
43 | |||
44 | |||
45 | /* | ||
46 | ** Goes back in a list of captures looking for an open capture | ||
47 | ** corresponding to a close | ||
48 | */ | ||
49 | static Capture *findopen (Capture *cap) { | ||
50 | int n = 0; /* number of closes waiting an open */ | ||
51 | for (;;) { | ||
52 | cap--; | ||
53 | if (isclosecap(cap)) n++; /* one more open to skip */ | ||
54 | else if (!isfullcap(cap)) | ||
55 | if (n-- == 0) return cap; | ||
56 | } | ||
57 | } | ||
58 | |||
59 | |||
60 | /* | ||
61 | ** Go to the next capture | ||
62 | */ | ||
63 | static void nextcap (CapState *cs) { | ||
64 | Capture *cap = cs->cap; | ||
65 | if (!isfullcap(cap)) { /* not a single capture? */ | ||
66 | int n = 0; /* number of opens waiting a close */ | ||
67 | for (;;) { /* look for corresponding close */ | ||
68 | cap++; | ||
69 | if (isclosecap(cap)) { | ||
70 | if (n-- == 0) break; | ||
71 | } | ||
72 | else if (!isfullcap(cap)) n++; | ||
73 | } | ||
74 | } | ||
75 | cs->cap = cap + 1; /* + 1 to skip last close (or entire single capture) */ | ||
76 | } | ||
77 | |||
78 | |||
79 | /* | ||
80 | ** Push on the Lua stack all values generated by nested captures inside | ||
81 | ** the current capture. Returns number of values pushed. 'addextra' | ||
82 | ** makes it push the entire match after all captured values. The | ||
83 | ** entire match is pushed also if there are no other nested values, | ||
84 | ** so the function never returns zero. | ||
85 | */ | ||
86 | static int pushnestedvalues (CapState *cs, int addextra) { | ||
87 | Capture *co = cs->cap; | ||
88 | if (isfullcap(cs->cap++)) { /* no nested captures? */ | ||
89 | lua_pushlstring(cs->L, co->s, co->siz - 1); /* push whole match */ | ||
90 | return 1; /* that is it */ | ||
91 | } | ||
92 | else { | ||
93 | int n = 0; | ||
94 | while (!isclosecap(cs->cap)) /* repeat for all nested patterns */ | ||
95 | n += pushcapture(cs); | ||
96 | if (addextra || n == 0) { /* need extra? */ | ||
97 | lua_pushlstring(cs->L, co->s, cs->cap->s - co->s); /* push whole match */ | ||
98 | n++; | ||
99 | } | ||
100 | cs->cap++; /* skip close entry */ | ||
101 | return n; | ||
102 | } | ||
103 | } | ||
104 | |||
105 | |||
106 | /* | ||
107 | ** Push only the first value generated by nested captures | ||
108 | */ | ||
109 | static void pushonenestedvalue (CapState *cs) { | ||
110 | int n = pushnestedvalues(cs, 0); | ||
111 | if (n > 1) | ||
112 | lua_pop(cs->L, n - 1); /* pop extra values */ | ||
113 | } | ||
114 | |||
115 | |||
116 | /* | ||
117 | ** Try to find a named group capture with the name given at the top of | ||
118 | ** the stack; goes backward from 'cap'. | ||
119 | */ | ||
120 | static Capture *findback (CapState *cs, Capture *cap) { | ||
121 | lua_State *L = cs->L; | ||
122 | while (cap-- > cs->ocap) { /* repeat until end of list */ | ||
123 | if (isclosecap(cap)) | ||
124 | cap = findopen(cap); /* skip nested captures */ | ||
125 | else if (!isfullcap(cap)) | ||
126 | continue; /* opening an enclosing capture: skip and get previous */ | ||
127 | if (captype(cap) == Cgroup) { | ||
128 | getfromktable(cs, cap->idx); /* get group name */ | ||
129 | if (lua_equal(L, -2, -1)) { /* right group? */ | ||
130 | lua_pop(L, 2); /* remove reference name and group name */ | ||
131 | return cap; | ||
132 | } | ||
133 | else lua_pop(L, 1); /* remove group name */ | ||
134 | } | ||
135 | } | ||
136 | luaL_error(L, "back reference '%s' not found", lua_tostring(L, -1)); | ||
137 | return NULL; /* to avoid warnings */ | ||
138 | } | ||
139 | |||
140 | |||
141 | /* | ||
142 | ** Back-reference capture. Return number of values pushed. | ||
143 | */ | ||
144 | static int backrefcap (CapState *cs) { | ||
145 | int n; | ||
146 | Capture *curr = cs->cap; | ||
147 | pushluaval(cs); /* reference name */ | ||
148 | cs->cap = findback(cs, curr); /* find corresponding group */ | ||
149 | n = pushnestedvalues(cs, 0); /* push group's values */ | ||
150 | cs->cap = curr + 1; | ||
151 | return n; | ||
152 | } | ||
153 | |||
154 | |||
155 | /* | ||
156 | ** Table capture: creates a new table and populates it with nested | ||
157 | ** captures. | ||
158 | */ | ||
159 | static int tablecap (CapState *cs) { | ||
160 | lua_State *L = cs->L; | ||
161 | int n = 0; | ||
162 | lua_newtable(L); | ||
163 | if (isfullcap(cs->cap++)) | ||
164 | return 1; /* table is empty */ | ||
165 | while (!isclosecap(cs->cap)) { | ||
166 | if (captype(cs->cap) == Cgroup && cs->cap->idx != 0) { /* named group? */ | ||
167 | pushluaval(cs); /* push group name */ | ||
168 | pushonenestedvalue(cs); | ||
169 | lua_settable(L, -3); | ||
170 | } | ||
171 | else { /* not a named group */ | ||
172 | int i; | ||
173 | int k = pushcapture(cs); | ||
174 | for (i = k; i > 0; i--) /* store all values into table */ | ||
175 | lua_rawseti(L, -(i + 1), n + i); | ||
176 | n += k; | ||
177 | } | ||
178 | } | ||
179 | cs->cap++; /* skip close entry */ | ||
180 | return 1; /* number of values pushed (only the table) */ | ||
181 | } | ||
182 | |||
183 | |||
184 | /* | ||
185 | ** Table-query capture | ||
186 | */ | ||
187 | static int querycap (CapState *cs) { | ||
188 | int idx = cs->cap->idx; | ||
189 | pushonenestedvalue(cs); /* get nested capture */ | ||
190 | lua_gettable(cs->L, updatecache(cs, idx)); /* query cap. value at table */ | ||
191 | if (!lua_isnil(cs->L, -1)) | ||
192 | return 1; | ||
193 | else { /* no value */ | ||
194 | lua_pop(cs->L, 1); /* remove nil */ | ||
195 | return 0; | ||
196 | } | ||
197 | } | ||
198 | |||
199 | |||
200 | /* | ||
201 | ** Fold capture | ||
202 | */ | ||
203 | static int foldcap (CapState *cs) { | ||
204 | int n; | ||
205 | lua_State *L = cs->L; | ||
206 | int idx = cs->cap->idx; | ||
207 | if (isfullcap(cs->cap++) || /* no nested captures? */ | ||
208 | isclosecap(cs->cap) || /* no nested captures (large subject)? */ | ||
209 | (n = pushcapture(cs)) == 0) /* nested captures with no values? */ | ||
210 | return luaL_error(L, "no initial value for fold capture"); | ||
211 | if (n > 1) | ||
212 | lua_pop(L, n - 1); /* leave only one result for accumulator */ | ||
213 | while (!isclosecap(cs->cap)) { | ||
214 | lua_pushvalue(L, updatecache(cs, idx)); /* get folding function */ | ||
215 | lua_insert(L, -2); /* put it before accumulator */ | ||
216 | n = pushcapture(cs); /* get next capture's values */ | ||
217 | lua_call(L, n + 1, 1); /* call folding function */ | ||
218 | } | ||
219 | cs->cap++; /* skip close entry */ | ||
220 | return 1; /* only accumulator left on the stack */ | ||
221 | } | ||
222 | |||
223 | |||
224 | /* | ||
225 | ** Function capture | ||
226 | */ | ||
227 | static int functioncap (CapState *cs) { | ||
228 | int n; | ||
229 | int top = lua_gettop(cs->L); | ||
230 | pushluaval(cs); /* push function */ | ||
231 | n = pushnestedvalues(cs, 0); /* push nested captures */ | ||
232 | lua_call(cs->L, n, LUA_MULTRET); /* call function */ | ||
233 | return lua_gettop(cs->L) - top; /* return function's results */ | ||
234 | } | ||
235 | |||
236 | |||
237 | /* | ||
238 | ** Select capture | ||
239 | */ | ||
240 | static int numcap (CapState *cs) { | ||
241 | int idx = cs->cap->idx; /* value to select */ | ||
242 | if (idx == 0) { /* no values? */ | ||
243 | nextcap(cs); /* skip entire capture */ | ||
244 | return 0; /* no value produced */ | ||
245 | } | ||
246 | else { | ||
247 | int n = pushnestedvalues(cs, 0); | ||
248 | if (n < idx) /* invalid index? */ | ||
249 | return luaL_error(cs->L, "no capture '%d'", idx); | ||
250 | else { | ||
251 | lua_pushvalue(cs->L, -(n - idx + 1)); /* get selected capture */ | ||
252 | lua_replace(cs->L, -(n + 1)); /* put it in place of 1st capture */ | ||
253 | lua_pop(cs->L, n - 1); /* remove other captures */ | ||
254 | return 1; | ||
255 | } | ||
256 | } | ||
257 | } | ||
258 | |||
259 | |||
260 | /* | ||
261 | ** Return the stack index of the first runtime capture in the given | ||
262 | ** list of captures (or zero if no runtime captures) | ||
263 | */ | ||
264 | int finddyncap (Capture *cap, Capture *last) { | ||
265 | for (; cap < last; cap++) { | ||
266 | if (cap->kind == Cruntime) | ||
267 | return cap->idx; /* stack position of first capture */ | ||
268 | } | ||
269 | return 0; /* no dynamic captures in this segment */ | ||
270 | } | ||
271 | |||
272 | |||
273 | /* | ||
274 | ** Calls a runtime capture. Returns number of captures removed by | ||
275 | ** the call, including the initial Cgroup. (Captures to be added are | ||
276 | ** on the Lua stack.) | ||
277 | */ | ||
278 | int runtimecap (CapState *cs, Capture *close, const char *s, int *rem) { | ||
279 | int n, id; | ||
280 | lua_State *L = cs->L; | ||
281 | int otop = lua_gettop(L); | ||
282 | Capture *open = findopen(close); | ||
283 | assert(captype(open) == Cgroup); | ||
284 | id = finddyncap(open, close); /* get first dynamic capture argument */ | ||
285 | close->kind = Cclose; /* closes the group */ | ||
286 | close->s = s; | ||
287 | cs->cap = open; cs->valuecached = 0; /* prepare capture state */ | ||
288 | luaL_checkstack(L, 4, "too many runtime captures"); | ||
289 | pushluaval(cs); /* push function to be called */ | ||
290 | lua_pushvalue(L, SUBJIDX); /* push original subject */ | ||
291 | lua_pushinteger(L, s - cs->s + 1); /* push current position */ | ||
292 | n = pushnestedvalues(cs, 0); /* push nested captures */ | ||
293 | lua_call(L, n + 2, LUA_MULTRET); /* call dynamic function */ | ||
294 | if (id > 0) { /* are there old dynamic captures to be removed? */ | ||
295 | int i; | ||
296 | for (i = id; i <= otop; i++) | ||
297 | lua_remove(L, id); /* remove old dynamic captures */ | ||
298 | *rem = otop - id + 1; /* total number of dynamic captures removed */ | ||
299 | } | ||
300 | else | ||
301 | *rem = 0; /* no dynamic captures removed */ | ||
302 | return close - open; /* number of captures of all kinds removed */ | ||
303 | } | ||
304 | |||
305 | |||
306 | /* | ||
307 | ** Auxiliary structure for substitution and string captures: keep | ||
308 | ** information about nested captures for future use, avoiding to push | ||
309 | ** string results into Lua | ||
310 | */ | ||
311 | typedef struct StrAux { | ||
312 | int isstring; /* whether capture is a string */ | ||
313 | union { | ||
314 | Capture *cp; /* if not a string, respective capture */ | ||
315 | struct { /* if it is a string... */ | ||
316 | const char *s; /* ... starts here */ | ||
317 | const char *e; /* ... ends here */ | ||
318 | } s; | ||
319 | } u; | ||
320 | } StrAux; | ||
321 | |||
322 | #define MAXSTRCAPS 10 | ||
323 | |||
324 | /* | ||
325 | ** Collect values from current capture into array 'cps'. Current | ||
326 | ** capture must be Cstring (first call) or Csimple (recursive calls). | ||
327 | ** (In first call, fills %0 with whole match for Cstring.) | ||
328 | ** Returns number of elements in the array that were filled. | ||
329 | */ | ||
330 | static int getstrcaps (CapState *cs, StrAux *cps, int n) { | ||
331 | int k = n++; | ||
332 | cps[k].isstring = 1; /* get string value */ | ||
333 | cps[k].u.s.s = cs->cap->s; /* starts here */ | ||
334 | if (!isfullcap(cs->cap++)) { /* nested captures? */ | ||
335 | while (!isclosecap(cs->cap)) { /* traverse them */ | ||
336 | if (n >= MAXSTRCAPS) /* too many captures? */ | ||
337 | nextcap(cs); /* skip extra captures (will not need them) */ | ||
338 | else if (captype(cs->cap) == Csimple) /* string? */ | ||
339 | n = getstrcaps(cs, cps, n); /* put info. into array */ | ||
340 | else { | ||
341 | cps[n].isstring = 0; /* not a string */ | ||
342 | cps[n].u.cp = cs->cap; /* keep original capture */ | ||
343 | nextcap(cs); | ||
344 | n++; | ||
345 | } | ||
346 | } | ||
347 | cs->cap++; /* skip close */ | ||
348 | } | ||
349 | cps[k].u.s.e = closeaddr(cs->cap - 1); /* ends here */ | ||
350 | return n; | ||
351 | } | ||
352 | |||
353 | |||
354 | /* | ||
355 | ** add next capture value (which should be a string) to buffer 'b' | ||
356 | */ | ||
357 | static int addonestring (luaL_Buffer *b, CapState *cs, const char *what); | ||
358 | |||
359 | |||
360 | /* | ||
361 | ** String capture: add result to buffer 'b' (instead of pushing | ||
362 | ** it into the stack) | ||
363 | */ | ||
364 | static void stringcap (luaL_Buffer *b, CapState *cs) { | ||
365 | StrAux cps[MAXSTRCAPS]; | ||
366 | int n; | ||
367 | size_t len, i; | ||
368 | const char *fmt; /* format string */ | ||
369 | fmt = lua_tolstring(cs->L, updatecache(cs, cs->cap->idx), &len); | ||
370 | n = getstrcaps(cs, cps, 0) - 1; /* collect nested captures */ | ||
371 | for (i = 0; i < len; i++) { /* traverse them */ | ||
372 | if (fmt[i] != '%') /* not an escape? */ | ||
373 | luaL_addchar(b, fmt[i]); /* add it to buffer */ | ||
374 | else if (fmt[++i] < '0' || fmt[i] > '9') /* not followed by a digit? */ | ||
375 | luaL_addchar(b, fmt[i]); /* add to buffer */ | ||
376 | else { | ||
377 | int l = fmt[i] - '0'; /* capture index */ | ||
378 | if (l > n) | ||
379 | luaL_error(cs->L, "invalid capture index (%d)", l); | ||
380 | else if (cps[l].isstring) | ||
381 | luaL_addlstring(b, cps[l].u.s.s, cps[l].u.s.e - cps[l].u.s.s); | ||
382 | else { | ||
383 | Capture *curr = cs->cap; | ||
384 | cs->cap = cps[l].u.cp; /* go back to evaluate that nested capture */ | ||
385 | if (!addonestring(b, cs, "capture")) | ||
386 | luaL_error(cs->L, "no values in capture index %d", l); | ||
387 | cs->cap = curr; /* continue from where it stopped */ | ||
388 | } | ||
389 | } | ||
390 | } | ||
391 | } | ||
392 | |||
393 | |||
394 | /* | ||
395 | ** Substitution capture: add result to buffer 'b' | ||
396 | */ | ||
397 | static void substcap (luaL_Buffer *b, CapState *cs) { | ||
398 | const char *curr = cs->cap->s; | ||
399 | if (isfullcap(cs->cap)) /* no nested captures? */ | ||
400 | luaL_addlstring(b, curr, cs->cap->siz - 1); /* keep original text */ | ||
401 | else { | ||
402 | cs->cap++; /* skip open entry */ | ||
403 | while (!isclosecap(cs->cap)) { /* traverse nested captures */ | ||
404 | const char *next = cs->cap->s; | ||
405 | luaL_addlstring(b, curr, next - curr); /* add text up to capture */ | ||
406 | if (addonestring(b, cs, "replacement")) | ||
407 | curr = closeaddr(cs->cap - 1); /* continue after match */ | ||
408 | else /* no capture value */ | ||
409 | curr = next; /* keep original text in final result */ | ||
410 | } | ||
411 | luaL_addlstring(b, curr, cs->cap->s - curr); /* add last piece of text */ | ||
412 | } | ||
413 | cs->cap++; /* go to next capture */ | ||
414 | } | ||
415 | |||
416 | |||
417 | /* | ||
418 | ** Evaluates a capture and adds its first value to buffer 'b'; returns | ||
419 | ** whether there was a value | ||
420 | */ | ||
421 | static int addonestring (luaL_Buffer *b, CapState *cs, const char *what) { | ||
422 | switch (captype(cs->cap)) { | ||
423 | case Cstring: | ||
424 | stringcap(b, cs); /* add capture directly to buffer */ | ||
425 | return 1; | ||
426 | case Csubst: | ||
427 | substcap(b, cs); /* add capture directly to buffer */ | ||
428 | return 1; | ||
429 | default: { | ||
430 | lua_State *L = cs->L; | ||
431 | int n = pushcapture(cs); | ||
432 | if (n > 0) { | ||
433 | if (n > 1) lua_pop(L, n - 1); /* only one result */ | ||
434 | if (!lua_isstring(L, -1)) | ||
435 | luaL_error(L, "invalid %s value (a %s)", what, luaL_typename(L, -1)); | ||
436 | luaL_addvalue(b); | ||
437 | } | ||
438 | return n; | ||
439 | } | ||
440 | } | ||
441 | } | ||
442 | |||
443 | |||
444 | /* | ||
445 | ** Push all values of the current capture into the stack; returns | ||
446 | ** number of values pushed | ||
447 | */ | ||
448 | static int pushcapture (CapState *cs) { | ||
449 | lua_State *L = cs->L; | ||
450 | luaL_checkstack(L, 4, "too many captures"); | ||
451 | switch (captype(cs->cap)) { | ||
452 | case Cposition: { | ||
453 | lua_pushinteger(L, cs->cap->s - cs->s + 1); | ||
454 | cs->cap++; | ||
455 | return 1; | ||
456 | } | ||
457 | case Cconst: { | ||
458 | pushluaval(cs); | ||
459 | cs->cap++; | ||
460 | return 1; | ||
461 | } | ||
462 | case Carg: { | ||
463 | int arg = (cs->cap++)->idx; | ||
464 | if (arg + FIXEDARGS > cs->ptop) | ||
465 | return luaL_error(L, "reference to absent argument #%d", arg); | ||
466 | lua_pushvalue(L, arg + FIXEDARGS); | ||
467 | return 1; | ||
468 | } | ||
469 | case Csimple: { | ||
470 | int k = pushnestedvalues(cs, 1); | ||
471 | lua_insert(L, -k); /* make whole match be first result */ | ||
472 | return k; | ||
473 | } | ||
474 | case Cruntime: { | ||
475 | lua_pushvalue(L, (cs->cap++)->idx); /* value is in the stack */ | ||
476 | return 1; | ||
477 | } | ||
478 | case Cstring: { | ||
479 | luaL_Buffer b; | ||
480 | luaL_buffinit(L, &b); | ||
481 | stringcap(&b, cs); | ||
482 | luaL_pushresult(&b); | ||
483 | return 1; | ||
484 | } | ||
485 | case Csubst: { | ||
486 | luaL_Buffer b; | ||
487 | luaL_buffinit(L, &b); | ||
488 | substcap(&b, cs); | ||
489 | luaL_pushresult(&b); | ||
490 | return 1; | ||
491 | } | ||
492 | case Cgroup: { | ||
493 | if (cs->cap->idx == 0) /* anonymous group? */ | ||
494 | return pushnestedvalues(cs, 0); /* add all nested values */ | ||
495 | else { /* named group: add no values */ | ||
496 | nextcap(cs); /* skip capture */ | ||
497 | return 0; | ||
498 | } | ||
499 | } | ||
500 | case Cbackref: return backrefcap(cs); | ||
501 | case Ctable: return tablecap(cs); | ||
502 | case Cfunction: return functioncap(cs); | ||
503 | case Cnum: return numcap(cs); | ||
504 | case Cquery: return querycap(cs); | ||
505 | case Cfold: return foldcap(cs); | ||
506 | default: assert(0); return 0; | ||
507 | } | ||
508 | } | ||
509 | |||
510 | |||
511 | /* | ||
512 | ** Prepare a CapState structure and traverse the entire list of | ||
513 | ** captures in the stack pushing its results. 's' is the subject | ||
514 | ** string, 'r' is the final position of the match, and 'ptop' | ||
515 | ** the index in the stack where some useful values were pushed. | ||
516 | ** Returns the number of results pushed. (If the list produces no | ||
517 | ** results, push the final position of the match.) | ||
518 | */ | ||
519 | int getcaptures (lua_State *L, const char *s, const char *r, int ptop) { | ||
520 | Capture *capture = (Capture *)lua_touserdata(L, caplistidx(ptop)); | ||
521 | int n = 0; | ||
522 | if (!isclosecap(capture)) { /* is there any capture? */ | ||
523 | CapState cs; | ||
524 | cs.ocap = cs.cap = capture; cs.L = L; | ||
525 | cs.s = s; cs.valuecached = 0; cs.ptop = ptop; | ||
526 | do { /* collect their values */ | ||
527 | n += pushcapture(&cs); | ||
528 | } while (!isclosecap(cs.cap)); | ||
529 | } | ||
530 | if (n == 0) { /* no capture values? */ | ||
531 | lua_pushinteger(L, r - s + 1); /* return only end position */ | ||
532 | n = 1; | ||
533 | } | ||
534 | return n; | ||
535 | } | ||
536 | |||
537 | |||
@@ -0,0 +1,43 @@ | |||
1 | /* | ||
2 | ** $Id: lpcap.h,v 1.1 2013/03/21 20:25:12 roberto Exp $ | ||
3 | */ | ||
4 | |||
5 | #if !defined(lpcap_h) | ||
6 | #define lpcap_h | ||
7 | |||
8 | |||
9 | #include "lptypes.h" | ||
10 | |||
11 | |||
/* kinds of captures (the order of the enumerators must be kept) */
typedef enum CapKind {
  Cclose,     /* closes a previously opened capture */
  Cposition,  /* position in the subject */
  Cconst,     /* constant value from the ktable */
  Cbackref,   /* back reference to a named group */
  Carg,       /* extra argument given to the match */
  Csimple,    /* whole match followed by nested values */
  Ctable,     /* table with the nested values */
  Cfunction,  /* function applied to the nested values */
  Cquery,     /* table indexed by the first nested value */
  Cstring,    /* format string filled with nested values */
  Cnum,       /* selection of one nested value */
  Csubst,     /* match with nested captures substituted */
  Cfold,      /* fold of the nested values */
  Cruntime,   /* value kept in the Lua stack */
  Cgroup      /* group of nested values, possibly named */
} CapKind;
17 | |||
18 | |||
19 | typedef struct Capture { | ||
20 | const char *s; /* subject position */ | ||
21 | short idx; /* extra info about capture (group name, arg index, etc.) */ | ||
22 | byte kind; /* kind of capture */ | ||
23 | byte siz; /* size of full capture + 1 (0 = not a full capture) */ | ||
24 | } Capture; | ||
25 | |||
26 | |||
27 | typedef struct CapState { | ||
28 | Capture *cap; /* current capture */ | ||
29 | Capture *ocap; /* (original) capture list */ | ||
30 | lua_State *L; | ||
31 | int ptop; /* index of last argument to 'match' */ | ||
32 | const char *s; /* original string */ | ||
33 | int valuecached; /* value stored in cache slot */ | ||
34 | } CapState; | ||
35 | |||
36 | |||
37 | int runtimecap (CapState *cs, Capture *close, const char *s, int *rem); | ||
38 | int getcaptures (lua_State *L, const char *s, const char *r, int ptop); | ||
39 | int finddyncap (Capture *cap, Capture *last); | ||
40 | |||
41 | #endif | ||
42 | |||
43 | |||
diff --git a/lpcode.c b/lpcode.c new file mode 100644 index 0000000..4a49e7d --- /dev/null +++ b/lpcode.c | |||
@@ -0,0 +1,1016 @@ | |||
1 | /* | ||
2 | ** $Id: lpcode.c,v 1.18 2013/04/12 16:30:33 roberto Exp $ | ||
3 | ** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license) | ||
4 | */ | ||
5 | |||
6 | #include <limits.h> | ||
7 | |||
8 | |||
9 | #include "lua.h" | ||
10 | #include "lauxlib.h" | ||
11 | |||
12 | #include "lptypes.h" | ||
13 | #include "lpcode.h" | ||
14 | |||
15 | |||
16 | /* signals a "no-instruction" */ | ||
17 | #define NOINST -1 | ||
18 | |||
19 | |||
20 | |||
21 | static const Charset fullset_ = | ||
22 | {{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, | ||
23 | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, | ||
24 | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, | ||
25 | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}}; | ||
26 | |||
27 | static const Charset *fullset = &fullset_; | ||
28 | |||
29 | /* | ||
30 | ** {====================================================== | ||
31 | ** Analysis and some optimizations | ||
32 | ** ======================================================= | ||
33 | */ | ||
34 | |||
35 | /* | ||
36 | ** Check whether a charset is empty (IFail), singleton (IChar), | ||
37 | ** full (IAny), or none of those (ISet). | ||
38 | */ | ||
39 | static Opcode charsettype (const byte *cs, int *c) { | ||
40 | int count = 0; | ||
41 | int i; | ||
42 | int candidate = -1; /* candidate position for a char */ | ||
43 | for (i = 0; i < CHARSETSIZE; i++) { | ||
44 | int b = cs[i]; | ||
45 | if (b == 0) { | ||
46 | if (count > 1) return ISet; /* else set is still empty */ | ||
47 | } | ||
48 | else if (b == 0xFF) { | ||
49 | if (count < (i * BITSPERCHAR)) | ||
50 | return ISet; | ||
51 | else count += BITSPERCHAR; /* set is still full */ | ||
52 | } | ||
53 | else if ((b & (b - 1)) == 0) { /* byte has only one bit? */ | ||
54 | if (count > 0) | ||
55 | return ISet; /* set is neither full nor empty */ | ||
56 | else { /* set has only one char till now; track it */ | ||
57 | count++; | ||
58 | candidate = i; | ||
59 | } | ||
60 | } | ||
61 | else return ISet; /* byte is neither empty, full, nor singleton */ | ||
62 | } | ||
63 | switch (count) { | ||
64 | case 0: return IFail; /* empty set */ | ||
65 | case 1: { /* singleton; find character bit inside byte */ | ||
66 | int b = cs[candidate]; | ||
67 | *c = candidate * BITSPERCHAR; | ||
68 | if ((b & 0xF0) != 0) { *c += 4; b >>= 4; } | ||
69 | if ((b & 0x0C) != 0) { *c += 2; b >>= 2; } | ||
70 | if ((b & 0x02) != 0) { *c += 1; } | ||
71 | return IChar; | ||
72 | } | ||
73 | default: { | ||
74 | assert(count == CHARSETSIZE * BITSPERCHAR); /* full set */ | ||
75 | return IAny; | ||
76 | } | ||
77 | } | ||
78 | } | ||
79 | |||
80 | /* | ||
81 | ** A few basic operations on Charsets | ||
82 | */ | ||
83 | static void cs_complement (Charset *cs) { | ||
84 | loopset(i, cs->cs[i] = ~cs->cs[i]); | ||
85 | } | ||
86 | |||
87 | |||
88 | static int cs_equal (const byte *cs1, const byte *cs2) { | ||
89 | loopset(i, if (cs1[i] != cs2[i]) return 0); | ||
90 | return 1; | ||
91 | } | ||
92 | |||
93 | |||
94 | /* | ||
95 | ** computes whether sets cs1 and cs2 are disjoint | ||
96 | */ | ||
97 | static int cs_disjoint (const Charset *cs1, const Charset *cs2) { | ||
98 | loopset(i, if ((cs1->cs[i] & cs2->cs[i]) != 0) return 0;) | ||
99 | return 1; | ||
100 | } | ||
101 | |||
102 | |||
103 | /* | ||
104 | ** Convert a 'char' pattern (TSet, TChar, TAny) to a charset | ||
105 | */ | ||
106 | int tocharset (TTree *tree, Charset *cs) { | ||
107 | switch (tree->tag) { | ||
108 | case TSet: { /* copy set */ | ||
109 | loopset(i, cs->cs[i] = treebuffer(tree)[i]); | ||
110 | return 1; | ||
111 | } | ||
112 | case TChar: { /* only one char */ | ||
113 | assert(0 <= tree->u.n && tree->u.n <= UCHAR_MAX); | ||
114 | loopset(i, cs->cs[i] = 0); /* erase all chars */ | ||
115 | setchar(cs->cs, tree->u.n); /* add that one */ | ||
116 | return 1; | ||
117 | } | ||
118 | case TAny: { | ||
119 | loopset(i, cs->cs[i] = 0xFF); /* add all to the set */ | ||
120 | return 1; | ||
121 | } | ||
122 | default: return 0; | ||
123 | } | ||
124 | } | ||
125 | |||
126 | |||
127 | /* | ||
128 | ** Checks whether a pattern has captures | ||
129 | */ | ||
130 | int hascaptures (TTree *tree) { | ||
131 | tailcall: | ||
132 | switch (tree->tag) { | ||
133 | case TCapture: case TRunTime: | ||
134 | return 1; | ||
135 | default: { | ||
136 | switch (numsiblings[tree->tag]) { | ||
137 | case 1: /* return hascaptures(sib1(tree)); */ | ||
138 | tree = sib1(tree); goto tailcall; | ||
139 | case 2: | ||
140 | if (hascaptures(sib1(tree))) return 1; | ||
141 | /* else return hascaptures(sib2(tree)); */ | ||
142 | tree = sib2(tree); goto tailcall; | ||
143 | default: assert(numsiblings[tree->tag] == 0); return 0; | ||
144 | } | ||
145 | } | ||
146 | } | ||
147 | } | ||
148 | |||
149 | |||
150 | /* | ||
151 | ** Checks how a pattern behaves regarding the empty string, | ||
152 | ** in one of two different ways: | ||
153 | ** A pattern is *nullable* if it can match without consuming any character; | ||
154 | ** A pattern is *nofail* if it never fails for any string | ||
155 | ** (including the empty string). | ||
156 | ** The difference is only for predicates and run-time captures; | ||
157 | ** for other patterns, the two properties are equivalent. | ||
158 | ** (With predicates, &'a' is nullable but not nofail. Of course, | ||
159 | ** nofail => nullable.) | ||
160 | ** These functions are all convervative in the following way: | ||
161 | ** p is nullable => nullable(p) | ||
162 | ** nofail(p) => p cannot fail | ||
163 | ** The function assumes that TOpenCall is not nullable; | ||
164 | ** this will be checked again when the grammar is fixed.) | ||
165 | ** Run-time captures can do whatever they want, so the result | ||
166 | ** is conservative. | ||
167 | */ | ||
168 | int checkaux (TTree *tree, int pred) { | ||
169 | tailcall: | ||
170 | switch (tree->tag) { | ||
171 | case TChar: case TSet: case TAny: | ||
172 | case TFalse: case TOpenCall: case TThrow: /* labeled failure */ | ||
173 | return 0; /* not nullable */ | ||
174 | case TRep: case TTrue: | ||
175 | return 1; /* no fail */ | ||
176 | case TNot: case TBehind: /* can match empty, but can fail */ | ||
177 | if (pred == PEnofail) return 0; | ||
178 | else return 1; /* PEnullable */ | ||
179 | case TAnd: /* can match empty; fail iff body does */ | ||
180 | if (pred == PEnullable) return 1; | ||
181 | /* else return checkaux(sib1(tree), pred); */ | ||
182 | tree = sib1(tree); goto tailcall; | ||
183 | case TRunTime: /* can fail; match empty iff body does */ | ||
184 | if (pred == PEnofail) return 0; | ||
185 | /* else return checkaux(sib1(tree), pred); */ | ||
186 | tree = sib1(tree); goto tailcall; | ||
187 | case TSeq: | ||
188 | if (!checkaux(sib1(tree), pred)) return 0; | ||
189 | /* else return checkaux(sib2(tree), pred); */ | ||
190 | tree = sib2(tree); goto tailcall; | ||
191 | case TChoice: case TLabChoice: /* labeled failure */ | ||
192 | if (checkaux(sib2(tree), pred)) return 1; | ||
193 | /* else return checkaux(sib1(tree), pred); */ | ||
194 | tree = sib1(tree); goto tailcall; | ||
195 | case TCapture: case TGrammar: case TRule: | ||
196 | /* return checkaux(sib1(tree), pred); */ | ||
197 | tree = sib1(tree); goto tailcall; | ||
198 | case TCall: /* return checkaux(sib2(tree), pred); */ | ||
199 | tree = sib2(tree); goto tailcall; | ||
200 | default: assert(0); return 0; | ||
201 | }; | ||
202 | } | ||
203 | |||
204 | |||
205 | /* | ||
206 | ** number of characters to match a pattern (or -1 if variable) | ||
207 | ** ('count' avoids infinite loops for grammars) | ||
208 | */ | ||
209 | int fixedlenx (TTree *tree, int count, int len) { | ||
210 | tailcall: | ||
211 | switch (tree->tag) { | ||
212 | case TChar: case TSet: case TAny: | ||
213 | return len + 1; | ||
214 | case TFalse: case TTrue: case TNot: case TAnd: case TBehind: | ||
215 | case TThrow: /* labeled failure */ | ||
216 | return len; | ||
217 | case TRep: case TRunTime: case TOpenCall: | ||
218 | return -1; | ||
219 | case TCapture: case TRule: case TGrammar: | ||
220 | /* return fixedlenx(sib1(tree), count); */ | ||
221 | tree = sib1(tree); goto tailcall; | ||
222 | case TCall: | ||
223 | if (count++ >= MAXRULES) | ||
224 | return -1; /* may be a loop */ | ||
225 | /* else return fixedlenx(sib2(tree), count); */ | ||
226 | tree = sib2(tree); goto tailcall; | ||
227 | case TSeq: { | ||
228 | len = fixedlenx(sib1(tree), count, len); | ||
229 | if (len < 0) return -1; | ||
230 | /* else return fixedlenx(sib2(tree), count, len); */ | ||
231 | tree = sib2(tree); goto tailcall; | ||
232 | } | ||
233 | case TChoice: case TLabChoice: { /* labeled failure */ | ||
234 | int n1, n2; | ||
235 | n1 = fixedlenx(sib1(tree), count, len); | ||
236 | if (n1 < 0) return -1; | ||
237 | n2 = fixedlenx(sib2(tree), count, len); | ||
238 | if (n1 == n2) return n1; | ||
239 | else return -1; | ||
240 | } | ||
241 | default: assert(0); return 0; | ||
242 | }; | ||
243 | } | ||
244 | |||
245 | |||
246 | /* | ||
247 | ** Computes the 'first set' of a pattern. | ||
248 | ** The result is a conservative approximation: | ||
249 | ** match p ax -> x' for some x ==> a in first(p). | ||
250 | ** The set 'follow' is the first set of what follows the | ||
251 | ** pattern (full set if nothing follows it). | ||
252 | ** The function returns 0 when this set can be used for | ||
253 | ** tests that avoid the pattern altogether. | ||
254 | ** A non-zero return can happen for two reasons: | ||
255 | ** 1) match p '' -> '' ==> returns 1. | ||
256 | ** (tests cannot be used because they always fail for an empty input) | ||
257 | ** 2) there is a match-time capture ==> returns 2. | ||
258 | ** (match-time captures should not be avoided by optimizations) | ||
259 | */ | ||
260 | static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { | ||
261 | tailcall: | ||
262 | switch (tree->tag) { | ||
263 | case TChar: case TSet: case TAny: { | ||
264 | tocharset(tree, firstset); | ||
265 | return 0; | ||
266 | } | ||
267 | case TTrue: { | ||
268 | loopset(i, firstset->cs[i] = follow->cs[i]); | ||
269 | return 1; | ||
270 | } | ||
271 | case TFalse: { | ||
272 | loopset(i, firstset->cs[i] = 0); | ||
273 | return 0; | ||
274 | } | ||
275 | case TThrow: { /* (?)labeled failure: should always throw the label */ | ||
276 | loopset(i, firstset->cs[i] = follow->cs[i]); /* follow = fullset? */ | ||
277 | return 1; | ||
278 | } | ||
279 | case TChoice: case TLabChoice: { /*(?) labeled failure */ | ||
280 | Charset csaux; | ||
281 | int e1 = getfirst(sib1(tree), follow, firstset); | ||
282 | int e2 = getfirst(sib2(tree), follow, &csaux); | ||
283 | loopset(i, firstset->cs[i] |= csaux.cs[i]); | ||
284 | return e1 | e2; | ||
285 | } | ||
286 | case TSeq: { | ||
287 | if (!nullable(sib1(tree))) { | ||
288 | /* return getfirst(sib1(tree), fullset, firstset); */ | ||
289 | tree = sib1(tree); follow = fullset; goto tailcall; | ||
290 | } | ||
291 | else { /* FIRST(p1 p2, fl) = FIRST(p1, FIRST(p2, fl)) */ | ||
292 | Charset csaux; | ||
293 | int e2 = getfirst(sib2(tree), follow, &csaux); | ||
294 | int e1 = getfirst(sib1(tree), &csaux, firstset); | ||
295 | if (e1 == 0) return 0; /* 'e1' ensures that first can be used */ | ||
296 | else if ((e1 | e2) & 2) /* one of the children has a matchtime? */ | ||
297 | return 2; /* pattern has a matchtime capture */ | ||
298 | else return e2; /* else depends on 'e2' */ | ||
299 | } | ||
300 | } | ||
301 | case TRep: { | ||
302 | getfirst(sib1(tree), follow, firstset); | ||
303 | loopset(i, firstset->cs[i] |= follow->cs[i]); | ||
304 | return 1; /* accept the empty string */ | ||
305 | } | ||
306 | case TCapture: case TGrammar: case TRule: { | ||
307 | /* return getfirst(sib1(tree), follow, firstset); */ | ||
308 | tree = sib1(tree); goto tailcall; | ||
309 | } | ||
310 | case TRunTime: { /* function invalidates any follow info. */ | ||
311 | int e = getfirst(sib1(tree), fullset, firstset); | ||
312 | if (e) return 2; /* function is not "protected"? */ | ||
313 | else return 0; /* pattern inside capture ensures first can be used */ | ||
314 | } | ||
315 | case TCall: { | ||
316 | /* return getfirst(sib2(tree), follow, firstset); */ | ||
317 | tree = sib2(tree); goto tailcall; | ||
318 | } | ||
319 | case TAnd: { | ||
320 | int e = getfirst(sib1(tree), follow, firstset); | ||
321 | loopset(i, firstset->cs[i] &= follow->cs[i]); | ||
322 | return e; | ||
323 | } | ||
324 | case TNot: { | ||
325 | if (tocharset(sib1(tree), firstset)) { | ||
326 | cs_complement(firstset); | ||
327 | return 1; | ||
328 | } | ||
329 | /* else go through */ | ||
330 | } | ||
331 | case TBehind: { /* instruction gives no new information */ | ||
332 | /* call 'getfirst' to check for math-time captures */ | ||
333 | int e = getfirst(sib1(tree), follow, firstset); | ||
334 | loopset(i, firstset->cs[i] = follow->cs[i]); /* uses follow */ | ||
335 | return e | 1; /* always can accept the empty string */ | ||
336 | } | ||
337 | default: assert(0); return 0; | ||
338 | } | ||
339 | } | ||
340 | |||
341 | |||
342 | /* | ||
343 | ** If it returns true, then pattern can fail only depending on the next | ||
344 | ** character of the subject | ||
345 | */ | ||
346 | static int headfail (TTree *tree) { | ||
347 | tailcall: | ||
348 | switch (tree->tag) { | ||
349 | case TChar: case TSet: case TAny: case TFalse: | ||
350 | return 1; | ||
351 | case TTrue: case TRep: case TRunTime: case TNot: | ||
352 | case TBehind: | ||
353 | case TThrow: /* labeled failure: should always throw the label */ | ||
354 | return 0; | ||
355 | case TCapture: case TGrammar: case TRule: case TAnd: | ||
356 | tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */ | ||
357 | case TCall: | ||
358 | tree = sib2(tree); goto tailcall; /* return headfail(sib2(tree)); */ | ||
359 | case TSeq: | ||
360 | if (!nofail(sib2(tree))) return 0; /* the second child must be 'nofail' for the sequence to qualify */ | ||
361 | /* else return headfail(sib1(tree)); */ | ||
362 | tree = sib1(tree); goto tailcall; | ||
363 | case TChoice: case TLabChoice: /* labeled failure */ | ||
364 | if (!headfail(sib1(tree))) return 0; /* both alternatives must be headfail */ | ||
365 | /* else return headfail(sib2(tree)); */ | ||
366 | tree = sib2(tree); goto tailcall; | ||
367 | default: assert(0); return 0; | ||
368 | } | ||
369 | } | ||
370 | |||
371 | |||
372 | /* | ||
373 | ** Check whether the code generation for the given tree can benefit | ||
374 | ** from a follow set (to avoid computing the follow set when it is | ||
375 | ** not needed) | ||
376 | */ | ||
377 | static int needfollow (TTree *tree) { | ||
378 | tailcall: | ||
379 | switch (tree->tag) { | ||
380 | case TChar: case TSet: case TAny: | ||
381 | case TFalse: case TTrue: case TAnd: case TNot: | ||
382 | case TRunTime: case TGrammar: case TCall: case TBehind: | ||
383 | case TThrow: case TLabChoice: /* (?)labeled failure */ | ||
384 | return 0; | ||
385 | case TChoice: case TRep: | ||
386 | return 1; /* 'codechoice' and 'coderep' both consult the follow set */ | ||
387 | case TCapture: | ||
388 | tree = sib1(tree); goto tailcall; /* answer is that of the captured pattern */ | ||
389 | case TSeq: | ||
390 | tree = sib2(tree); goto tailcall; /* answer is that of the second child */ | ||
391 | default: assert(0); return 0; | ||
392 | } | ||
393 | } | ||
394 | |||
395 | /* }====================================================== */ | ||
396 | |||
397 | |||
398 | |||
399 | /* | ||
400 | ** {====================================================== | ||
401 | ** Code generation | ||
402 | ** ======================================================= | ||
403 | */ | ||
404 | |||
405 | |||
406 | /* | ||
407 | ** size of an instruction, counted in 'Instruction' slots (the opcode slot plus any offset, charset or label-set slots that follow it) | ||
408 | */ | ||
409 | int sizei (const Instruction *i) { | ||
410 | switch((Opcode)i->i.code) { | ||
411 | case ISet: case ISpan: return CHARSETINSTSIZE; /* opcode + charset buffer */ | ||
412 | case ITestSet: return CHARSETINSTSIZE + 1; /* opcode + offset + charset buffer */ | ||
413 | case ITestChar: case ITestAny: case IChoice: case IJmp: | ||
414 | case ICall: case IOpenCall: case ICommit: case IPartialCommit: | ||
415 | case IBackCommit: case IThrow: return 2; /* labeled failure */ | ||
416 | case ILabChoice: return 3; /* labeled failure */ | ||
417 | default: return 1; | ||
418 | } | ||
419 | } | ||
420 | |||
421 | |||
422 | /* | ||
423 | ** state for the compiler | ||
424 | */ | ||
425 | typedef struct CompileState { | ||
426 | Pattern *p; /* pattern being compiled */ | ||
427 | int ncode; /* next position in p->code to be filled */ | ||
428 | lua_State *L; /* Lua state handed to 'reallocprog' when the code array must grow */ | ||
429 | } CompileState; | ||
430 | |||
431 | |||
432 | /* | ||
433 | ** code generation is recursive; 'opt' indicates that the code is | ||
434 | ** being generated under an 'IChoice' operator jumping to its end. | ||
435 | ** 'tt' points to a previous test protecting this code (NOINST when | ||
436 | ** there is none). 'fl' is the follow set of the pattern. | ||
437 | */ | ||
438 | static void codegen (CompileState *compst, TTree *tree, int opt, int tt, | ||
439 | const Charset *fl); | ||
440 | |||
441 | |||
442 | void reallocprog (lua_State *L, Pattern *p, int nsize) { /* resize p->code to hold 'nsize' instructions, using the allocator of 'L' */ | ||
443 | void *ud; | ||
444 | lua_Alloc f = lua_getallocf(L, &ud); | ||
445 | void *newblock = f(ud, p->code, p->codesize * sizeof(Instruction), | ||
446 | nsize * sizeof(Instruction)); | ||
447 | if (newblock == NULL && nsize > 0) /* a NULL result is an error only when a non-empty block was requested */ | ||
448 | luaL_error(L, "not enough memory"); | ||
449 | p->code = (Instruction *)newblock; | ||
450 | p->codesize = nsize; | ||
451 | } | ||
452 | |||
453 | |||
454 | static int nextinstruction (CompileState *compst) { /* reserve one instruction slot and return its index */ | ||
455 | int size = compst->p->codesize; | ||
456 | if (compst->ncode >= size) /* code array is full: double it */ | ||
457 | reallocprog(compst->L, compst->p, size * 2); | ||
458 | return compst->ncode++; | ||
459 | } | ||
460 | |||
461 | |||
462 | #define getinstr(cs,i) ((cs)->p->code[i]) /* instruction slot 'i' of the pattern being compiled */ | ||
463 | |||
464 | |||
465 | static int addinstruction (CompileState *compst, Opcode op, int aux) { /* append an instruction with opcode 'op' and field 'aux'; return its index */ | ||
466 | int i = nextinstruction(compst); | ||
467 | getinstr(compst, i).i.code = op; | ||
468 | getinstr(compst, i).i.aux = aux; | ||
469 | return i; | ||
470 | } | ||
471 | |||
472 | |||
473 | static int addoffsetinst (CompileState *compst, Opcode op) { /* append 'op' followed by a slot for its offset; return the index of 'op' */ | ||
474 | int i = addinstruction(compst, op, 0); /* instruction */ | ||
475 | addinstruction(compst, (Opcode)0, 0); /* open space for offset */ | ||
476 | assert(op == ITestSet || sizei(&getinstr(compst, i)) == 2); /* ITestSet is exempt: its size also counts the charset added afterwards */ | ||
477 | return i; | ||
478 | } | ||
479 | |||
480 | /* labeled failure begin */ | ||
481 | static int addthrowinstruction (CompileState *compst, Labelset ls) { /* append an IThrow followed by a slot holding the label set 'ls' */ | ||
482 | int i = nextinstruction(compst); | ||
483 | getinstr(compst, i).i.code = IThrow; /* only the opcode is set here; 'i.aux' is left as it was */ | ||
484 | i = nextinstruction(compst); /* 'i' now indexes the label slot */ | ||
485 | getinstr(compst, i).labels = ls; | ||
486 | return i; /* NOTE(review): this is the label slot, not the IThrow itself (unlike 'addoffsetinst'); the call in 'codegen' ignores the result */ | ||
487 | } | ||
488 | |||
489 | static int addoffsetlabinst (CompileState *compst, Opcode op, Labelset ls) { /* append 'op', an offset slot and a label slot; return the index of 'op' */ | ||
490 | int j; | ||
491 | int i = addinstruction(compst, op, 0); /* instruction */ | ||
492 | addinstruction(compst, (Opcode)0, 0); /* open space for offset */ | ||
493 | j = nextinstruction(compst); /* open space for labels */ | ||
494 | getinstr(compst, j).labels = ls; | ||
495 | assert(op == ILabChoice); /* the only opcode this helper is meant for */ | ||
496 | return i; | ||
497 | } | ||
498 | /* labeled failure end */ | ||
499 | |||
500 | |||
501 | static void setoffset (CompileState *compst, int instruction, int offset) { /* store 'offset' in the slot right after 'instruction' */ | ||
502 | getinstr(compst, instruction + 1).offset = offset; | ||
503 | } | ||
504 | |||
505 | |||
506 | /* | ||
507 | ** Add a capture instruction: | ||
508 | ** 'op' is the capture instruction; 'cap' the capture kind; | ||
509 | ** 'key' the key into ktable; 'aux' is optional offset. | ||
510 | ** Returns the index of the new instruction. | ||
511 | */ | ||
512 | static int addinstcap (CompileState *compst, Opcode op, int cap, int key, | ||
513 | int aux) { | ||
514 | int i = addinstruction(compst, op, joinkindoff(cap, aux)); /* kind and offset are packed together into the 'aux' field */ | ||
515 | getinstr(compst, i).i.key = key; | ||
516 | return i; | ||
517 | } | ||
518 | |||
519 | |||
520 | #define gethere(compst) ((compst)->ncode) /* index where the next instruction will be placed */ | ||
521 | |||
522 | #define target(code,i) ((i) + code[i + 1].offset) /* absolute index that the instruction at 'i' jumps to (its offset is relative to 'i') */ | ||
523 | |||
524 | |||
525 | static void jumptothere (CompileState *compst, int instruction, int target) { /* make 'instruction' jump to absolute position 'target' */ | ||
526 | if (instruction >= 0) /* a negative index means there is nothing to patch (presumably NOINST -- confirm its value) */ | ||
527 | setoffset(compst, instruction, target - instruction); /* offsets are stored relative to the instruction */ | ||
528 | } | ||
529 | |||
530 | |||
531 | static void jumptohere (CompileState *compst, int instruction) { /* make 'instruction' jump to the next position to be generated */ | ||
532 | jumptothere(compst, instruction, gethere(compst)); | ||
533 | } | ||
534 | |||
535 | |||
536 | /* | ||
537 | ** Code an IChar instruction, or IAny if there is an equivalent | ||
538 | ** test dominating it ('tt' is the index of that test, negative if none) | ||
539 | */ | ||
540 | static void codechar (CompileState *compst, int c, int tt) { | ||
541 | if (tt >= 0 && getinstr(compst, tt).i.code == ITestChar && | ||
542 | getinstr(compst, tt).i.aux == c) /* dominating test already checked for this same character */ | ||
543 | addinstruction(compst, IAny, 0); | ||
544 | else | ||
545 | addinstruction(compst, IChar, c); | ||
546 | } | ||
547 | |||
548 | |||
549 | /* | ||
550 | ** Add a charset postfix to an instruction | ||
551 | */ | ||
552 | static void addcharset (CompileState *compst, const byte *cs) { | ||
553 | int p = gethere(compst); /* first slot of the charset buffer */ | ||
554 | int i; | ||
555 | for (i = 0; i < (int)CHARSETINSTSIZE - 1; i++) | ||
556 | nextinstruction(compst); /* space for buffer */ | ||
557 | /* fill buffer with charset */ | ||
558 | loopset(j, getinstr(compst, p).buff[j] = cs[j]); | ||
559 | } | ||
560 | |||
561 | |||
562 | /* | ||
563 | ** code a char set, optimizing unit sets for IChar, "complete" | ||
564 | ** sets for IAny, and empty sets for IFail; also use an IAny | ||
565 | ** when instruction is dominated by an equivalent test. | ||
566 | */ | ||
567 | static void codecharset (CompileState *compst, const byte *cs, int tt) { | ||
568 | int c = 0; /* (=) to avoid warnings */ | ||
569 | Opcode op = charsettype(cs, &c); | ||
570 | switch (op) { | ||
571 | case IChar: codechar(compst, c, tt); break; | ||
572 | case ISet: { /* non-trivial set? */ | ||
573 | if (tt >= 0 && getinstr(compst, tt).i.code == ITestSet && | ||
574 | cs_equal(cs, getinstr(compst, tt + 2).buff)) /* the charset of an ITestSet starts two slots after it (opcode, offset, buffer) */ | ||
575 | addinstruction(compst, IAny, 0); | ||
576 | else { | ||
577 | addinstruction(compst, ISet, 0); | ||
578 | addcharset(compst, cs); | ||
579 | } | ||
580 | break; | ||
581 | } | ||
582 | default: addinstruction(compst, op, c); break; /* any other opcode from 'charsettype' is emitted as is */ | ||
583 | } | ||
584 | } | ||
585 | |||
586 | |||
587 | /* | ||
588 | ** code a test set, optimizing unit sets for ITestChar, "complete" | ||
589 | ** sets for ITestAny, and empty sets for IJmp (always fails). | ||
590 | ** 'e' is true iff test should accept the empty string. (Test | ||
591 | ** instructions in the current VM never accept the empty string.) | ||
592 | ** Returns the index of the test instruction, or NOINST if none was coded. | ||
593 | */ | ||
594 | static int codetestset (CompileState *compst, Charset *cs, int e) { | ||
595 | if (e) return NOINST; /* no test */ | ||
596 | else { | ||
597 | int c = 0; | ||
598 | Opcode op = charsettype(cs->cs, &c); | ||
599 | switch (op) { | ||
600 | case IFail: return addoffsetinst(compst, IJmp); /* always jump */ | ||
601 | case IAny: return addoffsetinst(compst, ITestAny); | ||
602 | case IChar: { | ||
603 | int i = addoffsetinst(compst, ITestChar); | ||
604 | getinstr(compst, i).i.aux = c; /* character to be tested */ | ||
605 | return i; | ||
606 | } | ||
607 | case ISet: { | ||
608 | int i = addoffsetinst(compst, ITestSet); | ||
609 | addcharset(compst, cs->cs); /* charset goes right after the offset slot */ | ||
610 | return i; | ||
611 | } | ||
612 | default: assert(0); return 0; | ||
613 | } | ||
614 | } | ||
615 | } | ||
615 | |||
616 | |||
617 | /* | ||
618 | ** Find the final destination of a sequence of jumps | ||
619 | ** (returns 'i' itself when the instruction at 'i' is not an IJmp) | ||
620 | */ | ||
621 | static int finaltarget (Instruction *code, int i) { | ||
622 | while (code[i].i.code == IJmp) | ||
623 | i = target(code, i); | ||
624 | return i; | ||
625 | } | ||
625 | |||
626 | |||
627 | /* | ||
628 | ** final label (after traversing any jumps) of the instruction at 'i', | ||
629 | ** which must be an instruction followed by an offset slot | ||
630 | */ | ||
631 | static int finallabel (Instruction *code, int i) { | ||
632 | return finaltarget(code, target(code, i)); | ||
633 | } | ||
633 | |||
634 | |||
635 | /* | ||
636 | ** <behind(p)> == behind n; <p> (where n = fixedlen(p)) | ||
637 | */ | ||
638 | static void codebehind (CompileState *compst, TTree *tree) { | ||
639 | if (tree->u.n > 0) /* no IBehind is emitted when n is not positive */ | ||
640 | addinstruction(compst, IBehind, tree->u.n); | ||
641 | codegen(compst, sib1(tree), 0, NOINST, fullset); | ||
642 | } | ||
643 | |||
644 | |||
645 | /* | ||
646 | ** Choice; optimizations: | ||
647 | ** - when p1 is headfail | ||
648 | ** - when first(p1) and first(p2) are disjoint; then | ||
649 | ** a character not in first(p1) cannot go to p1, and a character | ||
650 | ** in first(p1) cannot go to p2 (as it is not in first(p2)). | ||
651 | ** (The optimization is not valid if p1 accepts the empty string, | ||
652 | ** as then there is no character at all...) | ||
653 | ** - when p2 is empty and opt is true; a IPartialCommit can reuse | ||
654 | ** the Choice already active in the stack. | ||
655 | */ | ||
656 | static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt, | ||
657 | const Charset *fl) { | ||
658 | int emptyp2 = (p2->tag == TTrue); | ||
659 | Charset cs1, cs2; | ||
660 | int e1 = getfirst(p1, fullset, &cs1); | ||
661 | if (headfail(p1) || | ||
662 | (!e1 && (getfirst(p2, fl, &cs2), cs_disjoint(&cs1, &cs2)))) { /* 'cs2' is computed only when this second operand is evaluated */ | ||
663 | /*if (0) {*/ | ||
664 | /* <p1 / p2> == test (fail(p1)) -> L1 ; p1 ; jmp L2; L1: p2; L2: */ | ||
665 | int test = codetestset(compst, &cs1, 0); | ||
666 | int jmp = NOINST; | ||
667 | codegen(compst, p1, 0, test, fl); | ||
668 | if (!emptyp2) /* with an empty p2 there is nothing to jump over */ | ||
669 | jmp = addoffsetinst(compst, IJmp); | ||
670 | jumptohere(compst, test); | ||
671 | codegen(compst, p2, opt, NOINST, fl); | ||
672 | jumptohere(compst, jmp); | ||
673 | } | ||
674 | else if (opt && emptyp2) { | ||
675 | /* p1? == IPartialCommit; p1 */ | ||
676 | jumptohere(compst, addoffsetinst(compst, IPartialCommit)); /* the partial commit targets the instruction right after itself */ | ||
677 | codegen(compst, p1, 1, NOINST, fullset); | ||
678 | } | ||
679 | else { | ||
680 | /* <p1 / p2> == | ||
681 | test(fail(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: */ | ||
682 | int pcommit; | ||
683 | int test = codetestset(compst, &cs1, e1); | ||
684 | int pchoice = addoffsetinst(compst, IChoice); | ||
685 | codegen(compst, p1, emptyp2, test, fullset); | ||
686 | pcommit = addoffsetinst(compst, ICommit); | ||
687 | jumptohere(compst, pchoice); | ||
688 | jumptohere(compst, test); | ||
689 | codegen(compst, p2, opt, NOINST, fl); | ||
690 | jumptohere(compst, pcommit); | ||
691 | } | ||
692 | } | ||
693 | |||
694 | /* labeled failure begin */ | ||
695 | static void codelabchoice (CompileState *compst, TTree *p1, TTree *p2, int opt, | ||
696 | const Charset *fl, Labelset ls) { /* labchoice L1 (labels 'ls'); <p1>; commit L2; L1: <p2>; L2: -- no first-set test is generated */ | ||
697 | int emptyp2 = (p2->tag == TTrue); | ||
698 | int pcommit; | ||
699 | int test = NOINST; /* never replaced by a real test in this function */ | ||
700 | int pchoice = addoffsetlabinst(compst, ILabChoice, ls); | ||
701 | codegen(compst, p1, emptyp2, test, fullset); | ||
702 | pcommit = addoffsetinst(compst, ICommit); | ||
703 | jumptohere(compst, pchoice); | ||
704 | jumptohere(compst, test); /* 'test' is still NOINST at this point */ | ||
705 | codegen(compst, p2, opt, NOINST, fl); | ||
706 | jumptohere(compst, pcommit); | ||
707 | |||
708 | } | ||
709 | /* labeled failure end */ | ||
710 | |||
711 | /* | ||
712 | ** And predicate | ||
713 | ** optimization: fixedlen(p) = n ==> <&p> == <p>; behind n | ||
714 | ** (valid only when 'p' has no captures) | ||
715 | */ | ||
716 | static void codeand (CompileState *compst, TTree *tree, int tt) { | ||
717 | int n = fixedlen(tree); | ||
718 | if (n >= 0 && n <= MAXBEHIND && !hascaptures(tree)) { | ||
719 | codegen(compst, tree, 0, tt, fullset); | ||
720 | if (n > 0) /* nothing to step back over when n is 0 */ | ||
721 | addinstruction(compst, IBehind, n); | ||
722 | } | ||
723 | else { /* default: Choice L1; p1; BackCommit L2; L1: Fail; L2: */ | ||
724 | int pcommit; | ||
725 | int pchoice = addoffsetinst(compst, IChoice); | ||
726 | codegen(compst, tree, 0, tt, fullset); | ||
727 | pcommit = addoffsetinst(compst, IBackCommit); | ||
728 | jumptohere(compst, pchoice); | ||
729 | addinstruction(compst, IFail, 0); | ||
730 | jumptohere(compst, pcommit); | ||
731 | } | ||
732 | } | ||
733 | |||
734 | |||
735 | /* | ||
736 | ** Captures: if pattern has fixed (and not too big) length, use | ||
737 | ** a single IFullCapture instruction after the match; otherwise, | ||
738 | ** enclose the pattern with OpenCapture - CloseCapture. | ||
739 | */ | ||
740 | static void codecapture (CompileState *compst, TTree *tree, int tt, | ||
741 | const Charset *fl) { | ||
742 | int len = fixedlen(sib1(tree)); | ||
743 | if (len >= 0 && len <= MAXOFF && !hascaptures(sib1(tree))) { /* the single-instruction form is used only when the pattern has no captures of its own */ | ||
744 | codegen(compst, sib1(tree), 0, tt, fl); | ||
745 | addinstcap(compst, IFullCapture, tree->cap, tree->key, len); /* 'len' is passed as the offset of the capture */ | ||
746 | } | ||
747 | else { | ||
748 | addinstcap(compst, IOpenCapture, tree->cap, tree->key, 0); | ||
749 | codegen(compst, sib1(tree), 0, tt, fl); | ||
750 | addinstcap(compst, ICloseCapture, Cclose, 0, 0); | ||
751 | } | ||
752 | } | ||
753 | |||
754 | |||
755 | static void coderuntime (CompileState *compst, TTree *tree, int tt) { /* run-time capture: open a group capture, code the pattern, close with ICloseRunTime */ | ||
756 | addinstcap(compst, IOpenCapture, Cgroup, tree->key, 0); | ||
757 | codegen(compst, sib1(tree), 0, tt, fullset); | ||
758 | addinstcap(compst, ICloseRunTime, Cclose, 0, 0); | ||
759 | } | ||
760 | |||
761 | |||
762 | /* | ||
763 | ** Repetition; optimizations: | ||
764 | ** When pattern is a charset, can use special instruction ISpan. | ||
765 | ** When pattern is head fail, or if it starts with characters that | ||
766 | ** are disjoint from what follows the repetitions, a simple test | ||
767 | ** is enough (a fail inside the repetition would backtrack to fail | ||
768 | ** again in the following pattern, so there is no need for a choice). | ||
769 | ** When 'opt' is true, the repetition can reuse the Choice already | ||
770 | ** active in the stack. | ||
771 | */ | ||
772 | static void coderep (CompileState *compst, TTree *tree, int opt, | ||
773 | const Charset *fl) { | ||
774 | Charset st; | ||
775 | if (tocharset(tree, &st)) { | ||
776 | addinstruction(compst, ISpan, 0); | ||
777 | addcharset(compst, st.cs); | ||
778 | } | ||
779 | else { | ||
780 | int e1 = getfirst(tree, fullset, &st); | ||
781 | if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) { | ||
782 | /* L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: */ | ||
783 | int jmp; | ||
784 | int test = codetestset(compst, &st, 0); | ||
785 | codegen(compst, tree, opt, test, fullset); | ||
786 | jmp = addoffsetinst(compst, IJmp); | ||
787 | jumptohere(compst, test); | ||
788 | jumptothere(compst, jmp, test); /* loop back to the test */ | ||
789 | } | ||
790 | else { | ||
791 | /* test(fail(p1)) -> L2; choice L2; L1: <p>; partialcommit L1; L2: */ | ||
792 | /* or (if 'opt'): partialcommit L1; L1: <p>; partialcommit L1; */ | ||
793 | int commit, l2; | ||
794 | int test = codetestset(compst, &st, e1); | ||
795 | int pchoice = NOINST; | ||
796 | if (opt) | ||
797 | jumptohere(compst, addoffsetinst(compst, IPartialCommit)); | ||
798 | else | ||
799 | pchoice = addoffsetinst(compst, IChoice); | ||
800 | l2 = gethere(compst); /* start of the loop body (label L1 in the schemes above, despite the variable name) */ | ||
801 | codegen(compst, tree, 0, NOINST, fullset); | ||
802 | commit = addoffsetinst(compst, IPartialCommit); | ||
803 | jumptothere(compst, commit, l2); | ||
804 | jumptohere(compst, pchoice); | ||
805 | jumptohere(compst, test); | ||
806 | } | ||
807 | } | ||
808 | } | ||
809 | |||
810 | |||
811 | /* | ||
812 | ** Not predicate; optimizations: | ||
813 | ** In any case, if first test fails, 'not' succeeds, so it can jump to | ||
814 | ** the end. If pattern is headfail, that is all (it cannot fail | ||
815 | ** in other parts); this case includes 'not' of simple sets. Otherwise, | ||
816 | ** use the default code (a choice plus a failtwice). | ||
817 | */ | ||
818 | static void codenot (CompileState *compst, TTree *tree) { | ||
819 | Charset st; | ||
820 | int e = getfirst(tree, fullset, &st); | ||
821 | int test = codetestset(compst, &st, e); /* NOINST when 'e' is non-zero */ | ||
822 | if (headfail(tree)) /* test (fail(p1)) -> L1; fail; L1: */ | ||
823 | addinstruction(compst, IFail, 0); | ||
824 | else { | ||
825 | /* test(fail(p))-> L1; choice L1; <p>; failtwice; L1: */ | ||
826 | int pchoice = addoffsetinst(compst, IChoice); | ||
827 | codegen(compst, tree, 0, NOINST, fullset); | ||
828 | addinstruction(compst, IFailTwice, 0); | ||
829 | jumptohere(compst, pchoice); | ||
830 | } | ||
831 | jumptohere(compst, test); | ||
832 | } | ||
833 | |||
834 | |||
835 | /* | ||
836 | ** change open calls to calls, using list 'positions' to find | ||
837 | ** correct offsets; also optimize tail calls | ||
838 | ** ('from' and 'to' delimit the range of instructions to scan) | ||
839 | */ | ||
840 | static void correctcalls (CompileState *compst, int *positions, | ||
841 | int from, int to) { | ||
842 | int i; | ||
843 | Instruction *code = compst->p->code; | ||
844 | for (i = from; i < to; i += sizei(&code[i])) { | ||
845 | int n = code[i].i.key; /* rule number */ | ||
857 | |||
858 | |||
859 | /* | ||
860 | ** Code for a grammar: | ||
861 | ** call L1; jmp L2; L1: rule 1; ret; rule 2; ret; ...; L2: | ||
862 | */ | ||
863 | static void codegrammar (CompileState *compst, TTree *grammar) { | ||
864 | int positions[MAXRULES]; /* code position of each rule, indexed by rule order; no bound check is done in this function */ | ||
865 | int rulenumber = 0; | ||
866 | TTree *rule; | ||
867 | int firstcall = addoffsetinst(compst, ICall); /* call initial rule */ | ||
868 | int jumptoend = addoffsetinst(compst, IJmp); /* jump to the end */ | ||
869 | int start = gethere(compst); /* here starts the initial rule */ | ||
870 | jumptohere(compst, firstcall); | ||
871 | for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) { | ||
872 | positions[rulenumber++] = gethere(compst); /* save rule position */ | ||
873 | codegen(compst, sib1(rule), 0, NOINST, fullset); /* code rule */ | ||
874 | addinstruction(compst, IRet, 0); | ||
875 | } | ||
876 | assert(rule->tag == TTrue); /* the rule list is expected to end in a TTrue node */ | ||
877 | jumptohere(compst, jumptoend); | ||
878 | correctcalls(compst, positions, start, gethere(compst)); /* resolve the open calls emitted inside the rules */ | ||
879 | } | ||
880 | |||
881 | |||
882 | static void codecall (CompileState *compst, TTree *call) { /* emit an IOpenCall carrying the rule number; 'correctcalls' later turns it into a call or jump */ | ||
883 | int c = addoffsetinst(compst, IOpenCall); /* to be corrected later */ | ||
884 | getinstr(compst, c).i.key = sib2(call)->cap; /* rule number */ | ||
885 | assert(sib2(call)->tag == TRule); | ||
886 | } | ||
887 | |||
888 | |||
889 | /* | ||
890 | ** Code first child of a sequence | ||
891 | ** (second child is called in-place to allow tail call) | ||
892 | ** Return 'tt' for second child | ||
893 | */ | ||
894 | static int codeseq1 (CompileState *compst, TTree *p1, TTree *p2, | ||
895 | int tt, const Charset *fl) { | ||
896 | if (needfollow(p1)) { /* compute the follow set only when coding 'p1' can use it */ | ||
897 | Charset fl1; | ||
898 | getfirst(p2, fl, &fl1); /* p1 follow is p2 first */ | ||
899 | codegen(compst, p1, 0, tt, &fl1); | ||
900 | } | ||
901 | else /* use 'fullset' as follow */ | ||
902 | codegen(compst, p1, 0, tt, fullset); | ||
903 | if (fixedlen(p1) != 0) /* can 'p1' consume anything? */ | ||
904 | return NOINST; /* invalidate test */ | ||
905 | else return tt; /* else 'tt' still protects sib2 */ | ||
906 | } | ||
907 | |||
908 | |||
909 | /* | ||
910 | ** Main code-generation function: dispatch to auxiliary functions | ||
911 | ** according to kind of tree | ||
912 | */ | ||
913 | static void codegen (CompileState *compst, TTree *tree, int opt, int tt, | ||
914 | const Charset *fl) { | ||
915 | tailcall: | ||
916 | switch (tree->tag) { | ||
917 | case TChar: codechar(compst, tree->u.n, tt); break; | ||
918 | case TAny: addinstruction(compst, IAny, 0); break; | ||
919 | case TSet: codecharset(compst, treebuffer(tree), tt); break; | ||
920 | case TTrue: break; /* no code is generated for TTrue */ | ||
921 | case TFalse: addinstruction(compst, IFail, 0); break; | ||
922 | case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break; | ||
923 | case TRep: coderep(compst, sib1(tree), opt, fl); break; | ||
924 | case TBehind: codebehind(compst, tree); break; | ||
925 | case TNot: codenot(compst, sib1(tree)); break; | ||
926 | case TAnd: codeand(compst, sib1(tree), tt); break; | ||
927 | case TCapture: codecapture(compst, tree, tt, fl); break; | ||
928 | case TRunTime: coderuntime(compst, tree, tt); break; | ||
929 | case TGrammar: codegrammar(compst, tree); break; | ||
930 | case TCall: codecall(compst, tree); break; | ||
931 | case TSeq: { | ||
932 | tt = codeseq1(compst, sib1(tree), sib2(tree), tt, fl); /* code 'p1' */ | ||
933 | /* codegen(compst, p2, opt, tt, fl); */ | ||
934 | tree = sib2(tree); goto tailcall; | ||
935 | } | ||
936 | case TThrow: { /* labeled failure */ | ||
937 | addthrowinstruction(compst, tree->labels); /* result (an instruction index) is not needed here */ | ||
938 | break; | ||
939 | } | ||
940 | case TLabChoice: { /* labeled failure */ | ||
941 | codelabchoice(compst, sib1(tree), sib2(tree), opt, fl, tree->labels); | ||
942 | break; | ||
943 | } | ||
944 | default: assert(0); | ||
945 | } | ||
946 | } | ||
947 | |||
948 | |||
949 | /* | ||
950 | ** Optimize jumps and other jump-like instructions. | ||
951 | ** * Update labels of instructions with labels to their final | ||
952 | ** destinations (e.g., choice L1; ... L1: jmp L2: becomes | ||
953 | ** choice L2) | ||
954 | ** * Jumps to other instructions that do jumps become those | ||
955 | ** instructions (e.g., jump to return becomes a return; jump | ||
956 | ** to commit becomes a commit) | ||
957 | */ | ||
958 | static void peephole (CompileState *compst) { | ||
959 | Instruction *code = compst->p->code; | ||
960 | int i; | ||
961 | for (i = 0; i < compst->ncode; i += sizei(&code[i])) { | ||
962 | redo: switch (code[i].i.code) { /* 'redo' re-dispatches on slot 'i' after it has been rewritten */ | ||
963 | case IChoice: case ICall: case ICommit: case IPartialCommit: | ||
964 | case IBackCommit: case ITestChar: case ITestSet: case ILabChoice: /* labeled failure */ | ||
965 | case ITestAny: { /* instructions with labels */ | ||
966 | jumptothere(compst, i, finallabel(code, i)); /* optimize label */ | ||
967 | break; | ||
968 | } | ||
969 | case IJmp: { | ||
970 | int ft = finaltarget(code, i); | ||
971 | switch (code[ft].i.code) { /* jumping to what? */ | ||
972 | case IRet: case IFail: case IFailTwice: | ||
973 | case IEnd: { /* instructions with unconditional implicit jumps */ | ||
974 | code[i] = code[ft]; /* jump becomes that instruction */ | ||
975 | code[i + 1].i.code = IAny; /* 'no-op' for target position */ | ||
976 | break; | ||
977 | } | ||
978 | case ICommit: case IPartialCommit: | ||
979 | case IBackCommit: { /* inst. with unconditional explicit jumps */ | ||
980 | int fft = finallabel(code, ft); | ||
981 | code[i] = code[ft]; /* jump becomes that instruction... */ | ||
982 | jumptothere(compst, i, fft); /* but must correct its offset */ | ||
983 | goto redo; /* reoptimize its label; stepping 'i' back instead would make the loop increment decode slot i-1 (an offset/charset/label slot, or code[-1] when i is 0) as an opcode */ | ||
984 | /* no 'break' needed: control has already gone back to 'redo' */ | ||
985 | } | ||
986 | default: { | ||
987 | jumptothere(compst, i, ft); /* optimize label */ | ||
988 | break; | ||
989 | } | ||
990 | } | ||
991 | break; | ||
992 | } | ||
993 | default: break; | ||
994 | } | ||
995 | } | ||
996 | assert(code[i - 1].i.code == IEnd); | ||
997 | } | ||
998 | |||
999 | |||
1000 | /* | ||
1001 | ** Compile a pattern: generate code for p->tree into p->code and | ||
1002 | ** return p->code | ||
1003 | */ | ||
1004 | Instruction *compile (lua_State *L, Pattern *p) { | ||
1005 | CompileState compst; | ||
1006 | compst.p = p; compst.ncode = 0; compst.L = L; | ||
1007 | reallocprog(L, p, 2); /* minimum initial size */ | ||
1008 | codegen(&compst, p->tree, 0, NOINST, fullset); | ||
1009 | addinstruction(&compst, IEnd, 0); /* program always ends with an IEnd */ | ||
1010 | reallocprog(L, p, compst.ncode); /* set final size */ | ||
1011 | peephole(&compst); /* labeled failure */ | ||
1012 | return p->code; | ||
1013 | } | ||
1013 | |||
1014 | |||
1015 | /* }====================================================== */ | ||
1016 | |||
diff --git a/lpcode.h b/lpcode.h new file mode 100644 index 0000000..5c9d54f --- /dev/null +++ b/lpcode.h | |||
@@ -0,0 +1,34 @@ | |||
1 | /* | ||
2 | ** $Id: lpcode.h,v 1.5 2013/04/04 21:24:45 roberto Exp $ | ||
3 | */ | ||
4 | |||
5 | #if !defined(lpcode_h) | ||
6 | #define lpcode_h | ||
7 | |||
8 | #include "lua.h" | ||
9 | |||
10 | #include "lptypes.h" | ||
11 | #include "lptree.h" | ||
12 | #include "lpvm.h" | ||
13 | |||
14 | int tocharset (TTree *tree, Charset *cs); | ||
15 | int checkaux (TTree *tree, int pred); /* 'pred' is one of the PE* codes below */ | ||
16 | int fixedlenx (TTree *tree, int count, int len); /* usually called through 'fixedlen' */ | ||
17 | int hascaptures (TTree *tree); | ||
18 | int lp_gc (lua_State *L); | ||
19 | Instruction *compile (lua_State *L, Pattern *p); | ||
20 | void reallocprog (lua_State *L, Pattern *p, int nsize); | ||
21 | int sizei (const Instruction *i); | ||
22 | |||
23 | |||
24 | #define PEnullable 0 /* predicate code used by 'nullable' */ | ||
25 | #define PEnofail 1 /* predicate code used by 'nofail' */ | ||
26 | |||
27 | #define nofail(t) checkaux(t, PEnofail) | ||
28 | #define nullable(t) checkaux(t, PEnullable) | ||
29 | |||
30 | #define fixedlen(t) fixedlenx(t, 0, 0) /* start with both 'count' and 'len' at zero */ | ||
31 | |||
32 | |||
33 | |||
34 | #endif | ||
diff --git a/lpprint.c b/lpprint.c new file mode 100644 index 0000000..03239fd --- /dev/null +++ b/lpprint.c | |||
@@ -0,0 +1,255 @@ | |||
1 | /* | ||
2 | ** $Id: lpprint.c,v 1.7 2013/04/12 16:29:49 roberto Exp $ | ||
3 | ** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license) | ||
4 | */ | ||
5 | |||
6 | #include <ctype.h> | ||
7 | #include <limits.h> | ||
8 | #include <stdio.h> | ||
9 | |||
10 | |||
11 | #include "lptypes.h" | ||
12 | #include "lpprint.h" | ||
13 | #include "lpcode.h" | ||
14 | |||
15 | |||
16 | #if defined(LPEG_DEBUG) | ||
17 | |||
18 | /* | ||
19 | ** {====================================================== | ||
20 | ** Printing patterns (for debugging) | ||
21 | ** ======================================================= | ||
22 | */ | ||
23 | |||
24 | |||
25 | void printcharset (const byte *st) { | ||
26 | int i; | ||
27 | printf("["); | ||
28 | for (i = 0; i <= UCHAR_MAX; i++) { | ||
29 | int first = i; | ||
30 | while (testchar(st, i) && i <= UCHAR_MAX) i++; | ||
31 | if (i - 1 == first) /* unary range? */ | ||
32 | printf("(%02x)", first); | ||
33 | else if (i - 1 > first) /* non-empty range? */ | ||
34 | printf("(%02x-%02x)", first, i - 1); | ||
35 | } | ||
36 | printf("]"); | ||
37 | } | ||
38 | |||
39 | |||
/*
** Print the name of capture kind 'kind' ('kind' indexes the name
** table directly; no range check is done).
*/
static void printcapkind (int kind) {
  static const char *const kindnames[] = {
    "close", "position", "constant", "backref",
    "argument", "simple", "table", "function",
    "query", "string", "num", "substitution", "fold",
    "runtime", "group"
  };
  const char *name = kindnames[kind];
  printf("%s", name);
}
48 | |||
49 | |||
50 | static void printjmp (const Instruction *op, const Instruction *p) { | ||
51 | printf("-> %d", (int)(p + (p + 1)->offset - op)); | ||
52 | } | ||
53 | |||
54 | |||
55 | void printinst (const Instruction *op, const Instruction *p) { | ||
56 | const char *const names[] = { | ||
57 | "any", "char", "set", | ||
58 | "testany", "testchar", "testset", | ||
59 | "span", "behind", | ||
60 | "ret", "end", | ||
61 | "choice", "jmp", "call", "open_call", | ||
62 | "commit", "partial_commit", "back_commit", "failtwice", "fail", "giveup", | ||
63 | "fullcapture", "opencapture", "closecapture", "closeruntime", | ||
64 | "throw", "labeled_choice" /* labeled failure */ | ||
65 | }; | ||
66 | printf("%02ld: %s ", (long)(p - op), names[p->i.code]); | ||
67 | switch ((Opcode)p->i.code) { | ||
68 | case IChar: { | ||
69 | printf("'%c'", p->i.aux); | ||
70 | break; | ||
71 | } | ||
72 | case ITestChar: { | ||
73 | printf("'%c'", p->i.aux); printjmp(op, p); | ||
74 | break; | ||
75 | } | ||
76 | case IFullCapture: { | ||
77 | printcapkind(getkind(p)); | ||
78 | printf(" (size = %d) (idx = %d)", getoff(p), p->i.key); | ||
79 | break; | ||
80 | } | ||
81 | case IOpenCapture: { | ||
82 | printcapkind(getkind(p)); | ||
83 | printf(" (idx = %d)", p->i.key); | ||
84 | break; | ||
85 | } | ||
86 | case ISet: { | ||
87 | printcharset((p+1)->buff); | ||
88 | break; | ||
89 | } | ||
90 | case ITestSet: { | ||
91 | printcharset((p+2)->buff); printjmp(op, p); | ||
92 | break; | ||
93 | } | ||
94 | case ISpan: { | ||
95 | printcharset((p+1)->buff); | ||
96 | break; | ||
97 | } | ||
98 | case IOpenCall: { | ||
99 | printf("-> %d", (p + 1)->offset); | ||
100 | break; | ||
101 | } | ||
102 | case IBehind: { | ||
103 | printf("%d", p->i.aux); | ||
104 | break; | ||
105 | } | ||
106 | case IJmp: case ICall: case ICommit: case IChoice: | ||
107 | case IPartialCommit: case IBackCommit: case ITestAny: { | ||
108 | printjmp(op, p); | ||
109 | break; | ||
110 | } | ||
111 | case IThrow: { /* labeled failure */ | ||
112 | printf("%d", (p + 1)->labels); | ||
113 | break; | ||
114 | } | ||
115 | case ILabChoice: { /* labeled failure */ | ||
116 | printjmp(op, p); | ||
117 | printf(" %d", (p + 2)->labels); | ||
118 | break; | ||
119 | } | ||
120 | |||
121 | default: break; | ||
122 | } | ||
123 | printf("\n"); | ||
124 | } | ||
125 | |||
126 | |||
127 | void printpatt (Instruction *p, int n) { | ||
128 | Instruction *op = p; | ||
129 | while (p < op + n) { | ||
130 | printinst(op, p); | ||
131 | p += sizei(p); | ||
132 | } | ||
133 | } | ||
134 | |||
135 | |||
136 | #if defined(LPEG_DEBUG) | ||
137 | static void printcap (Capture *cap) { | ||
138 | printcapkind(cap->kind); | ||
139 | printf(" (idx: %d - size: %d) -> %p\n", cap->idx, cap->siz, cap->s); | ||
140 | } | ||
141 | |||
142 | |||
143 | void printcaplist (Capture *cap, Capture *limit) { | ||
144 | printf(">======\n"); | ||
145 | for (; cap->s && (limit == NULL || cap < limit); cap++) | ||
146 | printcap(cap); | ||
147 | printf("=======\n"); | ||
148 | } | ||
149 | #endif | ||
150 | |||
151 | /* }====================================================== */ | ||
152 | |||
153 | |||
154 | /* | ||
155 | ** {====================================================== | ||
156 | ** Printing trees (for debugging) | ||
157 | ** ======================================================= | ||
158 | */ | ||
159 | |||
/*
** Printable name of each tree tag, indexed by tag. There must be one
** entry per tag (same order and count as 'numsiblings'), including
** the two labeled-failure tags at the end.
*/
static const char *tagnames[] = {
  "char", "set", "any",
  "true", "false",
  "rep",
  "seq", "choice",
  "not", "and",
  "call", "opencall", "rule", "grammar",
  "behind",
  "capture", "run-time",
  "throw", "labeled_choice"  /* labeled failure */
};
170 | |||
171 | |||
172 | void printtree (TTree *tree, int ident) { | ||
173 | int i; | ||
174 | for (i = 0; i < ident; i++) printf(" "); | ||
175 | printf("%s", tagnames[tree->tag]); | ||
176 | switch (tree->tag) { | ||
177 | case TChar: { | ||
178 | int c = tree->u.n; | ||
179 | if (isprint(c)) | ||
180 | printf(" '%c'\n", c); | ||
181 | else | ||
182 | printf(" (%02X)\n", c); | ||
183 | break; | ||
184 | } | ||
185 | case TSet: { | ||
186 | printcharset(treebuffer(tree)); | ||
187 | printf("\n"); | ||
188 | break; | ||
189 | } | ||
190 | case TOpenCall: case TCall: { | ||
191 | printf(" key: %d\n", tree->key); | ||
192 | break; | ||
193 | } | ||
194 | case TBehind: { | ||
195 | printf(" %d\n", tree->u.n); | ||
196 | printtree(sib1(tree), ident + 2); | ||
197 | break; | ||
198 | } | ||
199 | case TCapture: { | ||
200 | printf(" cap: %d key: %d n: %d\n", tree->cap, tree->key, tree->u.n); | ||
201 | printtree(sib1(tree), ident + 2); | ||
202 | break; | ||
203 | } | ||
204 | case TRule: { | ||
205 | printf(" n: %d key: %d\n", tree->cap, tree->key); | ||
206 | printtree(sib1(tree), ident + 2); | ||
207 | break; /* do not print next rule as a sibling */ | ||
208 | } | ||
209 | case TGrammar: { | ||
210 | TTree *rule = sib1(tree); | ||
211 | printf(" %d\n", tree->u.n); /* number of rules */ | ||
212 | for (i = 0; i < tree->u.n; i++) { | ||
213 | printtree(rule, ident + 2); | ||
214 | rule = sib2(rule); | ||
215 | } | ||
216 | assert(rule->tag == TTrue); /* sentinel */ | ||
217 | break; | ||
218 | } | ||
219 | default: { | ||
220 | int sibs = numsiblings[tree->tag]; | ||
221 | printf("\n"); | ||
222 | if (sibs >= 1) { | ||
223 | printtree(sib1(tree), ident + 2); | ||
224 | if (sibs >= 2) | ||
225 | printtree(sib2(tree), ident + 2); | ||
226 | } | ||
227 | break; | ||
228 | } | ||
229 | } | ||
230 | } | ||
231 | |||
232 | |||
233 | void printktable (lua_State *L, int idx) { | ||
234 | int n, i; | ||
235 | lua_getfenv(L, idx); | ||
236 | if (lua_isnil(L, -1)) /* no ktable? */ | ||
237 | return; | ||
238 | n = lua_objlen(L, -1); | ||
239 | printf("["); | ||
240 | for (i = 1; i <= n; i++) { | ||
241 | printf("%d = ", i); | ||
242 | lua_rawgeti(L, -1, i); | ||
243 | if (lua_isstring(L, -1)) | ||
244 | printf("%s ", lua_tostring(L, -1)); | ||
245 | else | ||
246 | printf("%s ", lua_typename(L, lua_type(L, -1))); | ||
247 | lua_pop(L, 1); | ||
248 | } | ||
249 | printf("]\n"); | ||
250 | /* leave ktable at the stack */ | ||
251 | } | ||
252 | |||
253 | /* }====================================================== */ | ||
254 | |||
255 | #endif | ||
diff --git a/lpprint.h b/lpprint.h new file mode 100644 index 0000000..6cbe47d --- /dev/null +++ b/lpprint.h | |||
@@ -0,0 +1,37 @@ | |||
1 | /* | ||
2 | ** $Id: lpprint.h,v 1.1 2013/03/21 20:25:12 roberto Exp $ | ||
3 | */ | ||
4 | |||
5 | |||
6 | #if !defined(lpprint_h) | ||
7 | #define lpprint_h | ||
8 | |||
9 | |||
10 | #include "lptree.h" | ||
11 | #include "lpvm.h" | ||
12 | |||
13 | |||
14 | #if defined(LPEG_DEBUG) | ||
15 | |||
16 | void printpatt (Instruction *p, int n); | ||
17 | void printtree (TTree *tree, int ident); | ||
18 | void printktable (lua_State *L, int idx); | ||
19 | void printcharset (const byte *st); | ||
20 | void printcaplist (Capture *cap, Capture *limit); | ||
21 | void printinst (const Instruction *op, const Instruction *p); | ||
22 | |||
23 | |||
24 | #else | ||
25 | |||
/*
** Without LPEG_DEBUG the printing functions are replaced by macros
** that raise a Lua error. Note that 'printtree' and 'printpatt' do
** not take the Lua state as a parameter: they expand to code that
** uses an identifier 'L', which must be in scope where they are used.
*/
#define printktable(L,idx)  \
	luaL_error(L, "function only implemented in debug mode")
#define printtree(tree,i)  \
	luaL_error(L, "function only implemented in debug mode")
#define printpatt(p,n)  \
	luaL_error(L, "function only implemented in debug mode")
32 | |||
33 | #endif | ||
34 | |||
35 | |||
36 | #endif | ||
37 | |||
diff --git a/lptree.c b/lptree.c new file mode 100644 index 0000000..5d2933d --- /dev/null +++ b/lptree.c | |||
@@ -0,0 +1,1282 @@ | |||
1 | /* | ||
2 | ** $Id: lptree.c,v 1.10 2013/04/12 16:30:33 roberto Exp $ | ||
3 | ** Copyright 2013, Lua.org & PUC-Rio (see 'lpeg.html' for license) | ||
4 | */ | ||
5 | |||
6 | #include <ctype.h> | ||
7 | #include <limits.h> | ||
8 | #include <string.h> | ||
9 | |||
10 | |||
11 | #include "lua.h" | ||
12 | #include "lauxlib.h" | ||
13 | |||
14 | #include "lptypes.h" | ||
15 | #include "lpcap.h" | ||
16 | #include "lpcode.h" | ||
17 | #include "lpprint.h" | ||
18 | #include "lptree.h" | ||
19 | |||
20 | |||
21 | /* number of siblings for each tree */ | ||
22 | const byte numsiblings[] = { | ||
23 | 0, 0, 0, /* char, set, any */ | ||
24 | 0, 0, /* true, false */ | ||
25 | 1, /* rep */ | ||
26 | 2, 2, /* seq, choice */ | ||
27 | 1, 1, /* not, and */ | ||
28 | 0, 0, 2, 1, /* call, opencall, rule, grammar */ | ||
29 | 1, /* behind */ | ||
30 | 1, 1, /* capture, runtime capture */ | ||
31 | 0, 2 /* labeled failure throw, labeled choice */ | ||
32 | }; | ||
33 | |||
34 | |||
35 | static TTree *newgrammar (lua_State *L, int arg); | ||
36 | |||
37 | |||
38 | /* | ||
39 | ** returns a reasonable name for value at index 'idx' on the stack | ||
40 | */ | ||
41 | static const char *val2str (lua_State *L, int idx) { | ||
42 | const char *k = lua_tostring(L, idx); | ||
43 | if (k != NULL) | ||
44 | return lua_pushfstring(L, "%s", k); | ||
45 | else | ||
46 | return lua_pushfstring(L, "(a %s)", luaL_typename(L, idx)); | ||
47 | } | ||
48 | |||
49 | |||
50 | /* | ||
51 | ** Fix a TOpenCall into a TCall node, using table 'postable' to | ||
52 | ** translate a key to its rule address in the tree. Raises an | ||
53 | ** error if key does not exist. | ||
54 | */ | ||
55 | static void fixonecall (lua_State *L, int postable, TTree *g, TTree *t) { | ||
56 | int n; | ||
57 | lua_rawgeti(L, -1, t->key); /* get rule's name */ | ||
58 | lua_gettable(L, postable); /* query name in position table */ | ||
59 | n = lua_tonumber(L, -1); /* get (absolute) position */ | ||
60 | lua_pop(L, 1); /* remove position */ | ||
61 | if (n == 0) { /* no position? */ | ||
62 | lua_rawgeti(L, -1, t->key); /* get rule's name again */ | ||
63 | luaL_error(L, "rule '%s' undefined in given grammar", val2str(L, -1)); | ||
64 | } | ||
65 | t->tag = TCall; | ||
66 | t->u.ps = n - (t - g); /* position relative to node */ | ||
67 | assert(sib2(t)->tag == TRule); | ||
68 | sib2(t)->key = t->key; | ||
69 | } | ||
70 | |||
71 | |||
72 | /* | ||
73 | ** Transform left associative constructions into right | ||
74 | ** associative ones, for sequence and choice; that is: | ||
75 | ** (t11 + t12) + t2 => t11 + (t12 + t2) | ||
76 | ** (t11 * t12) * t2 => t11 * (t12 * t2) | ||
77 | ** (that is, Op (Op t11 t12) t2 => Op t11 (Op t12 t2)) | ||
78 | */ | ||
79 | static void correctassociativity (TTree *tree) { | ||
80 | TTree *t1 = sib1(tree); | ||
81 | assert(tree->tag == TChoice || tree->tag == TSeq); | ||
82 | while (t1->tag == tree->tag) { | ||
83 | int n1size = tree->u.ps - 1; /* t1 == Op t11 t12 */ | ||
84 | int n11size = t1->u.ps - 1; | ||
85 | int n12size = n1size - n11size - 1; | ||
86 | memmove(sib1(tree), sib1(t1), n11size * sizeof(TTree)); /* move t11 */ | ||
87 | tree->u.ps = n11size + 1; | ||
88 | sib2(tree)->tag = tree->tag; | ||
89 | sib2(tree)->u.ps = n12size + 1; | ||
90 | } | ||
91 | } | ||
92 | |||
93 | |||
94 | /* | ||
95 | ** Make final adjustments in a tree. Fix open calls in tree 't', | ||
96 | ** making them refer to their respective rules or raising appropriate | ||
97 | ** errors (if not inside a grammar). Correct associativity of associative | ||
98 | ** constructions (making them right associative). Assume that tree's | ||
99 | ** ktable is at the top of the stack (for error messages). | ||
100 | */ | ||
101 | static void finalfix (lua_State *L, int postable, TTree *g, TTree *t) { | ||
102 | tailcall: | ||
103 | switch (t->tag) { | ||
104 | case TGrammar: /* subgrammars were already fixed */ | ||
105 | return; | ||
106 | case TOpenCall: { | ||
107 | if (g != NULL) /* inside a grammar? */ | ||
108 | fixonecall(L, postable, g, t); | ||
109 | else { /* open call outside grammar */ | ||
110 | lua_rawgeti(L, -1, t->key); | ||
111 | luaL_error(L, "rule '%s' used outside a grammar", val2str(L, -1)); | ||
112 | } | ||
113 | break; | ||
114 | } | ||
115 | case TSeq: case TChoice: | ||
116 | correctassociativity(t); | ||
117 | break; | ||
118 | } | ||
119 | switch (numsiblings[t->tag]) { | ||
120 | case 1: /* finalfix(L, postable, g, sib1(t)); */ | ||
121 | t = sib1(t); goto tailcall; | ||
122 | case 2: | ||
123 | finalfix(L, postable, g, sib1(t)); | ||
124 | t = sib2(t); goto tailcall; /* finalfix(L, postable, g, sib2(t)); */ | ||
125 | default: assert(numsiblings[t->tag] == 0); break; | ||
126 | } | ||
127 | } | ||
128 | |||
129 | |||
130 | /* | ||
131 | ** {====================================================== | ||
132 | ** Tree generation | ||
133 | ** ======================================================= | ||
134 | */ | ||
135 | |||
136 | /* | ||
137 | ** In 5.2, could use 'luaL_testudata'... | ||
138 | */ | ||
139 | static int testpattern (lua_State *L, int idx) { | ||
140 | if (lua_touserdata(L, idx)) { /* value is a userdata? */ | ||
141 | if (lua_getmetatable(L, idx)) { /* does it have a metatable? */ | ||
142 | luaL_getmetatable(L, PATTERN_T); | ||
143 | if (lua_rawequal(L, -1, -2)) { /* does it have the correct mt? */ | ||
144 | lua_pop(L, 2); /* remove both metatables */ | ||
145 | return 1; | ||
146 | } | ||
147 | } | ||
148 | } | ||
149 | return 0; | ||
150 | } | ||
151 | |||
152 | |||
153 | static Pattern *getpattern (lua_State *L, int idx) { | ||
154 | return (Pattern *)luaL_checkudata(L, idx, PATTERN_T); | ||
155 | } | ||
156 | |||
157 | |||
158 | static int getsize (lua_State *L, int idx) { | ||
159 | return (lua_objlen(L, idx) - sizeof(Pattern)) / sizeof(TTree) + 1; | ||
160 | } | ||
161 | |||
162 | |||
163 | static TTree *gettree (lua_State *L, int idx, int *len) { | ||
164 | Pattern *p = getpattern(L, idx); | ||
165 | if (len) | ||
166 | *len = getsize(L, idx); | ||
167 | return p->tree; | ||
168 | } | ||
169 | |||
170 | |||
171 | /* | ||
172 | ** create a pattern | ||
173 | */ | ||
174 | static TTree *newtree (lua_State *L, int len) { | ||
175 | size_t size = (len - 1) * sizeof(TTree) + sizeof(Pattern); | ||
176 | Pattern *p = (Pattern *)lua_newuserdata(L, size); | ||
177 | luaL_getmetatable(L, PATTERN_T); | ||
178 | lua_setmetatable(L, -2); | ||
179 | p->code = NULL; p->codesize = 0; | ||
180 | return p->tree; | ||
181 | } | ||
182 | |||
183 | |||
184 | static TTree *newleaf (lua_State *L, int tag) { | ||
185 | TTree *tree = newtree(L, 1); | ||
186 | tree->tag = tag; | ||
187 | return tree; | ||
188 | } | ||
189 | |||
190 | |||
191 | /* labeled failure begin */ | ||
192 | static TTree *newlabelleaf (lua_State *L, Labelset ls) { | ||
193 | TTree *tree = newtree(L, 1); | ||
194 | tree->tag = TThrow; | ||
195 | tree->labels = ls; | ||
196 | return tree; | ||
197 | } | ||
198 | /* labeled failure end */ | ||
199 | |||
200 | |||
201 | static TTree *newcharset (lua_State *L) { | ||
202 | TTree *tree = newtree(L, bytes2slots(CHARSETSIZE) + 1); | ||
203 | tree->tag = TSet; | ||
204 | loopset(i, treebuffer(tree)[i] = 0); | ||
205 | return tree; | ||
206 | } | ||
207 | |||
208 | |||
209 | /* | ||
210 | ** add to tree a sequence where first sibling is 'sib' (with size | ||
211 | ** 'sibsize'); returns position for second sibling | ||
212 | */ | ||
213 | static TTree *seqaux (TTree *tree, TTree *sib, int sibsize) { | ||
214 | tree->tag = TSeq; tree->u.ps = sibsize + 1; | ||
215 | memcpy(sib1(tree), sib, sibsize * sizeof(TTree)); | ||
216 | return sib2(tree); | ||
217 | } | ||
218 | |||
219 | |||
220 | /* | ||
221 | ** Add element 'idx' to 'ktable' of pattern at the top of the stack; | ||
222 | ** create new 'ktable' if necessary. Return index of new element. | ||
223 | */ | ||
224 | static int addtoktable (lua_State *L, int idx) { | ||
225 | if (idx == 0 || lua_isnil(L, idx)) /* no actual value to insert? */ | ||
226 | return 0; | ||
227 | else { | ||
228 | int n; | ||
229 | lua_getfenv(L, -1); /* get ktable from pattern */ | ||
230 | n = lua_objlen(L, -1); | ||
231 | if (n == 0) { /* is it empty/non-existent? */ | ||
232 | lua_pop(L, 1); /* remove it */ | ||
233 | lua_createtable(L, 1, 0); /* create a fresh table */ | ||
234 | } | ||
235 | lua_pushvalue(L, idx); /* element to be added */ | ||
236 | lua_rawseti(L, -2, n + 1); | ||
237 | lua_setfenv(L, -2); /* set it as ktable for pattern */ | ||
238 | return n + 1; | ||
239 | } | ||
240 | } | ||
241 | |||
242 | |||
243 | /* | ||
244 | ** Build a sequence of 'n' nodes, each with tag 'tag' and 'u.n' got | ||
245 | ** from the array 's' (or 0 if array is NULL). (TSeq is binary, so it | ||
246 | ** must build a sequence of sequence of sequence...) | ||
247 | */ | ||
248 | static void fillseq (TTree *tree, int tag, int n, const char *s) { | ||
249 | int i; | ||
250 | for (i = 0; i < n - 1; i++) { /* initial n-1 copies of Seq tag; Seq ... */ | ||
251 | tree->tag = TSeq; tree->u.ps = 2; | ||
252 | sib1(tree)->tag = tag; | ||
253 | sib1(tree)->u.n = s ? (byte)s[i] : 0; | ||
254 | tree = sib2(tree); | ||
255 | } | ||
256 | tree->tag = tag; /* last one does not need TSeq */ | ||
257 | tree->u.n = s ? (byte)s[i] : 0; | ||
258 | } | ||
259 | |||
260 | |||
261 | /* | ||
262 | ** Numbers as patterns: | ||
263 | ** 0 == true (always match); n == TAny repeated 'n' times; | ||
264 | ** -n == not (TAny repeated 'n' times) | ||
265 | */ | ||
266 | static TTree *numtree (lua_State *L, int n) { | ||
267 | if (n == 0) | ||
268 | return newleaf(L, TTrue); | ||
269 | else { | ||
270 | TTree *tree, *nd; | ||
271 | if (n > 0) | ||
272 | tree = nd = newtree(L, 2 * n - 1); | ||
273 | else { /* negative: code it as !(-n) */ | ||
274 | n = -n; | ||
275 | tree = newtree(L, 2 * n); | ||
276 | tree->tag = TNot; | ||
277 | nd = sib1(tree); | ||
278 | } | ||
279 | fillseq(nd, TAny, n, NULL); /* sequence of 'n' any's */ | ||
280 | return tree; | ||
281 | } | ||
282 | } | ||
283 | |||
284 | |||
285 | /* | ||
286 | ** Convert value at index 'idx' to a pattern | ||
287 | */ | ||
288 | static TTree *getpatt (lua_State *L, int idx, int *len) { | ||
289 | TTree *tree; | ||
290 | switch (lua_type(L, idx)) { | ||
291 | case LUA_TSTRING: { | ||
292 | size_t slen; | ||
293 | const char *s = lua_tolstring(L, idx, &slen); /* get string */ | ||
294 | if (slen == 0) /* empty? */ | ||
295 | tree = newleaf(L, TTrue); /* always match */ | ||
296 | else { | ||
297 | tree = newtree(L, 2 * (slen - 1) + 1); | ||
298 | fillseq(tree, TChar, slen, s); /* sequence of 'slen' chars */ | ||
299 | } | ||
300 | break; | ||
301 | } | ||
302 | case LUA_TNUMBER: { | ||
303 | int n = lua_tointeger(L, idx); | ||
304 | tree = numtree(L, n); | ||
305 | break; | ||
306 | } | ||
307 | case LUA_TBOOLEAN: { | ||
308 | tree = (lua_toboolean(L, idx) ? newleaf(L, TTrue) : newleaf(L, TFalse)); | ||
309 | break; | ||
310 | } | ||
311 | case LUA_TTABLE: { | ||
312 | tree = newgrammar(L, idx); | ||
313 | break; | ||
314 | } | ||
315 | case LUA_TFUNCTION: { | ||
316 | tree = newtree(L, 2); | ||
317 | tree->tag = TRunTime; | ||
318 | tree->key = addtoktable(L, idx); | ||
319 | sib1(tree)->tag = TTrue; | ||
320 | break; | ||
321 | } | ||
322 | default: { | ||
323 | return gettree(L, idx, len); | ||
324 | } | ||
325 | } | ||
326 | lua_replace(L, idx); /* put new tree into 'idx' slot */ | ||
327 | if (len) | ||
328 | *len = getsize(L, idx); | ||
329 | return tree; | ||
330 | } | ||
331 | |||
332 | |||
333 | /* | ||
334 | ** Return the number of elements in the ktable of pattern at 'idx'. | ||
335 | ** In Lua 5.2, default "environment" for patterns is nil, not | ||
336 | ** a table. Treat it as an empty table. In Lua 5.1, assumes that | ||
337 | ** the environment has no numeric indices (len == 0) | ||
338 | */ | ||
339 | static int ktablelen (lua_State *L, int idx) { | ||
340 | if (!lua_istable(L, idx)) return 0; | ||
341 | else return lua_objlen(L, idx); | ||
342 | } | ||
343 | |||
344 | |||
345 | /* | ||
346 | ** Concatentate the contents of table 'idx1' into table 'idx2'. | ||
347 | ** (Assume that both indices are negative.) | ||
348 | ** Return the original length of table 'idx2' | ||
349 | */ | ||
350 | static int concattable (lua_State *L, int idx1, int idx2) { | ||
351 | int i; | ||
352 | int n1 = ktablelen(L, idx1); | ||
353 | int n2 = ktablelen(L, idx2); | ||
354 | if (n1 == 0) return 0; /* nothing to correct */ | ||
355 | for (i = 1; i <= n1; i++) { | ||
356 | lua_rawgeti(L, idx1, i); | ||
357 | lua_rawseti(L, idx2 - 1, n2 + i); /* correct 'idx2' */ | ||
358 | } | ||
359 | return n2; | ||
360 | } | ||
361 | |||
362 | |||
363 | /* | ||
364 | ** Make a merge of ktables from p1 and p2 the ktable for the new | ||
365 | ** pattern at the top of the stack. | ||
366 | */ | ||
367 | static int joinktables (lua_State *L, int p1, int p2) { | ||
368 | int n1, n2; | ||
369 | lua_getfenv(L, p1); /* get ktables */ | ||
370 | lua_getfenv(L, p2); | ||
371 | n1 = ktablelen(L, -2); | ||
372 | n2 = ktablelen(L, -1); | ||
373 | if (n1 == 0 && n2 == 0) { /* are both tables empty? */ | ||
374 | lua_pop(L, 2); /* nothing to be done; pop tables */ | ||
375 | return 0; /* nothing to correct */ | ||
376 | } | ||
377 | if (n2 == 0 || lua_equal(L, -2, -1)) { /* second table is empty or equal? */ | ||
378 | lua_pop(L, 1); /* pop 2nd table */ | ||
379 | lua_setfenv(L, -2); /* set 1st ktable into new pattern */ | ||
380 | return 0; /* nothing to correct */ | ||
381 | } | ||
382 | if (n1 == 0) { /* first table is empty? */ | ||
383 | lua_setfenv(L, -3); /* set 2nd table into new pattern */ | ||
384 | lua_pop(L, 1); /* pop 1st table */ | ||
385 | return 0; /* nothing to correct */ | ||
386 | } | ||
387 | else { | ||
388 | lua_createtable(L, n1 + n2, 0); /* create ktable for new pattern */ | ||
389 | /* stack: new p; ktable p1; ktable p2; new ktable */ | ||
390 | concattable(L, -3, -1); /* from p1 into new ktable */ | ||
391 | concattable(L, -2, -1); /* from p2 into new ktable */ | ||
392 | lua_setfenv(L, -4); /* new ktable becomes p env */ | ||
393 | lua_pop(L, 2); /* pop other ktables */ | ||
394 | return n1; /* correction for indices from p2 */ | ||
395 | } | ||
396 | } | ||
397 | |||
398 | |||
399 | static void correctkeys (TTree *tree, int n) { | ||
400 | if (n == 0) return; /* no correction? */ | ||
401 | tailcall: | ||
402 | switch (tree->tag) { | ||
403 | case TOpenCall: case TCall: case TRunTime: case TRule: { | ||
404 | if (tree->key > 0) | ||
405 | tree->key += n; | ||
406 | break; | ||
407 | } | ||
408 | case TCapture: { | ||
409 | if (tree->key > 0 && tree->cap != Carg && tree->cap != Cnum) | ||
410 | tree->key += n; | ||
411 | break; | ||
412 | } | ||
413 | default: break; | ||
414 | } | ||
415 | switch (numsiblings[tree->tag]) { | ||
416 | case 1: /* correctkeys(sib1(tree), n); */ | ||
417 | tree = sib1(tree); goto tailcall; | ||
418 | case 2: | ||
419 | correctkeys(sib1(tree), n); | ||
420 | tree = sib2(tree); goto tailcall; /* correctkeys(sib2(tree), n); */ | ||
421 | default: assert(numsiblings[tree->tag] == 0); break; | ||
422 | } | ||
423 | } | ||
424 | |||
425 | |||
426 | /* | ||
427 | ** copy 'ktable' of element 'idx' to new tree (on top of stack) | ||
428 | */ | ||
429 | static void copyktable (lua_State *L, int idx) { | ||
430 | lua_getfenv(L, idx); | ||
431 | lua_setfenv(L, -2); | ||
432 | } | ||
433 | |||
434 | |||
435 | /* | ||
436 | ** merge 'ktable' from rule at stack index 'idx' into 'ktable' | ||
437 | ** from tree at the top of the stack, and correct corresponding | ||
438 | ** tree. | ||
439 | */ | ||
440 | static void mergektable (lua_State *L, int idx, TTree *rule) { | ||
441 | int n; | ||
442 | lua_getfenv(L, -1); /* get ktables */ | ||
443 | lua_getfenv(L, idx); | ||
444 | n = concattable(L, -1, -2); | ||
445 | lua_pop(L, 2); /* remove both ktables */ | ||
446 | correctkeys(rule, n); | ||
447 | } | ||
448 | |||
449 | |||
450 | /* | ||
451 | ** create a new tree, whith a new root and one sibling. | ||
452 | ** Sibling must be on the Lua stack, at index 1. | ||
453 | */ | ||
454 | static TTree *newroot1sib (lua_State *L, int tag) { | ||
455 | int s1; | ||
456 | TTree *tree1 = getpatt(L, 1, &s1); | ||
457 | TTree *tree = newtree(L, 1 + s1); /* create new tree */ | ||
458 | tree->tag = tag; | ||
459 | memcpy(sib1(tree), tree1, s1 * sizeof(TTree)); | ||
460 | copyktable(L, 1); | ||
461 | return tree; | ||
462 | } | ||
463 | |||
464 | |||
465 | /* | ||
466 | ** create a new tree, whith a new root and 2 siblings. | ||
467 | ** Siblings must be on the Lua stack, first one at index 1. | ||
468 | */ | ||
469 | static TTree *newroot2sib (lua_State *L, int tag) { | ||
470 | int s1, s2; | ||
471 | TTree *tree1 = getpatt(L, 1, &s1); | ||
472 | TTree *tree2 = getpatt(L, 2, &s2); | ||
473 | TTree *tree = newtree(L, 1 + s1 + s2); /* create new tree */ | ||
474 | tree->tag = tag; | ||
475 | tree->u.ps = 1 + s1; | ||
476 | memcpy(sib1(tree), tree1, s1 * sizeof(TTree)); | ||
477 | memcpy(sib2(tree), tree2, s2 * sizeof(TTree)); | ||
478 | correctkeys(sib2(tree), joinktables(L, 1, 2)); | ||
479 | return tree; | ||
480 | } | ||
481 | |||
482 | |||
483 | static int lp_P (lua_State *L) { | ||
484 | luaL_checkany(L, 1); | ||
485 | getpatt(L, 1, NULL); | ||
486 | lua_settop(L, 1); | ||
487 | return 1; | ||
488 | } | ||
489 | |||
490 | |||
491 | /* | ||
492 | ** sequence operator; optimizations: | ||
493 | ** false x => false, x true => x, true x => x | ||
494 | ** (cannot do x . false => false because x may have runtime captures) | ||
495 | */ | ||
496 | static int lp_seq (lua_State *L) { | ||
497 | TTree *tree1 = getpatt(L, 1, NULL); | ||
498 | TTree *tree2 = getpatt(L, 2, NULL); | ||
499 | if (tree1->tag == TFalse || tree2->tag == TTrue) | ||
500 | lua_pushvalue(L, 1); /* false . x == false, x . true = x */ | ||
501 | else if (tree1->tag == TTrue) | ||
502 | lua_pushvalue(L, 2); /* true . x = x */ | ||
503 | else | ||
504 | newroot2sib(L, TSeq); | ||
505 | return 1; | ||
506 | } | ||
507 | |||
508 | |||
509 | /* | ||
510 | ** choice operator; optimizations: | ||
511 | ** charset / charset => charset | ||
512 | ** true / x => true, x / false => x, false / x => x | ||
513 | ** (x / true is not equivalent to true) | ||
514 | */ | ||
515 | static int lp_choice (lua_State *L) { | ||
516 | Charset st1, st2; | ||
517 | TTree *t1 = getpatt(L, 1, NULL); | ||
518 | TTree *t2 = getpatt(L, 2, NULL); | ||
519 | if (tocharset(t1, &st1) && tocharset(t2, &st2)) { | ||
520 | TTree *t = newcharset(L); | ||
521 | loopset(i, treebuffer(t)[i] = st1.cs[i] | st2.cs[i]); | ||
522 | } | ||
523 | else if (nofail(t1) || t2->tag == TFalse) | ||
524 | lua_pushvalue(L, 1); /* true / x => true, x / false => x */ | ||
525 | else if (t1->tag == TFalse) | ||
526 | lua_pushvalue(L, 2); /* false / x => x */ | ||
527 | else | ||
528 | newroot2sib(L, TChoice); | ||
529 | return 1; | ||
530 | } | ||
531 | |||
532 | |||
533 | /* | ||
534 | ** p^n | ||
535 | */ | ||
536 | static int lp_star (lua_State *L) { | ||
537 | int size1; | ||
538 | int n = luaL_checkint(L, 2); | ||
539 | TTree *tree1 = gettree(L, 1, &size1); | ||
540 | if (n >= 0) { /* seq tree1 (seq tree1 ... (seq tree1 (rep tree1))) */ | ||
541 | TTree *tree = newtree(L, (n + 1) * (size1 + 1)); | ||
542 | if (nullable(tree1)) | ||
543 | luaL_error(L, "loop body may accept empty string"); | ||
544 | while (n--) /* repeat 'n' times */ | ||
545 | tree = seqaux(tree, tree1, size1); | ||
546 | tree->tag = TRep; | ||
547 | memcpy(sib1(tree), tree1, size1 * sizeof(TTree)); | ||
548 | } | ||
549 | else { /* choice (seq tree1 ... choice tree1 true ...) true */ | ||
550 | TTree *tree; | ||
551 | n = -n; | ||
552 | /* size = (choice + seq + tree1 + true) * n, but the last has no seq */ | ||
553 | tree = newtree(L, n * (size1 + 3) - 1); | ||
554 | for (; n > 1; n--) { /* repeat (n - 1) times */ | ||
555 | tree->tag = TChoice; tree->u.ps = n * (size1 + 3) - 2; | ||
556 | sib2(tree)->tag = TTrue; | ||
557 | tree = sib1(tree); | ||
558 | tree = seqaux(tree, tree1, size1); | ||
559 | } | ||
560 | tree->tag = TChoice; tree->u.ps = size1 + 1; | ||
561 | sib2(tree)->tag = TTrue; | ||
562 | memcpy(sib1(tree), tree1, size1 * sizeof(TTree)); | ||
563 | } | ||
564 | copyktable(L, 1); | ||
565 | return 1; | ||
566 | } | ||
567 | |||
568 | |||
569 | /* | ||
570 | ** #p == &p | ||
571 | */ | ||
572 | static int lp_and (lua_State *L) { | ||
573 | newroot1sib(L, TAnd); | ||
574 | return 1; | ||
575 | } | ||
576 | |||
577 | |||
578 | /* | ||
579 | ** -p == !p | ||
580 | */ | ||
581 | static int lp_not (lua_State *L) { | ||
582 | newroot1sib(L, TNot); | ||
583 | return 1; | ||
584 | } | ||
585 | |||
586 | |||
587 | /* | ||
588 | ** [t1 - t2] == Seq (Not t2) t1 | ||
589 | ** If t1 and t2 are charsets, make their difference. | ||
590 | */ | ||
591 | static int lp_sub (lua_State *L) { | ||
592 | Charset st1, st2; | ||
593 | int s1, s2; | ||
594 | TTree *t1 = getpatt(L, 1, &s1); | ||
595 | TTree *t2 = getpatt(L, 2, &s2); | ||
596 | if (tocharset(t1, &st1) && tocharset(t2, &st2)) { | ||
597 | TTree *t = newcharset(L); | ||
598 | loopset(i, treebuffer(t)[i] = st1.cs[i] & ~st2.cs[i]); | ||
599 | } | ||
600 | else { | ||
601 | TTree *tree = newtree(L, 2 + s1 + s2); | ||
602 | tree->tag = TSeq; /* sequence of... */ | ||
603 | tree->u.ps = 2 + s2; | ||
604 | sib1(tree)->tag = TNot; /* ...not... */ | ||
605 | memcpy(sib1(sib1(tree)), t2, s2 * sizeof(TTree)); /* ...t2 */ | ||
606 | memcpy(sib2(tree), t1, s1 * sizeof(TTree)); /* ... and t1 */ | ||
607 | correctkeys(sib1(tree), joinktables(L, 1, 2)); | ||
608 | } | ||
609 | return 1; | ||
610 | } | ||
611 | |||
612 | |||
613 | static int lp_set (lua_State *L) { | ||
614 | size_t l; | ||
615 | const char *s = luaL_checklstring(L, 1, &l); | ||
616 | TTree *tree = newcharset(L); | ||
617 | while (l--) { | ||
618 | setchar(treebuffer(tree), (byte)(*s)); | ||
619 | s++; | ||
620 | } | ||
621 | return 1; | ||
622 | } | ||
623 | |||
624 | |||
625 | static int lp_range (lua_State *L) { | ||
626 | int arg; | ||
627 | int top = lua_gettop(L); | ||
628 | TTree *tree = newcharset(L); | ||
629 | for (arg = 1; arg <= top; arg++) { | ||
630 | int c; | ||
631 | size_t l; | ||
632 | const char *r = luaL_checklstring(L, arg, &l); | ||
633 | luaL_argcheck(L, l == 2, arg, "range must have two characters"); | ||
634 | for (c = (byte)r[0]; c <= (byte)r[1]; c++) | ||
635 | setchar(treebuffer(tree), c); | ||
636 | } | ||
637 | return 1; | ||
638 | } | ||
639 | |||
640 | |||
641 | /* | ||
642 | ** Look-behind predicate | ||
643 | */ | ||
644 | static int lp_behind (lua_State *L) { | ||
645 | TTree *tree; | ||
646 | TTree *tree1 = getpatt(L, 1, NULL); | ||
647 | int n = fixedlen(tree1); | ||
648 | luaL_argcheck(L, !hascaptures(tree1), 1, "pattern have captures"); | ||
649 | luaL_argcheck(L, n > 0, 1, "pattern may not have fixed length"); | ||
650 | luaL_argcheck(L, n <= MAXBEHIND, 1, "pattern too long to look behind"); | ||
651 | tree = newroot1sib(L, TBehind); | ||
652 | tree->u.n = n; | ||
653 | return 1; | ||
654 | } | ||
655 | |||
656 | |||
657 | /* labeled failure begin */ | ||
658 | /* | ||
659 | ** Throws a label or a set of labels | ||
660 | */ | ||
661 | static int lp_throw (lua_State *L) { | ||
662 | int n = lua_gettop(L); | ||
663 | Labelset ls = 0; | ||
664 | int i; | ||
665 | for (i = 1; i <= n; i++) { | ||
666 | int d = luaL_checkint(L, i); | ||
667 | luaL_argcheck(L, d >= 0 && d < (int)MAXLABELS, i, "invalid label index"); | ||
668 | setlabel(ls, d); | ||
669 | } | ||
670 | newlabelleaf(L, ls); | ||
671 | return 1; | ||
672 | } | ||
673 | |||
674 | /* | ||
675 | ** labeled choice function | ||
676 | */ | ||
677 | static int lp_labchoice (lua_State *L) { | ||
678 | TTree *tree; | ||
679 | int n = lua_gettop(L); | ||
680 | int i; | ||
681 | Labelset ls = 0; | ||
682 | for (i = 3; i <= n; i++) { | ||
683 | int d = luaL_checkint(L, i); | ||
684 | luaL_argcheck(L, d >= 0 && d < (int)MAXLABELS, i, "invalid label index"); | ||
685 | setlabel(ls, d); | ||
686 | } | ||
687 | tree = newroot2sib(L, TLabChoice); | ||
688 | tree->labels = ls; | ||
689 | return 1; | ||
690 | } | ||
691 | /* labeled failure end */ | ||
692 | |||
693 | |||
694 | /* | ||
695 | ** Create a non-terminal | ||
696 | */ | ||
697 | static int lp_V (lua_State *L) { | ||
698 | TTree *tree = newleaf(L, TOpenCall); | ||
699 | luaL_argcheck(L, !lua_isnoneornil(L, 1), 1, "non-nil value expected"); | ||
700 | tree->key = addtoktable(L, 1); | ||
701 | return 1; | ||
702 | } | ||
703 | |||
704 | |||
705 | /* | ||
706 | ** Create a tree for a non-empty capture, with a body and | ||
707 | ** optionally with an associated Lua value (at index 'labelidx' in the | ||
708 | ** stack) | ||
709 | */ | ||
710 | static int capture_aux (lua_State *L, int cap, int labelidx) { | ||
711 | TTree *tree = newroot1sib(L, TCapture); | ||
712 | tree->cap = cap; | ||
713 | tree->key = addtoktable(L, labelidx); | ||
714 | return 1; | ||
715 | } | ||
716 | |||
717 | |||
718 | /* | ||
719 | ** Fill a tree with an empty capture, using an empty (TTrue) sibling. | ||
720 | */ | ||
721 | static TTree *auxemptycap (lua_State *L, TTree *tree, int cap, int idx) { | ||
722 | tree->tag = TCapture; | ||
723 | tree->cap = cap; | ||
724 | tree->key = addtoktable(L, idx); | ||
725 | sib1(tree)->tag = TTrue; | ||
726 | return tree; | ||
727 | } | ||
728 | |||
729 | |||
730 | /* | ||
731 | ** Create a tree for an empty capture | ||
732 | */ | ||
733 | static TTree *newemptycap (lua_State *L, int cap, int idx) { | ||
734 | return auxemptycap(L, newtree(L, 2), cap, idx); | ||
735 | } | ||
736 | |||
737 | |||
738 | /* | ||
739 | ** Captures with syntax p / v | ||
740 | ** (function capture, query capture, string capture, or number capture) | ||
741 | */ | ||
742 | static int lp_divcapture (lua_State *L) { | ||
743 | switch (lua_type(L, 2)) { | ||
744 | case LUA_TFUNCTION: return capture_aux(L, Cfunction, 2); | ||
745 | case LUA_TTABLE: return capture_aux(L, Cquery, 2); | ||
746 | case LUA_TSTRING: return capture_aux(L, Cstring, 2); | ||
747 | case LUA_TNUMBER: { | ||
748 | int n = lua_tointeger(L, 2); | ||
749 | TTree *tree = newroot1sib(L, TCapture); | ||
750 | luaL_argcheck(L, 0 <= n && n <= SHRT_MAX, 1, "invalid number"); | ||
751 | tree->cap = Cnum; | ||
752 | tree->key = n; | ||
753 | return 1; | ||
754 | } | ||
755 | default: return luaL_argerror(L, 2, "invalid replacement value"); | ||
756 | } | ||
757 | } | ||
758 | |||
759 | |||
760 | static int lp_substcapture (lua_State *L) { | ||
761 | return capture_aux(L, Csubst, 0); | ||
762 | } | ||
763 | |||
764 | |||
765 | static int lp_tablecapture (lua_State *L) { | ||
766 | return capture_aux(L, Ctable, 0); | ||
767 | } | ||
768 | |||
769 | |||
770 | static int lp_groupcapture (lua_State *L) { | ||
771 | if (lua_isnoneornil(L, 2)) | ||
772 | return capture_aux(L, Cgroup, 0); | ||
773 | else { | ||
774 | luaL_checkstring(L, 2); | ||
775 | return capture_aux(L, Cgroup, 2); | ||
776 | } | ||
777 | } | ||
778 | |||
779 | |||
780 | static int lp_foldcapture (lua_State *L) { | ||
781 | luaL_checktype(L, 2, LUA_TFUNCTION); | ||
782 | return capture_aux(L, Cfold, 2); | ||
783 | } | ||
784 | |||
785 | |||
786 | static int lp_simplecapture (lua_State *L) { | ||
787 | return capture_aux(L, Csimple, 0); | ||
788 | } | ||
789 | |||
790 | |||
791 | static int lp_poscapture (lua_State *L) { | ||
792 | newemptycap(L, Cposition, 0); | ||
793 | return 1; | ||
794 | } | ||
795 | |||
796 | |||
797 | static int lp_argcapture (lua_State *L) { | ||
798 | int n = luaL_checkint(L, 1); | ||
799 | TTree *tree = newemptycap(L, Carg, 0); | ||
800 | tree->key = n; | ||
801 | luaL_argcheck(L, 0 < n && n <= SHRT_MAX, 1, "invalid argument index"); | ||
802 | return 1; | ||
803 | } | ||
804 | |||
805 | |||
806 | static int lp_backref (lua_State *L) { | ||
807 | luaL_checkstring(L, 1); | ||
808 | newemptycap(L, Cbackref, 1); | ||
809 | return 1; | ||
810 | } | ||
811 | |||
812 | |||
813 | /* | ||
814 | ** Constant capture | ||
815 | */ | ||
816 | static int lp_constcapture (lua_State *L) { | ||
817 | int i; | ||
818 | int n = lua_gettop(L); /* number of values */ | ||
819 | if (n == 0) /* no values? */ | ||
820 | newleaf(L, TTrue); /* no capture */ | ||
821 | else if (n == 1) | ||
822 | newemptycap(L, Cconst, 1); /* single constant capture */ | ||
823 | else { /* create a group capture with all values */ | ||
824 | TTree *tree = newtree(L, 1 + 3 * (n - 1) + 2); | ||
825 | tree->tag = TCapture; | ||
826 | tree->cap = Cgroup; | ||
827 | tree->key = 0; | ||
828 | tree = sib1(tree); | ||
829 | for (i = 1; i <= n - 1; i++) { | ||
830 | tree->tag = TSeq; | ||
831 | tree->u.ps = 3; /* skip TCapture and its sibling */ | ||
832 | auxemptycap(L, sib1(tree), Cconst, i); | ||
833 | tree = sib2(tree); | ||
834 | } | ||
835 | auxemptycap(L, tree, Cconst, i); | ||
836 | } | ||
837 | return 1; | ||
838 | } | ||
839 | |||
840 | |||
841 | static int lp_matchtime (lua_State *L) { | ||
842 | TTree *tree; | ||
843 | luaL_checktype(L, 2, LUA_TFUNCTION); | ||
844 | tree = newroot1sib(L, TRunTime); | ||
845 | tree->key = addtoktable(L, 2); | ||
846 | return 1; | ||
847 | } | ||
848 | |||
849 | /* }====================================================== */ | ||
850 | |||
851 | |||
852 | /* | ||
853 | ** {====================================================== | ||
854 | ** Grammar - Tree generation | ||
855 | ** ======================================================= | ||
856 | */ | ||
857 | |||
858 | /* | ||
859 | ** push on the stack the index and the pattern for the | ||
860 | ** initial rule of grammar at index 'arg' in the stack; | ||
861 | ** also add that index into position table. | ||
862 | */ | ||
863 | static void getfirstrule (lua_State *L, int arg, int postab) { | ||
864 | lua_rawgeti(L, arg, 1); /* access first element */ | ||
865 | if (lua_isstring(L, -1)) { /* is it the name of initial rule? */ | ||
866 | lua_pushvalue(L, -1); /* duplicate it to use as key */ | ||
867 | lua_gettable(L, arg); /* get associated rule */ | ||
868 | } | ||
869 | else { | ||
870 | lua_pushinteger(L, 1); /* key for initial rule */ | ||
871 | lua_insert(L, -2); /* put it before rule */ | ||
872 | } | ||
873 | if (!testpattern(L, -1)) { /* initial rule not a pattern? */ | ||
874 | if (lua_isnil(L, -1)) | ||
875 | luaL_error(L, "grammar has no initial rule"); | ||
876 | else | ||
877 | luaL_error(L, "initial rule '%s' is not a pattern", lua_tostring(L, -2)); | ||
878 | } | ||
879 | lua_pushvalue(L, -2); /* push key */ | ||
880 | lua_pushinteger(L, 1); /* push rule position (after TGrammar) */ | ||
881 | lua_settable(L, postab); /* insert pair at position table */ | ||
882 | } | ||
883 | |||
884 | /* | ||
885 | ** traverse grammar at index 'arg', pushing all its keys and patterns | ||
886 | ** into the stack. Create a new table (before all pairs key-pattern) to | ||
887 | ** collect all keys and their associated positions in the final tree | ||
888 | ** (the "position table"). | ||
889 | ** Return the number of rules and (in 'totalsize') the total size | ||
890 | ** for the new tree. | ||
891 | */ | ||
892 | static int collectrules (lua_State *L, int arg, int *totalsize) { | ||
893 | int n = 1; /* to count number of rules */ | ||
894 | int postab = lua_gettop(L) + 1; /* index of position table */ | ||
895 | int size; /* accumulator for total size */ | ||
896 | lua_newtable(L); /* create position table */ | ||
897 | getfirstrule(L, arg, postab); | ||
898 | size = 2 + getsize(L, postab + 2); /* TGrammar + TRule + rule */ | ||
899 | lua_pushnil(L); /* prepare to traverse grammar table */ | ||
900 | while (lua_next(L, arg) != 0) { | ||
901 | if (lua_tonumber(L, -2) == 1 || | ||
902 | lua_equal(L, -2, postab + 1)) { /* initial rule? */ | ||
903 | lua_pop(L, 1); /* remove value (keep key for lua_next) */ | ||
904 | continue; | ||
905 | } | ||
906 | if (!testpattern(L, -1)) /* value is not a pattern? */ | ||
907 | luaL_error(L, "rule '%s' is not a pattern", val2str(L, -2)); | ||
908 | luaL_checkstack(L, LUA_MINSTACK, "grammar has too many rules"); | ||
909 | lua_pushvalue(L, -2); /* push key (to insert into position table) */ | ||
910 | lua_pushinteger(L, size); | ||
911 | lua_settable(L, postab); | ||
912 | size += 1 + getsize(L, -1); /* update size */ | ||
913 | lua_pushvalue(L, -2); /* push key (for next lua_next) */ | ||
914 | n++; | ||
915 | } | ||
916 | *totalsize = size + 1; /* TTrue to finish list of rules */ | ||
917 | return n; | ||
918 | } | ||
919 | |||
920 | |||
921 | static void buildgrammar (lua_State *L, TTree *grammar, int frule, int n) { | ||
922 | int i; | ||
923 | TTree *nd = sib1(grammar); /* auxiliary pointer to traverse the tree */ | ||
924 | for (i = 0; i < n; i++) { /* add each rule into new tree */ | ||
925 | int ridx = frule + 2*i + 1; /* index of i-th rule */ | ||
926 | int rulesize; | ||
927 | TTree *rn = gettree(L, ridx, &rulesize); | ||
928 | nd->tag = TRule; | ||
929 | nd->key = 0; | ||
930 | nd->cap = i; /* rule number */ | ||
931 | nd->u.ps = rulesize + 1; /* point to next rule */ | ||
932 | memcpy(sib1(nd), rn, rulesize * sizeof(TTree)); /* copy rule */ | ||
933 | mergektable(L, ridx, sib1(nd)); /* merge its ktable into new one */ | ||
934 | nd = sib2(nd); /* move to next rule */ | ||
935 | } | ||
936 | nd->tag = TTrue; /* finish list of rules */ | ||
937 | } | ||
938 | |||
939 | |||
940 | /* | ||
941 | ** Check whether a tree has potential infinite loops | ||
942 | */ | ||
943 | static int checkloops (TTree *tree) { | ||
944 | tailcall: | ||
945 | if (tree->tag == TRep && nullable(sib1(tree))) | ||
946 | return 1; | ||
947 | else if (tree->tag == TGrammar) | ||
948 | return 0; /* sub-grammars already checked */ | ||
949 | else { | ||
950 | switch (numsiblings[tree->tag]) { | ||
951 | case 1: /* return checkloops(sib1(tree)); */ | ||
952 | tree = sib1(tree); goto tailcall; | ||
953 | case 2: | ||
954 | if (checkloops(sib1(tree))) return 1; | ||
955 | /* else return checkloops(sib2(tree)); */ | ||
956 | tree = sib2(tree); goto tailcall; | ||
957 | default: assert(numsiblings[tree->tag] == 0); return 0; | ||
958 | } | ||
959 | } | ||
960 | } | ||
961 | |||
962 | |||
963 | static int verifyerror (lua_State *L, int *passed, int npassed) { | ||
964 | int i, j; | ||
965 | for (i = npassed - 1; i >= 0; i--) { /* search for a repetition */ | ||
966 | for (j = i - 1; j >= 0; j--) { | ||
967 | if (passed[i] == passed[j]) { | ||
968 | lua_rawgeti(L, -1, passed[i]); /* get rule's key */ | ||
969 | return luaL_error(L, "rule '%s' may be left recursive", val2str(L, -1)); | ||
970 | } | ||
971 | } | ||
972 | } | ||
973 | return luaL_error(L, "too many left calls in grammar"); | ||
974 | } | ||
975 | |||
976 | |||
977 | /* | ||
978 | ** Check whether a rule can be left recursive; raise an error in that | ||
979 | ** case; otherwise return 1 iff pattern is nullable. Assume ktable at | ||
980 | ** the top of the stack. | ||
981 | */ | ||
982 | static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed, | ||
983 | int nullable) { | ||
984 | tailcall: | ||
985 | switch (tree->tag) { | ||
986 | case TChar: case TSet: case TAny: | ||
987 | case TFalse: case TThrow: /* labeled failure */ | ||
988 | return nullable; /* cannot pass from here */ | ||
989 | case TTrue: | ||
990 | case TBehind: /* look-behind cannot have calls */ | ||
991 | return 1; | ||
992 | case TNot: case TAnd: case TRep: | ||
993 | /* return verifyrule(L, sib1(tree), passed, npassed, 1); */ | ||
994 | tree = sib1(tree); nullable = 1; goto tailcall; | ||
995 | case TCapture: case TRunTime: | ||
996 | /* return verifyrule(L, sib1(tree), passed, npassed); */ | ||
997 | tree = sib1(tree); goto tailcall; | ||
998 | case TCall: | ||
999 | /* return verifyrule(L, sib2(tree), passed, npassed); */ | ||
1000 | tree = sib2(tree); goto tailcall; | ||
1001 | case TSeq: /* only check 2nd child if first is nullable */ | ||
1002 | if (!verifyrule(L, sib1(tree), passed, npassed, 0)) | ||
1003 | return nullable; | ||
1004 | /* else return verifyrule(L, sib2(tree), passed, npassed); */ | ||
1005 | tree = sib2(tree); goto tailcall; | ||
1006 | case TChoice: case TLabChoice: /* must check both children */ /* labeled failure */ | ||
1007 | nullable = verifyrule(L, sib1(tree), passed, npassed, nullable); | ||
1008 | /* return verifyrule(L, sib2(tree), passed, npassed, nullable); */ | ||
1009 | tree = sib2(tree); goto tailcall; | ||
1010 | case TRule: | ||
1011 | if (npassed >= MAXRULES) | ||
1012 | return verifyerror(L, passed, npassed); | ||
1013 | else { | ||
1014 | passed[npassed++] = tree->key; | ||
1015 | /* return verifyrule(L, sib1(tree), passed, npassed); */ | ||
1016 | tree = sib1(tree); goto tailcall; | ||
1017 | } | ||
1018 | case TGrammar: | ||
1019 | return nullable(tree); /* sub-grammar cannot be left recursive */ | ||
1020 | default: assert(0); return 0; | ||
1021 | } | ||
1022 | } | ||
1023 | |||
1024 | |||
1025 | static void verifygrammar (lua_State *L, TTree *grammar) { | ||
1026 | int passed[MAXRULES]; | ||
1027 | TTree *rule; | ||
1028 | /* check left-recursive rules */ | ||
1029 | for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) { | ||
1030 | if (rule->key == 0) continue; /* unused rule */ | ||
1031 | verifyrule(L, sib1(rule), passed, 0, 0); | ||
1032 | } | ||
1033 | assert(rule->tag == TTrue); | ||
1034 | /* check infinite loops inside rules */ | ||
1035 | for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) { | ||
1036 | if (rule->key == 0) continue; /* unused rule */ | ||
1037 | if (checkloops(sib1(rule))) { | ||
1038 | lua_rawgeti(L, -1, rule->key); /* get rule's key */ | ||
1039 | luaL_error(L, "empty loop in rule '%s'", val2str(L, -1)); | ||
1040 | } | ||
1041 | } | ||
1042 | assert(rule->tag == TTrue); | ||
1043 | } | ||
1044 | |||
1045 | |||
1046 | /* | ||
1047 | ** Give a name for the initial rule if it is not referenced | ||
1048 | */ | ||
1049 | static void initialrulename (lua_State *L, TTree *grammar, int frule) { | ||
1050 | if (sib1(grammar)->key == 0) { /* initial rule is not referenced? */ | ||
1051 | int n = lua_objlen(L, -1) + 1; /* index for name */ | ||
1052 | lua_pushvalue(L, frule); /* rule's name */ | ||
1053 | lua_rawseti(L, -2, n); /* ktable was on the top of the stack */ | ||
1054 | sib1(grammar)->key = n; | ||
1055 | } | ||
1056 | } | ||
1057 | |||
1058 | |||
1059 | static TTree *newgrammar (lua_State *L, int arg) { | ||
1060 | int treesize; | ||
1061 | int frule = lua_gettop(L) + 2; /* position of first rule's key */ | ||
1062 | int n = collectrules(L, arg, &treesize); | ||
1063 | TTree *g = newtree(L, treesize); | ||
1064 | luaL_argcheck(L, n <= MAXRULES, arg, "grammar has too many rules"); | ||
1065 | g->tag = TGrammar; g->u.n = n; | ||
1066 | lua_newtable(L); /* create 'ktable' */ | ||
1067 | lua_setfenv(L, -2); | ||
1068 | buildgrammar(L, g, frule, n); | ||
1069 | lua_getfenv(L, -1); /* get 'ktable' for new tree */ | ||
1070 | finalfix(L, frule - 1, g, sib1(g)); | ||
1071 | initialrulename(L, g, frule); | ||
1072 | verifygrammar(L, g); | ||
1073 | lua_pop(L, 1); /* remove 'ktable' */ | ||
1074 | lua_insert(L, -(n * 2 + 2)); /* move new table to proper position */ | ||
1075 | lua_pop(L, n * 2 + 1); /* remove position table + rule pairs */ | ||
1076 | return g; /* new table at the top of the stack */ | ||
1077 | } | ||
1078 | |||
1079 | /* }====================================================== */ | ||
1080 | |||
1081 | |||
1082 | static Instruction *prepcompile (lua_State *L, Pattern *p, int idx) { | ||
1083 | lua_getfenv(L, idx); /* push 'ktable' (may be used by 'finalfix') */ | ||
1084 | finalfix(L, 0, NULL, p->tree); | ||
1085 | lua_pop(L, 1); /* remove 'ktable' */ | ||
1086 | return compile(L, p); | ||
1087 | } | ||
1088 | |||
1089 | |||
1090 | static int lp_printtree (lua_State *L) { | ||
1091 | TTree *tree = getpatt(L, 1, NULL); | ||
1092 | int c = lua_toboolean(L, 2); | ||
1093 | if (c) { | ||
1094 | lua_getfenv(L, 1); /* push 'ktable' (may be used by 'finalfix') */ | ||
1095 | finalfix(L, 0, NULL, tree); | ||
1096 | lua_pop(L, 1); /* remove 'ktable' */ | ||
1097 | } | ||
1098 | printktable(L, 1); | ||
1099 | printtree(tree, 0); | ||
1100 | return 0; | ||
1101 | } | ||
1102 | |||
1103 | |||
1104 | static int lp_printcode (lua_State *L) { | ||
1105 | Pattern *p = getpattern(L, 1); | ||
1106 | printktable(L, 1); | ||
1107 | if (p->code == NULL) /* not compiled yet? */ | ||
1108 | prepcompile(L, p, 1); | ||
1109 | printpatt(p->code, p->codesize); | ||
1110 | return 0; | ||
1111 | } | ||
1112 | |||
1113 | |||
1114 | /* | ||
1115 | ** Get the initial position for the match, interpreting negative | ||
1116 | ** values from the end of the subject | ||
1117 | */ | ||
1118 | static size_t initposition (lua_State *L, size_t len) { | ||
1119 | lua_Integer ii = luaL_optinteger(L, 3, 1); | ||
1120 | if (ii > 0) { /* positive index? */ | ||
1121 | if ((size_t)ii <= len) /* inside the string? */ | ||
1122 | return (size_t)ii - 1; /* return it (corrected to 0-base) */ | ||
1123 | else return len; /* crop at the end */ | ||
1124 | } | ||
1125 | else { /* negative index */ | ||
1126 | if ((size_t)(-ii) <= len) /* inside the string? */ | ||
1127 | return len - ((size_t)(-ii)); /* return position from the end */ | ||
1128 | else return 0; /* crop at the beginning */ | ||
1129 | } | ||
1130 | } | ||
1131 | |||
1132 | |||
1133 | /* | ||
1134 | ** Main match function | ||
1135 | */ | ||
1136 | static int lp_match (lua_State *L) { | ||
1137 | Capture capture[INITCAPSIZE]; | ||
1138 | const char *r; | ||
1139 | size_t l; | ||
1140 | Pattern *p = (getpatt(L, 1, NULL), getpattern(L, 1)); | ||
1141 | Instruction *code = (p->code != NULL) ? p->code : prepcompile(L, p, 1); | ||
1142 | const char *s = luaL_checklstring(L, SUBJIDX, &l); | ||
1143 | size_t i = initposition(L, l); | ||
1144 | int ptop = lua_gettop(L); | ||
1145 | lua_pushnil(L); /* initialize subscache */ | ||
1146 | lua_pushlightuserdata(L, capture); /* initialize caplistidx */ | ||
1147 | lua_getfenv(L, 1); /* initialize penvidx */ | ||
1148 | r = match(L, s, s + i, s + l, code, capture, ptop); | ||
1149 | if (r == NULL) { | ||
1150 | lua_pushnil(L); | ||
1151 | return 1; | ||
1152 | } | ||
1153 | return getcaptures(L, s, r, ptop); | ||
1154 | } | ||
1155 | |||
1156 | |||
1157 | |||
1158 | /* | ||
1159 | ** {====================================================== | ||
1160 | ** Library creation and functions not related to matching | ||
1161 | ** ======================================================= | ||
1162 | */ | ||
1163 | |||
1164 | static int lp_setmax (lua_State *L) { | ||
1165 | luaL_optinteger(L, 1, -1); | ||
1166 | lua_settop(L, 1); | ||
1167 | lua_setfield(L, LUA_REGISTRYINDEX, MAXSTACKIDX); | ||
1168 | return 0; | ||
1169 | } | ||
1170 | |||
1171 | |||
1172 | static int lp_version (lua_State *L) { | ||
1173 | lua_pushstring(L, VERSION); | ||
1174 | return 1; | ||
1175 | } | ||
1176 | |||
1177 | |||
1178 | static int lp_type (lua_State *L) { | ||
1179 | if (testpattern(L, 1)) | ||
1180 | lua_pushliteral(L, "pattern"); | ||
1181 | else | ||
1182 | lua_pushnil(L); | ||
1183 | return 1; | ||
1184 | } | ||
1185 | |||
1186 | |||
1187 | int lp_gc (lua_State *L) { | ||
1188 | Pattern *p = getpattern(L, 1); | ||
1189 | if (p->codesize > 0) | ||
1190 | reallocprog(L, p, 0); | ||
1191 | return 0; | ||
1192 | } | ||
1193 | |||
1194 | |||
1195 | static void createcat (lua_State *L, const char *catname, int (catf) (int)) { | ||
1196 | TTree *t = newcharset(L); | ||
1197 | int i; | ||
1198 | for (i = 0; i <= UCHAR_MAX; i++) | ||
1199 | if (catf(i)) setchar(treebuffer(t), i); | ||
1200 | lua_setfield(L, -2, catname); | ||
1201 | } | ||
1202 | |||
1203 | |||
1204 | static int lp_locale (lua_State *L) { | ||
1205 | if (lua_isnoneornil(L, 1)) { | ||
1206 | lua_settop(L, 0); | ||
1207 | lua_createtable(L, 0, 12); | ||
1208 | } | ||
1209 | else { | ||
1210 | luaL_checktype(L, 1, LUA_TTABLE); | ||
1211 | lua_settop(L, 1); | ||
1212 | } | ||
1213 | createcat(L, "alnum", isalnum); | ||
1214 | createcat(L, "alpha", isalpha); | ||
1215 | createcat(L, "cntrl", iscntrl); | ||
1216 | createcat(L, "digit", isdigit); | ||
1217 | createcat(L, "graph", isgraph); | ||
1218 | createcat(L, "lower", islower); | ||
1219 | createcat(L, "print", isprint); | ||
1220 | createcat(L, "punct", ispunct); | ||
1221 | createcat(L, "space", isspace); | ||
1222 | createcat(L, "upper", isupper); | ||
1223 | createcat(L, "xdigit", isxdigit); | ||
1224 | return 1; | ||
1225 | } | ||
1226 | |||
1227 | |||
1228 | static struct luaL_Reg pattreg[] = { | ||
1229 | {"ptree", lp_printtree}, | ||
1230 | {"pcode", lp_printcode}, | ||
1231 | {"match", lp_match}, | ||
1232 | {"B", lp_behind}, | ||
1233 | {"V", lp_V}, | ||
1234 | {"C", lp_simplecapture}, | ||
1235 | {"Cc", lp_constcapture}, | ||
1236 | {"Cmt", lp_matchtime}, | ||
1237 | {"Cb", lp_backref}, | ||
1238 | {"Carg", lp_argcapture}, | ||
1239 | {"Cp", lp_poscapture}, | ||
1240 | {"Cs", lp_substcapture}, | ||
1241 | {"Ct", lp_tablecapture}, | ||
1242 | {"Cf", lp_foldcapture}, | ||
1243 | {"Cg", lp_groupcapture}, | ||
1244 | {"P", lp_P}, | ||
1245 | {"S", lp_set}, | ||
1246 | {"R", lp_range}, | ||
1247 | {"locale", lp_locale}, | ||
1248 | {"version", lp_version}, | ||
1249 | {"setmaxstack", lp_setmax}, | ||
1250 | {"type", lp_type}, | ||
1251 | {"T", lp_throw}, /* labeled failure throw */ | ||
1252 | {"Lc", lp_labchoice}, /* labeled failure choice */ | ||
1253 | {NULL, NULL} | ||
1254 | }; | ||
1255 | |||
1256 | |||
1257 | static struct luaL_Reg metareg[] = { | ||
1258 | {"__mul", lp_seq}, | ||
1259 | {"__add", lp_choice}, | ||
1260 | {"__pow", lp_star}, | ||
1261 | {"__gc", lp_gc}, | ||
1262 | {"__len", lp_and}, | ||
1263 | {"__div", lp_divcapture}, | ||
1264 | {"__unm", lp_not}, | ||
1265 | {"__sub", lp_sub}, | ||
1266 | {NULL, NULL} | ||
1267 | }; | ||
1268 | |||
1269 | |||
1270 | int luaopen_lpeglabel (lua_State *L); | ||
1271 | int luaopen_lpeglabel (lua_State *L) { | ||
1272 | luaL_newmetatable(L, PATTERN_T); | ||
1273 | lua_pushnumber(L, MAXBACK); /* initialize maximum backtracking */ | ||
1274 | lua_setfield(L, LUA_REGISTRYINDEX, MAXSTACKIDX); | ||
1275 | luaL_register(L, NULL, metareg); | ||
1276 | luaL_register(L, "lpeglabel", pattreg); | ||
1277 | lua_pushvalue(L, -1); | ||
1278 | lua_setfield(L, -3, "__index"); | ||
1279 | return 1; | ||
1280 | } | ||
1281 | |||
1282 | /* }====================================================== */ | ||
diff --git a/lptree.h b/lptree.h new file mode 100644 index 0000000..43299a4 --- /dev/null +++ b/lptree.h | |||
@@ -0,0 +1,79 @@ | |||
1 | /* | ||
2 | ** $Id: lptree.h,v 1.2 2013/03/24 13:51:12 roberto Exp $ | ||
3 | */ | ||
4 | |||
5 | #if !defined(lptree_h) | ||
6 | #define lptree_h | ||
7 | |||
8 | |||
9 | #include "lptypes.h" | ||
10 | |||
11 | |||
/*
** Kinds of tree nodes. The numeric values follow declaration order,
** starting at 0.
*/
typedef enum TTag {
  /* standard PEG elements */
  TChar = 0,
  TSet,
  TAny,
  TTrue,
  TFalse,
  TRep,
  TSeq,
  TChoice,
  TNot,
  TAnd,
  TCall,
  TOpenCall,
  TRule,      /* sib1 is rule's pattern, sib2 is 'next' rule */
  TGrammar,   /* sib1 is initial (and first) rule */
  TBehind,    /* match behind */
  TCapture,   /* regular capture */
  TRunTime,   /* run-time capture */
  /* labeled failure */
  TThrow,
  TLabChoice
} TTag;
30 | |||
31 | /* number of siblings for each tree */ | ||
32 | extern const byte numsiblings[]; | ||
33 | |||
34 | |||
35 | /* | ||
36 | ** Tree trees | ||
37 | ** The first sibling of a tree (if there is one) is immediately after | ||
38 | ** the tree. A reference to a second sibling (ps) is its position | ||
39 | ** relative to the position of the tree itself. A key in ktable | ||
40 | ** uses the (unique) address of the original tree that created that | ||
41 | ** entry. NULL means no data. | ||
42 | */ | ||
43 | typedef struct TTree { | ||
44 | byte tag; | ||
45 | byte cap; /* kind of capture (if it is a capture) */ | ||
46 | unsigned short key; /* key in ktable for Lua data (0 if no key) */ | ||
47 | Labelset labels; /* labeled failure */ | ||
48 | union { | ||
49 | int ps; /* occasional second sibling */ | ||
50 | int n; /* occasional counter */ | ||
51 | } u; | ||
52 | } TTree; | ||
53 | |||
54 | |||
55 | /* | ||
56 | ** A complete pattern has its tree plus, if already compiled, | ||
57 | ** its corresponding code | ||
58 | */ | ||
59 | typedef struct Pattern { | ||
60 | union Instruction *code; | ||
61 | int codesize; | ||
62 | TTree tree[1]; | ||
63 | } Pattern; | ||
64 | |||
65 | |||
/*
** Access to siblings.
** sib1: the first sibling is the node right after 't'.
** sib2: the second sibling lies 't->u.ps' nodes after 't'.
** ('numsiblings' is already declared earlier in this header, so it is
** not declared again here.)
*/
#define sib1(t)         ((t) + 1)
#define sib2(t)         ((t) + (t)->u.ps)
72 | |||
73 | |||
74 | |||
75 | |||
76 | |||
77 | |||
78 | #endif | ||
79 | |||
diff --git a/lptypes.h b/lptypes.h new file mode 100644 index 0000000..503f1f0 --- /dev/null +++ b/lptypes.h | |||
@@ -0,0 +1,158 @@ | |||
1 | /* | ||
2 | ** $Id: lptypes.h,v 1.8 2013/04/12 16:26:38 roberto Exp $ | ||
3 | ** LPeg - PEG pattern matching for Lua | ||
4 | ** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license) | ||
5 | ** written by Roberto Ierusalimschy | ||
6 | */ | ||
7 | |||
8 | #if !defined(lptypes_h) | ||
9 | #define lptypes_h | ||
10 | |||
11 | |||
12 | #if !defined(LPEG_DEBUG) | ||
13 | #define NDEBUG | ||
14 | #endif | ||
15 | |||
16 | #include <assert.h> | ||
17 | #include <limits.h> | ||
18 | |||
19 | #include "lua.h" | ||
20 | |||
21 | |||
/* library version string */
#define VERSION		"0.12"


/* name of the pattern metatable */
#define PATTERN_T	"lpeg-pattern"

/* registry key holding the maximum backtrack-stack size */
#define MAXSTACKIDX	"lpeg-maxstack"
27 | |||
28 | |||
/*
** compatibility with Lua 5.2: map the Lua 5.1 API names used by this
** library onto their 5.2 counterparts
*/
#if (LUA_VERSION_NUM == 502)

#undef lua_equal
#define lua_equal(L,idx1,idx2)  lua_compare(L,(idx1),(idx2),LUA_OPEQ)

#undef lua_getfenv
#define lua_getfenv	lua_getuservalue
#undef lua_setfenv
#define lua_setfenv	lua_setuservalue

#undef lua_objlen
#define lua_objlen	lua_rawlen

/*
** With a NULL name, register 'f' into the table at the top of the
** stack; otherwise create a new library table. Wrapped in
** 'do ... while (0)' so that the macro behaves as a single statement
** (safe inside an unbraced 'if'/'else').
*/
#undef luaL_register
#define luaL_register(L,n,f) \
	do { if ((n) == NULL) luaL_setfuncs(L,f,0); else luaL_newlib(L,f); } while (0)

#endif
50 | |||
51 | |||
/* default maximum size for call/backtrack stack (may be predefined) */
#if !defined(MAXBACK)
#define MAXBACK		100
#endif


/* maximum number of rules in a grammar */
#define MAXRULES	200



/* initial size for capture's list */
#define INITCAPSIZE	32


/* index, on Lua stack, for subject */
#define SUBJIDX		2

/* number of fixed arguments to 'match' (before capture arguments) */
#define FIXEDARGS	3

/*
** Stack slots used during a match, relative to 'top' (the stack top
** when the match starts):
*/

/* index, on Lua stack, for capture list */
#define caplistidx(top)	((top) + 2)

/* index, on Lua stack, for pattern's ktable */
#define ktableidx(top)	((top) + 3)

/* index, on Lua stack, for backtracking stack */
#define stackidx(top)	((top) + 4)
81 | |||
82 | |||
83 | |||
/* raw byte, as used for subject characters and charset storage */
typedef unsigned char byte;


/* bits stored in each element of a charset */
#define BITSPERCHAR		8

/* number of bytes needed to hold one bit per possible byte value */
#define CHARSETSIZE		((UCHAR_MAX/BITSPERCHAR) + 1)



/* a set of byte values, one bit per value */
typedef struct Charset {
  byte cs[CHARSETSIZE];
} Charset;



/* run 'body' once for each index 'var' (0 .. CHARSETSIZE - 1) of a charset */
#define loopset(var,body) \
	{ int var; for (var = 0; var < CHARSETSIZE; var++) {body;} }

/* access to charset: its bytes start right after the tree node 't' */
#define treebuffer(t)      ((byte *)((t) + 1))

/* number of slots (tree nodes) needed for 'n' bytes */
#define bytes2slots(n)  (((n) - 1) / sizeof(TTree) + 1)

/* set bit 'c' in charset 'set' */
#define setchar(set,c)   ((set)[(c) >> 3] |= (1 << ((c) & 7)))
109 | |||
/* labeled failure begin */
/* a set of labels, one bit per label */
typedef int Labelset;

/* number of distinct labels that fit in a Labelset */
#define MAXLABELS (sizeof(int) * 8)

#define LFAIL 1

/*
** set bit 'b' in set 's'. The shift is done on an unsigned value:
** label MAXLABELS - 1 selects the sign bit, and shifting a signed 1
** into it would be undefined behavior.
*/
#define setlabel(s, b) ((s) |= (Labelset)(1u << (b)))
/* labeled failure end */
120 | |||
121 | |||
/*
** in capture instructions, 'kind' of capture and its offset are
** packed in field 'aux', 4 bits for each: the kind in the low nibble,
** the offset in the high nibble
*/
#define getkind(inst)		((inst)->i.aux & 0xF)
#define getoff(inst)		(((inst)->i.aux >> 4) & 0xF)
#define joinkindoff(kind,off)	((kind) | ((off) << 4))

/* largest values that fit in an offset (4 bits) and in 'aux' (8 bits) */
#define MAXOFF		0xF
#define MAXAUX 		0xFF


/* maximum number of bytes to look behind */
#define MAXBEHIND	MAXAUX


/* maximum size (in elements) for a pattern */
#define MAXPATTSIZE	(SHRT_MAX - 10)


/* size (in elements) for an instruction plus extra 'nbytes' bytes */
#define instsize(nbytes) \
	(((nbytes) + sizeof(Instruction) - 1)/sizeof(Instruction) + 1)


/* size (in elements) for a ISet instruction */
#define CHARSETINSTSIZE		instsize(CHARSETSIZE)

/* size (in elements) for a IFunc instruction */
#define funcinstsize(inst)	((inst)->i.aux + 2)



/* non-zero iff bit 'ch' is set in the charset bytes 'set' */
#define testchar(set,ch)	(((int)(set)[((ch) >> 3)] & (1 << ((ch) & 7))))
155 | |||
156 | |||
157 | #endif | ||
158 | |||
@@ -0,0 +1,391 @@ | |||
1 | /* | ||
2 | ** $Id: lpvm.c,v 1.5 2013/04/12 16:29:49 roberto Exp $ | ||
3 | ** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license) | ||
4 | */ | ||
5 | |||
6 | #include <limits.h> | ||
7 | #include <string.h> | ||
8 | |||
9 | |||
10 | #include "lua.h" | ||
11 | #include "lauxlib.h" | ||
12 | |||
13 | #include "lpcap.h" | ||
14 | #include "lptypes.h" | ||
15 | #include "lpvm.h" | ||
16 | #include "lpprint.h" | ||
17 | |||
18 | |||
19 | /* initial size for call/backtrack stack */ | ||
20 | #if !defined(INITBACK) | ||
21 | #define INITBACK 100 | ||
22 | #endif | ||
23 | |||
24 | |||
25 | #define getoffset(p) (((p) + 1)->offset) | ||
26 | |||
27 | static const Instruction giveup = {{IGiveup, 0, 0}}; | ||
28 | |||
29 | |||
30 | /* | ||
31 | ** {====================================================== | ||
32 | ** Virtual Machine | ||
33 | ** ======================================================= | ||
34 | */ | ||
35 | |||
36 | |||
/*
** One entry of the call/backtrack stack used by 'match'. Entries are
** pushed by IChoice/ILabChoice (choice points) and by ICall (return
** addresses).
*/
typedef struct Stack {
  const char *s; /* saved position (or NULL for calls) */
  const Instruction *p; /* next instruction */
  int caplevel; /* value of 'captop' saved for a choice entry */
  Labelset ls; /* labeled failure: labels this choice entry catches */
} Stack;
43 | |||
44 | |||
45 | #define getstackbase(L, ptop) ((Stack *)lua_touserdata(L, stackidx(ptop))) | ||
46 | |||
47 | |||
48 | /* | ||
49 | ** Double the size of the array of captures | ||
50 | */ | ||
51 | static Capture *doublecap (lua_State *L, Capture *cap, int captop, int ptop) { | ||
52 | Capture *newc; | ||
53 | if (captop >= INT_MAX/((int)sizeof(Capture) * 2)) | ||
54 | luaL_error(L, "too many captures"); | ||
55 | newc = (Capture *)lua_newuserdata(L, captop * 2 * sizeof(Capture)); | ||
56 | memcpy(newc, cap, captop * sizeof(Capture)); | ||
57 | lua_replace(L, caplistidx(ptop)); | ||
58 | return newc; | ||
59 | } | ||
60 | |||
61 | |||
62 | /* | ||
63 | ** Double the size of the stack | ||
64 | */ | ||
65 | static Stack *doublestack (lua_State *L, Stack **stacklimit, int ptop) { | ||
66 | Stack *stack = getstackbase(L, ptop); | ||
67 | Stack *newstack; | ||
68 | int n = *stacklimit - stack; /* current stack size */ | ||
69 | int max, newn; | ||
70 | lua_getfield(L, LUA_REGISTRYINDEX, MAXSTACKIDX); | ||
71 | max = lua_tointeger(L, -1); /* maximum allowed size */ | ||
72 | lua_pop(L, 1); | ||
73 | if (n >= max) /* already at maximum size? */ | ||
74 | luaL_error(L, "too many pending calls/choices"); | ||
75 | newn = 2 * n; /* new size */ | ||
76 | if (newn > max) newn = max; | ||
77 | newstack = (Stack *)lua_newuserdata(L, newn * sizeof(Stack)); | ||
78 | memcpy(newstack, stack, n * sizeof(Stack)); | ||
79 | lua_replace(L, stackidx(ptop)); | ||
80 | *stacklimit = newstack + newn; | ||
81 | return newstack + n; /* return next position */ | ||
82 | } | ||
83 | |||
84 | |||
85 | /* | ||
86 | ** Interpret the result of a dynamic capture: false -> fail; | ||
87 | ** true -> keep current position; number -> next position. | ||
88 | ** Return new subject position. 'fr' is stack index where | ||
89 | ** is the result; 'curr' is current subject position; 'limit' | ||
90 | ** is subject's size. | ||
91 | */ | ||
92 | static int resdyncaptures (lua_State *L, int fr, int curr, int limit) { | ||
93 | lua_Integer res; | ||
94 | if (!lua_toboolean(L, fr)) { /* false value? */ | ||
95 | lua_settop(L, fr - 1); /* remove results */ | ||
96 | return -1; /* and fail */ | ||
97 | } | ||
98 | else if (lua_isboolean(L, fr)) /* true? */ | ||
99 | res = curr; /* keep current position */ | ||
100 | else { | ||
101 | res = lua_tointeger(L, fr) - 1; /* new position */ | ||
102 | if (res < curr || res > limit) | ||
103 | luaL_error(L, "invalid position returned by match-time capture"); | ||
104 | } | ||
105 | lua_remove(L, fr); /* remove first result (offset) */ | ||
106 | return res; | ||
107 | } | ||
108 | |||
109 | |||
110 | /* | ||
111 | ** Add capture values returned by a dynamic capture to the capture list | ||
112 | ** 'base', nested inside a group capture. 'fd' indexes the first capture | ||
113 | ** value, 'n' is the number of values (at least 1). | ||
114 | */ | ||
115 | static void adddyncaptures (const char *s, Capture *base, int n, int fd) { | ||
116 | int i; | ||
117 | /* Cgroup capture is already there */ | ||
118 | assert(base[0].kind == Cgroup && base[0].siz == 0); | ||
119 | base[0].idx = 0; /* make it an anonymous group */ | ||
120 | for (i = 1; i <= n; i++) { /* add runtime captures */ | ||
121 | base[i].kind = Cruntime; | ||
122 | base[i].siz = 1; /* mark it as closed */ | ||
123 | base[i].idx = fd + i - 1; /* stack index of capture value */ | ||
124 | base[i].s = s; | ||
125 | } | ||
126 | base[i].kind = Cclose; /* close group */ | ||
127 | base[i].siz = 1; | ||
128 | base[i].s = s; | ||
129 | } | ||
130 | |||
131 | |||
132 | /* | ||
133 | ** Remove dynamic captures from the Lua stack (called in case of failure) | ||
134 | */ | ||
135 | static int removedyncap (lua_State *L, Capture *capture, | ||
136 | int level, int last) { | ||
137 | int id = finddyncap(capture + level, capture + last); /* index of 1st cap. */ | ||
138 | int top = lua_gettop(L); | ||
139 | if (id == 0) return 0; /* no dynamic captures? */ | ||
140 | lua_settop(L, id - 1); /* remove captures */ | ||
141 | return top - id + 1; /* number of values removed */ | ||
142 | } | ||
143 | |||
144 | |||
145 | /* | ||
146 | ** Opcode interpreter | ||
147 | */ | ||
148 | const char *match (lua_State *L, const char *o, const char *s, const char *e, | ||
149 | Instruction *op, Capture *capture, int ptop) { | ||
150 | Stack stackbase[INITBACK]; | ||
151 | Stack *stacklimit = stackbase + INITBACK; | ||
152 | Stack *stack = stackbase; /* point to first empty slot in stack */ | ||
153 | int capsize = INITCAPSIZE; | ||
154 | int captop = 0; /* point to first empty slot in captures */ | ||
155 | int ndyncap = 0; /* number of dynamic captures (in Lua stack) */ | ||
156 | const Instruction *p = op; /* current instruction */ | ||
157 | Labelset labelf; /* labeled failure */ | ||
158 | stack->p = &giveup; stack->s = s; stack->caplevel = 0; stack++; | ||
159 | lua_pushlightuserdata(L, stackbase); | ||
160 | for (;;) { | ||
161 | #if defined(LPEGDEBUG) | ||
162 | printf("s: |%s| stck:%d, dyncaps:%d, caps:%d ", | ||
163 | s, stack - getstackbase(L, ptop), ndyncap, captop); | ||
164 | printinst(op, p); | ||
165 | printcaplist(capture, capture + captop); | ||
166 | #endif | ||
167 | assert(stackidx(ptop) + ndyncap == lua_gettop(L) && ndyncap <= captop); | ||
168 | switch ((Opcode)p->i.code) { | ||
169 | case IEnd: { | ||
170 | assert(stack == getstackbase(L, ptop) + 1); | ||
171 | capture[captop].kind = Cclose; | ||
172 | capture[captop].s = NULL; | ||
173 | return s; | ||
174 | } | ||
175 | case IGiveup: { | ||
176 | assert(stack == getstackbase(L, ptop)); | ||
177 | return NULL; | ||
178 | } | ||
179 | case IRet: { | ||
180 | assert(stack > getstackbase(L, ptop) && (stack - 1)->s == NULL); | ||
181 | p = (--stack)->p; | ||
182 | continue; | ||
183 | } | ||
184 | case IAny: { | ||
185 | if (s < e) { p++; s++; } | ||
186 | else { | ||
187 | labelf = LFAIL; /* labeled failure */ | ||
188 | goto fail; | ||
189 | } | ||
190 | continue; | ||
191 | } | ||
192 | case ITestAny: { | ||
193 | if (s < e) p += 2; | ||
194 | else p += getoffset(p); | ||
195 | continue; | ||
196 | } | ||
197 | case IChar: { | ||
198 | if ((byte)*s == p->i.aux && s < e) { p++; s++; } | ||
199 | else { | ||
200 | labelf = LFAIL; /* labeled failure */ | ||
201 | goto fail; | ||
202 | } | ||
203 | continue; | ||
204 | } | ||
205 | case ITestChar: { | ||
206 | if ((byte)*s == p->i.aux && s < e) p += 2; | ||
207 | else p += getoffset(p); | ||
208 | continue; | ||
209 | } | ||
210 | case ISet: { | ||
211 | int c = (byte)*s; | ||
212 | if (testchar((p+1)->buff, c) && s < e) | ||
213 | { p += CHARSETINSTSIZE; s++; } | ||
214 | else { | ||
215 | labelf = LFAIL; /* labeled failure */ | ||
216 | goto fail; | ||
217 | } | ||
218 | continue; | ||
219 | } | ||
220 | case ITestSet: { | ||
221 | int c = (byte)*s; | ||
222 | if (testchar((p + 2)->buff, c) && s < e) | ||
223 | p += 1 + CHARSETINSTSIZE; | ||
224 | else p += getoffset(p); | ||
225 | continue; | ||
226 | } | ||
227 | case IBehind: { | ||
228 | int n = p->i.aux; | ||
229 | if (n > s - o) { | ||
230 | labelf = LFAIL; /* labeled failure */ | ||
231 | goto fail; | ||
232 | } | ||
233 | s -= n; p++; | ||
234 | continue; | ||
235 | } | ||
236 | case ISpan: { | ||
237 | for (; s < e; s++) { | ||
238 | int c = (byte)*s; | ||
239 | if (!testchar((p+1)->buff, c)) break; | ||
240 | } | ||
241 | p += CHARSETINSTSIZE; | ||
242 | continue; | ||
243 | } | ||
244 | case IJmp: { | ||
245 | p += getoffset(p); | ||
246 | continue; | ||
247 | } | ||
248 | case IChoice: { | ||
249 | if (stack == stacklimit) | ||
250 | stack = doublestack(L, &stacklimit, ptop); | ||
251 | stack->p = p + getoffset(p); | ||
252 | stack->s = s; | ||
253 | stack->ls = LFAIL; /* labeled failure */ | ||
254 | stack->caplevel = captop; | ||
255 | stack++; | ||
256 | p += 2; | ||
257 | continue; | ||
258 | } | ||
259 | case ILabChoice: { /* labeled failure */ | ||
260 | if (stack == stacklimit) | ||
261 | stack = doublestack(L, &stacklimit, ptop); | ||
262 | stack->p = p + getoffset(p); | ||
263 | stack->s = s; | ||
264 | stack->ls = (p + 2)->labels; | ||
265 | stack->caplevel = captop; | ||
266 | stack++; | ||
267 | p += 3; | ||
268 | continue; | ||
269 | } | ||
270 | |||
271 | case ICall: { | ||
272 | if (stack == stacklimit) | ||
273 | stack = doublestack(L, &stacklimit, ptop); | ||
274 | stack->s = NULL; | ||
275 | stack->p = p + 2; /* save return address */ | ||
276 | stack++; | ||
277 | p += getoffset(p); | ||
278 | continue; | ||
279 | } | ||
280 | case ICommit: { | ||
281 | assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL); | ||
282 | stack--; | ||
283 | p += getoffset(p); | ||
284 | continue; | ||
285 | } | ||
286 | case IPartialCommit: { | ||
287 | assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL); | ||
288 | (stack - 1)->s = s; | ||
289 | (stack - 1)->caplevel = captop; | ||
290 | p += getoffset(p); | ||
291 | continue; | ||
292 | } | ||
293 | case IBackCommit: { | ||
294 | assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL); | ||
295 | s = (--stack)->s; | ||
296 | captop = stack->caplevel; | ||
297 | p += getoffset(p); | ||
298 | continue; | ||
299 | } | ||
300 | case IThrow: { /* labeled failure */ | ||
301 | labelf = (p+1)->labels; | ||
302 | goto fail; | ||
303 | } | ||
304 | case IFailTwice: | ||
305 | assert(stack > getstackbase(L, ptop)); | ||
306 | stack--; | ||
307 | /* go through */ | ||
308 | case IFail: | ||
309 | labelf = LFAIL; /* labeled failure */ | ||
310 | fail: { /* pattern failed: try to backtrack */ | ||
311 | do { /* remove pending calls */ | ||
312 | assert(stack > getstackbase(L, ptop)); | ||
313 | s = (--stack)->s; | ||
314 | /*printf("fail (s == NULL => %d), labelf=%d stack->ls=%d (stack-> == giveup %d)\n", | ||
315 | s == NULL, labelf, stack->ls, stack->p == &giveup);*/ | ||
316 | } while (s == NULL || (!(stack->ls & labelf) && stack->p != &giveup)); | ||
317 | if (ndyncap > 0) /* is there matchtime captures? */ | ||
318 | ndyncap -= removedyncap(L, capture, stack->caplevel, captop); | ||
319 | captop = stack->caplevel; | ||
320 | p = stack->p; | ||
321 | continue; | ||
322 | } | ||
323 | case ICloseRunTime: { | ||
324 | CapState cs; | ||
325 | int rem, res, n; | ||
326 | int fr = lua_gettop(L) + 1; /* stack index of first result */ | ||
327 | cs.s = o; cs.L = L; cs.ocap = capture; cs.ptop = ptop; | ||
328 | n = runtimecap(&cs, capture + captop, s, &rem); /* call function */ | ||
329 | captop -= n; /* remove nested captures */ | ||
330 | fr -= rem; /* 'rem' items were popped from Lua stack */ | ||
331 | res = resdyncaptures(L, fr, s - o, e - o); /* get result */ | ||
332 | if (res == -1) { /* fail? */ | ||
333 | labelf = LFAIL; /* labeled failure */ | ||
334 | goto fail; | ||
335 | } | ||
336 | s = o + res; /* else update current position */ | ||
337 | n = lua_gettop(L) - fr + 1; /* number of new captures */ | ||
338 | ndyncap += n - rem; /* update number of dynamic captures */ | ||
339 | if (n > 0) { /* any new capture? */ | ||
340 | if ((captop += n + 2) >= capsize) { | ||
341 | capture = doublecap(L, capture, captop, ptop); | ||
342 | capsize = 2 * captop; | ||
343 | } | ||
344 | /* add new captures to 'capture' list */ | ||
345 | adddyncaptures(s, capture + captop - n - 2, n, fr); | ||
346 | } | ||
347 | p++; | ||
348 | continue; | ||
349 | } | ||
350 | case ICloseCapture: { | ||
351 | const char *s1 = s; | ||
352 | assert(captop > 0); | ||
353 | /* if possible, turn capture into a full capture */ | ||
354 | if (capture[captop - 1].siz == 0 && | ||
355 | s1 - capture[captop - 1].s < UCHAR_MAX) { | ||
356 | capture[captop - 1].siz = s1 - capture[captop - 1].s + 1; | ||
357 | p++; | ||
358 | continue; | ||
359 | } | ||
360 | else { | ||
361 | capture[captop].siz = 1; /* mark entry as closed */ | ||
362 | capture[captop].s = s; | ||
363 | goto pushcapture; | ||
364 | } | ||
365 | } | ||
366 | case IOpenCapture: | ||
367 | capture[captop].siz = 0; /* mark entry as open */ | ||
368 | capture[captop].s = s; | ||
369 | goto pushcapture; | ||
370 | case IFullCapture: | ||
371 | capture[captop].siz = getoff(p) + 1; /* save capture size */ | ||
372 | capture[captop].s = s - getoff(p); | ||
373 | /* goto pushcapture; */ | ||
374 | pushcapture: { | ||
375 | capture[captop].idx = p->i.key; | ||
376 | capture[captop].kind = getkind(p); | ||
377 | if (++captop >= capsize) { | ||
378 | capture = doublecap(L, capture, captop, ptop); | ||
379 | capsize = 2 * captop; | ||
380 | } | ||
381 | p++; | ||
382 | continue; | ||
383 | } | ||
384 | default: assert(0); return NULL; | ||
385 | } | ||
386 | } | ||
387 | } | ||
388 | |||
389 | /* }====================================================== */ | ||
390 | |||
391 | |||
@@ -0,0 +1,66 @@ | |||
1 | /* | ||
2 | ** $Id: lpvm.h,v 1.2 2013/04/03 20:37:18 roberto Exp $ | ||
3 | */ | ||
4 | |||
5 | #if !defined(lpvm_h) | ||
6 | #define lpvm_h | ||
7 | |||
8 | #include "lpcap.h" | ||
9 | |||
10 | |||
/* Virtual Machine's instructions */
typedef enum Opcode {
  IAny, /* if no char, fail */
  IChar, /* if char != aux, fail */
  ISet, /* if char not in buff, fail */
  ITestAny, /* if no char, jump to 'offset' */
  ITestChar, /* if char != aux, jump to 'offset' */
  ITestSet, /* if char not in buff, jump to 'offset' */
  ISpan, /* read a span of chars in buff */
  IBehind, /* walk back 'aux' characters (fail if not possible) */
  IRet, /* return from a rule */
  IEnd, /* end of pattern */
  IChoice, /* stack a choice; next fail will jump to 'offset' */
  IJmp, /* jump to 'offset' */
  ICall, /* call rule at 'offset' */
  IOpenCall, /* call rule number 'key' (must be closed to a ICall) */
  ICommit, /* pop choice and jump to 'offset' */
  IPartialCommit, /* update top choice to current position and jump */
  IBackCommit, /* "fails" but jump to its own 'offset' */
  IFailTwice, /* pop one choice and then fail */
  IFail, /* go back to saved state on choice and jump to saved offset */
  IGiveup, /* internal use */
  IFullCapture, /* complete capture of last 'off' chars */
  IOpenCapture, /* start a capture */
  ICloseCapture, /* close the most recently opened capture */
  ICloseRunTime, /* close a match-time capture: call its function */
  IThrow, /* "fails" with a specific label (labeled failure) */
  ILabChoice /* labeled choice: a choice that catches given labels */
} Opcode;
40 | |||
41 | |||
42 | |||
/*
** One element of compiled code. The first element of an instruction is
** read through 'i'; the elements that follow it are read through
** 'offset', 'labels' or 'buff', depending on the opcode.
*/
typedef union Instruction {
  struct Inst {
    byte code; /* opcode (an 'Opcode' value) */
    byte aux; /* opcode-dependent: char, look-behind count, capture kind/off */
    short key; /* opcode-dependent index (e.g. copied to a capture's 'idx') */
  } i;
  int offset; /* jump offset, relative to the instruction that owns it */
  Labelset labels; /* labeled failure */
  byte buff[1]; /* first byte of a charset that spans the following elements */
} Instruction;
53 | |||
54 | |||
55 | int getposition (lua_State *L, int t, int i); | ||
56 | void printpatt (Instruction *p, int n); | ||
57 | const char *match (lua_State *L, const char *o, const char *s, const char *e, | ||
58 | Instruction *op, Capture *capture, int ptop); | ||
59 | int verify (lua_State *L, Instruction *op, const Instruction *p, | ||
60 | Instruction *e, int postable, int rule); | ||
61 | void checkrule (lua_State *L, Instruction *op, int from, int to, | ||
62 | int postable, int rule); | ||
63 | |||
64 | |||
65 | #endif | ||
66 | |||
diff --git a/makefile b/makefile new file mode 100644 index 0000000..4f41062 --- /dev/null +++ b/makefile | |||
@@ -0,0 +1,55 @@ | |||
1 | LIBNAME = lpeglabel | ||
2 | LUADIR = /usr/include/lua5.1/ | ||
3 | |||
4 | #COPT = -O2 | ||
5 | COPT = -DLPEG_DEBUG -g | ||
6 | |||
7 | CWARNS = -Wall -Wextra -pedantic \ | ||
8 | -Waggregate-return \ | ||
9 | -Wcast-align \ | ||
10 | -Wcast-qual \ | ||
11 | -Wdisabled-optimization \ | ||
12 | -Wpointer-arith \ | ||
13 | -Wshadow \ | ||
14 | -Wsign-compare \ | ||
15 | -Wundef \ | ||
16 | -Wwrite-strings \ | ||
17 | -Wbad-function-cast \ | ||
18 | -Wdeclaration-after-statement \ | ||
19 | -Wmissing-prototypes \ | ||
20 | -Wnested-externs \ | ||
21 | -Wstrict-prototypes \ | ||
22 | # -Wunreachable-code \ | ||
23 | |||
24 | |||
25 | CFLAGS = $(CWARNS) $(COPT) -ansi -I$(LUADIR) -fPIC | ||
26 | CC = gcc | ||
27 | |||
28 | FILES = lpvm.o lpcap.o lptree.o lpcode.o lpprint.o | ||
29 | |||
# 'linux' and 'macosx' are commands, not files to be built
.PHONY: linux macosx

# For Linux
# $(MAKE) (rather than a literal 'make') keeps the same make program and
# passes flags such as -j and -n down to the sub-make
linux:
	$(MAKE) lpeglabel.so "DLLFLAGS = -shared -fPIC"

# For Mac OS
macosx:
	$(MAKE) lpeglabel.so "DLLFLAGS = -bundle -undefined dynamic_lookup"
37 | |||
38 | lpeglabel.so: $(FILES) | ||
39 | env $(CC) $(DLLFLAGS) $(FILES) -o lpeglabel.so | ||
40 | |||
41 | $(FILES): makefile | ||
42 | |||
43 | test: test.lua re.lua lpeglabel.so | ||
44 | ./test.lua | ||
45 | |||
46 | clean: | ||
47 | rm -f $(FILES) lpeglabel.so | ||
48 | |||
49 | |||
50 | lpcap.o: lpcap.c lpcap.h lptypes.h | ||
51 | lpcode.o: lpcode.c lptypes.h lpcode.h lptree.h lpvm.h lpcap.h | ||
52 | lpprint.o: lpprint.c lptypes.h lpprint.h lptree.h lpvm.h lpcap.h | ||
53 | lptree.o: lptree.c lptypes.h lpcap.h lpcode.h lptree.h lpvm.h lpprint.h | ||
54 | lpvm.o: lpvm.c lpcap.h lptypes.h lpvm.h lpprint.h lptree.h | ||
55 | |||
@@ -0,0 +1,276 @@ | |||
1 | -- $Id: re.lua,v 1.44 2013/03/26 20:11:40 roberto Exp $ | ||
2 | |||
3 | -- imported functions and modules | ||
4 | local tonumber, type, print, error = tonumber, type, print, error | ||
5 | local setmetatable = setmetatable | ||
-- Lua 5.2+ has table.unpack; Lua 5.1 only has the global 'unpack'
local unpack = table.unpack or unpack
7 | local m = require"lpeglabel" | ||
8 | |||
9 | -- 'm' will be used to parse expressions, and 'mm' will be used to | ||
10 | -- create expressions; that is, 're' runs on 'm', creating patterns | ||
11 | -- on 'mm' | ||
12 | local mm = m | ||
13 | |||
14 | -- pattern's metatable | ||
15 | local mt = getmetatable(mm.P(0)) | ||
16 | |||
17 | |||
18 | |||
19 | -- No more global accesses after this point | ||
20 | local version = _VERSION | ||
21 | if version == "Lua 5.2" then _ENV = nil end | ||
22 | |||
23 | |||
24 | local any = m.P(1) | ||
25 | |||
26 | |||
27 | -- Pre-defined names | ||
28 | local Predef = { nl = m.P"\n" } | ||
29 | |||
30 | |||
31 | local mem | ||
32 | local fmem | ||
33 | local gmem | ||
34 | |||
35 | |||
-- Refresh the predefined character classes from the current locale and
-- reset the three memoization tables.
-- For every class filled in by mm.locale a one-letter alias is added
-- (e.g. Predef.a for Predef.alpha) plus its upper-case complement
-- (e.g. Predef.A matches any character that is not in Predef.a).
local function updatelocale ()
  mm.locale(Predef)
  -- { alias, complement alias, class name }
  local aliases = {
    {"a", "A", "alpha"}, {"c", "C", "cntrl"}, {"d", "D", "digit"},
    {"g", "G", "graph"}, {"l", "L", "lower"}, {"p", "P", "punct"},
    {"s", "S", "space"}, {"u", "U", "upper"}, {"w", "W", "alnum"},
    {"x", "X", "xdigit"},
  }
  for k = 1, #aliases do
    Predef[aliases[k][1]] = Predef[aliases[k][3]]
  end
  for k = 1, #aliases do
    Predef[aliases[k][2]] = any - Predef[aliases[k][1]]
  end
  -- restart memoization; values are weak so unused patterns can be collected
  local weakvalues = {__mode = "v"}
  mem = setmetatable({}, weakvalues)
  fmem = setmetatable({}, weakvalues)
  gmem = setmetatable({}, weakvalues)
end
66 | |||
67 | |||
68 | updatelocale() | ||
69 | |||
70 | |||
71 | |||
72 | local I = m.P(function (s,i) print(i, s:sub(1, i-1)); return i end) | ||
73 | |||
74 | |||
-- Look up name 'id' in the user-supplied table 'defs'.
-- Raises an error when 'defs' is absent or has no true value for 'id'.
local function getdef (id, defs)
  local value
  if defs then value = defs[id] end
  if value then return value end
  error("undefined name: " .. id)
end
80 | |||
81 | |||
-- Raise a "pattern error" that quotes the pattern text starting at
-- position 'i' of 's'; long remainders are cut after 21 characters and
-- followed by "...". The error is reported at level 2.
local function patt_error (s, i)
  local context
  if #s < i + 20 then
    context = s:sub(i)
  else
    context = s:sub(i, i + 20) .. "..."
  end
  local template = "pattern error near '%s'"
  error(template:format(context), 2)
end
88 | |||
-- Build the pattern 'p' repeated 'n' times by repeated squaring:
-- 'power' runs through p, p*p, p*p*p*p, ... and is multiplied into the
-- result whenever the corresponding bit of 'n' is set.
local function mult (p, n)
  local result = mm.P(true)
  local power, count = p, n
  while count >= 1 do
    if count % 2 >= 1 then
      result = result * power
    end
    power = power * power
    count = count / 2
  end
  return result
end
98 | |||
-- Match-time check for back references: succeeds when the subject 's'
-- contains the string 'c' at position 'i', returning the position just
-- after it; returns nil when 'c' is not a string or does not match.
local function equalcap (s, i, c)
  if type(c) == "string" then
    local stop = #c + i
    if s:sub(i, stop - 1) == c then
      return stop
    end
  end
  return nil
end
104 | |||
105 | |||
-- skippable text: spaces and '--' comments running to the end of the line
local S = (Predef.space + "--" * (any - Predef.nl)^0)^0

-- identifier: a letter or '_' followed by letters, digits or '_'
local name = m.R("AZ", "az", "__") * m.R("AZ", "az", "__", "09")^0

local arrow = S * "<-"

-- what may legally follow a sequence (checked with a look-ahead in 'Seq')
local seq_follow = m.P"/" + ")" + "}" + ":}" + "~}" + "|}" + (name * arrow) + -1

-- from here on 'name' also captures the identifier it matches
name = m.C(name)


-- a defined name only has meaning in a given environment
-- (the environment is the first extra argument given to match)
local Def = name * m.Carg(1)

-- decimal number, captured and converted with 'tonumber'
local num = m.C(m.R"09"^1) * S / tonumber

-- single- or double-quoted string; captures the text between the quotes
local String = "'" * m.C((any - "'")^0) * "'" +
               '"' * m.C((any - '"')^0) * '"'


-- '%name': looked up first in the user definitions, then in 'Predef'
local defined = "%" * Def / function (c,Defs)
  local cat = Defs and Defs[c] or Predef[c]
  if not cat then error ("name '" .. c .. "' undefined") end
  return cat
end

-- 'x-y' inside a class: the '-' is dropped and the pair is given to mm.R
local Range = m.Cs(any * (m.P"-"/"") * (any - "]")) / mm.R

local item = defined + Range + m.C(any)

-- '[...]' character class; the items are folded together with '+'
local Class =
    "["
  * (m.C(m.P"^"^-1)) -- optional complement symbol
  * m.Cf(item * (item - "]")^0, mt.__add) /
                          function (c, p) return c == "^" and any - p or p end
  * "]"
142 | |||
-- Store rule 'exp' under name 'k' in grammar table 't' and return 't'.
-- Raises an error when 'k' already has a rule.
local function adddef (t, k, exp)
  if t[k] then
    error("'"..k.."' already defined as a rule")
  end
  t[k] = exp
  return t
end

-- Start a grammar table: 'n' is stored at index 1 (the initial rule)
-- and 'r' becomes its definition.
local function firstdef (n, r)
  local grammar = {n}
  return adddef(grammar, n, r)
end
153 | |||
154 | |||
-- Turn the rule name 'n' into a non-terminal reference. 'b' tells
-- whether the name appears inside a grammar; outside one it is an error.
local function NT (n, b)
  if b then
    return mm.V(n)
  end
  error("rule '"..n.."' used outside a grammar")
end
161 | |||
-- Fold a labeled ordered choice. The arguments are
--   p1, labels1, p2, labels2, p3, ...
-- where each 'labelsN' is a list of label numbers; the result is built
-- left to right as mm.Lc(acc, p(N+1), <labels N>).
local function labchoice (...)
  local args = { ... }
  local count = #args
  local acc = args[1]
  for k = 2, count - 1, 2 do
    acc = mm.Lc(acc, args[k + 1], unpack(args[k]))
  end
  return acc
end
174 | |||
175 | |||
-- Grammar of the 're' pattern syntax. Matching it against a pattern
-- string yields the corresponding compiled pattern as its capture.
local exp = m.P{ "Exp",
  -- an expression is a grammar, a labeled choice (alternatives separated
  -- by '/{l1,l2,...}', folded by 'labchoice'), or a plain ordered choice
  Exp = S * ( m.V"Grammar"
            + (m.V"Seq") * ("/" * m.V"Labels" * S * m.V"Seq")^1 / labchoice
            + m.Cf(m.V"Seq" * ("/" * S * m.V"Seq")^0, mt.__add) );
  -- '{n1, n2, ...}': the label numbers collected in a table
  Labels = m.Ct(m.P"{" * S * num * (S * "," * S * num)^0 * S * "}");
  -- a sequence must be followed by something in 'seq_follow'
  Seq = m.Cf(m.Cc(m.P"") * m.V"Prefix"^0 , mt.__mul)
        * (#seq_follow + patt_error);
  -- '&p' and '!p' predicates
  Prefix = "&" * S * m.V"Prefix" / mt.__len
         + "!" * S * m.V"Prefix" / mt.__unm
         + m.V"Suffix";
  -- each suffix produces an (argument, function) pair; the fold applies
  -- the function to the pattern built so far and that argument
  Suffix = m.Cf(m.V"Primary" * S *
          ( ( m.P"+" * m.Cc(1, mt.__pow)
            + m.P"*" * m.Cc(0, mt.__pow)
            + m.P"?" * m.Cc(-1, mt.__pow)
            + "^" * ( m.Cg(num * m.Cc(mult))
                    + m.Cg(m.C(m.S"+-" * m.R"09"^1) * m.Cc(mt.__pow))
                    )
            + "->" * S * ( m.Cg((String + num) * m.Cc(mt.__div))
                         + m.P"{}" * m.Cc(nil, m.Ct)
                         + m.Cg(Def / getdef * m.Cc(mt.__div))
                         )
            + "=>" * S * m.Cg(Def / getdef * m.Cc(m.Cmt))
            ) * S
          )^0, function (a,b,f) return f(a,b) end );
  Primary = "(" * m.V"Exp" * ")"
            + String / mm.P
            + Class
            + defined
            -- '%{l1, l2, ...}': the label numbers are passed to mm.T
            + "%{" * S * num * (S * "," * S * num)^0 * S * "}" / mm.T
            + "{:" * (name * ":" + m.Cc(nil)) * m.V"Exp" * ":}" /
                     function (n, p) return mm.Cg(p, n) end
            + "=" * name / function (n) return mm.Cmt(mm.Cb(n), equalcap) end
            + m.P"{}" / mm.Cp
            + "{~" * m.V"Exp" * "~}" / mm.Cs
            + "{|" * m.V"Exp" * "|}" / mm.Ct
            + "{" * m.V"Exp" * "}" / mm.C
            + m.P"." * m.Cc(any)
            -- rule reference; the "G" group tells NT whether we are in a grammar
            + (name * -arrow + "<" * name * ">") * m.Cb("G") / NT;
  Definition = name * arrow * m.V"Exp";
  -- sets "G" to true, then folds the definitions into one grammar table
  Grammar = m.Cg(m.Cc(true), "G") *
            m.Cf(m.V"Definition" / firstdef * m.Cg(m.V"Definition")^0,
              adddef) / mm.P
}
219 | |||
220 | local pattern = S * m.Cg(m.Cc(false), "G") * exp / mm.P * (-any + patt_error) | ||
221 | |||
222 | |||
-- Compile the pattern string 'p' (with optional definitions 'defs')
-- into a pattern object. A value that already is a pattern is returned
-- as is. Raises "incorrect pattern" (level 3) when 'p' does not parse.
local function compile (p, defs)
  if mm.type(p) ~= "pattern" then
    local compiled = pattern:match(p, 1, defs)
    if not compiled then error("incorrect pattern", 3) end
    return compiled
  end
  return p  -- already compiled
end
229 | |||
-- Match pattern 'p' against subject 's' starting at position 'i'
-- (default 1). Compiled patterns are memoized in 'mem', keyed by 'p'.
local function match (s, p, i)
  local start = i or 1
  local compiled = mem[p]
  if not compiled then
    compiled = compile(p)
    mem[p] = compiled
  end
  return compiled:match(s, start)
end
238 | |||
-- Search subject 's' for pattern 'p', starting at position 'i'
-- (default 1). Returns the first and last positions of the match, or
-- the failed match result when nothing is found. The search patterns
-- are memoized in 'fmem', keyed by 'p'.
local function find (s, p, i)
  local searcher = fmem[p]
  if not searcher then
    local body = compile(p) / 0  -- discard the captures of 'p'
    -- try 'p' here, recording both ends; otherwise skip one char and retry
    searcher = mm.P{ mm.Cp() * body * mm.Cp() + 1 * mm.V(1) }
    fmem[p] = searcher
  end
  local first, after = searcher:match(s, i or 1)
  if not first then
    return first
  end
  return first, after - 1
end
251 | |||
-- Replace every match of pattern 'p' in 's' by 'rep'. The substitution
-- patterns are memoized in two levels: gmem[p][rep].
local function gsub (s, p, rep)
  -- keep a strong reference so gmem[p] is not collected while here
  local byrep = gmem[p]
  if not byrep then byrep = {} end
  gmem[p] = byrep
  local subst = byrep[rep]
  if not subst then
    subst = mm.Cs((compile(p) / rep + 1)^0)
    byrep[rep] = subst
  end
  return subst:match(s)
end
263 | |||
264 | |||
265 | -- exported names | ||
266 | local re = { | ||
267 | compile = compile, | ||
268 | match = match, | ||
269 | find = find, | ||
270 | gsub = gsub, | ||
271 | updatelocale = updatelocale, | ||
272 | } | ||
273 | |||
274 | if version == "Lua 5.1" then _G.re = re end | ||
275 | |||
276 | return re | ||
diff --git a/test.lua b/test.lua new file mode 100755 index 0000000..d486c03 --- /dev/null +++ b/test.lua | |||
@@ -0,0 +1,1386 @@ | |||
1 | #!/usr/bin/env lua5.1 | ||
2 | |||
3 | -- $Id: test.lua,v 1.101 2013/04/12 16:30:33 roberto Exp $ | ||
4 | |||
5 | -- require"strict" -- just to be pedantic | ||
6 | |||
7 | local m = require"lpeglabel" | ||
8 | |||
9 | |||
10 | -- for general use | ||
11 | local a, b, c, d, e, f, g, p, t | ||
12 | |||
13 | |||
14 | -- compatibility with Lua 5.2 | ||
15 | local unpack = rawget(table, "unpack") or unpack | ||
16 | local loadstring = rawget(_G, "loadstring") or load | ||
17 | |||
18 | |||
19 | -- most tests here do not need much stack space | ||
20 | m.setmaxstack(5) | ||
21 | |||
22 | local any = m.P(1) | ||
23 | local space = m.S" \t\n"^0 | ||
24 | |||
25 | local function checkeq (x, y, p) -- assert that x and y are deeply equal; a true 'p' prints every pair being compared | ||
26 | if p then print(x,y) end | ||
27 | if type(x) ~= "table" then assert(x == y) -- non-table x: plain equality | ||
28 | else | ||
29 | for k,v in pairs(x) do checkeq(v, y[k], p) end -- every entry of x must match the same key in y | ||
30 | for k,v in pairs(y) do checkeq(v, x[k], p) end -- and every entry of y must match x, so y has no extra keys | ||
31 | end | ||
32 | end | ||
33 | |||
34 | |||
35 | local mt = getmetatable(m.P(1)) | ||
36 | |||
37 | |||
38 | local allchar = {} | ||
39 | for i=0,255 do allchar[i + 1] = i end | ||
40 | allchar = string.char(unpack(allchar)) | ||
41 | assert(#allchar == 256) | ||
42 | |||
43 | local function cs2str (c) -- string made of the parts of 'allchar' that pattern 'c' matches | ||
44 | return m.match(m.Cs((c + m.P(1)/"")^0), allchar) -- keep what 'c' matches; replace any other single character with "" | ||
45 | end | ||
46 | |||
47 | local function eqcharset (c1, c2) -- assert that 'c1' and 'c2' select the same characters of 'allchar' | ||
48 | assert(cs2str(c1) == cs2str(c2)) | ||
49 | end | ||
50 | |||
51 | |||
52 | print"General tests for LPeg library" | ||
53 | |||
54 | assert(type(m.version()) == "string") | ||
55 | print("version " .. m.version()) | ||
56 | assert(m.type("alo") ~= "pattern") | ||
57 | assert(m.type(io.input) ~= "pattern") | ||
58 | assert(m.type(m.P"alo") == "pattern") | ||
59 | |||
60 | -- tests for some basic optimizations | ||
61 | assert(m.match(m.P(false) + "a", "a") == 2) | ||
62 | assert(m.match(m.P(true) + "a", "a") == 1) | ||
63 | assert(m.match("a" + m.P(false), "b") == nil) | ||
64 | assert(m.match("a" + m.P(true), "b") == 1) | ||
65 | |||
66 | assert(m.match(m.P(false) * "a", "a") == nil) | ||
67 | assert(m.match(m.P(true) * "a", "a") == 2) | ||
68 | assert(m.match("a" * m.P(false), "a") == nil) | ||
69 | assert(m.match("a" * m.P(true), "a") == 2) | ||
70 | |||
71 | assert(m.match(#m.P(false) * "a", "a") == nil) | ||
72 | assert(m.match(#m.P(true) * "a", "a") == 2) | ||
73 | assert(m.match("a" * #m.P(false), "a") == nil) | ||
74 | assert(m.match("a" * #m.P(true), "a") == 2) | ||
75 | |||
76 | |||
77 | -- tests for locale | ||
78 | do | ||
79 | assert(m.locale(m) == m) | ||
80 | local t = {} | ||
81 | assert(m.locale(t, m) == t) | ||
82 | local x = m.locale() | ||
83 | for n,v in pairs(x) do | ||
84 | assert(type(n) == "string") | ||
85 | eqcharset(v, m[n]) | ||
86 | end | ||
87 | end | ||
88 | |||
89 | |||
90 | assert(m.match(3, "aaaa")) | ||
91 | assert(m.match(4, "aaaa")) | ||
92 | assert(not m.match(5, "aaaa")) | ||
93 | assert(m.match(-3, "aa")) | ||
94 | assert(not m.match(-3, "aaa")) | ||
95 | assert(not m.match(-3, "aaaa")) | ||
96 | assert(not m.match(-4, "aaaa")) | ||
97 | assert(m.P(-5):match"aaaa") | ||
98 | |||
99 | assert(m.match("a", "alo") == 2) | ||
100 | assert(m.match("al", "alo") == 3) | ||
101 | assert(not m.match("alu", "alo")) | ||
102 | assert(m.match(true, "") == 1) | ||
103 | |||
104 | local digit = m.S"0123456789" | ||
105 | local upper = m.S"ABCDEFGHIJKLMNOPQRSTUVWXYZ" | ||
106 | local lower = m.S"abcdefghijklmnopqrstuvwxyz" | ||
107 | local letter = m.S"" + upper + lower | ||
108 | local alpha = letter + digit + m.R() | ||
109 | |||
110 | eqcharset(m.S"", m.P(false)) | ||
111 | eqcharset(upper, m.R("AZ")) | ||
112 | eqcharset(lower, m.R("az")) | ||
113 | eqcharset(upper + lower, m.R("AZ", "az")) | ||
114 | eqcharset(upper + lower, m.R("AZ", "cz", "aa", "bb", "90")) | ||
115 | eqcharset(digit, m.S"01234567" + "8" + "9") | ||
116 | eqcharset(upper, letter - lower) | ||
117 | eqcharset(m.S(""), m.R()) | ||
118 | assert(cs2str(m.S("")) == "") | ||
119 | |||
120 | eqcharset(m.S"\0", "\0") | ||
121 | eqcharset(m.S"\1\0\2", m.R"\0\2") | ||
122 | eqcharset(m.S"\1\0\2", m.R"\1\2" + "\0") | ||
123 | eqcharset(m.S"\1\0\2" - "\0", m.R"\1\2") | ||
124 | |||
125 | local word = alpha^1 * (1 - alpha)^0 | ||
126 | |||
127 | assert((word^0 * -1):match"alo alo") | ||
128 | assert(m.match(word^1 * -1, "alo alo")) | ||
129 | assert(m.match(word^2 * -1, "alo alo")) | ||
130 | assert(not m.match(word^3 * -1, "alo alo")) | ||
131 | |||
132 | assert(not m.match(word^-1 * -1, "alo alo")) | ||
133 | assert(m.match(word^-2 * -1, "alo alo")) | ||
134 | assert(m.match(word^-3 * -1, "alo alo")) | ||
135 | |||
136 | local eos = m.P(-1) | ||
137 | |||
138 | assert(m.match(digit^0 * letter * digit * eos, "1298a1")) | ||
139 | assert(not m.match(digit^0 * letter * eos, "1257a1")) | ||
140 | |||
141 | b = { | ||
142 | [1] = "(" * (((1 - m.S"()") + #m.P"(" * m.V(1))^0) * ")" | ||
143 | } | ||
144 | |||
145 | assert(m.match(b, "(al())()")) | ||
146 | assert(not m.match(b * eos, "(al())()")) | ||
147 | assert(m.match(b * eos, "((al())()(é))")) | ||
148 | assert(not m.match(b, "(al()()")) | ||
149 | |||
150 | assert(not m.match(letter^1 - "for", "foreach")) | ||
151 | assert(m.match(letter^1 - ("for" * eos), "foreach")) | ||
152 | assert(not m.match(letter^1 - ("for" * eos), "for")) | ||
153 | |||
154 | function basiclookfor (p) -- (global) build a grammar that matches 'p' at the first position, scanning forward, where it succeeds | ||
155 | return m.P { | ||
156 | [1] = p + (1 * m.V(1)) -- try 'p' here; otherwise consume one character and retry | ||
157 | } | ||
158 | end | ||
159 | |||
160 | function caplookfor (p) -- (global) same search as basiclookfor, applied to p:C() so the text matched by 'p' is captured | ||
161 | return basiclookfor(p:C()) | ||
162 | end | ||
163 | |||
164 | assert(m.match(caplookfor(letter^1), " 4achou123...") == "achou") | ||
165 | a = {m.match(caplookfor(letter^1)^0, " two words, one more ")} | ||
166 | checkeq(a, {"two", "words", "one", "more"}) | ||
167 | |||
168 | assert(m.match( basiclookfor((#m.P(b) * 1) * m.Cp()), " ( (a)") == 7) | ||
169 | |||
170 | a = {m.match(m.C(digit^1 * m.Cc"d") + m.C(letter^1 * m.Cc"l"), "123")} | ||
171 | checkeq(a, {"123", "d"}) | ||
172 | |||
173 | a = {m.match(m.C(digit^1) * "d" * -1 + m.C(letter^1 * m.Cc"l"), "123d")} | ||
174 | checkeq(a, {"123"}) | ||
175 | |||
176 | a = {m.match(m.C(digit^1 * m.Cc"d") + m.C(letter^1 * m.Cc"l"), "abcd")} | ||
177 | checkeq(a, {"abcd", "l"}) | ||
178 | |||
179 | a = {m.match(m.Cc(10,20,30) * 'a' * m.Cp(), 'aaa')} | ||
180 | checkeq(a, {10,20,30,2}) | ||
181 | a = {m.match(m.Cp() * m.Cc(10,20,30) * 'a' * m.Cp(), 'aaa')} | ||
182 | checkeq(a, {1,10,20,30,2}) | ||
183 | a = m.match(m.Ct(m.Cp() * m.Cc(10,20,30) * 'a' * m.Cp()), 'aaa') | ||
184 | checkeq(a, {1,10,20,30,2}) | ||
185 | a = m.match(m.Ct(m.Cp() * m.Cc(7,8) * m.Cc(10,20,30) * 'a' * m.Cp()), 'aaa') | ||
186 | checkeq(a, {1,7,8,10,20,30,2}) | ||
187 | a = {m.match(m.Cc() * m.Cc() * m.Cc(1) * m.Cc(2,3,4) * m.Cc() * 'a', 'aaa')} | ||
188 | checkeq(a, {1,2,3,4}) | ||
189 | |||
190 | a = {m.match(m.Cp() * letter^1 * m.Cp(), "abcd")} | ||
191 | checkeq(a, {1, 5}) | ||
192 | |||
193 | |||
194 | t = {m.match({[1] = m.C(m.C(1) * m.V(1) + -1)}, "abc")} | ||
195 | checkeq(t, {"abc", "a", "bc", "b", "c", "c", ""}) | ||
196 | |||
197 | |||
198 | -- test for small capture boundary | ||
199 | for i = 250,260 do | ||
200 | assert(#m.match(m.C(i), string.rep('a', i)) == i) | ||
201 | assert(#m.match(m.C(m.C(i)), string.rep('a', i)) == i) | ||
202 | end | ||
203 | |||
204 | |||
205 | -- tests for any*n and any*-n | ||
206 | for n = 1, 550 do | ||
207 | local x_1 = string.rep('x', n - 1) | ||
208 | local x = x_1 .. 'a' | ||
209 | assert(not m.P(n):match(x_1)) | ||
210 | assert(m.P(n):match(x) == n + 1) | ||
211 | assert(n < 4 or m.match(m.P(n) + "xxx", x_1) == 4) | ||
212 | assert(m.C(n):match(x) == x) | ||
213 | assert(m.C(m.C(n)):match(x) == x) | ||
214 | assert(m.P(-n):match(x_1) == 1) | ||
215 | assert(not m.P(-n):match(x)) | ||
216 | assert(n < 13 or m.match(m.Cc(20) * ((n - 13) * m.P(10)) * 3, x) == 20) | ||
217 | local n3 = math.floor(n/3) | ||
218 | assert(m.match(n3 * m.Cp() * n3 * n3, x) == n3 + 1) | ||
219 | end | ||
220 | |||
221 | -- true values | ||
222 | assert(m.P(0):match("x") == 1) | ||
223 | assert(m.P(0):match("") == 1) | ||
224 | assert(m.C(0):match("x") == "") | ||
225 | |||
226 | assert(m.match(m.Cc(0) * m.P(10) + m.Cc(1) * "xuxu", "xuxu") == 1) | ||
227 | assert(m.match(m.Cc(0) * m.P(10) + m.Cc(1) * "xuxu", "xuxuxuxuxu") == 0) | ||
228 | assert(m.match(m.C(m.P(2)^1), "abcde") == "abcd") | ||
229 | p = m.Cc(0) * 1 + m.Cc(1) * 2 + m.Cc(2) * 3 + m.Cc(3) * 4 | ||
230 | |||
231 | |||
232 | -- test for alternation optimization | ||
233 | assert(m.match(m.P"a"^1 + "ab" + m.P"x"^0, "ab") == 2) | ||
234 | assert(m.match((m.P"a"^1 + "ab" + m.P"x"^0 * 1)^0, "ab") == 3) | ||
235 | assert(m.match(m.P"ab" + "cd" + "" + "cy" + "ak", "98") == 1) | ||
236 | assert(m.match(m.P"ab" + "cd" + "ax" + "cy", "ax") == 3) | ||
237 | assert(m.match("a" * m.P"b"^0 * "c" + "cd" + "ax" + "cy", "ax") == 3) | ||
238 | assert(m.match((m.P"ab" + "cd" + "ax" + "cy")^0, "ax") == 3) | ||
239 | assert(m.match(m.P(1) * "x" + m.S"" * "xu" + "ay", "ay") == 3) | ||
240 | assert(m.match(m.P"abc" + "cde" + "aka", "aka") == 4) | ||
241 | assert(m.match(m.S"abc" * "x" + "cde" + "aka", "ax") == 3) | ||
242 | assert(m.match(m.S"abc" * "x" + "cde" + "aka", "aka") == 4) | ||
243 | assert(m.match(m.S"abc" * "x" + "cde" + "aka", "cde") == 4) | ||
244 | assert(m.match(m.S"abc" * "x" + "ide" + m.S"ab" * "ka", "aka") == 4) | ||
245 | assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "cde" + "aka", "ax") == 3) | ||
246 | assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "cde" + "aka", "aka") == 4) | ||
247 | assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "cde" + "aka", "cde") == 4) | ||
248 | assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "ide" + m.S"ab" * "ka", "aka") == 4) | ||
249 | assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "ide" + m.S"ab" * "ka", "ax") == 3) | ||
250 | assert(m.match(m.P(1) * "x" + "cde" + m.S"ab" * "ka", "aka") == 4) | ||
251 | assert(m.match(m.P(1) * "x" + "cde" + m.P(1) * "ka", "aka") == 4) | ||
252 | assert(m.match(m.P(1) * "x" + "cde" + m.P(1) * "ka", "cde") == 4) | ||
253 | assert(m.match(m.P"eb" + "cd" + m.P"e"^0 + "x", "ee") == 3) | ||
254 | assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x", "abcd") == 3) | ||
255 | assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x", "eeex") == 4) | ||
256 | assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x", "cd") == 3) | ||
257 | assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x", "x") == 1) | ||
258 | assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x" + "", "zee") == 1) | ||
259 | assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x", "abcd") == 3) | ||
260 | assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x", "eeex") == 4) | ||
261 | assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x", "cd") == 3) | ||
262 | assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x", "x") == 2) | ||
263 | assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x" + "", "zee") == 1) | ||
264 | assert(not m.match(("aa" * m.P"bc"^-1 + "aab") * "e", "aabe")) | ||
265 | |||
266 | assert(m.match("alo" * (m.P"\n" + -1), "alo") == 4) | ||
267 | |||
268 | |||
269 | -- bug in 0.12 (rc1) | ||
270 | assert(m.match((m.P"\128\187\191" + m.S"abc")^0, "\128\187\191") == 4) | ||
271 | |||
272 | assert(m.match(m.S"\0\128\255\127"^0, string.rep("\0\128\255\127", 10)) == | ||
273 | 4*10 + 1) | ||
274 | |||
275 | -- optimizations with optional parts | ||
276 | assert(m.match(("ab" * -m.P"c")^-1, "abc") == 1) | ||
277 | assert(m.match(("ab" * #m.P"c")^-1, "abd") == 1) | ||
278 | assert(m.match(("ab" * m.B"c")^-1, "ab") == 1) | ||
279 | assert(m.match(("ab" * m.P"cd"^0)^-1, "abcdcdc") == 7) | ||
280 | |||
281 | assert(m.match(m.P"ab"^-1 - "c", "abcd") == 3) | ||
282 | |||
283 | p = ('Aa' * ('Bb' * ('Cc' * m.P'Dd'^0)^0)^0)^-1 | ||
284 | assert(p:match("AaBbCcDdBbCcDdDdDdBb") == 21) | ||
285 | |||
286 | |||
287 | pi = "3.14159 26535 89793 23846 26433 83279 50288 41971 69399 37510" | ||
288 | assert(m.match(m.Cs((m.P"1" / "a" + m.P"5" / "b" + m.P"9" / "c" + 1)^0), pi) == | ||
289 | m.match(m.Cs((m.P(1) / {["1"] = "a", ["5"] = "b", ["9"] = "c"})^0), pi)) | ||
290 | print"+" | ||
291 | |||
292 | |||
293 | -- tests for capture optimizations | ||
294 | assert(m.match((m.P(3) + 4 * m.Cp()) * "a", "abca") == 5) | ||
295 | t = {m.match(((m.P"a" + m.Cp()) * m.P"x")^0, "axxaxx")} | ||
296 | checkeq(t, {3, 6}) | ||
297 | |||
298 | |||
299 | -- tests for numbered captures | ||
300 | p = m.C(1) | ||
301 | assert(m.match(m.C(m.C(p * m.C(2)) * m.C(3)) / 3, "abcdefgh") == "a") | ||
302 | assert(m.match(m.C(m.C(p * m.C(2)) * m.C(3)) / 1, "abcdefgh") == "abcdef") | ||
303 | assert(m.match(m.C(m.C(p * m.C(2)) * m.C(3)) / 4, "abcdefgh") == "bc") | ||
304 | assert(m.match(m.C(m.C(p * m.C(2)) * m.C(3)) / 0, "abcdefgh") == 7) | ||
305 | |||
306 | a, b, c = m.match(p * (m.C(p * m.C(2)) * m.C(3) / 4) * p, "abcdefgh") | ||
307 | assert(a == "a" and b == "efg" and c == "h") | ||
308 | |||
309 | -- test for table captures | ||
310 | t = m.match(m.Ct(letter^1), "alo") | ||
311 | checkeq(t, {}) | ||
312 | |||
313 | t, n = m.match(m.Ct(m.C(letter)^1) * m.Cc"t", "alo") | ||
314 | assert(n == "t" and table.concat(t) == "alo") | ||
315 | |||
316 | t = m.match(m.Ct(m.C(m.C(letter)^1)), "alo") | ||
317 | assert(table.concat(t, ";") == "alo;a;l;o") | ||
318 | |||
319 | t = m.match(m.Ct(m.C(m.C(letter)^1)), "alo") | ||
320 | assert(table.concat(t, ";") == "alo;a;l;o") | ||
321 | |||
322 | t = m.match(m.Ct(m.Ct((m.Cp() * letter * m.Cp())^1)), "alo") | ||
323 | assert(table.concat(t[1], ";") == "1;2;2;3;3;4") | ||
324 | |||
325 | t = m.match(m.Ct(m.C(m.C(1) * 1 * m.C(1))), "alo") | ||
326 | checkeq(t, {"alo", "a", "o"}) | ||
327 | |||
328 | |||
329 | -- tests for groups | ||
330 | p = m.Cg(1) -- no capture | ||
331 | assert(p:match('x') == 'x') | ||
332 | p = m.Cg(m.P(true)/function () end * 1) -- no value | ||
333 | assert(p:match('x') == 'x') | ||
334 | p = m.Cg(m.Cg(m.Cg(m.C(1)))) | ||
335 | assert(p:match('x') == 'x') | ||
336 | p = m.Cg(m.Cg(m.Cg(m.C(1))^0) * m.Cg(m.Cc(1) * m.Cc(2))) | ||
337 | t = {p:match'abc'} | ||
338 | checkeq(t, {'a', 'b', 'c', 1, 2}) | ||
339 | |||
340 | p = m.Ct(m.Cg(m.Cc(10), "hi") * m.C(1)^0 * m.Cg(m.Cc(20), "ho")) | ||
341 | t = p:match'' | ||
342 | checkeq(t, {hi = 10, ho = 20}) | ||
343 | t = p:match'abc' | ||
344 | checkeq(t, {hi = 10, ho = 20, 'a', 'b', 'c'}) | ||
345 | |||
346 | |||
347 | -- test for error messages | ||
348 | local function checkerr (msg, ...) -- assert that 'msg' occurs in the second result of pcall(...), i.e. in the error message when the call fails | ||
349 | assert(m.match({ m.P(msg) + 1 * m.V(1) }, select(2, pcall(...)))) -- the grammar looks for 'msg' at any position of that result | ||
350 | end | ||
351 | |||
352 | checkerr("rule '1' may be left recursive", m.match, { m.V(1) * 'a' }, "a") | ||
353 | checkerr("rule '1' used outside a grammar", m.match, m.V(1), "") | ||
354 | checkerr("rule 'hiii' used outside a grammar", m.match, m.V('hiii'), "") | ||
355 | checkerr("rule 'hiii' undefined in given grammar", m.match, { m.V('hiii') }, "") | ||
356 | checkerr("undefined in given grammar", m.match, { m.V{} }, "") | ||
357 | |||
358 | checkerr("rule 'A' is not a pattern", m.P, { m.P(1), A = {} }) | ||
359 | checkerr("grammar has no initial rule", m.P, { [print] = {} }) | ||
360 | |||
361 | -- grammar with a long call chain before left recursion | ||
362 | p = {'a', | ||
363 | a = m.V'b' * m.V'c' * m.V'd' * m.V'a', | ||
364 | b = m.V'c', | ||
365 | c = m.V'd', | ||
366 | d = m.V'e', | ||
367 | e = m.V'f', | ||
368 | f = m.V'g', | ||
369 | g = m.P'' | ||
370 | } | ||
371 | checkerr("rule 'a' may be left recursive", m.match, p, "a") | ||
372 | |||
373 | |||
374 | -- tests for non-pattern as arguments to pattern functions | ||
375 | |||
376 | p = { ('a' * m.V(1))^-1 } * m.P'b' * { 'a' * m.V(2); m.V(1)^-1 } | ||
377 | assert(m.match(p, "aaabaac") == 7) | ||
378 | |||
379 | p = m.P'abc' * 2 * -5 * true * 'de' -- mix of numbers and strings and booleans | ||
380 | |||
381 | assert(p:match("abc01de") == 8) | ||
382 | assert(p:match("abc01de3456") == nil) | ||
383 | |||
384 | p = 'abc' * (2 * (-5 * (true * m.P'de'))) | ||
385 | |||
386 | assert(p:match("abc01de") == 8) | ||
387 | assert(p:match("abc01de3456") == nil) | ||
388 | |||
389 | p = { m.V(2), m.P"abc" } * | ||
390 | (m.P{ "xx", xx = m.P"xx" } + { "x", x = m.P"a" * m.V"x" + "" }) | ||
391 | assert(p:match("abcaaaxx") == 7) | ||
392 | assert(p:match("abcxx") == 6) | ||
393 | |||
394 | |||
395 | -- a large table capture | ||
396 | t = m.match(m.Ct(m.C('a')^0), string.rep("a", 10000)) | ||
397 | assert(#t == 10000 and t[1] == 'a' and t[#t] == 'a') | ||
398 | |||
399 | print('+') | ||
400 | |||
401 | |||
402 | -- bug in 0.10 (rechecking a grammar, after tail-call optimization) | ||
403 | m.P{ m.P { (m.P(3) + "xuxu")^0 * m.V"xuxu", xuxu = m.P(1) } } | ||
404 | |||
405 | local V = m.V | ||
406 | |||
407 | local Space = m.S(" \n\t")^0 | ||
408 | local Number = m.C(m.R("09")^1) * Space | ||
409 | local FactorOp = m.C(m.S("+-")) * Space | ||
410 | local TermOp = m.C(m.S("*/")) * Space | ||
411 | local Open = "(" * Space | ||
412 | local Close = ")" * Space | ||
413 | |||
414 | |||
415 | local function f_factor (v1, op, v2, d) -- combine 'v1' and 'v2' according to the additive operator 'op' | ||
416 | assert(d == nil) -- no fourth value is expected | ||
417 | if op == "+" then return v1 + v2 | ||
418 | else return v1 - v2 -- any 'op' other than "+" is treated as subtraction | ||
419 | end | ||
420 | end | ||
421 | |||
422 | |||
423 | local function f_term (v1, op, v2, d) -- combine 'v1' and 'v2' according to the multiplicative operator 'op' | ||
424 | assert(d == nil) -- no fourth value is expected | ||
425 | if op == "*" then return v1 * v2 | ||
426 | else return v1 / v2 -- any 'op' other than "*" is treated as division | ||
427 | end | ||
428 | end | ||
429 | |||
430 | G = m.P{ "Exp", -- (global) arithmetic-expression grammar whose fold captures compute the value while matching | ||
431 | Exp = m.Cf(V"Factor" * m.Cg(FactorOp * V"Factor")^0, f_factor); -- Factors joined by "+"/"-", folded with f_factor | ||
432 | Factor = m.Cf(V"Term" * m.Cg(TermOp * V"Term")^0, f_term); -- Terms joined by "*"/"/", folded with f_term | ||
433 | Term = Number / tonumber + Open * V"Exp" * Close; -- a number converted with tonumber, or a parenthesized Exp | ||
434 | } | ||
435 | |||
436 | G = Space * G * -1 | ||
437 | |||
438 | for _, s in ipairs{" 3 + 5*9 / (1+1) ", "3+4/2", "3+3-3- 9*2+3*9/1- 8"} do | ||
439 | assert(m.match(G, s) == loadstring("return "..s)()) | ||
440 | end | ||
441 | |||
442 | |||
443 | -- test for grammars (errors deep in calling non-terminals) | ||
444 | g = m.P{ | ||
445 | [1] = m.V(2) + "a", | ||
446 | [2] = "a" * m.V(3) * "x", | ||
447 | [3] = "b" * m.V(3) + "c" | ||
448 | } | ||
449 | |||
450 | assert(m.match(g, "abbbcx") == 7) | ||
451 | assert(m.match(g, "abbbbx") == 2) | ||
452 | |||
453 | |||
454 | -- tests for \0 | ||
455 | assert(m.match(m.R("\0\1")^1, "\0\1\0") == 4) | ||
456 | assert(m.match(m.S("\0\1ab")^1, "\0\1\0a") == 5) | ||
457 | assert(m.match(m.P(1)^3, "\0\1\0a") == 5) | ||
458 | assert(not m.match(-4, "\0\1\0a")) | ||
459 | assert(m.match("\0\1\0a", "\0\1\0a") == 5) | ||
460 | assert(m.match("\0\0\0", "\0\0\0") == 4) | ||
461 | assert(not m.match("\0\0\0", "\0\0")) | ||
462 | |||
463 | |||
464 | -- tests for predicates | ||
465 | assert(not m.match(-m.P("a") * 2, "alo")) | ||
466 | assert(m.match(- -m.P("a") * 2, "alo") == 3) | ||
467 | assert(m.match(#m.P("a") * 2, "alo") == 3) | ||
468 | assert(m.match(##m.P("a") * 2, "alo") == 3) | ||
469 | assert(not m.match(##m.P("c") * 2, "alo")) | ||
470 | assert(m.match(m.Cs((##m.P("a") * 1 + m.P(1)/".")^0), "aloal") == "a..a.") | ||
471 | assert(m.match(m.Cs((#((#m.P"a")/"") * 1 + m.P(1)/".")^0), "aloal") == "a..a.") | ||
472 | assert(m.match(m.Cs((- -m.P("a") * 1 + m.P(1)/".")^0), "aloal") == "a..a.") | ||
473 | assert(m.match(m.Cs((-((-m.P"a")/"") * 1 + m.P(1)/".")^0), "aloal") == "a..a.") | ||
474 | |||
475 | p = -m.P'a' * m.Cc(1) + -m.P'b' * m.Cc(2) + -m.P'c' * m.Cc(3) | ||
476 | assert(p:match('a') == 2 and p:match('') == 1 and p:match('b') == 1) | ||
477 | |||
478 | p = -m.P'a' * m.Cc(10) + #m.P'a' * m.Cc(20) | ||
479 | assert(p:match('a') == 20 and p:match('') == 10 and p:match('b') == 10) | ||
480 | |||
481 | |||
482 | |||
483 | -- look-behind predicate | ||
484 | assert(not m.match(m.B'a', 'a')) | ||
485 | assert(m.match(1 * m.B'a', 'a') == 2) | ||
486 | assert(not m.match(m.B(1), 'a')) | ||
487 | assert(m.match(1 * m.B(1), 'a') == 2) | ||
488 | assert(m.match(-m.B(1), 'a') == 1) | ||
489 | assert(m.match(m.B(250), string.rep('a', 250)) == nil) | ||
490 | assert(m.match(250 * m.B(250), string.rep('a', 250)) == 251) | ||
491 | assert(not pcall(m.B, 260)) | ||
492 | |||
493 | B = #letter * -m.B(letter) + -letter * m.B(letter) | ||
494 | x = m.Ct({ (B * m.Cp())^-1 * (1 * m.V(1) + m.P(true)) }) | ||
495 | checkeq(m.match(x, 'ar cal c'), {1,3,4,7,9,10}) | ||
496 | checkeq(m.match(x, ' ar cal '), {2,4,5,8}) | ||
497 | checkeq(m.match(x, ' '), {}) | ||
498 | checkeq(m.match(x, 'aloalo'), {1,7}) | ||
499 | |||
500 | assert(m.match(B, "a") == 1) | ||
501 | assert(m.match(1 * B, "a") == 2) | ||
502 | assert(not m.B(1 - letter):match("")) | ||
503 | assert((-m.B(letter)):match("") == 1) | ||
504 | |||
505 | assert((4 * m.B(letter, 4)):match("aaaaaaaa") == 5) | ||
506 | assert(not (4 * m.B(#letter * 5)):match("aaaaaaaa")) | ||
507 | assert((4 * -m.B(#letter * 5)):match("aaaaaaaa") == 5) | ||
508 | |||
509 | -- look-behind with grammars | ||
510 | assert(m.match('a' * m.B{'x', x = m.P(3)}, 'aaa') == nil) | ||
511 | assert(m.match('aa' * m.B{'x', x = m.P('aaa')}, 'aaaa') == nil) | ||
512 | assert(m.match('aaa' * m.B{'x', x = m.P('aaa')}, 'aaaaa') == 4) | ||
513 | |||
514 | |||
515 | |||
516 | -- bug in 0.9 | ||
517 | assert(m.match(('a' * #m.P'b'), "ab") == 2) | ||
518 | assert(not m.match(('a' * #m.P'b'), "a")) | ||
519 | |||
520 | assert(not m.match(#m.S'567', "")) | ||
521 | assert(m.match(#m.S'567' * 1, "6") == 2) | ||
522 | |||
523 | |||
524 | -- tests for Tail Calls | ||
525 | |||
526 | --labeled failure | ||
527 | p = m.P{ 'a' * m.V(1) + '' } | ||
528 | assert(p:match(string.rep('a', 1000)) == 1001) | ||
529 | |||
530 | -- create a grammar for a simple DFA for even number of 0s and 1s | ||
531 | -- | ||
532 | -- ->1 <---0---> 2 | ||
533 | -- ^ ^ | ||
534 | -- | | | ||
535 | -- 1 1 | ||
536 | -- | | | ||
537 | -- V V | ||
538 | -- 3 <---0---> 4 | ||
539 | -- | ||
540 | -- this grammar should keep no backtracking information | ||
541 | |||
542 | p = m.P{ | ||
543 | [1] = '0' * m.V(2) + '1' * m.V(3) + -1, | ||
544 | [2] = '0' * m.V(1) + '1' * m.V(4), | ||
545 | [3] = '0' * m.V(4) + '1' * m.V(1), | ||
546 | [4] = '0' * m.V(3) + '1' * m.V(2), | ||
547 | } | ||
548 | |||
549 | -- labeled failure | ||
550 | assert(p:match(string.rep("00", 10000))) | ||
551 | assert(p:match(string.rep("01", 10000))) | ||
552 | assert(p:match(string.rep("011", 10000))) | ||
553 | assert(not p:match(string.rep("011", 10000) .. "1")) | ||
554 | assert(not p:match(string.rep("011", 10001))) | ||
555 | |||
556 | |||
557 | -- this grammar does need backtracking info. | ||
558 | local lim = 10000 | ||
559 | p = m.P{ '0' * m.V(1) + '0' } | ||
560 | assert(not pcall(m.match, p, string.rep("0", lim))) | ||
561 | m.setmaxstack(2*lim) | ||
562 | assert(not pcall(m.match, p, string.rep("0", lim))) | ||
563 | m.setmaxstack(2*lim + 4) | ||
564 | assert(pcall(m.match, p, string.rep("0", lim))) | ||
565 | |||
566 | -- this repetition should not need stack space (only the call does) | ||
567 | p = m.P{ ('a' * m.V(1))^0 * 'b' + 'c' } | ||
568 | m.setmaxstack(200) | ||
569 | -- labeled failure | ||
570 | assert(p:match(string.rep('a', 180) .. 'c' .. string.rep('b', 180)) == 362) | ||
571 | |||
572 | m.setmaxstack(5) -- restore original limit | ||
573 | |||
574 | -- tests for optional start position | ||
575 | assert(m.match("a", "abc", 1)) | ||
576 | assert(m.match("b", "abc", 2)) | ||
577 | assert(m.match("c", "abc", 3)) | ||
578 | assert(not m.match(1, "abc", 4)) | ||
579 | assert(m.match("a", "abc", -3)) | ||
580 | assert(m.match("b", "abc", -2)) | ||
581 | assert(m.match("c", "abc", -1)) | ||
582 | assert(m.match("abc", "abc", -4)) -- truncate to position 1 | ||
583 | |||
584 | assert(m.match("", "abc", 10)) -- empty string is everywhere! | ||
585 | assert(m.match("", "", 10)) | ||
586 | assert(not m.match(1, "", 1)) | ||
587 | assert(not m.match(1, "", -1)) | ||
588 | assert(not m.match(1, "", 0)) | ||
589 | |||
590 | print("+") | ||
591 | |||
592 | |||
593 | -- tests for argument captures | ||
594 | assert(not pcall(m.Carg, 0)) | ||
595 | assert(not pcall(m.Carg, -1)) | ||
596 | assert(not pcall(m.Carg, 2^18)) | ||
597 | assert(not pcall(m.match, m.Carg(1), 'a', 1)) | ||
598 | assert(m.match(m.Carg(1), 'a', 1, print) == print) | ||
599 | x = {m.match(m.Carg(1) * m.Carg(2), '', 1, 10, 20)} | ||
600 | checkeq(x, {10, 20}) | ||
601 | |||
602 | assert(m.match(m.Cmt(m.Cg(m.Carg(3), "a") * | ||
603 | m.Cmt(m.Cb("a"), function (s,i,x) | ||
604 | assert(s == "a" and i == 1); | ||
605 | return i, x+1 | ||
606 | end) * | ||
607 | m.Carg(2), function (s,i,a,b,c) | ||
608 | assert(s == "a" and i == 1 and c == nil); | ||
609 | return i, 2*a + 3*b | ||
610 | end) * "a", | ||
611 | "a", 1, false, 100, 1000) == 2*1001 + 3*100) | ||
612 | |||
613 | |||
614 | -- tests for Lua functions | ||
615 | |||
616 | t = {} | ||
617 | s = "" | ||
618 | p = m.P(function (s1, i) assert(s == s1); t[#t + 1] = i; return nil end) * false | ||
619 | s = "hi, this is a test" | ||
620 | assert(m.match(((p - m.P(-1)) + 2)^0, s) == string.len(s) + 1) | ||
621 | assert(#t == string.len(s)/2 and t[1] == 1 and t[2] == 3) | ||
622 | |||
623 | assert(not m.match(p, s)) | ||
624 | |||
625 | p = mt.__add(function (s, i) return i end, function (s, i) return nil end) | ||
626 | assert(m.match(p, "alo")) | ||
627 | |||
628 | p = mt.__mul(function (s, i) return i end, function (s, i) return nil end) | ||
629 | assert(not m.match(p, "alo")) | ||
630 | |||
631 | |||
632 | t = {} | ||
633 | p = function (s1, i) assert(s == s1); t[#t + 1] = i; return i end | ||
634 | s = "hi, this is a test" | ||
635 | assert(m.match((m.P(1) * p)^0, s) == string.len(s) + 1) | ||
636 | assert(#t == string.len(s) and t[1] == 2 and t[2] == 3) | ||
637 | |||
638 | t = {} | ||
639 | p = m.P(function (s1, i) assert(s == s1); t[#t + 1] = i; | ||
640 | return i <= s1:len() and i end) * 1 | ||
641 | s = "hi, this is a test" | ||
642 | assert(m.match(p^0, s) == string.len(s) + 1) | ||
643 | assert(#t == string.len(s) + 1 and t[1] == 1 and t[2] == 2) | ||
644 | |||
645 | p = function (s1, i) return m.match(m.P"a"^1, s1, i) end | ||
646 | assert(m.match(p, "aaaa") == 5) | ||
647 | assert(m.match(p, "abaa") == 2) | ||
648 | assert(not m.match(p, "baaa")) | ||
649 | |||
650 | assert(not pcall(m.match, function () return 2^20 end, s)) | ||
651 | assert(not pcall(m.match, function () return 0 end, s)) | ||
652 | assert(not pcall(m.match, function (s, i) return i - 1 end, s)) | ||
653 | assert(not pcall(m.match, m.P(1)^0 * function (_, i) return i - 1 end, s)) | ||
654 | assert(m.match(m.P(1)^0 * function (_, i) return i end * -1, s)) | ||
655 | assert(not pcall(m.match, m.P(1)^0 * function (_, i) return i + 1 end, s)) | ||
656 | assert(m.match(m.P(function (s, i) return s:len() + 1 end) * -1, s)) | ||
657 | assert(not pcall(m.match, m.P(function (s, i) return s:len() + 2 end) * -1, s)) | ||
658 | assert(not m.match(m.P(function (s, i) return s:len() end) * -1, s)) | ||
659 | assert(m.match(m.P(1)^0 * function (_, i) return true end, s) == | ||
660 | string.len(s) + 1) | ||
661 | for i = 1, string.len(s) + 1 do | ||
662 | assert(m.match(function (_, _) return i end, s) == i) | ||
663 | end | ||
664 | |||
665 | p = (m.P(function (s, i) return i%2 == 0 and i end) * 1 | ||
666 | + m.P(function (s, i) return i%2 ~= 0 and i + 2 <= s:len() and i end) * 3)^0 | ||
667 | * -1 | ||
668 | assert(p:match(string.rep('a', 14000))) | ||
669 | |||
670 | -- tests for Function Replacements | ||
671 | f = function (a, ...) if a ~= "x" then return {a, ...} end end | ||
672 | |||
673 | t = m.match(m.C(1)^0/f, "abc") | ||
674 | checkeq(t, {"a", "b", "c"}) | ||
675 | |||
676 | t = m.match(m.C(1)^0/f/f, "abc") | ||
677 | checkeq(t, {{"a", "b", "c"}}) | ||
678 | |||
679 | t = m.match(m.P(1)^0/f/f, "abc") -- no capture | ||
680 | checkeq(t, {{"abc"}}) | ||
681 | |||
682 | t = m.match((m.P(1)^0/f * m.Cp())/f, "abc") | ||
683 | checkeq(t, {{"abc"}, 4}) | ||
684 | |||
685 | t = m.match((m.C(1)^0/f * m.Cp())/f, "abc") | ||
686 | checkeq(t, {{"a", "b", "c"}, 4}) | ||
687 | |||
688 | t = m.match((m.C(1)^0/f * m.Cp())/f, "xbc") | ||
689 | checkeq(t, {4}) | ||
690 | |||
691 | t = m.match(m.C(m.C(1)^0)/f, "abc") | ||
692 | checkeq(t, {"abc", "a", "b", "c"}) | ||
693 | |||
694 | g = function (...) return 1, ... end | ||
695 | t = {m.match(m.C(1)^0/g/g, "abc")} | ||
696 | checkeq(t, {1, 1, "a", "b", "c"}) | ||
697 | |||
698 | t = {m.match(m.Cc(nil,nil,4) * m.Cc(nil,3) * m.Cc(nil, nil) / g / g, "")} | ||
699 | t1 = {1,1,nil,nil,4,nil,3,nil,nil} | ||
700 | for i=1,10 do assert(t[i] == t1[i]) end | ||
701 | |||
702 | t = {m.match((m.C(1) / function (x) return x, x.."x" end)^0, "abc")} | ||
703 | checkeq(t, {"a", "ax", "b", "bx", "c", "cx"}) | ||
704 | |||
705 | t = m.match(m.Ct((m.C(1) / function (x,y) return y, x end * m.Cc(1))^0), "abc") | ||
706 | checkeq(t, {nil, "a", 1, nil, "b", 1, nil, "c", 1}) | ||
707 | |||
708 | -- tests for Query Replacements | ||
709 | |||
710 | assert(m.match(m.C(m.C(1)^0)/{abc = 10}, "abc") == 10) | ||
711 | assert(m.match(m.C(1)^0/{a = 10}, "abc") == 10) | ||
712 | assert(m.match(m.S("ba")^0/{ab = 40}, "abc") == 40) | ||
713 | t = m.match(m.Ct((m.S("ba")/{a = 40})^0), "abc") | ||
714 | checkeq(t, {40}) | ||
715 | |||
716 | assert(m.match(m.Cs((m.C(1)/{a=".", d=".."})^0), "abcdde") == ".bc....e") | ||
717 | assert(m.match(m.Cs((m.C(1)/{f="."})^0), "abcdde") == "abcdde") | ||
718 | assert(m.match(m.Cs((m.C(1)/{d="."})^0), "abcdde") == "abc..e") | ||
719 | assert(m.match(m.Cs((m.C(1)/{e="."})^0), "abcdde") == "abcdd.") | ||
720 | assert(m.match(m.Cs((m.C(1)/{e=".", f="+"})^0), "eefef") == "..+.+") | ||
721 | assert(m.match(m.Cs((m.C(1))^0), "abcdde") == "abcdde") | ||
722 | assert(m.match(m.Cs(m.C(m.C(1)^0)), "abcdde") == "abcdde") | ||
723 | assert(m.match(1 * m.Cs(m.P(1)^0), "abcdde") == "bcdde") | ||
724 | assert(m.match(m.Cs((m.C('0')/'x' + 1)^0), "abcdde") == "abcdde") | ||
725 | assert(m.match(m.Cs((m.C('0')/'x' + 1)^0), "0ab0b0") == "xabxbx") | ||
726 | assert(m.match(m.Cs((m.C('0')/'x' + m.P(1)/{b=3})^0), "b0a0b") == "3xax3") | ||
727 | assert(m.match(m.P(1)/'%0%0'/{aa = -3} * 'x', 'ax') == -3) | ||
728 | assert(m.match(m.C(1)/'%0%1'/{aa = 'z'}/{z = -3} * 'x', 'ax') == -3) | ||
729 | |||
730 | assert(m.match(m.Cs(m.Cc(0) * (m.P(1)/"")), "4321") == "0") | ||
731 | |||
732 | assert(m.match(m.Cs((m.P(1) / "%0")^0), "abcd") == "abcd") | ||
733 | assert(m.match(m.Cs((m.P(1) / "%0.%0")^0), "abcd") == "a.ab.bc.cd.d") | ||
734 | assert(m.match(m.Cs((m.P("a") / "%0.%0" + 1)^0), "abcad") == "a.abca.ad") | ||
735 | assert(m.match(m.C("a") / "%1%%%0", "a") == "a%a") | ||
736 | assert(m.match(m.Cs((m.P(1) / ".xx")^0), "abcd") == ".xx.xx.xx.xx") | ||
737 | assert(m.match(m.Cp() * m.P(3) * m.Cp()/"%2%1%1 - %0 ", "abcde") == | ||
738 | "411 - abc ") | ||
739 | |||
740 | assert(pcall(m.match, m.P(1)/"%0", "abc")) | ||
741 | assert(not pcall(m.match, m.P(1)/"%1", "abc")) -- out of range | ||
742 | assert(not pcall(m.match, m.P(1)/"%9", "abc")) -- out of range | ||
743 | |||
744 | p = m.C(1) | ||
745 | p = p * p; p = p * p; p = p * p * m.C(1) / "%9 - %1" | ||
746 | assert(p:match("1234567890") == "9 - 1") | ||
747 | |||
748 | assert(m.match(m.Cc(print), "") == print) | ||
749 | |||
750 | -- too many captures (just ignore extra ones) | ||
751 | p = m.C(1)^0 / "%2-%9-%0-%9" | ||
752 | assert(p:match"01234567890123456789" == "1-8-01234567890123456789-8") | ||
753 | s = string.rep("12345678901234567890", 20) | ||
754 | assert(m.match(m.C(1)^0 / "%9-%1-%0-%3", s) == "9-1-" .. s .. "-3") | ||
755 | |||
756 | -- string captures with non-string subcaptures | ||
757 | p = m.Cc('alo') * m.C(1) / "%1 - %2 - %1" | ||
758 | assert(p:match'x' == 'alo - x - alo') | ||
759 | |||
760 | assert(not pcall(m.match, m.Cc(true) / "%1", "a")) | ||
761 | |||
762 | -- long strings for string capture | ||
763 | l = 10000 | ||
764 | s = string.rep('a', l) .. string.rep('b', l) .. string.rep('c', l) | ||
765 | |||
766 | p = (m.C(m.P'a'^1) * m.C(m.P'b'^1) * m.C(m.P'c'^1)) / '%3%2%1' | ||
767 | |||
768 | assert(p:match(s) == string.rep('c', l) .. | ||
769 | string.rep('b', l) .. | ||
770 | string.rep('a', l)) | ||
771 | |||
772 | print"+" | ||
773 | |||
774 | -- accumulator capture | ||
-- Folding function for the accumulator-capture test: returns its first
-- argument plus one; any extra argument (the captured value) is ignored.
function f (x)
  local bumped = x + 1
  return bumped
end
776 | assert(m.match(m.Cf(m.Cc(0) * m.C(1)^0, f), "alo alo") == 7) | ||
777 | |||
778 | t = {m.match(m.Cf(m.Cc(1,2,3), error), "")} | ||
779 | checkeq(t, {1}) | ||
780 | p = m.Cf(m.Ct(true) * m.Cg(m.C(m.R"az"^1) * "=" * m.C(m.R"az"^1) * ";")^0, | ||
781 | rawset) | ||
782 | t = p:match("a=b;c=du;xux=yuy;") | ||
783 | checkeq(t, {a="b", c="du", xux="yuy"}) | ||
784 | |||
785 | |||
786 | -- errors in accumulator capture | ||
787 | |||
788 | -- very long match (forces fold to be an open-close pair) that produces | ||
789 | -- no initial capture | ||
790 | assert(not pcall(m.match, m.Cf(m.P(500), print), string.rep('a', 600))) | ||
791 | |||
792 | -- nested capture produces no initial value | ||
793 | assert(not pcall(m.match, m.Cf(m.P(1) / {}, print), "alo")) | ||
794 | |||
795 | |||
796 | -- tests for loop checker | ||
797 | |||
-- Asserts that applying '^0' to the pattern built from 'p' raises an error.
-- 'p' is first converted with m.P, so strings, numbers and grammar tables
-- are accepted as well as patterns.
local function haveloop (p)
  local function star (patt)
    return patt^0
  end
  local ok = pcall(star, m.P(p))
  assert(not ok)
end
801 | |||
802 | haveloop(m.P("x")^-4) | ||
803 | assert(m.match(((m.P(0) + 1) * m.S"al")^0, "alo") == 3) | ||
804 | assert(m.match((("x" + #m.P(1))^-4 * m.S"al")^0, "alo") == 3) | ||
805 | haveloop("") | ||
806 | haveloop(m.P("x")^0) | ||
807 | haveloop(m.P("x")^-1) | ||
808 | haveloop(m.P("x") + 1 + 2 + m.P("a")^-1) | ||
809 | haveloop(-m.P("ab")) | ||
810 | haveloop(- -m.P("ab")) | ||
811 | haveloop(# #(m.P("ab") + "xy")) | ||
812 | haveloop(- #m.P("ab")^0) | ||
813 | haveloop(# -m.P("ab")^1) | ||
814 | haveloop(#m.V(3)) | ||
815 | haveloop(m.V(3) + m.V(1) + m.P('a')^-1) | ||
816 | haveloop({[1] = m.V(2) * m.V(3), [2] = m.V(3), [3] = m.P(0)}) | ||
817 | assert(m.match(m.P{[1] = m.V(2) * m.V(3), [2] = m.V(3), [3] = m.P(1)}^0, "abc") | ||
818 | == 3) | ||
819 | assert(m.match(m.P""^-3, "a") == 1) | ||
820 | |||
-- Matches the pattern produced by basiclookfor(p) against 's' and returns
-- whatever that match returns.
local function find (p, s)
  local searcher = basiclookfor(p)
  return m.match(searcher, s)
end
824 | |||
825 | |||
-- Asserts that building a pattern from grammar 'g' fails.
-- When 'expected' is given, the error message must also contain a match
-- for it (checked through 'find').
local function badgrammar (g, expected)
  local ok, err = pcall(m.P, g)
  assert(not ok)
  if not expected then return end
  assert(find(expected, err))
end
831 | |||
832 | badgrammar({[1] = m.V(1)}, "rule '1'") | ||
833 | badgrammar({[1] = m.V(2)}, "rule '2'") -- invalid non-terminal | ||
834 | badgrammar({[1] = m.V"x"}, "rule 'x'") -- invalid non-terminal | ||
835 | badgrammar({[1] = m.V{}}, "rule '(a table)'") -- invalid non-terminal | ||
836 | badgrammar({[1] = #m.P("a") * m.V(1)}, "rule '1'") -- left-recursive | ||
837 | badgrammar({[1] = -m.P("a") * m.V(1)}, "rule '1'") -- left-recursive | ||
838 | badgrammar({[1] = -1 * m.V(1)}, "rule '1'") -- left-recursive | ||
839 | badgrammar({[1] = -1 + m.V(1)}, "rule '1'") -- left-recursive | ||
840 | badgrammar({[1] = 1 * m.V(2), [2] = m.V(2)}, "rule '2'") -- left-recursive | ||
841 | badgrammar({[1] = 1 * m.V(2)^0, [2] = m.P(0)}, "rule '1'") -- inf. loop | ||
842 | badgrammar({ m.V(2), m.V(3)^0, m.P"" }, "rule '2'") -- inf. loop | ||
843 | badgrammar({ m.V(2) * m.V(3)^0, m.V(3)^0, m.P"" }, "rule '1'") -- inf. loop | ||
844 | badgrammar({"x", x = #(m.V(1) * 'a') }, "rule '1'") -- inf. loop | ||
845 | badgrammar({ -(m.V(1) * 'a') }, "rule '1'") -- inf. loop | ||
846 | badgrammar({"x", x = m.P'a'^-1 * m.V"x"}, "rule 'x'") -- left recursive | ||
847 | badgrammar({"x", x = m.P'a' * m.V"y"^1, y = #m.P(1)}, "rule 'x'") | ||
848 | |||
849 | assert(m.match({'a' * -m.V(1)}, "aaa") == 2) | ||
850 | assert(m.match({'a' * -m.V(1)}, "aaaa") == nil) | ||
851 | |||
852 | |||
853 | -- good vs. bad grammars | ||
854 | m.P{ ('a' * m.V(1))^-1 } | ||
855 | m.P{ -('a' * m.V(1)) } | ||
856 | m.P{ ('abc' * m.V(1))^-1 } | ||
857 | m.P{ -('abc' * m.V(1)) } | ||
858 | badgrammar{ #m.P('abc') * m.V(1) } | ||
859 | badgrammar{ -('a' + m.V(1)) } | ||
860 | m.P{ #('a' * m.V(1)) } | ||
861 | badgrammar{ #('a' + m.V(1)) } | ||
862 | m.P{ m.B{ m.P'abc' } * 'a' * m.V(1) } | ||
863 | badgrammar{ m.B{ m.P'abc' } * m.V(1) } | ||
864 | badgrammar{ ('a' + m.P'bcd')^-1 * m.V(1) } | ||
865 | |||
866 | |||
867 | -- simple tests for maximum sizes: | ||
868 | local p = m.P"a" | ||
869 | for i=1,14 do p = p * p end | ||
870 | |||
871 | p = {} | ||
872 | for i=1,100 do p[i] = m.P"a" end | ||
873 | p = m.P(p) | ||
874 | |||
875 | |||
876 | -- strange values for rule labels | ||
877 | |||
878 | p = m.P{ "print", | ||
879 | print = m.V(print), | ||
880 | [print] = m.V(_G), | ||
881 | [_G] = m.P"a", | ||
882 | } | ||
883 | |||
884 | assert(p:match("a")) | ||
885 | |||
886 | -- initial rule | ||
887 | g = {} | ||
888 | for i = 1, 10 do g["i"..i] = "a" * m.V("i"..i+1) end | ||
889 | g.i11 = m.P"" | ||
890 | for i = 1, 10 do | ||
891 | g[1] = "i"..i | ||
892 | local p = m.P(g) | ||
893 | assert(p:match("aaaaaaaaaaa") == 11 - i + 1) | ||
894 | end | ||
895 | |||
896 | print"+" | ||
897 | |||
898 | |||
899 | -- tests for back references | ||
900 | assert(not pcall(m.match, m.Cb('x'), '')) | ||
901 | assert(not pcall(m.match, m.Cg(1, 'a') * m.Cb('b'), 'a')) | ||
902 | |||
903 | p = m.Cg(m.C(1) * m.C(1), "k") * m.Ct(m.Cb("k")) | ||
904 | t = p:match("ab") | ||
905 | checkeq(t, {"a", "b"}) | ||
906 | |||
907 | |||
908 | t = {} | ||
-- Records 'p' at the end of the global list 't' and returns 'p' with an
-- "x" appended; the back-reference test uses 't' to check call order.
function foo (p)
  local slot = #t + 1
  t[slot] = p
  return p .. "x"
end
910 | |||
911 | p = m.Cg(m.C(2) / foo, "x") * m.Cb"x" * | ||
912 | m.Cg(m.Cb('x') / foo, "x") * m.Cb"x" * | ||
913 | m.Cg(m.Cb('x') / foo, "x") * m.Cb"x" * | ||
914 | m.Cg(m.Cb('x') / foo, "x") * m.Cb"x" | ||
915 | x = {p:match'ab'} | ||
916 | checkeq(x, {'abx', 'abxx', 'abxxx', 'abxxxx'}) | ||
917 | checkeq(t, {'ab', | ||
918 | 'ab', 'abx', | ||
919 | 'ab', 'abx', 'abxx', | ||
920 | 'ab', 'abx', 'abxx', 'abxxx'}) | ||
921 | |||
922 | |||
923 | |||
924 | -- tests for match-time captures | ||
925 | |||
926 | p = m.P'a' * (function (s, i) return (s:sub(i, i) == 'b') and i + 1 end) | ||
927 | + 'acd' | ||
928 | |||
929 | assert(p:match('abc') == 3) | ||
930 | assert(p:match('acd') == 4) | ||
931 | |||
-- Pass-through match-time function: returns true followed by every extra
-- argument it received (the subject 's' and position 'i' are ignored).
local function id (s, i, ...)
  local accept = true
  return accept, ...
end
935 | |||
936 | assert(m.Cmt(m.Cs((m.Cmt(m.S'abc' / { a = 'x', c = 'y' }, id) + | ||
937 | m.R'09'^1 / string.char + | ||
938 | m.P(1))^0), id):match"acb98+68c" == "xyb\98+\68y") | ||
939 | |||
940 | p = m.P{'S', | ||
941 | S = m.V'atom' * space | ||
942 | + m.Cmt(m.Ct("(" * space * (m.Cmt(m.V'S'^1, id) + m.P(true)) * ")" * space), id), | ||
943 | atom = m.Cmt(m.C(m.R("AZ", "az", "09")^1), id) | ||
944 | } | ||
945 | x = p:match"(a g () ((b) c) (d (e)))" | ||
946 | checkeq(x, {'a', 'g', {}, {{'b'}, 'c'}, {'d', {'e'}}}); | ||
947 | |||
948 | x = {(m.Cmt(1, id)^0):match(string.rep('a', 500))} | ||
949 | assert(#x == 500) | ||
950 | |||
-- Match-time function used to check which returned values are kept:
-- for x == 'a' it returns the position 'i' plus the values 1, 3, 7;
-- for anything else it returns nil followed by 2, 4, 6, 8.
local function id(s, i, x)
  if x ~= 'a' then
    return nil, 2, 4, 6, 8
  end
  return i, 1, 3, 7
end
956 | |||
957 | p = ((m.P(id) * 1 + m.Cmt(2, id) * 1 + m.Cmt(1, id) * 1))^0 | ||
958 | assert(table.concat{p:match('abababab')} == string.rep('137', 4)) | ||
959 | |||
-- Match-time function: matches 'x' (used as a pattern) against subject 's',
-- starting x:len() positions before 'i', and returns that match's results.
local function ref (s, i, x)
  local start = i - x:len()
  return m.match(x, s, start)
end
963 | |||
964 | assert(m.Cmt(m.P(1)^0, ref):match('alo') == 4) | ||
965 | assert((m.P(1) * m.Cmt(m.P(1)^0, ref)):match('alo') == 4) | ||
966 | assert(not (m.P(1) * m.Cmt(m.C(1)^0, ref)):match('alo')) | ||
967 | |||
-- First result is 'i' when 'i' equals tonumber(x), otherwise false;
-- second result is always the string 'xuxu'.
ref = function (s, i, x)
  local hit = (i == tonumber(x)) and i
  return hit, 'xuxu'
end
969 | |||
970 | assert(m.Cmt(1, ref):match'2') | ||
971 | assert(not m.Cmt(1, ref):match'1') | ||
972 | assert(m.Cmt(m.P(1)^0, ref):match'03') | ||
973 | |||
-- When 'a' equals 'b', returns 'i' and 'a' in upper case;
-- otherwise returns no values.
function ref (s, i, a, b)
  if a ~= b then return end
  return i, a:upper()
end
977 | |||
978 | p = m.Cmt(m.C(m.R"az"^1) * "-" * m.C(m.R"az"^1), ref) | ||
979 | p = (any - p)^0 * p * any^0 * -1 | ||
980 | |||
981 | assert(p:match'abbbc-bc ddaa' == 'BC') | ||
982 | |||
do  -- match-time captures cannot be optimized away
  -- number of times the match-time function ran since the last check
  local calls = 0
  f = m.P(function ()
    calls = calls + 1
    return true
  end)

  -- asserts the function ran exactly 'n' times (default 1), then resets
  local function check (n)
    n = n or 1
    assert(calls == n)
    calls = 0
  end

  assert(m.match(f * false + 'b', 'a') == nil)
  check()
  assert(m.match(f * false + 'b', '') == nil)
  check()
  assert(m.match( (f * 'a')^0 * 'b', 'b') == 2)
  check()
  assert(m.match( (f * 'a')^0 * 'b', '') == nil)
  check()
  assert(m.match( (f * 'a')^-1 * 'b', 'b') == 2)
  check()
  assert(m.match( (f * 'a')^-1 * 'b', '') == nil)
  check()
  assert(m.match( ('b' + f * 'a')^-1 * 'b', '') == nil)
  check()
  assert(m.match( (m.P'b'^-1 * f * 'a')^-1 * 'b', '') == nil)
  check()
  assert(m.match( (-m.P(1) * m.P'b'^-1 * f * 'a')^-1 * 'b', '') == nil)
  check()
  assert(m.match( (f * 'a' + 'b')^-1 * 'b', '') == nil)
  check()
  assert(m.match(f * 'a' + f * 'b', 'b') == 2)
  check(2)
  assert(m.match(f * 'a' + f * 'b', 'a') == 2)
  check(1)
  assert(m.match(-f * 'a' + 'b', 'b') == 2)
  check(1)
  assert(m.match(-f * 'a' + 'b', '') == nil)
  check(1)
end
1005 | |||
1006 | c = '[' * m.Cg(m.P'='^0, "init") * '[' * | ||
1007 | { m.Cmt(']' * m.C(m.P'='^0) * ']' * m.Cb("init"), function (_, _, s1, s2) | ||
1008 | return s1 == s2 end) | ||
1009 | + 1 * m.V(1) } / 0 | ||
1010 | |||
1011 | assert(c:match'[==[]]====]]]]==]===[]' == 18) | ||
1012 | assert(c:match'[[]=]====]=]]]==]===[]' == 14) | ||
1013 | assert(not c:match'[[]=]====]=]=]==]===[]') | ||
1014 | |||
1015 | |||
1016 | -- old bug: optimization of concat with fail removed match-time capture | ||
1017 | p = m.Cmt(0, function (s) p = s end) * m.P(false) | ||
1018 | assert(not p:match('alo')) | ||
1019 | assert(p == 'alo') | ||
1020 | |||
1021 | |||
1022 | -- ensure that failed match-time captures are not kept on Lua stack | ||
do
  -- weak table acting as its own metatable: besides the '__mode' field it
  -- only holds the tables that 'foo' registers as weak keys
  local weak = {__mode = "kv"}
  setmetatable(weak, weak)
  local calls = 0

  -- match-time function: after a full collection the weak table must hold
  -- nothing but '__mode', i.e. the table returned by the previous call is
  -- gone; then it registers and returns a fresh table as capture
  local function foo (s, i)
    collectgarbage()
    assert(next(weak) == "__mode" and next(weak, "__mode") == nil)
    local fresh = {}
    weak[fresh] = true
    calls = calls + 1
    return i, fresh
  end

  local patt = m.P{ m.Cmt(0, foo) * m.P(false) + m.P(1) * m.V(1) + m.P"" }
  patt:match(string.rep('1', 10))
  assert(calls == 11)
end
1040 | |||
1041 | p = (m.P(function () return true, "a" end) * 'a' | ||
1042 | + m.P(function (s, i) return i, "aa", 20 end) * 'b' | ||
1043 | + m.P(function (s,i) if i <= #s then return i, "aaa" end end) * 1)^0 | ||
1044 | |||
1045 | t = {p:match('abacc')} | ||
1046 | checkeq(t, {'a', 'aa', 20, 'a', 'aaa', 'aaa'}) | ||
1047 | |||
1048 | |||
1049 | ------------------------------------------------------------------- | ||
1050 | -- Tests for 're' module | ||
1051 | ------------------------------------------------------------------- | ||
1052 | |||
1053 | local re = require "re" | ||
1054 | |||
1055 | local match, compile = re.match, re.compile | ||
1056 | |||
1057 | assert(match("a", ".") == 2) | ||
1058 | assert(match("a", "''") == 1) | ||
1059 | assert(match("", " ! . ") == 1) | ||
1060 | assert(not match("a", " ! . ")) | ||
1061 | assert(match("abcde", " ( . . ) * ") == 5) | ||
1062 | assert(match("abbcde", " [a-c] +") == 5) | ||
1063 | assert(match("0abbc1de", "'0' [a-c]+ '1'") == 7) | ||
1064 | assert(match("0zz1dda", "'0' [^a-c]+ 'a'") == 8) | ||
1065 | assert(match("abbc--", " [a-c] + +") == 5) | ||
1066 | assert(match("abbc--", " [ac-] +") == 2) | ||
1067 | assert(match("abbc--", " [-acb] + ") == 7) | ||
1068 | assert(not match("abbcde", " [b-z] + ")) | ||
1069 | assert(match("abb\"de", '"abb"["]"de"') == 7) | ||
1070 | assert(match("abceeef", "'ac' ? 'ab' * 'c' { 'e' * } / 'abceeef' ") == "eee") | ||
1071 | assert(match("abceeef", "'ac'? 'ab'* 'c' { 'f'+ } / 'abceeef' ") == 8) | ||
1072 | local t = {match("abceefe", "( ( & 'e' {} ) ? . ) * ")} | ||
1073 | checkeq(t, {4, 5, 7}) | ||
1074 | local t = {match("abceefe", "((&&'e' {})? .)*")} | ||
1075 | checkeq(t, {4, 5, 7}) | ||
1076 | local t = {match("abceefe", "( ( ! ! 'e' {} ) ? . ) *")} | ||
1077 | checkeq(t, {4, 5, 7}) | ||
1078 | local t = {match("abceefe", "(( & ! & ! 'e' {})? .)*")} | ||
1079 | checkeq(t, {4, 5, 7}) | ||
1080 | |||
1081 | assert(match("cccx" , "'ab'? ('ccc' / ('cde' / 'cd'*)? / 'ccc') 'x'+") == 5) | ||
1082 | assert(match("cdx" , "'ab'? ('ccc' / ('cde' / 'cd'*)? / 'ccc') 'x'+") == 4) | ||
1083 | assert(match("abcdcdx" , "'ab'? ('ccc' / ('cde' / 'cd'*)? / 'ccc') 'x'+") == 8) | ||
1084 | |||
1085 | assert(match("abc", "a <- (. a)?") == 4) | ||
1086 | b = "balanced <- '(' ([^()] / balanced)* ')'" | ||
1087 | assert(match("(abc)", b)) | ||
1088 | assert(match("(a(b)((c) (d)))", b)) | ||
1089 | assert(not match("(a(b ((c) (d)))", b)) | ||
1090 | |||
1091 | b = compile[[ balanced <- "(" ([^()] / balanced)* ")" ]] | ||
1092 | assert(b == m.P(b)) | ||
1093 | assert(b:match"((((a))(b)))") | ||
1094 | |||
1095 | local g = [[ | ||
1096 | S <- "0" B / "1" A / "" -- balanced strings | ||
1097 | A <- "0" S / "1" A A -- one more 0 | ||
1098 | B <- "1" S / "0" B B -- one more 1 | ||
1099 | ]] | ||
1100 | assert(match("00011011", g) == 9) | ||
1101 | |||
1102 | local g = [[ | ||
1103 | S <- ("0" B / "1" A)* | ||
1104 | A <- "0" / "1" A A | ||
1105 | B <- "1" / "0" B B | ||
1106 | ]] | ||
1107 | assert(match("00011011", g) == 9) | ||
1108 | assert(match("000110110", g) == 9) | ||
1109 | assert(match("011110110", g) == 3) | ||
1110 | assert(match("000110010", g) == 1) | ||
1111 | |||
1112 | s = "aaaaaaaaaaaaaaaaaaaaaaaa" | ||
1113 | assert(match(s, "'a'^3") == 4) | ||
1114 | assert(match(s, "'a'^0") == 1) | ||
1115 | assert(match(s, "'a'^+3") == s:len() + 1) | ||
1116 | assert(not match(s, "'a'^+30")) | ||
1117 | assert(match(s, "'a'^-30") == s:len() + 1) | ||
1118 | assert(match(s, "'a'^-5") == 6) | ||
1119 | for i = 1, s:len() do | ||
1120 | assert(match(s, string.format("'a'^+%d", i)) >= i + 1) | ||
1121 | assert(match(s, string.format("'a'^-%d", i)) <= i + 1) | ||
1122 | assert(match(s, string.format("'a'^%d", i)) == i + 1) | ||
1123 | end | ||
1124 | assert(match("01234567890123456789", "[0-9]^3+") == 19) | ||
1125 | |||
1126 | |||
1127 | assert(match("01234567890123456789", "({....}{...}) -> '%2%1'") == "4560123") | ||
1128 | t = match("0123456789", "{| {.}* |}") | ||
1129 | checkeq(t, {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}) | ||
1130 | assert(match("012345", "{| (..) -> '%0%0' |}")[1] == "0101") | ||
1131 | |||
1132 | assert(match("abcdef", "( {.} {.} {.} {.} {.} ) -> 3") == "c") | ||
1133 | assert(match("abcdef", "( {:x: . :} {.} {.} {.} {.} ) -> 3") == "d") | ||
1134 | assert(match("abcdef", "( {:x: . :} {.} {.} {.} {.} ) -> 0") == 6) | ||
1135 | |||
1136 | assert(not match("abcdef", "{:x: ({.} {.} {.}) -> 2 :} =x")) | ||
1137 | assert(match("abcbef", "{:x: ({.} {.} {.}) -> 2 :} =x")) | ||
1138 | |||
1139 | eqcharset(compile"[]]", "]") | ||
1140 | eqcharset(compile"[][]", m.S"[]") | ||
1141 | eqcharset(compile"[]-]", m.S"-]") | ||
1142 | eqcharset(compile"[-]", m.S"-") | ||
1143 | eqcharset(compile"[az-]", m.S"a-z") | ||
1144 | eqcharset(compile"[-az]", m.S"a-z") | ||
1145 | eqcharset(compile"[a-z]", m.R"az") | ||
1146 | eqcharset(compile"[]['\"]", m.S[[]['"]]) | ||
1147 | |||
1148 | eqcharset(compile"[^]]", any - "]") | ||
1149 | eqcharset(compile"[^][]", any - m.S"[]") | ||
1150 | eqcharset(compile"[^]-]", any - m.S"-]") | ||
1151 | eqcharset(compile"[^]-]", any - m.S"-]") | ||
1152 | eqcharset(compile"[^-]", any - m.S"-") | ||
1153 | eqcharset(compile"[^az-]", any - m.S"a-z") | ||
1154 | eqcharset(compile"[^-az]", any - m.S"a-z") | ||
1155 | eqcharset(compile"[^a-z]", any - m.R"az") | ||
1156 | eqcharset(compile"[^]['\"]", any - m.S[[]['"]]) | ||
1157 | |||
1158 | -- tests for comments in 're' | ||
1159 | e = compile[[ | ||
1160 | A <- _B -- \t \n %nl .<> <- -> -- | ||
1161 | _B <- 'x' --]] | ||
1162 | assert(e:match'xy' == 2) | ||
1163 | |||
1164 | -- tests for 're' with pre-definitions | ||
1165 | defs = {digits = m.R"09", letters = m.R"az", _=m.P"__"} | ||
1166 | e = compile("%letters (%letters / %digits)*", defs) | ||
1167 | assert(e:match"x123" == 5) | ||
1168 | e = compile("%_", defs) | ||
1169 | assert(e:match"__" == 3) | ||
1170 | |||
1171 | e = compile([[ | ||
1172 | S <- A+ | ||
1173 | A <- %letters+ B | ||
1174 | B <- %digits+ | ||
1175 | ]], defs) | ||
1176 | |||
1177 | e = compile("{[0-9]+'.'?[0-9]*} -> sin", math) | ||
1178 | assert(e:match("2.34") == math.sin(2.34)) | ||
1179 | |||
1180 | |||
-- Returns whether the third and fourth arguments are equal; the first two
-- arguments are ignored.
function eq (_, _, a, b)
  local same = (a == b)
  return same
end
1182 | |||
1183 | c = re.compile([[ | ||
1184 | longstring <- '[' {:init: '='* :} '[' close | ||
1185 | close <- ']' =init ']' / . close | ||
1186 | ]]) | ||
1187 | |||
1188 | assert(c:match'[==[]]===]]]]==]===[]' == 17) | ||
1189 | assert(c:match'[[]=]====]=]]]==]===[]' == 14) | ||
1190 | assert(not c:match'[[]=]====]=]=]==]===[]') | ||
1191 | |||
1192 | c = re.compile" '[' {:init: '='* :} '[' (!(']' =init ']') .)* ']' =init ']' !. " | ||
1193 | |||
1194 | assert(c:match'[==[]]===]]]]==]') | ||
1195 | assert(c:match'[[]=]====]=][]==]===[]]') | ||
1196 | assert(not c:match'[[]=]====]=]=]==]===[]') | ||
1197 | |||
1198 | assert(re.find("hi alalo", "{:x:..:} =x") == 4) | ||
1199 | assert(re.find("hi alalo", "{:x:..:} =x", 4) == 4) | ||
1200 | assert(not re.find("hi alalo", "{:x:..:} =x", 5)) | ||
1201 | assert(re.find("hi alalo", "{'al'}", 5) == 6) | ||
1202 | assert(re.find("hi aloalolo", "{:x:..:} =x") == 8) | ||
1203 | assert(re.find("alo alohi x x", "{:word:%w+:}%W*(=word)!%w") == 11) | ||
1204 | |||
1205 | -- re.find discards any captures | ||
1206 | local a,b,c = re.find("alo", "{.}{'o'}") | ||
1207 | assert(a == 2 and b == 3 and c == nil) | ||
1208 | |||
-- Returns the substring of 's' located by re.find(s, p), or nothing when
-- re.find reports no match.
local function match (s, p)
  local first, last = re.find(s, p)
  if not first then return end
  return s:sub(first, last)
end
1213 | assert(match("alo alo", '[a-z]+') == "alo") | ||
1214 | assert(match("alo alo", '{:x: [a-z]+ :} =x') == nil) | ||
1215 | assert(match("alo alo", "{:x: [a-z]+ :} ' ' =x") == "alo alo") | ||
1216 | |||
1217 | assert(re.gsub("alo alo", "[abc]", "x") == "xlo xlo") | ||
1218 | assert(re.gsub("alo alo", "%w+", ".") == ". .") | ||
1219 | assert(re.gsub("hi, how are you", "[aeiou]", string.upper) == | ||
1220 | "hI, hOw ArE yOU") | ||
1221 | |||
1222 | s = 'hi [[a comment[=]=] ending here]] and [=[another]]=]]' | ||
1223 | c = re.compile" '[' {:i: '='* :} '[' (!(']' =i ']') .)* ']' { =i } ']' " | ||
1224 | assert(re.gsub(s, c, "%2") == 'hi and =]') | ||
1225 | assert(re.gsub(s, c, "%0") == s) | ||
1226 | assert(re.gsub('[=[hi]=]', c, "%2") == '=') | ||
1227 | |||
1228 | assert(re.find("", "!.") == 1) | ||
1229 | assert(re.find("alo", "!.") == 4) | ||
1230 | |||
-- Stores 'tag' in field 'tag' of table 't', then returns the position 'i'
-- and the same table.
function addtag (s, i, t, tag)
  t["tag"] = tag
  return i, t
end
1232 | |||
1233 | c = re.compile([[ | ||
1234 | doc <- block !. | ||
1235 | block <- (start {| (block / { [^<]+ })* |} end?) => addtag | ||
1236 | start <- '<' {:tag: [a-z]+ :} '>' | ||
1237 | end <- '</' { =tag } '>' | ||
1238 | ]], {addtag = addtag}) | ||
1239 | |||
1240 | x = c:match[[ | ||
1241 | <x>hi<b>hello</b>but<b>totheend</x>]] | ||
1242 | checkeq(x, {tag='x', 'hi', {tag = 'b', 'hello'}, 'but', | ||
1243 | {'totheend'}}) | ||
1244 | |||
1245 | |||
1246 | -- tests for look-ahead captures | ||
1247 | x = {re.match("alo", "&(&{.}) !{'b'} {&(...)} &{..} {...} {!.}")} | ||
1248 | checkeq(x, {"", "alo", ""}) | ||
1249 | |||
1250 | assert(re.match("aloalo", | ||
1251 | "{~ (((&'al' {.}) -> 'A%1' / (&%l {.}) -> '%1%1') / .)* ~}") | ||
1252 | == "AallooAalloo") | ||
1253 | |||
1254 | -- bug in 0.9 (and older versions), due to captures in look-aheads | ||
1255 | x = re.compile[[ {~ (&(. ([a-z]* -> '*')) ([a-z]+ -> '+') ' '*)* ~} ]] | ||
1256 | assert(x:match"alo alo" == "+ +") | ||
1257 | |||
1258 | -- valid capture in look-ahead (used inside the look-ahead itself) | ||
1259 | x = re.compile[[ | ||
1260 | S <- &({:two: .. :} . =two) {[a-z]+} / . S | ||
1261 | ]] | ||
1262 | assert(x:match("hello aloaLo aloalo xuxu") == "aloalo") | ||
1263 | |||
1264 | |||
1265 | p = re.compile[[ | ||
1266 | block <- {| {:ident:space*:} line | ||
1267 | ((=ident !space line) / &(=ident space) block)* |} | ||
1268 | line <- {[^%nl]*} %nl | ||
1269 | space <- '_' -- should be ' ', but '_' is simpler for editors | ||
1270 | ]] | ||
1271 | |||
1272 | t= p:match[[ | ||
1273 | 1 | ||
1274 | __1.1 | ||
1275 | __1.2 | ||
1276 | ____1.2.1 | ||
1277 | ____ | ||
1278 | 2 | ||
1279 | __2.1 | ||
1280 | ]] | ||
1281 | checkeq(t, {"1", {"1.1", "1.2", {"1.2.1", "", ident = "____"}, ident = "__"}, | ||
1282 | "2", {"2.1", ident = "__"}, ident = ""}) | ||
1283 | |||
1284 | |||
1285 | -- nested grammars | ||
1286 | p = re.compile[[ | ||
1287 | s <- a b !. | ||
1288 | b <- ( x <- ('b' x)? ) | ||
1289 | a <- ( x <- 'a' x? ) | ||
1290 | ]] | ||
1291 | |||
1292 | assert(p:match'aaabbb') | ||
1293 | assert(p:match'aaa') | ||
1294 | assert(not p:match'bbb') | ||
1295 | assert(not p:match'aaabbba') | ||
1296 | |||
1297 | -- testing groups | ||
1298 | t = {re.match("abc", "{:S <- {:.:} {S} / '':}")} | ||
1299 | checkeq(t, {"a", "bc", "b", "c", "c", ""}) | ||
1300 | |||
1301 | t = re.match("1234", "{| {:a:.:} {:b:.:} {:c:.{.}:} |}") | ||
1302 | checkeq(t, {a="1", b="2", c="4"}) | ||
1303 | t = re.match("1234", "{|{:a:.:} {:b:{.}{.}:} {:c:{.}:}|}") | ||
1304 | checkeq(t, {a="1", b="2", c="4"}) | ||
1305 | t = re.match("12345", "{| {:.:} {:b:{.}{.}:} {:{.}{.}:} |}") | ||
1306 | checkeq(t, {"1", b="2", "4", "5"}) | ||
1307 | t = re.match("12345", "{| {:.:} {:{:b:{.}{.}:}:} {:{.}{.}:} |}") | ||
1308 | checkeq(t, {"1", "23", "4", "5"}) | ||
1309 | t = re.match("12345", "{| {:.:} {{:b:{.}{.}:}} {:{.}{.}:} |}") | ||
1310 | checkeq(t, {"1", "23", "4", "5"}) | ||
1311 | |||
1312 | |||
1313 | -- testing pre-defined names | ||
1314 | assert(os.setlocale("C") == "C") | ||
1315 | |||
-- Asserts that the 're' class 'p1' and the Lua character class 'p2' agree:
-- cs2str of the compiled 'p1' must equal 'allchar' with every character
-- outside [p2] removed by string.gsub.
function eqlpeggsub (p1, p2)
  local fromre = cs2str(re.compile(p1))
  local outside = "[^" .. p2 .. "]"
  local fromgsub = string.gsub(allchar, outside, "")
  assert(fromre == fromgsub)
end
1322 | |||
1323 | |||
1324 | eqlpeggsub("%w", "%w") | ||
1325 | eqlpeggsub("%a", "%a") | ||
1326 | eqlpeggsub("%l", "%l") | ||
1327 | eqlpeggsub("%u", "%u") | ||
1328 | eqlpeggsub("%p", "%p") | ||
1329 | eqlpeggsub("%d", "%d") | ||
1330 | eqlpeggsub("%x", "%x") | ||
1331 | eqlpeggsub("%s", "%s") | ||
1332 | eqlpeggsub("%c", "%c") | ||
1333 | |||
1334 | eqlpeggsub("%W", "%W") | ||
1335 | eqlpeggsub("%A", "%A") | ||
1336 | eqlpeggsub("%L", "%L") | ||
1337 | eqlpeggsub("%U", "%U") | ||
1338 | eqlpeggsub("%P", "%P") | ||
1339 | eqlpeggsub("%D", "%D") | ||
1340 | eqlpeggsub("%X", "%X") | ||
1341 | eqlpeggsub("%S", "%S") | ||
1342 | eqlpeggsub("%C", "%C") | ||
1343 | |||
1344 | eqlpeggsub("[%w]", "%w") | ||
1345 | eqlpeggsub("[_%w]", "_%w") | ||
1346 | eqlpeggsub("[^%w]", "%W") | ||
1347 | eqlpeggsub("[%W%S]", "%W%S") | ||
1348 | |||
1349 | re.updatelocale() | ||
1350 | |||
1351 | -- testing nested substitutions x string captures | ||
1352 | |||
1353 | p = re.compile[[ | ||
1354 | text <- {~ item* ~} | ||
1355 | item <- macro / [^()] / '(' item* ')' | ||
1356 | arg <- ' '* {~ (!',' item)* ~} | ||
1357 | args <- '(' arg (',' arg)* ')' | ||
1358 | macro <- ('apply' args) -> '%1(%2)' | ||
1359 | / ('add' args) -> '%1 + %2' | ||
1360 | / ('mul' args) -> '%1 * %2' | ||
1361 | ]] | ||
1362 | |||
1363 | assert(p:match"add(mul(a,b), apply(f,x))" == "a * b + f(x)") | ||
1364 | |||
1365 | rev = re.compile[[ R <- (!.) -> '' / ({.} R) -> '%2%1']] | ||
1366 | |||
1367 | assert(rev:match"0123456789" == "9876543210") | ||
1368 | |||
1369 | |||
1370 | -- testing error messages in re | ||
1371 | |||
-- Asserts that compiling 'p' with re.compile fails and that the error
-- message contains the text 'err'.
--   p:   pattern source expected to be rejected
--   err: literal text that must appear in the error message
local function errmsg (p, err)
  local ok, msg = pcall(re.compile, p)
  assert(not ok, "pattern should not compile: " .. tostring(p))
  -- plain search (4th argument true): 'err' is literal text, not a Lua
  -- pattern; as a pattern, "near '-'" would treat '-' as a quantifier and
  -- accept any message containing just "near '"
  assert(string.find(msg, err, 1, true), msg)
end
1376 | |||
1377 | errmsg('aaaa', "rule 'aaaa'") | ||
1378 | errmsg('a', 'outside') | ||
1379 | errmsg('b <- a', 'undefined') | ||
1380 | errmsg("x <- 'a' x <- 'b'", 'already defined') | ||
1381 | errmsg("'a' -", "near '-'") | ||
1382 | |||
1383 | |||
1384 | print"OK" | ||
1385 | |||
1386 | |||
diff --git a/testlabel.lua b/testlabel.lua new file mode 100644 index 0000000..100c0d0 --- /dev/null +++ b/testlabel.lua | |||
@@ -0,0 +1,422 @@ | |||
1 | local m = require 'lpeglabel' | ||
2 | |||
3 | local p = m.T(1, 2, 5) | ||
4 | assert(p:match("abc") == nil) | ||
5 | |||
6 | -- throws a label that is not caught by ordinary choice | ||
7 | p = m.T(1) + m.P"a" | ||
8 | local r = p:match("abc") | ||
9 | assert(r == nil) | ||
10 | |||
11 | -- again throws a label that is not caught by ordinary choice | ||
12 | local g = m.P{ | ||
13 | "S", | ||
14 | S = m.V"A" + m.V"B", | ||
15 | A = m.T(1), | ||
16 | B = m.P"a" | ||
17 | } | ||
18 | r = g:match("abc") | ||
19 | assert(r == nil) | ||
20 | |||
21 | -- throws a label that is not caught by labeled choice | ||
22 | p = m.Lc(m.T(2), m.P"a", 1, 3) | ||
23 | r = p:match("abc") | ||
24 | assert(r == nil) | ||
25 | |||
26 | -- modifies previous pattern | ||
27 | -- adds another labeled choice to catch label "2" | ||
28 | p = m.Lc(p, m.P"a", 2) | ||
29 | r = p:match("abc") | ||
30 | assert(r == 2) | ||
31 | |||
32 | -- throws a label that is caught by labeled choice | ||
33 | p = m.Lc(m.T(25), m.P"a", 25) | ||
34 | r = p:match("abc") | ||
35 | assert(r == 2) | ||
36 | assert(p:match("bola") == nil) | ||
37 | |||
38 | -- labeled choice did not catch "fail" by default | ||
39 | p = m.Lc(m.P"b", m.P"a", 1) | ||
40 | r = p:match("abc") | ||
41 | assert(r == nil, r) | ||
42 | assert(p:match("bola") == 2) | ||
43 | |||
44 | -- "fail" is label "0" | ||
45 | -- labeled choice can catch "fail" | ||
46 | p = m.Lc(m.P"b", m.P"a", 0) | ||
47 | r = p:match("abc") | ||
48 | assert(r == 2, r) | ||
49 | assert(p:match("bola") == 2) | ||
50 | |||
51 | -- "fail" is label "0" | ||
52 | -- labeled choice can catch "fail" or "3" | ||
53 | p = m.Lc(m.P"a" * m.T(3), (m.P"a" + m.P"b"), 0, 3) | ||
54 | assert(p:match("abc") == 2) | ||
55 | assert(p:match("bac") == 2) | ||
56 | assert(p:match("cab") == nil) | ||
57 | |||
58 | --[[ | ||
59 | S -> A /{1} 'a' | ||
60 | A -> B | ||
61 | B -> %1 | ||
62 | ]] | ||
63 | g = m.P{ | ||
64 | "S", | ||
65 | S = m.Lc(m.V"A", m.P"a", 1), | ||
66 | A = m.V"B", | ||
67 | B = m.T(1), | ||
68 | } | ||
69 | assert(g:match("ab") == 2) | ||
70 | assert(g:match("bc") == nil) | ||
71 | |||
72 | |||
73 | --[[ | ||
74 | S -> A | ||
75 | A -> (B (';' / %{1}))* | ||
76 | B -> 'a' | ||
77 | ]] | ||
78 | g = m.P{ | ||
79 | "S", | ||
80 | S = m.V"A", | ||
81 | A = m.P(m.V"B" * (";" + m.T(1)))^0, | ||
82 | B = m.P'a', | ||
83 | } | ||
84 | assert(g:match("a;a;") == 5) | ||
85 | assert(g:match("a;a") == nil) | ||
86 | |||
87 | |||
88 | |||
89 | -- %1 /{1,3} %2 /{2} 'a' | ||
90 | p = m.Lc(m.Lc(m.T(1), m.T(2), 1, 3), m.P"a", 2) | ||
91 | r = p:match("abc") | ||
92 | assert(r == 2) | ||
93 | assert(p:match("") == nil) | ||
94 | |||
95 | p = m.Lc(m.T(1), m.Lc(m.T(2), m.P"a", 2), 1, 3) | ||
96 | r = p:match("abc") | ||
97 | assert(r == 2) | ||
98 | assert(p:match("") == nil) | ||
99 | |||
100 | print("+") | ||
101 | |||
102 | --[[ grammar based on Figure 8 of paper submitted to SCP | ||
103 | S -> S0 /{1} ID /{2} ID '=' Exp /{3} 'unsigned'* 'int' ID /{4} 'unsigned'* ID ID / %error | ||
104 | S0 -> ID S1 / 'unsigned' S2 / 'int' %3 | ||
105 | S1 -> '=' %2 / !. %1 / ID %4 | ||
106 | S2 -> 'unsigned' S2 / ID %4 / 'int' %3 | ||
107 | ]] | ||
108 | |||
109 | local sp = m.S" \t\n"^0 | ||
110 | local eq = sp * m.P"=" | ||
111 | |||
112 | g = m.P{ | ||
113 | "S", | ||
114 | S = m.Lc( | ||
115 | m.Lc( | ||
116 | m.Lc( | ||
117 | m.Lc(m.V"S0", m.V"ID" * (m.P(1) + ""), 1), | ||
118 | m.V"ID" * eq * m.V"Exp", 2 | ||
119 | ), | ||
120 | m.V"U"^0 * m.V"I" * m.V"ID", 3 | ||
121 | ), | ||
122 | m.V"U"^0 * m.V"ID" * m.V"ID", 4) | ||
123 | + m.T(5), | ||
124 | S0 = m.V"ID" * m.V"S1" + m.V"U" * m.V"S2" + m.V"I" * m.T(3), | ||
125 | S1 = eq * m.T(2) + sp * -m.P(1) * m.T(1) + m.V"ID" * m.T(4), | ||
126 | S2 = m.V"U" * m.V"S2" + m.V"ID" * m.T(4) + m.V"I" * m.T(3), | ||
127 | ID = sp * m.P"a", | ||
128 | U = sp * m.P"unsigned", | ||
129 | I = sp * m.P"int", | ||
130 | Exp = sp * m.P"E", | ||
131 | } | ||
132 | --g:pcode() | ||
133 | |||
134 | local s = "a" | ||
135 | assert(g:match(s) == #s + 1) --1 | ||
136 | s = "a = E" | ||
137 | assert(g:match(s) == #s + 1) --2 | ||
138 | s = "int a" | ||
139 | assert(g:match(s) == #s + 1) --3 | ||
140 | s = "unsigned int a" | ||
141 | assert(g:match(s) == #s + 1) --3 | ||
142 | s = "unsigned a a" | ||
143 | assert(g:match(s) == #s + 1) --4 | ||
144 | s = "b" | ||
145 | assert(g:match(s) == nil) | ||
146 | s = "unsigned" | ||
147 | assert(g:match(s) == nil) | ||
148 | s = "unsigned a" | ||
149 | assert(g:match(s) == nil) | ||
150 | s = "unsigned int" | ||
151 | assert(g:match(s) == nil) | ||
152 | |||
153 | |||
154 | print("+") | ||
155 | |||
156 | local re = require 're' | ||
157 | |||
158 | g = re.compile[['a' /{4,9} [a-z] | ||
159 | ]] | ||
160 | assert(g:match("a") == 2) | ||
161 | assert(g:match("b") == nil) | ||
162 | |||
163 | g = re.compile[['a' /{4,9} [a-f] /{5, 7} [a-z] | ||
164 | ]] | ||
165 | assert(g:match("a") == 2) | ||
166 | assert(g:match("b") == nil) | ||
167 | |||
168 | g = re.compile[[%{1} /{4,9} [a-z] | ||
169 | ]] | ||
170 | assert(g:match("a") == nil) | ||
171 | |||
172 | g = re.compile[[%{1} /{4,1} [a-f] | ||
173 | ]] | ||
174 | assert(g:match("a") == 2) | ||
175 | assert(g:match("h") == nil) | ||
176 | |||
177 | g = re.compile[[[a-f]%{15, 9} /{4,9} [a-c]%{7} /{5, 7} [a-z] ]] | ||
178 | assert(g:match("a") == 2) | ||
179 | assert(g:match("c") == 2) | ||
180 | assert(g:match("d") == nil) | ||
181 | assert(g:match("g") == nil) | ||
182 | |||
183 | --[[ grammar based on Figure 8 of paper submitted to SCP | ||
184 | S -> S0 /{1} ID /{2} ID '=' Exp /{3} 'unsigned'* 'int' ID /{4} 'unsigned'* ID ID / %error | ||
185 | S0 -> ID S1 / 'unsigned' S2 / 'int' %3 | ||
186 | S1 -> '=' %2 / !. %1 / ID %4 | ||
187 | S2 -> 'unsigned' S2 / ID %4 / 'int' %3 | ||
188 | ]] | ||
189 | |||
190 | |||
191 | g = re.compile([[ | ||
192 | S <- S0 /{1} ID /{2} ID %s* '=' Exp /{3} U* Int ID /{4} U ID ID /{0} %{5} | ||
193 | S0 <- ID S1 / U S2 / Int %{3} | ||
194 | S1 <- %s* '=' %{2} / !. %{1} / ID %{4} | ||
195 | S2 <- U S2 / ID %{4} / Int %{3} | ||
196 | ID <- %s* 'a' | ||
197 | U <- %s* 'unsigned' | ||
198 | Int <- %s* 'int' | ||
199 | Exp <- %s* 'E' | ||
200 | ]]) | ||
201 | |||
202 | local s = "a" | ||
203 | assert(g:match(s) == #s + 1) --1 | ||
204 | s = "a = E" | ||
205 | assert(g:match(s) == #s + 1) --2 | ||
206 | s = "int a" | ||
207 | assert(g:match(s) == #s + 1) --3 | ||
208 | s = "unsigned int a" | ||
209 | assert(g:match(s) == #s + 1) --3 | ||
210 | s = "unsigned a a" | ||
211 | assert(g:match(s) == #s + 1) --4 | ||
212 | s = "b" | ||
213 | assert(g:match(s) == nil) | ||
214 | s = "unsigned" | ||
215 | assert(g:match(s) == nil) | ||
216 | s = "unsigned a" | ||
217 | assert(g:match(s) == nil) | ||
218 | s = "unsigned int" | ||
219 | assert(g:match(s) == nil) | ||
220 | |||
221 | local terror = { ['cmdSeq'] = "Missing ';' in CmdSeq", -- label 1 | ||
222 | ['ifExp'] = "Error in expression of 'if'", -- label 2 | ||
223 | ['ifThen'] = "Error matching 'then' keyword", -- label 3 | ||
224 | ['ifThenCmdSeq'] = "Error matching CmdSeq of 'then' branch", -- label 4 | ||
225 | ['ifElseCmdSeq'] = "Error matching CmdSeq of 'else' branch", -- label 5 | ||
226 | ['ifEnd'] = "Error matching 'end' keyword of 'if'", -- label 6 | ||
227 | ['repeatCmdSeq'] = "Error matching CmdSeq of 'repeat'", -- label 7 | ||
228 | ['repeatUntil'] = "Error matching 'until' keyword", -- label 8 | ||
229 | ['repeatExp'] = "Error matching expression of 'until'", -- label 9 | ||
230 | ['assignOp'] = "Error matching ':='", -- label 10 | ||
231 | ['assignExp'] = "Error matching expression of assignment", -- label 11 | ||
232 | ['readName'] = "Error matching 'NAME' after 'read'", -- label 12 | ||
233 | ['writeExp'] = "Error matching expression after 'write'", -- label 13 | ||
234 | ['simpleExp'] = "Error matching 'SimpleExp'", -- label 14 | ||
235 | ['term'] = "Error matching 'Term'", -- label 15 | ||
236 | ['factor'] = "Error matching 'Factor'", -- label 16 | ||
237 | ['openParExp'] = "Error matching expression after '('", -- label 17 | ||
238 | ['closePar'] = "Error matching ')'", -- label 18 | ||
239 | ['undefined'] = "Error undefined"} -- label 0; this table is passed to re.compile as the definitions of the Tiny grammar, whose /{n} alternatives capture these messages | ||
240 | |||
241 | g = re.compile([[ | ||
242 | Tiny <- CmdSeq /{1} '' -> cmdSeq /{2} '' -> ifExp /{3} '' -> ifThen /{4} '' -> ifThenCmdSeq | ||
243 | /{5} '' -> ifElseCmdSeq /{6} '' -> ifEnd /{7} '' -> repeatCmdSeq | ||
244 | /{8} '' -> repeatUntil /{9} '' -> repeatExp /{10} '' -> assignOp | ||
245 | /{11} '' -> assignExp /{12} '' -> readName /{13} '' -> writeExp | ||
246 | /{14} '' -> simpleExp /{15} '' -> term /{16} '' -> factor | ||
247 | /{17} '' -> openParExp /{18} '' -> closePar /{0} '' -> undefined | ||
248 | CmdSeq <- (Cmd (SEMICOLON / %{1})) (Cmd (SEMICOLON / %{1}))* | ||
249 | Cmd <- IfCmd / RepeatCmd / ReadCmd / WriteCmd / AssignCmd | ||
250 | IfCmd <- IF (Exp / %{2}) (THEN / %{3}) (CmdSeq / %{4}) (ELSE (CmdSeq / %{5}) / '') (END / %{6}) | ||
251 | RepeatCmd <- REPEAT (CmdSeq / %{7}) (UNTIL / %{8}) (Exp / %{9}) | ||
252 | AssignCmd <- !RESERVED NAME (ASSIGNMENT / %{10}) (Exp / %{11}) | ||
253 | ReadCmd <- READ (NAME / %{12}) | ||
254 | WriteCmd <- WRITE (Exp / %{13}) | ||
255 | Exp <- SimpleExp ((LESS / EQUAL) (SimpleExp / %{14}) / '') | ||
256 | SimpleExp <- Term ((ADD / SUB) (Term / %{15}))* | ||
257 | Term <- Factor ((MUL / DIV) (Factor / %{16}))* | ||
258 | Factor <- OPENPAR (Exp / %{17}) (CLOSEPAR / %{18}) / NUMBER / NAME | ||
259 | ADD <- Sp '+' | ||
260 | ASSIGNMENT <- Sp ':=' | ||
261 | CLOSEPAR <- Sp ')' | ||
262 | DIV <- Sp '/' | ||
263 | IF <- Sp 'if' | ||
264 | ELSE <- Sp 'else' | ||
265 | END <- Sp 'end' | ||
266 | EQUAL <- Sp '=' | ||
267 | LESS <- Sp '<' | ||
268 | MUL <- Sp '*' | ||
269 | NAME <- Sp [a-z]+ | ||
270 | NUMBER <- Sp [0-9]+ | ||
271 | OPENPAR <- Sp '(' | ||
272 | READ <- Sp 'read' | ||
273 | REPEAT <- Sp 'repeat' | ||
274 | SEMICOLON <- Sp ';' | ||
275 | SUB <- Sp '-' | ||
276 | THEN <- Sp 'then' | ||
277 | UNTIL <- Sp 'until' | ||
278 | WRITE <- Sp 'write' | ||
279 | RESERVED <- IF / ELSE / END / READ / REPEAT / THEN / UNTIL / WRITE | ||
280 | Sp <- (%s / %nl)* | ||
281 | ]], terror) | ||
282 | |||
283 | s = [[ | ||
284 | n := 5;]] | ||
285 | assert(g:match(s) == #s + 1) | ||
286 | |||
287 | s = [[ | ||
288 | n := 5; | ||
289 | f := 1; | ||
290 | repeat | ||
291 | f := f * n; | ||
292 | n := n - 1; | ||
293 | until (n < 1); | ||
294 | write f;]] | ||
295 | assert(g:match(s) == #s + 1) | ||
296 | |||
297 | -- Label 1: the ';' is missing after 'read a' | ||
298 | s = [[ | ||
299 | read a]] | ||
300 | assert(g:match(s) == terror['cmdSeq']) | ||
301 | |||
302 | |||
303 | -- Label 1: the ';' is missing after 'n := n - 1' | ||
304 | s = [[ | ||
305 | n := 5; | ||
306 | f := 1; | ||
307 | repeat | ||
308 | f := f * n; | ||
309 | n := n - 1 | ||
310 | until (n < 1); | ||
311 | write f;]] | ||
312 | assert(g:match(s) == terror['cmdSeq']) | ||
313 | |||
314 | |||
315 | -- Valid 'if' command without an 'else' branch | ||
316 | s = [[ | ||
317 | if a then a := a + 1; end;]] | ||
318 | assert(g:match(s) == #s + 1) | ||
319 | |||
320 | -- Valid 'if' command with an 'else' branch | ||
321 | s = [[ | ||
322 | if a then a := a + 1; else write 2; end;]] | ||
323 | assert(g:match(s) == #s + 1) | ||
324 | |||
325 | -- Error in expression of 'if'. 'A' is not a valid name (NAME is Sp [a-z]+) | ||
326 | s = [[ | ||
327 | if A then a := a + 1; else write 2; end;]] | ||
328 | assert(g:match(s) == terror['ifExp']) | ||
329 | |||
330 | -- Error matching the 'then' keyword | ||
331 | s = [[ | ||
332 | if a a := a + 1; else write 2; end;]] | ||
333 | assert(g:match(s) == terror['ifThen']) | ||
334 | |||
335 | -- Error matching the CmdSeq inside of the 'then' branch: '3' cannot start a Cmd | ||
336 | s = [[ | ||
337 | if a then 3 := 2; else write 2; end;]] | ||
338 | assert(g:match(s) == terror['ifThenCmdSeq']) | ||
339 | |||
340 | -- Error matching the CmdSeq inside of the 'else' branch: 'A' is not a valid name | ||
341 | s = [[ | ||
342 | if a then b := 2; else A := 2; end;]] | ||
343 | assert(g:match(s) == terror['ifElseCmdSeq']) | ||
344 | |||
345 | -- Error matching 'end' of 'if': '77' appears where 'end' is expected | ||
346 | s = [[ | ||
347 | if a then b := 2; else a := 2; 77;]] | ||
348 | assert(g:match(s) == terror['ifEnd']) | ||
349 | |||
350 | -- Error matching the CmdSeq of 'repeat': 'F' is not a valid name | ||
351 | s = [[repeat | ||
352 | F := f * n; | ||
353 | n := n - 1; | ||
354 | until (n < 1);]] | ||
355 | assert(g:match(s) == terror['repeatCmdSeq']) | ||
356 | |||
357 | -- Error matching 'until': '88' appears where 'until' is expected | ||
358 | s = [[repeat | ||
359 | f := f * n; | ||
360 | n := n - 1; | ||
361 | 88 (n < 1);]] | ||
362 | assert(g:match(s) == terror['repeatUntil']) | ||
363 | |||
364 | -- Error matching expression of 'until' | ||
365 | s = [[repeat | ||
366 | f := f * n; | ||
367 | n := n - 1; | ||
368 | until ; (n < 1);]] | ||
369 | assert(g:match(s) == terror['repeatExp']) | ||
370 | |||
371 | -- Error matching ':=': '=' is used instead | ||
372 | s = [[ | ||
373 | f = f * n;]] | ||
374 | assert(g:match(s) == terror['assignOp']) | ||
375 | |||
376 | -- Error matching expression of assignment | ||
377 | s = [[ | ||
378 | f := A * n;]] | ||
379 | assert(g:match(s) == terror['assignExp']) | ||
380 | |||
381 | -- Error matching NAME after 'read': '2' is not a name | ||
382 | s = [[ | ||
383 | read 2;]] | ||
384 | assert(g:match(s) == terror['readName']) | ||
385 | |||
386 | -- Error matching expression after 'write' | ||
387 | s = [[ | ||
388 | write [a] := 2;]] | ||
389 | assert(g:match(s) == terror['writeExp']) | ||
390 | |||
391 | -- Error matching 'SimpleExp' | ||
392 | s = [[ | ||
393 | a := a < A;]] | ||
394 | assert(g:match(s) == terror['simpleExp']) | ||
395 | |||
396 | -- Error matching 'Term' | ||
397 | s = [[ | ||
398 | a := a + A;]] | ||
399 | assert(g:match(s) == terror['term']) | ||
400 | |||
401 | -- Error matching 'Factor' | ||
402 | s = [[ | ||
403 | a := a * A;]] | ||
404 | assert(g:match(s) == terror['factor']) | ||
405 | |||
406 | -- Error matching expression after '(' | ||
407 | s = [[ | ||
408 | a := (A);]] | ||
409 | assert(g:match(s) == terror['openParExp']) | ||
410 | |||
411 | -- Error matching ')' | ||
412 | s = [[ | ||
413 | a := (a];]] | ||
414 | assert(g:match(s) == terror['closePar']) | ||
415 | |||
416 | -- Undefined error: 'A' cannot start any Cmd, so no label is thrown and the /{0} alternative is used | ||
417 | s = [[ | ||
418 | A := a;]] | ||
419 | assert(g:match(s) == terror['undefined']) | ||
420 | |||
421 | |||
422 | print("OK") | ||