aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRoberto Ierusalimschy <roberto@inf.puc-rio.br>2019-02-20 10:13:46 -0300
committerRoberto Ierusalimschy <roberto@inf.puc-rio.br>2019-02-20 10:13:46 -0300
commite08e5df853560de6482d84066a7accc6a18de545 (patch)
treeee19686bb35da90709a32ed24bf7855de1a3946a
downloadlpeg-e08e5df853560de6482d84066a7accc6a18de545.tar.gz
lpeg-e08e5df853560de6482d84066a7accc6a18de545.tar.bz2
lpeg-e08e5df853560de6482d84066a7accc6a18de545.zip
First version of LPeg on GIT
LPeg repository is being moved to git. Past versions won't be moved; they are still available in RCS.
-rw-r--r--doc.css223
-rw-r--r--lpcap.c537
-rw-r--r--lpcap.h56
-rw-r--r--lpcode.c1014
-rw-r--r--lpcode.h40
-rw-r--r--lpeg.html1445
-rw-r--r--lpprint.c244
-rw-r--r--lpprint.h36
-rw-r--r--lptree.c1305
-rw-r--r--lptree.h82
-rw-r--r--lptypes.h145
-rw-r--r--lpvm.c364
-rw-r--r--lpvm.h58
-rw-r--r--makefile55
-rwxr-xr-xpack15
-rw-r--r--re.html500
-rw-r--r--re.lua267
-rwxr-xr-xtest.lua1513
18 files changed, 7899 insertions, 0 deletions
diff --git a/doc.css b/doc.css
new file mode 100644
index 0000000..3770e4e
--- /dev/null
+++ b/doc.css
@@ -0,0 +1,223 @@
/* Page defaults: sans-serif text on a white background, no outer margin.
   (Side margins are not set here: the 'margin' shorthand resets all four
   sides to zero.) */
body {
  font-family: arial, helvetica, geneva, sans-serif;
  background-color: #ffffff;
  margin: 0px;
}
7
8code {
9 font-family: "Andale Mono", monospace;
10}
11
12tt {
13 font-family: "Andale Mono", monospace;
14}
15
16body, td, th { font-size: 11pt; }
17
18h1, h2, h3, h4 { margin-left: 0em; }
19
20textarea, pre, tt { font-size:10pt; }
21body, td, th { color:#000000; }
22small { font-size:0.85em; }
23h1 { font-size:1.5em; }
24h2 { font-size:1.25em; }
25h3 { font-size:1.15em; }
26h4 { font-size:1.06em; }
27
28a:link { font-weight:bold; color: #004080; text-decoration: none; }
29a:visited { font-weight:bold; color: #006699; text-decoration: none; }
30a:link:hover { text-decoration:underline; }
31hr { color:#cccccc }
32img { border-width: 0px; }
33
34
35h3 { padding-top: 1em; }
36
37p { margin-left: 1em; }
38
39p.name {
40 font-family: "Andale Mono", monospace;
41 padding-top: 1em;
42 margin-left: 0em;
43}
44
45blockquote { margin-left: 3em; }
46
/* Boxed, monospaced block used for code examples: light gray background
   with a thin silver frame on all four sides. */
.example {
  background-color: rgb(245, 245, 245);
  border: 1px solid silver;
  padding: 1em;
  margin-left: 1em;
  margin-right: 1em;
  font-family: "Andale Mono", monospace;
  font-size: smaller;
}
67
68
69hr {
70 margin-left: 0em;
71 background: #00007f;
72 border: 0px;
73 height: 1px;
74}
75
76ul { list-style-type: disc; }
77
78table.index { border: 1px #00007f; }
79table.index td { text-align: left; vertical-align: top; }
80table.index ul { padding-top: 0em; margin-top: 0em; }
81
82table {
83 border: 1px solid black;
84 border-collapse: collapse;
85 margin-left: auto;
86 margin-right: auto;
87}
88th {
89 border: 1px solid black;
90 padding: 0.5em;
91}
92td {
93 border: 1px solid black;
94 padding: 0.5em;
95}
96div.header, div.footer { margin-left: 0em; }
97
98#container
99{
100 margin-left: 1em;
101 margin-right: 1em;
102 background-color: #f0f0f0;
103}
104
105#product
106{
107 text-align: center;
108 border-bottom: 1px solid #cccccc;
109 background-color: #ffffff;
110}
111
112#product big {
113 font-size: 2em;
114}
115
116#product_logo
117{
118}
119
120#product_name
121{
122}
123
124#product_description
125{
126}
127
128#main
129{
130 background-color: #f0f0f0;
131 border-left: 2px solid #cccccc;
132}
133
134#navigation
135{
136 float: left;
137 width: 12em;
138 margin: 0;
139 vertical-align: top;
140 background-color: #f0f0f0;
141 overflow:visible;
142}
143
144#navigation h1 {
145 background-color:#e7e7e7;
146 font-size:1.1em;
147 color:#000000;
148 text-align:left;
149 margin:0px;
150 padding:0.2em;
151 border-top:1px solid #dddddd;
152 border-bottom:1px solid #dddddd;
153}
154
155#navigation ul
156{
157 font-size:1em;
158 list-style-type: none;
159 padding: 0;
160 margin: 1px;
161}
162
163#navigation li
164{
165 text-indent: -1em;
166 margin: 0em 0em 0em 0.5em;
167 display: block;
168 padding: 3px 0px 0px 12px;
169}
170/* NOTE(review): padding values may not be negative in CSS, so the '-1em' below makes the whole declaration invalid -- confirm the intended left offset before changing it */
171#navigation li li a
172{
173 padding: 0px 3px 0px -1em;
174}
175
176#content
177{
178 margin-left: 12em;
179 padding: 1em;
180 border-left: 2px solid #cccccc;
181 border-right: 2px solid #cccccc;
182 background-color: #ffffff;
183}
184
185#about
186{
187 clear: both;
188 margin: 0;
189 padding: 5px;
190 border-top: 2px solid #cccccc;
191 background-color: #ffffff;
192}
193
194@media print {
195 body {
196 font: 10pt "Times New Roman", "TimeNR", Times, serif;
197 }
198 a { font-weight:bold; color: #004080; text-decoration: underline; }
199
200 #main { background-color: #ffffff; border-left: 0px; }
201 #container { margin-left: 2%; margin-right: 2%; background-color: #ffffff; }
202
203 #content { margin-left: 0px; padding: 1em; border-left: 0px; border-right: 0px; background-color: #ffffff; }
204
205 #navigation { display: none;
206 }
207
208 #product_logo
209 {
210 display: none;
211 }
212
213 #about img
214 {
215 display: none;
216 }
217
218 .example {
219 font-family: "Andale Mono", monospace;
220 font-size: 8pt;
221 page-break-inside: avoid;
222 }
223}
diff --git a/lpcap.c b/lpcap.c
new file mode 100644
index 0000000..c9085de
--- /dev/null
+++ b/lpcap.c
@@ -0,0 +1,537 @@
1/*
2** $Id: lpcap.c,v 1.6 2015/06/15 16:09:57 roberto Exp $
3** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license)
4*/
5
6#include "lua.h"
7#include "lauxlib.h"
8
9#include "lpcap.h"
10#include "lptypes.h"
11
12/* kind of a capture entry (its 'kind' field) */
13#define captype(cap) ((cap)->kind)
14/* true when the entry is a close entry */
15#define isclosecap(cap) (captype(cap) == Cclose)
16/* address 's + siz - 1' of an entry; for a full capture, where its match ends */
17#define closeaddr(c) ((c)->s + (c)->siz - 1)
18/* true when the entry is a full capture, i.e. its 'siz' is not zero */
19#define isfullcap(cap) ((cap)->siz != 0)
20/* push entry 'v' of the running pattern's ktable (raw access) */
21#define getfromktable(cs,v) lua_rawgeti((cs)->L, ktableidx((cs)->ptop), v)
22/* push the ktable entry selected by the current capture's 'idx' */
23#define pushluaval(cs) getfromktable(cs, (cs)->cap->idx)
24
25
26
27/*
28** Put at the cache for Lua values the value indexed by 'v' in ktable
29** of the running pattern (if it is not there yet); returns its index.
30*/
31static int updatecache (CapState *cs, int v) {
32 int idx = cs->ptop + 1; /* stack index of cache for Lua values */
33 if (v != cs->valuecached) { /* not there? */
34 getfromktable(cs, v); /* get value from 'ktable' */
35 lua_replace(cs->L, idx); /* put it at reserved stack position */
36 cs->valuecached = v; /* keep track of what is there */
37 }
38 return idx;
39}
40
41
42static int pushcapture (CapState *cs);
43
44
45/*
46** Goes back in a list of captures looking for an open capture
47** corresponding to a close
48*/
49static Capture *findopen (Capture *cap) {
50 int n = 0; /* number of closes waiting an open */
51 for (;;) {
52 cap--;
53 if (isclosecap(cap)) n++; /* one more open to skip */
54 else if (!isfullcap(cap))
55 if (n-- == 0) return cap;
56 }
57}
58
59
60/*
61** Go to the next capture
62*/
63static void nextcap (CapState *cs) {
64 Capture *cap = cs->cap;
65 if (!isfullcap(cap)) { /* not a single capture? */
66 int n = 0; /* number of opens waiting a close */
67 for (;;) { /* look for corresponding close */
68 cap++;
69 if (isclosecap(cap)) {
70 if (n-- == 0) break;
71 }
72 else if (!isfullcap(cap)) n++;
73 }
74 }
75 cs->cap = cap + 1; /* + 1 to skip last close (or entire single capture) */
76}
77
78
79/*
80** Push on the Lua stack all values generated by nested captures inside
81** the current capture. Returns number of values pushed. 'addextra'
82** makes it push the entire match after all captured values. The
83** entire match is pushed also if there are no other nested values,
84** so the function never returns zero.
85*/
86static int pushnestedvalues (CapState *cs, int addextra) {
87 Capture *co = cs->cap;
88 if (isfullcap(cs->cap++)) { /* no nested captures? */
89 lua_pushlstring(cs->L, co->s, co->siz - 1); /* push whole match */
90 return 1; /* that is it */
91 }
92 else {
93 int n = 0;
94 while (!isclosecap(cs->cap)) /* repeat for all nested patterns */
95 n += pushcapture(cs);
96 if (addextra || n == 0) { /* need extra? */
97 lua_pushlstring(cs->L, co->s, cs->cap->s - co->s); /* push whole match */
98 n++;
99 }
100 cs->cap++; /* skip close entry */
101 return n;
102 }
103}
104
105
106/*
107** Push only the first value generated by nested captures
108*/
109static void pushonenestedvalue (CapState *cs) {
110 int n = pushnestedvalues(cs, 0);
111 if (n > 1)
112 lua_pop(cs->L, n - 1); /* pop extra values */
113}
114
115
116/*
117** Try to find a named group capture with the name given at the top of
118** the stack; goes backward from 'cap'.
119*/
120static Capture *findback (CapState *cs, Capture *cap) {
121 lua_State *L = cs->L;
122 while (cap-- > cs->ocap) { /* repeat until end of list */
123 if (isclosecap(cap))
124 cap = findopen(cap); /* skip nested captures */
125 else if (!isfullcap(cap))
126 continue; /* opening an enclosing capture: skip and get previous */
127 if (captype(cap) == Cgroup) {
128 getfromktable(cs, cap->idx); /* get group name */
129 if (lp_equal(L, -2, -1)) { /* right group? */
130 lua_pop(L, 2); /* remove reference name and group name */
131 return cap;
132 }
133 else lua_pop(L, 1); /* remove group name */
134 }
135 }
136 luaL_error(L, "back reference '%s' not found", lua_tostring(L, -1));
137 return NULL; /* to avoid warnings */
138}
139
140
141/*
142** Back-reference capture. Return number of values pushed.
143*/
144static int backrefcap (CapState *cs) {
145 int n;
146 Capture *curr = cs->cap;
147 pushluaval(cs); /* reference name */
148 cs->cap = findback(cs, curr); /* find corresponding group */
149 n = pushnestedvalues(cs, 0); /* push group's values */
150 cs->cap = curr + 1;
151 return n;
152}
153
154
155/*
156** Table capture: creates a new table and populates it with nested
157** captures.
158*/
159static int tablecap (CapState *cs) {
160 lua_State *L = cs->L;
161 int n = 0;
162 lua_newtable(L);
163 if (isfullcap(cs->cap++))
164 return 1; /* table is empty */
165 while (!isclosecap(cs->cap)) {
166 if (captype(cs->cap) == Cgroup && cs->cap->idx != 0) { /* named group? */
167 pushluaval(cs); /* push group name */
168 pushonenestedvalue(cs);
169 lua_settable(L, -3);
170 }
171 else { /* not a named group */
172 int i;
173 int k = pushcapture(cs);
174 for (i = k; i > 0; i--) /* store all values into table */
175 lua_rawseti(L, -(i + 1), n + i);
176 n += k;
177 }
178 }
179 cs->cap++; /* skip close entry */
180 return 1; /* number of values pushed (only the table) */
181}
182
183
184/*
185** Table-query capture
186*/
187static int querycap (CapState *cs) {
188 int idx = cs->cap->idx;
189 pushonenestedvalue(cs); /* get nested capture */
190 lua_gettable(cs->L, updatecache(cs, idx)); /* query cap. value at table */
191 if (!lua_isnil(cs->L, -1))
192 return 1;
193 else { /* no value */
194 lua_pop(cs->L, 1); /* remove nil */
195 return 0;
196 }
197}
198
199
200/*
201** Fold capture
202*/
203static int foldcap (CapState *cs) {
204 int n;
205 lua_State *L = cs->L;
206 int idx = cs->cap->idx;
207 if (isfullcap(cs->cap++) || /* no nested captures? */
208 isclosecap(cs->cap) || /* no nested captures (large subject)? */
209 (n = pushcapture(cs)) == 0) /* nested captures with no values? */
210 return luaL_error(L, "no initial value for fold capture");
211 if (n > 1)
212 lua_pop(L, n - 1); /* leave only one result for accumulator */
213 while (!isclosecap(cs->cap)) {
214 lua_pushvalue(L, updatecache(cs, idx)); /* get folding function */
215 lua_insert(L, -2); /* put it before accumulator */
216 n = pushcapture(cs); /* get next capture's values */
217 lua_call(L, n + 1, 1); /* call folding function */
218 }
219 cs->cap++; /* skip close entry */
220 return 1; /* only accumulator left on the stack */
221}
222
223
224/*
225** Function capture
226*/
227static int functioncap (CapState *cs) {
228 int n;
229 int top = lua_gettop(cs->L);
230 pushluaval(cs); /* push function */
231 n = pushnestedvalues(cs, 0); /* push nested captures */
232 lua_call(cs->L, n, LUA_MULTRET); /* call function */
233 return lua_gettop(cs->L) - top; /* return function's results */
234}
235
236
237/*
238** Select capture
239*/
240static int numcap (CapState *cs) {
241 int idx = cs->cap->idx; /* value to select */
242 if (idx == 0) { /* no values? */
243 nextcap(cs); /* skip entire capture */
244 return 0; /* no value produced */
245 }
246 else {
247 int n = pushnestedvalues(cs, 0);
248 if (n < idx) /* invalid index? */
249 return luaL_error(cs->L, "no capture '%d'", idx);
250 else {
251 lua_pushvalue(cs->L, -(n - idx + 1)); /* get selected capture */
252 lua_replace(cs->L, -(n + 1)); /* put it in place of 1st capture */
253 lua_pop(cs->L, n - 1); /* remove other captures */
254 return 1;
255 }
256 }
257}
258
259
260/*
261** Return the stack index of the first runtime capture in the given
262** list of captures (or zero if no runtime captures)
263*/
264int finddyncap (Capture *cap, Capture *last) {
265 for (; cap < last; cap++) {
266 if (cap->kind == Cruntime)
267 return cap->idx; /* stack position of first capture */
268 }
269 return 0; /* no dynamic captures in this segment */
270}
271
272
273/*
274** Calls a runtime capture. Returns number of captures removed by
275** the call, including the initial Cgroup. (Captures to be added are
276** on the Lua stack.)
277*/
278int runtimecap (CapState *cs, Capture *close, const char *s, int *rem) {
279 int n, id;
280 lua_State *L = cs->L;
281 int otop = lua_gettop(L);
282 Capture *open = findopen(close);
283 assert(captype(open) == Cgroup);
284 id = finddyncap(open, close); /* get first dynamic capture argument */
285 close->kind = Cclose; /* closes the group */
286 close->s = s;
287 cs->cap = open; cs->valuecached = 0; /* prepare capture state */
288 luaL_checkstack(L, 4, "too many runtime captures");
289 pushluaval(cs); /* push function to be called */
290 lua_pushvalue(L, SUBJIDX); /* push original subject */
291 lua_pushinteger(L, s - cs->s + 1); /* push current position */
292 n = pushnestedvalues(cs, 0); /* push nested captures */
293 lua_call(L, n + 2, LUA_MULTRET); /* call dynamic function */
294 if (id > 0) { /* are there old dynamic captures to be removed? */
295 int i;
296 for (i = id; i <= otop; i++)
297 lua_remove(L, id); /* remove old dynamic captures */
298 *rem = otop - id + 1; /* total number of dynamic captures removed */
299 }
300 else
301 *rem = 0; /* no dynamic captures removed */
302 return close - open; /* number of captures of all kinds removed */
303}
304
305
306/*
307** Auxiliary structure for substitution and string captures: it keeps
308** information about nested captures for later use, so that string
309** results do not have to be pushed into Lua
310*/
311typedef struct StrAux {
312 int isstring; /* whether capture is a string */
313 union {
314 Capture *cp; /* if not a string, respective capture */
315 struct { /* if it is a string... */
316 const char *s; /* ... starts here */
317 const char *e; /* ... ends here */
318 } s;
319 } u;
320} StrAux;
321/* size of the StrAux array used by a string capture */
322#define MAXSTRCAPS 10
323
324/*
325** Collect values from current capture into array 'cps'. Current
326** capture must be Cstring (first call) or Csimple (recursive calls).
327** (In first call, fills %0 with whole match for Cstring.)
328** Returns number of elements in the array that were filled.
329*/
330static int getstrcaps (CapState *cs, StrAux *cps, int n) {
331 int k = n++;
332 cps[k].isstring = 1; /* get string value */
333 cps[k].u.s.s = cs->cap->s; /* starts here */
334 if (!isfullcap(cs->cap++)) { /* nested captures? */
335 while (!isclosecap(cs->cap)) { /* traverse them */
336 if (n >= MAXSTRCAPS) /* too many captures? */
337 nextcap(cs); /* skip extra captures (will not need them) */
338 else if (captype(cs->cap) == Csimple) /* string? */
339 n = getstrcaps(cs, cps, n); /* put info. into array */
340 else {
341 cps[n].isstring = 0; /* not a string */
342 cps[n].u.cp = cs->cap; /* keep original capture */
343 nextcap(cs);
344 n++;
345 }
346 }
347 cs->cap++; /* skip close */
348 }
349 cps[k].u.s.e = closeaddr(cs->cap - 1); /* ends here */
350 return n;
351}
352
353
354/*
355** add next capture value (which should be a string) to buffer 'b'
356*/
357static int addonestring (luaL_Buffer *b, CapState *cs, const char *what);
358
359
360/*
361** String capture: add result to buffer 'b' (instead of pushing
362** it into the stack)
363*/
364static void stringcap (luaL_Buffer *b, CapState *cs) {
365 StrAux cps[MAXSTRCAPS];
366 int n;
367 size_t len, i;
368 const char *fmt; /* format string */
369 fmt = lua_tolstring(cs->L, updatecache(cs, cs->cap->idx), &len);
370 n = getstrcaps(cs, cps, 0) - 1; /* collect nested captures */
371 for (i = 0; i < len; i++) { /* traverse them */
372 if (fmt[i] != '%') /* not an escape? */
373 luaL_addchar(b, fmt[i]); /* add it to buffer */
374 else if (fmt[++i] < '0' || fmt[i] > '9') /* not followed by a digit? */
375 luaL_addchar(b, fmt[i]); /* add to buffer */
376 else {
377 int l = fmt[i] - '0'; /* capture index */
378 if (l > n)
379 luaL_error(cs->L, "invalid capture index (%d)", l);
380 else if (cps[l].isstring)
381 luaL_addlstring(b, cps[l].u.s.s, cps[l].u.s.e - cps[l].u.s.s);
382 else {
383 Capture *curr = cs->cap;
384 cs->cap = cps[l].u.cp; /* go back to evaluate that nested capture */
385 if (!addonestring(b, cs, "capture"))
386 luaL_error(cs->L, "no values in capture index %d", l);
387 cs->cap = curr; /* continue from where it stopped */
388 }
389 }
390 }
391}
392
393
394/*
395** Substitution capture: add result to buffer 'b'
396*/
397static void substcap (luaL_Buffer *b, CapState *cs) {
398 const char *curr = cs->cap->s;
399 if (isfullcap(cs->cap)) /* no nested captures? */
400 luaL_addlstring(b, curr, cs->cap->siz - 1); /* keep original text */
401 else {
402 cs->cap++; /* skip open entry */
403 while (!isclosecap(cs->cap)) { /* traverse nested captures */
404 const char *next = cs->cap->s;
405 luaL_addlstring(b, curr, next - curr); /* add text up to capture */
406 if (addonestring(b, cs, "replacement"))
407 curr = closeaddr(cs->cap - 1); /* continue after match */
408 else /* no capture value */
409 curr = next; /* keep original text in final result */
410 }
411 luaL_addlstring(b, curr, cs->cap->s - curr); /* add last piece of text */
412 }
413 cs->cap++; /* go to next capture */
414}
415
416
417/*
418** Evaluates a capture and adds its first value to buffer 'b'; returns
419** whether there was a value
420*/
421static int addonestring (luaL_Buffer *b, CapState *cs, const char *what) {
422 switch (captype(cs->cap)) {
423 case Cstring:
424 stringcap(b, cs); /* add capture directly to buffer */
425 return 1;
426 case Csubst:
427 substcap(b, cs); /* add capture directly to buffer */
428 return 1;
429 default: {
430 lua_State *L = cs->L;
431 int n = pushcapture(cs);
432 if (n > 0) {
433 if (n > 1) lua_pop(L, n - 1); /* only one result */
434 if (!lua_isstring(L, -1))
435 luaL_error(L, "invalid %s value (a %s)", what, luaL_typename(L, -1));
436 luaL_addvalue(b);
437 }
438 return n;
439 }
440 }
441}
442
443
444/*
445** Push all values of the current capture into the stack; returns
446** number of values pushed
447*/
448static int pushcapture (CapState *cs) {
449 lua_State *L = cs->L;
450 luaL_checkstack(L, 4, "too many captures");
451 switch (captype(cs->cap)) {
452 case Cposition: {
453 lua_pushinteger(L, cs->cap->s - cs->s + 1);
454 cs->cap++;
455 return 1;
456 }
457 case Cconst: {
458 pushluaval(cs);
459 cs->cap++;
460 return 1;
461 }
462 case Carg: {
463 int arg = (cs->cap++)->idx;
464 if (arg + FIXEDARGS > cs->ptop)
465 return luaL_error(L, "reference to absent extra argument #%d", arg);
466 lua_pushvalue(L, arg + FIXEDARGS);
467 return 1;
468 }
469 case Csimple: {
470 int k = pushnestedvalues(cs, 1);
471 lua_insert(L, -k); /* make whole match be first result */
472 return k;
473 }
474 case Cruntime: {
475 lua_pushvalue(L, (cs->cap++)->idx); /* value is in the stack */
476 return 1;
477 }
478 case Cstring: {
479 luaL_Buffer b;
480 luaL_buffinit(L, &b);
481 stringcap(&b, cs);
482 luaL_pushresult(&b);
483 return 1;
484 }
485 case Csubst: {
486 luaL_Buffer b;
487 luaL_buffinit(L, &b);
488 substcap(&b, cs);
489 luaL_pushresult(&b);
490 return 1;
491 }
492 case Cgroup: {
493 if (cs->cap->idx == 0) /* anonymous group? */
494 return pushnestedvalues(cs, 0); /* add all nested values */
495 else { /* named group: add no values */
496 nextcap(cs); /* skip capture */
497 return 0;
498 }
499 }
500 case Cbackref: return backrefcap(cs);
501 case Ctable: return tablecap(cs);
502 case Cfunction: return functioncap(cs);
503 case Cnum: return numcap(cs);
504 case Cquery: return querycap(cs);
505 case Cfold: return foldcap(cs);
506 default: assert(0); return 0;
507 }
508}
509
510
511/*
512** Prepare a CapState structure and traverse the entire list of
513** captures in the stack pushing its results. 's' is the subject
514** string, 'r' is the final position of the match, and 'ptop'
515** the index in the stack where some useful values were pushed.
516** Returns the number of results pushed. (If the list produces no
517** results, push the final position of the match.)
518*/
519int getcaptures (lua_State *L, const char *s, const char *r, int ptop) {
520 Capture *capture = (Capture *)lua_touserdata(L, caplistidx(ptop));
521 int n = 0;
522 if (!isclosecap(capture)) { /* is there any capture? */
523 CapState cs;
524 cs.ocap = cs.cap = capture; cs.L = L;
525 cs.s = s; cs.valuecached = 0; cs.ptop = ptop;
526 do { /* collect their values */
527 n += pushcapture(&cs);
528 } while (!isclosecap(cs.cap));
529 }
530 if (n == 0) { /* no capture values? */
531 lua_pushinteger(L, r - s + 1); /* return only end position */
532 n = 1;
533 }
534 return n;
535}
536
537
diff --git a/lpcap.h b/lpcap.h
new file mode 100644
index 0000000..6133df2
--- /dev/null
+++ b/lpcap.h
@@ -0,0 +1,56 @@
1/*
2** $Id: lpcap.h,v 1.3 2016/09/13 17:45:58 roberto Exp $
3*/
4
5#if !defined(lpcap_h)
6#define lpcap_h
7
8
9#include "lptypes.h"
10
11
12/* kinds of captures ('key' and 'ktable' describe where each kind keeps its extra value) */
13typedef enum CapKind {
14 Cclose, /* not used in trees */
15 Cposition,
16 Cconst, /* ktable[key] is Lua constant */
17 Cbackref, /* ktable[key] is "name" of group to get capture */
18 Carg, /* 'key' is arg's number */
19 Csimple, /* next node is pattern */
20 Ctable, /* next node is pattern */
21 Cfunction, /* ktable[key] is function; next node is pattern */
22 Cquery, /* ktable[key] is table; next node is pattern */
23 Cstring, /* ktable[key] is string; next node is pattern */
24 Cnum, /* numbered capture; 'key' is number of value to return */
25 Csubst, /* substitution capture; next node is pattern */
26 Cfold, /* ktable[key] is function; next node is pattern */
27 Cruntime, /* not used in trees (it uses another type in trees) */
28 Cgroup /* ktable[key] is group's "name" */
29} CapKind;
30
31/* one entry of a capture list */
32typedef struct Capture {
33 const char *s; /* subject position */
34 unsigned short idx; /* extra info (group name, arg index, etc.) */
35 byte kind; /* kind of capture (a CapKind value) */
36 byte siz; /* size of full capture + 1 (0 = not a full capture) */
37} Capture;
38
39/* state shared by the functions that evaluate a capture list */
40typedef struct CapState {
41 Capture *cap; /* current capture */
42 Capture *ocap; /* (original) capture list */
43 lua_State *L; /* Lua state where values are pushed */
44 int ptop; /* index of last argument to 'match' */
45 const char *s; /* original string */
46 int valuecached; /* value stored in cache slot */
47} CapState;
48
49
50int runtimecap (CapState *cs, Capture *close, const char *s, int *rem);
51int getcaptures (lua_State *L, const char *s, const char *r, int ptop);
52int finddyncap (Capture *cap, Capture *last);
53
54#endif
55
56
diff --git a/lpcode.c b/lpcode.c
new file mode 100644
index 0000000..2722d71
--- /dev/null
+++ b/lpcode.c
@@ -0,0 +1,1014 @@
1/*
2** $Id: lpcode.c,v 1.24 2016/09/15 17:46:13 roberto Exp $
3** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license)
4*/
5
6#include <limits.h>
7
8
9#include "lua.h"
10#include "lauxlib.h"
11
12#include "lptypes.h"
13#include "lpcode.h"
14
15
16/* signals a "no-instruction" */
17#define NOINST -1
18
19
20/* charset with every bit set, and a pointer to it for use as a 'follow' set */
21static const Charset fullset_ =
22 {{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
23 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
24 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
25 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}};
26
27static const Charset *fullset = &fullset_;
28
29/*
30** {======================================================
31** Analysis and some optimizations
32** =======================================================
33*/
34
35/*
36** Check whether a charset is empty (returns IFail), singleton (IChar),
37** full (IAny), or none of those (ISet). When singleton, '*c' returns
38** which character it is. (When generic set, the set was the input,
39** so there is no need to return it.)
40*/
41static Opcode charsettype (const byte *cs, int *c) {
42 int count = 0; /* number of characters in the set */
43 int i;
44 int candidate = -1; /* candidate position for the singleton char */
45 for (i = 0; i < CHARSETSIZE; i++) { /* for each byte */
46 int b = cs[i];
47 if (b == 0) { /* is byte empty? */
48 if (count > 1) /* was set neither empty nor singleton? */
49 return ISet; /* neither full nor empty nor singleton */
50 /* else set is still empty or singleton */
51 }
52 else if (b == 0xFF) { /* is byte full? */
53 if (count < (i * BITSPERCHAR)) /* was set not full? */
54 return ISet; /* neither full nor empty nor singleton */
55 else count += BITSPERCHAR; /* set is still full */
56 }
57 else if ((b & (b - 1)) == 0) { /* has byte only one bit? */
58 if (count > 0) /* was set not empty? */
59 return ISet; /* neither full nor empty nor singleton */
60 else { /* set has only one char till now; track it */
61 count++;
62 candidate = i;
63 }
64 }
65 else return ISet; /* byte is neither empty, full, nor singleton */
66 }
67 switch (count) {
68 case 0: return IFail; /* empty set */
69 case 1: { /* singleton; find character bit inside byte */
70 int b = cs[candidate];
71 *c = candidate * BITSPERCHAR;
72 if ((b & 0xF0) != 0) { *c += 4; b >>= 4; }
73 if ((b & 0x0C) != 0) { *c += 2; b >>= 2; }
74 if ((b & 0x02) != 0) { *c += 1; }
75 return IChar;
76 }
77 default: {
78 assert(count == CHARSETSIZE * BITSPERCHAR); /* full set */
79 return IAny;
80 }
81 }
82}
83
84
85/*
86** A few basic operations on Charsets
87*/
88static void cs_complement (Charset *cs) {
89 loopset(i, cs->cs[i] = ~cs->cs[i]);
90}
91
92static int cs_equal (const byte *cs1, const byte *cs2) {
93 loopset(i, if (cs1[i] != cs2[i]) return 0);
94 return 1;
95}
96
97static int cs_disjoint (const Charset *cs1, const Charset *cs2) {
98 loopset(i, if ((cs1->cs[i] & cs2->cs[i]) != 0) return 0;)
99 return 1;
100}
101
102
103/*
104** If 'tree' is a 'char' pattern (TSet, TChar, TAny), convert it into a
105** charset and return 1; else return 0.
106*/
107int tocharset (TTree *tree, Charset *cs) {
108 switch (tree->tag) {
109 case TSet: { /* copy set */
110 loopset(i, cs->cs[i] = treebuffer(tree)[i]);
111 return 1;
112 }
113 case TChar: { /* only one char */
114 assert(0 <= tree->u.n && tree->u.n <= UCHAR_MAX);
115 loopset(i, cs->cs[i] = 0); /* erase all chars */
116 setchar(cs->cs, tree->u.n); /* add that one */
117 return 1;
118 }
119 case TAny: {
120 loopset(i, cs->cs[i] = 0xFF); /* add all characters to the set */
121 return 1;
122 }
123 default: return 0;
124 }
125}
126
127
128/*
129** Visit a TCall node taking care to stop recursion. If node not yet
130** visited, return 'f(sib2(tree))', otherwise return 'def' (default
131** value)
132*/
133static int callrecursive (TTree *tree, int f (TTree *t), int def) {
134 int key = tree->key;
135 assert(tree->tag == TCall);
136 assert(sib2(tree)->tag == TRule);
137 if (key == 0) /* node already visited? */
138 return def; /* return default value */
139 else { /* first visit */
140 int result;
141 tree->key = 0; /* mark call as already visited */
142 result = f(sib2(tree)); /* go to called rule */
143 tree->key = key; /* restore tree */
144 return result;
145 }
146}
147
148
149/*
150** Check whether a pattern tree has captures
151*/
152int hascaptures (TTree *tree) {
153 tailcall:
154 switch (tree->tag) {
155 case TCapture: case TRunTime:
156 return 1;
157 case TCall:
158 return callrecursive(tree, hascaptures, 0);
159 case TRule: /* do not follow siblings */
160 tree = sib1(tree); goto tailcall;
161 case TOpenCall: assert(0);
162 default: {
163 switch (numsiblings[tree->tag]) {
164 case 1: /* return hascaptures(sib1(tree)); */
165 tree = sib1(tree); goto tailcall;
166 case 2:
167 if (hascaptures(sib1(tree)))
168 return 1;
169 /* else return hascaptures(sib2(tree)); */
170 tree = sib2(tree); goto tailcall;
171 default: assert(numsiblings[tree->tag] == 0); return 0;
172 }
173 }
174 }
175}
176
177
178/*
179** Checks how a pattern behaves regarding the empty string,
180** in one of two different ways:
181** A pattern is *nullable* if it can match without consuming any character;
182** A pattern is *nofail* if it never fails for any string
183** (including the empty string).
184** The difference is only for predicates and run-time captures;
185** for other patterns, the two properties are equivalent.
186** (With predicates, &'a' is nullable but not nofail. Of course,
187** nofail => nullable.)
188** These functions are all convervative in the following way:
189** p is nullable => nullable(p)
190** nofail(p) => p cannot fail
191** The function assumes that TOpenCall is not nullable;
192** this will be checked again when the grammar is fixed.
193** Run-time captures can do whatever they want, so the result
194** is conservative.
195*/
196int checkaux (TTree *tree, int pred) {
197 tailcall:
198 switch (tree->tag) {
199 case TChar: case TSet: case TAny:
200 case TFalse: case TOpenCall:
201 return 0; /* not nullable */
202 case TRep: case TTrue:
203 return 1; /* no fail */
204 case TNot: case TBehind: /* can match empty, but can fail */
205 if (pred == PEnofail) return 0;
206 else return 1; /* PEnullable */
207 case TAnd: /* can match empty; fail iff body does */
208 if (pred == PEnullable) return 1;
209 /* else return checkaux(sib1(tree), pred); */
210 tree = sib1(tree); goto tailcall;
211 case TRunTime: /* can fail; match empty iff body does */
212 if (pred == PEnofail) return 0;
213 /* else return checkaux(sib1(tree), pred); */
214 tree = sib1(tree); goto tailcall;
215 case TSeq:
216 if (!checkaux(sib1(tree), pred)) return 0;
217 /* else return checkaux(sib2(tree), pred); */
218 tree = sib2(tree); goto tailcall;
219 case TChoice:
220 if (checkaux(sib2(tree), pred)) return 1;
221 /* else return checkaux(sib1(tree), pred); */
222 tree = sib1(tree); goto tailcall;
223 case TCapture: case TGrammar: case TRule:
224 /* return checkaux(sib1(tree), pred); */
225 tree = sib1(tree); goto tailcall;
226 case TCall: /* return checkaux(sib2(tree), pred); */
227 tree = sib2(tree); goto tailcall;
228 default: assert(0); return 0;
229 }
230}
231
232
233/*
234** number of characters to match a pattern (or -1 if variable)
235*/
236int fixedlen (TTree *tree) {
237 int len = 0; /* to accumulate in tail calls */
238 tailcall:
239 switch (tree->tag) {
240 case TChar: case TSet: case TAny:
241 return len + 1;
242 case TFalse: case TTrue: case TNot: case TAnd: case TBehind:
243 return len;
244 case TRep: case TRunTime: case TOpenCall:
245 return -1;
246 case TCapture: case TRule: case TGrammar:
247 /* return fixedlen(sib1(tree)); */
248 tree = sib1(tree); goto tailcall;
249 case TCall: {
250 int n1 = callrecursive(tree, fixedlen, -1);
251 if (n1 < 0)
252 return -1;
253 else
254 return len + n1;
255 }
256 case TSeq: {
257 int n1 = fixedlen(sib1(tree));
258 if (n1 < 0)
259 return -1;
260 /* else return fixedlen(sib2(tree)) + len; */
261 len += n1; tree = sib2(tree); goto tailcall;
262 }
263 case TChoice: {
264 int n1 = fixedlen(sib1(tree));
265 int n2 = fixedlen(sib2(tree));
266 if (n1 != n2 || n1 < 0)
267 return -1;
268 else
269 return len + n1;
270 }
271 default: assert(0); return 0;
272 };
273}
274
275
276/*
277** Computes the 'first set' of a pattern.
278** The result is a conservative aproximation:
279** match p ax -> x (for some x) ==> a belongs to first(p)
280** or
281** a not in first(p) ==> match p ax -> fail (for all x)
282**
283** The set 'follow' is the first set of what follows the
284** pattern (full set if nothing follows it).
285**
286** The function returns 0 when this resulting set can be used for
287** test instructions that avoid the pattern altogether.
288** A non-zero return can happen for two reasons:
289** 1) match p '' -> '' ==> return has bit 1 set
290** (tests cannot be used because they would always fail for an empty input);
291** 2) there is a match-time capture ==> return has bit 2 set
292** (optimizations should not bypass match-time captures).
293*/
294static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) {
295 tailcall:
296 switch (tree->tag) {
297 case TChar: case TSet: case TAny: {
298 tocharset(tree, firstset);
299 return 0;
300 }
301 case TTrue: {
302 loopset(i, firstset->cs[i] = follow->cs[i]);
303 return 1; /* accepts the empty string */
304 }
305 case TFalse: {
306 loopset(i, firstset->cs[i] = 0);
307 return 0;
308 }
309 case TChoice: {
310 Charset csaux;
311 int e1 = getfirst(sib1(tree), follow, firstset);
312 int e2 = getfirst(sib2(tree), follow, &csaux);
313 loopset(i, firstset->cs[i] |= csaux.cs[i]);
314 return e1 | e2;
315 }
316 case TSeq: {
317 if (!nullable(sib1(tree))) {
318 /* when p1 is not nullable, p2 has nothing to contribute;
319 return getfirst(sib1(tree), fullset, firstset); */
320 tree = sib1(tree); follow = fullset; goto tailcall;
321 }
322 else { /* FIRST(p1 p2, fl) = FIRST(p1, FIRST(p2, fl)) */
323 Charset csaux;
324 int e2 = getfirst(sib2(tree), follow, &csaux);
325 int e1 = getfirst(sib1(tree), &csaux, firstset);
326 if (e1 == 0) return 0; /* 'e1' ensures that first can be used */
327 else if ((e1 | e2) & 2) /* one of the children has a matchtime? */
328 return 2; /* pattern has a matchtime capture */
329 else return e2; /* else depends on 'e2' */
330 }
331 }
332 case TRep: {
333 getfirst(sib1(tree), follow, firstset);
334 loopset(i, firstset->cs[i] |= follow->cs[i]);
335 return 1; /* accept the empty string */
336 }
337 case TCapture: case TGrammar: case TRule: {
338 /* return getfirst(sib1(tree), follow, firstset); */
339 tree = sib1(tree); goto tailcall;
340 }
341 case TRunTime: { /* function invalidates any follow info. */
342 int e = getfirst(sib1(tree), fullset, firstset);
343 if (e) return 2; /* function is not "protected"? */
344 else return 0; /* pattern inside capture ensures first can be used */
345 }
346 case TCall: {
347 /* return getfirst(sib2(tree), follow, firstset); */
348 tree = sib2(tree); goto tailcall;
349 }
350 case TAnd: {
351 int e = getfirst(sib1(tree), follow, firstset);
352 loopset(i, firstset->cs[i] &= follow->cs[i]);
353 return e;
354 }
355 case TNot: {
356 if (tocharset(sib1(tree), firstset)) {
357 cs_complement(firstset);
358 return 1;
359 }
360 /* else go through */
361 }
362 case TBehind: { /* instruction gives no new information */
363 /* call 'getfirst' only to check for math-time captures */
364 int e = getfirst(sib1(tree), follow, firstset);
365 loopset(i, firstset->cs[i] = follow->cs[i]); /* uses follow */
366 return e | 1; /* always can accept the empty string */
367 }
368 default: assert(0); return 0;
369 }
370}
371
372
373/*
374** If 'headfail(tree)' true, then 'tree' can fail only depending on the
375** next character of the subject.
376*/
377static int headfail (TTree *tree) {
378 tailcall:
379 switch (tree->tag) {
380 case TChar: case TSet: case TAny: case TFalse:
381 return 1;
382 case TTrue: case TRep: case TRunTime: case TNot:
383 case TBehind:
384 return 0;
385 case TCapture: case TGrammar: case TRule: case TAnd:
386 tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */
387 case TCall:
388 tree = sib2(tree); goto tailcall; /* return headfail(sib2(tree)); */
389 case TSeq:
390 if (!nofail(sib2(tree))) return 0;
391 /* else return headfail(sib1(tree)); */
392 tree = sib1(tree); goto tailcall;
393 case TChoice:
394 if (!headfail(sib1(tree))) return 0;
395 /* else return headfail(sib2(tree)); */
396 tree = sib2(tree); goto tailcall;
397 default: assert(0); return 0;
398 }
399}
400
401
402/*
403** Check whether the code generation for the given tree can benefit
404** from a follow set (to avoid computing the follow set when it is
405** not needed)
406*/
407static int needfollow (TTree *tree) {
408 tailcall:
409 switch (tree->tag) {
410 case TChar: case TSet: case TAny:
411 case TFalse: case TTrue: case TAnd: case TNot:
412 case TRunTime: case TGrammar: case TCall: case TBehind:
413 return 0;
414 case TChoice: case TRep:
415 return 1;
416 case TCapture:
417 tree = sib1(tree); goto tailcall;
418 case TSeq:
419 tree = sib2(tree); goto tailcall;
420 default: assert(0); return 0;
421 }
422}
423
424/* }====================================================== */
425
426
427
428/*
429** {======================================================
430** Code generation
431** =======================================================
432*/
433
434
435/*
436** size of an instruction
437*/
438int sizei (const Instruction *i) {
439 switch((Opcode)i->i.code) {
440 case ISet: case ISpan: return CHARSETINSTSIZE;
441 case ITestSet: return CHARSETINSTSIZE + 1;
442 case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall:
443 case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit:
444 return 2;
445 default: return 1;
446 }
447}
448
449
450/*
451** state for the compiler
452*/
453typedef struct CompileState {
454 Pattern *p; /* pattern being compiled */
455 int ncode; /* next position in p->code to be filled */
456 lua_State *L;
457} CompileState;
458
459
460/*
461** code generation is recursive; 'opt' indicates that the code is being
462** generated as the last thing inside an optional pattern (so, if that
463** code is optional too, it can reuse the 'IChoice' already in place for
464** the outer pattern). 'tt' points to a previous test protecting this
465** code (or NOINST). 'fl' is the follow set of the pattern.
466*/
467static void codegen (CompileState *compst, TTree *tree, int opt, int tt,
468 const Charset *fl);
469
470
471void realloccode (lua_State *L, Pattern *p, int nsize) {
472 void *ud;
473 lua_Alloc f = lua_getallocf(L, &ud);
474 void *newblock = f(ud, p->code, p->codesize * sizeof(Instruction),
475 nsize * sizeof(Instruction));
476 if (newblock == NULL && nsize > 0)
477 luaL_error(L, "not enough memory");
478 p->code = (Instruction *)newblock;
479 p->codesize = nsize;
480}
481
482
483static int nextinstruction (CompileState *compst) {
484 int size = compst->p->codesize;
485 if (compst->ncode >= size)
486 realloccode(compst->L, compst->p, size * 2);
487 return compst->ncode++;
488}
489
490
491#define getinstr(cs,i) ((cs)->p->code[i])
492
493
494static int addinstruction (CompileState *compst, Opcode op, int aux) {
495 int i = nextinstruction(compst);
496 getinstr(compst, i).i.code = op;
497 getinstr(compst, i).i.aux = aux;
498 return i;
499}
500
501
502/*
503** Add an instruction followed by space for an offset (to be set later)
504*/
505static int addoffsetinst (CompileState *compst, Opcode op) {
506 int i = addinstruction(compst, op, 0); /* instruction */
507 addinstruction(compst, (Opcode)0, 0); /* open space for offset */
508 assert(op == ITestSet || sizei(&getinstr(compst, i)) == 2);
509 return i;
510}
511
512
513/*
514** Set the offset of an instruction
515*/
516static void setoffset (CompileState *compst, int instruction, int offset) {
517 getinstr(compst, instruction + 1).offset = offset;
518}
519
520
521/*
522** Add a capture instruction:
523** 'op' is the capture instruction; 'cap' the capture kind;
524** 'key' the key into ktable; 'aux' is the optional capture offset
525**
526*/
527static int addinstcap (CompileState *compst, Opcode op, int cap, int key,
528 int aux) {
529 int i = addinstruction(compst, op, joinkindoff(cap, aux));
530 getinstr(compst, i).i.key = key;
531 return i;
532}
533
534
/* position where the next instruction will be written */
#define gethere(compst) 	((compst)->ncode)

/*
** absolute position that instruction 'i' of 'code' jumps to (its own
** position plus the offset stored in the following slot); arguments
** are fully parenthesized so that any expression can be passed
*/
#define target(code,i)		((i) + (code)[(i) + 1].offset)
538
539
540/*
541** Patch 'instruction' to jump to 'target'
542*/
543static void jumptothere (CompileState *compst, int instruction, int target) {
544 if (instruction >= 0)
545 setoffset(compst, instruction, target - instruction);
546}
547
548
549/*
550** Patch 'instruction' to jump to current position
551*/
552static void jumptohere (CompileState *compst, int instruction) {
553 jumptothere(compst, instruction, gethere(compst));
554}
555
556
557/*
558** Code an IChar instruction, or IAny if there is an equivalent
559** test dominating it
560*/
561static void codechar (CompileState *compst, int c, int tt) {
562 if (tt >= 0 && getinstr(compst, tt).i.code == ITestChar &&
563 getinstr(compst, tt).i.aux == c)
564 addinstruction(compst, IAny, 0);
565 else
566 addinstruction(compst, IChar, c);
567}
568
569
570/*
571** Add a charset posfix to an instruction
572*/
573static void addcharset (CompileState *compst, const byte *cs) {
574 int p = gethere(compst);
575 int i;
576 for (i = 0; i < (int)CHARSETINSTSIZE - 1; i++)
577 nextinstruction(compst); /* space for buffer */
578 /* fill buffer with charset */
579 loopset(j, getinstr(compst, p).buff[j] = cs[j]);
580}
581
582
583/*
584** code a char set, optimizing unit sets for IChar, "complete"
585** sets for IAny, and empty sets for IFail; also use an IAny
586** when instruction is dominated by an equivalent test.
587*/
588static void codecharset (CompileState *compst, const byte *cs, int tt) {
589 int c = 0; /* (=) to avoid warnings */
590 Opcode op = charsettype(cs, &c);
591 switch (op) {
592 case IChar: codechar(compst, c, tt); break;
593 case ISet: { /* non-trivial set? */
594 if (tt >= 0 && getinstr(compst, tt).i.code == ITestSet &&
595 cs_equal(cs, getinstr(compst, tt + 2).buff))
596 addinstruction(compst, IAny, 0);
597 else {
598 addinstruction(compst, ISet, 0);
599 addcharset(compst, cs);
600 }
601 break;
602 }
603 default: addinstruction(compst, op, c); break;
604 }
605}
606
607
608/*
609** code a test set, optimizing unit sets for ITestChar, "complete"
610** sets for ITestAny, and empty sets for IJmp (always fails).
611** 'e' is true iff test should accept the empty string. (Test
612** instructions in the current VM never accept the empty string.)
613*/
614static int codetestset (CompileState *compst, Charset *cs, int e) {
615 if (e) return NOINST; /* no test */
616 else {
617 int c = 0;
618 Opcode op = charsettype(cs->cs, &c);
619 switch (op) {
620 case IFail: return addoffsetinst(compst, IJmp); /* always jump */
621 case IAny: return addoffsetinst(compst, ITestAny);
622 case IChar: {
623 int i = addoffsetinst(compst, ITestChar);
624 getinstr(compst, i).i.aux = c;
625 return i;
626 }
627 case ISet: {
628 int i = addoffsetinst(compst, ITestSet);
629 addcharset(compst, cs->cs);
630 return i;
631 }
632 default: assert(0); return 0;
633 }
634 }
635}
636
637
638/*
639** Find the final destination of a sequence of jumps
640*/
641static int finaltarget (Instruction *code, int i) {
642 while (code[i].i.code == IJmp)
643 i = target(code, i);
644 return i;
645}
646
647
648/*
649** final label (after traversing any jumps)
650*/
651static int finallabel (Instruction *code, int i) {
652 return finaltarget(code, target(code, i));
653}
654
655
656/*
657** <behind(p)> == behind n; <p> (where n = fixedlen(p))
658*/
659static void codebehind (CompileState *compst, TTree *tree) {
660 if (tree->u.n > 0)
661 addinstruction(compst, IBehind, tree->u.n);
662 codegen(compst, sib1(tree), 0, NOINST, fullset);
663}
664
665
666/*
667** Choice; optimizations:
668** - when p1 is headfail or
669** when first(p1) and first(p2) are disjoint, than
670** a character not in first(p1) cannot go to p1, and a character
671** in first(p1) cannot go to p2 (at it is not in first(p2)).
672** (The optimization is not valid if p1 accepts the empty string,
673** as then there is no character at all...)
674** - when p2 is empty and opt is true; a IPartialCommit can reuse
675** the Choice already active in the stack.
676*/
677static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt,
678 const Charset *fl) {
679 int emptyp2 = (p2->tag == TTrue);
680 Charset cs1, cs2;
681 int e1 = getfirst(p1, fullset, &cs1);
682 if (headfail(p1) ||
683 (!e1 && (getfirst(p2, fl, &cs2), cs_disjoint(&cs1, &cs2)))) {
684 /* <p1 / p2> == test (fail(p1)) -> L1 ; p1 ; jmp L2; L1: p2; L2: */
685 int test = codetestset(compst, &cs1, 0);
686 int jmp = NOINST;
687 codegen(compst, p1, 0, test, fl);
688 if (!emptyp2)
689 jmp = addoffsetinst(compst, IJmp);
690 jumptohere(compst, test);
691 codegen(compst, p2, opt, NOINST, fl);
692 jumptohere(compst, jmp);
693 }
694 else if (opt && emptyp2) {
695 /* p1? == IPartialCommit; p1 */
696 jumptohere(compst, addoffsetinst(compst, IPartialCommit));
697 codegen(compst, p1, 1, NOINST, fullset);
698 }
699 else {
700 /* <p1 / p2> ==
701 test(first(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: */
702 int pcommit;
703 int test = codetestset(compst, &cs1, e1);
704 int pchoice = addoffsetinst(compst, IChoice);
705 codegen(compst, p1, emptyp2, test, fullset);
706 pcommit = addoffsetinst(compst, ICommit);
707 jumptohere(compst, pchoice);
708 jumptohere(compst, test);
709 codegen(compst, p2, opt, NOINST, fl);
710 jumptohere(compst, pcommit);
711 }
712}
713
714
715/*
716** And predicate
717** optimization: fixedlen(p) = n ==> <&p> == <p>; behind n
718** (valid only when 'p' has no captures)
719*/
720static void codeand (CompileState *compst, TTree *tree, int tt) {
721 int n = fixedlen(tree);
722 if (n >= 0 && n <= MAXBEHIND && !hascaptures(tree)) {
723 codegen(compst, tree, 0, tt, fullset);
724 if (n > 0)
725 addinstruction(compst, IBehind, n);
726 }
727 else { /* default: Choice L1; p1; BackCommit L2; L1: Fail; L2: */
728 int pcommit;
729 int pchoice = addoffsetinst(compst, IChoice);
730 codegen(compst, tree, 0, tt, fullset);
731 pcommit = addoffsetinst(compst, IBackCommit);
732 jumptohere(compst, pchoice);
733 addinstruction(compst, IFail, 0);
734 jumptohere(compst, pcommit);
735 }
736}
737
738
739/*
740** Captures: if pattern has fixed (and not too big) length, and it
741** has no nested captures, use a single IFullCapture instruction
742** after the match; otherwise, enclose the pattern with OpenCapture -
743** CloseCapture.
744*/
745static void codecapture (CompileState *compst, TTree *tree, int tt,
746 const Charset *fl) {
747 int len = fixedlen(sib1(tree));
748 if (len >= 0 && len <= MAXOFF && !hascaptures(sib1(tree))) {
749 codegen(compst, sib1(tree), 0, tt, fl);
750 addinstcap(compst, IFullCapture, tree->cap, tree->key, len);
751 }
752 else {
753 addinstcap(compst, IOpenCapture, tree->cap, tree->key, 0);
754 codegen(compst, sib1(tree), 0, tt, fl);
755 addinstcap(compst, ICloseCapture, Cclose, 0, 0);
756 }
757}
758
759
760static void coderuntime (CompileState *compst, TTree *tree, int tt) {
761 addinstcap(compst, IOpenCapture, Cgroup, tree->key, 0);
762 codegen(compst, sib1(tree), 0, tt, fullset);
763 addinstcap(compst, ICloseRunTime, Cclose, 0, 0);
764}
765
766
767/*
768** Repetion; optimizations:
769** When pattern is a charset, can use special instruction ISpan.
770** When pattern is head fail, or if it starts with characters that
771** are disjoint from what follows the repetions, a simple test
772** is enough (a fail inside the repetition would backtrack to fail
773** again in the following pattern, so there is no need for a choice).
774** When 'opt' is true, the repetion can reuse the Choice already
775** active in the stack.
776*/
777static void coderep (CompileState *compst, TTree *tree, int opt,
778 const Charset *fl) {
779 Charset st;
780 if (tocharset(tree, &st)) {
781 addinstruction(compst, ISpan, 0);
782 addcharset(compst, st.cs);
783 }
784 else {
785 int e1 = getfirst(tree, fullset, &st);
786 if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) {
787 /* L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: */
788 int jmp;
789 int test = codetestset(compst, &st, 0);
790 codegen(compst, tree, 0, test, fullset);
791 jmp = addoffsetinst(compst, IJmp);
792 jumptohere(compst, test);
793 jumptothere(compst, jmp, test);
794 }
795 else {
796 /* test(fail(p1)) -> L2; choice L2; L1: <p>; partialcommit L1; L2: */
797 /* or (if 'opt'): partialcommit L1; L1: <p>; partialcommit L1; */
798 int commit, l2;
799 int test = codetestset(compst, &st, e1);
800 int pchoice = NOINST;
801 if (opt)
802 jumptohere(compst, addoffsetinst(compst, IPartialCommit));
803 else
804 pchoice = addoffsetinst(compst, IChoice);
805 l2 = gethere(compst);
806 codegen(compst, tree, 0, NOINST, fullset);
807 commit = addoffsetinst(compst, IPartialCommit);
808 jumptothere(compst, commit, l2);
809 jumptohere(compst, pchoice);
810 jumptohere(compst, test);
811 }
812 }
813}
814
815
816/*
817** Not predicate; optimizations:
818** In any case, if first test fails, 'not' succeeds, so it can jump to
819** the end. If pattern is headfail, that is all (it cannot fail
820** in other parts); this case includes 'not' of simple sets. Otherwise,
821** use the default code (a choice plus a failtwice).
822*/
823static void codenot (CompileState *compst, TTree *tree) {
824 Charset st;
825 int e = getfirst(tree, fullset, &st);
826 int test = codetestset(compst, &st, e);
827 if (headfail(tree)) /* test (fail(p1)) -> L1; fail; L1: */
828 addinstruction(compst, IFail, 0);
829 else {
830 /* test(fail(p))-> L1; choice L1; <p>; failtwice; L1: */
831 int pchoice = addoffsetinst(compst, IChoice);
832 codegen(compst, tree, 0, NOINST, fullset);
833 addinstruction(compst, IFailTwice, 0);
834 jumptohere(compst, pchoice);
835 }
836 jumptohere(compst, test);
837}
838
839
840/*
841** change open calls to calls, using list 'positions' to find
842** correct offsets; also optimize tail calls
843*/
844static void correctcalls (CompileState *compst, int *positions,
845 int from, int to) {
846 int i;
847 Instruction *code = compst->p->code;
848 for (i = from; i < to; i += sizei(&code[i])) {
849 if (code[i].i.code == IOpenCall) {
850 int n = code[i].i.key; /* rule number */
851 int rule = positions[n]; /* rule position */
852 assert(rule == from || code[rule - 1].i.code == IRet);
853 if (code[finaltarget(code, i + 2)].i.code == IRet) /* call; ret ? */
854 code[i].i.code = IJmp; /* tail call */
855 else
856 code[i].i.code = ICall;
857 jumptothere(compst, i, rule); /* call jumps to respective rule */
858 }
859 }
860 assert(i == to);
861}
862
863
864/*
865** Code for a grammar:
866** call L1; jmp L2; L1: rule 1; ret; rule 2; ret; ...; L2:
867*/
868static void codegrammar (CompileState *compst, TTree *grammar) {
869 int positions[MAXRULES];
870 int rulenumber = 0;
871 TTree *rule;
872 int firstcall = addoffsetinst(compst, ICall); /* call initial rule */
873 int jumptoend = addoffsetinst(compst, IJmp); /* jump to the end */
874 int start = gethere(compst); /* here starts the initial rule */
875 jumptohere(compst, firstcall);
876 for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) {
877 positions[rulenumber++] = gethere(compst); /* save rule position */
878 codegen(compst, sib1(rule), 0, NOINST, fullset); /* code rule */
879 addinstruction(compst, IRet, 0);
880 }
881 assert(rule->tag == TTrue);
882 jumptohere(compst, jumptoend);
883 correctcalls(compst, positions, start, gethere(compst));
884}
885
886
887static void codecall (CompileState *compst, TTree *call) {
888 int c = addoffsetinst(compst, IOpenCall); /* to be corrected later */
889 getinstr(compst, c).i.key = sib2(call)->cap; /* rule number */
890 assert(sib2(call)->tag == TRule);
891}
892
893
894/*
895** Code first child of a sequence
896** (second child is called in-place to allow tail call)
897** Return 'tt' for second child
898*/
899static int codeseq1 (CompileState *compst, TTree *p1, TTree *p2,
900 int tt, const Charset *fl) {
901 if (needfollow(p1)) {
902 Charset fl1;
903 getfirst(p2, fl, &fl1); /* p1 follow is p2 first */
904 codegen(compst, p1, 0, tt, &fl1);
905 }
906 else /* use 'fullset' as follow */
907 codegen(compst, p1, 0, tt, fullset);
908 if (fixedlen(p1) != 0) /* can 'p1' consume anything? */
909 return NOINST; /* invalidate test */
910 else return tt; /* else 'tt' still protects sib2 */
911}
912
913
914/*
915** Main code-generation function: dispatch to auxiliar functions
916** according to kind of tree. ('needfollow' should return true
917** only for consructions that use 'fl'.)
918*/
919static void codegen (CompileState *compst, TTree *tree, int opt, int tt,
920 const Charset *fl) {
921 tailcall:
922 switch (tree->tag) {
923 case TChar: codechar(compst, tree->u.n, tt); break;
924 case TAny: addinstruction(compst, IAny, 0); break;
925 case TSet: codecharset(compst, treebuffer(tree), tt); break;
926 case TTrue: break;
927 case TFalse: addinstruction(compst, IFail, 0); break;
928 case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break;
929 case TRep: coderep(compst, sib1(tree), opt, fl); break;
930 case TBehind: codebehind(compst, tree); break;
931 case TNot: codenot(compst, sib1(tree)); break;
932 case TAnd: codeand(compst, sib1(tree), tt); break;
933 case TCapture: codecapture(compst, tree, tt, fl); break;
934 case TRunTime: coderuntime(compst, tree, tt); break;
935 case TGrammar: codegrammar(compst, tree); break;
936 case TCall: codecall(compst, tree); break;
937 case TSeq: {
938 tt = codeseq1(compst, sib1(tree), sib2(tree), tt, fl); /* code 'p1' */
939 /* codegen(compst, p2, opt, tt, fl); */
940 tree = sib2(tree); goto tailcall;
941 }
942 default: assert(0);
943 }
944}
945
946
947/*
948** Optimize jumps and other jump-like instructions.
949** * Update labels of instructions with labels to their final
950** destinations (e.g., choice L1; ... L1: jmp L2: becomes
951** choice L2)
952** * Jumps to other instructions that do jumps become those
953** instructions (e.g., jump to return becomes a return; jump
954** to commit becomes a commit)
955*/
956static void peephole (CompileState *compst) {
957 Instruction *code = compst->p->code;
958 int i;
959 for (i = 0; i < compst->ncode; i += sizei(&code[i])) {
960 redo:
961 switch (code[i].i.code) {
962 case IChoice: case ICall: case ICommit: case IPartialCommit:
963 case IBackCommit: case ITestChar: case ITestSet:
964 case ITestAny: { /* instructions with labels */
965 jumptothere(compst, i, finallabel(code, i)); /* optimize label */
966 break;
967 }
968 case IJmp: {
969 int ft = finaltarget(code, i);
970 switch (code[ft].i.code) { /* jumping to what? */
971 case IRet: case IFail: case IFailTwice:
972 case IEnd: { /* instructions with unconditional implicit jumps */
973 code[i] = code[ft]; /* jump becomes that instruction */
974 code[i + 1].i.code = IAny; /* 'no-op' for target position */
975 break;
976 }
977 case ICommit: case IPartialCommit:
978 case IBackCommit: { /* inst. with unconditional explicit jumps */
979 int fft = finallabel(code, ft);
980 code[i] = code[ft]; /* jump becomes that instruction... */
981 jumptothere(compst, i, fft); /* but must correct its offset */
982 goto redo; /* reoptimize its label */
983 }
984 default: {
985 jumptothere(compst, i, ft); /* optimize label */
986 break;
987 }
988 }
989 break;
990 }
991 default: break;
992 }
993 }
994 assert(code[i - 1].i.code == IEnd);
995}
996
997
998/*
999** Compile a pattern
1000*/
1001Instruction *compile (lua_State *L, Pattern *p) {
1002 CompileState compst;
1003 compst.p = p; compst.ncode = 0; compst.L = L;
1004 realloccode(L, p, 2); /* minimum initial size */
1005 codegen(&compst, p->tree, 0, NOINST, fullset);
1006 addinstruction(&compst, IEnd, 0);
1007 realloccode(L, p, compst.ncode); /* set final size */
1008 peephole(&compst);
1009 return p->code;
1010}
1011
1012
1013/* }====================================================== */
1014
diff --git a/lpcode.h b/lpcode.h
new file mode 100644
index 0000000..2a5861e
--- /dev/null
+++ b/lpcode.h
@@ -0,0 +1,40 @@
1/*
2** $Id: lpcode.h,v 1.8 2016/09/15 17:46:13 roberto Exp $
3*/
4
5#if !defined(lpcode_h)
6#define lpcode_h
7
8#include "lua.h"
9
10#include "lptypes.h"
11#include "lptree.h"
12#include "lpvm.h"
13
14int tocharset (TTree *tree, Charset *cs);
15int checkaux (TTree *tree, int pred);
16int fixedlen (TTree *tree);
17int hascaptures (TTree *tree);
18int lp_gc (lua_State *L);
19Instruction *compile (lua_State *L, Pattern *p);
20void realloccode (lua_State *L, Pattern *p, int nsize);
21int sizei (const Instruction *i);
22
23
24#define PEnullable 0
25#define PEnofail 1
26
27/*
28** nofail(t) implies that 't' cannot fail with any input
29*/
30#define nofail(t) checkaux(t, PEnofail)
31
32/*
33** (not nullable(t)) implies 't' cannot match without consuming
34** something
35*/
36#define nullable(t) checkaux(t, PEnullable)
37
38
39
40#endif
diff --git a/lpeg.html b/lpeg.html
new file mode 100644
index 0000000..5c9535f
--- /dev/null
+++ b/lpeg.html
@@ -0,0 +1,1445 @@
1<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4<head>
5 <title>LPeg - Parsing Expression Grammars For Lua</title>
6 <link rel="stylesheet"
7 href="http://www.inf.puc-rio.br/~roberto/lpeg/doc.css"
8 type="text/css"/>
9 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
10</head>
11<body>
12
13<!-- $Id: lpeg.html,v 1.77 2017/01/13 13:40:05 roberto Exp $ -->
14
15<div id="container">
16
17<div id="product">
18 <div id="product_logo">
19 <a href="http://www.inf.puc-rio.br/~roberto/lpeg/">
20 <img alt="LPeg logo" src="lpeg-128.gif"/></a>
21
22 </div>
23 <div id="product_name"><big><strong>LPeg</strong></big></div>
24 <div id="product_description">
25 Parsing Expression Grammars For Lua, version 1.0
26 </div>
27</div> <!-- id="product" -->
28
29<div id="main">
30
31<div id="navigation">
32<h1>LPeg</h1>
33
34<ul>
35 <li><strong>Home</strong>
36 <ul>
37 <li><a href="#intro">Introduction</a></li>
38 <li><a href="#func">Functions</a></li>
39 <li><a href="#basic">Basic Constructions</a></li>
40 <li><a href="#grammar">Grammars</a></li>
41 <li><a href="#captures">Captures</a></li>
42 <li><a href="#ex">Some Examples</a></li>
43 <li><a href="re.html">The <code>re</code> Module</a></li>
44 <li><a href="#download">Download</a></li>
45 <li><a href="#license">License</a></li>
46 </ul>
47 </li>
48</ul>
49</div> <!-- id="navigation" -->
50
51<div id="content">
52
53
54<h2><a name="intro">Introduction</a></h2>
55
56<p>
57<em>LPeg</em> is a new pattern-matching library for Lua,
58based on
59<a href="http://pdos.csail.mit.edu/%7Ebaford/packrat/">
60Parsing Expression Grammars</a> (PEGs).
61This text is a reference manual for the library.
62For a more formal treatment of LPeg,
63as well as some discussion about its implementation,
64see
65<a href="http://www.inf.puc-rio.br/~roberto/docs/peg.pdf">
66A Text Pattern-Matching Tool based on Parsing Expression Grammars</a>.
67(You may also be interested in my
68<a href="http://vimeo.com/1485123">talk about LPeg</a>
69given at the III Lua Workshop.)
70</p>
71
72<p>
73Following the Snobol tradition,
74LPeg defines patterns as first-class objects.
75That is, patterns are regular Lua values
76(represented by userdata).
77The library offers several functions to create
78and compose patterns.
79With the use of metamethods,
80several of these functions are provided as infix or prefix
81operators.
82On the one hand,
83the result is usually much more verbose than the typical
84encoding of patterns using the so called
85<em>regular expressions</em>
86(which typically are not regular expressions in the formal sense).
87On the other hand,
88first-class patterns allow much better documentation
89(as it is easy to comment the code,
90to break complex definitions in smaller parts, etc.)
91and are extensible,
92as we can define new functions to create and compose patterns.
93</p>
94
95<p>
96For a quick glance at the library,
97the following table summarizes its basic operations
98for creating patterns:
99</p>
100<table border="1">
101<tbody><tr><td><b>Operator</b></td><td><b>Description</b></td></tr>
102<tr><td><a href="#op-p"><code>lpeg.P(string)</code></a></td>
103 <td>Matches <code>string</code> literally</td></tr>
104<tr><td><a href="#op-p"><code>lpeg.P(n)</code></a></td>
105 <td>Matches exactly <code>n</code> characters</td></tr>
106<tr><td><a href="#op-s"><code>lpeg.S(string)</code></a></td>
107 <td>Matches any character in <code>string</code> (Set)</td></tr>
108<tr><td><a href="#op-r"><code>lpeg.R("<em>xy</em>")</code></a></td>
109 <td>Matches any character between <em>x</em> and <em>y</em> (Range)</td></tr>
110<tr><td><a href="#op-pow"><code>patt^n</code></a></td>
111 <td>Matches at least <code>n</code> repetitions of <code>patt</code></td></tr>
112<tr><td><a href="#op-pow"><code>patt^-n</code></a></td>
113 <td>Matches at most <code>n</code> repetitions of <code>patt</code></td></tr>
114<tr><td><a href="#op-mul"><code>patt1 * patt2</code></a></td>
115 <td>Matches <code>patt1</code> followed by <code>patt2</code></td></tr>
116<tr><td><a href="#op-add"><code>patt1 + patt2</code></a></td>
117 <td>Matches <code>patt1</code> or <code>patt2</code>
118 (ordered choice)</td></tr>
119<tr><td><a href="#op-sub"><code>patt1 - patt2</code></a></td>
120 <td>Matches <code>patt1</code> if <code>patt2</code> does not match</td></tr>
121<tr><td><a href="#op-unm"><code>-patt</code></a></td>
122 <td>Equivalent to <code>("" - patt)</code></td></tr>
123<tr><td><a href="#op-len"><code>#patt</code></a></td>
124 <td>Matches <code>patt</code> but consumes no input</td></tr>
125<tr><td><a href="#op-behind"><code>lpeg.B(patt)</code></a></td>
126 <td>Matches <code>patt</code> behind the current position,
127 consuming no input</td></tr>
128</tbody></table>
129
130<p>As a very simple example,
131<code>lpeg.R("09")^1</code> creates a pattern that
132matches a non-empty sequence of digits.
133As a not so simple example,
134<code>-lpeg.P(1)</code>
135(which can be written as <code>lpeg.P(-1)</code>,
136or simply <code>-1</code> for operations expecting a pattern)
137matches an empty string only if it cannot match a single character;
138so, it succeeds only at the end of the subject.
139</p>
140
141<p>
142LPeg also offers the <a href="re.html"><code>re</code> module</a>,
143which implements patterns following a regular-expression style
144(e.g., <code>[0-9]+</code>).
145(This module is 260 lines of Lua code,
146and of course it uses LPeg to parse regular expressions and
147translate them to regular LPeg patterns.)
148</p>
149
150
151<h2><a name="func">Functions</a></h2>
152
153
154<h3><a name="f-match"></a><code>lpeg.match (pattern, subject [, init])</code></h3>
155<p>
156The matching function.
157It attempts to match the given pattern against the subject string.
158If the match succeeds,
159returns the index in the subject of the first character after the match,
160or the <a href="#captures">captured values</a>
161(if the pattern captured any value).
162</p>
163
164<p>
165An optional numeric argument <code>init</code> makes the match
166start at that position in the subject string.
167As usual in Lua libraries,
168a negative value counts from the end.
169</p>
170
171<p>
172Unlike typical pattern-matching functions,
173<code>match</code> works only in <em>anchored</em> mode;
174that is, it tries to match the pattern with a prefix of
175the given subject string (at position <code>init</code>),
176not with an arbitrary substring of the subject.
177So, if we want to find a pattern anywhere in a string,
178we must either write a loop in Lua or write a pattern that
179matches anywhere.
180This second approach is easy and quite efficient;
181see <a href="#ex">examples</a>.
182</p>
183
184<h3><a name="f-type"></a><code>lpeg.type (value)</code></h3>
185<p>
186If the given value is a pattern,
187returns the string <code>"pattern"</code>.
188Otherwise returns nil.
189</p>
190
191<h3><a name="f-version"></a><code>lpeg.version ()</code></h3>
192<p>
193Returns a string with the running version of LPeg.
194</p>
195
196<h3><a name="f-setstack"></a><code>lpeg.setmaxstack (max)</code></h3>
197<p>
198Sets a limit for the size of the backtrack stack used by LPeg to
199track calls and choices.
200(The default limit is 400.)
201Most well-written patterns need few backtrack levels and
202therefore you seldom need to change this limit;
203before changing it you should try to rewrite your
204pattern to avoid the need for extra space.
205Nevertheless, a few useful patterns may overflow.
206Also, with recursive grammars,
207subjects with deep recursion may need larger limits.
208</p>
209
210
211<h2><a name="basic">Basic Constructions</a></h2>
212
213<p>
214The following operations build patterns.
215All operations that expect a pattern as an argument
216may also receive strings, tables, numbers, booleans, or functions,
217which are translated to patterns according to
218the rules of function <a href="#op-p"><code>lpeg.P</code></a>.
219</p>
220
221
222
223<h3><a name="op-p"></a><code>lpeg.P (value)</code></h3>
224<p>
225Converts the given value into a proper pattern,
226according to the following rules:
227</p>
228<ul>
229
230<li><p>
231If the argument is a pattern,
232it is returned unmodified.
233</p></li>
234
235<li><p>
236If the argument is a string,
237it is translated to a pattern that matches the string literally.
238</p></li>
239
240<li><p>
241If the argument is a non-negative number <em>n</em>,
242the result is a pattern that matches exactly <em>n</em> characters.
243</p></li>
244
245<li><p>
246If the argument is a negative number <em>-n</em>,
247the result is a pattern that
248succeeds only if the input string has fewer than <em>n</em> characters left:
249<code>lpeg.P(-n)</code>
250is equivalent to <code>-lpeg.P(n)</code>
251(see the <a href="#op-unm">unary minus operation</a>).
252</p></li>
253
254<li><p>
255If the argument is a boolean,
256the result is a pattern that always succeeds or always fails
257(according to the boolean value),
258without consuming any input.
259</p></li>
260
261<li><p>
262If the argument is a table,
263it is interpreted as a grammar
264(see <a href="#grammar">Grammars</a>).
265</p></li>
266
267<li><p>
268If the argument is a function,
269returns a pattern equivalent to a
270<a href="#matchtime">match-time capture</a> over the empty string.
271</p></li>
272
273</ul>
274
275
276<h3><a name="op-behind"></a><code>lpeg.B(patt)</code></h3>
277<p>
278Returns a pattern that
279matches only if the input string at the current position
280is preceded by <code>patt</code>.
281Pattern <code>patt</code> must match only strings
282with some fixed length,
283and it cannot contain captures.
284</p>
285
286<p>
287Like the <a href="#op-len">and predicate</a>,
288this pattern never consumes any input,
289independently of success or failure.
290</p>
291
292
293<h3><a name="op-r"></a><code>lpeg.R ({range})</code></h3>
294<p>
295Returns a pattern that matches any single character
296belonging to one of the given <em>ranges</em>.
297Each <code>range</code> is a string <em>xy</em> of length 2,
298representing all characters with code
299between the codes of <em>x</em> and <em>y</em>
300(both inclusive).
301</p>
302
303<p>
304As an example, the pattern
305<code>lpeg.R("09")</code> matches any digit,
306and <code>lpeg.R("az", "AZ")</code> matches any ASCII letter.
307</p>
308
309
310<h3><a name="op-s"></a><code>lpeg.S (string)</code></h3>
311<p>
312Returns a pattern that matches any single character that
313appears in the given string.
314(The <code>S</code> stands for <em>Set</em>.)
315</p>
316
317<p>
318As an example, the pattern
319<code>lpeg.S("+-*/")</code> matches any arithmetic operator.
320</p>
321
322<p>
323Note that, if <code>s</code> is a character
324(that is, a string of length 1),
325then <code>lpeg.P(s)</code> is equivalent to <code>lpeg.S(s)</code>
326which is equivalent to <code>lpeg.R(s..s)</code>.
327Note also that both <code>lpeg.S("")</code> and <code>lpeg.R()</code>
328are patterns that always fail.
329</p>
330
331
332<h3><a name="op-v"></a><code>lpeg.V (v)</code></h3>
333<p>
334This operation creates a non-terminal (a <em>variable</em>)
335for a grammar.
336The created non-terminal refers to the rule indexed by <code>v</code>
337in the enclosing grammar.
338(See <a href="#grammar">Grammars</a> for details.)
339</p>
340
341
342<h3><a name="op-locale"></a><code>lpeg.locale ([table])</code></h3>
343<p>
344Returns a table with patterns for matching some character classes
345according to the current locale.
346The table has fields named
347<code>alnum</code>,
348<code>alpha</code>,
349<code>cntrl</code>,
350<code>digit</code>,
351<code>graph</code>,
352<code>lower</code>,
353<code>print</code>,
354<code>punct</code>,
355<code>space</code>,
356<code>upper</code>, and
357<code>xdigit</code>,
358each one containing a corresponding pattern.
359Each pattern matches any single character that belongs to its class.
360</p>
361
362<p>
363If called with an argument <code>table</code>,
364then it creates those fields inside the given table and
365returns that table.
366</p>
367
368
369<h3><a name="op-len"></a><code>#patt</code></h3>
370<p>
371Returns a pattern that
372matches only if the input string matches <code>patt</code>,
373but without consuming any input,
374independently of success or failure.
375(This pattern is called an <em>and predicate</em>
376and it is equivalent to
377<em>&amp;patt</em> in the original PEG notation.)
378</p>
379
380
381<p>
382This pattern never produces any capture.
383</p>
384
385
386<h3><a name="op-unm"></a><code>-patt</code></h3>
387<p>
388Returns a pattern that
389matches only if the input string does not match <code>patt</code>.
390It does not consume any input,
391independently of success or failure.
392(This pattern is equivalent to
393<em>!patt</em> in the original PEG notation.)
394</p>
395
396<p>
397As an example, the pattern
398<code>-lpeg.P(1)</code> matches only the end of string.
399</p>
400
401<p>
402This pattern never produces any captures,
403because either <code>patt</code> fails
404or <code>-patt</code> fails.
405(A failing pattern never produces captures.)
406</p>
407
408
409<h3><a name="op-add"></a><code>patt1 + patt2</code></h3>
410<p>
411Returns a pattern equivalent to an <em>ordered choice</em>
412of <code>patt1</code> and <code>patt2</code>.
413(This is denoted by <em>patt1 / patt2</em> in the original PEG notation,
414not to be confused with the <code>/</code> operation in LPeg.)
415It matches either <code>patt1</code> or <code>patt2</code>,
416with no backtracking once one of them succeeds.
417The identity element for this operation is the pattern
418<code>lpeg.P(false)</code>,
419which always fails.
420</p>
421
422<p>
423If both <code>patt1</code> and <code>patt2</code> are
424character sets,
425this operation is equivalent to set union.
426</p>
427<pre class="example">
428lower = lpeg.R("az")
429upper = lpeg.R("AZ")
430letter = lower + upper
431</pre>
432
433
434<h3><a name="op-sub"></a><code>patt1 - patt2</code></h3>
435<p>
436Returns a pattern equivalent to <em>!patt2 patt1</em>.
437This pattern asserts that the input does not match
438<code>patt2</code> and then matches <code>patt1</code>.
439</p>
440
441<p>
442When successful,
443this pattern produces all captures from <code>patt1</code>.
444It never produces any capture from <code>patt2</code>
445(as either <code>patt2</code> fails or
446<code>patt1 - patt2</code> fails).
447</p>
448
449<p>
450If both <code>patt1</code> and <code>patt2</code> are
451character sets,
452this operation is equivalent to set difference.
453Note that <code>-patt</code> is equivalent to <code>"" - patt</code>
454(or <code>0 - patt</code>).
455If <code>patt</code> is a character set,
456<code>1 - patt</code> is its complement.
457</p>
458
459
460<h3><a name="op-mul"></a><code>patt1 * patt2</code></h3>
461<p>
462Returns a pattern that matches <code>patt1</code>
463and then matches <code>patt2</code>,
464starting where <code>patt1</code> finished.
465The identity element for this operation is the
466pattern <code>lpeg.P(true)</code>,
467which always succeeds.
468</p>
469
470<p>
471(LPeg uses the <code>*</code> operator
472[instead of the more obvious <code>..</code>]
473both because it has
474the right priority and because in formal languages it is
475common to use a dot for denoting concatenation.)
476</p>
477
478
479<h3><a name="op-pow"></a><code>patt^n</code></h3>
480<p>
481If <code>n</code> is nonnegative,
482this pattern is
483equivalent to <em>patt<sup>n</sup> patt*</em>:
484It matches <code>n</code> or more occurrences of <code>patt</code>.
485</p>
486
487<p>
488Otherwise, when <code>n</code> is negative,
489this pattern is equivalent to <em>(patt?)<sup>-n</sup></em>:
490It matches at most <code>|n|</code>
491occurrences of <code>patt</code>.
492</p>
493
494<p>
495In particular, <code>patt^0</code> is equivalent to <em>patt*</em>,
496<code>patt^1</code> is equivalent to <em>patt+</em>,
497and <code>patt^-1</code> is equivalent to <em>patt?</em>
498in the original PEG notation.
499</p>
500
501<p>
502In all cases,
503the resulting pattern is greedy with no backtracking
504(also called a <em>possessive</em> repetition).
505That is, it matches only the longest possible sequence
506of matches for <code>patt</code>.
507</p>
508
509
510
511<h2><a name="grammar">Grammars</a></h2>
512
513<p>
514With the use of Lua variables,
515it is possible to define patterns incrementally,
516with each new pattern using previously defined ones.
517However, this technique does not allow the definition of
518recursive patterns.
519For recursive patterns,
520we need real grammars.
521</p>
522
523<p>
524LPeg represents grammars with tables,
525where each entry is a rule.
526</p>
527
528<p>
529The call <code>lpeg.V(v)</code>
530creates a pattern that represents the nonterminal
531(or <em>variable</em>) with index <code>v</code> in a grammar.
532Because the grammar still does not exist when
533this function is evaluated,
534the result is an <em>open reference</em> to the respective rule.
535</p>
536
537<p>
538A table is <em>fixed</em> when it is converted to a pattern
539(either by calling <code>lpeg.P</code> or by using it wherein a
540pattern is expected).
541Then every open reference created by <code>lpeg.V(v)</code>
542is corrected to refer to the rule indexed by <code>v</code> in the table.
543</p>
544
545<p>
546When a table is fixed,
547the result is a pattern that matches its <em>initial rule</em>.
548The entry with index 1 in the table defines its initial rule.
549If that entry is a string,
550it is assumed to be the name of the initial rule.
551Otherwise, LPeg assumes that the entry 1 itself is the initial rule.
552</p>
553
554<p>
555As an example,
556the following grammar matches strings of a's and b's that
557have the same number of a's and b's:
558</p>
559<pre class="example">
560equalcount = lpeg.P{
561 "S"; -- initial rule name
562 S = "a" * lpeg.V"B" + "b" * lpeg.V"A" + "",
563 A = "a" * lpeg.V"S" + "b" * lpeg.V"A" * lpeg.V"A",
564 B = "b" * lpeg.V"S" + "a" * lpeg.V"B" * lpeg.V"B",
565} * -1
566</pre>
567<p>
568It is equivalent to the following grammar in standard PEG notation:
569</p>
570<pre class="example">
571 S &lt;- 'a' B / 'b' A / ''
572 A &lt;- 'a' S / 'b' A A
573 B &lt;- 'b' S / 'a' B B
574</pre>
575
576
577<h2><a name="captures">Captures</a></h2>
578
579<p>
580A <em>capture</em> is a pattern that produces values
581(the so called <em>semantic information</em>)
582according to what it matches.
583LPeg offers several kinds of captures,
584which produce values based on matches and combine these values to
585produce new values.
586Each capture may produce zero or more values.
587</p>
588
589<p>
590The following table summarizes the basic captures:
591</p>
592<table border="1">
593<tbody><tr><td><b>Operation</b></td><td><b>What it Produces</b></td></tr>
594<tr><td><a href="#cap-c"><code>lpeg.C(patt)</code></a></td>
595 <td>the match for <code>patt</code> plus all captures
596 made by <code>patt</code></td></tr>
597<tr><td><a href="#cap-arg"><code>lpeg.Carg(n)</code></a></td>
598 <td>the value of the n<sup>th</sup> extra argument to
599 <code>lpeg.match</code> (matches the empty string)</td></tr>
600<tr><td><a href="#cap-b"><code>lpeg.Cb(name)</code></a></td>
601 <td>the values produced by the previous
602 group capture named <code>name</code>
603 (matches the empty string)</td></tr>
604<tr><td><a href="#cap-cc"><code>lpeg.Cc(values)</code></a></td>
605 <td>the given values (matches the empty string)</td></tr>
606<tr><td><a href="#cap-f"><code>lpeg.Cf(patt, func)</code></a></td>
607 <td>a <em>folding</em> of the captures from <code>patt</code></td></tr>
608<tr><td><a href="#cap-g"><code>lpeg.Cg(patt [, name])</code></a></td>
609 <td>the values produced by <code>patt</code>,
610 optionally tagged with <code>name</code></td></tr>
611<tr><td><a href="#cap-p"><code>lpeg.Cp()</code></a></td>
612 <td>the current position (matches the empty string)</td></tr>
613<tr><td><a href="#cap-s"><code>lpeg.Cs(patt)</code></a></td>
614 <td>the match for <code>patt</code>
615 with the values from nested captures replacing their matches</td></tr>
616<tr><td><a href="#cap-t"><code>lpeg.Ct(patt)</code></a></td>
617 <td>a table with all captures from <code>patt</code></td></tr>
618<tr><td><a href="#cap-string"><code>patt / string</code></a></td>
619 <td><code>string</code>, with some marks replaced by captures
620 of <code>patt</code></td></tr>
621<tr><td><a href="#cap-num"><code>patt / number</code></a></td>
622 <td>the <code>number</code>-th value captured by <code>patt</code>,
623or no value when <code>number</code> is zero.</td></tr>
624<tr><td><a href="#cap-query"><code>patt / table</code></a></td>
625 <td><code>table[c]</code>, where <code>c</code> is the (first)
626 capture of <code>patt</code></td></tr>
627<tr><td><a href="#cap-func"><code>patt / function</code></a></td>
628 <td>the returns of <code>function</code> applied to the captures
629 of <code>patt</code></td></tr>
630<tr><td><a href="#matchtime"><code>lpeg.Cmt(patt, function)</code></a></td>
631 <td>the returns of <code>function</code> applied to the captures
632 of <code>patt</code>; the application is done at match time</td></tr>
633</tbody></table>
634
635<p>
636A capture pattern produces its values only when it succeeds.
637For instance,
638the pattern <code>lpeg.C(lpeg.P"a"^-1)</code>
639produces the empty string when there is no <code>"a"</code>
640(because the pattern <code>"a"?</code> succeeds),
641while the pattern <code>lpeg.C("a")^-1</code>
642does not produce any value when there is no <code>"a"</code>
643(because the pattern <code>"a"</code> fails).
644A pattern inside a loop or inside a recursive structure
645produces values for each match.
646</p>
647
648<p>
649Usually,
650LPeg does not specify when (and if) it evaluates its captures.
651(As an example,
652consider the pattern <code>lpeg.P"a" / func / 0</code>.
653Because the "division" by 0 instructs LPeg to throw away the
654results from the pattern,
655LPeg may or may not call <code>func</code>.)
656Therefore, captures should avoid side effects.
657Moreover,
658most captures cannot affect the way a pattern matches a subject.
659The only exception to this rule is the
660so-called <a href="#matchtime"><em>match-time capture</em></a>.
661When a match-time capture matches,
662it forces the immediate evaluation of all its nested captures
663and then calls its corresponding function,
664which defines whether the match succeeds and also
665what values are produced.
666</p>
667
668<h3><a name="cap-c"></a><code>lpeg.C (patt)</code></h3>
669<p>
670Creates a <em>simple capture</em>,
671which captures the substring of the subject that matches <code>patt</code>.
672The captured value is a string.
673If <code>patt</code> has other captures,
674their values are returned after this one.
675</p>
676
677
678<h3><a name="cap-arg"></a><code>lpeg.Carg (n)</code></h3>
679<p>
680Creates an <em>argument capture</em>.
681This pattern matches the empty string and
682produces the value given as the n<sup>th</sup> extra
683argument given in the call to <code>lpeg.match</code>.
684</p>
685
686
687<h3><a name="cap-b"></a><code>lpeg.Cb (name)</code></h3>
688<p>
689Creates a <em>back capture</em>.
690This pattern matches the empty string and
691produces the values produced by the <em>most recent</em>
692<a href="#cap-g">group capture</a> named <code>name</code>
693(where <code>name</code> can be any Lua value).
694</p>
695
696<p>
697<em>Most recent</em> means the last
698<em>complete</em>
699<em>outermost</em>
700group capture with the given name.
701A <em>Complete</em> capture means that the entire pattern
702corresponding to the capture has matched.
703An <em>Outermost</em> capture means that the capture is not inside
704another complete capture.
705</p>
706
707<p>
708In the same way that LPeg does not specify when it evaluates captures,
709it does not specify whether it reuses
710values previously produced by the group
711or re-evaluates them.
712</p>
713
714<h3><a name="cap-cc"></a><code>lpeg.Cc ([value, ...])</code></h3>
715<p>
716Creates a <em>constant capture</em>.
717This pattern matches the empty string and
718produces all given values as its captured values.
719</p>
720
721
722<h3><a name="cap-f"></a><code>lpeg.Cf (patt, func)</code></h3>
723<p>
724Creates a <em>fold capture</em>.
725If <code>patt</code> produces a list of captures
726<em>C<sub>1</sub> C<sub>2</sub> ... C<sub>n</sub></em>,
727this capture will produce the value
728<em>func(...func(func(C<sub>1</sub>, C<sub>2</sub>), C<sub>3</sub>)...,
729 C<sub>n</sub>)</em>,
730that is, it will <em>fold</em>
731(or <em>accumulate</em>, or <em>reduce</em>)
732the captures from <code>patt</code> using function <code>func</code>.
733</p>
734
735<p>
736This capture assumes that <code>patt</code> should produce
737at least one capture with at least one value (of any type),
738which becomes the initial value of an <em>accumulator</em>.
739(If you need a specific initial value,
740you may prefix a <a href="#cap-cc">constant capture</a> to <code>patt</code>.)
741For each subsequent capture,
742LPeg calls <code>func</code>
743with this accumulator as the first argument and all values produced
744by the capture as extra arguments;
745the first result from this call
746becomes the new value for the accumulator.
747The final value of the accumulator becomes the captured value.
748</p>
749
750<p>
751As an example,
752the following pattern matches a list of numbers separated
753by commas and returns their addition:
754</p>
755<pre class="example">
756-- matches a numeral and captures its numerical value
757number = lpeg.R"09"^1 / tonumber
758
759-- matches a list of numbers, capturing their values
760list = number * ("," * number)^0
761
762-- auxiliary function to add two numbers
763function add (acc, newvalue) return acc + newvalue end
764
765-- folds the list of numbers adding them
766sum = lpeg.Cf(list, add)
767
768-- example of use
769print(sum:match("10,30,43")) --&gt; 83
770</pre>
771
772
773<h3><a name="cap-g"></a><code>lpeg.Cg (patt [, name])</code></h3>
774<p>
775Creates a <em>group capture</em>.
776It groups all values returned by <code>patt</code>
777into a single capture.
778The group may be anonymous (if no name is given)
779or named with the given name
780(which can be any non-nil Lua value).
781</p>
782
783<p>
784An anonymous group serves to join values from several captures into
785a single capture.
786A named group has a different behavior.
787In most situations, a named group returns no values at all.
788Its values are only relevant for a following
789<a href="#cap-b">back capture</a> or when used
790inside a <a href="#cap-t">table capture</a>.
791</p>
792
793
794<h3><a name="cap-p"></a><code>lpeg.Cp ()</code></h3>
795<p>
796Creates a <em>position capture</em>.
797It matches the empty string and
798captures the position in the subject where the match occurs.
799The captured value is a number.
800</p>
801
802
803<h3><a name="cap-s"></a><code>lpeg.Cs (patt)</code></h3>
804<p>
805Creates a <em>substitution capture</em>,
806which captures the substring of the subject that matches <code>patt</code>,
807with <em>substitutions</em>.
808For any capture inside <code>patt</code> with a value,
809the substring that matched the capture is replaced by the capture value
810(which should be a string).
811The final captured value is the string resulting from
812all replacements.
813</p>
814
815
816<h3><a name="cap-t"></a><code>lpeg.Ct (patt)</code></h3>
817<p>
818Creates a <em>table capture</em>.
819This capture returns a table holding all values from all anonymous captures
820made by <code>patt</code>, stored in successive integer keys,
821starting at 1.
822Moreover,
823for each named capture group created by <code>patt</code>,
824the first value of the group is put into the table
825with the group name as its key.
826The captured value is only the table.
827</p>
828
829
830<h3><a name="cap-string"></a><code>patt / string</code></h3>
831<p>
832Creates a <em>string capture</em>.
833It creates a capture string based on <code>string</code>.
834The captured value is a copy of <code>string</code>,
835except that the character <code>%</code> works as an escape character:
836any sequence in <code>string</code> of the form <code>%<em>n</em></code>,
837with <em>n</em> between 1 and 9,
838stands for the match of the <em>n</em>-th capture in <code>patt</code>.
839The sequence <code>%0</code> stands for the whole match.
840The sequence <code>%%</code> stands for a single&nbsp;<code>%</code>.
841</p>
842
843
844<h3><a name="cap-num"></a><code>patt / number</code></h3>
845<p>
846Creates a <em>numbered capture</em>.
847For a non-zero number <em>n</em>,
848the captured value is the n-th value
849captured by <code>patt</code>.
850When <code>number</code> is zero,
851there are no captured values.
852</p>
853
854
855<h3><a name="cap-query"></a><code>patt / table</code></h3>
856<p>
857Creates a <em>query capture</em>.
858It indexes the given table using as key the first value captured by
859<code>patt</code>,
860or the whole match if <code>patt</code> produced no value.
861The value at that index is the final value of the capture.
862If the table does not have that key,
863there is no captured value.
864</p>
865
866
867<h3><a name="cap-func"></a><code>patt / function</code></h3>
868<p>
869Creates a <em>function capture</em>.
870It calls the given function passing all captures made by
871<code>patt</code> as arguments,
872or the whole match if <code>patt</code> made no capture.
873The values returned by the function
874are the final values of the capture.
875In particular,
876if <code>function</code> returns no value,
877there is no captured value.
878</p>
879
880
881<h3><a name="matchtime"></a><code>lpeg.Cmt(patt, function)</code></h3>
882<p>
883Creates a <em>match-time capture</em>.
884Unlike all other captures,
885this one is evaluated immediately when a match occurs
886(even if it is part of a larger pattern that fails later).
887It forces the immediate evaluation of all its nested captures
888and then calls <code>function</code>.
889</p>
890
891<p>
892The given function gets as arguments the entire subject,
893the current position (after the match of <code>patt</code>),
894plus any capture values produced by <code>patt</code>.
895</p>
896
897<p>
898The first value returned by <code>function</code>
899defines how the match happens.
900If the call returns a number,
901the match succeeds
902and the returned number becomes the new current position.
903(Assuming a subject <em>s</em> and current position <em>i</em>,
904the returned number must be in the range <em>[i, len(s) + 1]</em>.)
905If the call returns <b>true</b>,
906the match succeeds without consuming any input.
907(So, returning <b>true</b> is equivalent to returning <em>i</em>.)
908If the call returns <b>false</b>, <b>nil</b>, or no value,
909the match fails.
910</p>
911
912<p>
913Any extra values returned by the function become the
914values produced by the capture.
915</p>
916
917
918
919
920<h2><a name="ex">Some Examples</a></h2>
921
922<h3>Using a Pattern</h3>
923<p>
924This example shows a very simple but complete program
925that builds and uses a pattern:
926</p>
927<pre class="example">
928local lpeg = require "lpeg"
929
930-- matches a word followed by end-of-string
931p = lpeg.R"az"^1 * -1
932
933print(p:match("hello")) --> 6
934print(lpeg.match(p, "hello")) --> 6
935print(p:match("1 hello")) --> nil
936</pre>
937<p>
938The pattern is simply a sequence of one or more lower-case letters
939followed by the end of string (-1).
940The program calls <code>match</code> both as a method
941and as a function.
942In both successful cases,
943the match returns
944the index of the first character after the match,
945which is the string length plus one.
946</p>
947
948
949<h3>Name-value lists</h3>
950<p>
951This example parses a list of name-value pairs and returns a table
952with those pairs:
953</p>
954<pre class="example">
955lpeg.locale(lpeg) -- adds locale entries into 'lpeg' table
956
957local space = lpeg.space^0
958local name = lpeg.C(lpeg.alpha^1) * space
959local sep = lpeg.S(",;") * space
960local pair = lpeg.Cg(name * "=" * space * name) * sep^-1
961local list = lpeg.Cf(lpeg.Ct("") * pair^0, rawset)
962t = list:match("a=b, c = hi; next = pi") --> { a = "b", c = "hi", next = "pi" }
963</pre>
964<p>
965Each pair has the format <code>name = name</code> followed by
966an optional separator (a comma or a semicolon).
967The <code>pair</code> pattern encloses the pair in a group pattern,
968so that the names become the values of a single capture.
969The <code>list</code> pattern then folds these captures.
970It starts with an empty table,
971created by a table capture matching an empty string;
972then for each capture (a pair of names) it applies <code>rawset</code>
973over the accumulator (the table) and the capture values (the pair of names).
974<code>rawset</code> returns the table itself,
975so the accumulator is always the table.
976</p>
977
978<h3>Splitting a string</h3>
979<p>
980The following code builds a pattern that
981splits a string using a given pattern
982<code>sep</code> as a separator:
983</p>
984<pre class="example">
985function split (s, sep)
986 sep = lpeg.P(sep)
987 local elem = lpeg.C((1 - sep)^0)
988 local p = elem * (sep * elem)^0
989 return lpeg.match(p, s)
990end
991</pre>
992<p>
993First the function ensures that <code>sep</code> is a proper pattern.
994The pattern <code>elem</code> is a repetition of zero or more
995arbitrary characters as long as there is not a match against
996the separator.
997It also captures its match.
998The pattern <code>p</code> matches a list of elements separated
999by <code>sep</code>.
1000</p>
1001
1002<p>
1003If the split results in too many values,
1004it may overflow the maximum number of values
1005that can be returned by a Lua function.
1006In this case,
1007we can collect these values in a table:
1008</p>
1009<pre class="example">
1010function split (s, sep)
1011 sep = lpeg.P(sep)
1012 local elem = lpeg.C((1 - sep)^0)
1013 local p = lpeg.Ct(elem * (sep * elem)^0) -- make a table capture
1014 return lpeg.match(p, s)
1015end
1016</pre>
1017
1018
1019<h3>Searching for a pattern</h3>
1020<p>
1021The primitive <code>match</code> works only in anchored mode.
1022If we want to find a pattern anywhere in a string,
1023we must write a pattern that matches anywhere.
1024</p>
1025
1026<p>
1027Because patterns are composable,
1028we can write a function that,
1029given any arbitrary pattern <code>p</code>,
1030returns a new pattern that searches for <code>p</code>
1031anywhere in a string.
1032There are several ways to do the search.
1033One way is like this:
1034</p>
1035<pre class="example">
1036function anywhere (p)
1037 return lpeg.P{ p + 1 * lpeg.V(1) }
1038end
1039</pre>
1040<p>
1041This grammar has a straight reading:
1042it matches <code>p</code> or skips one character and tries again.
1043</p>
1044
1045<p>
1046If we want to know where the pattern is in the string
1047(instead of knowing only that it is there somewhere),
1048we can add position captures to the pattern:
1049</p>
1050<pre class="example">
1051local I = lpeg.Cp()
1052function anywhere (p)
1053 return lpeg.P{ I * p * I + 1 * lpeg.V(1) }
1054end
1055
1056print(anywhere("world"):match("hello world!")) --> 7 12
1057</pre>
1058
1059<p>
1060Another option for the search is like this:
1061</p>
1062<pre class="example">
1063local I = lpeg.Cp()
1064function anywhere (p)
1065 return (1 - lpeg.P(p))^0 * I * p * I
1066end
1067</pre>
1068<p>
1069Again the pattern has a straight reading:
1070it skips as many characters as possible while not matching <code>p</code>,
1071and then matches <code>p</code> (plus appropriate captures).
1072</p>
1073
1074<p>
1075If we want to look for a pattern only at word boundaries,
1076we can use the following transformer:
1077</p>
1078
1079<pre class="example">
1080local t = lpeg.locale()
1081
1082function atwordboundary (p)
1083 return lpeg.P{
1084 [1] = p + t.alpha^0 * (1 - t.alpha)^1 * lpeg.V(1)
1085 }
1086end
1087</pre>
1088
1089
1090<h3><a name="balanced"></a>Balanced parentheses</h3>
1091<p>
1092The following pattern matches only strings with balanced parentheses:
1093</p>
1094<pre class="example">
1095b = lpeg.P{ "(" * ((1 - lpeg.S"()") + lpeg.V(1))^0 * ")" }
1096</pre>
1097<p>
1098Reading the first (and only) rule of the given grammar,
1099we have that a balanced string is
1100an open parenthesis,
1101followed by zero or more repetitions of either
1102a non-parenthesis character or
1103a balanced string (<code>lpeg.V(1)</code>),
1104followed by a closing parenthesis.
1105</p>
1106
1107
1108<h3>Global substitution</h3>
1109<p>
1110The next example does a job somewhat similar to <code>string.gsub</code>.
1111It receives a pattern and a replacement value,
1112and substitutes the replacement value for all occurrences of the pattern
1113in a given string:
1114</p>
1115<pre class="example">
1116function gsub (s, patt, repl)
1117 patt = lpeg.P(patt)
1118 patt = lpeg.Cs((patt / repl + 1)^0)
1119 return lpeg.match(patt, s)
1120end
1121</pre>
1122<p>
1123As in <code>string.gsub</code>,
1124the replacement value can be a string,
1125a function, or a table.
1126</p>
1127
1128
1129<h3><a name="CSV"></a>Comma-Separated Values (CSV)</h3>
1130<p>
1131This example breaks a string into comma-separated values,
1132returning all fields:
1133</p>
1134<pre class="example">
1135local field = '"' * lpeg.Cs(((lpeg.P(1) - '"') + lpeg.P'""' / '"')^0) * '"' +
1136 lpeg.C((1 - lpeg.S',\n"')^0)
1137
1138local record = field * (',' * field)^0 * (lpeg.P'\n' + -1)
1139
1140function csv (s)
1141 return lpeg.match(record, s)
1142end
1143</pre>
1144<p>
1145A field is either a quoted field
1146(which may contain any character except an individual quote,
1147which may be written as two quotes that are replaced by one)
1148or an unquoted field
1149(which cannot contain commas, newlines, or quotes).
1150A record is a list of fields separated by commas,
1151ending with a newline or the string end (-1).
1152</p>
1153
1154<p>
1155As it is,
1156the previous pattern returns each field as a separated result.
1157If we add a table capture in the definition of <code>record</code>,
1158the pattern will return instead a single table
1159containing all fields:
1160</p>
1161<pre class="example">
1162local record = lpeg.Ct(field * (',' * field)^0) * (lpeg.P'\n' + -1)
1163</pre>
1164
1165
1166<h3>UTF-8 and Latin 1</h3>
1167<p>
1168It is not difficult to use LPeg to convert a string from
1169UTF-8 encoding to Latin 1 (ISO 8859-1):
1170</p>
1171
1172<pre class="example">
1173-- convert a two-byte UTF-8 sequence to a Latin 1 character
1174local function f2 (s)
1175 local c1, c2 = string.byte(s, 1, 2)
1176 return string.char(c1 * 64 + c2 - 12416)
1177end
1178
1179local utf8 = lpeg.R("\0\127")
1180 + lpeg.R("\194\195") * lpeg.R("\128\191") / f2
1181
1182local decode_pattern = lpeg.Cs(utf8^0) * -1
1183</pre>
1184<p>
1185In this code,
1186the definition of UTF-8 is already restricted to the
1187Latin 1 range (from 0 to 255).
1188Any encoding outside this range (as well as any invalid encoding)
1189will not match that pattern.
1190</p>
1191
1192<p>
1193As the definition of <code>decode_pattern</code> demands that
1194the pattern matches the whole input (because of the -1 at its end),
1195any invalid string will simply fail to match,
1196without any useful information about the problem.
1197We can improve this situation redefining <code>decode_pattern</code>
1198as follows:
1199</p>
1200<pre class="example">
1201local function er (_, i) error("invalid encoding at position " .. i) end
1202
1203local decode_pattern = lpeg.Cs(utf8^0) * (-1 + lpeg.P(er))
1204</pre>
1205<p>
1206Now, if the pattern <code>utf8^0</code> stops
1207before the end of the string,
1208an appropriate error function is called.
1209</p>
1210
1211
1212<h3>UTF-8 and Unicode</h3>
1213<p>
1214We can extend the previous patterns to handle all Unicode code points.
1215Of course,
1216we cannot translate them to Latin 1 or any other one-byte encoding.
1217Instead, our translation results in an array with the code points
1218represented as numbers.
1219The full code is here:
1220</p>
1221<pre class="example">
1222-- decode a two-byte UTF-8 sequence
1223local function f2 (s)
1224 local c1, c2 = string.byte(s, 1, 2)
1225 return c1 * 64 + c2 - 12416
1226end
1227
1228-- decode a three-byte UTF-8 sequence
1229local function f3 (s)
1230 local c1, c2, c3 = string.byte(s, 1, 3)
1231 return (c1 * 64 + c2) * 64 + c3 - 925824
1232end
1233
1234-- decode a four-byte UTF-8 sequence
1235local function f4 (s)
1236 local c1, c2, c3, c4 = string.byte(s, 1, 4)
1237 return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168
1238end
1239
1240local cont = lpeg.R("\128\191") -- continuation byte
1241
1242local utf8 = lpeg.R("\0\127") / string.byte
1243 + lpeg.R("\194\223") * cont / f2
1244 + lpeg.R("\224\239") * cont * cont / f3
1245 + lpeg.R("\240\244") * cont * cont * cont / f4
1246
1247local decode_pattern = lpeg.Ct(utf8^0) * -1
1248</pre>
1249
1250
1251<h3>Lua's long strings</h3>
1252<p>
1253A long string in Lua starts with the pattern <code>[=*[</code>
1254and ends at the first occurrence of <code>]=*]</code> with
1255exactly the same number of equal signs.
1256If the opening brackets are followed by a newline,
1257this newline is discarded
1258(that is, it is not part of the string).
1259</p>
1260
1261<p>
1262To match a long string in Lua,
1263the pattern must capture the first repetition of equal signs and then,
1264whenever it finds a candidate for closing the string,
1265check whether it has the same number of equal signs.
1266</p>
1267
1268<pre class="example">
1269equals = lpeg.P"="^0
1270open = "[" * lpeg.Cg(equals, "init") * "[" * lpeg.P"\n"^-1
1271close = "]" * lpeg.C(equals) * "]"
1272closeeq = lpeg.Cmt(close * lpeg.Cb("init"), function (s, i, a, b) return a == b end)
1273string = open * lpeg.C((lpeg.P(1) - closeeq)^0) * close / 1
1274</pre>
1275
1276<p>
1277The <code>open</code> pattern matches <code>[=*[</code>,
1278capturing the repetitions of equal signs in a group named <code>init</code>;
1279it also discards an optional newline, if present.
1280The <code>close</code> pattern matches <code>]=*]</code>,
1281also capturing the repetitions of equal signs.
1282The <code>closeeq</code> pattern first matches <code>close</code>;
1283then it uses a back capture to recover the capture made
1284by the previous <code>open</code>,
1285which is named <code>init</code>;
1286finally it uses a match-time capture to check
1287whether both captures are equal.
1288The <code>string</code> pattern starts with an <code>open</code>,
1289then it goes as far as possible until matching <code>closeeq</code>,
1290and then matches the final <code>close</code>.
1291The final numbered capture simply discards
1292the capture made by <code>close</code>.
1293</p>
1294
1295
1296<h3>Arithmetic expressions</h3>
1297<p>
1298This example is a complete parser and evaluator for simple
1299arithmetic expressions.
1300We write it in two styles.
1301The first approach first builds a syntax tree and then
1302traverses this tree to compute the expression value:
1303</p>
1304<pre class="example">
1305-- Lexical Elements
1306local Space = lpeg.S(" \n\t")^0
1307local Number = lpeg.C(lpeg.P"-"^-1 * lpeg.R("09")^1) * Space
1308local TermOp = lpeg.C(lpeg.S("+-")) * Space
1309local FactorOp = lpeg.C(lpeg.S("*/")) * Space
1310local Open = "(" * Space
1311local Close = ")" * Space
1312
1313-- Grammar
1314local Exp, Term, Factor = lpeg.V"Exp", lpeg.V"Term", lpeg.V"Factor"
1315G = lpeg.P{ Exp,
1316 Exp = lpeg.Ct(Term * (TermOp * Term)^0);
1317 Term = lpeg.Ct(Factor * (FactorOp * Factor)^0);
1318 Factor = Number + Open * Exp * Close;
1319}
1320
1321G = Space * G * -1
1322
1323-- Evaluator
1324function eval (x)
1325 if type(x) == "string" then
1326 return tonumber(x)
1327 else
1328 local op1 = eval(x[1])
1329 for i = 2, #x, 2 do
1330 local op = x[i]
1331 local op2 = eval(x[i + 1])
1332 if (op == "+") then op1 = op1 + op2
1333 elseif (op == "-") then op1 = op1 - op2
1334 elseif (op == "*") then op1 = op1 * op2
1335 elseif (op == "/") then op1 = op1 / op2
1336 end
1337 end
1338 return op1
1339 end
1340end
1341
1342-- Parser/Evaluator
1343function evalExp (s)
1344 local t = lpeg.match(G, s)
1345 if not t then error("syntax error", 2) end
1346 return eval(t)
1347end
1348
1349-- small example
1350print(evalExp"3 + 5*9 / (1+1) - 12") --> 13.5
1351</pre>
1352
1353<p>
1354The second style computes the expression value on the fly,
1355without building the syntax tree.
1356The following grammar takes this approach.
1357(It assumes the same lexical elements as before.)
1358</p>
1359<pre class="example">
1360-- Auxiliary function
1361function eval (v1, op, v2)
1362 if (op == "+") then return v1 + v2
1363 elseif (op == "-") then return v1 - v2
1364 elseif (op == "*") then return v1 * v2
1365 elseif (op == "/") then return v1 / v2
1366 end
1367end
1368
1369-- Grammar
1370local V = lpeg.V
1371G = lpeg.P{ "Exp",
1372 Exp = lpeg.Cf(V"Term" * lpeg.Cg(TermOp * V"Term")^0, eval);
1373 Term = lpeg.Cf(V"Factor" * lpeg.Cg(FactorOp * V"Factor")^0, eval);
1374 Factor = Number / tonumber + Open * V"Exp" * Close;
1375}
1376
1377-- small example
1378print(lpeg.match(G, "3 + 5*9 / (1+1) - 12")) --> 13.5
1379</pre>
1380<p>
1381Note the use of the fold (accumulator) capture.
1382To compute the value of an expression,
1383the accumulator starts with the value of the first term,
1384and then applies <code>eval</code> over
1385the accumulator, the operator,
1386and the new term for each repetition.
1387</p>
1388
1389
1390
1391<h2><a name="download"></a>Download</h2>
1392
1393<p>LPeg
1394<a href="http://www.inf.puc-rio.br/~roberto/lpeg/lpeg-1.0.1.tar.gz">source code</a>.</p>
1395
1396
1397<h2><a name="license">License</a></h2>
1398
1399<p>
1400Copyright &copy; 2007-2017 Lua.org, PUC-Rio.
1401</p>
1402<p>
1403Permission is hereby granted, free of charge,
1404to any person obtaining a copy of this software and
1405associated documentation files (the "Software"),
1406to deal in the Software without restriction,
1407including without limitation the rights to use,
1408copy, modify, merge, publish, distribute, sublicense,
1409and/or sell copies of the Software,
1410and to permit persons to whom the Software is
1411furnished to do so,
1412subject to the following conditions:
1413</p>
1414
1415<p>
1416The above copyright notice and this permission notice
1417shall be included in all copies or substantial portions of the Software.
1418</p>
1419
1420<p>
1421THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
1422EXPRESS OR IMPLIED,
1423INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1424FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
1425IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
1426DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
1427TORT OR OTHERWISE, ARISING FROM,
1428OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
1429THE SOFTWARE.
1430</p>
1431
1432</div> <!-- id="content" -->
1433
1434</div> <!-- id="main" -->
1435
1436<div id="about">
1437<p><small>
1438$Id: lpeg.html,v 1.77 2017/01/13 13:40:05 roberto Exp $
1439</small></p>
1440</div> <!-- id="about" -->
1441
1442</div> <!-- id="container" -->
1443
1444</body>
1445</html>
diff --git a/lpprint.c b/lpprint.c
new file mode 100644
index 0000000..f7be408
--- /dev/null
+++ b/lpprint.c
@@ -0,0 +1,244 @@
1/*
2** $Id: lpprint.c,v 1.10 2016/09/13 16:06:03 roberto Exp $
3** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license)
4*/
5
6#include <ctype.h>
7#include <limits.h>
8#include <stdio.h>
9
10
11#include "lptypes.h"
12#include "lpprint.h"
13#include "lpcode.h"
14
15
16#if defined(LPEG_DEBUG)
17
18/*
19** {======================================================
20** Printing patterns (for debugging)
21** =======================================================
22*/
23
24
25void printcharset (const byte *st) {
26 int i;
27 printf("[");
28 for (i = 0; i <= UCHAR_MAX; i++) {
29 int first = i;
30 while (testchar(st, i) && i <= UCHAR_MAX) i++;
31 if (i - 1 == first) /* unary range? */
32 printf("(%02x)", first);
33 else if (i - 1 > first) /* non-empty range? */
34 printf("(%02x-%02x)", first, i - 1);
35 }
36 printf("]");
37}
38
39
/*
** Return a printable name for capture kind 'kind'. Kinds outside
** the table yield "(unknown)" instead of indexing past its end.
*/
static const char *capkind (int kind) {
  /* 'static': the table is built once, not at every call */
  static const char *const modes[] = {
    "close", "position", "constant", "backref",
    "argument", "simple", "table", "function",
    "query", "string", "num", "substitution", "fold",
    "runtime", "group"};
  if (kind < 0 || kind >= (int)(sizeof(modes) / sizeof(modes[0])))
    return "(unknown)";
  return modes[kind];
}
48
49
50static void printjmp (const Instruction *op, const Instruction *p) {
51 printf("-> %d", (int)(p + (p + 1)->offset - op));
52}
53
54
55void printinst (const Instruction *op, const Instruction *p) {
56 const char *const names[] = {
57 "any", "char", "set",
58 "testany", "testchar", "testset",
59 "span", "behind",
60 "ret", "end",
61 "choice", "jmp", "call", "open_call",
62 "commit", "partial_commit", "back_commit", "failtwice", "fail", "giveup",
63 "fullcapture", "opencapture", "closecapture", "closeruntime"
64 };
65 printf("%02ld: %s ", (long)(p - op), names[p->i.code]);
66 switch ((Opcode)p->i.code) {
67 case IChar: {
68 printf("'%c'", p->i.aux);
69 break;
70 }
71 case ITestChar: {
72 printf("'%c'", p->i.aux); printjmp(op, p);
73 break;
74 }
75 case IFullCapture: {
76 printf("%s (size = %d) (idx = %d)",
77 capkind(getkind(p)), getoff(p), p->i.key);
78 break;
79 }
80 case IOpenCapture: {
81 printf("%s (idx = %d)", capkind(getkind(p)), p->i.key);
82 break;
83 }
84 case ISet: {
85 printcharset((p+1)->buff);
86 break;
87 }
88 case ITestSet: {
89 printcharset((p+2)->buff); printjmp(op, p);
90 break;
91 }
92 case ISpan: {
93 printcharset((p+1)->buff);
94 break;
95 }
96 case IOpenCall: {
97 printf("-> %d", (p + 1)->offset);
98 break;
99 }
100 case IBehind: {
101 printf("%d", p->i.aux);
102 break;
103 }
104 case IJmp: case ICall: case ICommit: case IChoice:
105 case IPartialCommit: case IBackCommit: case ITestAny: {
106 printjmp(op, p);
107 break;
108 }
109 default: break;
110 }
111 printf("\n");
112}
113
114
115void printpatt (Instruction *p, int n) {
116 Instruction *op = p;
117 while (p < op + n) {
118 printinst(op, p);
119 p += sizei(p);
120 }
121}
122
123
124#if defined(LPEG_DEBUG)
125static void printcap (Capture *cap) {
126 printf("%s (idx: %d - size: %d) -> %p\n",
127 capkind(cap->kind), cap->idx, cap->siz, cap->s);
128}
129
130
131void printcaplist (Capture *cap, Capture *limit) {
132 printf(">======\n");
133 for (; cap->s && (limit == NULL || cap < limit); cap++)
134 printcap(cap);
135 printf("=======\n");
136}
137#endif
138
139/* }====================================================== */
140
141
142/*
143** {======================================================
144** Printing trees (for debugging)
145** =======================================================
146*/
147
/*
** Printable names for tree tags, indexed by tag ('printtree' uses
** 'tagnames[tree->tag]'); the order must follow the tag numbering.
*/
static const char *tagnames[] = {
  "char",
  "set",
  "any",
  "true",
  "false",
  "rep",
  "seq",
  "choice",
  "not",
  "and",
  "call",
  "opencall",
  "rule",
  "grammar",
  "behind",
  "capture",
  "run-time"
};
158
159
160void printtree (TTree *tree, int ident) {
161 int i;
162 for (i = 0; i < ident; i++) printf(" ");
163 printf("%s", tagnames[tree->tag]);
164 switch (tree->tag) {
165 case TChar: {
166 int c = tree->u.n;
167 if (isprint(c))
168 printf(" '%c'\n", c);
169 else
170 printf(" (%02X)\n", c);
171 break;
172 }
173 case TSet: {
174 printcharset(treebuffer(tree));
175 printf("\n");
176 break;
177 }
178 case TOpenCall: case TCall: {
179 assert(sib2(tree)->tag == TRule);
180 printf(" key: %d (rule: %d)\n", tree->key, sib2(tree)->cap);
181 break;
182 }
183 case TBehind: {
184 printf(" %d\n", tree->u.n);
185 printtree(sib1(tree), ident + 2);
186 break;
187 }
188 case TCapture: {
189 printf(" kind: '%s' key: %d\n", capkind(tree->cap), tree->key);
190 printtree(sib1(tree), ident + 2);
191 break;
192 }
193 case TRule: {
194 printf(" n: %d key: %d\n", tree->cap, tree->key);
195 printtree(sib1(tree), ident + 2);
196 break; /* do not print next rule as a sibling */
197 }
198 case TGrammar: {
199 TTree *rule = sib1(tree);
200 printf(" %d\n", tree->u.n); /* number of rules */
201 for (i = 0; i < tree->u.n; i++) {
202 printtree(rule, ident + 2);
203 rule = sib2(rule);
204 }
205 assert(rule->tag == TTrue); /* sentinel */
206 break;
207 }
208 default: {
209 int sibs = numsiblings[tree->tag];
210 printf("\n");
211 if (sibs >= 1) {
212 printtree(sib1(tree), ident + 2);
213 if (sibs >= 2)
214 printtree(sib2(tree), ident + 2);
215 }
216 break;
217 }
218 }
219}
220
221
222void printktable (lua_State *L, int idx) {
223 int n, i;
224 lua_getuservalue(L, idx);
225 if (lua_isnil(L, -1)) /* no ktable? */
226 return;
227 n = lua_rawlen(L, -1);
228 printf("[");
229 for (i = 1; i <= n; i++) {
230 printf("%d = ", i);
231 lua_rawgeti(L, -1, i);
232 if (lua_isstring(L, -1))
233 printf("%s ", lua_tostring(L, -1));
234 else
235 printf("%s ", lua_typename(L, lua_type(L, -1)));
236 lua_pop(L, 1);
237 }
238 printf("]\n");
239 /* leave ktable at the stack */
240}
241
242/* }====================================================== */
243
244#endif
diff --git a/lpprint.h b/lpprint.h
new file mode 100644
index 0000000..6329760
--- /dev/null
+++ b/lpprint.h
@@ -0,0 +1,36 @@
1/*
2** $Id: lpprint.h,v 1.2 2015/06/12 18:18:08 roberto Exp $
3*/
4
5
6#if !defined(lpprint_h)
7#define lpprint_h
8
9
10#include "lptree.h"
11#include "lpvm.h"
12
13
#if defined(LPEG_DEBUG)

/* debug build: the real printing functions */
void printcharset (const byte *st);
void printinst (const Instruction *op, const Instruction *p);
void printpatt (Instruction *p, int n);
void printcaplist (Capture *cap, Capture *limit);
void printtree (TTree *tree, int ident);
void printktable (lua_State *L, int idx);

#else

/*
** non-debug build: each entry point expands to a call that raises a
** Lua error. ('printtree' and 'printpatt' use a variable 'L' that
** must be in scope where the macro is expanded.)
*/
#define printpatt(p,n)  \
	luaL_error(L, "function only implemented in debug mode")
#define printtree(tree,i)  \
	luaL_error(L, "function only implemented in debug mode")
#define printktable(L,idx)  \
	luaL_error(L, "function only implemented in debug mode")

#endif
33
34
35#endif
36
diff --git a/lptree.c b/lptree.c
new file mode 100644
index 0000000..37bfaf0
--- /dev/null
+++ b/lptree.c
@@ -0,0 +1,1305 @@
1/*
2** $Id: lptree.c,v 1.23 2017/12/14 15:30:04 roberto Exp $
3** Copyright 2013, Lua.org & PUC-Rio (see 'lpeg.html' for license)
4*/
5
6#include <ctype.h>
7#include <limits.h>
8#include <string.h>
9
10
11#include "lua.h"
12#include "lauxlib.h"
13
14#include "lptypes.h"
15#include "lpcap.h"
16#include "lpcode.h"
17#include "lpprint.h"
18#include "lptree.h"
19
20
21/* number of siblings for each tree */
22const byte numsiblings[] = {
23 0, 0, 0, /* char, set, any */
24 0, 0, /* true, false */
25 1, /* rep */
26 2, 2, /* seq, choice */
27 1, 1, /* not, and */
28 0, 0, 2, 1, /* call, opencall, rule, grammar */
29 1, /* behind */
30 1, 1 /* capture, runtime capture */
31};
32
33
34static TTree *newgrammar (lua_State *L, int arg);
35
36
37/*
38** returns a reasonable name for value at index 'idx' on the stack
39*/
40static const char *val2str (lua_State *L, int idx) {
41 const char *k = lua_tostring(L, idx);
42 if (k != NULL)
43 return lua_pushfstring(L, "%s", k);
44 else
45 return lua_pushfstring(L, "(a %s)", luaL_typename(L, idx));
46}
47
48
49/*
50** Fix a TOpenCall into a TCall node, using table 'postable' to
51** translate a key to its rule address in the tree. Raises an
52** error if key does not exist.
53*/
54static void fixonecall (lua_State *L, int postable, TTree *g, TTree *t) {
55 int n;
56 lua_rawgeti(L, -1, t->key); /* get rule's name */
57 lua_gettable(L, postable); /* query name in position table */
58 n = lua_tonumber(L, -1); /* get (absolute) position */
59 lua_pop(L, 1); /* remove position */
60 if (n == 0) { /* no position? */
61 lua_rawgeti(L, -1, t->key); /* get rule's name again */
62 luaL_error(L, "rule '%s' undefined in given grammar", val2str(L, -1));
63 }
64 t->tag = TCall;
65 t->u.ps = n - (t - g); /* position relative to node */
66 assert(sib2(t)->tag == TRule);
67 sib2(t)->key = t->key; /* fix rule's key */
68}
69
70
71/*
72** Transform left associative constructions into right
73** associative ones, for sequence and choice; that is:
74** (t11 + t12) + t2 => t11 + (t12 + t2)
75** (t11 * t12) * t2 => t11 * (t12 * t2)
76** (that is, Op (Op t11 t12) t2 => Op t11 (Op t12 t2))
77*/
78static void correctassociativity (TTree *tree) {
79 TTree *t1 = sib1(tree);
80 assert(tree->tag == TChoice || tree->tag == TSeq);
81 while (t1->tag == tree->tag) {
82 int n1size = tree->u.ps - 1; /* t1 == Op t11 t12 */
83 int n11size = t1->u.ps - 1;
84 int n12size = n1size - n11size - 1;
85 memmove(sib1(tree), sib1(t1), n11size * sizeof(TTree)); /* move t11 */
86 tree->u.ps = n11size + 1;
87 sib2(tree)->tag = tree->tag;
88 sib2(tree)->u.ps = n12size + 1;
89 }
90}
91
92
93/*
94** Make final adjustments in a tree. Fix open calls in tree 't',
95** making them refer to their respective rules or raising appropriate
96** errors (if not inside a grammar). Correct associativity of associative
97** constructions (making them right associative). Assume that tree's
98** ktable is at the top of the stack (for error messages).
99*/
100static void finalfix (lua_State *L, int postable, TTree *g, TTree *t) {
101 tailcall:
102 switch (t->tag) {
103 case TGrammar: /* subgrammars were already fixed */
104 return;
105 case TOpenCall: {
106 if (g != NULL) /* inside a grammar? */
107 fixonecall(L, postable, g, t);
108 else { /* open call outside grammar */
109 lua_rawgeti(L, -1, t->key);
110 luaL_error(L, "rule '%s' used outside a grammar", val2str(L, -1));
111 }
112 break;
113 }
114 case TSeq: case TChoice:
115 correctassociativity(t);
116 break;
117 }
118 switch (numsiblings[t->tag]) {
119 case 1: /* finalfix(L, postable, g, sib1(t)); */
120 t = sib1(t); goto tailcall;
121 case 2:
122 finalfix(L, postable, g, sib1(t));
123 t = sib2(t); goto tailcall; /* finalfix(L, postable, g, sib2(t)); */
124 default: assert(numsiblings[t->tag] == 0); break;
125 }
126}
127
128
129
130/*
131** {===================================================================
132** KTable manipulation
133**
134** - The ktable of a pattern 'p' can be shared by other patterns that
135** contain 'p' and no other constants. Because of this sharing, we
136** should not add elements to a 'ktable' unless it was freshly created
137** for the new pattern.
138**
139** - The maximum index in a ktable is USHRT_MAX, because trees and
140** patterns use unsigned shorts to store those indices.
141** ====================================================================
142*/
143
144/*
145** Create a new 'ktable' to the pattern at the top of the stack.
146*/
147static void newktable (lua_State *L, int n) {
148 lua_createtable(L, n, 0); /* create a fresh table */
149 lua_setuservalue(L, -2); /* set it as 'ktable' for pattern */
150}
151
152
153/*
154** Add element 'idx' to 'ktable' of pattern at the top of the stack;
155** Return index of new element.
156** If new element is nil, does not add it to table (as it would be
157** useless) and returns 0, as ktable[0] is always nil.
158*/
159static int addtoktable (lua_State *L, int idx) {
160 if (lua_isnil(L, idx)) /* nil value? */
161 return 0;
162 else {
163 int n;
164 lua_getuservalue(L, -1); /* get ktable from pattern */
165 n = lua_rawlen(L, -1);
166 if (n >= USHRT_MAX)
167 luaL_error(L, "too many Lua values in pattern");
168 lua_pushvalue(L, idx); /* element to be added */
169 lua_rawseti(L, -2, ++n);
170 lua_pop(L, 1); /* remove 'ktable' */
171 return n;
172 }
173}
174
175
176/*
177** Return the number of elements in the ktable at 'idx'.
178** In Lua 5.2/5.3, default "environment" for patterns is nil, not
179** a table. Treat it as an empty table. In Lua 5.1, assumes that
180** the environment has no numeric indices (len == 0)
181*/
182static int ktablelen (lua_State *L, int idx) {
183 if (!lua_istable(L, idx)) return 0;
184 else return lua_rawlen(L, idx);
185}
186
187
188/*
189** Concatentate the contents of table 'idx1' into table 'idx2'.
190** (Assume that both indices are negative.)
191** Return the original length of table 'idx2' (or 0, if no
192** element was added, as there is no need to correct any index).
193*/
194static int concattable (lua_State *L, int idx1, int idx2) {
195 int i;
196 int n1 = ktablelen(L, idx1);
197 int n2 = ktablelen(L, idx2);
198 if (n1 + n2 > USHRT_MAX)
199 luaL_error(L, "too many Lua values in pattern");
200 if (n1 == 0) return 0; /* nothing to correct */
201 for (i = 1; i <= n1; i++) {
202 lua_rawgeti(L, idx1, i);
203 lua_rawseti(L, idx2 - 1, n2 + i); /* correct 'idx2' */
204 }
205 return n2;
206}
207
208
209/*
210** When joining 'ktables', constants from one of the subpatterns must
211** be renumbered; 'correctkeys' corrects their indices (adding 'n'
212** to each of them)
213*/
214static void correctkeys (TTree *tree, int n) {
215 if (n == 0) return; /* no correction? */
216 tailcall:
217 switch (tree->tag) {
218 case TOpenCall: case TCall: case TRunTime: case TRule: {
219 if (tree->key > 0)
220 tree->key += n;
221 break;
222 }
223 case TCapture: {
224 if (tree->key > 0 && tree->cap != Carg && tree->cap != Cnum)
225 tree->key += n;
226 break;
227 }
228 default: break;
229 }
230 switch (numsiblings[tree->tag]) {
231 case 1: /* correctkeys(sib1(tree), n); */
232 tree = sib1(tree); goto tailcall;
233 case 2:
234 correctkeys(sib1(tree), n);
235 tree = sib2(tree); goto tailcall; /* correctkeys(sib2(tree), n); */
236 default: assert(numsiblings[tree->tag] == 0); break;
237 }
238}
239
240
241/*
242** Join the ktables from p1 and p2 the ktable for the new pattern at the
243** top of the stack, reusing them when possible.
244*/
245static void joinktables (lua_State *L, int p1, TTree *t2, int p2) {
246 int n1, n2;
247 lua_getuservalue(L, p1); /* get ktables */
248 lua_getuservalue(L, p2);
249 n1 = ktablelen(L, -2);
250 n2 = ktablelen(L, -1);
251 if (n1 == 0 && n2 == 0) /* are both tables empty? */
252 lua_pop(L, 2); /* nothing to be done; pop tables */
253 else if (n2 == 0 || lp_equal(L, -2, -1)) { /* 2nd table empty or equal? */
254 lua_pop(L, 1); /* pop 2nd table */
255 lua_setuservalue(L, -2); /* set 1st ktable into new pattern */
256 }
257 else if (n1 == 0) { /* first table is empty? */
258 lua_setuservalue(L, -3); /* set 2nd table into new pattern */
259 lua_pop(L, 1); /* pop 1st table */
260 }
261 else {
262 lua_createtable(L, n1 + n2, 0); /* create ktable for new pattern */
263 /* stack: new p; ktable p1; ktable p2; new ktable */
264 concattable(L, -3, -1); /* from p1 into new ktable */
265 concattable(L, -2, -1); /* from p2 into new ktable */
266 lua_setuservalue(L, -4); /* new ktable becomes 'p' environment */
267 lua_pop(L, 2); /* pop other ktables */
268 correctkeys(t2, n1); /* correction for indices from p2 */
269 }
270}
271
272
273/*
274** copy 'ktable' of element 'idx' to new tree (on top of stack)
275*/
276static void copyktable (lua_State *L, int idx) {
277 lua_getuservalue(L, idx);
278 lua_setuservalue(L, -2);
279}
280
281
282/*
283** merge 'ktable' from 'stree' at stack index 'idx' into 'ktable'
284** from tree at the top of the stack, and correct corresponding
285** tree.
286*/
287static void mergektable (lua_State *L, int idx, TTree *stree) {
288 int n;
289 lua_getuservalue(L, -1); /* get ktables */
290 lua_getuservalue(L, idx);
291 n = concattable(L, -1, -2);
292 lua_pop(L, 2); /* remove both ktables */
293 correctkeys(stree, n);
294}
295
296
297/*
298** Create a new 'ktable' to the pattern at the top of the stack, adding
299** all elements from pattern 'p' (if not 0) plus element 'idx' to it.
300** Return index of new element.
301*/
302static int addtonewktable (lua_State *L, int p, int idx) {
303 newktable(L, 1);
304 if (p)
305 mergektable(L, p, NULL);
306 return addtoktable(L, idx);
307}
308
309/* }====================================================== */
310
311
312/*
313** {======================================================
314** Tree generation
315** =======================================================
316*/
317
318/*
319** In 5.2, could use 'luaL_testudata'...
320*/
321static int testpattern (lua_State *L, int idx) {
322 if (lua_touserdata(L, idx)) { /* value is a userdata? */
323 if (lua_getmetatable(L, idx)) { /* does it have a metatable? */
324 luaL_getmetatable(L, PATTERN_T);
325 if (lua_rawequal(L, -1, -2)) { /* does it have the correct mt? */
326 lua_pop(L, 2); /* remove both metatables */
327 return 1;
328 }
329 }
330 }
331 return 0;
332}
333
334
335static Pattern *getpattern (lua_State *L, int idx) {
336 return (Pattern *)luaL_checkudata(L, idx, PATTERN_T);
337}
338
339
340static int getsize (lua_State *L, int idx) {
341 return (lua_rawlen(L, idx) - sizeof(Pattern)) / sizeof(TTree) + 1;
342}
343
344
345static TTree *gettree (lua_State *L, int idx, int *len) {
346 Pattern *p = getpattern(L, idx);
347 if (len)
348 *len = getsize(L, idx);
349 return p->tree;
350}
351
352
353/*
354** create a pattern. Set its uservalue (the 'ktable') equal to its
355** metatable. (It could be any empty sequence; the metatable is at
356** hand here, so we use it.)
357*/
358static TTree *newtree (lua_State *L, int len) {
359 size_t size = (len - 1) * sizeof(TTree) + sizeof(Pattern);
360 Pattern *p = (Pattern *)lua_newuserdata(L, size);
361 luaL_getmetatable(L, PATTERN_T);
362 lua_pushvalue(L, -1);
363 lua_setuservalue(L, -3);
364 lua_setmetatable(L, -2);
365 p->code = NULL; p->codesize = 0;
366 return p->tree;
367}
368
369
370static TTree *newleaf (lua_State *L, int tag) {
371 TTree *tree = newtree(L, 1);
372 tree->tag = tag;
373 return tree;
374}
375
376
377static TTree *newcharset (lua_State *L) {
378 TTree *tree = newtree(L, bytes2slots(CHARSETSIZE) + 1);
379 tree->tag = TSet;
380 loopset(i, treebuffer(tree)[i] = 0);
381 return tree;
382}
383
384
385/*
386** add to tree a sequence where first sibling is 'sib' (with size
387** 'sibsize'); returns position for second sibling
388*/
389static TTree *seqaux (TTree *tree, TTree *sib, int sibsize) {
390 tree->tag = TSeq; tree->u.ps = sibsize + 1;
391 memcpy(sib1(tree), sib, sibsize * sizeof(TTree));
392 return sib2(tree);
393}
394
395
396/*
397** Build a sequence of 'n' nodes, each with tag 'tag' and 'u.n' got
398** from the array 's' (or 0 if array is NULL). (TSeq is binary, so it
399** must build a sequence of sequence of sequence...)
400*/
401static void fillseq (TTree *tree, int tag, int n, const char *s) {
402 int i;
403 for (i = 0; i < n - 1; i++) { /* initial n-1 copies of Seq tag; Seq ... */
404 tree->tag = TSeq; tree->u.ps = 2;
405 sib1(tree)->tag = tag;
406 sib1(tree)->u.n = s ? (byte)s[i] : 0;
407 tree = sib2(tree);
408 }
409 tree->tag = tag; /* last one does not need TSeq */
410 tree->u.n = s ? (byte)s[i] : 0;
411}
412
413
414/*
415** Numbers as patterns:
416** 0 == true (always match); n == TAny repeated 'n' times;
417** -n == not (TAny repeated 'n' times)
418*/
419static TTree *numtree (lua_State *L, int n) {
420 if (n == 0)
421 return newleaf(L, TTrue);
422 else {
423 TTree *tree, *nd;
424 if (n > 0)
425 tree = nd = newtree(L, 2 * n - 1);
426 else { /* negative: code it as !(-n) */
427 n = -n;
428 tree = newtree(L, 2 * n);
429 tree->tag = TNot;
430 nd = sib1(tree);
431 }
432 fillseq(nd, TAny, n, NULL); /* sequence of 'n' any's */
433 return tree;
434 }
435}
436
437
438/*
439** Convert value at index 'idx' to a pattern
440*/
441static TTree *getpatt (lua_State *L, int idx, int *len) {
442 TTree *tree;
443 switch (lua_type(L, idx)) {
444 case LUA_TSTRING: {
445 size_t slen;
446 const char *s = lua_tolstring(L, idx, &slen); /* get string */
447 if (slen == 0) /* empty? */
448 tree = newleaf(L, TTrue); /* always match */
449 else {
450 tree = newtree(L, 2 * (slen - 1) + 1);
451 fillseq(tree, TChar, slen, s); /* sequence of 'slen' chars */
452 }
453 break;
454 }
455 case LUA_TNUMBER: {
456 int n = lua_tointeger(L, idx);
457 tree = numtree(L, n);
458 break;
459 }
460 case LUA_TBOOLEAN: {
461 tree = (lua_toboolean(L, idx) ? newleaf(L, TTrue) : newleaf(L, TFalse));
462 break;
463 }
464 case LUA_TTABLE: {
465 tree = newgrammar(L, idx);
466 break;
467 }
468 case LUA_TFUNCTION: {
469 tree = newtree(L, 2);
470 tree->tag = TRunTime;
471 tree->key = addtonewktable(L, 0, idx);
472 sib1(tree)->tag = TTrue;
473 break;
474 }
475 default: {
476 return gettree(L, idx, len);
477 }
478 }
479 lua_replace(L, idx); /* put new tree into 'idx' slot */
480 if (len)
481 *len = getsize(L, idx);
482 return tree;
483}
484
485
486/*
487** create a new tree, whith a new root and one sibling.
488** Sibling must be on the Lua stack, at index 1.
489*/
490static TTree *newroot1sib (lua_State *L, int tag) {
491 int s1;
492 TTree *tree1 = getpatt(L, 1, &s1);
493 TTree *tree = newtree(L, 1 + s1); /* create new tree */
494 tree->tag = tag;
495 memcpy(sib1(tree), tree1, s1 * sizeof(TTree));
496 copyktable(L, 1);
497 return tree;
498}
499
500
501/*
502** create a new tree, whith a new root and 2 siblings.
503** Siblings must be on the Lua stack, first one at index 1.
504*/
505static TTree *newroot2sib (lua_State *L, int tag) {
506 int s1, s2;
507 TTree *tree1 = getpatt(L, 1, &s1);
508 TTree *tree2 = getpatt(L, 2, &s2);
509 TTree *tree = newtree(L, 1 + s1 + s2); /* create new tree */
510 tree->tag = tag;
511 tree->u.ps = 1 + s1;
512 memcpy(sib1(tree), tree1, s1 * sizeof(TTree));
513 memcpy(sib2(tree), tree2, s2 * sizeof(TTree));
514 joinktables(L, 1, sib2(tree), 2);
515 return tree;
516}
517
518
519static int lp_P (lua_State *L) {
520 luaL_checkany(L, 1);
521 getpatt(L, 1, NULL);
522 lua_settop(L, 1);
523 return 1;
524}
525
526
527/*
528** sequence operator; optimizations:
529** false x => false, x true => x, true x => x
530** (cannot do x . false => false because x may have runtime captures)
531*/
532static int lp_seq (lua_State *L) {
533 TTree *tree1 = getpatt(L, 1, NULL);
534 TTree *tree2 = getpatt(L, 2, NULL);
535 if (tree1->tag == TFalse || tree2->tag == TTrue)
536 lua_pushvalue(L, 1); /* false . x == false, x . true = x */
537 else if (tree1->tag == TTrue)
538 lua_pushvalue(L, 2); /* true . x = x */
539 else
540 newroot2sib(L, TSeq);
541 return 1;
542}
543
544
545/*
546** choice operator; optimizations:
547** charset / charset => charset
548** true / x => true, x / false => x, false / x => x
549** (x / true is not equivalent to true)
550*/
551static int lp_choice (lua_State *L) {
552 Charset st1, st2;
553 TTree *t1 = getpatt(L, 1, NULL);
554 TTree *t2 = getpatt(L, 2, NULL);
555 if (tocharset(t1, &st1) && tocharset(t2, &st2)) {
556 TTree *t = newcharset(L);
557 loopset(i, treebuffer(t)[i] = st1.cs[i] | st2.cs[i]);
558 }
559 else if (nofail(t1) || t2->tag == TFalse)
560 lua_pushvalue(L, 1); /* true / x => true, x / false => x */
561 else if (t1->tag == TFalse)
562 lua_pushvalue(L, 2); /* false / x => x */
563 else
564 newroot2sib(L, TChoice);
565 return 1;
566}
567
568
569/*
570** p^n
571*/
572static int lp_star (lua_State *L) {
573 int size1;
574 int n = (int)luaL_checkinteger(L, 2);
575 TTree *tree1 = getpatt(L, 1, &size1);
576 if (n >= 0) { /* seq tree1 (seq tree1 ... (seq tree1 (rep tree1))) */
577 TTree *tree = newtree(L, (n + 1) * (size1 + 1));
578 if (nullable(tree1))
579 luaL_error(L, "loop body may accept empty string");
580 while (n--) /* repeat 'n' times */
581 tree = seqaux(tree, tree1, size1);
582 tree->tag = TRep;
583 memcpy(sib1(tree), tree1, size1 * sizeof(TTree));
584 }
585 else { /* choice (seq tree1 ... choice tree1 true ...) true */
586 TTree *tree;
587 n = -n;
588 /* size = (choice + seq + tree1 + true) * n, but the last has no seq */
589 tree = newtree(L, n * (size1 + 3) - 1);
590 for (; n > 1; n--) { /* repeat (n - 1) times */
591 tree->tag = TChoice; tree->u.ps = n * (size1 + 3) - 2;
592 sib2(tree)->tag = TTrue;
593 tree = sib1(tree);
594 tree = seqaux(tree, tree1, size1);
595 }
596 tree->tag = TChoice; tree->u.ps = size1 + 1;
597 sib2(tree)->tag = TTrue;
598 memcpy(sib1(tree), tree1, size1 * sizeof(TTree));
599 }
600 copyktable(L, 1);
601 return 1;
602}
603
604
605/*
606** #p == &p
607*/
608static int lp_and (lua_State *L) {
609 newroot1sib(L, TAnd);
610 return 1;
611}
612
613
614/*
615** -p == !p
616*/
617static int lp_not (lua_State *L) {
618 newroot1sib(L, TNot);
619 return 1;
620}
621
622
623/*
624** [t1 - t2] == Seq (Not t2) t1
625** If t1 and t2 are charsets, make their difference.
626*/
627static int lp_sub (lua_State *L) {
628 Charset st1, st2;
629 int s1, s2;
630 TTree *t1 = getpatt(L, 1, &s1);
631 TTree *t2 = getpatt(L, 2, &s2);
632 if (tocharset(t1, &st1) && tocharset(t2, &st2)) {
633 TTree *t = newcharset(L);
634 loopset(i, treebuffer(t)[i] = st1.cs[i] & ~st2.cs[i]);
635 }
636 else {
637 TTree *tree = newtree(L, 2 + s1 + s2);
638 tree->tag = TSeq; /* sequence of... */
639 tree->u.ps = 2 + s2;
640 sib1(tree)->tag = TNot; /* ...not... */
641 memcpy(sib1(sib1(tree)), t2, s2 * sizeof(TTree)); /* ...t2 */
642 memcpy(sib2(tree), t1, s1 * sizeof(TTree)); /* ... and t1 */
643 joinktables(L, 1, sib1(tree), 2);
644 }
645 return 1;
646}
647
648
649static int lp_set (lua_State *L) {
650 size_t l;
651 const char *s = luaL_checklstring(L, 1, &l);
652 TTree *tree = newcharset(L);
653 while (l--) {
654 setchar(treebuffer(tree), (byte)(*s));
655 s++;
656 }
657 return 1;
658}
659
660
661static int lp_range (lua_State *L) {
662 int arg;
663 int top = lua_gettop(L);
664 TTree *tree = newcharset(L);
665 for (arg = 1; arg <= top; arg++) {
666 int c;
667 size_t l;
668 const char *r = luaL_checklstring(L, arg, &l);
669 luaL_argcheck(L, l == 2, arg, "range must have two characters");
670 for (c = (byte)r[0]; c <= (byte)r[1]; c++)
671 setchar(treebuffer(tree), c);
672 }
673 return 1;
674}
675
676
677/*
678** Look-behind predicate
679*/
680static int lp_behind (lua_State *L) {
681 TTree *tree;
682 TTree *tree1 = getpatt(L, 1, NULL);
683 int n = fixedlen(tree1);
684 luaL_argcheck(L, n >= 0, 1, "pattern may not have fixed length");
685 luaL_argcheck(L, !hascaptures(tree1), 1, "pattern have captures");
686 luaL_argcheck(L, n <= MAXBEHIND, 1, "pattern too long to look behind");
687 tree = newroot1sib(L, TBehind);
688 tree->u.n = n;
689 return 1;
690}
691
692
693/*
694** Create a non-terminal
695*/
696static int lp_V (lua_State *L) {
697 TTree *tree = newleaf(L, TOpenCall);
698 luaL_argcheck(L, !lua_isnoneornil(L, 1), 1, "non-nil value expected");
699 tree->key = addtonewktable(L, 0, 1);
700 return 1;
701}
702
703
704/*
705** Create a tree for a non-empty capture, with a body and
706** optionally with an associated Lua value (at index 'labelidx' in the
707** stack)
708*/
709static int capture_aux (lua_State *L, int cap, int labelidx) {
710 TTree *tree = newroot1sib(L, TCapture);
711 tree->cap = cap;
712 tree->key = (labelidx == 0) ? 0 : addtonewktable(L, 1, labelidx);
713 return 1;
714}
715
716
717/*
718** Fill a tree with an empty capture, using an empty (TTrue) sibling.
719** (The 'key' field must be filled by the caller to finish the tree.)
720*/
721static TTree *auxemptycap (TTree *tree, int cap) {
722 tree->tag = TCapture;
723 tree->cap = cap;
724 sib1(tree)->tag = TTrue;
725 return tree;
726}
727
728
729/*
730** Create a tree for an empty capture.
731*/
732static TTree *newemptycap (lua_State *L, int cap, int key) {
733 TTree *tree = auxemptycap(newtree(L, 2), cap);
734 tree->key = key;
735 return tree;
736}
737
738
739/*
740** Create a tree for an empty capture with an associated Lua value.
741*/
742static TTree *newemptycapkey (lua_State *L, int cap, int idx) {
743 TTree *tree = auxemptycap(newtree(L, 2), cap);
744 tree->key = addtonewktable(L, 0, idx);
745 return tree;
746}
747
748
749/*
750** Captures with syntax p / v
751** (function capture, query capture, string capture, or number capture)
752*/
753static int lp_divcapture (lua_State *L) {
754 switch (lua_type(L, 2)) {
755 case LUA_TFUNCTION: return capture_aux(L, Cfunction, 2);
756 case LUA_TTABLE: return capture_aux(L, Cquery, 2);
757 case LUA_TSTRING: return capture_aux(L, Cstring, 2);
758 case LUA_TNUMBER: {
759 int n = lua_tointeger(L, 2);
760 TTree *tree = newroot1sib(L, TCapture);
761 luaL_argcheck(L, 0 <= n && n <= SHRT_MAX, 1, "invalid number");
762 tree->cap = Cnum;
763 tree->key = n;
764 return 1;
765 }
766 default: return luaL_argerror(L, 2, "invalid replacement value");
767 }
768}
769
770
771static int lp_substcapture (lua_State *L) {
772 return capture_aux(L, Csubst, 0);
773}
774
775
776static int lp_tablecapture (lua_State *L) {
777 return capture_aux(L, Ctable, 0);
778}
779
780
781static int lp_groupcapture (lua_State *L) {
782 if (lua_isnoneornil(L, 2))
783 return capture_aux(L, Cgroup, 0);
784 else
785 return capture_aux(L, Cgroup, 2);
786}
787
788
789static int lp_foldcapture (lua_State *L) {
790 luaL_checktype(L, 2, LUA_TFUNCTION);
791 return capture_aux(L, Cfold, 2);
792}
793
794
795static int lp_simplecapture (lua_State *L) {
796 return capture_aux(L, Csimple, 0);
797}
798
799
800static int lp_poscapture (lua_State *L) {
801 newemptycap(L, Cposition, 0);
802 return 1;
803}
804
805
806static int lp_argcapture (lua_State *L) {
807 int n = (int)luaL_checkinteger(L, 1);
808 luaL_argcheck(L, 0 < n && n <= SHRT_MAX, 1, "invalid argument index");
809 newemptycap(L, Carg, n);
810 return 1;
811}
812
813
814static int lp_backref (lua_State *L) {
815 luaL_checkany(L, 1);
816 newemptycapkey(L, Cbackref, 1);
817 return 1;
818}
819
820
821/*
822** Constant capture
823*/
824static int lp_constcapture (lua_State *L) {
825 int i;
826 int n = lua_gettop(L); /* number of values */
827 if (n == 0) /* no values? */
828 newleaf(L, TTrue); /* no capture */
829 else if (n == 1)
830 newemptycapkey(L, Cconst, 1); /* single constant capture */
831 else { /* create a group capture with all values */
832 TTree *tree = newtree(L, 1 + 3 * (n - 1) + 2);
833 newktable(L, n); /* create a 'ktable' for new tree */
834 tree->tag = TCapture;
835 tree->cap = Cgroup;
836 tree->key = 0;
837 tree = sib1(tree);
838 for (i = 1; i <= n - 1; i++) {
839 tree->tag = TSeq;
840 tree->u.ps = 3; /* skip TCapture and its sibling */
841 auxemptycap(sib1(tree), Cconst);
842 sib1(tree)->key = addtoktable(L, i);
843 tree = sib2(tree);
844 }
845 auxemptycap(tree, Cconst);
846 tree->key = addtoktable(L, i);
847 }
848 return 1;
849}
850
851
852static int lp_matchtime (lua_State *L) {
853 TTree *tree;
854 luaL_checktype(L, 2, LUA_TFUNCTION);
855 tree = newroot1sib(L, TRunTime);
856 tree->key = addtonewktable(L, 1, 2);
857 return 1;
858}
859
860/* }====================================================== */
861
862
863/*
864** {======================================================
865** Grammar - Tree generation
866** =======================================================
867*/
868
869/*
870** push on the stack the index and the pattern for the
871** initial rule of grammar at index 'arg' in the stack;
872** also add that index into position table.
873*/
874static void getfirstrule (lua_State *L, int arg, int postab) {
875 lua_rawgeti(L, arg, 1); /* access first element */
876 if (lua_isstring(L, -1)) { /* is it the name of initial rule? */
877 lua_pushvalue(L, -1); /* duplicate it to use as key */
878 lua_gettable(L, arg); /* get associated rule */
879 }
880 else {
881 lua_pushinteger(L, 1); /* key for initial rule */
882 lua_insert(L, -2); /* put it before rule */
883 }
884 if (!testpattern(L, -1)) { /* initial rule not a pattern? */
885 if (lua_isnil(L, -1))
886 luaL_error(L, "grammar has no initial rule");
887 else
888 luaL_error(L, "initial rule '%s' is not a pattern", lua_tostring(L, -2));
889 }
890 lua_pushvalue(L, -2); /* push key */
891 lua_pushinteger(L, 1); /* push rule position (after TGrammar) */
892 lua_settable(L, postab); /* insert pair at position table */
893}
894
895/*
896** traverse grammar at index 'arg', pushing all its keys and patterns
897** into the stack. Create a new table (before all pairs key-pattern) to
898** collect all keys and their associated positions in the final tree
899** (the "position table").
900** Return the number of rules and (in 'totalsize') the total size
901** for the new tree.
902*/
903static int collectrules (lua_State *L, int arg, int *totalsize) {
904 int n = 1; /* to count number of rules */
905 int postab = lua_gettop(L) + 1; /* index of position table */
906 int size; /* accumulator for total size */
907 lua_newtable(L); /* create position table */
908 getfirstrule(L, arg, postab);
909 size = 2 + getsize(L, postab + 2); /* TGrammar + TRule + rule */
910 lua_pushnil(L); /* prepare to traverse grammar table */
911 while (lua_next(L, arg) != 0) {
912 if (lua_tonumber(L, -2) == 1 ||
913 lp_equal(L, -2, postab + 1)) { /* initial rule? */
914 lua_pop(L, 1); /* remove value (keep key for lua_next) */
915 continue;
916 }
917 if (!testpattern(L, -1)) /* value is not a pattern? */
918 luaL_error(L, "rule '%s' is not a pattern", val2str(L, -2));
919 luaL_checkstack(L, LUA_MINSTACK, "grammar has too many rules");
920 lua_pushvalue(L, -2); /* push key (to insert into position table) */
921 lua_pushinteger(L, size);
922 lua_settable(L, postab);
923 size += 1 + getsize(L, -1); /* update size */
924 lua_pushvalue(L, -2); /* push key (for next lua_next) */
925 n++;
926 }
927 *totalsize = size + 1; /* TTrue to finish list of rules */
928 return n;
929}
930
931
932static void buildgrammar (lua_State *L, TTree *grammar, int frule, int n) {
933 int i;
934 TTree *nd = sib1(grammar); /* auxiliary pointer to traverse the tree */
935 for (i = 0; i < n; i++) { /* add each rule into new tree */
936 int ridx = frule + 2*i + 1; /* index of i-th rule */
937 int rulesize;
938 TTree *rn = gettree(L, ridx, &rulesize);
939 nd->tag = TRule;
940 nd->key = 0; /* will be fixed when rule is used */
941 nd->cap = i; /* rule number */
942 nd->u.ps = rulesize + 1; /* point to next rule */
943 memcpy(sib1(nd), rn, rulesize * sizeof(TTree)); /* copy rule */
944 mergektable(L, ridx, sib1(nd)); /* merge its ktable into new one */
945 nd = sib2(nd); /* move to next rule */
946 }
947 nd->tag = TTrue; /* finish list of rules */
948}
949
950
951/*
952** Check whether a tree has potential infinite loops
953*/
954static int checkloops (TTree *tree) {
955 tailcall:
956 if (tree->tag == TRep && nullable(sib1(tree)))
957 return 1;
958 else if (tree->tag == TGrammar)
959 return 0; /* sub-grammars already checked */
960 else {
961 switch (numsiblings[tree->tag]) {
962 case 1: /* return checkloops(sib1(tree)); */
963 tree = sib1(tree); goto tailcall;
964 case 2:
965 if (checkloops(sib1(tree))) return 1;
966 /* else return checkloops(sib2(tree)); */
967 tree = sib2(tree); goto tailcall;
968 default: assert(numsiblings[tree->tag] == 0); return 0;
969 }
970 }
971}
972
973
974/*
975** Give appropriate error message for 'verifyrule'. If a rule appears
976** twice in 'passed', there is path from it back to itself without
977** advancing the subject.
978*/
979static int verifyerror (lua_State *L, int *passed, int npassed) {
980 int i, j;
981 for (i = npassed - 1; i >= 0; i--) { /* search for a repetition */
982 for (j = i - 1; j >= 0; j--) {
983 if (passed[i] == passed[j]) {
984 lua_rawgeti(L, -1, passed[i]); /* get rule's key */
985 return luaL_error(L, "rule '%s' may be left recursive", val2str(L, -1));
986 }
987 }
988 }
989 return luaL_error(L, "too many left calls in grammar");
990}
991
992
993/*
994** Check whether a rule can be left recursive; raise an error in that
995** case; otherwise return 1 iff pattern is nullable.
996** The return value is used to check sequences, where the second pattern
997** is only relevant if the first is nullable.
998** Parameter 'nb' works as an accumulator, to allow tail calls in
999** choices. ('nb' true makes function returns true.)
1000** Parameter 'passed' is a list of already visited rules, 'npassed'
1001** counts the elements in 'passed'.
1002** Assume ktable at the top of the stack.
1003*/
1004static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed,
1005 int nb) {
1006 tailcall:
1007 switch (tree->tag) {
1008 case TChar: case TSet: case TAny:
1009 case TFalse:
1010 return nb; /* cannot pass from here */
1011 case TTrue:
1012 case TBehind: /* look-behind cannot have calls */
1013 return 1;
1014 case TNot: case TAnd: case TRep:
1015 /* return verifyrule(L, sib1(tree), passed, npassed, 1); */
1016 tree = sib1(tree); nb = 1; goto tailcall;
1017 case TCapture: case TRunTime:
1018 /* return verifyrule(L, sib1(tree), passed, npassed, nb); */
1019 tree = sib1(tree); goto tailcall;
1020 case TCall:
1021 /* return verifyrule(L, sib2(tree), passed, npassed, nb); */
1022 tree = sib2(tree); goto tailcall;
1023 case TSeq: /* only check 2nd child if first is nb */
1024 if (!verifyrule(L, sib1(tree), passed, npassed, 0))
1025 return nb;
1026 /* else return verifyrule(L, sib2(tree), passed, npassed, nb); */
1027 tree = sib2(tree); goto tailcall;
1028 case TChoice: /* must check both children */
1029 nb = verifyrule(L, sib1(tree), passed, npassed, nb);
1030 /* return verifyrule(L, sib2(tree), passed, npassed, nb); */
1031 tree = sib2(tree); goto tailcall;
1032 case TRule:
1033 if (npassed >= MAXRULES)
1034 return verifyerror(L, passed, npassed);
1035 else {
1036 passed[npassed++] = tree->key;
1037 /* return verifyrule(L, sib1(tree), passed, npassed); */
1038 tree = sib1(tree); goto tailcall;
1039 }
1040 case TGrammar:
1041 return nullable(tree); /* sub-grammar cannot be left recursive */
1042 default: assert(0); return 0;
1043 }
1044}
1045
1046
1047static void verifygrammar (lua_State *L, TTree *grammar) {
1048 int passed[MAXRULES];
1049 TTree *rule;
1050 /* check left-recursive rules */
1051 for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) {
1052 if (rule->key == 0) continue; /* unused rule */
1053 verifyrule(L, sib1(rule), passed, 0, 0);
1054 }
1055 assert(rule->tag == TTrue);
1056 /* check infinite loops inside rules */
1057 for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) {
1058 if (rule->key == 0) continue; /* unused rule */
1059 if (checkloops(sib1(rule))) {
1060 lua_rawgeti(L, -1, rule->key); /* get rule's key */
1061 luaL_error(L, "empty loop in rule '%s'", val2str(L, -1));
1062 }
1063 }
1064 assert(rule->tag == TTrue);
1065}
1066
1067
1068/*
1069** Give a name for the initial rule if it is not referenced
1070*/
1071static void initialrulename (lua_State *L, TTree *grammar, int frule) {
1072 if (sib1(grammar)->key == 0) { /* initial rule is not referenced? */
1073 int n = lua_rawlen(L, -1) + 1; /* index for name */
1074 lua_pushvalue(L, frule); /* rule's name */
1075 lua_rawseti(L, -2, n); /* ktable was on the top of the stack */
1076 sib1(grammar)->key = n;
1077 }
1078}
1079
1080
1081static TTree *newgrammar (lua_State *L, int arg) {
1082 int treesize;
1083 int frule = lua_gettop(L) + 2; /* position of first rule's key */
1084 int n = collectrules(L, arg, &treesize);
1085 TTree *g = newtree(L, treesize);
1086 luaL_argcheck(L, n <= MAXRULES, arg, "grammar has too many rules");
1087 g->tag = TGrammar; g->u.n = n;
1088 lua_newtable(L); /* create 'ktable' */
1089 lua_setuservalue(L, -2);
1090 buildgrammar(L, g, frule, n);
1091 lua_getuservalue(L, -1); /* get 'ktable' for new tree */
1092 finalfix(L, frule - 1, g, sib1(g));
1093 initialrulename(L, g, frule);
1094 verifygrammar(L, g);
1095 lua_pop(L, 1); /* remove 'ktable' */
1096 lua_insert(L, -(n * 2 + 2)); /* move new table to proper position */
1097 lua_pop(L, n * 2 + 1); /* remove position table + rule pairs */
1098 return g; /* new table at the top of the stack */
1099}
1100
1101/* }====================================================== */
1102
1103
1104static Instruction *prepcompile (lua_State *L, Pattern *p, int idx) {
1105 lua_getuservalue(L, idx); /* push 'ktable' (may be used by 'finalfix') */
1106 finalfix(L, 0, NULL, p->tree);
1107 lua_pop(L, 1); /* remove 'ktable' */
1108 return compile(L, p);
1109}
1110
1111
1112static int lp_printtree (lua_State *L) {
1113 TTree *tree = getpatt(L, 1, NULL);
1114 int c = lua_toboolean(L, 2);
1115 if (c) {
1116 lua_getuservalue(L, 1); /* push 'ktable' (may be used by 'finalfix') */
1117 finalfix(L, 0, NULL, tree);
1118 lua_pop(L, 1); /* remove 'ktable' */
1119 }
1120 printktable(L, 1);
1121 printtree(tree, 0);
1122 return 0;
1123}
1124
1125
1126static int lp_printcode (lua_State *L) {
1127 Pattern *p = getpattern(L, 1);
1128 printktable(L, 1);
1129 if (p->code == NULL) /* not compiled yet? */
1130 prepcompile(L, p, 1);
1131 printpatt(p->code, p->codesize);
1132 return 0;
1133}
1134
1135
1136/*
1137** Get the initial position for the match, interpreting negative
1138** values from the end of the subject
1139*/
1140static size_t initposition (lua_State *L, size_t len) {
1141 lua_Integer ii = luaL_optinteger(L, 3, 1);
1142 if (ii > 0) { /* positive index? */
1143 if ((size_t)ii <= len) /* inside the string? */
1144 return (size_t)ii - 1; /* return it (corrected to 0-base) */
1145 else return len; /* crop at the end */
1146 }
1147 else { /* negative index */
1148 if ((size_t)(-ii) <= len) /* inside the string? */
1149 return len - ((size_t)(-ii)); /* return position from the end */
1150 else return 0; /* crop at the beginning */
1151 }
1152}
1153
1154
1155/*
1156** Main match function
1157*/
1158static int lp_match (lua_State *L) {
1159 Capture capture[INITCAPSIZE];
1160 const char *r;
1161 size_t l;
1162 Pattern *p = (getpatt(L, 1, NULL), getpattern(L, 1));
1163 Instruction *code = (p->code != NULL) ? p->code : prepcompile(L, p, 1);
1164 const char *s = luaL_checklstring(L, SUBJIDX, &l);
1165 size_t i = initposition(L, l);
1166 int ptop = lua_gettop(L);
1167 lua_pushnil(L); /* initialize subscache */
1168 lua_pushlightuserdata(L, capture); /* initialize caplistidx */
1169 lua_getuservalue(L, 1); /* initialize penvidx */
1170 r = match(L, s, s + i, s + l, code, capture, ptop);
1171 if (r == NULL) {
1172 lua_pushnil(L);
1173 return 1;
1174 }
1175 return getcaptures(L, s, r, ptop);
1176}
1177
1178
1179
1180/*
1181** {======================================================
1182** Library creation and functions not related to matching
1183** =======================================================
1184*/
1185
1186/* maximum limit for stack size */
1187#define MAXLIM (INT_MAX / 100)
1188
1189static int lp_setmax (lua_State *L) {
1190 lua_Integer lim = luaL_checkinteger(L, 1);
1191 luaL_argcheck(L, 0 < lim && lim <= MAXLIM, 1, "out of range");
1192 lua_settop(L, 1);
1193 lua_setfield(L, LUA_REGISTRYINDEX, MAXSTACKIDX);
1194 return 0;
1195}
1196
1197
1198static int lp_version (lua_State *L) {
1199 lua_pushstring(L, VERSION);
1200 return 1;
1201}
1202
1203
1204static int lp_type (lua_State *L) {
1205 if (testpattern(L, 1))
1206 lua_pushliteral(L, "pattern");
1207 else
1208 lua_pushnil(L);
1209 return 1;
1210}
1211
1212
1213int lp_gc (lua_State *L) {
1214 Pattern *p = getpattern(L, 1);
1215 realloccode(L, p, 0); /* delete code block */
1216 return 0;
1217}
1218
1219
1220static void createcat (lua_State *L, const char *catname, int (catf) (int)) {
1221 TTree *t = newcharset(L);
1222 int i;
1223 for (i = 0; i <= UCHAR_MAX; i++)
1224 if (catf(i)) setchar(treebuffer(t), i);
1225 lua_setfield(L, -2, catname);
1226}
1227
1228
1229static int lp_locale (lua_State *L) {
1230 if (lua_isnoneornil(L, 1)) {
1231 lua_settop(L, 0);
1232 lua_createtable(L, 0, 12);
1233 }
1234 else {
1235 luaL_checktype(L, 1, LUA_TTABLE);
1236 lua_settop(L, 1);
1237 }
1238 createcat(L, "alnum", isalnum);
1239 createcat(L, "alpha", isalpha);
1240 createcat(L, "cntrl", iscntrl);
1241 createcat(L, "digit", isdigit);
1242 createcat(L, "graph", isgraph);
1243 createcat(L, "lower", islower);
1244 createcat(L, "print", isprint);
1245 createcat(L, "punct", ispunct);
1246 createcat(L, "space", isspace);
1247 createcat(L, "upper", isupper);
1248 createcat(L, "xdigit", isxdigit);
1249 return 1;
1250}
1251
1252
1253static struct luaL_Reg pattreg[] = {
1254 {"ptree", lp_printtree},
1255 {"pcode", lp_printcode},
1256 {"match", lp_match},
1257 {"B", lp_behind},
1258 {"V", lp_V},
1259 {"C", lp_simplecapture},
1260 {"Cc", lp_constcapture},
1261 {"Cmt", lp_matchtime},
1262 {"Cb", lp_backref},
1263 {"Carg", lp_argcapture},
1264 {"Cp", lp_poscapture},
1265 {"Cs", lp_substcapture},
1266 {"Ct", lp_tablecapture},
1267 {"Cf", lp_foldcapture},
1268 {"Cg", lp_groupcapture},
1269 {"P", lp_P},
1270 {"S", lp_set},
1271 {"R", lp_range},
1272 {"locale", lp_locale},
1273 {"version", lp_version},
1274 {"setmaxstack", lp_setmax},
1275 {"type", lp_type},
1276 {NULL, NULL}
1277};
1278
1279
1280static struct luaL_Reg metareg[] = {
1281 {"__mul", lp_seq},
1282 {"__add", lp_choice},
1283 {"__pow", lp_star},
1284 {"__gc", lp_gc},
1285 {"__len", lp_and},
1286 {"__div", lp_divcapture},
1287 {"__unm", lp_not},
1288 {"__sub", lp_sub},
1289 {NULL, NULL}
1290};
1291
1292
1293int luaopen_lpeg (lua_State *L);
1294int luaopen_lpeg (lua_State *L) {
1295 luaL_newmetatable(L, PATTERN_T);
1296 lua_pushnumber(L, MAXBACK); /* initialize maximum backtracking */
1297 lua_setfield(L, LUA_REGISTRYINDEX, MAXSTACKIDX);
1298 luaL_setfuncs(L, metareg, 0);
1299 luaL_newlib(L, pattreg);
1300 lua_pushvalue(L, -1);
1301 lua_setfield(L, -3, "__index");
1302 return 1;
1303}
1304
1305/* }====================================================== */
diff --git a/lptree.h b/lptree.h
new file mode 100644
index 0000000..34ee15c
--- /dev/null
+++ b/lptree.h
@@ -0,0 +1,82 @@
1/*
2** $Id: lptree.h,v 1.3 2016/09/13 18:07:51 roberto Exp $
3*/
4
5#if !defined(lptree_h)
6#define lptree_h
7
8
9#include "lptypes.h"
10
11
/*
** Kinds of tree nodes. (Fields 'n', 'key', 'cap', 'sib1', and 'sib2'
** mentioned below are described with 'TTree'.)
*/
typedef enum TTag {
  TChar = 0,  /* single char; 'n' = char */
  TSet,  /* charset; the set is stored in next CHARSETSIZE bytes */
  TAny,  /* any character */
  TTrue,  /* always matches */
  TFalse,  /* never matches */
  TRep,  /* repetition: 'sib1'* */
  TSeq,  /* sequence: 'sib1' 'sib2' */
  TChoice,  /* ordered choice: 'sib1' / 'sib2' */
  TNot,  /* not-predicate: !'sib1' */
  TAnd,  /* and-predicate: &'sib1' */
  TCall,  /* ktable[key] is rule's key; 'sib2' is rule being called */
  TOpenCall,  /* ktable[key] is rule's key */
  TRule,  /* ktable[key] is rule's key (but key == 0 for unused rules);
             'sib1' is rule's pattern;
             'sib2' is next rule; 'cap' is rule's sequential number */
  TGrammar,  /* 'sib1' is initial (and first) rule */
  TBehind,  /* look-behind: 'sib1' is pattern, 'n' is how much to go back */
  TCapture,  /* captures: 'cap' is kind of capture (enum 'CapKind');
                ktable[key] is Lua value associated with capture;
                'sib1' is capture body */
  TRunTime  /* run-time capture: 'key' is Lua function;
               'sib1' is capture body */
} TTag;
39
40
41/*
42** Tree trees
43** The first child of a tree (if there is one) is immediately after
44** the tree. A reference to a second child (ps) is its position
45** relative to the position of the tree itself.
46*/
47typedef struct TTree {
48 byte tag;
49 byte cap; /* kind of capture (if it is a capture) */
50 unsigned short key; /* key in ktable for Lua data (0 if no key) */
51 union {
52 int ps; /* occasional second child */
53 int n; /* occasional counter */
54 } u;
55} TTree;
56
57
58/*
59** A complete pattern has its tree plus, if already compiled,
60** its corresponding code
61*/
62typedef struct Pattern {
63 union Instruction *code;
64 int codesize;
65 TTree tree[1];
66} Pattern;
67
68
69/* number of children for each tree */
70extern const byte numsiblings[];
71
72/* access to children */
73#define sib1(t) ((t) + 1)
74#define sib2(t) ((t) + (t)->u.ps)
75
76
77
78
79
80
81#endif
82
diff --git a/lptypes.h b/lptypes.h
new file mode 100644
index 0000000..5226970
--- /dev/null
+++ b/lptypes.h
@@ -0,0 +1,145 @@
1/*
2** $Id: lptypes.h,v 1.17 2017/12/14 16:56:27 roberto Exp $
3** LPeg - PEG pattern matching for Lua
4** Copyright 2007-2017, Lua.org & PUC-Rio (see 'lpeg.html' for license)
5** written by Roberto Ierusalimschy
6*/
7
8#if !defined(lptypes_h)
9#define lptypes_h
10
11
12#include <assert.h>
13#include <limits.h>
14
15#include "lua.h"
16
17
18#define VERSION "1.0.1"
19
20
21#define PATTERN_T "lpeg-pattern"
22#define MAXSTACKIDX "lpeg-maxstack" /* registry key that 'doublestack' (lpvm.c) reads to get the stack-size limit */
23
24
25/*
26** compatibility with Lua 5.1 (the newer API names used in the sources are mapped onto what 5.1 provides)
27*/
28#if (LUA_VERSION_NUM == 501)
29
30#define lp_equal lua_equal
31
32#define lua_getuservalue lua_getfenv
33#define lua_setuservalue lua_setfenv
34
35#define lua_rawlen lua_objlen
36
37#define luaL_setfuncs(L,f,n) luaL_register(L,NULL,f) /* note: argument 'n' is dropped */
38#define luaL_newlib(L,f) luaL_register(L,"lpeg",f)
39
40#endif
41
42
43#if !defined(lp_equal) /* not defined above, i.e. not Lua 5.1 */
44#define lp_equal(L,idx1,idx2) lua_compare(L,(idx1),(idx2),LUA_OPEQ)
45#endif
46
47
48/* default maximum size for call/backtrack stack (also the default initial size, 'INITBACK', in lpvm.c) */
49#if !defined(MAXBACK)
50#define MAXBACK 400
51#endif
52
53
54/* maximum number of rules in a grammar (limited by 'unsigned char': a TRule keeps its number in the 'byte' field 'cap') */
55#if !defined(MAXRULES)
56#define MAXRULES 250
57#endif
58
59
60
61/* initial size for capture's list ('match' starts with this as its 'capsize') */
62#define INITCAPSIZE 32
63
64
65/* index, on Lua stack, for subject */
66#define SUBJIDX 2
67
68/* number of fixed arguments to 'match' (before capture arguments) */
69#define FIXEDARGS 3
70
71/* index, on Lua stack, for capture list */
72#define caplistidx(ptop) ((ptop) + 2)
73
74/* index, on Lua stack, for pattern's ktable */
75#define ktableidx(ptop) ((ptop) + 3)
76
77/* index, on Lua stack, for backtracking stack */
78#define stackidx(ptop) ((ptop) + 4)
79
80
81
82typedef unsigned char byte;
83
84
85#define BITSPERCHAR 8
86
87#define CHARSETSIZE ((UCHAR_MAX/BITSPERCHAR) + 1) /* bytes in a charset bitmap: one bit per 'unsigned char' value */
88
89
90
91typedef struct Charset {
92 byte cs[CHARSETSIZE];
93} Charset;
94
95
96
97#define loopset(v,b) { int v; for (v = 0; v < CHARSETSIZE; v++) {b;} } /* run 'b' once for each byte index 'v' of a charset */
98
99/* access to charset: the bytes that start right after tree node 't' */
100#define treebuffer(t) ((byte *)((t) + 1))
101
102/* number of slots needed for 'n' bytes (a slot is one 'TTree'; 'n' must be > 0, because '(n) - 1' is divided in unsigned arithmetic) */
103#define bytes2slots(n) (((n) - 1) / sizeof(TTree) + 1)
104
105/* set 'b' bit in charset 'cs' (the shift by 3 and mask 7 hard-code 8 bits per byte, i.e. BITSPERCHAR == 8) */
106#define setchar(cs,b) ((cs)[(b) >> 3] |= (1 << ((b) & 7)))
107
108
109/*
110** in capture instructions, 'kind' of capture and its offset are
111** packed in field 'aux', 4 bits for each
112*/
113#define getkind(op) ((op)->i.aux & 0xF)
114#define getoff(op) (((op)->i.aux >> 4) & 0xF)
115#define joinkindoff(k,o) ((k) | ((o) << 4))
116
117#define MAXOFF 0xF /* largest offset that fits in the 4 bits above */
118#define MAXAUX 0xFF /* largest value of the one-byte field 'aux' */
119
120
121/* maximum number of bytes to look behind */
122#define MAXBEHIND MAXAUX
123
124
125/* maximum size (in elements) for a pattern */
126#define MAXPATTSIZE (SHRT_MAX - 10)
127
128
129/* size (in elements) for an instruction plus extra l bytes */
130#define instsize(l) (((l) + sizeof(Instruction) - 1)/sizeof(Instruction) + 1)
131
132
133/* size (in elements) for a ISet instruction */
134#define CHARSETINSTSIZE instsize(CHARSETSIZE)
135
136/* size (in elements) for a IFunc instruction -- NOTE(review): the 'Opcode' enum in lpvm.h has no 'IFunc'; this macro looks like a leftover, confirm before relying on it */
137#define funcinstsize(p) ((p)->i.aux + 2)
138
139
140
141#define testchar(st,c) (((int)(st)[((c) >> 3)] & (1 << ((c) & 7)))) /* non-zero iff bit 'c' is set in charset 'st' (same 8-bit layout as 'setchar') */
142
143
144#endif
145
diff --git a/lpvm.c b/lpvm.c
new file mode 100644
index 0000000..05a5f68
--- /dev/null
+++ b/lpvm.c
@@ -0,0 +1,364 @@
1/*
2** $Id: lpvm.c,v 1.9 2016/06/03 20:11:18 roberto Exp $
3** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license)
4*/
5
6#include <limits.h>
7#include <string.h>
8
9
10#include "lua.h"
11#include "lauxlib.h"
12
13#include "lpcap.h"
14#include "lptypes.h"
15#include "lpvm.h"
16#include "lpprint.h"
17
18
19/* initial size for call/backtrack stack (the array 'match' declares as a local) */
20#if !defined(INITBACK)
21#define INITBACK MAXBACK
22#endif
23
24
25#define getoffset(p) (((p) + 1)->offset) /* jump offset of instruction 'p': kept in the slot right after it */
26
27static const Instruction giveup = {{IGiveup, 0, 0}}; /* sentinel: 'match' makes the bottom stack entry point here, so running out of alternatives executes IGiveup */
28
29
30/*
31** {======================================================
32** Virtual Machine
33** =======================================================
34*/
35
36
37typedef struct Stack {
38 const char *s; /* saved position (or NULL for calls) */
39 const Instruction *p; /* next instruction */
40 int caplevel; /* value of 'captop' when the entry was pushed, restored on backtrack (not set by ICall entries) */
41} Stack;
42
43
44#define getstackbase(L, ptop) ((Stack *)lua_touserdata(L, stackidx(ptop))) /* current stack array: the userdata kept at Lua stack index 'stackidx(ptop)' */
45
46
47/*
48** Make the size of the array of captures 'cap' twice as large as needed
49** (which is 'captop'). ('n' is the number of new elements.) Allocates a new userdata, copies the 'captop - n' entries already filled, stores the userdata at 'caplistidx(ptop)' and returns the new array; raises a Lua error if the doubled size would not fit in an int.
50*/
51static Capture *doublecap (lua_State *L, Capture *cap, int captop,
52 int n, int ptop) {
53 Capture *newc;
54 if (captop >= INT_MAX/((int)sizeof(Capture) * 2)) /* would 'captop * 2 * sizeof(Capture)' overflow? */
55 luaL_error(L, "too many captures");
56 newc = (Capture *)lua_newuserdata(L, captop * 2 * sizeof(Capture));
57 memcpy(newc, cap, (captop - n) * sizeof(Capture)); /* the last 'n' entries are counted in 'captop' but not filled yet */
58 lua_replace(L, caplistidx(ptop)); /* new userdata takes the capture-list slot of the Lua stack */
59 return newc;
60}
61
62
63/*
64** Double the size of the stack, up to the limit stored in the registry under MAXSTACKIDX (Lua error if already at that limit). Callers invoke it when the stack is full; it copies all entries to a new userdata placed at 'stackidx(ptop)', updates '*stacklimit', and returns the first free slot of the new stack.
65*/
66static Stack *doublestack (lua_State *L, Stack **stacklimit, int ptop) {
67 Stack *stack = getstackbase(L, ptop);
68 Stack *newstack;
69 int n = *stacklimit - stack; /* current stack size */
70 int max, newn;
71 lua_getfield(L, LUA_REGISTRYINDEX, MAXSTACKIDX);
72 max = lua_tointeger(L, -1); /* maximum allowed size */
73 lua_pop(L, 1);
74 if (n >= max) /* already at maximum size? */
75 luaL_error(L, "backtrack stack overflow (current limit is %d)", max);
76 newn = 2 * n; /* new size */
77 if (newn > max) newn = max; /* never grow past the limit */
78 newstack = (Stack *)lua_newuserdata(L, newn * sizeof(Stack));
79 memcpy(newstack, stack, n * sizeof(Stack));
80 lua_replace(L, stackidx(ptop)); /* new userdata replaces the old stack reference */
81 *stacklimit = newstack + newn;
82 return newstack + n; /* return next position */
83}
84
85
86/*
87** Interpret the result of a dynamic capture: false -> fail;
88** true -> keep current position; number -> next position.
89** Return new subject position. 'fr' is stack index where
90** is the result; 'curr' is current subject position; 'limit'
91** is subject's size. On failure returns -1 and removes all results; otherwise removes only the first result, leaving any extra values on the Lua stack.
92*/
93static int resdyncaptures (lua_State *L, int fr, int curr, int limit) {
94 lua_Integer res;
95 if (!lua_toboolean(L, fr)) { /* false value? */
96 lua_settop(L, fr - 1); /* remove results */
97 return -1; /* and fail */
98 }
99 else if (lua_isboolean(L, fr)) /* true? */
100 res = curr; /* keep current position */
101 else {
102 res = lua_tointeger(L, fr) - 1; /* new position (the '- 1' turns the Lua-side position into an offset comparable with 'curr') */
103 if (res < curr || res > limit) /* cannot move backwards or past the end; NOTE(review): a non-number presumably reaches here as 0 - 1 and is rejected too -- confirm */
104 luaL_error(L, "invalid position returned by match-time capture");
105 }
106 lua_remove(L, fr); /* remove first result (offset) */
107 return res;
108}
109
110
111/*
112** Add capture values returned by a dynamic capture to the capture list
113** 'base', nested inside a group capture. 'fd' indexes the first capture
114** value, 'n' is the number of values (at least 1). Writes 'n + 2' entries: base[0] (group open), base[1..n] (one per value) and base[n + 1] (close).
115*/
116static void adddyncaptures (const char *s, Capture *base, int n, int fd) {
117 int i;
118 base[0].kind = Cgroup; /* create group capture */
119 base[0].siz = 0; /* NOTE(review): field 's' of this entry is not assigned here */
120 base[0].idx = 0; /* make it an anonymous group */
121 for (i = 1; i <= n; i++) { /* add runtime captures */
122 base[i].kind = Cruntime;
123 base[i].siz = 1; /* mark it as closed */
124 base[i].idx = fd + i - 1; /* stack index of capture value */
125 base[i].s = s;
126 }
127 base[i].kind = Cclose; /* close group */
128 base[i].siz = 1;
129 base[i].s = s;
130}
131
132
133/*
134** Remove dynamic captures from the Lua stack (called in case of failure). Looks at the capture entries from 'level' up to 'last' and returns how many Lua stack values were dropped (0 if none of those entries is a dynamic capture).
135*/
136static int removedyncap (lua_State *L, Capture *capture,
137 int level, int last) {
138 int id = finddyncap(capture + level, capture + last); /* index of 1st cap. */
139 int top = lua_gettop(L);
140 if (id == 0) return 0; /* no dynamic captures? */
141 lua_settop(L, id - 1); /* remove captures */
142 return top - id + 1; /* number of values removed */
143}
144
145
146/*
147** Opcode interpreter: runs program 'op' against the subject that starts at 'o' and ends at 'e', beginning at position 's'. Returns the subject position where the match ended (IEnd) or NULL if it failed (IGiveup). 'capture' is the capture list being filled; 'ptop' locates the auxiliary slots on the Lua stack (see 'caplistidx' and 'stackidx').
148*/
149const char *match (lua_State *L, const char *o, const char *s, const char *e,
150 Instruction *op, Capture *capture, int ptop) {
151 Stack stackbase[INITBACK]; /* initial backtrack stack is a local array; 'doublestack' moves it to a userdata when it fills */
152 Stack *stacklimit = stackbase + INITBACK;
153 Stack *stack = stackbase; /* point to first empty slot in stack */
154 int capsize = INITCAPSIZE;
155 int captop = 0; /* point to first empty slot in captures */
156 int ndyncap = 0; /* number of dynamic captures (in Lua stack) */
157 const Instruction *p = op; /* current instruction */
158 stack->p = &giveup; stack->s = s; stack->caplevel = 0; stack++; /* bottom entry: a failure with nothing left to try lands on 'giveup' */
159 lua_pushlightuserdata(L, stackbase); /* lets 'getstackbase' find the array (presumably this lands at 'stackidx(ptop)', as the assert below expects) */
160 for (;;) {
161#if defined(DEBUG)
162 printf("-------------------------------------\n");
163 printcaplist(capture, capture + captop);
164 printf("s: |%s| stck:%d, dyncaps:%d, caps:%d ",
165 s, (int)(stack - getstackbase(L, ptop)), ndyncap, captop);
166 printinst(op, p);
167#endif
168 assert(stackidx(ptop) + ndyncap == lua_gettop(L) && ndyncap <= captop);
169 switch ((Opcode)p->i.code) {
170 case IEnd: {
171 assert(stack == getstackbase(L, ptop) + 1);
172 capture[captop].kind = Cclose; /* terminate the capture list */
173 capture[captop].s = NULL;
174 return s;
175 }
176 case IGiveup: {
177 assert(stack == getstackbase(L, ptop));
178 return NULL;
179 }
180 case IRet: {
181 assert(stack > getstackbase(L, ptop) && (stack - 1)->s == NULL);
182 p = (--stack)->p; /* pop return address */
183 continue;
184 }
185 case IAny: {
186 if (s < e) { p++; s++; }
187 else goto fail;
188 continue;
189 }
190 case ITestAny: {
191 if (s < e) p += 2; /* skip the instruction and its offset slot */
192 else p += getoffset(p);
193 continue;
194 }
195 case IChar: {
196 if ((byte)*s == p->i.aux && s < e) { p++; s++; } /* NOTE(review): '*s' is read before the 's < e' test; presumably safe because the byte at 'e' is readable (string terminator) -- confirm */
197 else goto fail;
198 continue;
199 }
200 case ITestChar: {
201 if ((byte)*s == p->i.aux && s < e) p += 2;
202 else p += getoffset(p);
203 continue;
204 }
205 case ISet: {
206 int c = (byte)*s;
207 if (testchar((p+1)->buff, c) && s < e) /* charset bytes start in the slot after the instruction */
208 { p += CHARSETINSTSIZE; s++; }
209 else goto fail;
210 continue;
211 }
212 case ITestSet: {
213 int c = (byte)*s;
214 if (testchar((p + 2)->buff, c) && s < e) /* here the charset comes after the offset slot */
215 p += 1 + CHARSETINSTSIZE;
216 else p += getoffset(p);
217 continue;
218 }
219 case IBehind: {
220 int n = p->i.aux;
221 if (n > s - o) goto fail; /* fewer than 'n' bytes before the current position */
222 s -= n; p++;
223 continue;
224 }
225 case ISpan: {
226 for (; s < e; s++) {
227 int c = (byte)*s;
228 if (!testchar((p+1)->buff, c)) break;
229 }
230 p += CHARSETINSTSIZE;
231 continue;
232 }
233 case IJmp: {
234 p += getoffset(p);
235 continue;
236 }
237 case IChoice: {
238 if (stack == stacklimit)
239 stack = doublestack(L, &stacklimit, ptop);
240 stack->p = p + getoffset(p); /* where to resume if the first alternative fails */
241 stack->s = s;
242 stack->caplevel = captop;
243 stack++;
244 p += 2;
245 continue;
246 }
247 case ICall: {
248 if (stack == stacklimit)
249 stack = doublestack(L, &stacklimit, ptop);
250 stack->s = NULL; /* NULL position marks the entry as a call, not a choice */
251 stack->p = p + 2; /* save return address */
252 stack++;
253 p += getoffset(p);
254 continue;
255 }
256 case ICommit: {
257 assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL);
258 stack--; /* discard the pending choice */
259 p += getoffset(p);
260 continue;
261 }
262 case IPartialCommit: {
263 assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL);
264 (stack - 1)->s = s; /* keep the choice but refresh its saved state */
265 (stack - 1)->caplevel = captop;
266 p += getoffset(p);
267 continue;
268 }
269 case IBackCommit: {
270 assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL);
271 s = (--stack)->s; /* restore position and captures, but jump instead of failing */
272 captop = stack->caplevel;
273 p += getoffset(p);
274 continue;
275 }
276 case IFailTwice:
277 assert(stack > getstackbase(L, ptop));
278 stack--;
279 /* go through */
280 case IFail:
281 fail: { /* pattern failed: try to backtrack */
282 do { /* remove pending calls */
283 assert(stack > getstackbase(L, ptop));
284 s = (--stack)->s;
285 } while (s == NULL);
286 if (ndyncap > 0) /* is there matchtime captures? */
287 ndyncap -= removedyncap(L, capture, stack->caplevel, captop);
288 captop = stack->caplevel;
289 p = stack->p;
290#if defined(DEBUG)
291 printf("**FAIL**\n");
292#endif
293 continue;
294 }
295 case ICloseRunTime: {
296 CapState cs;
297 int rem, res, n;
298 int fr = lua_gettop(L) + 1; /* stack index of first result */
299 cs.s = o; cs.L = L; cs.ocap = capture; cs.ptop = ptop;
300 n = runtimecap(&cs, capture + captop, s, &rem); /* call function */
301 captop -= n; /* remove nested captures */
302 ndyncap -= rem; /* update number of dynamic captures */
303 fr -= rem; /* 'rem' items were popped from Lua stack */
304 res = resdyncaptures(L, fr, s - o, e - o); /* get result */
305 if (res == -1) /* fail? */
306 goto fail;
307 s = o + res; /* else update current position */
308 n = lua_gettop(L) - fr + 1; /* number of new captures */
309 ndyncap += n; /* update number of dynamic captures */
310 if (n > 0) { /* any new capture? */
311 if (fr + n >= SHRT_MAX)
312 luaL_error(L, "too many results in match-time capture");
313 if ((captop += n + 2) >= capsize) { /* 'n' values plus the group's open and close entries */
314 capture = doublecap(L, capture, captop, n + 2, ptop);
315 capsize = 2 * captop;
316 }
317 /* add new captures to 'capture' list */
318 adddyncaptures(s, capture + captop - n - 2, n, fr);
319 }
320 p++;
321 continue;
322 }
323 case ICloseCapture: {
324 const char *s1 = s;
325 assert(captop > 0);
326 /* if possible, turn capture into a full capture */
327 if (capture[captop - 1].siz == 0 &&
328 s1 - capture[captop - 1].s < UCHAR_MAX) { /* last entry is still open and the length + 1 fits in 'siz' (presumably a byte -- confirm in lpcap.h) */
329 capture[captop - 1].siz = s1 - capture[captop - 1].s + 1;
330 p++;
331 continue;
332 }
333 else {
334 capture[captop].siz = 1; /* mark entry as closed */
335 capture[captop].s = s;
336 goto pushcapture;
337 }
338 }
339 case IOpenCapture:
340 capture[captop].siz = 0; /* mark entry as open */
341 capture[captop].s = s;
342 goto pushcapture;
343 case IFullCapture:
344 capture[captop].siz = getoff(p) + 1; /* save capture size */
345 capture[captop].s = s - getoff(p);
346 /* goto pushcapture; */
347 pushcapture: {
348 capture[captop].idx = p->i.key;
349 capture[captop].kind = getkind(p);
350 if (++captop >= capsize) { /* keep at least one free entry */
351 capture = doublecap(L, capture, captop, 0, ptop);
352 capsize = 2 * captop;
353 }
354 p++;
355 continue;
356 }
357 default: assert(0); return NULL;
358 }
359 }
360}
361
362/* }====================================================== */
363
364
diff --git a/lpvm.h b/lpvm.h
new file mode 100644
index 0000000..757b9e1
--- /dev/null
+++ b/lpvm.h
@@ -0,0 +1,58 @@
1/*
2** $Id: lpvm.h,v 1.3 2014/02/21 13:06:41 roberto Exp $
3*/
4
5#if !defined(lpvm_h)
6#define lpvm_h
7
8#include "lpcap.h"
9
10
11/* Virtual Machine's instructions */
12typedef enum Opcode {
13 IAny, /* if no char, fail */
14 IChar, /* if char != aux, fail */
15 ISet, /* if char not in buff, fail */
16 ITestAny, /* if no char, jump to 'offset' */
17 ITestChar, /* if char != aux, jump to 'offset' */
18 ITestSet, /* if char not in buff, jump to 'offset' */
19 ISpan, /* read a span of chars in buff */
20 IBehind, /* walk back 'aux' characters (fail if not possible) */
21 IRet, /* return from a rule */
22 IEnd, /* end of pattern */
23 IChoice, /* stack a choice; next fail will jump to 'offset' */
24 IJmp, /* jump to 'offset' */
25 ICall, /* call rule at 'offset' */
26 IOpenCall, /* call rule number 'key' (must be closed to a ICall) */
27 ICommit, /* pop choice and jump to 'offset' */
28 IPartialCommit, /* update top choice to current position and jump */
29 IBackCommit, /* "fails" but jump to its own 'offset' */
30 IFailTwice, /* pop one choice and then fail */
31 IFail, /* go back to saved state on choice and jump to saved offset */
32 IGiveup, /* internal use */
33 IFullCapture, /* complete capture of last 'off' chars */
34 IOpenCapture, /* start a capture */
35 ICloseCapture, /* close the most recent capture (turned into a full capture when it is still open and short enough) */
36 ICloseRunTime /* close a match-time capture: call its function and use the result as new position and captures */
37} Opcode;
38
39
40
41typedef union Instruction { /* one code slot: an opcode, a jump offset, or raw charset bytes */
42 struct Inst {
43 byte code; /* an 'Opcode' value */
44 byte aux; /* small operand: char to match, distance for IBehind, or packed kind/offset of a capture */
45 short key; /* copied into the capture's 'idx' by the capture instructions */
46 } i;
47 int offset; /* jump offset; lives in the slot after its instruction (see 'getoffset' in lpvm.c) */
48 byte buff[1]; /* first bytes of a charset, which continues over the following slots */
49} Instruction;
50
51
52void printpatt (Instruction *p, int n); /* NOTE(review): not defined in lpvm.c; presumably provided by lpprint.c -- confirm */
53const char *match (lua_State *L, const char *o, const char *s, const char *e,
54 Instruction *op, Capture *capture, int ptop); /* the VM interpreter, defined in lpvm.c */
55
56
57#endif
58
diff --git a/makefile b/makefile
new file mode 100644
index 0000000..d803c12
--- /dev/null
+++ b/makefile
@@ -0,0 +1,55 @@
1LIBNAME = lpeg
2LUADIR = ../lua/
3
4# COPT = -O2 -DNDEBUG
5COPT = -g
6
7CWARNS = -Wall -Wextra -pedantic \
8 -Waggregate-return \
9 -Wcast-align \
10 -Wcast-qual \
11 -Wdisabled-optimization \
12 -Wpointer-arith \
13 -Wshadow \
14 -Wsign-compare \
15 -Wundef \
16 -Wwrite-strings \
17 -Wbad-function-cast \
18 -Wdeclaration-after-statement \
19 -Wmissing-prototypes \
20 -Wnested-externs \
21 -Wstrict-prototypes \
22# -Wunreachable-code \
23
24
25CFLAGS = $(CWARNS) $(COPT) -std=c99 -I$(LUADIR) -fPIC
26CC = gcc
27
28FILES = lpvm.o lpcap.o lptree.o lpcode.o lpprint.o
29
30# For Linux (rebuilds lpeg.so with the platform link flags; $(MAKE) re-invokes the same make and passes options such as -n and -j down)
31linux:
32 $(MAKE) lpeg.so "DLLFLAGS = -shared -fPIC"
33
34# For Mac OS (same scheme, with the Mac OS link flags)
35macosx:
36 $(MAKE) lpeg.so "DLLFLAGS = -bundle -undefined dynamic_lookup"
37
38lpeg.so: $(FILES)
39 env $(CC) $(DLLFLAGS) $(FILES) -o lpeg.so
40
41$(FILES): makefile
42
43test: test.lua re.lua lpeg.so
44 ./test.lua
45
46clean:
47 rm -f $(FILES) lpeg.so
48.PHONY: linux macosx test clean
49
50lpcap.o: lpcap.c lpcap.h lptypes.h
51lpcode.o: lpcode.c lptypes.h lpcode.h lptree.h lpvm.h lpcap.h
52lpprint.o: lpprint.c lptypes.h lpprint.h lptree.h lpvm.h lpcap.h
53lptree.o: lptree.c lptypes.h lpcap.h lpcode.h lptree.h lpvm.h lpprint.h
54lpvm.o: lpvm.c lpcap.h lptypes.h lpvm.h lpprint.h lptree.h
55
diff --git a/pack b/pack
new file mode 100755
index 0000000..7f5556f
--- /dev/null
+++ b/pack
@@ -0,0 +1,15 @@
1#!/bin/bash
2FILES="makefile HISTORY test.lua re.lua lpeg.html re.html lpeg-128.gif \
3 lptypes.h lpcap.h lpcap.c lpcode.h lpcode.c lpprint.h lpprint.c \
4 lptree.h lptree.c lpvm.h lpvm.c"
5NAME=lpeg-${1:?usage: pack VERSION} # abort with a usage message when no version argument is given
6DIRN=versions/$NAME
7mkdir -p "$DIRN" || exit 1 # also creates 'versions' when it is missing
8co $FILES # check the files out; $FILES stays unquoted on purpose (one word per file)
9mv $FILES "$DIRN"
10cd versions || exit 1 # never build the tarball from the wrong directory
11tar --create --gzip --file="$NAME.tar.gz" "$NAME"
12# scp $NAME.tar.gz obaluae:public_html/lpeg/
13# ssh obaluae "rm public_html/lpeg/*.html"
14cd "$NAME"
15# scp *.html obaluae:public_html/lpeg/
diff --git a/re.html b/re.html
new file mode 100644
index 0000000..c7d575b
--- /dev/null
+++ b/re.html
@@ -0,0 +1,500 @@
1<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3<html>
4<head>
5 <title>LPeg.re - Regex syntax for LPEG</title>
6 <link rel="stylesheet"
7 href="http://www.inf.puc-rio.br/~roberto/lpeg/doc.css"
8 type="text/css"/>
9 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
10</head>
11<body>
12
13<!-- $Id: re.html,v 1.25 2018/06/04 16:21:19 roberto Exp $ -->
14
15<div id="container">
16
17<div id="product">
18 <div id="product_logo">
19 <a href="http://www.inf.puc-rio.br/~roberto/lpeg/">
20 <img alt="LPeg logo" src="lpeg-128.gif"/>
21 </a>
22 </div>
23 <div id="product_name"><big><strong>LPeg.re</strong></big></div>
24 <div id="product_description">
25 Regex syntax for LPEG
26 </div>
27</div> <!-- id="product" -->
28
29<div id="main">
30
31<div id="navigation">
32<h1>re</h1>
33
34<ul>
35 <li><a href="#basic">Basic Constructions</a></li>
36 <li><a href="#func">Functions</a></li>
37 <li><a href="#ex">Some Examples</a></li>
38 <li><a href="#license">License</a></li>
39
40
41</ul>
42</div> <!-- id="navigation" -->
43
44<div id="content">
45
46<h2><a name="basic"></a>The <code>re</code> Module</h2>
47
48<p>
49The <code>re</code> module
50(provided by file <code>re.lua</code> in the distribution)
51supports a somewhat conventional regex syntax
52for pattern usage within <a href="lpeg.html">LPeg</a>.
53</p>
54
55<p>
56The next table summarizes <code>re</code>'s syntax.
57A <code>p</code> represents an arbitrary pattern;
58<code>num</code> represents a number (<code>[0-9]+</code>);
59<code>name</code> represents an identifier
60(<code>[a-zA-Z][a-zA-Z0-9_]*</code>).
61Constructions are listed in order of decreasing precedence.</p>
62<table border="1">
63<tbody><tr><td><b>Syntax</b></td><td><b>Description</b></td></tr>
64<tr><td><code>( p )</code></td> <td>grouping</td></tr>
65<tr><td><code>'string'</code></td> <td>literal string</td></tr>
66<tr><td><code>"string"</code></td> <td>literal string</td></tr>
67<tr><td><code>[class]</code></td> <td>character class</td></tr>
68<tr><td><code>.</code></td> <td>any character</td></tr>
69<tr><td><code>%name</code></td>
70 <td>pattern <code>defs[name]</code> or a pre-defined pattern</td></tr>
71<tr><td><code>name</code></td><td>non terminal</td></tr>
72<tr><td><code>&lt;name&gt;</code></td><td>non terminal</td></tr>
73<tr><td><code>{}</code></td> <td>position capture</td></tr>
74<tr><td><code>{ p }</code></td> <td>simple capture</td></tr>
75<tr><td><code>{: p :}</code></td> <td>anonymous group capture</td></tr>
76<tr><td><code>{:name: p :}</code></td> <td>named group capture</td></tr>
77<tr><td><code>{~ p ~}</code></td> <td>substitution capture</td></tr>
78<tr><td><code>{| p |}</code></td> <td>table capture</td></tr>
79<tr><td><code>=name</code></td> <td>back reference
80</td></tr>
81<tr><td><code>p ?</code></td> <td>optional match</td></tr>
82<tr><td><code>p *</code></td> <td>zero or more repetitions</td></tr>
83<tr><td><code>p +</code></td> <td>one or more repetitions</td></tr>
84<tr><td><code>p^num</code></td> <td>exactly <code>num</code> repetitions</td></tr>
85<tr><td><code>p^+num</code></td>
86 <td>at least <code>num</code> repetitions</td></tr>
87<tr><td><code>p^-num</code></td>
88 <td>at most <code>num</code> repetitions</td></tr>
89<tr><td><code>p -&gt; 'string'</code></td> <td>string capture</td></tr>
90<tr><td><code>p -&gt; "string"</code></td> <td>string capture</td></tr>
91<tr><td><code>p -&gt; num</code></td> <td>numbered capture</td></tr>
92<tr><td><code>p -&gt; name</code></td> <td>function/query/string capture
93equivalent to <code>p / defs[name]</code></td></tr>
94<tr><td><code>p =&gt; name</code></td> <td>match-time capture
95equivalent to <code>lpeg.Cmt(p, defs[name])</code></td></tr>
96<tr><td><code>p ~&gt; name</code></td> <td>fold capture
97equivalent to <code>lpeg.Cf(p, defs[name])</code></td></tr>
98<tr><td><code>&amp; p</code></td> <td>and predicate</td></tr>
99<tr><td><code>! p</code></td> <td>not predicate</td></tr>
100<tr><td><code>p1 p2</code></td> <td>concatenation</td></tr>
101<tr><td><code>p1 / p2</code></td> <td>ordered choice</td></tr>
102<tr><td>(<code>name &lt;- p</code>)<sup>+</sup></td> <td>grammar</td></tr>
103</tbody></table>
104<p>
105Any space appearing in a syntax description can be
106replaced by zero or more space characters and Lua-style comments
107(<code>--</code> until end of line).
108</p>
109
110<p>
111Character classes define sets of characters.
112An initial <code>^</code> complements the resulting set.
113A range <em>x</em><code>-</code><em>y</em> includes in the set
114all characters with codes between the codes of <em>x</em> and <em>y</em>.
115A pre-defined class <code>%</code><em>name</em> includes all
116characters of that class.
117A simple character includes itself in the set.
118The only special characters inside a class are <code>^</code>
119(special only if it is the first character);
120<code>]</code>
121(can be included in the set as the first character,
122after the optional <code>^</code>);
123<code>%</code> (special only if followed by a letter);
124and <code>-</code>
125(can be included in the set as the first or the last character).
126</p>
127
128<p>
129Currently the pre-defined classes are similar to those from
130Lua's string library
131(<code>%a</code> for letters,
132<code>%A</code> for non letters, etc.).
133There is also a class <code>%nl</code>
134containing only the newline character,
135which is particularly handy for grammars written inside long strings,
136as long strings do not interpret escape sequences like <code>\n</code>.
137</p>
138
139
140<h2><a name="func">Functions</a></h2>
141
142<h3><code>re.compile (string [, defs])</code></h3>
143<p>
144Compiles the given string and
145returns an equivalent LPeg pattern.
146The given string may define either an expression or a grammar.
147The optional <code>defs</code> table provides extra Lua values
148to be used by the pattern.
149</p>
150
151<h3><code>re.find (subject, pattern [, init])</code></h3>
152<p>
153Searches the given pattern in the given subject.
154If it finds a match,
155returns the index where this occurrence starts and
156the index where it ends.
157Otherwise, returns nil.
158</p>
159
160<p>
161An optional numeric argument <code>init</code> makes the search
162start at that position in the subject string.
163As usual in Lua libraries,
164a negative value counts from the end.
165</p>
166
167<h3><code>re.gsub (subject, pattern, replacement)</code></h3>
168<p>
169Does a <em>global substitution</em>,
170replacing all occurrences of <code>pattern</code>
171in the given <code>subject</code> by <code>replacement</code>.</p>
172
173<h3><code>re.match (subject, pattern)</code></h3>
174<p>
175Matches the given pattern against the given subject,
176returning all captures.
177</p>
178
179<h3><code>re.updatelocale ()</code></h3>
180<p>
181Updates the pre-defined character classes to the current locale.
182</p>
183
184
185<h2><a name="ex">Some Examples</a></h2>
186
187<h3>A complete simple program</h3>
188<p>
189The next code shows a simple complete Lua program using
190the <code>re</code> module:
191</p>
192<pre class="example">
193local re = require"re"
194
195-- find the position of the first numeral in a string
196print(re.find("the number 423 is odd", "[0-9]+")) --&gt; 12 14
197
198-- returns all words in a string
199print(re.match("the number 423 is odd", "({%a+} / .)*"))
200--&gt; the number is odd
201
202-- returns the first numeral in a string
203print(re.match("the number 423 is odd", "s &lt;- {%d+} / . s"))
204--&gt; 423
205
206print(re.gsub("hello World", "[aeiou]", "."))
207--&gt; h.ll. W.rld
208</pre>
209
210
211<h3>Balanced parentheses</h3>
212<p>
213The following call will produce the same pattern produced by the
214Lua expression in the
215<a href="lpeg.html#balanced">balanced parentheses</a> example:
216</p>
217<pre class="example">
218b = re.compile[[ balanced &lt;- "(" ([^()] / balanced)* ")" ]]
219</pre>
220
221<h3>String reversal</h3>
222<p>
223The next example reverses a string:
224</p>
225<pre class="example">
226rev = re.compile[[ R &lt;- (!.) -&gt; '' / ({.} R) -&gt; '%2%1']]
227print(rev:match"0123456789") --&gt; 9876543210
228</pre>
229
230<h3>CSV decoder</h3>
231<p>
232The next example replicates the <a href="lpeg.html#CSV">CSV decoder</a>:
233</p>
234<pre class="example">
235record = re.compile[[
236 record &lt;- {| field (',' field)* |} (%nl / !.)
237 field &lt;- escaped / nonescaped
238 nonescaped &lt;- { [^,"%nl]* }
239 escaped &lt;- '"' {~ ([^"] / '""' -&gt; '"')* ~} '"'
240]]
241</pre>
242
243<h3>Lua's long strings</h3>
244<p>
245The next example matches Lua long strings:
246</p>
247<pre class="example">
248c = re.compile([[
249 longstring &lt;- ('[' {:eq: '='* :} '[' close)
250 close &lt;- ']' =eq ']' / . close
251]])
252
253print(c:match'[==[]]===]]]]==]===[]') --&gt; 17
254</pre>
255
256<h3>Abstract Syntax Trees</h3>
257<p>
258This example shows a simple way to build an
259abstract syntax tree (AST) for a given grammar.
260To keep our example simple,
261let us consider the following grammar
262for lists of names:
263</p>
264<pre class="example">
265p = re.compile[[
266 listname &lt;- (name s)*
267 name &lt;- [a-z][a-z]*
268 s &lt;- %s*
269]]
270</pre>
271<p>
272Now, we will add captures to build a corresponding AST.
273As a first step, the pattern will build a table to
274represent each non terminal;
275terminals will be represented by their corresponding strings:
276</p>
277<pre class="example">
278c = re.compile[[
279 listname &lt;- {| (name s)* |}
280 name &lt;- {| {[a-z][a-z]*} |}
281 s &lt;- %s*
282]]
283</pre>
284<p>
285Now, a match against <code>"hi hello bye"</code>
286results in the table
287<code>{{"hi"}, {"hello"}, {"bye"}}</code>.
288</p>
289<p>
290For such a simple grammar,
291this AST is more than enough;
292actually, the tables around each single name
293are already overkill.
294More complex grammars,
295however, may need some more structure.
296Specifically,
297it would be useful if each table had
298a <code>tag</code> field telling what non terminal
299that table represents.
300We can add such a tag using
301<a href="lpeg.html#cap-g">named group captures</a>:
302</p>
303<pre class="example">
304x = re.compile[[
305 listname &lt;- {| {:tag: '' -&gt; 'list':} (name s)* |}
306 name &lt;- {| {:tag: '' -&gt; 'id':} {[a-z][a-z]*} |}
307 s &lt;- ' '*
308]]
309</pre>
310<p>
311With these group captures,
312a match against <code>"hi hello bye"</code>
313results in the following table:
314</p>
315<pre class="example">
316{tag="list",
317 {tag="id", "hi"},
318 {tag="id", "hello"},
319 {tag="id", "bye"}
320}
321</pre>
322
323
324<h3>Indented blocks</h3>
325<p>
326This example breaks indented blocks into tables,
327respecting the indentation:
328</p>
329<pre class="example">
330p = re.compile[[
331 block &lt;- {| {:ident:' '*:} line
332 ((=ident !' ' line) / &(=ident ' ') block)* |}
333 line &lt;- {[^%nl]*} %nl
334]]
335</pre>
336<p>
337As an example,
338consider the following text:
339</p>
340<pre class="example">
341t = p:match[[
342first line
343 subline 1
344 subline 2
345second line
346third line
347 subline 3.1
348 subline 3.1.1
349 subline 3.2
350]]
351</pre>
352<p>
353The resulting table <code>t</code> will be like this:
354</p>
355<pre class="example">
356 {'first line'; {'subline 1'; 'subline 2'; ident = ' '};
357 'second line';
358 'third line'; { 'subline 3.1'; {'subline 3.1.1'; ident = ' '};
359 'subline 3.2'; ident = ' '};
360 ident = ''}
361</pre>
362
363<h3>Macro expander</h3>
364<p>
365This example implements a simple macro expander.
366Macros must be defined as part of the pattern,
367following some simple rules:
368</p>
369<pre class="example">
370p = re.compile[[
371 text &lt;- {~ item* ~}
372 item &lt;- macro / [^()] / '(' item* ')'
373 arg &lt;- ' '* {~ (!',' item)* ~}
374 args &lt;- '(' arg (',' arg)* ')'
375 -- now we define some macros
376 macro &lt;- ('apply' args) -&gt; '%1(%2)'
377 / ('add' args) -&gt; '%1 + %2'
378 / ('mul' args) -&gt; '%1 * %2'
379]]
380
381print(p:match"add(mul(a,b), apply(f,x))") --&gt; a * b + f(x)
382</pre>
383<p>
384A <code>text</code> is a sequence of items,
385wherein we apply a substitution capture to expand any macros.
386An <code>item</code> is either a macro,
387any character different from parentheses,
388or a parenthesized expression.
389A macro argument (<code>arg</code>) is a sequence
390of items different from a comma.
391(Note that a comma may appear inside an item,
392e.g., inside a parenthesized expression.)
393Again we do a substitution capture to expand any macro
394in the argument before expanding the outer macro.
395<code>args</code> is a list of arguments separated by commas.
396Finally we define the macros.
397Each macro is a string substitution;
398it replaces the macro name and its arguments by its corresponding string,
399with each <code>%</code><em>n</em> replaced by the <em>n</em>-th argument.
400</p>
401
402<h3>Patterns</h3>
403<p>
404This example shows the complete syntax
405of patterns accepted by <code>re</code>.
406</p>
407<pre class="example">
408p = [=[
409
410pattern &lt;- exp !.
411exp &lt;- S (grammar / alternative)
412
413alternative &lt;- seq ('/' S seq)*
414seq &lt;- prefix*
415prefix &lt;- '&amp;' S prefix / '!' S prefix / suffix
416suffix &lt;- primary S (([+*?]
417 / '^' [+-]? num
418 / '-&gt;' S (string / '{}' / name)
419 / '=&gt;' S name) S)*
420
421primary &lt;- '(' exp ')' / string / class / defined
422 / '{:' (name ':')? exp ':}'
423 / '=' name
424 / '{}'
425 / '{~' exp '~}'
426 / '{' exp '}'
427 / '.'
428 / name S !arrow
429 / '&lt;' name '&gt;' -- old-style non terminals
430
431grammar &lt;- definition+
432definition &lt;- name S arrow exp
433
434class &lt;- '[' '^'? item (!']' item)* ']'
435item &lt;- defined / range / .
436range &lt;- . '-' [^]]
437
438S &lt;- (%s / '--' [^%nl]*)* -- spaces and comments
439name &lt;- [A-Za-z][A-Za-z0-9_]*
440arrow &lt;- '&lt;-'
441num &lt;- [0-9]+
442string &lt;- '"' [^"]* '"' / "'" [^']* "'"
443defined &lt;- '%' name
444
445]=]
446
447print(re.match(p, p)) -- a self description must match itself
448</pre>
449
450
451
452<h2><a name="license">License</a></h2>
453
454<p>
455Copyright &copy; 2008-2015 Lua.org, PUC-Rio.
456</p>
457<p>
458Permission is hereby granted, free of charge,
459to any person obtaining a copy of this software and
460associated documentation files (the "Software"),
461to deal in the Software without restriction,
462including without limitation the rights to use,
463copy, modify, merge, publish, distribute, sublicense,
464and/or sell copies of the Software,
465and to permit persons to whom the Software is
466furnished to do so,
467subject to the following conditions:
468</p>
469
470<p>
471The above copyright notice and this permission notice
472shall be included in all copies or substantial portions of the Software.
473</p>
474
475<p>
476THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
477EXPRESS OR IMPLIED,
478INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
479FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
480IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
481DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
482TORT OR OTHERWISE, ARISING FROM,
483OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
484THE SOFTWARE.
485</p>
486
487</div> <!-- id="content" -->
488
489</div> <!-- id="main" -->
490
491<div id="about">
492<p><small>
493$Id: re.html,v 1.25 2018/06/04 16:21:19 roberto Exp $
494</small></p>
495</div> <!-- id="about" -->
496
497</div> <!-- id="container" -->
498
499</body>
500</html>
diff --git a/re.lua b/re.lua
new file mode 100644
index 0000000..6351929
--- /dev/null
+++ b/re.lua
@@ -0,0 +1,267 @@
1-- $Id: re.lua,v 1.46 2018/06/04 16:21:19 roberto Exp $
2
3-- imported functions and modules
4local tonumber, type, print, error = tonumber, type, print, error
5local setmetatable = setmetatable
6local m = require"lpeg"
7
8-- 'm' will be used to parse expressions, and 'mm' will be used to
9-- create expressions; that is, 're' runs on 'm', creating patterns
10-- on 'mm'
11local mm = m
12
13-- pattern's metatable
14local mt = getmetatable(mm.P(0))
15
16
17
18-- No more global accesses after this point (enforced only in Lua 5.2, where _ENV is set to nil)
19local version = _VERSION
20if version == "Lua 5.2" then _ENV = nil end
21
22
23local any = m.P(1)
24
25
26-- Pre-defined names
27local Predef = { nl = m.P"\n" }
28
29
30local mem
31local fmem
32local gmem
33
34
-- (Re)load the character classes of the current locale into 'Predef'
-- and restart the memoization tables.
-- 'mm.locale' stores the classes under their long names; each class also
-- gets a one-letter alias, and the matching upper-case letter names its
-- complement ('any' minus the class).
local function updatelocale ()
  mm.locale(Predef)
  -- triples: alias, complement alias, long class name
  local classes = {
    "a", "A", "alpha",
    "c", "C", "cntrl",
    "d", "D", "digit",
    "g", "G", "graph",
    "l", "L", "lower",
    "p", "P", "punct",
    "s", "S", "space",
    "u", "U", "upper",
    "w", "W", "alnum",
    "x", "X", "xdigit",
  }
  for k = 1, #classes, 3 do
    local class = Predef[classes[k + 2]]
    Predef[classes[k]] = class
    Predef[classes[k + 1]] = any - class
  end
  -- restart memoization; all three caches share one weak-valued metatable
  local weak = {__mode = "v"}
  mem = setmetatable({}, weak)
  fmem = setmetatable({}, weak)
  gmem = setmetatable({}, weak)
end
65
66
67updatelocale()
68
69
70
71local I = m.P(function (s,i) print(i, s:sub(1, i-1)); return i end)
72
73
-- Raise a "pattern error" that quotes the pattern text at the failure point.
--   s: the pattern source being parsed
--   i: position in 's' where parsing could not continue
-- At most 21 characters of context are quoted; "..." is appended only when
-- text was really cut off (the original test also appended it when exactly
-- 21 characters remained, i.e. when nothing had been omitted).
-- Never returns: the error is raised at level 2.
local function patt_error (s, i)
  local last = i + 20
  local context
  if #s <= last then
    context = s:sub(i)                  -- the whole remainder fits
  else
    context = s:sub(i, last) .. "..."   -- truncated
  end
  error(("pattern error near '%s'"):format(context), 2)
end
80
-- Build the pattern 'p' repeated exactly 'n' times (n >= 0) by binary
-- exponentiation: 'p' is squared for each bit of 'n' and multiplied into
-- the result when that bit is set.
--   p: pattern to repeat
--   n: repetition count (a non-negative number)
-- Returns mm.P(true) when n is 0.
-- 'n' is halved with '/', so it may become fractional; the '>= 1' tests
-- make that harmless and avoid depending on integer division.
local function mult (p, n)
  local np = mm.P(true)
  while n >= 1 do
    if n % 2 >= 1 then np = np * p end
    n = n / 2
    -- square only if another bit remains; the final squaring would build
    -- a pattern twice as large that is never used
    if n >= 1 then p = p * p end
  end
  return np
end
90
-- Match-time check for back references: succeeds when the text of 's'
-- starting at position 'i' is exactly the string 'c'.
-- Returns the position just after that text, or nil when 'c' is not a
-- string or the text differs.
local function equalcap (s, i, c)
  if type(c) == "string" then
    local stop = i + #c
    if c == s:sub(i, stop - 1) then return stop end
  end
  return nil
end
96
97
98local S = (Predef.space + "--" * (any - Predef.nl)^0)^0
99
100local name = m.R("AZ", "az", "__") * m.R("AZ", "az", "__", "09")^0
101
102local arrow = S * "<-"
103
104local seq_follow = m.P"/" + ")" + "}" + ":}" + "~}" + "|}" + (name * arrow) + -1
105
106name = m.C(name)
107
108
109-- a defined name only has meaning in a given environment
110local Def = name * m.Carg(1)
111
112
-- Look up the name 'id' in the table of definitions 'defs'.
-- Returns the value found; raises "undefined name: <id>" when 'defs' is
-- absent or has no true value for 'id'.
local function getdef (id, defs)
  local def = defs and defs[id]
  if def then return def end
  error("undefined name: " .. id)
end
118
119-- match a name and return a group of its corresponding definition
120-- and 'f' (to be folded in 'Suffix')
121local function defwithfunc (f)
122 return m.Cg(Def / getdef * m.Cc(f))
123end
124
125
126local num = m.C(m.R"09"^1) * S / tonumber
127
128local String = "'" * m.C((any - "'")^0) * "'" +
129 '"' * m.C((any - '"')^0) * '"'
130
131
132local defined = "%" * Def / function (c,Defs)
133 local cat = Defs and Defs[c] or Predef[c]
134 if not cat then error ("name '" .. c .. "' undefined") end
135 return cat
136end
137
138local Range = m.Cs(any * (m.P"-"/"") * (any - "]")) / mm.R
139
140local item = (defined + Range + m.C(any)) / m.P
141
142local Class =
143 "["
144 * (m.C(m.P"^"^-1)) -- optional complement symbol
145 * m.Cf(item * (item - "]")^0, mt.__add) /
146 function (c, p) return c == "^" and any - p or p end
147 * "]"
148
-- Add rule 'exp' under name 'k' to the grammar table 't' and return 't'.
-- Raises an error if 't' already has a rule with that name.
local function adddef (t, k, exp)
  if not t[k] then
    t[k] = exp
    return t
  end
  error("'"..k.."' already defined as a rule")
end

-- Create a grammar table from its first rule: the rule name 'n' goes in
-- position 1 and the rule 'r' is stored under that name.
local function firstdef (n, r)
  local grammar = {n}
  return adddef(grammar, n, r)
end
159
160
-- Create a reference to the non-terminal 'n'.
-- 'b' tells whether the name appears inside a grammar; when it is false
-- (or nil) the reference is an error.
local function NT (n, b)
  if b then return mm.V(n) end
  error("rule '"..n.."' used outside a grammar")
end
167
168
-- Grammar for the 're' pattern syntax, written with LPeg combinators.
-- Its captures build the resulting pattern while the text is parsed:
-- the pieces are combined with the pattern metamethods taken from 'mt'
-- and with constructors from 'mm'. The initial rule is "Exp".
169local exp = m.P{ "Exp",
  -- Exp: either a whole grammar, or one or more Seq separated by "/",
  -- folded together with mt.__add
170 Exp = S * ( m.V"Grammar"
171 + m.Cf(m.V"Seq" * ("/" * S * m.V"Seq")^0, mt.__add) );
  -- Seq: zero or more Prefix items folded with mt.__mul, starting from
  -- m.P""; what comes next must match 'seq_follow' (checked without
  -- consuming input), otherwise 'patt_error' is called
172 Seq = m.Cf(m.Cc(m.P"") * m.V"Prefix"^0 , mt.__mul)
173 * (#seq_follow + patt_error);
  -- Prefix: "&" applies mt.__len and "!" applies mt.__unm to the
  -- Prefix that follows; otherwise it is a Suffix
174 Prefix = "&" * S * m.V"Prefix" / mt.__len
175 + "!" * S * m.V"Prefix" / mt.__unm
176 + m.V"Suffix";
  -- Suffix: a Primary followed by any number of suffix operators.
  -- Each operator produces two values, an argument and a function;
  -- the fold function at the end calls function(pattern so far, argument)
177 Suffix = m.Cf(m.V"Primary" * S *
178 ( ( m.P"+" * m.Cc(1, mt.__pow)
179 + m.P"*" * m.Cc(0, mt.__pow)
180 + m.P"?" * m.Cc(-1, mt.__pow)
  -- "^": a plain number goes through 'mult'; a number with an explicit
  -- sign is captured as a string and handed to mt.__pow
181 + "^" * ( m.Cg(num * m.Cc(mult))
182 + m.Cg(m.C(m.S"+-" * m.R"09"^1) * m.Cc(mt.__pow))
183 )
  -- "->": followed by a string or number (mt.__div), by "{}" (m.Ct),
  -- or by a defined name (mt.__div with the definition)
184 + "->" * S * ( m.Cg((String + num) * m.Cc(mt.__div))
185 + m.P"{}" * m.Cc(nil, m.Ct)
186 + defwithfunc(mt.__div)
187 )
  -- "=>" and "~>": followed by a defined name, combined through m.Cmt
  -- and m.Cf respectively
188 + "=>" * S * defwithfunc(m.Cmt)
189 + "~>" * S * defwithfunc(m.Cf)
190 ) * S
191 )^0, function (a,b,f) return f(a,b) end );
  -- Primary: the basic items; alternatives are tried in the order
  -- written, so e.g. "{:", "{}", "{~" and "{|" are tested before the
  -- plain "{" capture
192 Primary = "(" * m.V"Exp" * ")"
193 + String / mm.P
194 + Class
195 + defined
  -- "{: name: exp :}" or "{: exp :}": group capture; the name is nil
  -- when absent
196 + "{:" * (name * ":" + m.Cc(nil)) * m.V"Exp" * ":}" /
197 function (n, p) return mm.Cg(p, n) end
  -- "=name": back reference, checked at match time by 'equalcap'
198 + "=" * name / function (n) return mm.Cmt(mm.Cb(n), equalcap) end
199 + m.P"{}" / mm.Cp
200 + "{~" * m.V"Exp" * "~}" / mm.Cs
201 + "{|" * m.V"Exp" * "|}" / mm.Ct
202 + "{" * m.V"Exp" * "}" / mm.C
203 + m.P"." * m.Cc(any)
  -- a name not followed by an arrow, or "<name>": passed to 'NT'
  -- together with the current value of the group "G"
204 + (name * -arrow + "<" * name * ">") * m.Cb("G") / NT;
  -- Definition: name <- Exp
205 Definition = name * arrow * m.V"Exp";
  -- Grammar: sets the group "G" to true, builds a table from the first
  -- definition ('firstdef'), adds the remaining definitions with
  -- 'adddef', and passes the table to mm.P
206 Grammar = m.Cg(m.Cc(true), "G") *
207 m.Cf(m.V"Definition" / firstdef * m.Cg(m.V"Definition")^0,
208 adddef) / mm.P
209}
210
211local pattern = S * m.Cg(m.Cc(false), "G") * exp / mm.P * (-any + patt_error)
212
213
-- Compile the pattern description 'p' into an LPeg pattern.
--   p:    a string in 're' syntax, or an already compiled pattern
--   defs: optional table of definitions, passed to the parser as its
--         extra argument
-- Returns the pattern ('p' itself when it is already one); raises
-- "incorrect pattern" when the parser produces no result.
local function compile (p, defs)
  if mm.type(p) ~= "pattern" then
    local compiled = pattern:match(p, 1, defs)
    if not compiled then error("incorrect pattern", 3) end
    return compiled
  end
  return p   -- already compiled
end
220
-- Match the pattern 'p' against the string 's', starting at position 'i'
-- (default 1), and return whatever the pattern's match returns.
-- Compiled patterns are memoized in 'mem', keyed by 'p'.
local function match (s, p, i)
  local compiled = mem[p] or compile(p)
  mem[p] = compiled
  return compiled:match(s, i or 1)
end
229
-- Search the string 's' for the pattern 'p', starting at position 'i'
-- (default 1).
-- Returns the positions where the match starts and ends, or nil when
-- there is no match. Search patterns are memoized in 'fmem'.
local function find (s, p, i)
  local searcher = fmem[p]
  if not searcher then
    -- '/ 0' leaves the pattern without capture values; the grammar then
    -- tries it at the current position and, failing that, skips one
    -- character and tries again, capturing both ends of the match
    local body = compile(p) / 0
    searcher = mm.P{ mm.Cp() * body * mm.Cp() + 1 * mm.V(1) }
    fmem[p] = searcher
  end
  local first, past = searcher:match(s, i or 1)
  if not first then return first end
  return first, past - 1
end
242
-- Substitute 'rep' for every match of the pattern 'p' in the string 's'
-- and return the result of the substitution capture.
-- Substitution patterns are memoized in 'gmem': one table per 'p',
-- indexed by 'rep'.
local function gsub (s, p, rep)
  -- keep the per-pattern table in a local so that gmem[p] is not
  -- collected while it is in use here
  local cache = gmem[p]
  if not cache then cache = {} end
  gmem[p] = cache
  local subst = cache[rep]
  if not subst then
    subst = mm.Cs((compile(p) / rep + 1)^0)
    cache[rep] = subst
  end
  return subst:match(s)
end
254
255
256-- exported names
257local re = {
258 compile = compile,
259 match = match,
260 find = find,
261 gsub = gsub,
262 updatelocale = updatelocale,
263}
264
265if version == "Lua 5.1" then _G.re = re end
266
267return re
diff --git a/test.lua b/test.lua
new file mode 100755
index 0000000..51c5204
--- /dev/null
+++ b/test.lua
@@ -0,0 +1,1513 @@
1#!/usr/bin/env lua
2
3-- $Id: test.lua,v 1.114 2018/06/04 16:21:19 roberto Exp $
4
5-- require"strict" -- just to be pedantic
6
7local m = require"lpeg"
8
9
10-- for general use
11local a, b, c, d, e, f, g, p, t
12
13
14-- compatibility with Lua 5.2
15local unpack = rawget(table, "unpack") or unpack
16local loadstring = rawget(_G, "loadstring") or load
17
18
19local any = m.P(1)
20local space = m.S" \t\n"^0
21
-- Assert that 'x' and 'y' are equal.
-- Non-table values must be '=='; tables are compared recursively, key by
-- key, in both directions (so neither may have an entry the other lacks).
-- If 'p' is true, every compared pair is printed (debugging aid).
local function checkeq (x, y, p)
  if p then print(x, y) end
  if type(x) ~= "table" then
    assert(x == y)
  else
    -- report a mismatch as an assertion failure instead of letting
    -- 'y[k]' / 'pairs(y)' raise an unrelated runtime error
    assert(type(y) == "table", "table expected")
    for k, v in pairs(x) do checkeq(v, y[k], p) end
    for k, v in pairs(y) do checkeq(v, x[k], p) end
  end
end
30
31
32local mt = getmetatable(m.P(1))
33
34
35local allchar = {}
36for i=0,255 do allchar[i + 1] = i end
37allchar = string.char(unpack(allchar))
38assert(#allchar == 256)
39
40local function cs2str (c)
41 return m.match(m.Cs((c + m.P(1)/"")^0), allchar)
42end
43
44local function eqcharset (c1, c2)
45 assert(cs2str(c1) == cs2str(c2))
46end
47
48
49print"General tests for LPeg library"
50
51assert(type(m.version()) == "string")
52print("version " .. m.version())
53assert(m.type("alo") ~= "pattern")
54assert(m.type(io.input) ~= "pattern")
55assert(m.type(m.P"alo") == "pattern")
56
57-- tests for some basic optimizations
58assert(m.match(m.P(false) + "a", "a") == 2)
59assert(m.match(m.P(true) + "a", "a") == 1)
60assert(m.match("a" + m.P(false), "b") == nil)
61assert(m.match("a" + m.P(true), "b") == 1)
62
63assert(m.match(m.P(false) * "a", "a") == nil)
64assert(m.match(m.P(true) * "a", "a") == 2)
65assert(m.match("a" * m.P(false), "a") == nil)
66assert(m.match("a" * m.P(true), "a") == 2)
67
68assert(m.match(#m.P(false) * "a", "a") == nil)
69assert(m.match(#m.P(true) * "a", "a") == 2)
70assert(m.match("a" * #m.P(false), "a") == nil)
71assert(m.match("a" * #m.P(true), "a") == 2)
72
73
74-- tests for locale
75do
76 assert(m.locale(m) == m)
77 local t = {}
78 assert(m.locale(t, m) == t)
79 local x = m.locale()
80 for n,v in pairs(x) do
81 assert(type(n) == "string")
82 eqcharset(v, m[n])
83 end
84end
85
86
87assert(m.match(3, "aaaa"))
88assert(m.match(4, "aaaa"))
89assert(not m.match(5, "aaaa"))
90assert(m.match(-3, "aa"))
91assert(not m.match(-3, "aaa"))
92assert(not m.match(-3, "aaaa"))
93assert(not m.match(-4, "aaaa"))
94assert(m.P(-5):match"aaaa")
95
96assert(m.match("a", "alo") == 2)
97assert(m.match("al", "alo") == 3)
98assert(not m.match("alu", "alo"))
99assert(m.match(true, "") == 1)
100
101local digit = m.S"0123456789"
102local upper = m.S"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
103local lower = m.S"abcdefghijklmnopqrstuvwxyz"
104local letter = m.S"" + upper + lower
105local alpha = letter + digit + m.R()
106
107eqcharset(m.S"", m.P(false))
108eqcharset(upper, m.R("AZ"))
109eqcharset(lower, m.R("az"))
110eqcharset(upper + lower, m.R("AZ", "az"))
111eqcharset(upper + lower, m.R("AZ", "cz", "aa", "bb", "90"))
112eqcharset(digit, m.S"01234567" + "8" + "9")
113eqcharset(upper, letter - lower)
114eqcharset(m.S(""), m.R())
115assert(cs2str(m.S("")) == "")
116
117eqcharset(m.S"\0", "\0")
118eqcharset(m.S"\1\0\2", m.R"\0\2")
119eqcharset(m.S"\1\0\2", m.R"\1\2" + "\0")
120eqcharset(m.S"\1\0\2" - "\0", m.R"\1\2")
121
122local word = alpha^1 * (1 - alpha)^0
123
124assert((word^0 * -1):match"alo alo")
125assert(m.match(word^1 * -1, "alo alo"))
126assert(m.match(word^2 * -1, "alo alo"))
127assert(not m.match(word^3 * -1, "alo alo"))
128
129assert(not m.match(word^-1 * -1, "alo alo"))
130assert(m.match(word^-2 * -1, "alo alo"))
131assert(m.match(word^-3 * -1, "alo alo"))
132
133local eos = m.P(-1)
134
135assert(m.match(digit^0 * letter * digit * eos, "1298a1"))
136assert(not m.match(digit^0 * letter * eos, "1257a1"))
137
138b = {
139 [1] = "(" * (((1 - m.S"()") + #m.P"(" * m.V(1))^0) * ")"
140}
141
142assert(m.match(b, "(al())()"))
143assert(not m.match(b * eos, "(al())()"))
144assert(m.match(b * eos, "((al())()(é))"))
145assert(not m.match(b, "(al()()"))
146
147assert(not m.match(letter^1 - "for", "foreach"))
148assert(m.match(letter^1 - ("for" * eos), "foreach"))
149assert(not m.match(letter^1 - ("for" * eos), "for"))
150
151function basiclookfor (p)
152 return m.P {
153 [1] = p + (1 * m.V(1))
154 }
155end
156
157function caplookfor (p)
158 return basiclookfor(p:C())
159end
160
161assert(m.match(caplookfor(letter^1), " 4achou123...") == "achou")
162a = {m.match(caplookfor(letter^1)^0, " two words, one more ")}
163checkeq(a, {"two", "words", "one", "more"})
164
165assert(m.match( basiclookfor((#m.P(b) * 1) * m.Cp()), " ( (a)") == 7)
166
167a = {m.match(m.C(digit^1 * m.Cc"d") + m.C(letter^1 * m.Cc"l"), "123")}
168checkeq(a, {"123", "d"})
169
170-- bug in LPeg 0.12 (nil value does not create a 'ktable')
171assert(m.match(m.Cc(nil), "") == nil)
172
173a = {m.match(m.C(digit^1 * m.Cc"d") + m.C(letter^1 * m.Cc"l"), "abcd")}
174checkeq(a, {"abcd", "l"})
175
176a = {m.match(m.Cc(10,20,30) * 'a' * m.Cp(), 'aaa')}
177checkeq(a, {10,20,30,2})
178a = {m.match(m.Cp() * m.Cc(10,20,30) * 'a' * m.Cp(), 'aaa')}
179checkeq(a, {1,10,20,30,2})
180a = m.match(m.Ct(m.Cp() * m.Cc(10,20,30) * 'a' * m.Cp()), 'aaa')
181checkeq(a, {1,10,20,30,2})
182a = m.match(m.Ct(m.Cp() * m.Cc(7,8) * m.Cc(10,20,30) * 'a' * m.Cp()), 'aaa')
183checkeq(a, {1,7,8,10,20,30,2})
184a = {m.match(m.Cc() * m.Cc() * m.Cc(1) * m.Cc(2,3,4) * m.Cc() * 'a', 'aaa')}
185checkeq(a, {1,2,3,4})
186
187a = {m.match(m.Cp() * letter^1 * m.Cp(), "abcd")}
188checkeq(a, {1, 5})
189
190
191t = {m.match({[1] = m.C(m.C(1) * m.V(1) + -1)}, "abc")}
192checkeq(t, {"abc", "a", "bc", "b", "c", "c", ""})
193
194-- bug in 0.12 ('hascapture' did not check for captures inside a rule)
195do
196 local pat = m.P{
197 'S';
198 S1 = m.C('abc') + 3,
199 S = #m.V('S1') -- rule has capture, but '#' must ignore it
200 }
201 assert(pat:match'abc' == 1)
202end
203
204
205-- bug: loop in 'hascaptures'
206do
207 local p = m.C(-m.P{m.P'x' * m.V(1) + m.P'y'})
208 assert(p:match("xxx") == "")
209end
210
211
212
213-- test for small capture boundary
214for i = 250,260 do
215 assert(#m.match(m.C(i), string.rep('a', i)) == i)
216 assert(#m.match(m.C(m.C(i)), string.rep('a', i)) == i)
217end
218
219-- tests for any*n and any*-n
220for n = 1, 550, 13 do
221 local x_1 = string.rep('x', n - 1)
222 local x = x_1 .. 'a'
223 assert(not m.P(n):match(x_1))
224 assert(m.P(n):match(x) == n + 1)
225 assert(n < 4 or m.match(m.P(n) + "xxx", x_1) == 4)
226 assert(m.C(n):match(x) == x)
227 assert(m.C(m.C(n)):match(x) == x)
228 assert(m.P(-n):match(x_1) == 1)
229 assert(not m.P(-n):match(x))
230 assert(n < 13 or m.match(m.Cc(20) * ((n - 13) * m.P(10)) * 3, x) == 20)
231 local n3 = math.floor(n/3)
232 assert(m.match(n3 * m.Cp() * n3 * n3, x) == n3 + 1)
233end
234
235-- true values
236assert(m.P(0):match("x") == 1)
237assert(m.P(0):match("") == 1)
238assert(m.C(0):match("x") == "")
239
240assert(m.match(m.Cc(0) * m.P(10) + m.Cc(1) * "xuxu", "xuxu") == 1)
241assert(m.match(m.Cc(0) * m.P(10) + m.Cc(1) * "xuxu", "xuxuxuxuxu") == 0)
242assert(m.match(m.C(m.P(2)^1), "abcde") == "abcd")
243p = m.Cc(0) * 1 + m.Cc(1) * 2 + m.Cc(2) * 3 + m.Cc(3) * 4
244
245
246-- test for alternation optimization
247assert(m.match(m.P"a"^1 + "ab" + m.P"x"^0, "ab") == 2)
248assert(m.match((m.P"a"^1 + "ab" + m.P"x"^0 * 1)^0, "ab") == 3)
249assert(m.match(m.P"ab" + "cd" + "" + "cy" + "ak", "98") == 1)
250assert(m.match(m.P"ab" + "cd" + "ax" + "cy", "ax") == 3)
251assert(m.match("a" * m.P"b"^0 * "c" + "cd" + "ax" + "cy", "ax") == 3)
252assert(m.match((m.P"ab" + "cd" + "ax" + "cy")^0, "ax") == 3)
253assert(m.match(m.P(1) * "x" + m.S"" * "xu" + "ay", "ay") == 3)
254assert(m.match(m.P"abc" + "cde" + "aka", "aka") == 4)
255assert(m.match(m.S"abc" * "x" + "cde" + "aka", "ax") == 3)
256assert(m.match(m.S"abc" * "x" + "cde" + "aka", "aka") == 4)
257assert(m.match(m.S"abc" * "x" + "cde" + "aka", "cde") == 4)
258assert(m.match(m.S"abc" * "x" + "ide" + m.S"ab" * "ka", "aka") == 4)
259assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "cde" + "aka", "ax") == 3)
260assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "cde" + "aka", "aka") == 4)
261assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "cde" + "aka", "cde") == 4)
262assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "ide" + m.S"ab" * "ka", "aka") == 4)
263assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "ide" + m.S"ab" * "ka", "ax") == 3)
264assert(m.match(m.P(1) * "x" + "cde" + m.S"ab" * "ka", "aka") == 4)
265assert(m.match(m.P(1) * "x" + "cde" + m.P(1) * "ka", "aka") == 4)
266assert(m.match(m.P(1) * "x" + "cde" + m.P(1) * "ka", "cde") == 4)
267assert(m.match(m.P"eb" + "cd" + m.P"e"^0 + "x", "ee") == 3)
268assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x", "abcd") == 3)
269assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x", "eeex") == 4)
270assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x", "cd") == 3)
271assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x", "x") == 1)
272assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x" + "", "zee") == 1)
273assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x", "abcd") == 3)
274assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x", "eeex") == 4)
275assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x", "cd") == 3)
276assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x", "x") == 2)
277assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x" + "", "zee") == 1)
278assert(not m.match(("aa" * m.P"bc"^-1 + "aab") * "e", "aabe"))
279
280assert(m.match("alo" * (m.P"\n" + -1), "alo") == 4)
281
282
283-- bug in 0.12 (rc1)
284assert(m.match((m.P"\128\187\191" + m.S"abc")^0, "\128\187\191") == 4)
285
286assert(m.match(m.S"\0\128\255\127"^0, string.rep("\0\128\255\127", 10)) ==
287 4*10 + 1)
288
289-- optimizations with optional parts
290assert(m.match(("ab" * -m.P"c")^-1, "abc") == 1)
291assert(m.match(("ab" * #m.P"c")^-1, "abd") == 1)
292assert(m.match(("ab" * m.B"c")^-1, "ab") == 1)
293assert(m.match(("ab" * m.P"cd"^0)^-1, "abcdcdc") == 7)
294
295assert(m.match(m.P"ab"^-1 - "c", "abcd") == 3)
296
297p = ('Aa' * ('Bb' * ('Cc' * m.P'Dd'^0)^0)^0)^-1
298assert(p:match("AaBbCcDdBbCcDdDdDdBb") == 21)
299
300
301-- bug in 0.12.2
302-- p = { ('ab' ('c' 'ef'?)*)? }
303p = m.C(('ab' * ('c' * m.P'ef'^-1)^0)^-1)
304s = "abcefccefc"
305assert(s == p:match(s))
306
307
308pi = "3.14159 26535 89793 23846 26433 83279 50288 41971 69399 37510"
309assert(m.match(m.Cs((m.P"1" / "a" + m.P"5" / "b" + m.P"9" / "c" + 1)^0), pi) ==
310 m.match(m.Cs((m.P(1) / {["1"] = "a", ["5"] = "b", ["9"] = "c"})^0), pi))
311print"+"
312
313
314-- tests for capture optimizations
315assert(m.match((m.P(3) + 4 * m.Cp()) * "a", "abca") == 5)
316t = {m.match(((m.P"a" + m.Cp()) * m.P"x")^0, "axxaxx")}
317checkeq(t, {3, 6})
318
319
320-- tests for numbered captures
321p = m.C(1)
322assert(m.match(m.C(m.C(p * m.C(2)) * m.C(3)) / 3, "abcdefgh") == "a")
323assert(m.match(m.C(m.C(p * m.C(2)) * m.C(3)) / 1, "abcdefgh") == "abcdef")
324assert(m.match(m.C(m.C(p * m.C(2)) * m.C(3)) / 4, "abcdefgh") == "bc")
325assert(m.match(m.C(m.C(p * m.C(2)) * m.C(3)) / 0, "abcdefgh") == 7)
326
327a, b, c = m.match(p * (m.C(p * m.C(2)) * m.C(3) / 4) * p, "abcdefgh")
328assert(a == "a" and b == "efg" and c == "h")
329
330-- test for table captures
331t = m.match(m.Ct(letter^1), "alo")
332checkeq(t, {})
333
334t, n = m.match(m.Ct(m.C(letter)^1) * m.Cc"t", "alo")
335assert(n == "t" and table.concat(t) == "alo")
336
337t = m.match(m.Ct(m.C(m.C(letter)^1)), "alo")
338assert(table.concat(t, ";") == "alo;a;l;o")
339
340t = m.match(m.Ct(m.C(m.C(letter)^1)), "alo")
341assert(table.concat(t, ";") == "alo;a;l;o")
342
343t = m.match(m.Ct(m.Ct((m.Cp() * letter * m.Cp())^1)), "alo")
344assert(table.concat(t[1], ";") == "1;2;2;3;3;4")
345
346t = m.match(m.Ct(m.C(m.C(1) * 1 * m.C(1))), "alo")
347checkeq(t, {"alo", "a", "o"})
348
349
350-- tests for groups
351p = m.Cg(1) -- no capture
352assert(p:match('x') == 'x')
353p = m.Cg(m.P(true)/function () end * 1) -- no value
354assert(p:match('x') == 'x')
355p = m.Cg(m.Cg(m.Cg(m.C(1))))
356assert(p:match('x') == 'x')
357p = m.Cg(m.Cg(m.Cg(m.C(1))^0) * m.Cg(m.Cc(1) * m.Cc(2)))
358t = {p:match'abc'}
359checkeq(t, {'a', 'b', 'c', 1, 2})
360
361p = m.Ct(m.Cg(m.Cc(10), "hi") * m.C(1)^0 * m.Cg(m.Cc(20), "ho"))
362t = p:match''
363checkeq(t, {hi = 10, ho = 20})
364t = p:match'abc'
365checkeq(t, {hi = 10, ho = 20, 'a', 'b', 'c'})
366
367-- non-string group names
368p = m.Ct(m.Cg(1, print) * m.Cg(1, 23.5) * m.Cg(1, io))
369t = p:match('abcdefghij')
370assert(t[print] == 'a' and t[23.5] == 'b' and t[io] == 'c')
371
372
373-- test for error messages
374local function checkerr (msg, f, ...)
375 local st, err = pcall(f, ...)
376 assert(not st and m.match({ m.P(msg) + 1 * m.V(1) }, err))
377end
378
379checkerr("rule '1' may be left recursive", m.match, { m.V(1) * 'a' }, "a")
380checkerr("rule '1' used outside a grammar", m.match, m.V(1), "")
381checkerr("rule 'hiii' used outside a grammar", m.match, m.V('hiii'), "")
382checkerr("rule 'hiii' undefined in given grammar", m.match, { m.V('hiii') }, "")
383checkerr("undefined in given grammar", m.match, { m.V{} }, "")
384
385checkerr("rule 'A' is not a pattern", m.P, { m.P(1), A = {} })
386checkerr("grammar has no initial rule", m.P, { [print] = {} })
387
388-- grammar with a long call chain before left recursion
389p = {'a',
390 a = m.V'b' * m.V'c' * m.V'd' * m.V'a',
391 b = m.V'c',
392 c = m.V'd',
393 d = m.V'e',
394 e = m.V'f',
395 f = m.V'g',
396 g = m.P''
397}
398checkerr("rule 'a' may be left recursive", m.match, p, "a")
399
400-- Bug in peephole optimization of LPeg 0.12 (IJmp -> ICommit)
401-- the next grammar has an original sequence IJmp -> ICommit -> IJmp L1
402-- that is optimized to ICommit L1
403
404p = m.P { (m.P {m.P'abc'} + 'ayz') * m.V'y'; y = m.P'x' }
405assert(p:match('abcx') == 5 and p:match('ayzx') == 5 and not p:match'abc')
406
407
408do
409 -- large dynamic Cc
410 local lim = 2^16 - 1
411 local c = 0
412 local function seq (n)
413 if n == 1 then c = c + 1; return m.Cc(c)
414 else
415 local m = math.floor(n / 2)
416 return seq(m) * seq(n - m)
417 end
418 end
419 p = m.Ct(seq(lim))
420 t = p:match('')
421 assert(t[lim] == lim)
422 checkerr("too many", function () p = p / print end)
423 checkerr("too many", seq, lim + 1)
424end
425
426
427-- tests for non-pattern as arguments to pattern functions
428
429p = { ('a' * m.V(1))^-1 } * m.P'b' * { 'a' * m.V(2); m.V(1)^-1 }
430assert(m.match(p, "aaabaac") == 7)
431
432p = m.P'abc' * 2 * -5 * true * 'de' -- mix of numbers and strings and booleans
433
434assert(p:match("abc01de") == 8)
435assert(p:match("abc01de3456") == nil)
436
437p = 'abc' * (2 * (-5 * (true * m.P'de')))
438
439assert(p:match("abc01de") == 8)
440assert(p:match("abc01de3456") == nil)
441
442p = { m.V(2), m.P"abc" } *
443 (m.P{ "xx", xx = m.P"xx" } + { "x", x = m.P"a" * m.V"x" + "" })
444assert(p:match("abcaaaxx") == 7)
445assert(p:match("abcxx") == 6)
446
447
448-- a large table capture
449t = m.match(m.Ct(m.C('a')^0), string.rep("a", 10000))
450assert(#t == 10000 and t[1] == 'a' and t[#t] == 'a')
451
452print('+')
453
454
455-- bug in 0.10 (rechecking a grammar, after tail-call optimization)
456m.P{ m.P { (m.P(3) + "xuxu")^0 * m.V"xuxu", xuxu = m.P(1) } }
457
458local V = m.V
459
460local Space = m.S(" \n\t")^0
461local Number = m.C(m.R("09")^1) * Space
462local FactorOp = m.C(m.S("+-")) * Space
463local TermOp = m.C(m.S("*/")) * Space
464local Open = "(" * Space
465local Close = ")" * Space
466
467
-- Fold step for additive expressions: combine the accumulated value 'v1'
-- with 'v2' by addition when 'op' is "+", by subtraction otherwise.
-- 'd' must be nil (checks that no extra value reaches the fold).
local function f_factor (v1, op, v2, d)
  assert(d == nil)
  if op ~= "+" then return v1 - v2 end
  return v1 + v2
end
474
475
-- Fold step for multiplicative expressions: combine the accumulated value
-- 'v1' with 'v2' by multiplication when 'op' is "*", by division otherwise.
-- 'd' must be nil (checks that no extra value reaches the fold).
local function f_term (v1, op, v2, d)
  assert(d == nil)
  if op ~= "*" then return v1 / v2 end
  return v1 * v2
end
482
483G = m.P{ "Exp",
484 Exp = m.Cf(V"Factor" * m.Cg(FactorOp * V"Factor")^0, f_factor);
485 Factor = m.Cf(V"Term" * m.Cg(TermOp * V"Term")^0, f_term);
486 Term = Number / tonumber + Open * V"Exp" * Close;
487}
488
489G = Space * G * -1
490
491for _, s in ipairs{" 3 + 5*9 / (1+1) ", "3+4/2", "3+3-3- 9*2+3*9/1- 8"} do
492 assert(m.match(G, s) == loadstring("return "..s)())
493end
494
495
496-- test for grammars (errors deep in calling non-terminals)
497g = m.P{
498 [1] = m.V(2) + "a",
499 [2] = "a" * m.V(3) * "x",
500 [3] = "b" * m.V(3) + "c"
501}
502
503assert(m.match(g, "abbbcx") == 7)
504assert(m.match(g, "abbbbx") == 2)
505
506
507-- tests for \0
508assert(m.match(m.R("\0\1")^1, "\0\1\0") == 4)
509assert(m.match(m.S("\0\1ab")^1, "\0\1\0a") == 5)
510assert(m.match(m.P(1)^3, "\0\1\0a") == 5)
511assert(not m.match(-4, "\0\1\0a"))
512assert(m.match("\0\1\0a", "\0\1\0a") == 5)
513assert(m.match("\0\0\0", "\0\0\0") == 4)
514assert(not m.match("\0\0\0", "\0\0"))
515
516
517-- tests for predicates
518assert(not m.match(-m.P("a") * 2, "alo"))
519assert(m.match(- -m.P("a") * 2, "alo") == 3)
520assert(m.match(#m.P("a") * 2, "alo") == 3)
521assert(m.match(##m.P("a") * 2, "alo") == 3)
522assert(not m.match(##m.P("c") * 2, "alo"))
523assert(m.match(m.Cs((##m.P("a") * 1 + m.P(1)/".")^0), "aloal") == "a..a.")
524assert(m.match(m.Cs((#((#m.P"a")/"") * 1 + m.P(1)/".")^0), "aloal") == "a..a.")
525assert(m.match(m.Cs((- -m.P("a") * 1 + m.P(1)/".")^0), "aloal") == "a..a.")
526assert(m.match(m.Cs((-((-m.P"a")/"") * 1 + m.P(1)/".")^0), "aloal") == "a..a.")
527
528
529-- fixed length
530do
531 -- 'and' predicate using fixed length
532 local p = m.C(#("a" * (m.P("bd") + "cd")) * 2)
533 assert(p:match("acd") == "ac")
534
535 p = #m.P{ "a" * m.V(2), m.P"b" } * 2
536 assert(p:match("abc") == 3)
537
538 p = #(m.P"abc" * m.B"c")
539 assert(p:match("abc") == 1 and not p:match("ab"))
540
541 p = m.P{ "a" * m.V(2), m.P"b"^1 }
542 checkerr("pattern may not have fixed length", m.B, p)
543
544 p = "abc" * (m.P"b"^1 + m.P"a"^0)
545 checkerr("pattern may not have fixed length", m.B, p)
546end
547
548
549p = -m.P'a' * m.Cc(1) + -m.P'b' * m.Cc(2) + -m.P'c' * m.Cc(3)
550assert(p:match('a') == 2 and p:match('') == 1 and p:match('b') == 1)
551
552p = -m.P'a' * m.Cc(10) + #m.P'a' * m.Cc(20)
553assert(p:match('a') == 20 and p:match('') == 10 and p:match('b') == 10)
554
555
556
557-- look-behind predicate
558assert(not m.match(m.B'a', 'a'))
559assert(m.match(1 * m.B'a', 'a') == 2)
560assert(not m.match(m.B(1), 'a'))
561assert(m.match(1 * m.B(1), 'a') == 2)
562assert(m.match(-m.B(1), 'a') == 1)
563assert(m.match(m.B(250), string.rep('a', 250)) == nil)
564assert(m.match(250 * m.B(250), string.rep('a', 250)) == 251)
565
566-- look-behind with an open call
567checkerr("pattern may not have fixed length", m.B, m.V'S1')
568checkerr("too long to look behind", m.B, 260)
569
570B = #letter * -m.B(letter) + -letter * m.B(letter)
571x = m.Ct({ (B * m.Cp())^-1 * (1 * m.V(1) + m.P(true)) })
572checkeq(m.match(x, 'ar cal c'), {1,3,4,7,9,10})
573checkeq(m.match(x, ' ar cal '), {2,4,5,8})
574checkeq(m.match(x, ' '), {})
575checkeq(m.match(x, 'aloalo'), {1,7})
576
577assert(m.match(B, "a") == 1)
578assert(m.match(1 * B, "a") == 2)
579assert(not m.B(1 - letter):match(""))
580assert((-m.B(letter)):match("") == 1)
581
582assert((4 * m.B(letter, 4)):match("aaaaaaaa") == 5)
583assert(not (4 * m.B(#letter * 5)):match("aaaaaaaa"))
584assert((4 * -m.B(#letter * 5)):match("aaaaaaaa") == 5)
585
586-- look-behind with grammars
587assert(m.match('a' * m.B{'x', x = m.P(3)}, 'aaa') == nil)
588assert(m.match('aa' * m.B{'x', x = m.P('aaa')}, 'aaaa') == nil)
589assert(m.match('aaa' * m.B{'x', x = m.P('aaa')}, 'aaaaa') == 4)
590
591
592
593-- bug in 0.9
594assert(m.match(('a' * #m.P'b'), "ab") == 2)
595assert(not m.match(('a' * #m.P'b'), "a"))
596
597assert(not m.match(#m.S'567', ""))
598assert(m.match(#m.S'567' * 1, "6") == 2)
599
600
601-- tests for Tail Calls
602
603p = m.P{ 'a' * m.V(1) + '' }
604assert(p:match(string.rep('a', 1000)) == 1001)
605
606-- create a grammar for a simple DFA for even number of 0s and 1s
607--
608-- ->1 <---0---> 2
609-- ^ ^
610-- | |
611-- 1 1
612-- | |
613-- V V
614-- 3 <---0---> 4
615--
616-- this grammar should keep no backtracking information
617
618p = m.P{
619 [1] = '0' * m.V(2) + '1' * m.V(3) + -1,
620 [2] = '0' * m.V(1) + '1' * m.V(4),
621 [3] = '0' * m.V(4) + '1' * m.V(1),
622 [4] = '0' * m.V(3) + '1' * m.V(2),
623}
624
625assert(p:match(string.rep("00", 10000)))
626assert(p:match(string.rep("01", 10000)))
627assert(p:match(string.rep("011", 10000)))
628assert(not p:match(string.rep("011", 10000) .. "1"))
629assert(not p:match(string.rep("011", 10001)))
630
631
632-- this grammar does need backtracking info.
633local lim = 10000
634p = m.P{ '0' * m.V(1) + '0' }
635checkerr("stack overflow", m.match, p, string.rep("0", lim))
636m.setmaxstack(2*lim)
637checkerr("stack overflow", m.match, p, string.rep("0", lim))
638m.setmaxstack(2*lim + 4)
639assert(m.match(p, string.rep("0", lim)) == lim + 1)
640
641-- this repetition should not need stack space (only the call does)
642p = m.P{ ('a' * m.V(1))^0 * 'b' + 'c' }
643m.setmaxstack(200)
644assert(p:match(string.rep('a', 180) .. 'c' .. string.rep('b', 180)) == 362)
645
646m.setmaxstack(100) -- restore low limit
647
648-- tests for optional start position
649assert(m.match("a", "abc", 1))
650assert(m.match("b", "abc", 2))
651assert(m.match("c", "abc", 3))
652assert(not m.match(1, "abc", 4))
653assert(m.match("a", "abc", -3))
654assert(m.match("b", "abc", -2))
655assert(m.match("c", "abc", -1))
656assert(m.match("abc", "abc", -4)) -- truncate to position 1
657
658assert(m.match("", "abc", 10)) -- empty string is everywhere!
659assert(m.match("", "", 10))
660assert(not m.match(1, "", 1))
661assert(not m.match(1, "", -1))
662assert(not m.match(1, "", 0))
663
664print("+")
665
666
667-- tests for argument captures
668checkerr("invalid argument", m.Carg, 0)
669checkerr("invalid argument", m.Carg, -1)
670checkerr("invalid argument", m.Carg, 2^18)
671checkerr("absent extra argument #1", m.match, m.Carg(1), 'a', 1)
672assert(m.match(m.Carg(1), 'a', 1, print) == print)
673x = {m.match(m.Carg(1) * m.Carg(2), '', 1, 10, 20)}
674checkeq(x, {10, 20})
675
676assert(m.match(m.Cmt(m.Cg(m.Carg(3), "a") *
677 m.Cmt(m.Cb("a"), function (s,i,x)
678 assert(s == "a" and i == 1);
679 return i, x+1
680 end) *
681 m.Carg(2), function (s,i,a,b,c)
682 assert(s == "a" and i == 1 and c == nil);
683 return i, 2*a + 3*b
684 end) * "a",
685 "a", 1, false, 100, 1000) == 2*1001 + 3*100)
686
687
688-- tests for Lua functions
689
690t = {}
691s = ""
692p = m.P(function (s1, i) assert(s == s1); t[#t + 1] = i; return nil end) * false
693s = "hi, this is a test"
694assert(m.match(((p - m.P(-1)) + 2)^0, s) == string.len(s) + 1)
695assert(#t == string.len(s)/2 and t[1] == 1 and t[2] == 3)
696
697assert(not m.match(p, s))
698
699p = mt.__add(function (s, i) return i end, function (s, i) return nil end)
700assert(m.match(p, "alo"))
701
702p = mt.__mul(function (s, i) return i end, function (s, i) return nil end)
703assert(not m.match(p, "alo"))
704
705
706t = {}
707p = function (s1, i) assert(s == s1); t[#t + 1] = i; return i end
708s = "hi, this is a test"
709assert(m.match((m.P(1) * p)^0, s) == string.len(s) + 1)
710assert(#t == string.len(s) and t[1] == 2 and t[2] == 3)
711
712t = {}
713p = m.P(function (s1, i) assert(s == s1); t[#t + 1] = i;
714 return i <= s1:len() and i end) * 1
715s = "hi, this is a test"
716assert(m.match(p^0, s) == string.len(s) + 1)
717assert(#t == string.len(s) + 1 and t[1] == 1 and t[2] == 2)
718
719p = function (s1, i) return m.match(m.P"a"^1, s1, i) end
720assert(m.match(p, "aaaa") == 5)
721assert(m.match(p, "abaa") == 2)
722assert(not m.match(p, "baaa"))
723
724checkerr("invalid position", m.match, function () return 2^20 end, s)
725checkerr("invalid position", m.match, function () return 0 end, s)
726checkerr("invalid position", m.match, function (s, i) return i - 1 end, s)
727checkerr("invalid position", m.match,
728 m.P(1)^0 * function (_, i) return i - 1 end, s)
729assert(m.match(m.P(1)^0 * function (_, i) return i end * -1, s))
730checkerr("invalid position", m.match,
731 m.P(1)^0 * function (_, i) return i + 1 end, s)
732assert(m.match(m.P(function (s, i) return s:len() + 1 end) * -1, s))
733checkerr("invalid position", m.match, m.P(function (s, i) return s:len() + 2 end) * -1, s)
734assert(not m.match(m.P(function (s, i) return s:len() end) * -1, s))
735assert(m.match(m.P(1)^0 * function (_, i) return true end, s) ==
736 string.len(s) + 1)
737for i = 1, string.len(s) + 1 do
738 assert(m.match(function (_, _) return i end, s) == i)
739end
740
741p = (m.P(function (s, i) return i%2 == 0 and i end) * 1
742 + m.P(function (s, i) return i%2 ~= 0 and i + 2 <= s:len() and i end) * 3)^0
743 * -1
744assert(p:match(string.rep('a', 14000)))
745
746-- tests for Function Replacements
747f = function (a, ...) if a ~= "x" then return {a, ...} end end
748
749t = m.match(m.C(1)^0/f, "abc")
750checkeq(t, {"a", "b", "c"})
751
752t = m.match(m.C(1)^0/f/f, "abc")
753checkeq(t, {{"a", "b", "c"}})
754
755t = m.match(m.P(1)^0/f/f, "abc") -- no capture
756checkeq(t, {{"abc"}})
757
758t = m.match((m.P(1)^0/f * m.Cp())/f, "abc")
759checkeq(t, {{"abc"}, 4})
760
761t = m.match((m.C(1)^0/f * m.Cp())/f, "abc")
762checkeq(t, {{"a", "b", "c"}, 4})
763
764t = m.match((m.C(1)^0/f * m.Cp())/f, "xbc")
765checkeq(t, {4})
766
767t = m.match(m.C(m.C(1)^0)/f, "abc")
768checkeq(t, {"abc", "a", "b", "c"})
769
770g = function (...) return 1, ... end
771t = {m.match(m.C(1)^0/g/g, "abc")}
772checkeq(t, {1, 1, "a", "b", "c"})
773
774t = {m.match(m.Cc(nil,nil,4) * m.Cc(nil,3) * m.Cc(nil, nil) / g / g, "")}
775t1 = {1,1,nil,nil,4,nil,3,nil,nil}
776for i=1,10 do assert(t[i] == t1[i]) end
777
778-- bug in 0.12.2: ktable with only nil could be eliminated when joining
779-- with a pattern without ktable
780assert((m.P"aaa" * m.Cc(nil)):match"aaa" == nil)
781
782t = {m.match((m.C(1) / function (x) return x, x.."x" end)^0, "abc")}
783checkeq(t, {"a", "ax", "b", "bx", "c", "cx"})
784
785t = m.match(m.Ct((m.C(1) / function (x,y) return y, x end * m.Cc(1))^0), "abc")
786checkeq(t, {nil, "a", 1, nil, "b", 1, nil, "c", 1})
787
788-- tests for Query Replacements
789
790assert(m.match(m.C(m.C(1)^0)/{abc = 10}, "abc") == 10)
791assert(m.match(m.C(1)^0/{a = 10}, "abc") == 10)
792assert(m.match(m.S("ba")^0/{ab = 40}, "abc") == 40)
793t = m.match(m.Ct((m.S("ba")/{a = 40})^0), "abc")
794checkeq(t, {40})
795
796assert(m.match(m.Cs((m.C(1)/{a=".", d=".."})^0), "abcdde") == ".bc....e")
797assert(m.match(m.Cs((m.C(1)/{f="."})^0), "abcdde") == "abcdde")
798assert(m.match(m.Cs((m.C(1)/{d="."})^0), "abcdde") == "abc..e")
799assert(m.match(m.Cs((m.C(1)/{e="."})^0), "abcdde") == "abcdd.")
800assert(m.match(m.Cs((m.C(1)/{e=".", f="+"})^0), "eefef") == "..+.+")
801assert(m.match(m.Cs((m.C(1))^0), "abcdde") == "abcdde")
802assert(m.match(m.Cs(m.C(m.C(1)^0)), "abcdde") == "abcdde")
803assert(m.match(1 * m.Cs(m.P(1)^0), "abcdde") == "bcdde")
804assert(m.match(m.Cs((m.C('0')/'x' + 1)^0), "abcdde") == "abcdde")
805assert(m.match(m.Cs((m.C('0')/'x' + 1)^0), "0ab0b0") == "xabxbx")
806assert(m.match(m.Cs((m.C('0')/'x' + m.P(1)/{b=3})^0), "b0a0b") == "3xax3")
807assert(m.match(m.P(1)/'%0%0'/{aa = -3} * 'x', 'ax') == -3)
808assert(m.match(m.C(1)/'%0%1'/{aa = 'z'}/{z = -3} * 'x', 'ax') == -3)
809
810assert(m.match(m.Cs(m.Cc(0) * (m.P(1)/"")), "4321") == "0")
811
812assert(m.match(m.Cs((m.P(1) / "%0")^0), "abcd") == "abcd")
813assert(m.match(m.Cs((m.P(1) / "%0.%0")^0), "abcd") == "a.ab.bc.cd.d")
814assert(m.match(m.Cs((m.P("a") / "%0.%0" + 1)^0), "abcad") == "a.abca.ad")
815assert(m.match(m.C("a") / "%1%%%0", "a") == "a%a")
816assert(m.match(m.Cs((m.P(1) / ".xx")^0), "abcd") == ".xx.xx.xx.xx")
817assert(m.match(m.Cp() * m.P(3) * m.Cp()/"%2%1%1 - %0 ", "abcde") ==
818 "411 - abc ")
819
820assert(m.match(m.P(1)/"%0", "abc") == "a")
821checkerr("invalid capture index", m.match, m.P(1)/"%1", "abc")
822checkerr("invalid capture index", m.match, m.P(1)/"%9", "abc")
823
824p = m.C(1)
825p = p * p; p = p * p; p = p * p * m.C(1) / "%9 - %1"
826assert(p:match("1234567890") == "9 - 1")
827
828assert(m.match(m.Cc(print), "") == print)
829
830-- too many captures (just ignore extra ones)
831p = m.C(1)^0 / "%2-%9-%0-%9"
832assert(p:match"01234567890123456789" == "1-8-01234567890123456789-8")
833s = string.rep("12345678901234567890", 20)
834assert(m.match(m.C(1)^0 / "%9-%1-%0-%3", s) == "9-1-" .. s .. "-3")
835
836-- string captures with non-string subcaptures
837p = m.Cc('alo') * m.C(1) / "%1 - %2 - %1"
838assert(p:match'x' == 'alo - x - alo')
839
840checkerr("invalid capture value (a boolean)", m.match, m.Cc(true) / "%1", "a")
841
842-- long strings for string capture
843l = 10000
844s = string.rep('a', l) .. string.rep('b', l) .. string.rep('c', l)
845
846p = (m.C(m.P'a'^1) * m.C(m.P'b'^1) * m.C(m.P'c'^1)) / '%3%2%1'
847
848assert(p:match(s) == string.rep('c', l) ..
849 string.rep('b', l) ..
850 string.rep('a', l))
851
852print"+"
853
854-- accumulator capture
855function f (x) return x + 1 end  -- returns x + 1; any further arguments are ignored
856assert(m.match(m.Cf(m.Cc(0) * m.C(1)^0, f), "alo alo") == 7)
857
858t = {m.match(m.Cf(m.Cc(1,2,3), error), "")}
859checkeq(t, {1})
860p = m.Cf(m.Ct(true) * m.Cg(m.C(m.R"az"^1) * "=" * m.C(m.R"az"^1) * ";")^0,
861 rawset)
862t = p:match("a=b;c=du;xux=yuy;")
863checkeq(t, {a="b", c="du", xux="yuy"})
864
865
866-- errors in accumulator capture
867
868-- no initial capture
869checkerr("no initial value", m.match, m.Cf(m.P(5), print), 'aaaaaa')
870-- no initial capture (very long match forces fold to be a pair open-close)
871checkerr("no initial value", m.match, m.Cf(m.P(500), print),
872 string.rep('a', 600))
873
874-- nested capture produces no initial value
875checkerr("no initial value", m.match, m.Cf(m.P(1) / {}, print), "alo")
876
877
878-- tests for loop checker
879
880local function isnullable (p)  -- helper: building m.P(p)^0 is expected to raise an error
881 checkerr("may accept empty string", function (p) return p^0 end, m.P(p))  -- checkerr is defined outside this chunk; presumably it asserts the call fails with this message -- TODO confirm
882end
883
884isnullable(m.P("x")^-4)
885assert(m.match(((m.P(0) + 1) * m.S"al")^0, "alo") == 3)
886assert(m.match((("x" + #m.P(1))^-4 * m.S"al")^0, "alo") == 3)
887isnullable("")
888isnullable(m.P("x")^0)
889isnullable(m.P("x")^-1)
890isnullable(m.P("x") + 1 + 2 + m.P("a")^-1)
891isnullable(-m.P("ab"))
892isnullable(- -m.P("ab"))
893isnullable(# #(m.P("ab") + "xy"))
894isnullable(- #m.P("ab")^0)
895isnullable(# -m.P("ab")^1)
896isnullable(#m.V(3))
897isnullable(m.V(3) + m.V(1) + m.P('a')^-1)
898isnullable({[1] = m.V(2) * m.V(3), [2] = m.V(3), [3] = m.P(0)})
899assert(m.match(m.P{[1] = m.V(2) * m.V(3), [2] = m.V(3), [3] = m.P(1)}^0, "abc")
900 == 3)
901assert(m.match(m.P""^-3, "a") == 1)
902
903local function find (p, s)  -- returns the result of matching basiclookfor(p) against string s
904 return m.match(basiclookfor(p), s)  -- basiclookfor is defined outside this chunk; presumably it makes p match anywhere in s -- TODO confirm
905end
906
907
908local function badgrammar (g, expected)  -- asserts that m.P(g) raises an error; 'expected' (optional) must be found in the message
909 local stat, msg = pcall(m.P, g)  -- protected call: stat is false and msg is the error value when m.P raises
910 assert(not stat)  -- building the grammar must fail
911 if expected then assert(find(expected, msg)) end  -- only checked when 'expected' is given
912end
913
914badgrammar({[1] = m.V(1)}, "rule '1'")
915badgrammar({[1] = m.V(2)}, "rule '2'") -- invalid non-terminal
916badgrammar({[1] = m.V"x"}, "rule 'x'") -- invalid non-terminal
917badgrammar({[1] = m.V{}}, "rule '(a table)'") -- invalid non-terminal
918badgrammar({[1] = #m.P("a") * m.V(1)}, "rule '1'") -- left-recursive
919badgrammar({[1] = -m.P("a") * m.V(1)}, "rule '1'") -- left-recursive
920badgrammar({[1] = -1 * m.V(1)}, "rule '1'") -- left-recursive
921badgrammar({[1] = -1 + m.V(1)}, "rule '1'") -- left-recursive
922badgrammar({[1] = 1 * m.V(2), [2] = m.V(2)}, "rule '2'") -- left-recursive
923badgrammar({[1] = 1 * m.V(2)^0, [2] = m.P(0)}, "rule '1'") -- inf. loop
924badgrammar({ m.V(2), m.V(3)^0, m.P"" }, "rule '2'") -- inf. loop
925badgrammar({ m.V(2) * m.V(3)^0, m.V(3)^0, m.P"" }, "rule '1'") -- inf. loop
926badgrammar({"x", x = #(m.V(1) * 'a') }, "rule '1'") -- inf. loop
927badgrammar({ -(m.V(1) * 'a') }, "rule '1'") -- inf. loop
928badgrammar({"x", x = m.P'a'^-1 * m.V"x"}, "rule 'x'") -- left recursive
929badgrammar({"x", x = m.P'a' * m.V"y"^1, y = #m.P(1)}, "rule 'x'")
930
931assert(m.match({'a' * -m.V(1)}, "aaa") == 2)
932assert(m.match({'a' * -m.V(1)}, "aaaa") == nil)
933
934
935-- good x bad grammars
936m.P{ ('a' * m.V(1))^-1 }
937m.P{ -('a' * m.V(1)) }
938m.P{ ('abc' * m.V(1))^-1 }
939m.P{ -('abc' * m.V(1)) }
940badgrammar{ #m.P('abc') * m.V(1) }
941badgrammar{ -('a' + m.V(1)) }
942m.P{ #('a' * m.V(1)) }
943badgrammar{ #('a' + m.V(1)) }
944m.P{ m.B{ m.P'abc' } * 'a' * m.V(1) }
945badgrammar{ m.B{ m.P'abc' } * m.V(1) }
946badgrammar{ ('a' + m.P'bcd')^-1 * m.V(1) }
947
948
949-- simple tests for maximum sizes:
950local p = m.P"a"
951for i=1,14 do p = p * p end
952
953p = {}
954for i=1,100 do p[i] = m.P"a" end
955p = m.P(p)
956
957
958-- strange values for rule labels
959
960p = m.P{ "print",
961 print = m.V(print),
962 [print] = m.V(_G),
963 [_G] = m.P"a",
964 }
965
966assert(p:match("a"))
967
968-- initial rule
969g = {}
970for i = 1, 10 do g["i"..i] = "a" * m.V("i"..i+1) end
971g.i11 = m.P""
972for i = 1, 10 do
973 g[1] = "i"..i
974 local p = m.P(g)
975 assert(p:match("aaaaaaaaaaa") == 11 - i + 1)
976end
977
978print"+"
979
980
981-- tests for back references
982checkerr("back reference 'x' not found", m.match, m.Cb('x'), '')
983checkerr("back reference 'b' not found", m.match, m.Cg(1, 'a') * m.Cb('b'), 'a')
984
985p = m.Cg(m.C(1) * m.C(1), "k") * m.Ct(m.Cb("k"))
986t = p:match("ab")
987checkeq(t, {"a", "b"})
988
989p = m.P(true)
990for i = 1, 10 do p = p * m.Cg(1, i) end
991for i = 1, 10 do
992 local p = p * m.Cb(i)
993 assert(p:match('abcdefghij') == string.sub('abcdefghij', i, i))
994end
995
996
997t = {}  -- log of every argument foo receives, in call order
998function foo (p) t[#t + 1] = p; return p .. "x" end  -- appends p to t, then returns p with "x" appended
999
1000p = m.Cg(m.C(2) / foo, "x") * m.Cb"x" *
1001 m.Cg(m.Cb('x') / foo, "x") * m.Cb"x" *
1002 m.Cg(m.Cb('x') / foo, "x") * m.Cb"x" *
1003 m.Cg(m.Cb('x') / foo, "x") * m.Cb"x"
1004x = {p:match'ab'}
1005checkeq(x, {'abx', 'abxx', 'abxxx', 'abxxxx'})
1006checkeq(t, {'ab',
1007 'ab', 'abx',
1008 'ab', 'abx', 'abxx',
1009 'ab', 'abx', 'abxx', 'abxxx'})
1010
1011
1012
1013-- tests for match-time captures
1014
1015p = m.P'a' * (function (s, i) return (s:sub(i, i) == 'b') and i + 1 end)
1016 + 'acd'
1017
1018assert(p:match('abc') == 3)
1019assert(p:match('acd') == 4)
1020
1021local function id (s, i, ...)  -- used below as the function argument of m.Cmt; s and i are ignored
1022 return true, ...  -- returns true followed by every extra argument unchanged
1023end
1024
1025assert(m.Cmt(m.Cs((m.Cmt(m.S'abc' / { a = 'x', c = 'y' }, id) +
1026 m.R'09'^1 / string.char +
1027 m.P(1))^0), id):match"acb98+68c" == "xyb\98+\68y")
1028
1029p = m.P{'S',
1030 S = m.V'atom' * space
1031 + m.Cmt(m.Ct("(" * space * (m.Cmt(m.V'S'^1, id) + m.P(true)) * ")" * space), id),
1032 atom = m.Cmt(m.C(m.R("AZ", "az", "09")^1), id)
1033}
1034x = p:match"(a g () ((b) c) (d (e)))"
1035checkeq(x, {'a', 'g', {}, {{'b'}, 'c'}, {'d', {'e'}}});
1036
1037x = {(m.Cmt(1, id)^0):match(string.rep('a', 500))}
1038assert(#x == 500)
1039
1040local function id(s, i, x)  -- new local 'id' that shadows the previous one for the tests below
1041 if x == 'a' then return i, 1, 3, 7  -- x is 'a': return position i followed by 1, 3, 7
1042 else return nil, 2, 4, 6, 8  -- otherwise the first result is nil (followed by 2, 4, 6, 8)
1043 end
1044end
1045
1046p = ((m.P(id) * 1 + m.Cmt(2, id) * 1 + m.Cmt(1, id) * 1))^0
1047assert(table.concat{p:match('abababab')} == string.rep('137', 4))
1048
1049local function ref (s, i, x)  -- matches x against s again, starting x:len() positions before i
1050 return m.match(x, s, i - x:len())  -- x is passed as the pattern, s as the subject, i - x:len() as the start position
1051end
1052
1053assert(m.Cmt(m.P(1)^0, ref):match('alo') == 4)
1054assert((m.P(1) * m.Cmt(m.P(1)^0, ref)):match('alo') == 4)
1055assert(not (m.P(1) * m.Cmt(m.C(1)^0, ref)):match('alo'))
1056
1057ref = function (s,i,x) return i == tonumber(x) and i, 'xuxu' end
1058
1059assert(m.Cmt(1, ref):match'2')
1060assert(not m.Cmt(1, ref):match'1')
1061assert(m.Cmt(m.P(1)^0, ref):match'03')
1062
1063function ref (s, i, a, b)  -- assigns a new function to the 'ref' name declared local above
1064 if a == b then return i, a:upper() end  -- equal arguments: return i and a in upper case; otherwise return nothing
1065end
1066
1067p = m.Cmt(m.C(m.R"az"^1) * "-" * m.C(m.R"az"^1), ref)
1068p = (any - p)^0 * p * any^0 * -1
1069
1070assert(p:match'abbbc-bc ddaa' == 'BC')
1071
1072do -- match-time captures cannot be optimized away
1073 local touch = 0  -- number of times f's function has run since the last check
1074 f = m.P(function () touch = touch + 1; return true end)  -- increments 'touch' on every call and returns true
1075
1076 local function check(n) n = n or 1; assert(touch == n); touch = 0 end  -- expects exactly n calls (default 1), then resets the counter
1077
1078 assert(m.match(f * false + 'b', 'a') == nil); check()
1079 assert(m.match(f * false + 'b', '') == nil); check()
1080 assert(m.match( (f * 'a')^0 * 'b', 'b') == 2); check()
1081 assert(m.match( (f * 'a')^0 * 'b', '') == nil); check()
1082 assert(m.match( (f * 'a')^-1 * 'b', 'b') == 2); check()
1083 assert(m.match( (f * 'a')^-1 * 'b', '') == nil); check()
1084 assert(m.match( ('b' + f * 'a')^-1 * 'b', '') == nil); check()
1085 assert(m.match( (m.P'b'^-1 * f * 'a')^-1 * 'b', '') == nil); check()
1086 assert(m.match( (-m.P(1) * m.P'b'^-1 * f * 'a')^-1 * 'b', '') == nil);
1087 check()
1088 assert(m.match( (f * 'a' + 'b')^-1 * 'b', '') == nil); check()
1089 assert(m.match(f * 'a' + f * 'b', 'b') == 2); check(2)  -- two calls expected here: one per alternative
1090 assert(m.match(f * 'a' + f * 'b', 'a') == 2); check(1)  -- one call expected: only the first alternative
1091 assert(m.match(-f * 'a' + 'b', 'b') == 2); check(1)
1092 assert(m.match(-f * 'a' + 'b', '') == nil); check(1)
1093end
1094
1095c = '[' * m.Cg(m.P'='^0, "init") * '[' *
1096 { m.Cmt(']' * m.C(m.P'='^0) * ']' * m.Cb("init"), function (_, _, s1, s2)
1097 return s1 == s2 end)
1098 + 1 * m.V(1) } / 0
1099
1100assert(c:match'[==[]]====]]]]==]===[]' == 18)
1101assert(c:match'[[]=]====]=]]]==]===[]' == 14)
1102assert(not c:match'[[]=]====]=]=]==]===[]')
1103
1104
1105-- old bug: optimization of concat with fail removed match-time capture
1106p = m.Cmt(0, function (s) p = s end) * m.P(false)
1107assert(not p:match('alo'))
1108assert(p == 'alo')
1109
1110
1111-- ensure that failed match-time captures are not kept on Lua stack
1112do
1113 local t = {__mode = "kv"}; setmetatable(t,t)  -- t is its own metatable, so its keys and values are weak
1114 local c = 0  -- number of calls to foo
1115
1116 local function foo (s,i)
1117 collectgarbage();
1118 assert(next(t) == "__mode" and next(t, "__mode") == nil)  -- after the collection, "__mode" must be the only entry left in t
1119 local x = {}
1120 t[x] = true  -- the fresh table is recorded as a weak key
1121 c = c + 1
1122 return i, x  -- returns position i followed by the fresh table x
1123 end
1124
1125 local p = m.P{ m.Cmt(0, foo) * m.P(false) + m.P(1) * m.V(1) + m.P"" }  -- m.Cmt(0, foo) is always followed by m.P(false)
1126 p:match(string.rep('1', 10))
1127 assert(c == 11)  -- foo must have run 11 times for a subject of 10 characters
1128end
1129
1130
1131-- Return a match-time capture that returns 'n' captures
1132local function manyCmt (n)  -- n: number of extra values returned by the match-time function
1133 return m.Cmt("a", function ()
1134 local a = {}; for i = 1, n do a[i] = n - i end  -- fills a with n-1, n-2, ..., 0
1135 return true, unpack(a)  -- true followed by the n values stored in a
1136 end)
1137end
1138
1139-- bug in 1.0: failed match-time that used previous match-time results
1140do
1141 local x
1142 local function aux (...) x = #{...}; return false end  -- stores the length of its argument list in x and returns false
1143 local res = {m.match(m.Cmt(manyCmt(20), aux) + manyCmt(10), "a")}  -- aux returns false, so the results are expected to come from manyCmt(10)
1144 assert(#res == 10 and res[1] == 9 and res[10] == 0)  -- the 10 values 9 down to 0
1145end
1146
1147
1148-- bug in 1.0: problems with match-time captures returning too many captures
1149do
1150 local lim = 2^11 - 10  -- number of values requested in the passing case
1151 local res = {m.match(manyCmt(lim), "a")}
1152 assert(#res == lim and res[1] == lim - 1 and res[lim] == 0)  -- all lim values come back, from lim-1 down to 0
1153 checkerr("too many", m.match, manyCmt(2^15), "a")  -- with 2^15 values the match is expected to raise a "too many" error
1154end
1155
1156p = (m.P(function () return true, "a" end) * 'a'
1157 + m.P(function (s, i) return i, "aa", 20 end) * 'b'
1158 + m.P(function (s,i) if i <= #s then return i, "aaa" end end) * 1)^0
1159
1160t = {p:match('abacc')}
1161checkeq(t, {'a', 'aa', 20, 'a', 'aaa', 'aaa'})
1162
1163
1164-------------------------------------------------------------------
1165-- Tests for 're' module
1166-------------------------------------------------------------------
1167
1168local re = require "re"
1169
1170local match, compile = re.match, re.compile
1171
1172
1173
1174assert(match("a", ".") == 2)
1175assert(match("a", "''") == 1)
1176assert(match("", " ! . ") == 1)
1177assert(not match("a", " ! . "))
1178assert(match("abcde", " ( . . ) * ") == 5)
1179assert(match("abbcde", " [a-c] +") == 5)
1180assert(match("0abbc1de", "'0' [a-c]+ '1'") == 7)
1181assert(match("0zz1dda", "'0' [^a-c]+ 'a'") == 8)
1182assert(match("abbc--", " [a-c] + +") == 5)
1183assert(match("abbc--", " [ac-] +") == 2)
1184assert(match("abbc--", " [-acb] + ") == 7)
1185assert(not match("abbcde", " [b-z] + "))
1186assert(match("abb\"de", '"abb"["]"de"') == 7)
1187assert(match("abceeef", "'ac' ? 'ab' * 'c' { 'e' * } / 'abceeef' ") == "eee")
1188assert(match("abceeef", "'ac'? 'ab'* 'c' { 'f'+ } / 'abceeef' ") == 8)
1189
1190assert(re.match("aaand", "[a]^2") == 3)
1191
1192local t = {match("abceefe", "( ( & 'e' {} ) ? . ) * ")}
1193checkeq(t, {4, 5, 7})
1194local t = {match("abceefe", "((&&'e' {})? .)*")}
1195checkeq(t, {4, 5, 7})
1196local t = {match("abceefe", "( ( ! ! 'e' {} ) ? . ) *")}
1197checkeq(t, {4, 5, 7})
1198local t = {match("abceefe", "(( & ! & ! 'e' {})? .)*")}
1199checkeq(t, {4, 5, 7})
1200
1201assert(match("cccx" , "'ab'? ('ccc' / ('cde' / 'cd'*)? / 'ccc') 'x'+") == 5)
1202assert(match("cdx" , "'ab'? ('ccc' / ('cde' / 'cd'*)? / 'ccc') 'x'+") == 4)
1203assert(match("abcdcdx" , "'ab'? ('ccc' / ('cde' / 'cd'*)? / 'ccc') 'x'+") == 8)
1204
1205assert(match("abc", "a <- (. a)?") == 4)
1206b = "balanced <- '(' ([^()] / balanced)* ')'"
1207assert(match("(abc)", b))
1208assert(match("(a(b)((c) (d)))", b))
1209assert(not match("(a(b ((c) (d)))", b))
1210
1211b = compile[[ balanced <- "(" ([^()] / balanced)* ")" ]]
1212assert(b == m.P(b))
1213assert(b:match"((((a))(b)))")
1214
1215local g = [[
1216 S <- "0" B / "1" A / "" -- balanced strings
1217 A <- "0" S / "1" A A -- one more 0
1218 B <- "1" S / "0" B B -- one more 1
1219]]
1220assert(match("00011011", g) == 9)
1221
1222local g = [[
1223 S <- ("0" B / "1" A)*
1224 A <- "0" / "1" A A
1225 B <- "1" / "0" B B
1226]]
1227assert(match("00011011", g) == 9)
1228assert(match("000110110", g) == 9)
1229assert(match("011110110", g) == 3)
1230assert(match("000110010", g) == 1)
1231
1232s = "aaaaaaaaaaaaaaaaaaaaaaaa"
1233assert(match(s, "'a'^3") == 4)
1234assert(match(s, "'a'^0") == 1)
1235assert(match(s, "'a'^+3") == s:len() + 1)
1236assert(not match(s, "'a'^+30"))
1237assert(match(s, "'a'^-30") == s:len() + 1)
1238assert(match(s, "'a'^-5") == 6)
1239for i = 1, s:len() do
1240 assert(match(s, string.format("'a'^+%d", i)) >= i + 1)
1241 assert(match(s, string.format("'a'^-%d", i)) <= i + 1)
1242 assert(match(s, string.format("'a'^%d", i)) == i + 1)
1243end
1244assert(match("01234567890123456789", "[0-9]^3+") == 19)
1245
1246
1247assert(match("01234567890123456789", "({....}{...}) -> '%2%1'") == "4560123")
1248t = match("0123456789", "{| {.}* |}")
1249checkeq(t, {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"})
1250assert(match("012345", "{| (..) -> '%0%0' |}")[1] == "0101")
1251
1252assert(match("abcdef", "( {.} {.} {.} {.} {.} ) -> 3") == "c")
1253assert(match("abcdef", "( {:x: . :} {.} {.} {.} {.} ) -> 3") == "d")
1254assert(match("abcdef", "( {:x: . :} {.} {.} {.} {.} ) -> 0") == 6)
1255
1256assert(not match("abcdef", "{:x: ({.} {.} {.}) -> 2 :} =x"))
1257assert(match("abcbef", "{:x: ({.} {.} {.}) -> 2 :} =x"))
1258
1259eqcharset(compile"[]]", "]")
1260eqcharset(compile"[][]", m.S"[]")
1261eqcharset(compile"[]-]", m.S"-]")
1262eqcharset(compile"[-]", m.S"-")
1263eqcharset(compile"[az-]", m.S"a-z")
1264eqcharset(compile"[-az]", m.S"a-z")
1265eqcharset(compile"[a-z]", m.R"az")
1266eqcharset(compile"[]['\"]", m.S[[]['"]])
1267
1268eqcharset(compile"[^]]", any - "]")
1269eqcharset(compile"[^][]", any - m.S"[]")
1270eqcharset(compile"[^]-]", any - m.S"-]")
1271eqcharset(compile"[^]-]", any - m.S"-]")
1272eqcharset(compile"[^-]", any - m.S"-")
1273eqcharset(compile"[^az-]", any - m.S"a-z")
1274eqcharset(compile"[^-az]", any - m.S"a-z")
1275eqcharset(compile"[^a-z]", any - m.R"az")
1276eqcharset(compile"[^]['\"]", any - m.S[[]['"]])
1277
1278-- tests for comments in 're'
1279e = compile[[
1280A <- _B -- \t \n %nl .<> <- -> --
1281_B <- 'x' --]]
1282assert(e:match'xy' == 2)
1283
1284-- tests for 're' with pre-definitions
1285defs = {digits = m.R"09", letters = m.R"az", _=m.P"__"}
1286e = compile("%letters (%letters / %digits)*", defs)
1287assert(e:match"x123" == 5)
1288e = compile("%_", defs)
1289assert(e:match"__" == 3)
1290
1291e = compile([[
1292 S <- A+
1293 A <- %letters+ B
1294 B <- %digits+
1295]], defs)
1296
1297e = compile("{[0-9]+'.'?[0-9]*} -> sin", math)
1298assert(e:match("2.34") == math.sin(2.34))
1299
1300
1301function eq (_, _, a, b) return a == b end  -- true when the third and fourth arguments are equal; not referenced in this chunk
1302
1303c = re.compile([[
1304 longstring <- '[' {:init: '='* :} '[' close
1305 close <- ']' =init ']' / . close
1306]])
1307
1308assert(c:match'[==[]]===]]]]==]===[]' == 17)
1309assert(c:match'[[]=]====]=]]]==]===[]' == 14)
1310assert(not c:match'[[]=]====]=]=]==]===[]')
1311
1312c = re.compile" '[' {:init: '='* :} '[' (!(']' =init ']') .)* ']' =init ']' !. "
1313
1314assert(c:match'[==[]]===]]]]==]')
1315assert(c:match'[[]=]====]=][]==]===[]]')
1316assert(not c:match'[[]=]====]=]=]==]===[]')
1317
1318assert(re.find("hi alalo", "{:x:..:} =x") == 4)
1319assert(re.find("hi alalo", "{:x:..:} =x", 4) == 4)
1320assert(not re.find("hi alalo", "{:x:..:} =x", 5))
1321assert(re.find("hi alalo", "{'al'}", 5) == 6)
1322assert(re.find("hi aloalolo", "{:x:..:} =x") == 8)
1323assert(re.find("alo alohi x x", "{:word:%w+:}%W*(=word)!%w") == 11)
1324
1325-- re.find discards any captures
1326local a,b,c = re.find("alo", "{.}{'o'}")
1327assert(a == 2 and b == 3 and c == nil)
1328
1329local function match (s,p)  -- shadows the earlier local 'match' (re.match) for the assertions that follow
1330 local i,e = re.find(s,p)  -- keeps the first two results of re.find
1331 if i then return s:sub(i, e) end  -- returns the substring of s from i to e; returns nothing when i is nil or false
1332end
1333assert(match("alo alo", '[a-z]+') == "alo")
1334assert(match("alo alo", '{:x: [a-z]+ :} =x') == nil)
1335assert(match("alo alo", "{:x: [a-z]+ :} ' ' =x") == "alo alo")
1336
1337assert(re.gsub("alo alo", "[abc]", "x") == "xlo xlo")
1338assert(re.gsub("alo alo", "%w+", ".") == ". .")
1339assert(re.gsub("hi, how are you", "[aeiou]", string.upper) ==
1340 "hI, hOw ArE yOU")
1341
1342s = 'hi [[a comment[=]=] ending here]] and [=[another]]=]]'
1343c = re.compile" '[' {:i: '='* :} '[' (!(']' =i ']') .)* ']' { =i } ']' "
1344assert(re.gsub(s, c, "%2") == 'hi and =]')
1345assert(re.gsub(s, c, "%0") == s)
1346assert(re.gsub('[=[hi]=]', c, "%2") == '=')
1347
1348assert(re.find("", "!.") == 1)
1349assert(re.find("alo", "!.") == 4)
1350
1351function addtag (s, i, t, tag) t.tag = tag; return i, t end  -- stores 'tag' in t.tag, then returns i and t
1352
1353c = re.compile([[
1354 doc <- block !.
1355 block <- (start {| (block / { [^<]+ })* |} end?) => addtag
1356 start <- '<' {:tag: [a-z]+ :} '>'
1357 end <- '</' { =tag } '>'
1358]], {addtag = addtag})
1359
1360x = c:match[[
1361<x>hi<b>hello</b>but<b>totheend</x>]]
1362checkeq(x, {tag='x', 'hi', {tag = 'b', 'hello'}, 'but',
1363 {'totheend'}})
1364
1365
1366-- test for folding captures
1367c = re.compile([[
1368 S <- (number (%s+ number)*) ~> add
1369 number <- %d+ -> tonumber
1370]], {tonumber = tonumber, add = function (a,b) return a + b end})
1371assert(c:match("3 401 50") == 3 + 401 + 50)
1372
1373-- tests for look-ahead captures
1374x = {re.match("alo", "&(&{.}) !{'b'} {&(...)} &{..} {...} {!.}")}
1375checkeq(x, {"", "alo", ""})
1376
1377assert(re.match("aloalo",
1378 "{~ (((&'al' {.}) -> 'A%1' / (&%l {.}) -> '%1%1') / .)* ~}")
1379 == "AallooAalloo")
1380
1381-- bug in 0.9 (and older versions), due to captures in look-aheads
1382x = re.compile[[ {~ (&(. ([a-z]* -> '*')) ([a-z]+ -> '+') ' '*)* ~} ]]
1383assert(x:match"alo alo" == "+ +")
1384
1385-- valid capture in look-ahead (used inside the look-ahead itself)
1386x = re.compile[[
1387 S <- &({:two: .. :} . =two) {[a-z]+} / . S
1388]]
1389assert(x:match("hello aloaLo aloalo xuxu") == "aloalo")
1390
1391
1392p = re.compile[[
1393 block <- {| {:ident:space*:} line
1394 ((=ident !space line) / &(=ident space) block)* |}
1395 line <- {[^%nl]*} %nl
1396 space <- '_' -- should be ' ', but '_' is simpler for editors
1397]]
1398
1399t= p:match[[
14001
1401__1.1
1402__1.2
1403____1.2.1
1404____
14052
1406__2.1
1407]]
1408checkeq(t, {"1", {"1.1", "1.2", {"1.2.1", "", ident = "____"}, ident = "__"},
1409 "2", {"2.1", ident = "__"}, ident = ""})
1410
1411
1412-- nested grammars
1413p = re.compile[[
1414 s <- a b !.
1415 b <- ( x <- ('b' x)? )
1416 a <- ( x <- 'a' x? )
1417]]
1418
1419assert(p:match'aaabbb')
1420assert(p:match'aaa')
1421assert(not p:match'bbb')
1422assert(not p:match'aaabbba')
1423
1424-- testing groups
1425t = {re.match("abc", "{:S <- {:.:} {S} / '':}")}
1426checkeq(t, {"a", "bc", "b", "c", "c", ""})
1427
1428t = re.match("1234", "{| {:a:.:} {:b:.:} {:c:.{.}:} |}")
1429checkeq(t, {a="1", b="2", c="4"})
1430t = re.match("1234", "{|{:a:.:} {:b:{.}{.}:} {:c:{.}:}|}")
1431checkeq(t, {a="1", b="2", c="4"})
1432t = re.match("12345", "{| {:.:} {:b:{.}{.}:} {:{.}{.}:} |}")
1433checkeq(t, {"1", b="2", "4", "5"})
1434t = re.match("12345", "{| {:.:} {:{:b:{.}{.}:}:} {:{.}{.}:} |}")
1435checkeq(t, {"1", "23", "4", "5"})
1436t = re.match("12345", "{| {:.:} {{:b:{.}{.}:}} {:{.}{.}:} |}")
1437checkeq(t, {"1", "23", "4", "5"})
1438
1439
1440-- testing pre-defined names
1441assert(os.setlocale("C") == "C")
1442
1443function eqlpeggsub (p1, p2)  -- compares re pattern p1 with the Lua character-class text p2
1444 local s1 = cs2str(re.compile(p1))  -- cs2str is defined outside this chunk; presumably it lists the characters the pattern accepts -- TODO confirm
1445 local s2 = string.gsub(allchar, "[^" .. p2 .. "]", "")  -- deletes from allchar every character outside the class [p2]; only gsub's first result is kept
1446 -- if s1 ~= s2 then print(#s1,#s2) end
1447 assert(s1 == s2)  -- both strings must be identical
1448end
1449
1450
1451eqlpeggsub("%w", "%w")
1452eqlpeggsub("%a", "%a")
1453eqlpeggsub("%l", "%l")
1454eqlpeggsub("%u", "%u")
1455eqlpeggsub("%p", "%p")
1456eqlpeggsub("%d", "%d")
1457eqlpeggsub("%x", "%x")
1458eqlpeggsub("%s", "%s")
1459eqlpeggsub("%c", "%c")
1460
1461eqlpeggsub("%W", "%W")
1462eqlpeggsub("%A", "%A")
1463eqlpeggsub("%L", "%L")
1464eqlpeggsub("%U", "%U")
1465eqlpeggsub("%P", "%P")
1466eqlpeggsub("%D", "%D")
1467eqlpeggsub("%X", "%X")
1468eqlpeggsub("%S", "%S")
1469eqlpeggsub("%C", "%C")
1470
1471eqlpeggsub("[%w]", "%w")
1472eqlpeggsub("[_%w]", "_%w")
1473eqlpeggsub("[^%w]", "%W")
1474eqlpeggsub("[%W%S]", "%W%S")
1475
1476re.updatelocale()
1477
1478
1479-- testing nested substitutions x string captures
1480
1481p = re.compile[[
1482 text <- {~ item* ~}
1483 item <- macro / [^()] / '(' item* ')'
1484 arg <- ' '* {~ (!',' item)* ~}
1485 args <- '(' arg (',' arg)* ')'
1486 macro <- ('apply' args) -> '%1(%2)'
1487 / ('add' args) -> '%1 + %2'
1488 / ('mul' args) -> '%1 * %2'
1489]]
1490
1491assert(p:match"add(mul(a,b), apply(f,x))" == "a * b + f(x)")
1492
1493rev = re.compile[[ R <- (!.) -> '' / ({.} R) -> '%2%1']]
1494
1495assert(rev:match"0123456789" == "9876543210")
1496
1497
1498-- testing error messages in re
1499
1500local function errmsg (p, err)  -- helper: re.compile(p) is expected to fail with a message related to 'err'
1501 checkerr(err, re.compile, p)  -- checkerr is defined outside this chunk; presumably it asserts that the call raises an error containing 'err' -- TODO confirm
1502end
1503
1504errmsg('aaaa', "rule 'aaaa'")
1505errmsg('a', 'outside')
1506errmsg('b <- a', 'undefined')
1507errmsg("x <- 'a' x <- 'b'", 'already defined')
1508errmsg("'a' -", "near '-'")
1509
1510
1511print"OK"
1512
1513