diff options
author | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2019-02-20 10:13:46 -0300 |
---|---|---|
committer | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2019-02-20 10:13:46 -0300 |
commit | e08e5df853560de6482d84066a7accc6a18de545 (patch) | |
tree | ee19686bb35da90709a32ed24bf7855de1a3946a | |
download | lpeg-e08e5df853560de6482d84066a7accc6a18de545.tar.gz lpeg-e08e5df853560de6482d84066a7accc6a18de545.tar.bz2 lpeg-e08e5df853560de6482d84066a7accc6a18de545.zip |
First version of LPeg on GIT
LPeg repository is being moved to git. Past versions won't be moved;
they are still available in RCS.
-rw-r--r-- | doc.css | 223 | ||||
-rw-r--r-- | lpcap.c | 537 | ||||
-rw-r--r-- | lpcap.h | 56 | ||||
-rw-r--r-- | lpcode.c | 1014 | ||||
-rw-r--r-- | lpcode.h | 40 | ||||
-rw-r--r-- | lpeg.html | 1445 | ||||
-rw-r--r-- | lpprint.c | 244 | ||||
-rw-r--r-- | lpprint.h | 36 | ||||
-rw-r--r-- | lptree.c | 1305 | ||||
-rw-r--r-- | lptree.h | 82 | ||||
-rw-r--r-- | lptypes.h | 145 | ||||
-rw-r--r-- | lpvm.c | 364 | ||||
-rw-r--r-- | lpvm.h | 58 | ||||
-rw-r--r-- | makefile | 55 | ||||
-rwxr-xr-x | pack | 15 | ||||
-rw-r--r-- | re.html | 500 | ||||
-rw-r--r-- | re.lua | 267 | ||||
-rwxr-xr-x | test.lua | 1513 |
18 files changed, 7899 insertions, 0 deletions
@@ -0,0 +1,223 @@ | |||
1 | body { | ||
2 | margin-left: 1em; | ||
3 | margin-right: 1em; | ||
4 | font-family: arial, helvetica, geneva, sans-serif; | ||
5 | background-color:#ffffff; margin:0px; | ||
6 | } | ||
7 | |||
8 | code { | ||
9 | font-family: "Andale Mono", monospace; | ||
10 | } | ||
11 | |||
12 | tt { | ||
13 | font-family: "Andale Mono", monospace; | ||
14 | } | ||
15 | |||
16 | body, td, th { font-size: 11pt; } | ||
17 | |||
18 | h1, h2, h3, h4 { margin-left: 0em; } | ||
19 | |||
20 | textarea, pre, tt { font-size:10pt; } | ||
21 | body, td, th { color:#000000; } | ||
22 | small { font-size:0.85em; } | ||
23 | h1 { font-size:1.5em; } | ||
24 | h2 { font-size:1.25em; } | ||
25 | h3 { font-size:1.15em; } | ||
26 | h4 { font-size:1.06em; } | ||
27 | |||
28 | a:link { font-weight:bold; color: #004080; text-decoration: none; } | ||
29 | a:visited { font-weight:bold; color: #006699; text-decoration: none; } | ||
30 | a:link:hover { text-decoration:underline; } | ||
31 | hr { color:#cccccc } | ||
32 | img { border-width: 0px; } | ||
33 | |||
34 | |||
35 | h3 { padding-top: 1em; } | ||
36 | |||
37 | p { margin-left: 1em; } | ||
38 | |||
39 | p.name { | ||
40 | font-family: "Andale Mono", monospace; | ||
41 | padding-top: 1em; | ||
42 | margin-left: 0em; | ||
43 | } | ||
44 | |||
45 | blockquote { margin-left: 3em; } | ||
46 | |||
47 | .example { | ||
48 | background-color: rgb(245, 245, 245); | ||
49 | border-top-width: 1px; | ||
50 | border-right-width: 1px; | ||
51 | border-bottom-width: 1px; | ||
52 | border-left-width: 1px; | ||
53 | border-top-style: solid; | ||
54 | border-right-style: solid; | ||
55 | border-bottom-style: solid; | ||
56 | border-left-style: solid; | ||
57 | border-top-color: silver; | ||
58 | border-right-color: silver; | ||
59 | border-bottom-color: silver; | ||
60 | border-left-color: silver; | ||
61 | padding: 1em; | ||
62 | margin-left: 1em; | ||
63 | margin-right: 1em; | ||
64 | font-family: "Andale Mono", monospace; | ||
65 | font-size: smaller; | ||
66 | } | ||
67 | |||
68 | |||
69 | hr { | ||
70 | margin-left: 0em; | ||
71 | background: #00007f; | ||
72 | border: 0px; | ||
73 | height: 1px; | ||
74 | } | ||
75 | |||
76 | ul { list-style-type: disc; } | ||
77 | |||
78 | table.index { border: 1px #00007f; } | ||
79 | table.index td { text-align: left; vertical-align: top; } | ||
80 | table.index ul { padding-top: 0em; margin-top: 0em; } | ||
81 | |||
82 | table { | ||
83 | border: 1px solid black; | ||
84 | border-collapse: collapse; | ||
85 | margin-left: auto; | ||
86 | margin-right: auto; | ||
87 | } | ||
88 | th { | ||
89 | border: 1px solid black; | ||
90 | padding: 0.5em; | ||
91 | } | ||
92 | td { | ||
93 | border: 1px solid black; | ||
94 | padding: 0.5em; | ||
95 | } | ||
96 | div.header, div.footer { margin-left: 0em; } | ||
97 | |||
98 | #container | ||
99 | { | ||
100 | margin-left: 1em; | ||
101 | margin-right: 1em; | ||
102 | background-color: #f0f0f0; | ||
103 | } | ||
104 | |||
105 | #product | ||
106 | { | ||
107 | text-align: center; | ||
108 | border-bottom: 1px solid #cccccc; | ||
109 | background-color: #ffffff; | ||
110 | } | ||
111 | |||
112 | #product big { | ||
113 | font-size: 2em; | ||
114 | } | ||
115 | |||
116 | #product_logo | ||
117 | { | ||
118 | } | ||
119 | |||
120 | #product_name | ||
121 | { | ||
122 | } | ||
123 | |||
124 | #product_description | ||
125 | { | ||
126 | } | ||
127 | |||
128 | #main | ||
129 | { | ||
130 | background-color: #f0f0f0; | ||
131 | border-left: 2px solid #cccccc; | ||
132 | } | ||
133 | |||
134 | #navigation | ||
135 | { | ||
136 | float: left; | ||
137 | width: 12em; | ||
138 | margin: 0; | ||
139 | vertical-align: top; | ||
140 | background-color: #f0f0f0; | ||
141 | overflow:visible; | ||
142 | } | ||
143 | |||
144 | #navigation h1 { | ||
145 | background-color:#e7e7e7; | ||
146 | font-size:1.1em; | ||
147 | color:#000000; | ||
148 | text-align:left; | ||
149 | margin:0px; | ||
150 | padding:0.2em; | ||
151 | border-top:1px solid #dddddd; | ||
152 | border-bottom:1px solid #dddddd; | ||
153 | } | ||
154 | |||
155 | #navigation ul | ||
156 | { | ||
157 | font-size:1em; | ||
158 | list-style-type: none; | ||
159 | padding: 0; | ||
160 | margin: 1px; | ||
161 | } | ||
162 | |||
163 | #navigation li | ||
164 | { | ||
165 | text-indent: -1em; | ||
166 | margin: 0em 0em 0em 0.5em; | ||
167 | display: block; | ||
168 | padding: 3px 0px 0px 12px; | ||
169 | } | ||
170 | |||
171 | #navigation li li a | ||
172 | { | ||
173 | padding: 0px 3px 0px -1em; /* NOTE(review): negative padding values are invalid in CSS, so this whole declaration is presumably dropped by browsers -- confirm intended left offset */ | ||
174 | } | ||
175 | |||
176 | #content | ||
177 | { | ||
178 | margin-left: 12em; | ||
179 | padding: 1em; | ||
180 | border-left: 2px solid #cccccc; | ||
181 | border-right: 2px solid #cccccc; | ||
182 | background-color: #ffffff; | ||
183 | } | ||
184 | |||
185 | #about | ||
186 | { | ||
187 | clear: both; | ||
188 | margin: 0; | ||
189 | padding: 5px; | ||
190 | border-top: 2px solid #cccccc; | ||
191 | background-color: #ffffff; | ||
192 | } | ||
193 | |||
194 | @media print { | ||
195 | body { | ||
196 | font: 10pt "Times New Roman", "TimeNR", Times, serif; | ||
197 | } | ||
198 | a { font-weight:bold; color: #004080; text-decoration: underline; } | ||
199 | |||
200 | #main { background-color: #ffffff; border-left: 0px; } | ||
201 | #container { margin-left: 2%; margin-right: 2%; background-color: #ffffff; } | ||
202 | |||
203 | #content { margin-left: 0px; padding: 1em; border-left: 0px; border-right: 0px; background-color: #ffffff; } | ||
204 | |||
205 | #navigation { display: none; | ||
206 | } | ||
207 | |||
208 | #product_logo | ||
209 | { | ||
210 | display: none; | ||
211 | } | ||
212 | |||
213 | #about img | ||
214 | { | ||
215 | display: none; | ||
216 | } | ||
217 | |||
218 | .example { | ||
219 | font-family: "Andale Mono", monospace; | ||
220 | font-size: 8pt; | ||
221 | page-break-inside: avoid; | ||
222 | } | ||
223 | } | ||
@@ -0,0 +1,537 @@ | |||
1 | /* | ||
2 | ** $Id: lpcap.c,v 1.6 2015/06/15 16:09:57 roberto Exp $ | ||
3 | ** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license) | ||
4 | */ | ||
5 | |||
6 | #include "lua.h" | ||
7 | #include "lauxlib.h" | ||
8 | |||
9 | #include "lpcap.h" | ||
10 | #include "lptypes.h" | ||
11 | |||
12 | |||
13 | #define captype(cap) ((cap)->kind) | ||
14 | |||
15 | #define isclosecap(cap) (captype(cap) == Cclose) | ||
16 | |||
17 | #define closeaddr(c) ((c)->s + (c)->siz - 1) | ||
18 | |||
19 | #define isfullcap(cap) ((cap)->siz != 0) | ||
20 | |||
21 | #define getfromktable(cs,v) lua_rawgeti((cs)->L, ktableidx((cs)->ptop), v) | ||
22 | |||
23 | #define pushluaval(cs) getfromktable(cs, (cs)->cap->idx) | ||
24 | |||
25 | |||
26 | |||
27 | /* | ||
28 | ** Put in the cache for Lua values the value indexed by 'v' in the ktable | ||
29 | ** of the running pattern (if it is not there yet); returns the stack index of the cache. | ||
30 | */ | ||
31 | static int updatecache (CapState *cs, int v) { | ||
32 | int idx = cs->ptop + 1; /* stack index of cache for Lua values */ | ||
33 | if (v != cs->valuecached) { /* not there? */ | ||
34 | getfromktable(cs, v); /* get value from 'ktable' */ | ||
35 | lua_replace(cs->L, idx); /* put it at reserved stack position */ | ||
36 | cs->valuecached = v; /* keep track of what is there */ | ||
37 | } | ||
38 | return idx; | ||
39 | } | ||
40 | |||
41 | |||
42 | static int pushcapture (CapState *cs); | ||
43 | |||
44 | |||
45 | /* | ||
46 | ** Goes back in a list of captures looking for an open capture | ||
47 | ** corresponding to a close | ||
48 | */ | ||
49 | static Capture *findopen (Capture *cap) { | ||
50 | int n = 0; /* number of closes waiting for an open */ | ||
51 | for (;;) { /* NOTE: no lower bound check; relies on a matching open existing before 'cap' */ | ||
52 | cap--; /* step back to the previous entry */ | ||
53 | if (isclosecap(cap)) n++; /* one more open to skip */ | ||
54 | else if (!isfullcap(cap)) /* an open entry (neither close nor full capture)? */ | ||
55 | if (n-- == 0) return cap; /* no pending close: this is the matching open */ | ||
56 | } | ||
57 | } | ||
58 | |||
59 | |||
60 | /* | ||
61 | ** Go to the next capture | ||
62 | */ | ||
63 | static void nextcap (CapState *cs) { | ||
64 | Capture *cap = cs->cap; | ||
65 | if (!isfullcap(cap)) { /* not a single capture? */ | ||
66 | int n = 0; /* number of opens waiting for a close */ | ||
67 | for (;;) { /* look for corresponding close */ | ||
68 | cap++; | ||
69 | if (isclosecap(cap)) { | ||
70 | if (n-- == 0) break; /* no pending open: this close matches the initial capture */ | ||
71 | } | ||
72 | else if (!isfullcap(cap)) n++; /* nested open: one more close to skip */ | ||
73 | } | ||
74 | } | ||
75 | cs->cap = cap + 1; /* + 1 to skip last close (or entire single capture) */ | ||
76 | } | ||
77 | |||
78 | |||
79 | /* | ||
80 | ** Push on the Lua stack all values generated by nested captures inside | ||
81 | ** the current capture. Returns number of values pushed. 'addextra' | ||
82 | ** makes it push the entire match after all captured values. The | ||
83 | ** entire match is pushed also if there are no other nested values, | ||
84 | ** so the function never returns zero. | ||
85 | */ | ||
86 | static int pushnestedvalues (CapState *cs, int addextra) { | ||
87 | Capture *co = cs->cap; | ||
88 | if (isfullcap(cs->cap++)) { /* no nested captures? */ | ||
89 | lua_pushlstring(cs->L, co->s, co->siz - 1); /* push whole match */ | ||
90 | return 1; /* that is it */ | ||
91 | } | ||
92 | else { | ||
93 | int n = 0; | ||
94 | while (!isclosecap(cs->cap)) /* repeat for all nested patterns */ | ||
95 | n += pushcapture(cs); | ||
96 | if (addextra || n == 0) { /* need extra? */ | ||
97 | lua_pushlstring(cs->L, co->s, cs->cap->s - co->s); /* push whole match */ | ||
98 | n++; | ||
99 | } | ||
100 | cs->cap++; /* skip close entry */ | ||
101 | return n; | ||
102 | } | ||
103 | } | ||
104 | |||
105 | |||
106 | /* | ||
107 | ** Push only the first value generated by nested captures | ||
108 | */ | ||
109 | static void pushonenestedvalue (CapState *cs) { | ||
110 | int n = pushnestedvalues(cs, 0); | ||
111 | if (n > 1) | ||
112 | lua_pop(cs->L, n - 1); /* pop extra values */ | ||
113 | } | ||
114 | |||
115 | |||
116 | /* | ||
117 | ** Try to find a named group capture with the name given at the top of | ||
118 | ** the stack; goes backward from 'cap'. | ||
119 | */ | ||
120 | static Capture *findback (CapState *cs, Capture *cap) { | ||
121 | lua_State *L = cs->L; | ||
122 | while (cap-- > cs->ocap) { /* repeat until end of list */ | ||
123 | if (isclosecap(cap)) | ||
124 | cap = findopen(cap); /* skip nested captures */ | ||
125 | else if (!isfullcap(cap)) | ||
126 | continue; /* opening an enclosing capture: skip and get previous */ | ||
127 | if (captype(cap) == Cgroup) { | ||
128 | getfromktable(cs, cap->idx); /* get group name */ | ||
129 | if (lp_equal(L, -2, -1)) { /* right group? */ | ||
130 | lua_pop(L, 2); /* remove reference name and group name */ | ||
131 | return cap; | ||
132 | } | ||
133 | else lua_pop(L, 1); /* remove group name */ | ||
134 | } | ||
135 | } | ||
136 | luaL_error(L, "back reference '%s' not found", lua_tostring(L, -1)); | ||
137 | return NULL; /* to avoid warnings */ | ||
138 | } | ||
139 | |||
140 | |||
141 | /* | ||
142 | ** Back-reference capture. Return number of values pushed. | ||
143 | */ | ||
144 | static int backrefcap (CapState *cs) { | ||
145 | int n; | ||
146 | Capture *curr = cs->cap; | ||
147 | pushluaval(cs); /* reference name */ | ||
148 | cs->cap = findback(cs, curr); /* find corresponding group */ | ||
149 | n = pushnestedvalues(cs, 0); /* push group's values */ | ||
150 | cs->cap = curr + 1; | ||
151 | return n; | ||
152 | } | ||
153 | |||
154 | |||
155 | /* | ||
156 | ** Table capture: creates a new table and populates it with nested | ||
157 | ** captures. | ||
158 | */ | ||
159 | static int tablecap (CapState *cs) { | ||
160 | lua_State *L = cs->L; | ||
161 | int n = 0; | ||
162 | lua_newtable(L); | ||
163 | if (isfullcap(cs->cap++)) | ||
164 | return 1; /* table is empty */ | ||
165 | while (!isclosecap(cs->cap)) { | ||
166 | if (captype(cs->cap) == Cgroup && cs->cap->idx != 0) { /* named group? */ | ||
167 | pushluaval(cs); /* push group name */ | ||
168 | pushonenestedvalue(cs); | ||
169 | lua_settable(L, -3); | ||
170 | } | ||
171 | else { /* not a named group */ | ||
172 | int i; | ||
173 | int k = pushcapture(cs); | ||
174 | for (i = k; i > 0; i--) /* store all values into table */ | ||
175 | lua_rawseti(L, -(i + 1), n + i); | ||
176 | n += k; | ||
177 | } | ||
178 | } | ||
179 | cs->cap++; /* skip close entry */ | ||
180 | return 1; /* number of values pushed (only the table) */ | ||
181 | } | ||
182 | |||
183 | |||
184 | /* | ||
185 | ** Table-query capture | ||
186 | */ | ||
187 | static int querycap (CapState *cs) { | ||
188 | int idx = cs->cap->idx; | ||
189 | pushonenestedvalue(cs); /* get nested capture */ | ||
190 | lua_gettable(cs->L, updatecache(cs, idx)); /* query cap. value at table */ | ||
191 | if (!lua_isnil(cs->L, -1)) | ||
192 | return 1; | ||
193 | else { /* no value */ | ||
194 | lua_pop(cs->L, 1); /* remove nil */ | ||
195 | return 0; | ||
196 | } | ||
197 | } | ||
198 | |||
199 | |||
200 | /* | ||
201 | ** Fold capture | ||
202 | */ | ||
203 | static int foldcap (CapState *cs) { | ||
204 | int n; | ||
205 | lua_State *L = cs->L; | ||
206 | int idx = cs->cap->idx; | ||
207 | if (isfullcap(cs->cap++) || /* no nested captures? */ | ||
208 | isclosecap(cs->cap) || /* no nested captures (large subject)? */ | ||
209 | (n = pushcapture(cs)) == 0) /* nested captures with no values? */ | ||
210 | return luaL_error(L, "no initial value for fold capture"); | ||
211 | if (n > 1) | ||
212 | lua_pop(L, n - 1); /* leave only one result for accumulator */ | ||
213 | while (!isclosecap(cs->cap)) { | ||
214 | lua_pushvalue(L, updatecache(cs, idx)); /* get folding function */ | ||
215 | lua_insert(L, -2); /* put it before accumulator */ | ||
216 | n = pushcapture(cs); /* get next capture's values */ | ||
217 | lua_call(L, n + 1, 1); /* call folding function */ | ||
218 | } | ||
219 | cs->cap++; /* skip close entry */ | ||
220 | return 1; /* only accumulator left on the stack */ | ||
221 | } | ||
222 | |||
223 | |||
224 | /* | ||
225 | ** Function capture | ||
226 | */ | ||
227 | static int functioncap (CapState *cs) { | ||
228 | int n; | ||
229 | int top = lua_gettop(cs->L); | ||
230 | pushluaval(cs); /* push function */ | ||
231 | n = pushnestedvalues(cs, 0); /* push nested captures */ | ||
232 | lua_call(cs->L, n, LUA_MULTRET); /* call function */ | ||
233 | return lua_gettop(cs->L) - top; /* return function's results */ | ||
234 | } | ||
235 | |||
236 | |||
237 | /* | ||
238 | ** Select capture | ||
239 | */ | ||
240 | static int numcap (CapState *cs) { | ||
241 | int idx = cs->cap->idx; /* value to select */ | ||
242 | if (idx == 0) { /* no values? */ | ||
243 | nextcap(cs); /* skip entire capture */ | ||
244 | return 0; /* no value produced */ | ||
245 | } | ||
246 | else { | ||
247 | int n = pushnestedvalues(cs, 0); | ||
248 | if (n < idx) /* invalid index? */ | ||
249 | return luaL_error(cs->L, "no capture '%d'", idx); | ||
250 | else { | ||
251 | lua_pushvalue(cs->L, -(n - idx + 1)); /* get selected capture */ | ||
252 | lua_replace(cs->L, -(n + 1)); /* put it in place of 1st capture */ | ||
253 | lua_pop(cs->L, n - 1); /* remove other captures */ | ||
254 | return 1; | ||
255 | } | ||
256 | } | ||
257 | } | ||
258 | |||
259 | |||
260 | /* | ||
261 | ** Return the stack index of the first runtime capture in the given | ||
262 | ** list of captures (or zero if no runtime captures) | ||
263 | */ | ||
264 | int finddyncap (Capture *cap, Capture *last) { | ||
265 | for (; cap < last; cap++) { | ||
266 | if (cap->kind == Cruntime) | ||
267 | return cap->idx; /* stack position of first capture */ | ||
268 | } | ||
269 | return 0; /* no dynamic captures in this segment */ | ||
270 | } | ||
271 | |||
272 | |||
273 | /* | ||
274 | ** Calls a runtime capture. Returns number of captures removed by | ||
275 | ** the call, including the initial Cgroup. (Captures to be added are | ||
276 | ** on the Lua stack.) | ||
277 | */ | ||
278 | int runtimecap (CapState *cs, Capture *close, const char *s, int *rem) { | ||
279 | int n, id; | ||
280 | lua_State *L = cs->L; | ||
281 | int otop = lua_gettop(L); | ||
282 | Capture *open = findopen(close); | ||
283 | assert(captype(open) == Cgroup); | ||
284 | id = finddyncap(open, close); /* get first dynamic capture argument */ | ||
285 | close->kind = Cclose; /* closes the group */ | ||
286 | close->s = s; | ||
287 | cs->cap = open; cs->valuecached = 0; /* prepare capture state */ | ||
288 | luaL_checkstack(L, 4, "too many runtime captures"); | ||
289 | pushluaval(cs); /* push function to be called */ | ||
290 | lua_pushvalue(L, SUBJIDX); /* push original subject */ | ||
291 | lua_pushinteger(L, s - cs->s + 1); /* push current position */ | ||
292 | n = pushnestedvalues(cs, 0); /* push nested captures */ | ||
293 | lua_call(L, n + 2, LUA_MULTRET); /* call dynamic function */ | ||
294 | if (id > 0) { /* are there old dynamic captures to be removed? */ | ||
295 | int i; | ||
296 | for (i = id; i <= otop; i++) | ||
297 | lua_remove(L, id); /* remove old dynamic captures */ | ||
298 | *rem = otop - id + 1; /* total number of dynamic captures removed */ | ||
299 | } | ||
300 | else | ||
301 | *rem = 0; /* no dynamic captures removed */ | ||
302 | return close - open; /* number of captures of all kinds removed */ | ||
303 | } | ||
304 | |||
305 | |||
306 | /* | ||
307 | ** Auxiliary structure for substitution and string captures: keeps | ||
308 | ** information about nested captures for future use, to avoid pushing | ||
309 | ** string results into Lua | ||
310 | */ | ||
311 | typedef struct StrAux { | ||
312 | int isstring; /* whether capture is a string */ | ||
313 | union { | ||
314 | Capture *cp; /* if not a string, respective capture */ | ||
315 | struct { /* if it is a string... */ | ||
316 | const char *s; /* ... starts here */ | ||
317 | const char *e; /* ... ends here */ | ||
318 | } s; | ||
319 | } u; | ||
320 | } StrAux; | ||
321 | |||
322 | #define MAXSTRCAPS 10 | ||
323 | |||
324 | /* | ||
325 | ** Collect values from current capture into array 'cps'. Current | ||
326 | ** capture must be Cstring (first call) or Csimple (recursive calls). | ||
327 | ** (In first call, fills %0 with whole match for Cstring.) | ||
328 | ** Returns number of elements in the array that were filled. | ||
329 | */ | ||
330 | static int getstrcaps (CapState *cs, StrAux *cps, int n) { | ||
331 | int k = n++; | ||
332 | cps[k].isstring = 1; /* get string value */ | ||
333 | cps[k].u.s.s = cs->cap->s; /* starts here */ | ||
334 | if (!isfullcap(cs->cap++)) { /* nested captures? */ | ||
335 | while (!isclosecap(cs->cap)) { /* traverse them */ | ||
336 | if (n >= MAXSTRCAPS) /* too many captures? */ | ||
337 | nextcap(cs); /* skip extra captures (will not need them) */ | ||
338 | else if (captype(cs->cap) == Csimple) /* string? */ | ||
339 | n = getstrcaps(cs, cps, n); /* put info. into array */ | ||
340 | else { | ||
341 | cps[n].isstring = 0; /* not a string */ | ||
342 | cps[n].u.cp = cs->cap; /* keep original capture */ | ||
343 | nextcap(cs); | ||
344 | n++; | ||
345 | } | ||
346 | } | ||
347 | cs->cap++; /* skip close */ | ||
348 | } | ||
349 | cps[k].u.s.e = closeaddr(cs->cap - 1); /* ends here */ | ||
350 | return n; | ||
351 | } | ||
352 | |||
353 | |||
354 | /* | ||
355 | ** add next capture value (which should be a string) to buffer 'b' | ||
356 | */ | ||
357 | static int addonestring (luaL_Buffer *b, CapState *cs, const char *what); | ||
358 | |||
359 | |||
360 | /* | ||
361 | ** String capture: add result to buffer 'b' (instead of pushing | ||
362 | ** it into the stack) | ||
363 | */ | ||
364 | static void stringcap (luaL_Buffer *b, CapState *cs) { | ||
365 | StrAux cps[MAXSTRCAPS]; | ||
366 | int n; | ||
367 | size_t len, i; | ||
368 | const char *fmt; /* format string */ | ||
369 | fmt = lua_tolstring(cs->L, updatecache(cs, cs->cap->idx), &len); | ||
370 | n = getstrcaps(cs, cps, 0) - 1; /* collect nested captures; 'n' is the highest valid index into 'cps' */ | ||
371 | for (i = 0; i < len; i++) { /* traverse the format string */ | ||
372 | if (fmt[i] != '%') /* not an escape? */ | ||
373 | luaL_addchar(b, fmt[i]); /* add it to buffer */ | ||
374 | else if (fmt[++i] < '0' || fmt[i] > '9') /* not followed by a digit? (NOTE(review): when '%' is the last character this reads fmt[len], presumably the terminating '\0', which is then added to the buffer -- confirm) */ | ||
375 | luaL_addchar(b, fmt[i]); /* add the escaped character itself to buffer */ | ||
376 | else { | ||
377 | int l = fmt[i] - '0'; /* capture index */ | ||
378 | if (l > n) /* index beyond the collected captures? */ | ||
379 | luaL_error(cs->L, "invalid capture index (%d)", l); | ||
380 | else if (cps[l].isstring) /* plain string: copy its subject range directly */ | ||
381 | luaL_addlstring(b, cps[l].u.s.s, cps[l].u.s.e - cps[l].u.s.s); | ||
382 | else { | ||
383 | Capture *curr = cs->cap; /* save current position in capture list */ | ||
384 | cs->cap = cps[l].u.cp; /* go back to evaluate that nested capture */ | ||
385 | if (!addonestring(b, cs, "capture")) | ||
386 | luaL_error(cs->L, "no values in capture index %d", l); | ||
387 | cs->cap = curr; /* continue from where it stopped */ | ||
388 | } | ||
389 | } | ||
390 | } | ||
391 | } | ||
392 | |||
393 | |||
394 | /* | ||
395 | ** Substitution capture: add result to buffer 'b' | ||
396 | */ | ||
397 | static void substcap (luaL_Buffer *b, CapState *cs) { | ||
398 | const char *curr = cs->cap->s; | ||
399 | if (isfullcap(cs->cap)) /* no nested captures? */ | ||
400 | luaL_addlstring(b, curr, cs->cap->siz - 1); /* keep original text */ | ||
401 | else { | ||
402 | cs->cap++; /* skip open entry */ | ||
403 | while (!isclosecap(cs->cap)) { /* traverse nested captures */ | ||
404 | const char *next = cs->cap->s; | ||
405 | luaL_addlstring(b, curr, next - curr); /* add text up to capture */ | ||
406 | if (addonestring(b, cs, "replacement")) | ||
407 | curr = closeaddr(cs->cap - 1); /* continue after match */ | ||
408 | else /* no capture value */ | ||
409 | curr = next; /* keep original text in final result */ | ||
410 | } | ||
411 | luaL_addlstring(b, curr, cs->cap->s - curr); /* add last piece of text */ | ||
412 | } | ||
413 | cs->cap++; /* go to next capture */ | ||
414 | } | ||
415 | |||
416 | |||
417 | /* | ||
418 | ** Evaluates a capture and adds its first value to buffer 'b'; returns | ||
419 | ** whether there was a value | ||
420 | */ | ||
421 | static int addonestring (luaL_Buffer *b, CapState *cs, const char *what) { | ||
422 | switch (captype(cs->cap)) { | ||
423 | case Cstring: | ||
424 | stringcap(b, cs); /* add capture directly to buffer */ | ||
425 | return 1; | ||
426 | case Csubst: | ||
427 | substcap(b, cs); /* add capture directly to buffer */ | ||
428 | return 1; | ||
429 | default: { | ||
430 | lua_State *L = cs->L; | ||
431 | int n = pushcapture(cs); | ||
432 | if (n > 0) { | ||
433 | if (n > 1) lua_pop(L, n - 1); /* only one result */ | ||
434 | if (!lua_isstring(L, -1)) | ||
435 | luaL_error(L, "invalid %s value (a %s)", what, luaL_typename(L, -1)); | ||
436 | luaL_addvalue(b); | ||
437 | } | ||
438 | return n; | ||
439 | } | ||
440 | } | ||
441 | } | ||
442 | |||
443 | |||
444 | /* | ||
445 | ** Push all values of the current capture into the stack; returns | ||
446 | ** number of values pushed | ||
447 | */ | ||
448 | static int pushcapture (CapState *cs) { | ||
449 | lua_State *L = cs->L; | ||
450 | luaL_checkstack(L, 4, "too many captures"); | ||
451 | switch (captype(cs->cap)) { | ||
452 | case Cposition: { | ||
453 | lua_pushinteger(L, cs->cap->s - cs->s + 1); | ||
454 | cs->cap++; | ||
455 | return 1; | ||
456 | } | ||
457 | case Cconst: { | ||
458 | pushluaval(cs); | ||
459 | cs->cap++; | ||
460 | return 1; | ||
461 | } | ||
462 | case Carg: { | ||
463 | int arg = (cs->cap++)->idx; | ||
464 | if (arg + FIXEDARGS > cs->ptop) | ||
465 | return luaL_error(L, "reference to absent extra argument #%d", arg); | ||
466 | lua_pushvalue(L, arg + FIXEDARGS); | ||
467 | return 1; | ||
468 | } | ||
469 | case Csimple: { | ||
470 | int k = pushnestedvalues(cs, 1); | ||
471 | lua_insert(L, -k); /* make whole match be first result */ | ||
472 | return k; | ||
473 | } | ||
474 | case Cruntime: { | ||
475 | lua_pushvalue(L, (cs->cap++)->idx); /* value is in the stack */ | ||
476 | return 1; | ||
477 | } | ||
478 | case Cstring: { | ||
479 | luaL_Buffer b; | ||
480 | luaL_buffinit(L, &b); | ||
481 | stringcap(&b, cs); | ||
482 | luaL_pushresult(&b); | ||
483 | return 1; | ||
484 | } | ||
485 | case Csubst: { | ||
486 | luaL_Buffer b; | ||
487 | luaL_buffinit(L, &b); | ||
488 | substcap(&b, cs); | ||
489 | luaL_pushresult(&b); | ||
490 | return 1; | ||
491 | } | ||
492 | case Cgroup: { | ||
493 | if (cs->cap->idx == 0) /* anonymous group? */ | ||
494 | return pushnestedvalues(cs, 0); /* add all nested values */ | ||
495 | else { /* named group: add no values */ | ||
496 | nextcap(cs); /* skip capture */ | ||
497 | return 0; | ||
498 | } | ||
499 | } | ||
500 | case Cbackref: return backrefcap(cs); | ||
501 | case Ctable: return tablecap(cs); | ||
502 | case Cfunction: return functioncap(cs); | ||
503 | case Cnum: return numcap(cs); | ||
504 | case Cquery: return querycap(cs); | ||
505 | case Cfold: return foldcap(cs); | ||
506 | default: assert(0); return 0; | ||
507 | } | ||
508 | } | ||
509 | |||
510 | |||
511 | /* | ||
512 | ** Prepare a CapState structure and traverse the entire list of | ||
513 | ** captures in the stack pushing its results. 's' is the subject | ||
514 | ** string, 'r' is the final position of the match, and 'ptop' | ||
515 | ** the index in the stack where some useful values were pushed. | ||
516 | ** Returns the number of results pushed. (If the list produces no | ||
517 | ** results, push the final position of the match.) | ||
518 | */ | ||
519 | int getcaptures (lua_State *L, const char *s, const char *r, int ptop) { | ||
520 | Capture *capture = (Capture *)lua_touserdata(L, caplistidx(ptop)); | ||
521 | int n = 0; | ||
522 | if (!isclosecap(capture)) { /* is there any capture? */ | ||
523 | CapState cs; | ||
524 | cs.ocap = cs.cap = capture; cs.L = L; | ||
525 | cs.s = s; cs.valuecached = 0; cs.ptop = ptop; | ||
526 | do { /* collect their values */ | ||
527 | n += pushcapture(&cs); | ||
528 | } while (!isclosecap(cs.cap)); | ||
529 | } | ||
530 | if (n == 0) { /* no capture values? */ | ||
531 | lua_pushinteger(L, r - s + 1); /* return only end position */ | ||
532 | n = 1; | ||
533 | } | ||
534 | return n; | ||
535 | } | ||
536 | |||
537 | |||
@@ -0,0 +1,56 @@ | |||
1 | /* | ||
2 | ** $Id: lpcap.h,v 1.3 2016/09/13 17:45:58 roberto Exp $ | ||
3 | */ | ||
4 | |||
5 | #if !defined(lpcap_h) | ||
6 | #define lpcap_h | ||
7 | |||
8 | |||
9 | #include "lptypes.h" | ||
10 | |||
11 | |||
12 | /* kinds of captures */ | ||
13 | typedef enum CapKind { | ||
14 | Cclose, /* not used in trees */ | ||
15 | Cposition, | ||
16 | Cconst, /* ktable[key] is Lua constant */ | ||
17 | Cbackref, /* ktable[key] is "name" of group to get capture */ | ||
18 | Carg, /* 'key' is arg's number */ | ||
19 | Csimple, /* next node is pattern */ | ||
20 | Ctable, /* next node is pattern */ | ||
21 | Cfunction, /* ktable[key] is function; next node is pattern */ | ||
22 | Cquery, /* ktable[key] is table; next node is pattern */ | ||
23 | Cstring, /* ktable[key] is string; next node is pattern */ | ||
24 | Cnum, /* numbered capture; 'key' is number of value to return */ | ||
25 | Csubst, /* substitution capture; next node is pattern */ | ||
26 | Cfold, /* ktable[key] is function; next node is pattern */ | ||
27 | Cruntime, /* not used in trees (it uses another type for tree) */ | ||
28 | Cgroup /* ktable[key] is group's "name" */ | ||
29 | } CapKind; | ||
30 | |||
31 | |||
32 | typedef struct Capture { | ||
33 | const char *s; /* subject position */ | ||
34 | unsigned short idx; /* extra info (group name, arg index, etc.) */ | ||
35 | byte kind; /* kind of capture */ | ||
36 | byte siz; /* size of full capture + 1 (0 = not a full capture) */ | ||
37 | } Capture; | ||
38 | |||
39 | |||
40 | typedef struct CapState { | ||
41 | Capture *cap; /* current capture */ | ||
42 | Capture *ocap; /* (original) capture list */ | ||
43 | lua_State *L; | ||
44 | int ptop; /* index of last argument to 'match' */ | ||
45 | const char *s; /* original string */ | ||
46 | int valuecached; /* value stored in cache slot */ | ||
47 | } CapState; | ||
48 | |||
49 | |||
50 | int runtimecap (CapState *cs, Capture *close, const char *s, int *rem); | ||
51 | int getcaptures (lua_State *L, const char *s, const char *r, int ptop); | ||
52 | int finddyncap (Capture *cap, Capture *last); | ||
53 | |||
54 | #endif | ||
55 | |||
56 | |||
diff --git a/lpcode.c b/lpcode.c new file mode 100644 index 0000000..2722d71 --- /dev/null +++ b/lpcode.c | |||
@@ -0,0 +1,1014 @@ | |||
1 | /* | ||
2 | ** $Id: lpcode.c,v 1.24 2016/09/15 17:46:13 roberto Exp $ | ||
3 | ** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license) | ||
4 | */ | ||
5 | |||
6 | #include <limits.h> | ||
7 | |||
8 | |||
9 | #include "lua.h" | ||
10 | #include "lauxlib.h" | ||
11 | |||
12 | #include "lptypes.h" | ||
13 | #include "lpcode.h" | ||
14 | |||
15 | |||
/*
** Signals a "no-instruction": used where an instruction index is
** expected but there is none. The value is parenthesized so that the
** expansion always behaves as a single operand.
*/
#define NOINST	(-1)
18 | |||
19 | |||
20 | |||
21 | static const Charset fullset_ = | ||
22 | {{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, | ||
23 | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, | ||
24 | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, | ||
25 | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}}; | ||
26 | |||
27 | static const Charset *fullset = &fullset_; | ||
28 | |||
29 | /* | ||
30 | ** {====================================================== | ||
31 | ** Analysis and some optimizations | ||
32 | ** ======================================================= | ||
33 | */ | ||
34 | |||
35 | /* | ||
36 | ** Check whether a charset is empty (returns IFail), singleton (IChar), | ||
37 | ** full (IAny), or none of those (ISet). When singleton, '*c' returns | ||
38 | ** which character it is. (When generic set, the set was the input, | ||
39 | ** so there is no need to return it.) | ||
40 | */ | ||
41 | static Opcode charsettype (const byte *cs, int *c) { | ||
42 | int count = 0; /* number of characters in the set */ | ||
43 | int i; | ||
44 | int candidate = -1; /* candidate position for the singleton char */ | ||
45 | for (i = 0; i < CHARSETSIZE; i++) { /* for each byte */ | ||
46 | int b = cs[i]; | ||
47 | if (b == 0) { /* is byte empty? */ | ||
48 | if (count > 1) /* was set neither empty nor singleton? */ | ||
49 | return ISet; /* neither full nor empty nor singleton */ | ||
50 | /* else set is still empty or singleton */ | ||
51 | } | ||
52 | else if (b == 0xFF) { /* is byte full? */ | ||
53 | if (count < (i * BITSPERCHAR)) /* was set not full? */ | ||
54 | return ISet; /* neither full nor empty nor singleton */ | ||
55 | else count += BITSPERCHAR; /* set is still full */ | ||
56 | } | ||
57 | else if ((b & (b - 1)) == 0) { /* has byte only one bit? */ | ||
58 | if (count > 0) /* was set not empty? */ | ||
59 | return ISet; /* neither full nor empty nor singleton */ | ||
60 | else { /* set has only one char till now; track it */ | ||
61 | count++; | ||
62 | candidate = i; | ||
63 | } | ||
64 | } | ||
65 | else return ISet; /* byte is neither empty, full, nor singleton */ | ||
66 | } | ||
67 | switch (count) { | ||
68 | case 0: return IFail; /* empty set */ | ||
69 | case 1: { /* singleton; find character bit inside byte */ | ||
70 | int b = cs[candidate]; | ||
71 | *c = candidate * BITSPERCHAR; | ||
72 | if ((b & 0xF0) != 0) { *c += 4; b >>= 4; } | ||
73 | if ((b & 0x0C) != 0) { *c += 2; b >>= 2; } | ||
74 | if ((b & 0x02) != 0) { *c += 1; } | ||
75 | return IChar; | ||
76 | } | ||
77 | default: { | ||
78 | assert(count == CHARSETSIZE * BITSPERCHAR); /* full set */ | ||
79 | return IAny; | ||
80 | } | ||
81 | } | ||
82 | } | ||
83 | |||
84 | |||
85 | /* | ||
86 | ** A few basic operations on Charsets | ||
87 | */ | ||
88 | static void cs_complement (Charset *cs) { | ||
89 | loopset(i, cs->cs[i] = ~cs->cs[i]); | ||
90 | } | ||
91 | |||
92 | static int cs_equal (const byte *cs1, const byte *cs2) { | ||
93 | loopset(i, if (cs1[i] != cs2[i]) return 0); | ||
94 | return 1; | ||
95 | } | ||
96 | |||
97 | static int cs_disjoint (const Charset *cs1, const Charset *cs2) { | ||
98 | loopset(i, if ((cs1->cs[i] & cs2->cs[i]) != 0) return 0;) | ||
99 | return 1; | ||
100 | } | ||
101 | |||
102 | |||
103 | /* | ||
104 | ** If 'tree' is a 'char' pattern (TSet, TChar, TAny), convert it into a | ||
105 | ** charset and return 1; else return 0. | ||
106 | */ | ||
107 | int tocharset (TTree *tree, Charset *cs) { | ||
108 | switch (tree->tag) { | ||
109 | case TSet: { /* copy set */ | ||
110 | loopset(i, cs->cs[i] = treebuffer(tree)[i]); | ||
111 | return 1; | ||
112 | } | ||
113 | case TChar: { /* only one char */ | ||
114 | assert(0 <= tree->u.n && tree->u.n <= UCHAR_MAX); | ||
115 | loopset(i, cs->cs[i] = 0); /* erase all chars */ | ||
116 | setchar(cs->cs, tree->u.n); /* add that one */ | ||
117 | return 1; | ||
118 | } | ||
119 | case TAny: { | ||
120 | loopset(i, cs->cs[i] = 0xFF); /* add all characters to the set */ | ||
121 | return 1; | ||
122 | } | ||
123 | default: return 0; | ||
124 | } | ||
125 | } | ||
126 | |||
127 | |||
128 | /* | ||
129 | ** Visit a TCall node taking care to stop recursion. If node not yet | ||
130 | ** visited, return 'f(sib2(tree))', otherwise return 'def' (default | ||
131 | ** value) | ||
132 | */ | ||
133 | static int callrecursive (TTree *tree, int f (TTree *t), int def) { | ||
134 | int key = tree->key; | ||
135 | assert(tree->tag == TCall); | ||
136 | assert(sib2(tree)->tag == TRule); | ||
137 | if (key == 0) /* node already visited? */ | ||
138 | return def; /* return default value */ | ||
139 | else { /* first visit */ | ||
140 | int result; | ||
141 | tree->key = 0; /* mark call as already visited */ | ||
142 | result = f(sib2(tree)); /* go to called rule */ | ||
143 | tree->key = key; /* restore tree */ | ||
144 | return result; | ||
145 | } | ||
146 | } | ||
147 | |||
148 | |||
149 | /* | ||
150 | ** Check whether a pattern tree has captures | ||
151 | */ | ||
152 | int hascaptures (TTree *tree) { | ||
153 | tailcall: | ||
154 | switch (tree->tag) { | ||
155 | case TCapture: case TRunTime: | ||
156 | return 1; | ||
157 | case TCall: | ||
158 | return callrecursive(tree, hascaptures, 0); | ||
159 | case TRule: /* do not follow siblings */ | ||
160 | tree = sib1(tree); goto tailcall; | ||
161 | case TOpenCall: assert(0); | ||
162 | default: { | ||
163 | switch (numsiblings[tree->tag]) { | ||
164 | case 1: /* return hascaptures(sib1(tree)); */ | ||
165 | tree = sib1(tree); goto tailcall; | ||
166 | case 2: | ||
167 | if (hascaptures(sib1(tree))) | ||
168 | return 1; | ||
169 | /* else return hascaptures(sib2(tree)); */ | ||
170 | tree = sib2(tree); goto tailcall; | ||
171 | default: assert(numsiblings[tree->tag] == 0); return 0; | ||
172 | } | ||
173 | } | ||
174 | } | ||
175 | } | ||
176 | |||
177 | |||
178 | /* | ||
179 | ** Checks how a pattern behaves regarding the empty string, | ||
180 | ** in one of two different ways: | ||
181 | ** A pattern is *nullable* if it can match without consuming any character; | ||
182 | ** A pattern is *nofail* if it never fails for any string | ||
183 | ** (including the empty string). | ||
184 | ** The difference is only for predicates and run-time captures; | ||
185 | ** for other patterns, the two properties are equivalent. | ||
186 | ** (With predicates, &'a' is nullable but not nofail. Of course, | ||
187 | ** nofail => nullable.) | ||
188 | ** These functions are all convervative in the following way: | ||
189 | ** p is nullable => nullable(p) | ||
190 | ** nofail(p) => p cannot fail | ||
191 | ** The function assumes that TOpenCall is not nullable; | ||
192 | ** this will be checked again when the grammar is fixed. | ||
193 | ** Run-time captures can do whatever they want, so the result | ||
194 | ** is conservative. | ||
195 | */ | ||
196 | int checkaux (TTree *tree, int pred) { | ||
197 | tailcall: | ||
198 | switch (tree->tag) { | ||
199 | case TChar: case TSet: case TAny: | ||
200 | case TFalse: case TOpenCall: | ||
201 | return 0; /* not nullable */ | ||
202 | case TRep: case TTrue: | ||
203 | return 1; /* no fail */ | ||
204 | case TNot: case TBehind: /* can match empty, but can fail */ | ||
205 | if (pred == PEnofail) return 0; | ||
206 | else return 1; /* PEnullable */ | ||
207 | case TAnd: /* can match empty; fail iff body does */ | ||
208 | if (pred == PEnullable) return 1; | ||
209 | /* else return checkaux(sib1(tree), pred); */ | ||
210 | tree = sib1(tree); goto tailcall; | ||
211 | case TRunTime: /* can fail; match empty iff body does */ | ||
212 | if (pred == PEnofail) return 0; | ||
213 | /* else return checkaux(sib1(tree), pred); */ | ||
214 | tree = sib1(tree); goto tailcall; | ||
215 | case TSeq: | ||
216 | if (!checkaux(sib1(tree), pred)) return 0; | ||
217 | /* else return checkaux(sib2(tree), pred); */ | ||
218 | tree = sib2(tree); goto tailcall; | ||
219 | case TChoice: | ||
220 | if (checkaux(sib2(tree), pred)) return 1; | ||
221 | /* else return checkaux(sib1(tree), pred); */ | ||
222 | tree = sib1(tree); goto tailcall; | ||
223 | case TCapture: case TGrammar: case TRule: | ||
224 | /* return checkaux(sib1(tree), pred); */ | ||
225 | tree = sib1(tree); goto tailcall; | ||
226 | case TCall: /* return checkaux(sib2(tree), pred); */ | ||
227 | tree = sib2(tree); goto tailcall; | ||
228 | default: assert(0); return 0; | ||
229 | } | ||
230 | } | ||
231 | |||
232 | |||
233 | /* | ||
234 | ** number of characters to match a pattern (or -1 if variable) | ||
235 | */ | ||
236 | int fixedlen (TTree *tree) { | ||
237 | int len = 0; /* to accumulate in tail calls */ | ||
238 | tailcall: | ||
239 | switch (tree->tag) { | ||
240 | case TChar: case TSet: case TAny: | ||
241 | return len + 1; | ||
242 | case TFalse: case TTrue: case TNot: case TAnd: case TBehind: | ||
243 | return len; | ||
244 | case TRep: case TRunTime: case TOpenCall: | ||
245 | return -1; | ||
246 | case TCapture: case TRule: case TGrammar: | ||
247 | /* return fixedlen(sib1(tree)); */ | ||
248 | tree = sib1(tree); goto tailcall; | ||
249 | case TCall: { | ||
250 | int n1 = callrecursive(tree, fixedlen, -1); | ||
251 | if (n1 < 0) | ||
252 | return -1; | ||
253 | else | ||
254 | return len + n1; | ||
255 | } | ||
256 | case TSeq: { | ||
257 | int n1 = fixedlen(sib1(tree)); | ||
258 | if (n1 < 0) | ||
259 | return -1; | ||
260 | /* else return fixedlen(sib2(tree)) + len; */ | ||
261 | len += n1; tree = sib2(tree); goto tailcall; | ||
262 | } | ||
263 | case TChoice: { | ||
264 | int n1 = fixedlen(sib1(tree)); | ||
265 | int n2 = fixedlen(sib2(tree)); | ||
266 | if (n1 != n2 || n1 < 0) | ||
267 | return -1; | ||
268 | else | ||
269 | return len + n1; | ||
270 | } | ||
271 | default: assert(0); return 0; | ||
272 | }; | ||
273 | } | ||
274 | |||
275 | |||
276 | /* | ||
277 | ** Computes the 'first set' of a pattern. | ||
278 | ** The result is a conservative aproximation: | ||
279 | ** match p ax -> x (for some x) ==> a belongs to first(p) | ||
280 | ** or | ||
281 | ** a not in first(p) ==> match p ax -> fail (for all x) | ||
282 | ** | ||
283 | ** The set 'follow' is the first set of what follows the | ||
284 | ** pattern (full set if nothing follows it). | ||
285 | ** | ||
286 | ** The function returns 0 when this resulting set can be used for | ||
287 | ** test instructions that avoid the pattern altogether. | ||
288 | ** A non-zero return can happen for two reasons: | ||
289 | ** 1) match p '' -> '' ==> return has bit 1 set | ||
290 | ** (tests cannot be used because they would always fail for an empty input); | ||
291 | ** 2) there is a match-time capture ==> return has bit 2 set | ||
292 | ** (optimizations should not bypass match-time captures). | ||
293 | */ | ||
294 | static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { | ||
295 | tailcall: | ||
296 | switch (tree->tag) { | ||
297 | case TChar: case TSet: case TAny: { | ||
298 | tocharset(tree, firstset); | ||
299 | return 0; | ||
300 | } | ||
301 | case TTrue: { | ||
302 | loopset(i, firstset->cs[i] = follow->cs[i]); | ||
303 | return 1; /* accepts the empty string */ | ||
304 | } | ||
305 | case TFalse: { | ||
306 | loopset(i, firstset->cs[i] = 0); | ||
307 | return 0; | ||
308 | } | ||
309 | case TChoice: { | ||
310 | Charset csaux; | ||
311 | int e1 = getfirst(sib1(tree), follow, firstset); | ||
312 | int e2 = getfirst(sib2(tree), follow, &csaux); | ||
313 | loopset(i, firstset->cs[i] |= csaux.cs[i]); | ||
314 | return e1 | e2; | ||
315 | } | ||
316 | case TSeq: { | ||
317 | if (!nullable(sib1(tree))) { | ||
318 | /* when p1 is not nullable, p2 has nothing to contribute; | ||
319 | return getfirst(sib1(tree), fullset, firstset); */ | ||
320 | tree = sib1(tree); follow = fullset; goto tailcall; | ||
321 | } | ||
322 | else { /* FIRST(p1 p2, fl) = FIRST(p1, FIRST(p2, fl)) */ | ||
323 | Charset csaux; | ||
324 | int e2 = getfirst(sib2(tree), follow, &csaux); | ||
325 | int e1 = getfirst(sib1(tree), &csaux, firstset); | ||
326 | if (e1 == 0) return 0; /* 'e1' ensures that first can be used */ | ||
327 | else if ((e1 | e2) & 2) /* one of the children has a matchtime? */ | ||
328 | return 2; /* pattern has a matchtime capture */ | ||
329 | else return e2; /* else depends on 'e2' */ | ||
330 | } | ||
331 | } | ||
332 | case TRep: { | ||
333 | getfirst(sib1(tree), follow, firstset); | ||
334 | loopset(i, firstset->cs[i] |= follow->cs[i]); | ||
335 | return 1; /* accept the empty string */ | ||
336 | } | ||
337 | case TCapture: case TGrammar: case TRule: { | ||
338 | /* return getfirst(sib1(tree), follow, firstset); */ | ||
339 | tree = sib1(tree); goto tailcall; | ||
340 | } | ||
341 | case TRunTime: { /* function invalidates any follow info. */ | ||
342 | int e = getfirst(sib1(tree), fullset, firstset); | ||
343 | if (e) return 2; /* function is not "protected"? */ | ||
344 | else return 0; /* pattern inside capture ensures first can be used */ | ||
345 | } | ||
346 | case TCall: { | ||
347 | /* return getfirst(sib2(tree), follow, firstset); */ | ||
348 | tree = sib2(tree); goto tailcall; | ||
349 | } | ||
350 | case TAnd: { | ||
351 | int e = getfirst(sib1(tree), follow, firstset); | ||
352 | loopset(i, firstset->cs[i] &= follow->cs[i]); | ||
353 | return e; | ||
354 | } | ||
355 | case TNot: { | ||
356 | if (tocharset(sib1(tree), firstset)) { | ||
357 | cs_complement(firstset); | ||
358 | return 1; | ||
359 | } | ||
360 | /* else go through */ | ||
361 | } | ||
362 | case TBehind: { /* instruction gives no new information */ | ||
363 | /* call 'getfirst' only to check for math-time captures */ | ||
364 | int e = getfirst(sib1(tree), follow, firstset); | ||
365 | loopset(i, firstset->cs[i] = follow->cs[i]); /* uses follow */ | ||
366 | return e | 1; /* always can accept the empty string */ | ||
367 | } | ||
368 | default: assert(0); return 0; | ||
369 | } | ||
370 | } | ||
371 | |||
372 | |||
373 | /* | ||
374 | ** If 'headfail(tree)' true, then 'tree' can fail only depending on the | ||
375 | ** next character of the subject. | ||
376 | */ | ||
377 | static int headfail (TTree *tree) { | ||
378 | tailcall: | ||
379 | switch (tree->tag) { | ||
380 | case TChar: case TSet: case TAny: case TFalse: | ||
381 | return 1; | ||
382 | case TTrue: case TRep: case TRunTime: case TNot: | ||
383 | case TBehind: | ||
384 | return 0; | ||
385 | case TCapture: case TGrammar: case TRule: case TAnd: | ||
386 | tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */ | ||
387 | case TCall: | ||
388 | tree = sib2(tree); goto tailcall; /* return headfail(sib2(tree)); */ | ||
389 | case TSeq: | ||
390 | if (!nofail(sib2(tree))) return 0; | ||
391 | /* else return headfail(sib1(tree)); */ | ||
392 | tree = sib1(tree); goto tailcall; | ||
393 | case TChoice: | ||
394 | if (!headfail(sib1(tree))) return 0; | ||
395 | /* else return headfail(sib2(tree)); */ | ||
396 | tree = sib2(tree); goto tailcall; | ||
397 | default: assert(0); return 0; | ||
398 | } | ||
399 | } | ||
400 | |||
401 | |||
402 | /* | ||
403 | ** Check whether the code generation for the given tree can benefit | ||
404 | ** from a follow set (to avoid computing the follow set when it is | ||
405 | ** not needed) | ||
406 | */ | ||
407 | static int needfollow (TTree *tree) { | ||
408 | tailcall: | ||
409 | switch (tree->tag) { | ||
410 | case TChar: case TSet: case TAny: | ||
411 | case TFalse: case TTrue: case TAnd: case TNot: | ||
412 | case TRunTime: case TGrammar: case TCall: case TBehind: | ||
413 | return 0; | ||
414 | case TChoice: case TRep: | ||
415 | return 1; | ||
416 | case TCapture: | ||
417 | tree = sib1(tree); goto tailcall; | ||
418 | case TSeq: | ||
419 | tree = sib2(tree); goto tailcall; | ||
420 | default: assert(0); return 0; | ||
421 | } | ||
422 | } | ||
423 | |||
424 | /* }====================================================== */ | ||
425 | |||
426 | |||
427 | |||
428 | /* | ||
429 | ** {====================================================== | ||
430 | ** Code generation | ||
431 | ** ======================================================= | ||
432 | */ | ||
433 | |||
434 | |||
435 | /* | ||
436 | ** size of an instruction | ||
437 | */ | ||
438 | int sizei (const Instruction *i) { | ||
439 | switch((Opcode)i->i.code) { | ||
440 | case ISet: case ISpan: return CHARSETINSTSIZE; | ||
441 | case ITestSet: return CHARSETINSTSIZE + 1; | ||
442 | case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall: | ||
443 | case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit: | ||
444 | return 2; | ||
445 | default: return 1; | ||
446 | } | ||
447 | } | ||
448 | |||
449 | |||
450 | /* | ||
451 | ** state for the compiler | ||
452 | */ | ||
453 | typedef struct CompileState { | ||
454 | Pattern *p; /* pattern being compiled */ | ||
455 | int ncode; /* next position in p->code to be filled */ | ||
456 | lua_State *L; | ||
457 | } CompileState; | ||
458 | |||
459 | |||
460 | /* | ||
461 | ** code generation is recursive; 'opt' indicates that the code is being | ||
462 | ** generated as the last thing inside an optional pattern (so, if that | ||
463 | ** code is optional too, it can reuse the 'IChoice' already in place for | ||
464 | ** the outer pattern). 'tt' points to a previous test protecting this | ||
465 | ** code (or NOINST). 'fl' is the follow set of the pattern. | ||
466 | */ | ||
467 | static void codegen (CompileState *compst, TTree *tree, int opt, int tt, | ||
468 | const Charset *fl); | ||
469 | |||
470 | |||
471 | void realloccode (lua_State *L, Pattern *p, int nsize) { | ||
472 | void *ud; | ||
473 | lua_Alloc f = lua_getallocf(L, &ud); | ||
474 | void *newblock = f(ud, p->code, p->codesize * sizeof(Instruction), | ||
475 | nsize * sizeof(Instruction)); | ||
476 | if (newblock == NULL && nsize > 0) | ||
477 | luaL_error(L, "not enough memory"); | ||
478 | p->code = (Instruction *)newblock; | ||
479 | p->codesize = nsize; | ||
480 | } | ||
481 | |||
482 | |||
483 | static int nextinstruction (CompileState *compst) { | ||
484 | int size = compst->p->codesize; | ||
485 | if (compst->ncode >= size) | ||
486 | realloccode(compst->L, compst->p, size * 2); | ||
487 | return compst->ncode++; | ||
488 | } | ||
489 | |||
490 | |||
/* Instruction at index 'i' in the code of the pattern being compiled */
#define getinstr(cs,i)		((cs)->p->code[(i)])
492 | |||
493 | |||
494 | static int addinstruction (CompileState *compst, Opcode op, int aux) { | ||
495 | int i = nextinstruction(compst); | ||
496 | getinstr(compst, i).i.code = op; | ||
497 | getinstr(compst, i).i.aux = aux; | ||
498 | return i; | ||
499 | } | ||
500 | |||
501 | |||
502 | /* | ||
503 | ** Add an instruction followed by space for an offset (to be set later) | ||
504 | */ | ||
505 | static int addoffsetinst (CompileState *compst, Opcode op) { | ||
506 | int i = addinstruction(compst, op, 0); /* instruction */ | ||
507 | addinstruction(compst, (Opcode)0, 0); /* open space for offset */ | ||
508 | assert(op == ITestSet || sizei(&getinstr(compst, i)) == 2); | ||
509 | return i; | ||
510 | } | ||
511 | |||
512 | |||
513 | /* | ||
514 | ** Set the offset of an instruction | ||
515 | */ | ||
516 | static void setoffset (CompileState *compst, int instruction, int offset) { | ||
517 | getinstr(compst, instruction + 1).offset = offset; | ||
518 | } | ||
519 | |||
520 | |||
521 | /* | ||
522 | ** Add a capture instruction: | ||
523 | ** 'op' is the capture instruction; 'cap' the capture kind; | ||
524 | ** 'key' the key into ktable; 'aux' is the optional capture offset | ||
525 | ** | ||
526 | */ | ||
527 | static int addinstcap (CompileState *compst, Opcode op, int cap, int key, | ||
528 | int aux) { | ||
529 | int i = addinstruction(compst, op, joinkindoff(cap, aux)); | ||
530 | getinstr(compst, i).i.key = key; | ||
531 | return i; | ||
532 | } | ||
533 | |||
534 | |||
/* Index where the next instruction will be placed */
#define gethere(compst) 	((compst)->ncode)

/*
** Absolute position an offset instruction at index 'i' points to:
** its own index plus the offset stored in the following slot. Both
** arguments are parenthesized so that expression arguments (such as
** 'p + 1' or 'i & 1') index the intended slot.
*/
#define target(code,i)		((i) + (code)[(i) + 1].offset)
538 | |||
539 | |||
540 | /* | ||
541 | ** Patch 'instruction' to jump to 'target' | ||
542 | */ | ||
543 | static void jumptothere (CompileState *compst, int instruction, int target) { | ||
544 | if (instruction >= 0) | ||
545 | setoffset(compst, instruction, target - instruction); | ||
546 | } | ||
547 | |||
548 | |||
549 | /* | ||
550 | ** Patch 'instruction' to jump to current position | ||
551 | */ | ||
552 | static void jumptohere (CompileState *compst, int instruction) { | ||
553 | jumptothere(compst, instruction, gethere(compst)); | ||
554 | } | ||
555 | |||
556 | |||
557 | /* | ||
558 | ** Code an IChar instruction, or IAny if there is an equivalent | ||
559 | ** test dominating it | ||
560 | */ | ||
561 | static void codechar (CompileState *compst, int c, int tt) { | ||
562 | if (tt >= 0 && getinstr(compst, tt).i.code == ITestChar && | ||
563 | getinstr(compst, tt).i.aux == c) | ||
564 | addinstruction(compst, IAny, 0); | ||
565 | else | ||
566 | addinstruction(compst, IChar, c); | ||
567 | } | ||
568 | |||
569 | |||
570 | /* | ||
571 | ** Add a charset posfix to an instruction | ||
572 | */ | ||
573 | static void addcharset (CompileState *compst, const byte *cs) { | ||
574 | int p = gethere(compst); | ||
575 | int i; | ||
576 | for (i = 0; i < (int)CHARSETINSTSIZE - 1; i++) | ||
577 | nextinstruction(compst); /* space for buffer */ | ||
578 | /* fill buffer with charset */ | ||
579 | loopset(j, getinstr(compst, p).buff[j] = cs[j]); | ||
580 | } | ||
581 | |||
582 | |||
583 | /* | ||
584 | ** code a char set, optimizing unit sets for IChar, "complete" | ||
585 | ** sets for IAny, and empty sets for IFail; also use an IAny | ||
586 | ** when instruction is dominated by an equivalent test. | ||
587 | */ | ||
588 | static void codecharset (CompileState *compst, const byte *cs, int tt) { | ||
589 | int c = 0; /* (=) to avoid warnings */ | ||
590 | Opcode op = charsettype(cs, &c); | ||
591 | switch (op) { | ||
592 | case IChar: codechar(compst, c, tt); break; | ||
593 | case ISet: { /* non-trivial set? */ | ||
594 | if (tt >= 0 && getinstr(compst, tt).i.code == ITestSet && | ||
595 | cs_equal(cs, getinstr(compst, tt + 2).buff)) | ||
596 | addinstruction(compst, IAny, 0); | ||
597 | else { | ||
598 | addinstruction(compst, ISet, 0); | ||
599 | addcharset(compst, cs); | ||
600 | } | ||
601 | break; | ||
602 | } | ||
603 | default: addinstruction(compst, op, c); break; | ||
604 | } | ||
605 | } | ||
606 | |||
607 | |||
608 | /* | ||
609 | ** code a test set, optimizing unit sets for ITestChar, "complete" | ||
610 | ** sets for ITestAny, and empty sets for IJmp (always fails). | ||
611 | ** 'e' is true iff test should accept the empty string. (Test | ||
612 | ** instructions in the current VM never accept the empty string.) | ||
613 | */ | ||
614 | static int codetestset (CompileState *compst, Charset *cs, int e) { | ||
615 | if (e) return NOINST; /* no test */ | ||
616 | else { | ||
617 | int c = 0; | ||
618 | Opcode op = charsettype(cs->cs, &c); | ||
619 | switch (op) { | ||
620 | case IFail: return addoffsetinst(compst, IJmp); /* always jump */ | ||
621 | case IAny: return addoffsetinst(compst, ITestAny); | ||
622 | case IChar: { | ||
623 | int i = addoffsetinst(compst, ITestChar); | ||
624 | getinstr(compst, i).i.aux = c; | ||
625 | return i; | ||
626 | } | ||
627 | case ISet: { | ||
628 | int i = addoffsetinst(compst, ITestSet); | ||
629 | addcharset(compst, cs->cs); | ||
630 | return i; | ||
631 | } | ||
632 | default: assert(0); return 0; | ||
633 | } | ||
634 | } | ||
635 | } | ||
636 | |||
637 | |||
638 | /* | ||
639 | ** Find the final destination of a sequence of jumps | ||
640 | */ | ||
641 | static int finaltarget (Instruction *code, int i) { | ||
642 | while (code[i].i.code == IJmp) | ||
643 | i = target(code, i); | ||
644 | return i; | ||
645 | } | ||
646 | |||
647 | |||
648 | /* | ||
649 | ** final label (after traversing any jumps) | ||
650 | */ | ||
651 | static int finallabel (Instruction *code, int i) { | ||
652 | return finaltarget(code, target(code, i)); | ||
653 | } | ||
654 | |||
655 | |||
656 | /* | ||
657 | ** <behind(p)> == behind n; <p> (where n = fixedlen(p)) | ||
658 | */ | ||
659 | static void codebehind (CompileState *compst, TTree *tree) { | ||
660 | if (tree->u.n > 0) | ||
661 | addinstruction(compst, IBehind, tree->u.n); | ||
662 | codegen(compst, sib1(tree), 0, NOINST, fullset); | ||
663 | } | ||
664 | |||
665 | |||
666 | /* | ||
667 | ** Choice; optimizations: | ||
668 | ** - when p1 is headfail or | ||
669 | ** when first(p1) and first(p2) are disjoint, than | ||
670 | ** a character not in first(p1) cannot go to p1, and a character | ||
671 | ** in first(p1) cannot go to p2 (at it is not in first(p2)). | ||
672 | ** (The optimization is not valid if p1 accepts the empty string, | ||
673 | ** as then there is no character at all...) | ||
674 | ** - when p2 is empty and opt is true; a IPartialCommit can reuse | ||
675 | ** the Choice already active in the stack. | ||
676 | */ | ||
677 | static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt, | ||
678 | const Charset *fl) { | ||
679 | int emptyp2 = (p2->tag == TTrue); | ||
680 | Charset cs1, cs2; | ||
681 | int e1 = getfirst(p1, fullset, &cs1); | ||
682 | if (headfail(p1) || | ||
683 | (!e1 && (getfirst(p2, fl, &cs2), cs_disjoint(&cs1, &cs2)))) { | ||
684 | /* <p1 / p2> == test (fail(p1)) -> L1 ; p1 ; jmp L2; L1: p2; L2: */ | ||
685 | int test = codetestset(compst, &cs1, 0); | ||
686 | int jmp = NOINST; | ||
687 | codegen(compst, p1, 0, test, fl); | ||
688 | if (!emptyp2) | ||
689 | jmp = addoffsetinst(compst, IJmp); | ||
690 | jumptohere(compst, test); | ||
691 | codegen(compst, p2, opt, NOINST, fl); | ||
692 | jumptohere(compst, jmp); | ||
693 | } | ||
694 | else if (opt && emptyp2) { | ||
695 | /* p1? == IPartialCommit; p1 */ | ||
696 | jumptohere(compst, addoffsetinst(compst, IPartialCommit)); | ||
697 | codegen(compst, p1, 1, NOINST, fullset); | ||
698 | } | ||
699 | else { | ||
700 | /* <p1 / p2> == | ||
701 | test(first(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: */ | ||
702 | int pcommit; | ||
703 | int test = codetestset(compst, &cs1, e1); | ||
704 | int pchoice = addoffsetinst(compst, IChoice); | ||
705 | codegen(compst, p1, emptyp2, test, fullset); | ||
706 | pcommit = addoffsetinst(compst, ICommit); | ||
707 | jumptohere(compst, pchoice); | ||
708 | jumptohere(compst, test); | ||
709 | codegen(compst, p2, opt, NOINST, fl); | ||
710 | jumptohere(compst, pcommit); | ||
711 | } | ||
712 | } | ||
713 | |||
714 | |||
715 | /* | ||
716 | ** And predicate | ||
717 | ** optimization: fixedlen(p) = n ==> <&p> == <p>; behind n | ||
718 | ** (valid only when 'p' has no captures) | ||
719 | */ | ||
720 | static void codeand (CompileState *compst, TTree *tree, int tt) { | ||
721 | int n = fixedlen(tree); | ||
722 | if (n >= 0 && n <= MAXBEHIND && !hascaptures(tree)) { | ||
723 | codegen(compst, tree, 0, tt, fullset); | ||
724 | if (n > 0) | ||
725 | addinstruction(compst, IBehind, n); | ||
726 | } | ||
727 | else { /* default: Choice L1; p1; BackCommit L2; L1: Fail; L2: */ | ||
728 | int pcommit; | ||
729 | int pchoice = addoffsetinst(compst, IChoice); | ||
730 | codegen(compst, tree, 0, tt, fullset); | ||
731 | pcommit = addoffsetinst(compst, IBackCommit); | ||
732 | jumptohere(compst, pchoice); | ||
733 | addinstruction(compst, IFail, 0); | ||
734 | jumptohere(compst, pcommit); | ||
735 | } | ||
736 | } | ||
737 | |||
738 | |||
739 | /* | ||
740 | ** Captures: if pattern has fixed (and not too big) length, and it | ||
741 | ** has no nested captures, use a single IFullCapture instruction | ||
742 | ** after the match; otherwise, enclose the pattern with OpenCapture - | ||
743 | ** CloseCapture. | ||
744 | */ | ||
745 | static void codecapture (CompileState *compst, TTree *tree, int tt, | ||
746 | const Charset *fl) { | ||
747 | int len = fixedlen(sib1(tree)); | ||
748 | if (len >= 0 && len <= MAXOFF && !hascaptures(sib1(tree))) { | ||
749 | codegen(compst, sib1(tree), 0, tt, fl); | ||
750 | addinstcap(compst, IFullCapture, tree->cap, tree->key, len); | ||
751 | } | ||
752 | else { | ||
753 | addinstcap(compst, IOpenCapture, tree->cap, tree->key, 0); | ||
754 | codegen(compst, sib1(tree), 0, tt, fl); | ||
755 | addinstcap(compst, ICloseCapture, Cclose, 0, 0); | ||
756 | } | ||
757 | } | ||
758 | |||
759 | |||
760 | static void coderuntime (CompileState *compst, TTree *tree, int tt) { | ||
761 | addinstcap(compst, IOpenCapture, Cgroup, tree->key, 0); | ||
762 | codegen(compst, sib1(tree), 0, tt, fullset); | ||
763 | addinstcap(compst, ICloseRunTime, Cclose, 0, 0); | ||
764 | } | ||
765 | |||
766 | |||
767 | /* | ||
768 | ** Repetion; optimizations: | ||
769 | ** When pattern is a charset, can use special instruction ISpan. | ||
770 | ** When pattern is head fail, or if it starts with characters that | ||
771 | ** are disjoint from what follows the repetions, a simple test | ||
772 | ** is enough (a fail inside the repetition would backtrack to fail | ||
773 | ** again in the following pattern, so there is no need for a choice). | ||
774 | ** When 'opt' is true, the repetion can reuse the Choice already | ||
775 | ** active in the stack. | ||
776 | */ | ||
777 | static void coderep (CompileState *compst, TTree *tree, int opt, | ||
778 | const Charset *fl) { | ||
779 | Charset st; | ||
780 | if (tocharset(tree, &st)) { | ||
781 | addinstruction(compst, ISpan, 0); | ||
782 | addcharset(compst, st.cs); | ||
783 | } | ||
784 | else { | ||
785 | int e1 = getfirst(tree, fullset, &st); | ||
786 | if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) { | ||
787 | /* L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: */ | ||
788 | int jmp; | ||
789 | int test = codetestset(compst, &st, 0); | ||
790 | codegen(compst, tree, 0, test, fullset); | ||
791 | jmp = addoffsetinst(compst, IJmp); | ||
792 | jumptohere(compst, test); | ||
793 | jumptothere(compst, jmp, test); | ||
794 | } | ||
795 | else { | ||
796 | /* test(fail(p1)) -> L2; choice L2; L1: <p>; partialcommit L1; L2: */ | ||
797 | /* or (if 'opt'): partialcommit L1; L1: <p>; partialcommit L1; */ | ||
798 | int commit, l2; | ||
799 | int test = codetestset(compst, &st, e1); | ||
800 | int pchoice = NOINST; | ||
801 | if (opt) | ||
802 | jumptohere(compst, addoffsetinst(compst, IPartialCommit)); | ||
803 | else | ||
804 | pchoice = addoffsetinst(compst, IChoice); | ||
805 | l2 = gethere(compst); | ||
806 | codegen(compst, tree, 0, NOINST, fullset); | ||
807 | commit = addoffsetinst(compst, IPartialCommit); | ||
808 | jumptothere(compst, commit, l2); | ||
809 | jumptohere(compst, pchoice); | ||
810 | jumptohere(compst, test); | ||
811 | } | ||
812 | } | ||
813 | } | ||
814 | |||
815 | |||
816 | /* | ||
817 | ** Not predicate; optimizations: | ||
818 | ** In any case, if first test fails, 'not' succeeds, so it can jump to | ||
819 | ** the end. If pattern is headfail, that is all (it cannot fail | ||
820 | ** in other parts); this case includes 'not' of simple sets. Otherwise, | ||
821 | ** use the default code (a choice plus a failtwice). | ||
822 | */ | ||
823 | static void codenot (CompileState *compst, TTree *tree) { | ||
824 | Charset st; /* first set of 'tree', filled in by 'getfirst' */ | ||
825 | int e = getfirst(tree, fullset, &st); /* 'e' is handed on to 'codetestset' */ | ||
826 | int test = codetestset(compst, &st, e); /* test whose label is fixed at the end */ | ||
827 | if (headfail(tree)) /* test (fail(p1)) -> L1; fail; L1: */ | ||
828 | addinstruction(compst, IFail, 0); | ||
829 | else { | ||
830 | /* test(fail(p))-> L1; choice L1; <p>; failtwice; L1: */ | ||
831 | int pchoice = addoffsetinst(compst, IChoice); /* label set to L1 below */ | ||
832 | codegen(compst, tree, 0, NOINST, fullset); /* <p> */ | ||
833 | addinstruction(compst, IFailTwice, 0); | ||
834 | jumptohere(compst, pchoice); /* choice -> L1 */ | ||
835 | } | ||
836 | jumptohere(compst, test); /* L1: test jumps here */ | ||
837 | } | ||
838 | |||
839 | |||
840 | /* | ||
841 | ** Change open calls in code range [from, to) to calls, using list | ||
842 | ** 'positions' (rule number -> code position) to find correct offsets; | ||
843 | ** also optimize tail calls (a call that leads to a 'ret' becomes a jump) | ||
844 | */ | ||
845 | static void correctcalls (CompileState *compst, int *positions, | ||
846 | int from, int to) { | ||
847 | int i; | ||
848 | Instruction *code = compst->p->code; | ||
849 | for (i = from; i < to; i += sizei(&code[i])) { /* step over whole instructions */ | ||
850 | if (code[i].i.code == IOpenCall) { | ||
851 | int n = code[i].i.key; /* rule number */ | ||
852 | int rule = positions[n]; /* rule position */ | ||
853 | assert(rule == from || code[rule - 1].i.code == IRet); /* rule starts at 'from' or after a 'ret' */ | ||
854 | if (code[finaltarget(code, i + 2)].i.code == IRet) /* call; ret ? */ | ||
855 | code[i].i.code = IJmp; /* tail call */ | ||
856 | else | ||
857 | code[i].i.code = ICall; | ||
858 | jumptothere(compst, i, rule); /* call jumps to respective rule */ | ||
859 | } | ||
860 | } | ||
861 | assert(i == to); /* loop must stop exactly at 'to' */ | ||
862 | } | ||
862 | |||
863 | |||
864 | /* | ||
865 | ** Code for a grammar: | ||
866 | ** call L1; jmp L2; L1: rule 1; ret; rule 2; ret; ...; L2: | ||
867 | */ | ||
868 | static void codegrammar (CompileState *compst, TTree *grammar) { | ||
869 | int positions[MAXRULES]; /* code position of each rule, indexed by rule number */ | ||
870 | int rulenumber = 0; | ||
871 | TTree *rule; | ||
872 | int firstcall = addoffsetinst(compst, ICall); /* call initial rule */ | ||
873 | int jumptoend = addoffsetinst(compst, IJmp); /* jump to the end */ | ||
874 | int start = gethere(compst); /* here starts the initial rule */ | ||
875 | jumptohere(compst, firstcall); /* L1: first call targets the initial rule */ | ||
876 | for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) { /* rules are chained by 'sib2' */ | ||
877 | positions[rulenumber++] = gethere(compst); /* save rule position */ | ||
878 | codegen(compst, sib1(rule), 0, NOINST, fullset); /* code rule */ | ||
879 | addinstruction(compst, IRet, 0); | ||
880 | } | ||
881 | assert(rule->tag == TTrue); /* rule list must end with a TTrue node */ | ||
882 | jumptohere(compst, jumptoend); /* L2: */ | ||
883 | correctcalls(compst, positions, start, gethere(compst)); /* fix open calls inside the rules */ | ||
884 | } | ||
885 | |||
886 | |||
887 | static void codecall (CompileState *compst, TTree *call) { /* code a call to a rule */ | ||
888 | int c = addoffsetinst(compst, IOpenCall); /* to be corrected later */ | ||
889 | getinstr(compst, c).i.key = sib2(call)->cap; /* rule number */ | ||
890 | assert(sib2(call)->tag == TRule); /* 'sib2' of a call must be the called rule */ | ||
891 | } | ||
892 | |||
893 | |||
894 | /* | ||
895 | ** Code first child ('p1') of a sequence, using 'tt' as its test | ||
896 | ** (second child, 'p2', is called in-place to allow tail call) | ||
897 | ** Return 'tt' for second child (NOINST if 'tt' no longer applies) | ||
898 | */ | ||
899 | static int codeseq1 (CompileState *compst, TTree *p1, TTree *p2, | ||
900 | int tt, const Charset *fl) { | ||
901 | if (needfollow(p1)) { | ||
902 | Charset fl1; /* follow set to be used for 'p1' */ | ||
903 | getfirst(p2, fl, &fl1); /* p1 follow is p2 first */ | ||
904 | codegen(compst, p1, 0, tt, &fl1); | ||
905 | } | ||
906 | else /* use 'fullset' as follow */ | ||
907 | codegen(compst, p1, 0, tt, fullset); | ||
908 | if (fixedlen(p1) != 0) /* can 'p1' consume anything? */ | ||
909 | return NOINST; /* invalidate test */ | ||
910 | else return tt; /* else 'tt' still protects sib2 */ | ||
911 | } | ||
912 | |||
913 | |||
914 | /* | ||
915 | ** Main code-generation function: dispatch to auxiliary functions | ||
916 | ** according to kind of tree. ('needfollow' should return true | ||
917 | ** only for constructions that use 'fl'.) | ||
918 | */ | ||
919 | static void codegen (CompileState *compst, TTree *tree, int opt, int tt, | ||
920 | const Charset *fl) { | ||
921 | tailcall: /* re-entry point used by TSeq instead of a recursive call */ | ||
922 | switch (tree->tag) { | ||
923 | case TChar: codechar(compst, tree->u.n, tt); break; | ||
924 | case TAny: addinstruction(compst, IAny, 0); break; | ||
925 | case TSet: codecharset(compst, treebuffer(tree), tt); break; | ||
926 | case TTrue: break; /* no instruction is emitted */ | ||
927 | case TFalse: addinstruction(compst, IFail, 0); break; | ||
928 | case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break; | ||
929 | case TRep: coderep(compst, sib1(tree), opt, fl); break; | ||
930 | case TBehind: codebehind(compst, tree); break; | ||
931 | case TNot: codenot(compst, sib1(tree)); break; | ||
932 | case TAnd: codeand(compst, sib1(tree), tt); break; | ||
933 | case TCapture: codecapture(compst, tree, tt, fl); break; | ||
934 | case TRunTime: coderuntime(compst, tree, tt); break; | ||
935 | case TGrammar: codegrammar(compst, tree); break; | ||
936 | case TCall: codecall(compst, tree); break; | ||
937 | case TSeq: { | ||
938 | tt = codeseq1(compst, sib1(tree), sib2(tree), tt, fl); /* code 'p1' */ | ||
939 | /* codegen(compst, p2, opt, tt, fl); */ | ||
940 | tree = sib2(tree); goto tailcall; | ||
941 | } | ||
942 | default: assert(0); /* no other tag is handled here */ | ||
943 | } | ||
944 | } | ||
945 | |||
946 | |||
947 | /* | ||
948 | ** Optimize jumps and other jump-like instructions. | ||
949 | ** * Update labels of instructions with labels to their final | ||
950 | ** destinations (e.g., choice L1; ... L1: jmp L2: becomes | ||
951 | ** choice L2) | ||
952 | ** * Jumps to other instructions that do jumps become those | ||
953 | ** instructions (e.g., jump to return becomes a return; jump | ||
954 | ** to commit becomes a commit) | ||
955 | */ | ||
956 | static void peephole (CompileState *compst) { | ||
957 | Instruction *code = compst->p->code; | ||
958 | int i; | ||
959 | for (i = 0; i < compst->ncode; i += sizei(&code[i])) { /* visit each instruction */ | ||
960 | redo: /* re-examine code[i] after it has been replaced */ | ||
961 | switch (code[i].i.code) { | ||
962 | case IChoice: case ICall: case ICommit: case IPartialCommit: | ||
963 | case IBackCommit: case ITestChar: case ITestSet: | ||
964 | case ITestAny: { /* instructions with labels */ | ||
965 | jumptothere(compst, i, finallabel(code, i)); /* optimize label */ | ||
966 | break; | ||
967 | } | ||
968 | case IJmp: { | ||
969 | int ft = finaltarget(code, i); /* where this jump finally lands */ | ||
970 | switch (code[ft].i.code) { /* jumping to what? */ | ||
971 | case IRet: case IFail: case IFailTwice: | ||
972 | case IEnd: { /* instructions with unconditional implicit jumps */ | ||
973 | code[i] = code[ft]; /* jump becomes that instruction */ | ||
974 | code[i + 1].i.code = IAny; /* 'no-op' for target position */ | ||
975 | break; | ||
976 | } | ||
977 | case ICommit: case IPartialCommit: | ||
978 | case IBackCommit: { /* inst. with unconditional explicit jumps */ | ||
979 | int fft = finallabel(code, ft); /* final label of the target instruction */ | ||
980 | code[i] = code[ft]; /* jump becomes that instruction... */ | ||
981 | jumptothere(compst, i, fft); /* but must correct its offset */ | ||
982 | goto redo; /* reoptimize its label */ | ||
983 | } | ||
984 | default: { | ||
985 | jumptothere(compst, i, ft); /* optimize label */ | ||
986 | break; | ||
987 | } | ||
988 | } | ||
989 | break; | ||
990 | } | ||
991 | default: break; /* other instructions are left untouched */ | ||
992 | } | ||
993 | } | ||
994 | assert(code[i - 1].i.code == IEnd); /* last instruction visited must be IEnd */ | ||
995 | } | ||
996 | |||
997 | |||
998 | /* | ||
999 | ** Compile a pattern 'p' (code its tree, add IEnd, run peephole); returns p->code | ||
1000 | */ | ||
1001 | Instruction *compile (lua_State *L, Pattern *p) { | ||
1002 | CompileState compst; | ||
1003 | compst.p = p; compst.ncode = 0; compst.L = L; /* start with no instructions */ | ||
1004 | realloccode(L, p, 2); /* minimum initial size */ | ||
1005 | codegen(&compst, p->tree, 0, NOINST, fullset); /* code for the whole tree */ | ||
1006 | addinstruction(&compst, IEnd, 0); /* final instruction ('peephole' asserts it) */ | ||
1007 | realloccode(L, p, compst.ncode); /* set final size */ | ||
1008 | peephole(&compst); /* optimize jumps */ | ||
1009 | return p->code; | ||
1010 | } | ||
1011 | |||
1012 | |||
1013 | /* }====================================================== */ | ||
1014 | |||
diff --git a/lpcode.h b/lpcode.h new file mode 100644 index 0000000..2a5861e --- /dev/null +++ b/lpcode.h | |||
@@ -0,0 +1,40 @@ | |||
1 | /* | ||
2 | ** $Id: lpcode.h,v 1.8 2016/09/15 17:46:13 roberto Exp $ | ||
3 | */ | ||
4 | |||
5 | #if !defined(lpcode_h) | ||
6 | #define lpcode_h | ||
7 | |||
8 | #include "lua.h" | ||
9 | |||
10 | #include "lptypes.h" | ||
11 | #include "lptree.h" | ||
12 | #include "lpvm.h" | ||
13 | |||
14 | int tocharset (TTree *tree, Charset *cs); | ||
15 | int checkaux (TTree *tree, int pred); /* 'pred' is PEnullable or PEnofail (see macros below) */ | ||
16 | int fixedlen (TTree *tree); | ||
17 | int hascaptures (TTree *tree); | ||
18 | int lp_gc (lua_State *L); | ||
19 | Instruction *compile (lua_State *L, Pattern *p); /* compile pattern 'p'; returns its code */ | ||
20 | void realloccode (lua_State *L, Pattern *p, int nsize); /* set size of the code of 'p' to 'nsize' */ | ||
21 | int sizei (const Instruction *i); /* size of instruction 'i' (step to the next one) */ | ||
22 | |||
23 | |||
24 | #define PEnullable 0 | ||
25 | #define PEnofail 1 | ||
26 | |||
27 | /* | ||
28 | ** nofail(t) implies that 't' cannot fail with any input | ||
29 | */ | ||
30 | #define nofail(t) checkaux(t, PEnofail) | ||
31 | |||
32 | /* | ||
33 | ** (not nullable(t)) implies 't' cannot match without consuming | ||
34 | ** something | ||
35 | */ | ||
36 | #define nullable(t) checkaux(t, PEnullable) | ||
37 | |||
38 | |||
39 | |||
40 | #endif | ||
diff --git a/lpeg.html b/lpeg.html new file mode 100644 index 0000000..5c9535f --- /dev/null +++ b/lpeg.html | |||
@@ -0,0 +1,1445 @@ | |||
1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | ||
2 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | ||
3 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | ||
4 | <head> | ||
5 | <title>LPeg - Parsing Expression Grammars For Lua</title> | ||
6 | <link rel="stylesheet" | ||
7 | href="http://www.inf.puc-rio.br/~roberto/lpeg/doc.css" | ||
8 | type="text/css"/> | ||
9 | <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/> | ||
10 | </head> | ||
11 | <body> | ||
12 | |||
13 | <!-- $Id: lpeg.html,v 1.77 2017/01/13 13:40:05 roberto Exp $ --> | ||
14 | |||
15 | <div id="container"> | ||
16 | |||
17 | <div id="product"> | ||
18 | <div id="product_logo"> | ||
19 | <a href="http://www.inf.puc-rio.br/~roberto/lpeg/"> | ||
20 | <img alt="LPeg logo" src="lpeg-128.gif"/></a> | ||
21 | |||
22 | </div> | ||
23 | <div id="product_name"><big><strong>LPeg</strong></big></div> | ||
24 | <div id="product_description"> | ||
25 | Parsing Expression Grammars For Lua, version 1.0 | ||
26 | </div> | ||
27 | </div> <!-- id="product" --> | ||
28 | |||
29 | <div id="main"> | ||
30 | |||
31 | <div id="navigation"> | ||
32 | <h1>LPeg</h1> | ||
33 | |||
34 | <ul> | ||
35 | <li><strong>Home</strong> | ||
36 | <ul> | ||
37 | <li><a href="#intro">Introduction</a></li> | ||
38 | <li><a href="#func">Functions</a></li> | ||
39 | <li><a href="#basic">Basic Constructions</a></li> | ||
40 | <li><a href="#grammar">Grammars</a></li> | ||
41 | <li><a href="#captures">Captures</a></li> | ||
42 | <li><a href="#ex">Some Examples</a></li> | ||
43 | <li><a href="re.html">The <code>re</code> Module</a></li> | ||
44 | <li><a href="#download">Download</a></li> | ||
45 | <li><a href="#license">License</a></li> | ||
46 | </ul> | ||
47 | </li> | ||
48 | </ul> | ||
49 | </div> <!-- id="navigation" --> | ||
50 | |||
51 | <div id="content"> | ||
52 | |||
53 | |||
54 | <h2><a name="intro">Introduction</a></h2> | ||
55 | |||
56 | <p> | ||
57 | <em>LPeg</em> is a new pattern-matching library for Lua, | ||
58 | based on | ||
59 | <a href="http://pdos.csail.mit.edu/%7Ebaford/packrat/"> | ||
60 | Parsing Expression Grammars</a> (PEGs). | ||
61 | This text is a reference manual for the library. | ||
62 | For a more formal treatment of LPeg, | ||
63 | as well as some discussion about its implementation, | ||
64 | see | ||
65 | <a href="http://www.inf.puc-rio.br/~roberto/docs/peg.pdf"> | ||
66 | A Text Pattern-Matching Tool based on Parsing Expression Grammars</a>. | ||
67 | (You may also be interested in my | ||
68 | <a href="http://vimeo.com/1485123">talk about LPeg</a> | ||
69 | given at the III Lua Workshop.) | ||
70 | </p> | ||
71 | |||
72 | <p> | ||
73 | Following the Snobol tradition, | ||
74 | LPeg defines patterns as first-class objects. | ||
75 | That is, patterns are regular Lua values | ||
76 | (represented by userdata). | ||
77 | The library offers several functions to create | ||
78 | and compose patterns. | ||
79 | With the use of metamethods, | ||
80 | several of these functions are provided as infix or prefix | ||
81 | operators. | ||
82 | On the one hand, | ||
83 | the result is usually much more verbose than the typical | ||
84 | encoding of patterns using the so-called | ||
85 | <em>regular expressions</em> | ||
86 | (which typically are not regular expressions in the formal sense). | ||
87 | On the other hand, | ||
88 | first-class patterns allow much better documentation | ||
89 | (as it is easy to comment the code, | ||
90 | to break complex definitions into smaller parts, etc.) | ||
91 | and are extensible, | ||
92 | as we can define new functions to create and compose patterns. | ||
93 | </p> | ||
94 | |||
95 | <p> | ||
96 | For a quick glance at the library, | ||
97 | the following table summarizes its basic operations | ||
98 | for creating patterns: | ||
99 | </p> | ||
100 | <table border="1"> | ||
101 | <tbody><tr><td><b>Operator</b></td><td><b>Description</b></td></tr> | ||
102 | <tr><td><a href="#op-p"><code>lpeg.P(string)</code></a></td> | ||
103 | <td>Matches <code>string</code> literally</td></tr> | ||
104 | <tr><td><a href="#op-p"><code>lpeg.P(n)</code></a></td> | ||
105 | <td>Matches exactly <code>n</code> characters</td></tr> | ||
106 | <tr><td><a href="#op-s"><code>lpeg.S(string)</code></a></td> | ||
107 | <td>Matches any character in <code>string</code> (Set)</td></tr> | ||
108 | <tr><td><a href="#op-r"><code>lpeg.R("<em>xy</em>")</code></a></td> | ||
109 | <td>Matches any character between <em>x</em> and <em>y</em> (Range)</td></tr> | ||
110 | <tr><td><a href="#op-pow"><code>patt^n</code></a></td> | ||
111 | <td>Matches at least <code>n</code> repetitions of <code>patt</code></td></tr> | ||
112 | <tr><td><a href="#op-pow"><code>patt^-n</code></a></td> | ||
113 | <td>Matches at most <code>n</code> repetitions of <code>patt</code></td></tr> | ||
114 | <tr><td><a href="#op-mul"><code>patt1 * patt2</code></a></td> | ||
115 | <td>Matches <code>patt1</code> followed by <code>patt2</code></td></tr> | ||
116 | <tr><td><a href="#op-add"><code>patt1 + patt2</code></a></td> | ||
117 | <td>Matches <code>patt1</code> or <code>patt2</code> | ||
118 | (ordered choice)</td></tr> | ||
119 | <tr><td><a href="#op-sub"><code>patt1 - patt2</code></a></td> | ||
120 | <td>Matches <code>patt1</code> if <code>patt2</code> does not match</td></tr> | ||
121 | <tr><td><a href="#op-unm"><code>-patt</code></a></td> | ||
122 | <td>Equivalent to <code>("" - patt)</code></td></tr> | ||
123 | <tr><td><a href="#op-len"><code>#patt</code></a></td> | ||
124 | <td>Matches <code>patt</code> but consumes no input</td></tr> | ||
125 | <tr><td><a href="#op-behind"><code>lpeg.B(patt)</code></a></td> | ||
126 | <td>Matches <code>patt</code> behind the current position, | ||
127 | consuming no input</td></tr> | ||
128 | </tbody></table> | ||
129 | |||
130 | <p>As a very simple example, | ||
131 | <code>lpeg.R("09")^1</code> creates a pattern that | ||
132 | matches a non-empty sequence of digits. | ||
133 | As a not so simple example, | ||
134 | <code>-lpeg.P(1)</code> | ||
135 | (which can be written as <code>lpeg.P(-1)</code>, | ||
136 | or simply <code>-1</code> for operations expecting a pattern) | ||
137 | matches an empty string only if it cannot match a single character; | ||
138 | so, it succeeds only at the end of the subject. | ||
139 | </p> | ||
140 | |||
141 | <p> | ||
142 | LPeg also offers the <a href="re.html"><code>re</code> module</a>, | ||
143 | which implements patterns following a regular-expression style | ||
144 | (e.g., <code>[0-9]+</code>). | ||
145 | (This module is 260 lines of Lua code, | ||
146 | and of course it uses LPeg to parse regular expressions and | ||
147 | translate them to regular LPeg patterns.) | ||
148 | </p> | ||
149 | |||
150 | |||
151 | <h2><a name="func">Functions</a></h2> | ||
152 | |||
153 | |||
154 | <h3><a name="f-match"></a><code>lpeg.match (pattern, subject [, init])</code></h3> | ||
155 | <p> | ||
156 | The matching function. | ||
157 | It attempts to match the given pattern against the subject string. | ||
158 | If the match succeeds, | ||
159 | returns the index in the subject of the first character after the match, | ||
160 | or the <a href="#captures">captured values</a> | ||
161 | (if the pattern captured any value). | ||
162 | </p> | ||
163 | |||
164 | <p> | ||
165 | An optional numeric argument <code>init</code> makes the match | ||
166 | start at that position in the subject string. | ||
167 | As usual in Lua libraries, | ||
168 | a negative value counts from the end. | ||
169 | </p> | ||
170 | |||
171 | <p> | ||
172 | Unlike typical pattern-matching functions, | ||
173 | <code>match</code> works only in <em>anchored</em> mode; | ||
174 | that is, it tries to match the pattern with a prefix of | ||
175 | the given subject string (at position <code>init</code>), | ||
176 | not with an arbitrary substring of the subject. | ||
177 | So, if we want to find a pattern anywhere in a string, | ||
178 | we must either write a loop in Lua or write a pattern that | ||
179 | matches anywhere. | ||
180 | This second approach is easy and quite efficient; | ||
181 | see <a href="#ex">examples</a>. | ||
182 | </p> | ||
183 | |||
184 | <h3><a name="f-type"></a><code>lpeg.type (value)</code></h3> | ||
185 | <p> | ||
186 | If the given value is a pattern, | ||
187 | returns the string <code>"pattern"</code>. | ||
188 | Otherwise returns nil. | ||
189 | </p> | ||
190 | |||
191 | <h3><a name="f-version"></a><code>lpeg.version ()</code></h3> | ||
192 | <p> | ||
193 | Returns a string with the running version of LPeg. | ||
194 | </p> | ||
195 | |||
196 | <h3><a name="f-setstack"></a><code>lpeg.setmaxstack (max)</code></h3> | ||
197 | <p> | ||
198 | Sets a limit for the size of the backtrack stack used by LPeg to | ||
199 | track calls and choices. | ||
200 | (The default limit is 400.) | ||
201 | Most well-written patterns need few backtrack levels and | ||
202 | therefore you seldom need to change this limit; | ||
203 | before changing it you should try to rewrite your | ||
204 | pattern to avoid the need for extra space. | ||
205 | Nevertheless, a few useful patterns may overflow. | ||
206 | With recursive grammars, | ||
207 | subjects with deep recursion may also need larger limits. | ||
208 | </p> | ||
209 | |||
210 | |||
211 | <h2><a name="basic">Basic Constructions</a></h2> | ||
212 | |||
213 | <p> | ||
214 | The following operations build patterns. | ||
215 | All operations that expect a pattern as an argument | ||
216 | may also receive strings, tables, numbers, booleans, or functions, | ||
217 | which are translated to patterns according to | ||
218 | the rules of function <a href="#op-p"><code>lpeg.P</code></a>. | ||
219 | </p> | ||
220 | |||
221 | |||
222 | |||
223 | <h3><a name="op-p"></a><code>lpeg.P (value)</code></h3> | ||
224 | <p> | ||
225 | Converts the given value into a proper pattern, | ||
226 | according to the following rules: | ||
227 | </p> | ||
228 | <ul> | ||
229 | |||
230 | <li><p> | ||
231 | If the argument is a pattern, | ||
232 | it is returned unmodified. | ||
233 | </p></li> | ||
234 | |||
235 | <li><p> | ||
236 | If the argument is a string, | ||
237 | it is translated to a pattern that matches the string literally. | ||
238 | </p></li> | ||
239 | |||
240 | <li><p> | ||
241 | If the argument is a non-negative number <em>n</em>, | ||
242 | the result is a pattern that matches exactly <em>n</em> characters. | ||
243 | </p></li> | ||
244 | |||
245 | <li><p> | ||
246 | If the argument is a negative number <em>-n</em>, | ||
247 | the result is a pattern that | ||
248 | succeeds only if the input string has fewer than <em>n</em> characters left: | ||
249 | <code>lpeg.P(-n)</code> | ||
250 | is equivalent to <code>-lpeg.P(n)</code> | ||
251 | (see the <a href="#op-unm">unary minus operation</a>). | ||
252 | </p></li> | ||
253 | |||
254 | <li><p> | ||
255 | If the argument is a boolean, | ||
256 | the result is a pattern that always succeeds or always fails | ||
257 | (according to the boolean value), | ||
258 | without consuming any input. | ||
259 | </p></li> | ||
260 | |||
261 | <li><p> | ||
262 | If the argument is a table, | ||
263 | it is interpreted as a grammar | ||
264 | (see <a href="#grammar">Grammars</a>). | ||
265 | </p></li> | ||
266 | |||
267 | <li><p> | ||
268 | If the argument is a function, | ||
269 | returns a pattern equivalent to a | ||
270 | <a href="#matchtime">match-time capture</a> over the empty string. | ||
271 | </p></li> | ||
272 | |||
273 | </ul> | ||
274 | |||
275 | |||
276 | <h3><a name="op-behind"></a><code>lpeg.B(patt)</code></h3> | ||
277 | <p> | ||
278 | Returns a pattern that | ||
279 | matches only if the input string at the current position | ||
280 | is preceded by <code>patt</code>. | ||
281 | Pattern <code>patt</code> must match only strings | ||
282 | with some fixed length, | ||
283 | and it cannot contain captures. | ||
284 | </p> | ||
285 | |||
286 | <p> | ||
287 | Like the <a href="#op-len">and predicate</a>, | ||
288 | this pattern never consumes any input, | ||
289 | independently of success or failure. | ||
290 | </p> | ||
291 | |||
292 | |||
293 | <h3><a name="op-r"></a><code>lpeg.R ({range})</code></h3> | ||
294 | <p> | ||
295 | Returns a pattern that matches any single character | ||
296 | belonging to one of the given <em>ranges</em>. | ||
297 | Each <code>range</code> is a string <em>xy</em> of length 2, | ||
298 | representing all characters with code | ||
299 | between the codes of <em>x</em> and <em>y</em> | ||
300 | (both inclusive). | ||
301 | </p> | ||
302 | |||
303 | <p> | ||
304 | As an example, the pattern | ||
305 | <code>lpeg.R("09")</code> matches any digit, | ||
306 | and <code>lpeg.R("az", "AZ")</code> matches any ASCII letter. | ||
307 | </p> | ||
308 | |||
309 | |||
310 | <h3><a name="op-s"></a><code>lpeg.S (string)</code></h3> | ||
311 | <p> | ||
312 | Returns a pattern that matches any single character that | ||
313 | appears in the given string. | ||
314 | (The <code>S</code> stands for <em>Set</em>.) | ||
315 | </p> | ||
316 | |||
317 | <p> | ||
318 | As an example, the pattern | ||
319 | <code>lpeg.S("+-*/")</code> matches any arithmetic operator. | ||
320 | </p> | ||
321 | |||
322 | <p> | ||
323 | Note that, if <code>s</code> is a character | ||
324 | (that is, a string of length 1), | ||
325 | then <code>lpeg.P(s)</code> is equivalent to <code>lpeg.S(s)</code> | ||
326 | which is equivalent to <code>lpeg.R(s..s)</code>. | ||
327 | Note also that both <code>lpeg.S("")</code> and <code>lpeg.R()</code> | ||
328 | are patterns that always fail. | ||
329 | </p> | ||
330 | |||
331 | |||
332 | <h3><a name="op-v"></a><code>lpeg.V (v)</code></h3> | ||
333 | <p> | ||
334 | This operation creates a non-terminal (a <em>variable</em>) | ||
335 | for a grammar. | ||
336 | The created non-terminal refers to the rule indexed by <code>v</code> | ||
337 | in the enclosing grammar. | ||
338 | (See <a href="#grammar">Grammars</a> for details.) | ||
339 | </p> | ||
340 | |||
341 | |||
342 | <h3><a name="op-locale"></a><code>lpeg.locale ([table])</code></h3> | ||
343 | <p> | ||
344 | Returns a table with patterns for matching some character classes | ||
345 | according to the current locale. | ||
346 | The table has fields named | ||
347 | <code>alnum</code>, | ||
348 | <code>alpha</code>, | ||
349 | <code>cntrl</code>, | ||
350 | <code>digit</code>, | ||
351 | <code>graph</code>, | ||
352 | <code>lower</code>, | ||
353 | <code>print</code>, | ||
354 | <code>punct</code>, | ||
355 | <code>space</code>, | ||
356 | <code>upper</code>, and | ||
357 | <code>xdigit</code>, | ||
358 | each one containing a corresponding pattern. | ||
359 | Each pattern matches any single character that belongs to its class. | ||
360 | </p> | ||
361 | |||
362 | <p> | ||
363 | If called with an argument <code>table</code>, | ||
364 | then it creates those fields inside the given table and | ||
365 | returns that table. | ||
366 | </p> | ||
367 | |||
368 | |||
369 | <h3><a name="op-len"></a><code>#patt</code></h3> | ||
370 | <p> | ||
371 | Returns a pattern that | ||
372 | matches only if the input string matches <code>patt</code>, | ||
373 | but without consuming any input, | ||
374 | independently of success or failure. | ||
375 | (This pattern is called an <em>and predicate</em> | ||
376 | and it is equivalent to | ||
377 | <em>&amp;patt</em> in the original PEG notation.) | ||
378 | </p> | ||
379 | |||
380 | |||
381 | <p> | ||
382 | This pattern never produces any capture. | ||
383 | </p> | ||
384 | |||
385 | |||
386 | <h3><a name="op-unm"></a><code>-patt</code></h3> | ||
387 | <p> | ||
388 | Returns a pattern that | ||
389 | matches only if the input string does not match <code>patt</code>. | ||
390 | It does not consume any input, | ||
391 | independently of success or failure. | ||
392 | (This pattern is equivalent to | ||
393 | <em>!patt</em> in the original PEG notation.) | ||
394 | </p> | ||
395 | |||
396 | <p> | ||
397 | As an example, the pattern | ||
398 | <code>-lpeg.P(1)</code> matches only the end of string. | ||
399 | </p> | ||
400 | |||
401 | <p> | ||
402 | This pattern never produces any captures, | ||
403 | because either <code>patt</code> fails | ||
404 | or <code>-patt</code> fails. | ||
405 | (A failing pattern never produces captures.) | ||
406 | </p> | ||
407 | |||
408 | |||
409 | <h3><a name="op-add"></a><code>patt1 + patt2</code></h3> | ||
410 | <p> | ||
411 | Returns a pattern equivalent to an <em>ordered choice</em> | ||
412 | of <code>patt1</code> and <code>patt2</code>. | ||
413 | (This is denoted by <em>patt1 / patt2</em> in the original PEG notation, | ||
414 | not to be confused with the <code>/</code> operation in LPeg.) | ||
415 | It matches either <code>patt1</code> or <code>patt2</code>, | ||
416 | with no backtracking once one of them succeeds. | ||
417 | The identity element for this operation is the pattern | ||
418 | <code>lpeg.P(false)</code>, | ||
419 | which always fails. | ||
420 | </p> | ||
421 | |||
422 | <p> | ||
423 | If both <code>patt1</code> and <code>patt2</code> are | ||
424 | character sets, | ||
425 | this operation is equivalent to set union. | ||
426 | </p> | ||
427 | <pre class="example"> | ||
428 | lower = lpeg.R("az") | ||
429 | upper = lpeg.R("AZ") | ||
430 | letter = lower + upper | ||
431 | </pre> | ||
432 | |||
433 | |||
434 | <h3><a name="op-sub"></a><code>patt1 - patt2</code></h3> | ||
435 | <p> | ||
436 | Returns a pattern equivalent to <em>!patt2 patt1</em>. | ||
437 | This pattern asserts that the input does not match | ||
438 | <code>patt2</code> and then matches <code>patt1</code>. | ||
439 | </p> | ||
440 | |||
441 | <p> | ||
442 | When successful, | ||
443 | this pattern produces all captures from <code>patt1</code>. | ||
444 | It never produces any capture from <code>patt2</code> | ||
445 | (as either <code>patt2</code> fails or | ||
446 | <code>patt1 - patt2</code> fails). | ||
447 | </p> | ||
448 | |||
449 | <p> | ||
450 | If both <code>patt1</code> and <code>patt2</code> are | ||
451 | character sets, | ||
452 | this operation is equivalent to set difference. | ||
453 | Note that <code>-patt</code> is equivalent to <code>"" - patt</code> | ||
454 | (or <code>0 - patt</code>). | ||
455 | If <code>patt</code> is a character set, | ||
456 | <code>1 - patt</code> is its complement. | ||
457 | </p> | ||
458 | |||
459 | |||
460 | <h3><a name="op-mul"></a><code>patt1 * patt2</code></h3> | ||
461 | <p> | ||
462 | Returns a pattern that matches <code>patt1</code> | ||
463 | and then matches <code>patt2</code>, | ||
464 | starting where <code>patt1</code> finished. | ||
465 | The identity element for this operation is the | ||
466 | pattern <code>lpeg.P(true)</code>, | ||
467 | which always succeeds. | ||
468 | </p> | ||
469 | |||
470 | <p> | ||
471 | (LPeg uses the <code>*</code> operator | ||
472 | [instead of the more obvious <code>..</code>] | ||
473 | both because it has | ||
474 | the right priority and because in formal languages it is | ||
475 | common to use a dot for denoting concatenation.) | ||
476 | </p> | ||
477 | |||
478 | |||
479 | <h3><a name="op-pow"></a><code>patt^n</code></h3> | ||
480 | <p> | ||
481 | If <code>n</code> is nonnegative, | ||
482 | this pattern is | ||
483 | equivalent to <em>patt<sup>n</sup> patt*</em>: | ||
484 | It matches <code>n</code> or more occurrences of <code>patt</code>. | ||
485 | </p> | ||
486 | |||
487 | <p> | ||
488 | Otherwise, when <code>n</code> is negative, | ||
489 | this pattern is equivalent to <em>(patt?)<sup>-n</sup></em>: | ||
490 | It matches at most <code>|n|</code> | ||
491 | occurrences of <code>patt</code>. | ||
492 | </p> | ||
493 | |||
494 | <p> | ||
495 | In particular, <code>patt^0</code> is equivalent to <em>patt*</em>, | ||
496 | <code>patt^1</code> is equivalent to <em>patt+</em>, | ||
497 | and <code>patt^-1</code> is equivalent to <em>patt?</em> | ||
498 | in the original PEG notation. | ||
499 | </p> | ||
500 | |||
501 | <p> | ||
502 | In all cases, | ||
503 | the resulting pattern is greedy with no backtracking | ||
504 | (also called a <em>possessive</em> repetition). | ||
505 | That is, it matches only the longest possible sequence | ||
506 | of matches for <code>patt</code>. | ||
507 | </p> | ||
508 | |||
509 | |||
510 | |||
511 | <h2><a name="grammar">Grammars</a></h2> | ||
512 | |||
513 | <p> | ||
514 | With the use of Lua variables, | ||
515 | it is possible to define patterns incrementally, | ||
516 | with each new pattern using previously defined ones. | ||
517 | However, this technique does not allow the definition of | ||
518 | recursive patterns. | ||
519 | For recursive patterns, | ||
520 | we need real grammars. | ||
521 | </p> | ||
522 | |||
523 | <p> | ||
524 | LPeg represents grammars with tables, | ||
525 | where each entry is a rule. | ||
526 | </p> | ||
527 | |||
528 | <p> | ||
529 | The call <code>lpeg.V(v)</code> | ||
530 | creates a pattern that represents the nonterminal | ||
531 | (or <em>variable</em>) with index <code>v</code> in a grammar. | ||
532 | Because the grammar still does not exist when | ||
533 | this function is evaluated, | ||
534 | the result is an <em>open reference</em> to the respective rule. | ||
535 | </p> | ||
536 | |||
537 | <p> | ||
538 | A table is <em>fixed</em> when it is converted to a pattern | ||
539 | (either by calling <code>lpeg.P</code> or by using it where a | ||
540 | pattern is expected). | ||
541 | Then every open reference created by <code>lpeg.V(v)</code> | ||
542 | is corrected to refer to the rule indexed by <code>v</code> in the table. | ||
543 | </p> | ||
544 | |||
545 | <p> | ||
546 | When a table is fixed, | ||
547 | the result is a pattern that matches its <em>initial rule</em>. | ||
548 | The entry with index 1 in the table defines its initial rule. | ||
549 | If that entry is a string, | ||
550 | it is assumed to be the name of the initial rule. | ||
551 | Otherwise, LPeg assumes that the entry 1 itself is the initial rule. | ||
552 | </p> | ||
553 | |||
554 | <p> | ||
555 | As an example, | ||
556 | the following grammar matches strings of a's and b's that | ||
557 | have the same number of a's and b's: | ||
558 | </p> | ||
559 | <pre class="example"> | ||
560 | equalcount = lpeg.P{ | ||
561 | "S"; -- initial rule name | ||
562 | S = "a" * lpeg.V"B" + "b" * lpeg.V"A" + "", | ||
563 | A = "a" * lpeg.V"S" + "b" * lpeg.V"A" * lpeg.V"A", | ||
564 | B = "b" * lpeg.V"S" + "a" * lpeg.V"B" * lpeg.V"B", | ||
565 | } * -1 | ||
566 | </pre> | ||
567 | <p> | ||
568 | It is equivalent to the following grammar in standard PEG notation: | ||
569 | </p> | ||
570 | <pre class="example"> | ||
571 | S &lt;- 'a' B / 'b' A / '' | ||
572 | A &lt;- 'a' S / 'b' A A | ||
573 | B &lt;- 'b' S / 'a' B B | ||
574 | </pre> | ||
575 | |||
576 | |||
577 | <h2><a name="captures">Captures</a></h2> | ||
578 | |||
579 | <p> | ||
580 | A <em>capture</em> is a pattern that produces values | ||
581 | (the so-called <em>semantic information</em>) | ||
582 | according to what it matches. | ||
583 | LPeg offers several kinds of captures, | ||
584 | which produce values based on matches and combine these values to | ||
585 | produce new values. | ||
586 | Each capture may produce zero or more values. | ||
587 | </p> | ||
588 | |||
589 | <p> | ||
590 | The following table summarizes the basic captures: | ||
591 | </p> | ||
592 | <table border="1"> | ||
593 | <tbody><tr><td><b>Operation</b></td><td><b>What it Produces</b></td></tr> | ||
594 | <tr><td><a href="#cap-c"><code>lpeg.C(patt)</code></a></td> | ||
595 | <td>the match for <code>patt</code> plus all captures | ||
596 | made by <code>patt</code></td></tr> | ||
597 | <tr><td><a href="#cap-arg"><code>lpeg.Carg(n)</code></a></td> | ||
598 | <td>the value of the n<sup>th</sup> extra argument to | ||
599 | <code>lpeg.match</code> (matches the empty string)</td></tr> | ||
600 | <tr><td><a href="#cap-b"><code>lpeg.Cb(name)</code></a></td> | ||
601 | <td>the values produced by the previous | ||
602 | group capture named <code>name</code> | ||
603 | (matches the empty string)</td></tr> | ||
604 | <tr><td><a href="#cap-cc"><code>lpeg.Cc(values)</code></a></td> | ||
605 | <td>the given values (matches the empty string)</td></tr> | ||
606 | <tr><td><a href="#cap-f"><code>lpeg.Cf(patt, func)</code></a></td> | ||
607 | <td>a <em>folding</em> of the captures from <code>patt</code></td></tr> | ||
608 | <tr><td><a href="#cap-g"><code>lpeg.Cg(patt [, name])</code></a></td> | ||
609 | <td>the values produced by <code>patt</code>, | ||
610 | optionally tagged with <code>name</code></td></tr> | ||
611 | <tr><td><a href="#cap-p"><code>lpeg.Cp()</code></a></td> | ||
612 | <td>the current position (matches the empty string)</td></tr> | ||
613 | <tr><td><a href="#cap-s"><code>lpeg.Cs(patt)</code></a></td> | ||
614 | <td>the match for <code>patt</code> | ||
615 | with the values from nested captures replacing their matches</td></tr> | ||
616 | <tr><td><a href="#cap-t"><code>lpeg.Ct(patt)</code></a></td> | ||
617 | <td>a table with all captures from <code>patt</code></td></tr> | ||
618 | <tr><td><a href="#cap-string"><code>patt / string</code></a></td> | ||
619 | <td><code>string</code>, with some marks replaced by captures | ||
620 | of <code>patt</code></td></tr> | ||
621 | <tr><td><a href="#cap-num"><code>patt / number</code></a></td> | ||
622 | <td>the n-th value captured by <code>patt</code>, | ||
623 | or no value when <code>number</code> is zero.</td></tr> | ||
624 | <tr><td><a href="#cap-query"><code>patt / table</code></a></td> | ||
625 | <td><code>table[c]</code>, where <code>c</code> is the (first) | ||
626 | capture of <code>patt</code></td></tr> | ||
627 | <tr><td><a href="#cap-func"><code>patt / function</code></a></td> | ||
628 | <td>the returns of <code>function</code> applied to the captures | ||
629 | of <code>patt</code></td></tr> | ||
630 | <tr><td><a href="#matchtime"><code>lpeg.Cmt(patt, function)</code></a></td> | ||
631 | <td>the returns of <code>function</code> applied to the captures | ||
632 | of <code>patt</code>; the application is done at match time</td></tr> | ||
633 | </tbody></table> | ||
634 | |||
635 | <p> | ||
636 | A capture pattern produces its values only when it succeeds. | ||
637 | For instance, | ||
638 | the pattern <code>lpeg.C(lpeg.P"a"^-1)</code> | ||
639 | produces the empty string when there is no <code>"a"</code> | ||
640 | (because the pattern <code>"a"?</code> succeeds), | ||
641 | while the pattern <code>lpeg.C("a")^-1</code> | ||
642 | does not produce any value when there is no <code>"a"</code> | ||
643 | (because the pattern <code>"a"</code> fails). | ||
644 | A pattern inside a loop or inside a recursive structure | ||
645 | produces values for each match. | ||
646 | </p> | ||
647 | |||
648 | <p> | ||
649 | Usually, | ||
650 | LPeg does not specify when (and if) it evaluates its captures. | ||
651 | (As an example, | ||
652 | consider the pattern <code>lpeg.P"a" / func / 0</code>. | ||
653 | Because the "division" by 0 instructs LPeg to throw away the | ||
654 | results from the pattern, | ||
655 | LPeg may or may not call <code>func</code>.) | ||
656 | Therefore, captures should avoid side effects. | ||
657 | Moreover, | ||
658 | most captures cannot affect the way a pattern matches a subject. | ||
659 | The only exception to this rule is the | ||
660 | so-called <a href="#matchtime"><em>match-time capture</em></a>. | ||
661 | When a match-time capture matches, | ||
662 | it forces the immediate evaluation of all its nested captures | ||
663 | and then calls its corresponding function, | ||
664 | which defines whether the match succeeds and also | ||
665 | what values are produced. | ||
666 | </p> | ||
667 | |||
668 | <h3><a name="cap-c"></a><code>lpeg.C (patt)</code></h3> | ||
669 | <p> | ||
670 | Creates a <em>simple capture</em>, | ||
671 | which captures the substring of the subject that matches <code>patt</code>. | ||
672 | The captured value is a string. | ||
673 | If <code>patt</code> has other captures, | ||
674 | their values are returned after this one. | ||
675 | </p> | ||
676 | |||
677 | |||
678 | <h3><a name="cap-arg"></a><code>lpeg.Carg (n)</code></h3> | ||
679 | <p> | ||
680 | Creates an <em>argument capture</em>. | ||
681 | This pattern matches the empty string and | ||
682 | produces the value passed as the n<sup>th</sup> extra | ||
683 | argument in the call to <code>lpeg.match</code>. | ||
684 | </p> | ||
685 | |||
686 | |||
687 | <h3><a name="cap-b"></a><code>lpeg.Cb (name)</code></h3> | ||
688 | <p> | ||
689 | Creates a <em>back capture</em>. | ||
690 | This pattern matches the empty string and | ||
691 | produces the values produced by the <em>most recent</em> | ||
692 | <a href="#cap-g">group capture</a> named <code>name</code> | ||
693 | (where <code>name</code> can be any Lua value). | ||
694 | </p> | ||
695 | |||
696 | <p> | ||
697 | <em>Most recent</em> means the last | ||
698 | <em>complete</em> | ||
699 | <em>outermost</em> | ||
700 | group capture with the given name. | ||
701 | A <em>Complete</em> capture means that the entire pattern | ||
702 | corresponding to the capture has matched. | ||
703 | An <em>Outermost</em> capture means that the capture is not inside | ||
704 | another complete capture. | ||
705 | </p> | ||
706 | |||
707 | <p> | ||
708 | In the same way that LPeg does not specify when it evaluates captures, | ||
709 | it does not specify whether it reuses | ||
710 | values previously produced by the group | ||
711 | or re-evaluates them. | ||
712 | </p> | ||
713 | |||
714 | <h3><a name="cap-cc"></a><code>lpeg.Cc ([value, ...])</code></h3> | ||
715 | <p> | ||
716 | Creates a <em>constant capture</em>. | ||
717 | This pattern matches the empty string and | ||
718 | produces all given values as its captured values. | ||
719 | </p> | ||
720 | |||
721 | |||
722 | <h3><a name="cap-f"></a><code>lpeg.Cf (patt, func)</code></h3> | ||
723 | <p> | ||
724 | Creates a <em>fold capture</em>. | ||
725 | If <code>patt</code> produces a list of captures | ||
726 | <em>C<sub>1</sub> C<sub>2</sub> ... C<sub>n</sub></em>, | ||
727 | this capture will produce the value | ||
728 | <em>func(...func(func(C<sub>1</sub>, C<sub>2</sub>), C<sub>3</sub>)..., | ||
729 | C<sub>n</sub>)</em>, | ||
730 | that is, it will <em>fold</em> | ||
731 | (or <em>accumulate</em>, or <em>reduce</em>) | ||
732 | the captures from <code>patt</code> using function <code>func</code>. | ||
733 | </p> | ||
734 | |||
735 | <p> | ||
736 | This capture assumes that <code>patt</code> should produce | ||
737 | at least one capture with at least one value (of any type), | ||
738 | which becomes the initial value of an <em>accumulator</em>. | ||
739 | (If you need a specific initial value, | ||
740 | you may prefix a <a href="#cap-cc">constant capture</a> to <code>patt</code>.) | ||
741 | For each subsequent capture, | ||
742 | LPeg calls <code>func</code> | ||
743 | with this accumulator as the first argument and all values produced | ||
744 | by the capture as extra arguments; | ||
745 | the first result from this call | ||
746 | becomes the new value for the accumulator. | ||
747 | The final value of the accumulator becomes the captured value. | ||
748 | </p> | ||
749 | |||
750 | <p> | ||
751 | As an example, | ||
752 | the following pattern matches a list of numbers separated | ||
753 | by commas and returns their sum: | ||
754 | </p> | ||
755 | <pre class="example"> | ||
756 | -- matches a numeral and captures its numerical value | ||
757 | number = lpeg.R"09"^1 / tonumber | ||
758 | |||
759 | -- matches a list of numbers, capturing their values | ||
760 | list = number * ("," * number)^0 | ||
761 | |||
762 | -- auxiliary function to add two numbers | ||
763 | function add (acc, newvalue) return acc + newvalue end | ||
764 | |||
765 | -- folds the list of numbers adding them | ||
766 | sum = lpeg.Cf(list, add) | ||
767 | |||
768 | -- example of use | ||
769 | print(sum:match("10,30,43")) --> 83 | ||
770 | </pre> | ||
771 | |||
772 | |||
773 | <h3><a name="cap-g"></a><code>lpeg.Cg (patt [, name])</code></h3> | ||
774 | <p> | ||
775 | Creates a <em>group capture</em>. | ||
776 | It groups all values returned by <code>patt</code> | ||
777 | into a single capture. | ||
778 | The group may be anonymous (if no name is given) | ||
779 | or named with the given name | ||
780 | (which can be any non-nil Lua value). | ||
781 | </p> | ||
782 | |||
783 | <p> | ||
784 | An anonymous group serves to join values from several captures into | ||
785 | a single capture. | ||
786 | A named group has a different behavior. | ||
787 | In most situations, a named group returns no values at all. | ||
788 | Its values are only relevant for a following | ||
789 | <a href="#cap-b">back capture</a> or when used | ||
790 | inside a <a href="#cap-t">table capture</a>. | ||
791 | </p> | ||
792 | |||
793 | |||
794 | <h3><a name="cap-p"></a><code>lpeg.Cp ()</code></h3> | ||
795 | <p> | ||
796 | Creates a <em>position capture</em>. | ||
797 | It matches the empty string and | ||
798 | captures the position in the subject where the match occurs. | ||
799 | The captured value is a number. | ||
800 | </p> | ||
801 | |||
802 | |||
803 | <h3><a name="cap-s"></a><code>lpeg.Cs (patt)</code></h3> | ||
804 | <p> | ||
805 | Creates a <em>substitution capture</em>, | ||
806 | which captures the substring of the subject that matches <code>patt</code>, | ||
807 | with <em>substitutions</em>. | ||
808 | For any capture inside <code>patt</code> with a value, | ||
809 | the substring that matched the capture is replaced by the capture value | ||
810 | (which should be a string). | ||
811 | The final captured value is the string resulting from | ||
812 | all replacements. | ||
813 | </p> | ||
814 | |||
815 | |||
816 | <h3><a name="cap-t"></a><code>lpeg.Ct (patt)</code></h3> | ||
817 | <p> | ||
818 | Creates a <em>table capture</em>. | ||
819 | This capture returns a table with all values from all anonymous captures | ||
820 | made by <code>patt</code> inside this table in successive integer keys, | ||
821 | starting at 1. | ||
822 | Moreover, | ||
823 | for each named capture group created by <code>patt</code>, | ||
824 | the first value of the group is put into the table | ||
825 | with the group name as its key. | ||
826 | The captured value is only the table. | ||
827 | </p> | ||
828 | |||
829 | |||
830 | <h3><a name="cap-string"></a><code>patt / string</code></h3> | ||
831 | <p> | ||
832 | Creates a <em>string capture</em>. | ||
833 | It creates a capture string based on <code>string</code>. | ||
834 | The captured value is a copy of <code>string</code>, | ||
835 | except that the character <code>%</code> works as an escape character: | ||
836 | any sequence in <code>string</code> of the form <code>%<em>n</em></code>, | ||
837 | with <em>n</em> between 1 and 9, | ||
838 | stands for the match of the <em>n</em>-th capture in <code>patt</code>. | ||
839 | The sequence <code>%0</code> stands for the whole match. | ||
840 | The sequence <code>%%</code> stands for a single <code>%</code>. | ||
841 | </p> | ||
842 | |||
843 | |||
844 | <h3><a name="cap-num"></a><code>patt / number</code></h3> | ||
845 | <p> | ||
846 | Creates a <em>numbered capture</em>. | ||
847 | For a non-zero number, | ||
848 | the captured value is the n-th value | ||
849 | captured by <code>patt</code>. | ||
850 | When <code>number</code> is zero, | ||
851 | there are no captured values. | ||
852 | </p> | ||
853 | |||
854 | |||
855 | <h3><a name="cap-query"></a><code>patt / table</code></h3> | ||
856 | <p> | ||
857 | Creates a <em>query capture</em>. | ||
858 | It indexes the given table using as key the first value captured by | ||
859 | <code>patt</code>, | ||
860 | or the whole match if <code>patt</code> produced no value. | ||
861 | The value at that index is the final value of the capture. | ||
862 | If the table does not have that key, | ||
863 | there is no captured value. | ||
864 | </p> | ||
865 | |||
866 | |||
867 | <h3><a name="cap-func"></a><code>patt / function</code></h3> | ||
868 | <p> | ||
869 | Creates a <em>function capture</em>. | ||
870 | It calls the given function passing all captures made by | ||
871 | <code>patt</code> as arguments, | ||
872 | or the whole match if <code>patt</code> made no capture. | ||
873 | The values returned by the function | ||
874 | are the final values of the capture. | ||
875 | In particular, | ||
876 | if <code>function</code> returns no value, | ||
877 | there is no captured value. | ||
878 | </p> | ||
879 | |||
880 | |||
881 | <h3><a name="matchtime"></a><code>lpeg.Cmt(patt, function)</code></h3> | ||
882 | <p> | ||
883 | Creates a <em>match-time capture</em>. | ||
884 | Unlike all other captures, | ||
885 | this one is evaluated immediately when a match occurs | ||
886 | (even if it is part of a larger pattern that fails later). | ||
887 | It forces the immediate evaluation of all its nested captures | ||
888 | and then calls <code>function</code>. | ||
889 | </p> | ||
890 | |||
891 | <p> | ||
892 | The given function gets as arguments the entire subject, | ||
893 | the current position (after the match of <code>patt</code>), | ||
894 | plus any capture values produced by <code>patt</code>. | ||
895 | </p> | ||
896 | |||
897 | <p> | ||
898 | The first value returned by <code>function</code> | ||
899 | defines how the match happens. | ||
900 | If the call returns a number, | ||
901 | the match succeeds | ||
902 | and the returned number becomes the new current position. | ||
903 | (Assuming a subject <em>s</em> and current position <em>i</em>, | ||
904 | the returned number must be in the range <em>[i, len(s) + 1]</em>.) | ||
905 | If the call returns <b>true</b>, | ||
906 | the match succeeds without consuming any input. | ||
907 | (So, returning <b>true</b> is equivalent to returning <em>i</em>.) | ||
908 | If the call returns <b>false</b>, <b>nil</b>, or no value, | ||
909 | the match fails. | ||
910 | </p> | ||
911 | |||
912 | <p> | ||
913 | Any extra values returned by the function become the | ||
914 | values produced by the capture. | ||
915 | </p> | ||
916 | |||
917 | |||
918 | |||
919 | |||
920 | <h2><a name="ex">Some Examples</a></h2> | ||
921 | |||
922 | <h3>Using a Pattern</h3> | ||
923 | <p> | ||
924 | This example shows a very simple but complete program | ||
925 | that builds and uses a pattern: | ||
926 | </p> | ||
927 | <pre class="example"> | ||
928 | local lpeg = require "lpeg" | ||
929 | |||
930 | -- matches a word followed by end-of-string | ||
931 | p = lpeg.R"az"^1 * -1 | ||
932 | |||
933 | print(p:match("hello")) --> 6 | ||
934 | print(lpeg.match(p, "hello")) --> 6 | ||
935 | print(p:match("1 hello")) --> nil | ||
936 | </pre> | ||
937 | <p> | ||
938 | The pattern is simply a sequence of one or more lower-case letters | ||
939 | followed by the end of string (-1). | ||
940 | The program calls <code>match</code> both as a method | ||
941 | and as a function. | ||
942 | In both successful cases, | ||
943 | the match returns | ||
944 | the index of the first character after the match, | ||
945 | which is the string length plus one. | ||
946 | </p> | ||
947 | |||
948 | |||
949 | <h3>Name-value lists</h3> | ||
950 | <p> | ||
951 | This example parses a list of name-value pairs and returns a table | ||
952 | with those pairs: | ||
953 | </p> | ||
954 | <pre class="example"> | ||
955 | lpeg.locale(lpeg) -- adds locale entries into 'lpeg' table | ||
956 | |||
957 | local space = lpeg.space^0 | ||
958 | local name = lpeg.C(lpeg.alpha^1) * space | ||
959 | local sep = lpeg.S(",;") * space | ||
960 | local pair = lpeg.Cg(name * "=" * space * name) * sep^-1 | ||
961 | local list = lpeg.Cf(lpeg.Ct("") * pair^0, rawset) | ||
962 | t = list:match("a=b, c = hi; next = pi") --> { a = "b", c = "hi", next = "pi" } | ||
963 | </pre> | ||
964 | <p> | ||
965 | Each pair has the format <code>name = name</code> followed by | ||
966 | an optional separator (a comma or a semicolon). | ||
967 | The <code>pair</code> pattern encloses the pair in a group pattern, | ||
968 | so that the names become the values of a single capture. | ||
969 | The <code>list</code> pattern then folds these captures. | ||
970 | It starts with an empty table, | ||
971 | created by a table capture matching an empty string; | ||
972 | then for each capture (a pair of names) it applies <code>rawset</code> | ||
973 | over the accumulator (the table) and the capture values (the pair of names). | ||
974 | <code>rawset</code> returns the table itself, | ||
975 | so the accumulator is always the table. | ||
976 | </p> | ||
977 | |||
978 | <h3>Splitting a string</h3> | ||
979 | <p> | ||
980 | The following code builds a pattern that | ||
981 | splits a string using a given pattern | ||
982 | <code>sep</code> as a separator: | ||
983 | </p> | ||
984 | <pre class="example"> | ||
985 | function split (s, sep) | ||
986 | sep = lpeg.P(sep) | ||
987 | local elem = lpeg.C((1 - sep)^0) | ||
988 | local p = elem * (sep * elem)^0 | ||
989 | return lpeg.match(p, s) | ||
990 | end | ||
991 | </pre> | ||
992 | <p> | ||
993 | First the function ensures that <code>sep</code> is a proper pattern. | ||
994 | The pattern <code>elem</code> is a repetition of zero or more | ||
995 | arbitrary characters as long as there is not a match against | ||
996 | the separator. | ||
997 | It also captures its match. | ||
998 | The pattern <code>p</code> matches a list of elements separated | ||
999 | by <code>sep</code>. | ||
1000 | </p> | ||
1001 | |||
1002 | <p> | ||
1003 | If the split results in too many values, | ||
1004 | it may overflow the maximum number of values | ||
1005 | that can be returned by a Lua function. | ||
1006 | In this case, | ||
1007 | we can collect these values in a table: | ||
1008 | </p> | ||
1009 | <pre class="example"> | ||
1010 | function split (s, sep) | ||
1011 | sep = lpeg.P(sep) | ||
1012 | local elem = lpeg.C((1 - sep)^0) | ||
1013 | local p = lpeg.Ct(elem * (sep * elem)^0) -- make a table capture | ||
1014 | return lpeg.match(p, s) | ||
1015 | end | ||
1016 | </pre> | ||
1017 | |||
1018 | |||
1019 | <h3>Searching for a pattern</h3> | ||
1020 | <p> | ||
1021 | The primitive <code>match</code> works only in anchored mode. | ||
1022 | If we want to find a pattern anywhere in a string, | ||
1023 | we must write a pattern that matches anywhere. | ||
1024 | </p> | ||
1025 | |||
1026 | <p> | ||
1027 | Because patterns are composable, | ||
1028 | we can write a function that, | ||
1029 | given any arbitrary pattern <code>p</code>, | ||
1030 | returns a new pattern that searches for <code>p</code> | ||
1031 | anywhere in a string. | ||
1032 | There are several ways to do the search. | ||
1033 | One way is like this: | ||
1034 | </p> | ||
1035 | <pre class="example"> | ||
1036 | function anywhere (p) | ||
1037 | return lpeg.P{ p + 1 * lpeg.V(1) } | ||
1038 | end | ||
1039 | </pre> | ||
1040 | <p> | ||
1041 | This grammar has a straightforward reading: | ||
1042 | it matches <code>p</code> or skips one character and tries again. | ||
1043 | </p> | ||
1044 | |||
1045 | <p> | ||
1046 | If we want to know where the pattern is in the string | ||
1047 | (instead of knowing only that it is there somewhere), | ||
1048 | we can add position captures to the pattern: | ||
1049 | </p> | ||
1050 | <pre class="example"> | ||
1051 | local I = lpeg.Cp() | ||
1052 | function anywhere (p) | ||
1053 | return lpeg.P{ I * p * I + 1 * lpeg.V(1) } | ||
1054 | end | ||
1055 | |||
1056 | print(anywhere("world"):match("hello world!")) --> 7 12 | ||
1057 | </pre> | ||
1058 | |||
1059 | <p> | ||
1060 | Another option for the search is like this: | ||
1061 | </p> | ||
1062 | <pre class="example"> | ||
1063 | local I = lpeg.Cp() | ||
1064 | function anywhere (p) | ||
1065 | return (1 - lpeg.P(p))^0 * I * p * I | ||
1066 | end | ||
1067 | </pre> | ||
1068 | <p> | ||
1069 | Again the pattern has a straightforward reading: | ||
1070 | it skips as many characters as possible while not matching <code>p</code>, | ||
1071 | and then matches <code>p</code> (plus appropriate captures). | ||
1072 | </p> | ||
1073 | |||
1074 | <p> | ||
1075 | If we want to look for a pattern only at word boundaries, | ||
1076 | we can use the following transformer: | ||
1077 | </p> | ||
1078 | |||
1079 | <pre class="example"> | ||
1080 | local t = lpeg.locale() | ||
1081 | |||
1082 | function atwordboundary (p) | ||
1083 | return lpeg.P{ | ||
1084 | [1] = p + t.alpha^0 * (1 - t.alpha)^1 * lpeg.V(1) | ||
1085 | } | ||
1086 | end | ||
1087 | </pre> | ||
1088 | |||
1089 | |||
1090 | <h3><a name="balanced"></a>Balanced parentheses</h3> | ||
1091 | <p> | ||
1092 | The following pattern matches only strings with balanced parentheses: | ||
1093 | </p> | ||
1094 | <pre class="example"> | ||
1095 | b = lpeg.P{ "(" * ((1 - lpeg.S"()") + lpeg.V(1))^0 * ")" } | ||
1096 | </pre> | ||
1097 | <p> | ||
1098 | Reading the first (and only) rule of the given grammar, | ||
1099 | we have that a balanced string is | ||
1100 | an open parenthesis, | ||
1101 | followed by zero or more repetitions of either | ||
1102 | a non-parenthesis character or | ||
1103 | a balanced string (<code>lpeg.V(1)</code>), | ||
1104 | followed by a closing parenthesis. | ||
1105 | </p> | ||
1106 | |||
1107 | |||
1108 | <h3>Global substitution</h3> | ||
1109 | <p> | ||
1110 | The next example does a job somewhat similar to <code>string.gsub</code>. | ||
1111 | It receives a pattern and a replacement value, | ||
1112 | and substitutes the replacement value for all occurrences of the pattern | ||
1113 | in a given string: | ||
1114 | </p> | ||
1115 | <pre class="example"> | ||
1116 | function gsub (s, patt, repl) | ||
1117 | patt = lpeg.P(patt) | ||
1118 | patt = lpeg.Cs((patt / repl + 1)^0) | ||
1119 | return lpeg.match(patt, s) | ||
1120 | end | ||
1121 | </pre> | ||
1122 | <p> | ||
1123 | As in <code>string.gsub</code>, | ||
1124 | the replacement value can be a string, | ||
1125 | a function, or a table. | ||
1126 | </p> | ||
1127 | |||
1128 | |||
1129 | <h3><a name="CSV"></a>Comma-Separated Values (CSV)</h3> | ||
1130 | <p> | ||
1131 | This example breaks a string into comma-separated values, | ||
1132 | returning all fields: | ||
1133 | </p> | ||
1134 | <pre class="example"> | ||
1135 | local field = '"' * lpeg.Cs(((lpeg.P(1) - '"') + lpeg.P'""' / '"')^0) * '"' + | ||
1136 | lpeg.C((1 - lpeg.S',\n"')^0) | ||
1137 | |||
1138 | local record = field * (',' * field)^0 * (lpeg.P'\n' + -1) | ||
1139 | |||
1140 | function csv (s) | ||
1141 | return lpeg.match(record, s) | ||
1142 | end | ||
1143 | </pre> | ||
1144 | <p> | ||
1145 | A field is either a quoted field | ||
1146 | (which may contain any character except an individual quote, | ||
1147 | which may be written as two quotes that are replaced by one) | ||
1148 | or an unquoted field | ||
1149 | (which cannot contain commas, newlines, or quotes). | ||
1150 | A record is a list of fields separated by commas, | ||
1151 | ending with a newline or the string end (-1). | ||
1152 | </p> | ||
1153 | |||
1154 | <p> | ||
1155 | As it is, | ||
1156 | the previous pattern returns each field as a separate result. | ||
1157 | If we add a table capture in the definition of <code>record</code>, | ||
1158 | the pattern will return instead a single table | ||
1159 | containing all fields: | ||
1160 | </p> | ||
1161 | <pre> | ||
1162 | local record = lpeg.Ct(field * (',' * field)^0) * (lpeg.P'\n' + -1) | ||
1163 | </pre> | ||
1164 | |||
1165 | |||
1166 | <h3>UTF-8 and Latin 1</h3> | ||
1167 | <p> | ||
1168 | It is not difficult to use LPeg to convert a string from | ||
1169 | UTF-8 encoding to Latin 1 (ISO 8859-1): | ||
1170 | </p> | ||
1171 | |||
1172 | <pre class="example"> | ||
1173 | -- convert a two-byte UTF-8 sequence to a Latin 1 character | ||
1174 | local function f2 (s) | ||
1175 | local c1, c2 = string.byte(s, 1, 2) | ||
1176 | return string.char(c1 * 64 + c2 - 12416) | ||
1177 | end | ||
1178 | |||
1179 | local utf8 = lpeg.R("\0\127") | ||
1180 | + lpeg.R("\194\195") * lpeg.R("\128\191") / f2 | ||
1181 | |||
1182 | local decode_pattern = lpeg.Cs(utf8^0) * -1 | ||
1183 | </pre> | ||
1184 | <p> | ||
1185 | In this code, | ||
1186 | the definition of UTF-8 is already restricted to the | ||
1187 | Latin 1 range (from 0 to 255). | ||
1188 | Any encoding outside this range (as well as any invalid encoding) | ||
1189 | will not match that pattern. | ||
1190 | </p> | ||
1191 | |||
1192 | <p> | ||
1193 | As the definition of <code>decode_pattern</code> demands that | ||
1194 | the pattern matches the whole input (because of the -1 at its end), | ||
1195 | any invalid string will simply fail to match, | ||
1196 | without any useful information about the problem. | ||
1197 | We can improve this situation by redefining <code>decode_pattern</code> | ||
1198 | as follows: | ||
1199 | </p> | ||
1200 | <pre class="example"> | ||
1201 | local function er (_, i) error("invalid encoding at position " .. i) end | ||
1202 | |||
1203 | local decode_pattern = lpeg.Cs(utf8^0) * (-1 + lpeg.P(er)) | ||
1204 | </pre> | ||
1205 | <p> | ||
1206 | Now, if the pattern <code>utf8^0</code> stops | ||
1207 | before the end of the string, | ||
1208 | an appropriate error function is called. | ||
1209 | </p> | ||
1210 | |||
1211 | |||
1212 | <h3>UTF-8 and Unicode</h3> | ||
1213 | <p> | ||
1214 | We can extend the previous patterns to handle all Unicode code points. | ||
1215 | Of course, | ||
1216 | we cannot translate them to Latin 1 or any other one-byte encoding. | ||
1217 | Instead, our translation results in an array with the code points | ||
1218 | represented as numbers. | ||
1219 | The full code is here: | ||
1220 | </p> | ||
1221 | <pre class="example"> | ||
1222 | -- decode a two-byte UTF-8 sequence | ||
1223 | local function f2 (s) | ||
1224 | local c1, c2 = string.byte(s, 1, 2) | ||
1225 | return c1 * 64 + c2 - 12416 | ||
1226 | end | ||
1227 | |||
1228 | -- decode a three-byte UTF-8 sequence | ||
1229 | local function f3 (s) | ||
1230 | local c1, c2, c3 = string.byte(s, 1, 3) | ||
1231 | return (c1 * 64 + c2) * 64 + c3 - 925824 | ||
1232 | end | ||
1233 | |||
1234 | -- decode a four-byte UTF-8 sequence | ||
1235 | local function f4 (s) | ||
1236 | local c1, c2, c3, c4 = string.byte(s, 1, 4) | ||
1237 | return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168 | ||
1238 | end | ||
1239 | |||
1240 | local cont = lpeg.R("\128\191") -- continuation byte | ||
1241 | |||
1242 | local utf8 = lpeg.R("\0\127") / string.byte | ||
1243 | + lpeg.R("\194\223") * cont / f2 | ||
1244 | + lpeg.R("\224\239") * cont * cont / f3 | ||
1245 | + lpeg.R("\240\244") * cont * cont * cont / f4 | ||
1246 | |||
1247 | local decode_pattern = lpeg.Ct(utf8^0) * -1 | ||
1248 | </pre> | ||
1249 | |||
1250 | |||
1251 | <h3>Lua's long strings</h3> | ||
1252 | <p> | ||
1253 | A long string in Lua starts with the pattern <code>[=*[</code> | ||
1254 | and ends at the first occurrence of <code>]=*]</code> with | ||
1255 | exactly the same number of equal signs. | ||
1256 | If the opening brackets are followed by a newline, | ||
1257 | this newline is discarded | ||
1258 | (that is, it is not part of the string). | ||
1259 | </p> | ||
1260 | |||
1261 | <p> | ||
1262 | To match a long string in Lua, | ||
1263 | the pattern must capture the first repetition of equal signs and then, | ||
1264 | whenever it finds a candidate for closing the string, | ||
1265 | check whether it has the same number of equal signs. | ||
1266 | </p> | ||
1267 | |||
1268 | <pre class="example"> | ||
1269 | equals = lpeg.P"="^0 | ||
1270 | open = "[" * lpeg.Cg(equals, "init") * "[" * lpeg.P"\n"^-1 | ||
1271 | close = "]" * lpeg.C(equals) * "]" | ||
1272 | closeeq = lpeg.Cmt(close * lpeg.Cb("init"), function (s, i, a, b) return a == b end) | ||
1273 | string = open * lpeg.C((lpeg.P(1) - closeeq)^0) * close / 1 | ||
1274 | </pre> | ||
1275 | |||
1276 | <p> | ||
1277 | The <code>open</code> pattern matches <code>[=*[</code>, | ||
1278 | capturing the repetitions of equal signs in a group named <code>init</code>; | ||
1279 | it also discards an optional newline, if present. | ||
1280 | The <code>close</code> pattern matches <code>]=*]</code>, | ||
1281 | also capturing the repetitions of equal signs. | ||
1282 | The <code>closeeq</code> pattern first matches <code>close</code>; | ||
1283 | then it uses a back capture to recover the capture made | ||
1284 | by the previous <code>open</code>, | ||
1285 | which is named <code>init</code>; | ||
1286 | finally it uses a match-time capture to check | ||
1287 | whether both captures are equal. | ||
1288 | The <code>string</code> pattern starts with an <code>open</code>, | ||
1289 | then it goes as far as possible until matching <code>closeeq</code>, | ||
1290 | and then matches the final <code>close</code>. | ||
1291 | The final numbered capture simply discards | ||
1292 | the capture made by <code>close</code>. | ||
1293 | </p> | ||
1294 | |||
1295 | |||
1296 | <h3>Arithmetic expressions</h3> | ||
1297 | <p> | ||
1298 | This example is a complete parser and evaluator for simple | ||
1299 | arithmetic expressions. | ||
1300 | We write it in two styles. | ||
1301 | The first approach first builds a syntax tree and then | ||
1302 | traverses this tree to compute the expression value: | ||
1303 | </p> | ||
1304 | <pre class="example"> | ||
1305 | -- Lexical Elements | ||
1306 | local Space = lpeg.S(" \n\t")^0 | ||
1307 | local Number = lpeg.C(lpeg.P"-"^-1 * lpeg.R("09")^1) * Space | ||
1308 | local TermOp = lpeg.C(lpeg.S("+-")) * Space | ||
1309 | local FactorOp = lpeg.C(lpeg.S("*/")) * Space | ||
1310 | local Open = "(" * Space | ||
1311 | local Close = ")" * Space | ||
1312 | |||
1313 | -- Grammar | ||
1314 | local Exp, Term, Factor = lpeg.V"Exp", lpeg.V"Term", lpeg.V"Factor" | ||
1315 | G = lpeg.P{ Exp, | ||
1316 | Exp = lpeg.Ct(Term * (TermOp * Term)^0); | ||
1317 | Term = lpeg.Ct(Factor * (FactorOp * Factor)^0); | ||
1318 | Factor = Number + Open * Exp * Close; | ||
1319 | } | ||
1320 | |||
1321 | G = Space * G * -1 | ||
1322 | |||
1323 | -- Evaluator | ||
1324 | function eval (x) | ||
1325 | if type(x) == "string" then | ||
1326 | return tonumber(x) | ||
1327 | else | ||
1328 | local op1 = eval(x[1]) | ||
1329 | for i = 2, #x, 2 do | ||
1330 | local op = x[i] | ||
1331 | local op2 = eval(x[i + 1]) | ||
1332 | if (op == "+") then op1 = op1 + op2 | ||
1333 | elseif (op == "-") then op1 = op1 - op2 | ||
1334 | elseif (op == "*") then op1 = op1 * op2 | ||
1335 | elseif (op == "/") then op1 = op1 / op2 | ||
1336 | end | ||
1337 | end | ||
1338 | return op1 | ||
1339 | end | ||
1340 | end | ||
1341 | |||
1342 | -- Parser/Evaluator | ||
1343 | function evalExp (s) | ||
1344 | local t = lpeg.match(G, s) | ||
1345 | if not t then error("syntax error", 2) end | ||
1346 | return eval(t) | ||
1347 | end | ||
1348 | |||
1349 | -- small example | ||
1350 | print(evalExp"3 + 5*9 / (1+1) - 12") --> 13.5 | ||
1351 | </pre> | ||
1352 | |||
1353 | <p> | ||
1354 | The second style computes the expression value on the fly, | ||
1355 | without building the syntax tree. | ||
1356 | The following grammar takes this approach. | ||
1357 | (It assumes the same lexical elements as before.) | ||
1358 | </p> | ||
1359 | <pre class="example"> | ||
1360 | -- Auxiliary function | ||
1361 | function eval (v1, op, v2) | ||
1362 | if (op == "+") then return v1 + v2 | ||
1363 | elseif (op == "-") then return v1 - v2 | ||
1364 | elseif (op == "*") then return v1 * v2 | ||
1365 | elseif (op == "/") then return v1 / v2 | ||
1366 | end | ||
1367 | end | ||
1368 | |||
1369 | -- Grammar | ||
1370 | local V = lpeg.V | ||
1371 | G = lpeg.P{ "Exp", | ||
1372 | Exp = lpeg.Cf(V"Term" * lpeg.Cg(TermOp * V"Term")^0, eval); | ||
1373 | Term = lpeg.Cf(V"Factor" * lpeg.Cg(FactorOp * V"Factor")^0, eval); | ||
1374 | Factor = Number / tonumber + Open * V"Exp" * Close; | ||
1375 | } | ||
1376 | |||
1377 | -- small example | ||
1378 | print(lpeg.match(G, "3 + 5*9 / (1+1) - 12")) --> 13.5 | ||
1379 | </pre> | ||
1380 | <p> | ||
1381 | Note the use of the fold (accumulator) capture. | ||
1382 | To compute the value of an expression, | ||
1383 | the accumulator starts with the value of the first term, | ||
1384 | and then applies <code>eval</code> over | ||
1385 | the accumulator, the operator, | ||
1386 | and the new term for each repetition. | ||
1387 | </p> | ||
1388 | |||
1389 | |||
1390 | |||
1391 | <h2><a name="download"></a>Download</h2> | ||
1392 | |||
1393 | <p>LPeg | ||
1394 | <a href="http://www.inf.puc-rio.br/~roberto/lpeg/lpeg-1.0.1.tar.gz">source code</a>.</p> | ||
1395 | |||
1396 | |||
1397 | <h2><a name="license">License</a></h2> | ||
1398 | |||
1399 | <p> | ||
1400 | Copyright © 2007-2017 Lua.org, PUC-Rio. | ||
1401 | </p> | ||
1402 | <p> | ||
1403 | Permission is hereby granted, free of charge, | ||
1404 | to any person obtaining a copy of this software and | ||
1405 | associated documentation files (the "Software"), | ||
1406 | to deal in the Software without restriction, | ||
1407 | including without limitation the rights to use, | ||
1408 | copy, modify, merge, publish, distribute, sublicense, | ||
1409 | and/or sell copies of the Software, | ||
1410 | and to permit persons to whom the Software is | ||
1411 | furnished to do so, | ||
1412 | subject to the following conditions: | ||
1413 | </p> | ||
1414 | |||
1415 | <p> | ||
1416 | The above copyright notice and this permission notice | ||
1417 | shall be included in all copies or substantial portions of the Software. | ||
1418 | </p> | ||
1419 | |||
1420 | <p> | ||
1421 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
1422 | EXPRESS OR IMPLIED, | ||
1423 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
1424 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | ||
1425 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, | ||
1426 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | ||
1427 | TORT OR OTHERWISE, ARISING FROM, | ||
1428 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
1429 | THE SOFTWARE. | ||
1430 | </p> | ||
1431 | |||
1432 | </div> <!-- id="content" --> | ||
1433 | |||
1434 | </div> <!-- id="main" --> | ||
1435 | |||
1436 | <div id="about"> | ||
1437 | <p><small> | ||
1438 | $Id: lpeg.html,v 1.77 2017/01/13 13:40:05 roberto Exp $ | ||
1439 | </small></p> | ||
1440 | </div> <!-- id="about" --> | ||
1441 | |||
1442 | </div> <!-- id="container" --> | ||
1443 | |||
1444 | </body> | ||
1445 | </html> | ||
diff --git a/lpprint.c b/lpprint.c new file mode 100644 index 0000000..f7be408 --- /dev/null +++ b/lpprint.c | |||
@@ -0,0 +1,244 @@ | |||
1 | /* | ||
2 | ** $Id: lpprint.c,v 1.10 2016/09/13 16:06:03 roberto Exp $ | ||
3 | ** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license) | ||
4 | */ | ||
5 | |||
6 | #include <ctype.h> | ||
7 | #include <limits.h> | ||
8 | #include <stdio.h> | ||
9 | |||
10 | |||
11 | #include "lptypes.h" | ||
12 | #include "lpprint.h" | ||
13 | #include "lpcode.h" | ||
14 | |||
15 | |||
16 | #if defined(LPEG_DEBUG) | ||
17 | |||
18 | /* | ||
19 | ** {====================================================== | ||
20 | ** Printing patterns (for debugging) | ||
21 | ** ======================================================= | ||
22 | */ | ||
23 | |||
24 | |||
25 | void printcharset (const byte *st) { | ||
26 | int i; | ||
27 | printf("["); | ||
28 | for (i = 0; i <= UCHAR_MAX; i++) { | ||
29 | int first = i; | ||
30 | while (testchar(st, i) && i <= UCHAR_MAX) i++; | ||
31 | if (i - 1 == first) /* unary range? */ | ||
32 | printf("(%02x)", first); | ||
33 | else if (i - 1 > first) /* non-empty range? */ | ||
34 | printf("(%02x-%02x)", first, i - 1); | ||
35 | } | ||
36 | printf("]"); | ||
37 | } | ||
38 | |||
39 | |||
/*
** Return a printable name for capture kind 'kind'. ('kind' is used
** directly as an index; no range check is done.)
*/
static const char *capkind (int kind) {
  static const char *const kindnames[] = {
    "close", "position", "constant", "backref",
    "argument", "simple", "table", "function",
    "query", "string", "num", "substitution", "fold",
    "runtime", "group"
  };
  return kindnames[kind];
}
48 | |||
49 | |||
50 | static void printjmp (const Instruction *op, const Instruction *p) { | ||
51 | printf("-> %d", (int)(p + (p + 1)->offset - op)); | ||
52 | } | ||
53 | |||
54 | |||
55 | void printinst (const Instruction *op, const Instruction *p) { | ||
56 | const char *const names[] = { | ||
57 | "any", "char", "set", | ||
58 | "testany", "testchar", "testset", | ||
59 | "span", "behind", | ||
60 | "ret", "end", | ||
61 | "choice", "jmp", "call", "open_call", | ||
62 | "commit", "partial_commit", "back_commit", "failtwice", "fail", "giveup", | ||
63 | "fullcapture", "opencapture", "closecapture", "closeruntime" | ||
64 | }; | ||
65 | printf("%02ld: %s ", (long)(p - op), names[p->i.code]); | ||
66 | switch ((Opcode)p->i.code) { | ||
67 | case IChar: { | ||
68 | printf("'%c'", p->i.aux); | ||
69 | break; | ||
70 | } | ||
71 | case ITestChar: { | ||
72 | printf("'%c'", p->i.aux); printjmp(op, p); | ||
73 | break; | ||
74 | } | ||
75 | case IFullCapture: { | ||
76 | printf("%s (size = %d) (idx = %d)", | ||
77 | capkind(getkind(p)), getoff(p), p->i.key); | ||
78 | break; | ||
79 | } | ||
80 | case IOpenCapture: { | ||
81 | printf("%s (idx = %d)", capkind(getkind(p)), p->i.key); | ||
82 | break; | ||
83 | } | ||
84 | case ISet: { | ||
85 | printcharset((p+1)->buff); | ||
86 | break; | ||
87 | } | ||
88 | case ITestSet: { | ||
89 | printcharset((p+2)->buff); printjmp(op, p); | ||
90 | break; | ||
91 | } | ||
92 | case ISpan: { | ||
93 | printcharset((p+1)->buff); | ||
94 | break; | ||
95 | } | ||
96 | case IOpenCall: { | ||
97 | printf("-> %d", (p + 1)->offset); | ||
98 | break; | ||
99 | } | ||
100 | case IBehind: { | ||
101 | printf("%d", p->i.aux); | ||
102 | break; | ||
103 | } | ||
104 | case IJmp: case ICall: case ICommit: case IChoice: | ||
105 | case IPartialCommit: case IBackCommit: case ITestAny: { | ||
106 | printjmp(op, p); | ||
107 | break; | ||
108 | } | ||
109 | default: break; | ||
110 | } | ||
111 | printf("\n"); | ||
112 | } | ||
113 | |||
114 | |||
115 | void printpatt (Instruction *p, int n) { | ||
116 | Instruction *op = p; | ||
117 | while (p < op + n) { | ||
118 | printinst(op, p); | ||
119 | p += sizei(p); | ||
120 | } | ||
121 | } | ||
122 | |||
123 | |||
124 | #if defined(LPEG_DEBUG) | ||
125 | static void printcap (Capture *cap) { | ||
126 | printf("%s (idx: %d - size: %d) -> %p\n", | ||
127 | capkind(cap->kind), cap->idx, cap->siz, cap->s); | ||
128 | } | ||
129 | |||
130 | |||
131 | void printcaplist (Capture *cap, Capture *limit) { | ||
132 | printf(">======\n"); | ||
133 | for (; cap->s && (limit == NULL || cap < limit); cap++) | ||
134 | printcap(cap); | ||
135 | printf("=======\n"); | ||
136 | } | ||
137 | #endif | ||
138 | |||
139 | /* }====================================================== */ | ||
140 | |||
141 | |||
142 | /* | ||
143 | ** {====================================================== | ||
144 | ** Printing trees (for debugging) | ||
145 | ** ======================================================= | ||
146 | */ | ||
147 | |||
/* printable names for tree nodes, indexed by the node's tag */
static const char *const tagnames[] = {
  "char",
  "set",
  "any",
  "true",
  "false",
  "rep",
  "seq",
  "choice",
  "not",
  "and",
  "call",
  "opencall",
  "rule",
  "grammar",
  "behind",
  "capture",
  "run-time"
};
158 | |||
159 | |||
160 | void printtree (TTree *tree, int ident) { | ||
161 | int i; | ||
162 | for (i = 0; i < ident; i++) printf(" "); | ||
163 | printf("%s", tagnames[tree->tag]); | ||
164 | switch (tree->tag) { | ||
165 | case TChar: { | ||
166 | int c = tree->u.n; | ||
167 | if (isprint(c)) | ||
168 | printf(" '%c'\n", c); | ||
169 | else | ||
170 | printf(" (%02X)\n", c); | ||
171 | break; | ||
172 | } | ||
173 | case TSet: { | ||
174 | printcharset(treebuffer(tree)); | ||
175 | printf("\n"); | ||
176 | break; | ||
177 | } | ||
178 | case TOpenCall: case TCall: { | ||
179 | assert(sib2(tree)->tag == TRule); | ||
180 | printf(" key: %d (rule: %d)\n", tree->key, sib2(tree)->cap); | ||
181 | break; | ||
182 | } | ||
183 | case TBehind: { | ||
184 | printf(" %d\n", tree->u.n); | ||
185 | printtree(sib1(tree), ident + 2); | ||
186 | break; | ||
187 | } | ||
188 | case TCapture: { | ||
189 | printf(" kind: '%s' key: %d\n", capkind(tree->cap), tree->key); | ||
190 | printtree(sib1(tree), ident + 2); | ||
191 | break; | ||
192 | } | ||
193 | case TRule: { | ||
194 | printf(" n: %d key: %d\n", tree->cap, tree->key); | ||
195 | printtree(sib1(tree), ident + 2); | ||
196 | break; /* do not print next rule as a sibling */ | ||
197 | } | ||
198 | case TGrammar: { | ||
199 | TTree *rule = sib1(tree); | ||
200 | printf(" %d\n", tree->u.n); /* number of rules */ | ||
201 | for (i = 0; i < tree->u.n; i++) { | ||
202 | printtree(rule, ident + 2); | ||
203 | rule = sib2(rule); | ||
204 | } | ||
205 | assert(rule->tag == TTrue); /* sentinel */ | ||
206 | break; | ||
207 | } | ||
208 | default: { | ||
209 | int sibs = numsiblings[tree->tag]; | ||
210 | printf("\n"); | ||
211 | if (sibs >= 1) { | ||
212 | printtree(sib1(tree), ident + 2); | ||
213 | if (sibs >= 2) | ||
214 | printtree(sib2(tree), ident + 2); | ||
215 | } | ||
216 | break; | ||
217 | } | ||
218 | } | ||
219 | } | ||
220 | |||
221 | |||
222 | void printktable (lua_State *L, int idx) { | ||
223 | int n, i; | ||
224 | lua_getuservalue(L, idx); | ||
225 | if (lua_isnil(L, -1)) /* no ktable? */ | ||
226 | return; | ||
227 | n = lua_rawlen(L, -1); | ||
228 | printf("["); | ||
229 | for (i = 1; i <= n; i++) { | ||
230 | printf("%d = ", i); | ||
231 | lua_rawgeti(L, -1, i); | ||
232 | if (lua_isstring(L, -1)) | ||
233 | printf("%s ", lua_tostring(L, -1)); | ||
234 | else | ||
235 | printf("%s ", lua_typename(L, lua_type(L, -1))); | ||
236 | lua_pop(L, 1); | ||
237 | } | ||
238 | printf("]\n"); | ||
239 | /* leave ktable at the stack */ | ||
240 | } | ||
241 | |||
242 | /* }====================================================== */ | ||
243 | |||
244 | #endif | ||
diff --git a/lpprint.h b/lpprint.h new file mode 100644 index 0000000..6329760 --- /dev/null +++ b/lpprint.h | |||
@@ -0,0 +1,36 @@ | |||
1 | /* | ||
2 | ** $Id: lpprint.h,v 1.2 2015/06/12 18:18:08 roberto Exp $ | ||
3 | */ | ||
4 | |||
5 | |||
6 | #if !defined(lpprint_h) | ||
7 | #define lpprint_h | ||
8 | |||
9 | |||
10 | #include "lptree.h" | ||
11 | #include "lpvm.h" | ||
12 | |||
13 | |||
#if defined(LPEG_DEBUG)

/* debug build: prototypes of the printing functions */
void printpatt (Instruction *p, int n);
void printtree (TTree *tree, int ident);
void printktable (lua_State *L, int idx);
void printcharset (const byte *st);
void printcaplist (Capture *cap, Capture *limit);
void printinst (const Instruction *op, const Instruction *p);

#else

/*
** non-debug build: these three names expand to a call to 'luaL_error'.
** 'printtree' and 'printpatt' receive no Lua state, so their expansion
** uses whatever variable named 'L' is in scope where they are used.
** (No replacement is defined here for 'printcharset', 'printcaplist',
** or 'printinst'.)
*/
#define printktable(L,idx) \
	luaL_error(L, "function only implemented in debug mode")
#define printtree(tree,i) \
	luaL_error(L, "function only implemented in debug mode")
#define printpatt(p,n) \
	luaL_error(L, "function only implemented in debug mode")

#endif
33 | |||
34 | |||
35 | #endif | ||
36 | |||
diff --git a/lptree.c b/lptree.c new file mode 100644 index 0000000..37bfaf0 --- /dev/null +++ b/lptree.c | |||
@@ -0,0 +1,1305 @@ | |||
1 | /* | ||
2 | ** $Id: lptree.c,v 1.23 2017/12/14 15:30:04 roberto Exp $ | ||
3 | ** Copyright 2013, Lua.org & PUC-Rio (see 'lpeg.html' for license) | ||
4 | */ | ||
5 | |||
6 | #include <ctype.h> | ||
7 | #include <limits.h> | ||
8 | #include <string.h> | ||
9 | |||
10 | |||
11 | #include "lua.h" | ||
12 | #include "lauxlib.h" | ||
13 | |||
14 | #include "lptypes.h" | ||
15 | #include "lpcap.h" | ||
16 | #include "lpcode.h" | ||
17 | #include "lpprint.h" | ||
18 | #include "lptree.h" | ||
19 | |||
20 | |||
21 | /* number of siblings for each tree */ | ||
22 | const byte numsiblings[] = { | ||
23 | 0, 0, 0, /* char, set, any */ | ||
24 | 0, 0, /* true, false */ | ||
25 | 1, /* rep */ | ||
26 | 2, 2, /* seq, choice */ | ||
27 | 1, 1, /* not, and */ | ||
28 | 0, 0, 2, 1, /* call, opencall, rule, grammar */ | ||
29 | 1, /* behind */ | ||
30 | 1, 1 /* capture, runtime capture */ | ||
31 | }; | ||
32 | |||
33 | |||
34 | static TTree *newgrammar (lua_State *L, int arg); | ||
35 | |||
36 | |||
37 | /* | ||
38 | ** returns a reasonable name for value at index 'idx' on the stack | ||
39 | */ | ||
40 | static const char *val2str (lua_State *L, int idx) { | ||
41 | const char *k = lua_tostring(L, idx); | ||
42 | if (k != NULL) | ||
43 | return lua_pushfstring(L, "%s", k); | ||
44 | else | ||
45 | return lua_pushfstring(L, "(a %s)", luaL_typename(L, idx)); | ||
46 | } | ||
47 | |||
48 | |||
49 | /* | ||
50 | ** Fix a TOpenCall into a TCall node, using table 'postable' to | ||
51 | ** translate a key to its rule address in the tree. Raises an | ||
52 | ** error if key does not exist. | ||
53 | */ | ||
54 | static void fixonecall (lua_State *L, int postable, TTree *g, TTree *t) { | ||
55 | int n; | ||
56 | lua_rawgeti(L, -1, t->key); /* get rule's name */ | ||
57 | lua_gettable(L, postable); /* query name in position table */ | ||
58 | n = lua_tonumber(L, -1); /* get (absolute) position */ | ||
59 | lua_pop(L, 1); /* remove position */ | ||
60 | if (n == 0) { /* no position? */ | ||
61 | lua_rawgeti(L, -1, t->key); /* get rule's name again */ | ||
62 | luaL_error(L, "rule '%s' undefined in given grammar", val2str(L, -1)); | ||
63 | } | ||
64 | t->tag = TCall; | ||
65 | t->u.ps = n - (t - g); /* position relative to node */ | ||
66 | assert(sib2(t)->tag == TRule); | ||
67 | sib2(t)->key = t->key; /* fix rule's key */ | ||
68 | } | ||
69 | |||
70 | |||
71 | /* | ||
72 | ** Transform left associative constructions into right | ||
73 | ** associative ones, for sequence and choice; that is: | ||
74 | ** (t11 + t12) + t2 => t11 + (t12 + t2) | ||
75 | ** (t11 * t12) * t2 => t11 * (t12 * t2) | ||
76 | ** (that is, Op (Op t11 t12) t2 => Op t11 (Op t12 t2)) | ||
77 | */ | ||
78 | static void correctassociativity (TTree *tree) { | ||
79 | TTree *t1 = sib1(tree); | ||
80 | assert(tree->tag == TChoice || tree->tag == TSeq); | ||
81 | while (t1->tag == tree->tag) { | ||
82 | int n1size = tree->u.ps - 1; /* t1 == Op t11 t12 */ | ||
83 | int n11size = t1->u.ps - 1; | ||
84 | int n12size = n1size - n11size - 1; | ||
85 | memmove(sib1(tree), sib1(t1), n11size * sizeof(TTree)); /* move t11 */ | ||
86 | tree->u.ps = n11size + 1; | ||
87 | sib2(tree)->tag = tree->tag; | ||
88 | sib2(tree)->u.ps = n12size + 1; | ||
89 | } | ||
90 | } | ||
91 | |||
92 | |||
93 | /* | ||
94 | ** Make final adjustments in a tree. Fix open calls in tree 't', | ||
95 | ** making them refer to their respective rules or raising appropriate | ||
96 | ** errors (if not inside a grammar). Correct associativity of associative | ||
97 | ** constructions (making them right associative). Assume that tree's | ||
98 | ** ktable is at the top of the stack (for error messages). | ||
99 | */ | ||
100 | static void finalfix (lua_State *L, int postable, TTree *g, TTree *t) { | ||
101 | tailcall: | ||
102 | switch (t->tag) { | ||
103 | case TGrammar: /* subgrammars were already fixed */ | ||
104 | return; | ||
105 | case TOpenCall: { | ||
106 | if (g != NULL) /* inside a grammar? */ | ||
107 | fixonecall(L, postable, g, t); | ||
108 | else { /* open call outside grammar */ | ||
109 | lua_rawgeti(L, -1, t->key); | ||
110 | luaL_error(L, "rule '%s' used outside a grammar", val2str(L, -1)); | ||
111 | } | ||
112 | break; | ||
113 | } | ||
114 | case TSeq: case TChoice: | ||
115 | correctassociativity(t); | ||
116 | break; | ||
117 | } | ||
118 | switch (numsiblings[t->tag]) { | ||
119 | case 1: /* finalfix(L, postable, g, sib1(t)); */ | ||
120 | t = sib1(t); goto tailcall; | ||
121 | case 2: | ||
122 | finalfix(L, postable, g, sib1(t)); | ||
123 | t = sib2(t); goto tailcall; /* finalfix(L, postable, g, sib2(t)); */ | ||
124 | default: assert(numsiblings[t->tag] == 0); break; | ||
125 | } | ||
126 | } | ||
127 | |||
128 | |||
129 | |||
130 | /* | ||
131 | ** {=================================================================== | ||
132 | ** KTable manipulation | ||
133 | ** | ||
134 | ** - The ktable of a pattern 'p' can be shared by other patterns that | ||
135 | ** contain 'p' and no other constants. Because of this sharing, we | ||
136 | ** should not add elements to a 'ktable' unless it was freshly created | ||
137 | ** for the new pattern. | ||
138 | ** | ||
139 | ** - The maximum index in a ktable is USHRT_MAX, because trees and | ||
140 | ** patterns use unsigned shorts to store those indices. | ||
141 | ** ==================================================================== | ||
142 | */ | ||
143 | |||
144 | /* | ||
145 | ** Create a new 'ktable' to the pattern at the top of the stack. | ||
146 | */ | ||
147 | static void newktable (lua_State *L, int n) { | ||
148 | lua_createtable(L, n, 0); /* create a fresh table */ | ||
149 | lua_setuservalue(L, -2); /* set it as 'ktable' for pattern */ | ||
150 | } | ||
151 | |||
152 | |||
153 | /* | ||
154 | ** Add element 'idx' to 'ktable' of pattern at the top of the stack; | ||
155 | ** Return index of new element. | ||
156 | ** If new element is nil, does not add it to table (as it would be | ||
157 | ** useless) and returns 0, as ktable[0] is always nil. | ||
158 | */ | ||
159 | static int addtoktable (lua_State *L, int idx) { | ||
160 | if (lua_isnil(L, idx)) /* nil value? */ | ||
161 | return 0; | ||
162 | else { | ||
163 | int n; | ||
164 | lua_getuservalue(L, -1); /* get ktable from pattern */ | ||
165 | n = lua_rawlen(L, -1); | ||
166 | if (n >= USHRT_MAX) | ||
167 | luaL_error(L, "too many Lua values in pattern"); | ||
168 | lua_pushvalue(L, idx); /* element to be added */ | ||
169 | lua_rawseti(L, -2, ++n); | ||
170 | lua_pop(L, 1); /* remove 'ktable' */ | ||
171 | return n; | ||
172 | } | ||
173 | } | ||
174 | |||
175 | |||
176 | /* | ||
177 | ** Return the number of elements in the ktable at 'idx'. | ||
178 | ** In Lua 5.2/5.3, default "environment" for patterns is nil, not | ||
179 | ** a table. Treat it as an empty table. In Lua 5.1, assumes that | ||
180 | ** the environment has no numeric indices (len == 0) | ||
181 | */ | ||
182 | static int ktablelen (lua_State *L, int idx) { | ||
183 | if (!lua_istable(L, idx)) return 0; | ||
184 | else return lua_rawlen(L, idx); | ||
185 | } | ||
186 | |||
187 | |||
188 | /* | ||
189 | ** Concatentate the contents of table 'idx1' into table 'idx2'. | ||
190 | ** (Assume that both indices are negative.) | ||
191 | ** Return the original length of table 'idx2' (or 0, if no | ||
192 | ** element was added, as there is no need to correct any index). | ||
193 | */ | ||
194 | static int concattable (lua_State *L, int idx1, int idx2) { | ||
195 | int i; | ||
196 | int n1 = ktablelen(L, idx1); | ||
197 | int n2 = ktablelen(L, idx2); | ||
198 | if (n1 + n2 > USHRT_MAX) | ||
199 | luaL_error(L, "too many Lua values in pattern"); | ||
200 | if (n1 == 0) return 0; /* nothing to correct */ | ||
201 | for (i = 1; i <= n1; i++) { | ||
202 | lua_rawgeti(L, idx1, i); | ||
203 | lua_rawseti(L, idx2 - 1, n2 + i); /* correct 'idx2' */ | ||
204 | } | ||
205 | return n2; | ||
206 | } | ||
207 | |||
208 | |||
209 | /* | ||
210 | ** When joining 'ktables', constants from one of the subpatterns must | ||
211 | ** be renumbered; 'correctkeys' corrects their indices (adding 'n' | ||
212 | ** to each of them) | ||
213 | */ | ||
214 | static void correctkeys (TTree *tree, int n) { | ||
215 | if (n == 0) return; /* no correction? */ | ||
216 | tailcall: | ||
217 | switch (tree->tag) { | ||
218 | case TOpenCall: case TCall: case TRunTime: case TRule: { | ||
219 | if (tree->key > 0) | ||
220 | tree->key += n; | ||
221 | break; | ||
222 | } | ||
223 | case TCapture: { | ||
224 | if (tree->key > 0 && tree->cap != Carg && tree->cap != Cnum) | ||
225 | tree->key += n; | ||
226 | break; | ||
227 | } | ||
228 | default: break; | ||
229 | } | ||
230 | switch (numsiblings[tree->tag]) { | ||
231 | case 1: /* correctkeys(sib1(tree), n); */ | ||
232 | tree = sib1(tree); goto tailcall; | ||
233 | case 2: | ||
234 | correctkeys(sib1(tree), n); | ||
235 | tree = sib2(tree); goto tailcall; /* correctkeys(sib2(tree), n); */ | ||
236 | default: assert(numsiblings[tree->tag] == 0); break; | ||
237 | } | ||
238 | } | ||
239 | |||
240 | |||
241 | /* | ||
242 | ** Join the ktables from p1 and p2 the ktable for the new pattern at the | ||
243 | ** top of the stack, reusing them when possible. | ||
244 | */ | ||
245 | static void joinktables (lua_State *L, int p1, TTree *t2, int p2) { | ||
246 | int n1, n2; | ||
247 | lua_getuservalue(L, p1); /* get ktables */ | ||
248 | lua_getuservalue(L, p2); | ||
249 | n1 = ktablelen(L, -2); | ||
250 | n2 = ktablelen(L, -1); | ||
251 | if (n1 == 0 && n2 == 0) /* are both tables empty? */ | ||
252 | lua_pop(L, 2); /* nothing to be done; pop tables */ | ||
253 | else if (n2 == 0 || lp_equal(L, -2, -1)) { /* 2nd table empty or equal? */ | ||
254 | lua_pop(L, 1); /* pop 2nd table */ | ||
255 | lua_setuservalue(L, -2); /* set 1st ktable into new pattern */ | ||
256 | } | ||
257 | else if (n1 == 0) { /* first table is empty? */ | ||
258 | lua_setuservalue(L, -3); /* set 2nd table into new pattern */ | ||
259 | lua_pop(L, 1); /* pop 1st table */ | ||
260 | } | ||
261 | else { | ||
262 | lua_createtable(L, n1 + n2, 0); /* create ktable for new pattern */ | ||
263 | /* stack: new p; ktable p1; ktable p2; new ktable */ | ||
264 | concattable(L, -3, -1); /* from p1 into new ktable */ | ||
265 | concattable(L, -2, -1); /* from p2 into new ktable */ | ||
266 | lua_setuservalue(L, -4); /* new ktable becomes 'p' environment */ | ||
267 | lua_pop(L, 2); /* pop other ktables */ | ||
268 | correctkeys(t2, n1); /* correction for indices from p2 */ | ||
269 | } | ||
270 | } | ||
271 | |||
272 | |||
273 | /* | ||
274 | ** copy 'ktable' of element 'idx' to new tree (on top of stack) | ||
275 | */ | ||
276 | static void copyktable (lua_State *L, int idx) { | ||
277 | lua_getuservalue(L, idx); | ||
278 | lua_setuservalue(L, -2); | ||
279 | } | ||
280 | |||
281 | |||
282 | /* | ||
283 | ** merge 'ktable' from 'stree' at stack index 'idx' into 'ktable' | ||
284 | ** from tree at the top of the stack, and correct corresponding | ||
285 | ** tree. | ||
286 | */ | ||
287 | static void mergektable (lua_State *L, int idx, TTree *stree) { | ||
288 | int n; | ||
289 | lua_getuservalue(L, -1); /* get ktables */ | ||
290 | lua_getuservalue(L, idx); | ||
291 | n = concattable(L, -1, -2); | ||
292 | lua_pop(L, 2); /* remove both ktables */ | ||
293 | correctkeys(stree, n); | ||
294 | } | ||
295 | |||
296 | |||
297 | /* | ||
298 | ** Create a new 'ktable' to the pattern at the top of the stack, adding | ||
299 | ** all elements from pattern 'p' (if not 0) plus element 'idx' to it. | ||
300 | ** Return index of new element. | ||
301 | */ | ||
302 | static int addtonewktable (lua_State *L, int p, int idx) { | ||
303 | newktable(L, 1); | ||
304 | if (p) | ||
305 | mergektable(L, p, NULL); | ||
306 | return addtoktable(L, idx); | ||
307 | } | ||
308 | |||
309 | /* }====================================================== */ | ||
310 | |||
311 | |||
312 | /* | ||
313 | ** {====================================================== | ||
314 | ** Tree generation | ||
315 | ** ======================================================= | ||
316 | */ | ||
317 | |||
318 | /* | ||
319 | ** In 5.2, could use 'luaL_testudata'... | ||
320 | */ | ||
321 | static int testpattern (lua_State *L, int idx) { | ||
322 | if (lua_touserdata(L, idx)) { /* value is a userdata? */ | ||
323 | if (lua_getmetatable(L, idx)) { /* does it have a metatable? */ | ||
324 | luaL_getmetatable(L, PATTERN_T); | ||
325 | if (lua_rawequal(L, -1, -2)) { /* does it have the correct mt? */ | ||
326 | lua_pop(L, 2); /* remove both metatables */ | ||
327 | return 1; | ||
328 | } | ||
329 | } | ||
330 | } | ||
331 | return 0; | ||
332 | } | ||
333 | |||
334 | |||
335 | static Pattern *getpattern (lua_State *L, int idx) { | ||
336 | return (Pattern *)luaL_checkudata(L, idx, PATTERN_T); | ||
337 | } | ||
338 | |||
339 | |||
340 | static int getsize (lua_State *L, int idx) { | ||
341 | return (lua_rawlen(L, idx) - sizeof(Pattern)) / sizeof(TTree) + 1; | ||
342 | } | ||
343 | |||
344 | |||
345 | static TTree *gettree (lua_State *L, int idx, int *len) { | ||
346 | Pattern *p = getpattern(L, idx); | ||
347 | if (len) | ||
348 | *len = getsize(L, idx); | ||
349 | return p->tree; | ||
350 | } | ||
351 | |||
352 | |||
353 | /* | ||
354 | ** create a pattern. Set its uservalue (the 'ktable') equal to its | ||
355 | ** metatable. (It could be any empty sequence; the metatable is at | ||
356 | ** hand here, so we use it.) | ||
357 | */ | ||
358 | static TTree *newtree (lua_State *L, int len) { | ||
359 | size_t size = (len - 1) * sizeof(TTree) + sizeof(Pattern); | ||
360 | Pattern *p = (Pattern *)lua_newuserdata(L, size); | ||
361 | luaL_getmetatable(L, PATTERN_T); | ||
362 | lua_pushvalue(L, -1); | ||
363 | lua_setuservalue(L, -3); | ||
364 | lua_setmetatable(L, -2); | ||
365 | p->code = NULL; p->codesize = 0; | ||
366 | return p->tree; | ||
367 | } | ||
368 | |||
369 | |||
370 | static TTree *newleaf (lua_State *L, int tag) { | ||
371 | TTree *tree = newtree(L, 1); | ||
372 | tree->tag = tag; | ||
373 | return tree; | ||
374 | } | ||
375 | |||
376 | |||
377 | static TTree *newcharset (lua_State *L) { | ||
378 | TTree *tree = newtree(L, bytes2slots(CHARSETSIZE) + 1); | ||
379 | tree->tag = TSet; | ||
380 | loopset(i, treebuffer(tree)[i] = 0); | ||
381 | return tree; | ||
382 | } | ||
383 | |||
384 | |||
385 | /* | ||
386 | ** add to tree a sequence where first sibling is 'sib' (with size | ||
387 | ** 'sibsize'); returns position for second sibling | ||
388 | */ | ||
389 | static TTree *seqaux (TTree *tree, TTree *sib, int sibsize) { | ||
390 | tree->tag = TSeq; tree->u.ps = sibsize + 1; | ||
391 | memcpy(sib1(tree), sib, sibsize * sizeof(TTree)); | ||
392 | return sib2(tree); | ||
393 | } | ||
394 | |||
395 | |||
396 | /* | ||
397 | ** Build a sequence of 'n' nodes, each with tag 'tag' and 'u.n' got | ||
398 | ** from the array 's' (or 0 if array is NULL). (TSeq is binary, so it | ||
399 | ** must build a sequence of sequence of sequence...) | ||
400 | */ | ||
401 | static void fillseq (TTree *tree, int tag, int n, const char *s) { | ||
402 | int i; | ||
403 | for (i = 0; i < n - 1; i++) { /* initial n-1 copies of Seq tag; Seq ... */ | ||
404 | tree->tag = TSeq; tree->u.ps = 2; | ||
405 | sib1(tree)->tag = tag; | ||
406 | sib1(tree)->u.n = s ? (byte)s[i] : 0; | ||
407 | tree = sib2(tree); | ||
408 | } | ||
409 | tree->tag = tag; /* last one does not need TSeq */ | ||
410 | tree->u.n = s ? (byte)s[i] : 0; | ||
411 | } | ||
412 | |||
413 | |||
414 | /* | ||
415 | ** Numbers as patterns: | ||
416 | ** 0 == true (always match); n == TAny repeated 'n' times; | ||
417 | ** -n == not (TAny repeated 'n' times) | ||
418 | */ | ||
419 | static TTree *numtree (lua_State *L, int n) { | ||
420 | if (n == 0) | ||
421 | return newleaf(L, TTrue); | ||
422 | else { | ||
423 | TTree *tree, *nd; | ||
424 | if (n > 0) | ||
425 | tree = nd = newtree(L, 2 * n - 1); | ||
426 | else { /* negative: code it as !(-n) */ | ||
427 | n = -n; | ||
428 | tree = newtree(L, 2 * n); | ||
429 | tree->tag = TNot; | ||
430 | nd = sib1(tree); | ||
431 | } | ||
432 | fillseq(nd, TAny, n, NULL); /* sequence of 'n' any's */ | ||
433 | return tree; | ||
434 | } | ||
435 | } | ||
436 | |||
437 | |||
438 | /* | ||
439 | ** Convert value at index 'idx' to a pattern | ||
440 | */ | ||
441 | static TTree *getpatt (lua_State *L, int idx, int *len) { | ||
442 | TTree *tree; | ||
443 | switch (lua_type(L, idx)) { | ||
444 | case LUA_TSTRING: { | ||
445 | size_t slen; | ||
446 | const char *s = lua_tolstring(L, idx, &slen); /* get string */ | ||
447 | if (slen == 0) /* empty? */ | ||
448 | tree = newleaf(L, TTrue); /* always match */ | ||
449 | else { | ||
450 | tree = newtree(L, 2 * (slen - 1) + 1); | ||
451 | fillseq(tree, TChar, slen, s); /* sequence of 'slen' chars */ | ||
452 | } | ||
453 | break; | ||
454 | } | ||
455 | case LUA_TNUMBER: { | ||
456 | int n = lua_tointeger(L, idx); | ||
457 | tree = numtree(L, n); | ||
458 | break; | ||
459 | } | ||
460 | case LUA_TBOOLEAN: { | ||
461 | tree = (lua_toboolean(L, idx) ? newleaf(L, TTrue) : newleaf(L, TFalse)); | ||
462 | break; | ||
463 | } | ||
464 | case LUA_TTABLE: { | ||
465 | tree = newgrammar(L, idx); | ||
466 | break; | ||
467 | } | ||
468 | case LUA_TFUNCTION: { | ||
469 | tree = newtree(L, 2); | ||
470 | tree->tag = TRunTime; | ||
471 | tree->key = addtonewktable(L, 0, idx); | ||
472 | sib1(tree)->tag = TTrue; | ||
473 | break; | ||
474 | } | ||
475 | default: { | ||
476 | return gettree(L, idx, len); | ||
477 | } | ||
478 | } | ||
479 | lua_replace(L, idx); /* put new tree into 'idx' slot */ | ||
480 | if (len) | ||
481 | *len = getsize(L, idx); | ||
482 | return tree; | ||
483 | } | ||
484 | |||
485 | |||
486 | /* | ||
487 | ** create a new tree, whith a new root and one sibling. | ||
488 | ** Sibling must be on the Lua stack, at index 1. | ||
489 | */ | ||
490 | static TTree *newroot1sib (lua_State *L, int tag) { | ||
491 | int s1; | ||
492 | TTree *tree1 = getpatt(L, 1, &s1); | ||
493 | TTree *tree = newtree(L, 1 + s1); /* create new tree */ | ||
494 | tree->tag = tag; | ||
495 | memcpy(sib1(tree), tree1, s1 * sizeof(TTree)); | ||
496 | copyktable(L, 1); | ||
497 | return tree; | ||
498 | } | ||
499 | |||
500 | |||
501 | /* | ||
502 | ** create a new tree, whith a new root and 2 siblings. | ||
503 | ** Siblings must be on the Lua stack, first one at index 1. | ||
504 | */ | ||
505 | static TTree *newroot2sib (lua_State *L, int tag) { | ||
506 | int s1, s2; | ||
507 | TTree *tree1 = getpatt(L, 1, &s1); | ||
508 | TTree *tree2 = getpatt(L, 2, &s2); | ||
509 | TTree *tree = newtree(L, 1 + s1 + s2); /* create new tree */ | ||
510 | tree->tag = tag; | ||
511 | tree->u.ps = 1 + s1; | ||
512 | memcpy(sib1(tree), tree1, s1 * sizeof(TTree)); | ||
513 | memcpy(sib2(tree), tree2, s2 * sizeof(TTree)); | ||
514 | joinktables(L, 1, sib2(tree), 2); | ||
515 | return tree; | ||
516 | } | ||
517 | |||
518 | |||
519 | static int lp_P (lua_State *L) { | ||
520 | luaL_checkany(L, 1); | ||
521 | getpatt(L, 1, NULL); | ||
522 | lua_settop(L, 1); | ||
523 | return 1; | ||
524 | } | ||
525 | |||
526 | |||
527 | /* | ||
528 | ** sequence operator; optimizations: | ||
529 | ** false x => false, x true => x, true x => x | ||
530 | ** (cannot do x . false => false because x may have runtime captures) | ||
531 | */ | ||
532 | static int lp_seq (lua_State *L) { | ||
533 | TTree *tree1 = getpatt(L, 1, NULL); | ||
534 | TTree *tree2 = getpatt(L, 2, NULL); | ||
535 | if (tree1->tag == TFalse || tree2->tag == TTrue) | ||
536 | lua_pushvalue(L, 1); /* false . x == false, x . true = x */ | ||
537 | else if (tree1->tag == TTrue) | ||
538 | lua_pushvalue(L, 2); /* true . x = x */ | ||
539 | else | ||
540 | newroot2sib(L, TSeq); | ||
541 | return 1; | ||
542 | } | ||
543 | |||
544 | |||
545 | /* | ||
546 | ** choice operator; optimizations: | ||
547 | ** charset / charset => charset | ||
548 | ** true / x => true, x / false => x, false / x => x | ||
549 | ** (x / true is not equivalent to true) | ||
550 | */ | ||
551 | static int lp_choice (lua_State *L) { | ||
552 | Charset st1, st2; | ||
553 | TTree *t1 = getpatt(L, 1, NULL); | ||
554 | TTree *t2 = getpatt(L, 2, NULL); | ||
555 | if (tocharset(t1, &st1) && tocharset(t2, &st2)) { | ||
556 | TTree *t = newcharset(L); | ||
557 | loopset(i, treebuffer(t)[i] = st1.cs[i] | st2.cs[i]); | ||
558 | } | ||
559 | else if (nofail(t1) || t2->tag == TFalse) | ||
560 | lua_pushvalue(L, 1); /* true / x => true, x / false => x */ | ||
561 | else if (t1->tag == TFalse) | ||
562 | lua_pushvalue(L, 2); /* false / x => x */ | ||
563 | else | ||
564 | newroot2sib(L, TChoice); | ||
565 | return 1; | ||
566 | } | ||
567 | |||
568 | |||
569 | /* | ||
570 | ** p^n | ||
571 | */ | ||
572 | static int lp_star (lua_State *L) { | ||
573 | int size1; | ||
574 | int n = (int)luaL_checkinteger(L, 2); | ||
575 | TTree *tree1 = getpatt(L, 1, &size1); | ||
576 | if (n >= 0) { /* seq tree1 (seq tree1 ... (seq tree1 (rep tree1))) */ | ||
577 | TTree *tree = newtree(L, (n + 1) * (size1 + 1)); | ||
578 | if (nullable(tree1)) | ||
579 | luaL_error(L, "loop body may accept empty string"); | ||
580 | while (n--) /* repeat 'n' times */ | ||
581 | tree = seqaux(tree, tree1, size1); | ||
582 | tree->tag = TRep; | ||
583 | memcpy(sib1(tree), tree1, size1 * sizeof(TTree)); | ||
584 | } | ||
585 | else { /* choice (seq tree1 ... choice tree1 true ...) true */ | ||
586 | TTree *tree; | ||
587 | n = -n; | ||
588 | /* size = (choice + seq + tree1 + true) * n, but the last has no seq */ | ||
589 | tree = newtree(L, n * (size1 + 3) - 1); | ||
590 | for (; n > 1; n--) { /* repeat (n - 1) times */ | ||
591 | tree->tag = TChoice; tree->u.ps = n * (size1 + 3) - 2; | ||
592 | sib2(tree)->tag = TTrue; | ||
593 | tree = sib1(tree); | ||
594 | tree = seqaux(tree, tree1, size1); | ||
595 | } | ||
596 | tree->tag = TChoice; tree->u.ps = size1 + 1; | ||
597 | sib2(tree)->tag = TTrue; | ||
598 | memcpy(sib1(tree), tree1, size1 * sizeof(TTree)); | ||
599 | } | ||
600 | copyktable(L, 1); | ||
601 | return 1; | ||
602 | } | ||
603 | |||
604 | |||
605 | /* | ||
606 | ** #p == &p | ||
607 | */ | ||
608 | static int lp_and (lua_State *L) { | ||
609 | newroot1sib(L, TAnd); | ||
610 | return 1; | ||
611 | } | ||
612 | |||
613 | |||
614 | /* | ||
615 | ** -p == !p | ||
616 | */ | ||
617 | static int lp_not (lua_State *L) { | ||
618 | newroot1sib(L, TNot); | ||
619 | return 1; | ||
620 | } | ||
621 | |||
622 | |||
623 | /* | ||
624 | ** [t1 - t2] == Seq (Not t2) t1 (registered in 'metareg' as "__sub") | ||
625 | ** If t1 and t2 are charsets, make their difference (st1 & ~st2, byte by byte). | ||
626 | */ | ||
627 | static int lp_sub (lua_State *L) { | ||
628 | Charset st1, st2; | ||
629 | int s1, s2; /* sizes of t1 and t2, in TTree elements (see the memcpy calls below) */ | ||
630 | TTree *t1 = getpatt(L, 1, &s1); | ||
631 | TTree *t2 = getpatt(L, 2, &s2); | ||
632 | if (tocharset(t1, &st1) && tocharset(t2, &st2)) { /* both operands expressible as charsets? */ | ||
633 | TTree *t = newcharset(L); | ||
634 | loopset(i, treebuffer(t)[i] = st1.cs[i] & ~st2.cs[i]); /* keep bits of st1 that are not in st2 */ | ||
635 | } | ||
636 | else { | ||
637 | TTree *tree = newtree(L, 2 + s1 + s2); /* TSeq + TNot + t2 + t1 */ | ||
638 | tree->tag = TSeq; /* sequence of... */ | ||
639 | tree->u.ps = 2 + s2; /* second child (t1) starts after TSeq, TNot and the copy of t2 */ | ||
640 | sib1(tree)->tag = TNot; /* ...not... */ | ||
641 | memcpy(sib1(sib1(tree)), t2, s2 * sizeof(TTree)); /* ...t2 */ | ||
642 | memcpy(sib2(tree), t1, s1 * sizeof(TTree)); /* ... and t1 */ | ||
643 | joinktables(L, 1, sib1(tree), 2); /* NOTE(review): presumably merges the ktables of both operands; confirm in 'joinktables' */ | ||
644 | } | ||
645 | return 1; /* one result left for Lua */ | ||
646 | } | ||
647 | |||
648 | |||
649 | static int lp_set (lua_State *L) { /* registered in 'pattreg' as "S": charset with every byte of the string argument */ | ||
650 | size_t l; | ||
651 | const char *s = luaL_checklstring(L, 1, &l); /* errors unless arg 1 is a string (or a number) */ | ||
652 | TTree *tree = newcharset(L); /* NOTE(review): assumed to start as an empty set; confirm in 'newcharset' */ | ||
653 | while (l--) { /* driven by length, so embedded zero bytes are included */ | ||
654 | setchar(treebuffer(tree), (byte)(*s)); /* cast to 'byte' keeps the bit index non-negative when 'char' is signed */ | ||
655 | s++; | ||
656 | } | ||
657 | return 1; /* one result left for Lua */ | ||
658 | } | ||
659 | |||
660 | |||
661 | static int lp_range (lua_State *L) { /* registered in 'pattreg' as "R": charset from two-character ranges */ | ||
662 | int arg; | ||
663 | int top = lua_gettop(L); /* number of range arguments (taken before the new set is created) */ | ||
664 | TTree *tree = newcharset(L); | ||
665 | for (arg = 1; arg <= top; arg++) { | ||
666 | int c; /* 'int' (not 'byte') so that 'c++' can go past 255 and end the loop */ | ||
667 | size_t l; | ||
668 | const char *r = luaL_checklstring(L, arg, &l); | ||
669 | luaL_argcheck(L, l == 2, arg, "range must have two characters"); | ||
670 | for (c = (byte)r[0]; c <= (byte)r[1]; c++) /* inclusive; adds nothing if r[0] > r[1] */ | ||
671 | setchar(treebuffer(tree), c); | ||
672 | } | ||
673 | return 1; /* one result left for Lua */ | ||
674 | } | ||
675 | |||
676 | |||
677 | /* | ||
678 | ** Look-behind predicate (registered in 'pattreg' as "B"). The pattern must have a fixed length of at most MAXBEHIND and no captures. | ||
679 | */ | ||
680 | static int lp_behind (lua_State *L) { | ||
681 | TTree *tree; | ||
682 | TTree *tree1 = getpatt(L, 1, NULL); | ||
683 | int n = fixedlen(tree1); /* a negative result is rejected by the check below */ | ||
684 | luaL_argcheck(L, n >= 0, 1, "pattern may not have fixed length"); | ||
685 | luaL_argcheck(L, !hascaptures(tree1), 1, "pattern have captures"); | ||
686 | luaL_argcheck(L, n <= MAXBEHIND, 1, "pattern too long to look behind"); | ||
687 | tree = newroot1sib(L, TBehind); | ||
688 | tree->u.n = n; /* how many bytes to go back (see TBehind in 'TTag') */ | ||
689 | return 1; /* one result left for Lua */ | ||
690 | } | ||
691 | |||
692 | |||
693 | /* | ||
694 | ** Create a non-terminal (registered in 'pattreg' as "V"): a TOpenCall leaf whose 'key' refers to the rule name given as argument 1 | ||
695 | */ | ||
696 | static int lp_V (lua_State *L) { | ||
697 | TTree *tree = newleaf(L, TOpenCall); | ||
698 | luaL_argcheck(L, !lua_isnoneornil(L, 1), 1, "non-nil value expected"); | ||
699 | tree->key = addtonewktable(L, 0, 1); /* NOTE(review): presumably stores arg 1 in a fresh ktable and returns its index; confirm in 'addtonewktable' */ | ||
700 | return 1; /* one result left for Lua */ | ||
701 | } | ||
702 | |||
703 | |||
704 | /* | ||
705 | ** Create a tree for a non-empty capture, with a body and | ||
706 | ** optionally with an associated Lua value (at index 'labelidx' in the | ||
707 | ** stack; 'labelidx' == 0 means no value, and 'key' is then set to 0). Always returns 1. | ||
708 | */ | ||
709 | static int capture_aux (lua_State *L, int cap, int labelidx) { | ||
710 | TTree *tree = newroot1sib(L, TCapture); | ||
711 | tree->cap = cap; /* kind of capture */ | ||
712 | tree->key = (labelidx == 0) ? 0 : addtonewktable(L, 1, labelidx); | ||
713 | return 1; /* so callers can 'return capture_aux(...)' directly */ | ||
714 | } | ||
715 | |||
716 | |||
717 | /* | ||
718 | ** Fill a tree with an empty capture, using an empty (TTrue) sibling. | ||
719 | ** (The 'key' field must be filled by the caller to finish the tree.) Writes two elements ('tree' and the one after it) and returns 'tree'. | ||
720 | */ | ||
721 | static TTree *auxemptycap (TTree *tree, int cap) { | ||
722 | tree->tag = TCapture; | ||
723 | tree->cap = cap; | ||
724 | sib1(tree)->tag = TTrue; /* capture body: the element right after 'tree' */ | ||
725 | return tree; | ||
726 | } | ||
727 | |||
728 | |||
729 | /* | ||
730 | ** Create a tree for an empty capture, storing 'key' directly in the node (no ktable entry is added here). | ||
731 | */ | ||
732 | static TTree *newemptycap (lua_State *L, int cap, int key) { | ||
733 | TTree *tree = auxemptycap(newtree(L, 2), cap); /* 2 elements: TCapture + TTrue */ | ||
734 | tree->key = key; | ||
735 | return tree; | ||
736 | } | ||
737 | |||
738 | |||
739 | /* | ||
740 | ** Create a tree for an empty capture with an associated Lua value (taken from stack index 'idx'). | ||
741 | */ | ||
742 | static TTree *newemptycapkey (lua_State *L, int cap, int idx) { | ||
743 | TTree *tree = auxemptycap(newtree(L, 2), cap); /* 2 elements: TCapture + TTrue */ | ||
744 | tree->key = addtonewktable(L, 0, idx); | ||
745 | return tree; | ||
746 | } | ||
747 | |||
748 | |||
749 | /* | ||
750 | ** Captures with syntax p / v (registered in 'metareg' as "__div") | ||
751 | ** (function capture, query capture, string capture, or number capture) | ||
752 | */ | ||
753 | static int lp_divcapture (lua_State *L) { | ||
754 | switch (lua_type(L, 2)) { /* dispatch on the type of the right operand */ | ||
755 | case LUA_TFUNCTION: return capture_aux(L, Cfunction, 2); | ||
756 | case LUA_TTABLE: return capture_aux(L, Cquery, 2); | ||
757 | case LUA_TSTRING: return capture_aux(L, Cstring, 2); | ||
758 | case LUA_TNUMBER: { | ||
759 | lua_Integer n = lua_tointeger(L, 2); /* keep full width: narrowing to 'int' first could wrap an out-of-range value into range */ | ||
760 | TTree *tree = newroot1sib(L, TCapture); | ||
761 | luaL_argcheck(L, 0 <= n && n <= SHRT_MAX, 2, "invalid number"); /* the number is argument 2 */ | ||
762 | tree->cap = Cnum; | ||
763 | tree->key = (unsigned short)n; /* for Cnum, 'key' holds the number itself, not a ktable index */ | ||
764 | return 1; | ||
765 | } | ||
766 | default: return luaL_argerror(L, 2, "invalid replacement value"); | ||
767 | } | ||
768 | } | ||
769 | |||
770 | |||
771 | static int lp_substcapture (lua_State *L) { /* registered in 'pattreg' as "Cs" */ | ||
772 | return capture_aux(L, Csubst, 0); /* 0: no associated Lua value */ | ||
773 | } | ||
774 | |||
775 | |||
776 | static int lp_tablecapture (lua_State *L) { /* registered in 'pattreg' as "Ct" */ | ||
777 | return capture_aux(L, Ctable, 0); /* 0: no associated Lua value */ | ||
778 | } | ||
779 | |||
780 | |||
781 | static int lp_groupcapture (lua_State *L) { /* registered in 'pattreg' as "Cg" */ | ||
782 | if (lua_isnoneornil(L, 2)) /* no second argument? */ | ||
783 | return capture_aux(L, Cgroup, 0); /* group without an associated value */ | ||
784 | else | ||
785 | return capture_aux(L, Cgroup, 2); /* group with the value at index 2 attached */ | ||
786 | } | ||
787 | |||
788 | |||
789 | static int lp_foldcapture (lua_State *L) { /* registered in 'pattreg' as "Cf" */ | ||
790 | luaL_checktype(L, 2, LUA_TFUNCTION); /* second argument must be a function */ | ||
791 | return capture_aux(L, Cfold, 2); | ||
792 | } | ||
793 | |||
794 | |||
795 | static int lp_simplecapture (lua_State *L) { /* registered in 'pattreg' as "C" */ | ||
796 | return capture_aux(L, Csimple, 0); /* 0: no associated Lua value */ | ||
797 | } | ||
798 | |||
799 | |||
800 | static int lp_poscapture (lua_State *L) { /* registered in 'pattreg' as "Cp" */ | ||
801 | newemptycap(L, Cposition, 0); /* empty capture with key 0 */ | ||
802 | return 1; /* one result left for Lua */ | ||
803 | } | ||
804 | |||
805 | |||
806 | static int lp_argcapture (lua_State *L) { /* registered in 'pattreg' as "Carg" */ | ||
807 | int n = (int)luaL_checkinteger(L, 1); | ||
808 | luaL_argcheck(L, 0 < n && n <= SHRT_MAX, 1, "invalid argument index"); | ||
809 | newemptycap(L, Carg, n); /* the index is stored directly in 'key' (no ktable entry) */ | ||
810 | return 1; /* one result left for Lua */ | ||
811 | } | ||
812 | |||
813 | |||
814 | static int lp_backref (lua_State *L) { /* registered in 'pattreg' as "Cb" */ | ||
815 | luaL_checkany(L, 1); /* an argument is required (any type) */ | ||
816 | newemptycapkey(L, Cbackref, 1); /* empty capture whose key refers to the value at index 1 */ | ||
817 | return 1; /* one result left for Lua */ | ||
818 | } | ||
819 | |||
820 | |||
821 | /* | ||
822 | ** Constant capture (registered in 'pattreg' as "Cc"); builds TTrue for no values, a single Cconst for one value, or a Cgroup around a sequence of Cconst captures for several | ||
823 | */ | ||
824 | static int lp_constcapture (lua_State *L) { | ||
825 | int i; | ||
826 | int n = lua_gettop(L); /* number of values */ | ||
827 | if (n == 0) /* no values? */ | ||
828 | newleaf(L, TTrue); /* no capture */ | ||
829 | else if (n == 1) | ||
830 | newemptycapkey(L, Cconst, 1); /* single constant capture */ | ||
831 | else { /* create a group capture with all values */ | ||
832 | TTree *tree = newtree(L, 1 + 3 * (n - 1) + 2); /* group + (TSeq, TCapture, TTrue) * (n - 1) + last (TCapture, TTrue) */ | ||
833 | newktable(L, n); /* create a 'ktable' for new tree */ | ||
834 | tree->tag = TCapture; | ||
835 | tree->cap = Cgroup; | ||
836 | tree->key = 0; /* group has no associated value */ | ||
837 | tree = sib1(tree); /* move to the group's body */ | ||
838 | for (i = 1; i <= n - 1; i++) { /* all values but the last */ | ||
839 | tree->tag = TSeq; | ||
840 | tree->u.ps = 3; /* skip TCapture and its sibling */ | ||
841 | auxemptycap(sib1(tree), Cconst); | ||
842 | sib1(tree)->key = addtoktable(L, i); | ||
843 | tree = sib2(tree); /* continue at the second child of the sequence */ | ||
844 | } | ||
845 | auxemptycap(tree, Cconst); /* last value needs no TSeq */ | ||
846 | tree->key = addtoktable(L, i); /* here i == n */ | ||
847 | } | ||
848 | return 1; /* one result left for Lua */ | ||
849 | } | ||
850 | |||
851 | |||
852 | static int lp_matchtime (lua_State *L) { /* registered in 'pattreg' as "Cmt": match-time (run-time) capture */ | ||
853 | TTree *tree; | ||
854 | luaL_checktype(L, 2, LUA_TFUNCTION); /* second argument must be a function */ | ||
855 | tree = newroot1sib(L, TRunTime); | ||
856 | tree->key = addtonewktable(L, 1, 2); /* key refers to the function at index 2 */ | ||
857 | return 1; /* one result left for Lua */ | ||
858 | } | ||
859 | |||
860 | /* }====================================================== */ | ||
861 | |||
862 | |||
863 | /* | ||
864 | ** {====================================================== | ||
865 | ** Grammar - Tree generation | ||
866 | ** ======================================================= | ||
867 | */ | ||
868 | |||
869 | /* | ||
870 | ** push on the stack the index and the pattern for the | ||
871 | ** initial rule of grammar at index 'arg' in the stack; | ||
872 | ** also add that index into position table (at stack index 'postab'), with position 1. | ||
873 | */ | ||
874 | static void getfirstrule (lua_State *L, int arg, int postab) { | ||
875 | lua_rawgeti(L, arg, 1); /* access first element */ | ||
876 | if (lua_isstring(L, -1)) { /* is it the name of initial rule? */ | ||
877 | lua_pushvalue(L, -1); /* duplicate it to use as key */ | ||
878 | lua_gettable(L, arg); /* get associated rule */ | ||
879 | } | ||
880 | else { /* grammar[1] is taken as the initial rule itself */ | ||
881 | lua_pushinteger(L, 1); /* key for initial rule */ | ||
882 | lua_insert(L, -2); /* put it before rule */ | ||
883 | } | ||
884 | if (!testpattern(L, -1)) { /* initial rule not a pattern? */ | ||
885 | if (lua_isnil(L, -1)) | ||
886 | luaL_error(L, "grammar has no initial rule"); | ||
887 | else | ||
888 | luaL_error(L, "initial rule '%s' is not a pattern", lua_tostring(L, -2)); | ||
889 | } | ||
890 | lua_pushvalue(L, -2); /* push key */ | ||
891 | lua_pushinteger(L, 1); /* push rule position (after TGrammar) */ | ||
892 | lua_settable(L, postab); /* insert pair at position table */ | ||
893 | } | ||
894 | |||
895 | /* | ||
896 | ** traverse grammar at index 'arg', pushing all its keys and patterns | ||
897 | ** into the stack. Create a new table (before all pairs key-pattern) to | ||
898 | ** collect all keys and their associated positions in the final tree | ||
899 | ** (the "position table"). The initial rule is always the first pair. | ||
900 | ** Return the number of rules and (in 'totalsize') the total size | ||
901 | ** for the new tree. | ||
902 | */ | ||
903 | static int collectrules (lua_State *L, int arg, int *totalsize) { | ||
904 | int n = 1; /* to count number of rules */ | ||
905 | int postab = lua_gettop(L) + 1; /* index of position table */ | ||
906 | int size; /* accumulator for total size */ | ||
907 | lua_newtable(L); /* create position table */ | ||
908 | getfirstrule(L, arg, postab); /* leaves key at postab + 1 and pattern at postab + 2 */ | ||
909 | size = 2 + getsize(L, postab + 2); /* TGrammar + TRule + rule */ | ||
910 | lua_pushnil(L); /* prepare to traverse grammar table */ | ||
911 | while (lua_next(L, arg) != 0) { | ||
912 | if (lua_tonumber(L, -2) == 1 || | ||
913 | lp_equal(L, -2, postab + 1)) { /* initial rule? */ | ||
914 | lua_pop(L, 1); /* remove value (keep key for lua_next) */ | ||
915 | continue; | ||
916 | } | ||
917 | if (!testpattern(L, -1)) /* value is not a pattern? */ | ||
918 | luaL_error(L, "rule '%s' is not a pattern", val2str(L, -2)); | ||
919 | luaL_checkstack(L, LUA_MINSTACK, "grammar has too many rules"); | ||
920 | lua_pushvalue(L, -2); /* push key (to insert into position table) */ | ||
921 | lua_pushinteger(L, size); /* position of this rule's TRule node in the final tree */ | ||
922 | lua_settable(L, postab); | ||
923 | size += 1 + getsize(L, -1); /* update size (TRule + rule) */ | ||
924 | lua_pushvalue(L, -2); /* push key (for next lua_next); the key-pattern pair stays below it */ | ||
925 | n++; | ||
926 | } | ||
927 | *totalsize = size + 1; /* TTrue to finish list of rules */ | ||
928 | return n; | ||
929 | } | ||
930 | |||
931 | |||
932 | static void buildgrammar (lua_State *L, TTree *grammar, int frule, int n) { /* copy the 'n' rules (key-pattern pairs on the stack starting at 'frule') into 'grammar' */ | ||
933 | int i; | ||
934 | TTree *nd = sib1(grammar); /* auxiliary pointer to traverse the tree */ | ||
935 | for (i = 0; i < n; i++) { /* add each rule into new tree */ | ||
936 | int ridx = frule + 2*i + 1; /* index of i-th rule (its key is at ridx - 1) */ | ||
937 | int rulesize; | ||
938 | TTree *rn = gettree(L, ridx, &rulesize); | ||
939 | nd->tag = TRule; | ||
940 | nd->key = 0; /* will be fixed when rule is used */ | ||
941 | nd->cap = i; /* rule number */ | ||
942 | nd->u.ps = rulesize + 1; /* point to next rule */ | ||
943 | memcpy(sib1(nd), rn, rulesize * sizeof(TTree)); /* copy rule */ | ||
944 | mergektable(L, ridx, sib1(nd)); /* merge its ktable into new one */ | ||
945 | nd = sib2(nd); /* move to next rule */ | ||
946 | } | ||
947 | nd->tag = TTrue; /* finish list of rules */ | ||
948 | } | ||
949 | |||
950 | |||
951 | /* | ||
952 | ** Check whether a tree has potential infinite loops, i.e., a repetition (TRep) whose body is nullable. Returns 1 if one is found, 0 otherwise. | ||
953 | */ | ||
954 | static int checkloops (TTree *tree) { | ||
955 | tailcall: | ||
956 | if (tree->tag == TRep && nullable(sib1(tree))) | ||
957 | return 1; /* repetition of a pattern that can match the empty string */ | ||
958 | else if (tree->tag == TGrammar) | ||
959 | return 0; /* sub-grammars already checked */ | ||
960 | else { | ||
961 | switch (numsiblings[tree->tag]) { | ||
962 | case 1: /* return checkloops(sib1(tree)); */ | ||
963 | tree = sib1(tree); goto tailcall; | ||
964 | case 2: | ||
965 | if (checkloops(sib1(tree))) return 1; | ||
966 | /* else return checkloops(sib2(tree)); */ | ||
967 | tree = sib2(tree); goto tailcall; | ||
968 | default: assert(numsiblings[tree->tag] == 0); return 0; /* leaf: nothing to check */ | ||
969 | } | ||
970 | } | ||
971 | } | ||
972 | |||
973 | |||
974 | /* | ||
975 | ** Give appropriate error message for 'verifyrule'. If a rule appears | ||
976 | ** twice in 'passed', there is a path from it back to itself without | ||
977 | ** advancing the subject. Assumes the ktable is at the top of the stack. Never returns normally (both exits raise a Lua error). | ||
978 | */ | ||
979 | static int verifyerror (lua_State *L, int *passed, int npassed) { | ||
980 | int i, j; | ||
981 | for (i = npassed - 1; i >= 0; i--) { /* search for a repetition */ | ||
982 | for (j = i - 1; j >= 0; j--) { | ||
983 | if (passed[i] == passed[j]) { | ||
984 | lua_rawgeti(L, -1, passed[i]); /* get rule's key */ | ||
985 | return luaL_error(L, "rule '%s' may be left recursive", val2str(L, -1)); | ||
986 | } | ||
987 | } | ||
988 | } | ||
989 | return luaL_error(L, "too many left calls in grammar"); /* limit reached without any repeated rule */ | ||
990 | } | ||
991 | |||
992 | |||
993 | /* | ||
994 | ** Check whether a rule can be left recursive; raise an error in that | ||
995 | ** case; otherwise return 1 iff pattern is nullable. | ||
996 | ** The return value is used to check sequences, where the second pattern | ||
997 | ** is only relevant if the first is nullable. | ||
998 | ** Parameter 'nb' works as an accumulator, to allow tail calls in | ||
999 | ** choices. (A true 'nb' makes the function return true.) | ||
1000 | ** Parameter 'passed' is a list of already visited rules, 'npassed' | ||
1001 | ** counts the elements in 'passed'. | ||
1002 | ** Assume ktable at the top of the stack. | ||
1003 | */ | ||
1004 | static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed, | ||
1005 | int nb) { | ||
1006 | tailcall: | ||
1007 | switch (tree->tag) { | ||
1008 | case TChar: case TSet: case TAny: | ||
1009 | case TFalse: | ||
1010 | return nb; /* cannot pass from here */ | ||
1011 | case TTrue: | ||
1012 | case TBehind: /* look-behind cannot have calls */ | ||
1013 | return 1; | ||
1014 | case TNot: case TAnd: case TRep: | ||
1015 | /* return verifyrule(L, sib1(tree), passed, npassed, 1); */ | ||
1016 | tree = sib1(tree); nb = 1; goto tailcall; | ||
1017 | case TCapture: case TRunTime: | ||
1018 | /* return verifyrule(L, sib1(tree), passed, npassed, nb); */ | ||
1019 | tree = sib1(tree); goto tailcall; | ||
1020 | case TCall: | ||
1021 | /* return verifyrule(L, sib2(tree), passed, npassed, nb); */ | ||
1022 | tree = sib2(tree); goto tailcall; | ||
1023 | case TSeq: /* only check 2nd child if first is nb */ | ||
1024 | if (!verifyrule(L, sib1(tree), passed, npassed, 0)) | ||
1025 | return nb; | ||
1026 | /* else return verifyrule(L, sib2(tree), passed, npassed, nb); */ | ||
1027 | tree = sib2(tree); goto tailcall; | ||
1028 | case TChoice: /* must check both children */ | ||
1029 | nb = verifyrule(L, sib1(tree), passed, npassed, nb); | ||
1030 | /* return verifyrule(L, sib2(tree), passed, npassed, nb); */ | ||
1031 | tree = sib2(tree); goto tailcall; | ||
1032 | case TRule: | ||
1033 | if (npassed >= MAXRULES) /* 'passed' is full */ | ||
1034 | return verifyerror(L, passed, npassed); | ||
1035 | else { | ||
1036 | passed[npassed++] = tree->key; /* record the visit; 'npassed' is local, so only this path sees it */ | ||
1037 | /* return verifyrule(L, sib1(tree), passed, npassed, nb); */ | ||
1038 | tree = sib1(tree); goto tailcall; | ||
1039 | } | ||
1040 | case TGrammar: | ||
1041 | return nullable(tree); /* sub-grammar cannot be left recursive */ | ||
1042 | default: assert(0); return 0; | ||
1043 | } | ||
1044 | } | ||
1045 | |||
1046 | |||
1047 | static void verifygrammar (lua_State *L, TTree *grammar) { /* raise a Lua error if a used rule may be left recursive or contains an empty loop; assumes ktable at the top of the stack */ | ||
1048 | int passed[MAXRULES]; /* scratch list of visited rules for 'verifyrule' */ | ||
1049 | TTree *rule; | ||
1050 | /* check left-recursive rules */ | ||
1051 | for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) { | ||
1052 | if (rule->key == 0) continue; /* unused rule */ | ||
1053 | verifyrule(L, sib1(rule), passed, 0, 0); /* result ignored: only the possible error matters here */ | ||
1054 | } | ||
1055 | assert(rule->tag == TTrue); /* list of rules ends with TTrue */ | ||
1056 | /* check infinite loops inside rules */ | ||
1057 | for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) { | ||
1058 | if (rule->key == 0) continue; /* unused rule */ | ||
1059 | if (checkloops(sib1(rule))) { | ||
1060 | lua_rawgeti(L, -1, rule->key); /* get rule's key */ | ||
1061 | luaL_error(L, "empty loop in rule '%s'", val2str(L, -1)); | ||
1062 | } | ||
1063 | } | ||
1064 | assert(rule->tag == TTrue); | ||
1065 | } | ||
1066 | |||
1067 | |||
1068 | /* | ||
1069 | ** Give a name for the initial rule if it is not referenced (so it is not treated as unused, i.e. key == 0). Assumes ktable at the top of the stack; 'frule' is the stack index of the initial rule's key. | ||
1070 | */ | ||
1071 | static void initialrulename (lua_State *L, TTree *grammar, int frule) { | ||
1072 | if (sib1(grammar)->key == 0) { /* initial rule is not referenced? */ | ||
1073 | int n = lua_rawlen(L, -1) + 1; /* index for name */ | ||
1074 | lua_pushvalue(L, frule); /* rule's name */ | ||
1075 | lua_rawseti(L, -2, n); /* ktable was on the top of the stack */ | ||
1076 | sib1(grammar)->key = n; | ||
1077 | } | ||
1078 | } | ||
1079 | |||
1080 | |||
1081 | static TTree *newgrammar (lua_State *L, int arg) { /* build and verify a TGrammar tree from the grammar table at index 'arg' */ | ||
1082 | int treesize; | ||
1083 | int frule = lua_gettop(L) + 2; /* position of first rule's key */ | ||
1084 | int n = collectrules(L, arg, &treesize); /* pushes position table + n key-pattern pairs */ | ||
1085 | TTree *g = newtree(L, treesize); | ||
1086 | luaL_argcheck(L, n <= MAXRULES, arg, "grammar has too many rules"); | ||
1087 | g->tag = TGrammar; g->u.n = n; | ||
1088 | lua_newtable(L); /* create 'ktable' */ | ||
1089 | lua_setuservalue(L, -2); | ||
1090 | buildgrammar(L, g, frule, n); | ||
1091 | lua_getuservalue(L, -1); /* get 'ktable' for new tree */ | ||
1092 | finalfix(L, frule - 1, g, sib1(g)); /* frule - 1 is the position table */ | ||
1093 | initialrulename(L, g, frule); | ||
1094 | verifygrammar(L, g); | ||
1095 | lua_pop(L, 1); /* remove 'ktable' */ | ||
1096 | lua_insert(L, -(n * 2 + 2)); /* move new table to proper position */ | ||
1097 | lua_pop(L, n * 2 + 1); /* remove position table + rule pairs */ | ||
1098 | return g; /* new table at the top of the stack */ | ||
1099 | } | ||
1100 | |||
1101 | /* }====================================================== */ | ||
1102 | |||
1103 | |||
1104 | static Instruction *prepcompile (lua_State *L, Pattern *p, int idx) { /* run 'finalfix' on the tree of 'p' (pattern at stack index 'idx'), then compile it */ | ||
1105 | lua_getuservalue(L, idx); /* push 'ktable' (may be used by 'finalfix') */ | ||
1106 | finalfix(L, 0, NULL, p->tree); /* no position table and no enclosing grammar */ | ||
1107 | lua_pop(L, 1); /* remove 'ktable' */ | ||
1108 | return compile(L, p); | ||
1109 | } | ||
1110 | |||
1111 | |||
1112 | static int lp_printtree (lua_State *L) { /* registered in 'pattreg' as "ptree": print the ktable and tree of the pattern at index 1 */ | ||
1113 | TTree *tree = getpatt(L, 1, NULL); | ||
1114 | int c = lua_toboolean(L, 2); /* optional flag: apply 'finalfix' before printing */ | ||
1115 | if (c) { | ||
1116 | lua_getuservalue(L, 1); /* push 'ktable' (may be used by 'finalfix') */ | ||
1117 | finalfix(L, 0, NULL, tree); | ||
1118 | lua_pop(L, 1); /* remove 'ktable' */ | ||
1119 | } | ||
1120 | printktable(L, 1); | ||
1121 | printtree(tree, 0); | ||
1122 | return 0; /* no results */ | ||
1123 | } | ||
1124 | |||
1125 | |||
1126 | static int lp_printcode (lua_State *L) { /* registered in 'pattreg' as "pcode": print the ktable and compiled code of the pattern at index 1 */ | ||
1127 | Pattern *p = getpattern(L, 1); | ||
1128 | printktable(L, 1); | ||
1129 | if (p->code == NULL) /* not compiled yet? */ | ||
1130 | prepcompile(L, p, 1); /* NOTE(review): relies on compilation filling p->code and p->codesize; confirm in 'compile' */ | ||
1131 | printpatt(p->code, p->codesize); | ||
1132 | return 0; /* no results */ | ||
1133 | } | ||
1134 | |||
1135 | |||
1136 | /* | ||
1137 | ** Get the initial position for the match (optional argument 3, default 1), interpreting negative | ||
1138 | ** values from the end of the subject. Returns a 0-based offset in [0, len]. | ||
1139 | */ | ||
1140 | static size_t initposition (lua_State *L, size_t len) { | ||
1141 | lua_Integer ii = luaL_optinteger(L, 3, 1); | ||
1142 | if (ii > 0) { /* positive index? */ | ||
1143 | if ((size_t)ii <= len) /* inside the string? */ | ||
1144 | return (size_t)ii - 1; /* return it (corrected to 0-base) */ | ||
1145 | else return len; /* crop at the end */ | ||
1146 | } | ||
1147 | else { /* zero or negative index (zero yields 'len', the end of the subject) */ | ||
1148 | if (((size_t)0 - (size_t)ii) <= len) /* inside the string? (unsigned negation: no overflow for the minimum integer) */ | ||
1149 | return len - ((size_t)0 - (size_t)ii); /* return position from the end */ | ||
1150 | else return 0; /* crop at the beginning */ | ||
1151 | } | ||
1152 | } | ||
1153 | |||
1154 | |||
1155 | /* | ||
1156 | ** Main match function (registered in 'pattreg' as "match"): arguments are pattern, subject and optional initial position | ||
1157 | */ | ||
1158 | static int lp_match (lua_State *L) { | ||
1159 | Capture capture[INITCAPSIZE]; /* initial capture list lives on the C stack */ | ||
1160 | const char *r; | ||
1161 | size_t l; | ||
1162 | Pattern *p = (getpatt(L, 1, NULL), getpattern(L, 1)); /* NOTE(review): 'getpatt' is called only for its effect, presumably converting arg 1 into a pattern in place; confirm */ | ||
1163 | Instruction *code = (p->code != NULL) ? p->code : prepcompile(L, p, 1); /* compile on first use */ | ||
1164 | const char *s = luaL_checklstring(L, SUBJIDX, &l); | ||
1165 | size_t i = initposition(L, l); | ||
1166 | int ptop = lua_gettop(L); /* the three pushes below land at ptop + 1 .. ptop + 3 */ | ||
1167 | lua_pushnil(L); /* initialize subscache */ | ||
1168 | lua_pushlightuserdata(L, capture); /* initialize caplistidx */ | ||
1169 | lua_getuservalue(L, 1); /* initialize penvidx */ | ||
1170 | r = match(L, s, s + i, s + l, code, capture, ptop); | ||
1171 | if (r == NULL) { /* match failed */ | ||
1172 | lua_pushnil(L); | ||
1173 | return 1; | ||
1174 | } | ||
1175 | return getcaptures(L, s, r, ptop); | ||
1176 | } | ||
1177 | |||
1178 | |||
1179 | |||
1180 | /* | ||
1181 | ** {====================================================== | ||
1182 | ** Library creation and functions not related to matching | ||
1183 | ** ======================================================= | ||
1184 | */ | ||
1185 | |||
1186 | /* maximum limit for stack size (upper bound accepted by 'lp_setmax') */ | ||
1187 | #define MAXLIM (INT_MAX / 100) | ||
1188 | |||
1189 | static int lp_setmax (lua_State *L) { /* registered in 'pattreg' as "setmaxstack" */ | ||
1190 | lua_Integer lim = luaL_checkinteger(L, 1); | ||
1191 | luaL_argcheck(L, 0 < lim && lim <= MAXLIM, 1, "out of range"); | ||
1192 | lua_settop(L, 1); /* leave only the limit on the stack */ | ||
1193 | lua_setfield(L, LUA_REGISTRYINDEX, MAXSTACKIDX); /* registry[MAXSTACKIDX] = limit */ | ||
1194 | return 0; /* no results */ | ||
1195 | } | ||
1196 | |||
1197 | |||
1198 | static int lp_version (lua_State *L) { /* registered in 'pattreg' as "version" */ | ||
1199 | lua_pushstring(L, VERSION); /* the VERSION string defined in lptypes.h */ | ||
1200 | return 1; /* one result left for Lua */ | ||
1201 | } | ||
1202 | |||
1203 | |||
1204 | static int lp_type (lua_State *L) { /* registered in 'pattreg' as "type": "pattern" if arg 1 passes 'testpattern', nil otherwise */ | ||
1205 | if (testpattern(L, 1)) | ||
1206 | lua_pushliteral(L, "pattern"); | ||
1207 | else | ||
1208 | lua_pushnil(L); | ||
1209 | return 1; /* one result left for Lua */ | ||
1210 | } | ||
1211 | |||
1212 | |||
1213 | int lp_gc (lua_State *L) { /* registered in 'metareg' as "__gc"; note it is not 'static', unlike its neighbours */ | ||
1214 | Pattern *p = getpattern(L, 1); | ||
1215 | realloccode(L, p, 0); /* delete code block */ | ||
1216 | return 0; /* no results */ | ||
1217 | } | ||
1218 | |||
1219 | |||
1220 | static void createcat (lua_State *L, const char *catname, int (catf) (int)) { /* store under 'catname' a charset with every byte value for which 'catf' is true */ | ||
1221 | TTree *t = newcharset(L); | ||
1222 | int i; | ||
1223 | for (i = 0; i <= UCHAR_MAX; i++) /* every possible byte value */ | ||
1224 | if (catf(i)) setchar(treebuffer(t), i); | ||
1225 | lua_setfield(L, -2, catname); /* NOTE(review): assumes 'newcharset' left the new pattern on top, right above the target table; confirm */ | ||
1226 | } | ||
1227 | |||
1228 | |||
1229 | static int lp_locale (lua_State *L) { /* registered in 'pattreg' as "locale": fill a table (arg 1, or a new one) with charsets built from the <ctype.h> predicates */ | ||
1230 | if (lua_isnoneornil(L, 1)) { /* no table given? */ | ||
1231 | lua_settop(L, 0); | ||
1232 | lua_createtable(L, 0, 12); /* size hint only; 11 fields are set below */ | ||
1233 | } | ||
1234 | else { | ||
1235 | luaL_checktype(L, 1, LUA_TTABLE); | ||
1236 | lua_settop(L, 1); /* discard extra arguments */ | ||
1237 | } | ||
1238 | createcat(L, "alnum", isalnum); | ||
1239 | createcat(L, "alpha", isalpha); | ||
1240 | createcat(L, "cntrl", iscntrl); | ||
1241 | createcat(L, "digit", isdigit); | ||
1242 | createcat(L, "graph", isgraph); | ||
1243 | createcat(L, "lower", islower); | ||
1244 | createcat(L, "print", isprint); | ||
1245 | createcat(L, "punct", ispunct); | ||
1246 | createcat(L, "space", isspace); | ||
1247 | createcat(L, "upper", isupper); | ||
1248 | createcat(L, "xdigit", isxdigit); | ||
1249 | return 1; /* the table */ | ||
1250 | } | ||
1251 | |||
1252 | |||
1253 | static struct luaL_Reg pattreg[] = { /* library functions; 'luaopen_lpeg' also installs this table as the metatable's "__index" */ | ||
1254 | {"ptree", lp_printtree}, | ||
1255 | {"pcode", lp_printcode}, | ||
1256 | {"match", lp_match}, | ||
1257 | {"B", lp_behind}, | ||
1258 | {"V", lp_V}, | ||
1259 | {"C", lp_simplecapture}, | ||
1260 | {"Cc", lp_constcapture}, | ||
1261 | {"Cmt", lp_matchtime}, | ||
1262 | {"Cb", lp_backref}, | ||
1263 | {"Carg", lp_argcapture}, | ||
1264 | {"Cp", lp_poscapture}, | ||
1265 | {"Cs", lp_substcapture}, | ||
1266 | {"Ct", lp_tablecapture}, | ||
1267 | {"Cf", lp_foldcapture}, | ||
1268 | {"Cg", lp_groupcapture}, | ||
1269 | {"P", lp_P}, | ||
1270 | {"S", lp_set}, | ||
1271 | {"R", lp_range}, | ||
1272 | {"locale", lp_locale}, | ||
1273 | {"version", lp_version}, | ||
1274 | {"setmaxstack", lp_setmax}, | ||
1275 | {"type", lp_type}, | ||
1276 | {NULL, NULL} /* sentinel */ | ||
1277 | }; | ||
1278 | |||
1279 | |||
1280 | static struct luaL_Reg metareg[] = { /* metamethods installed in the PATTERN_T metatable by 'luaopen_lpeg' */ | ||
1281 | {"__mul", lp_seq}, | ||
1282 | {"__add", lp_choice}, | ||
1283 | {"__pow", lp_star}, | ||
1284 | {"__gc", lp_gc}, | ||
1285 | {"__len", lp_and}, | ||
1286 | {"__div", lp_divcapture}, | ||
1287 | {"__unm", lp_not}, | ||
1288 | {"__sub", lp_sub}, | ||
1289 | {NULL, NULL} /* sentinel */ | ||
1290 | }; | ||
1291 | |||
1292 | |||
1293 | int luaopen_lpeg (lua_State *L); /* prototype precedes the definition */ | ||
1294 | int luaopen_lpeg (lua_State *L) { /* library entry point: returns the library table */ | ||
1295 | luaL_newmetatable(L, PATTERN_T); /* metatable for patterns */ | ||
1296 | lua_pushnumber(L, MAXBACK); /* initialize maximum backtracking */ | ||
1297 | lua_setfield(L, LUA_REGISTRYINDEX, MAXSTACKIDX); /* same registry slot that 'lp_setmax' updates */ | ||
1298 | luaL_setfuncs(L, metareg, 0); /* metamethods into the metatable */ | ||
1299 | luaL_newlib(L, pattreg); /* library table */ | ||
1300 | lua_pushvalue(L, -1); | ||
1301 | lua_setfield(L, -3, "__index"); /* metatable.__index = library table */ | ||
1302 | return 1; | ||
1303 | } | ||
1304 | |||
1305 | /* }====================================================== */ | ||
diff --git a/lptree.h b/lptree.h new file mode 100644 index 0000000..34ee15c --- /dev/null +++ b/lptree.h | |||
@@ -0,0 +1,82 @@ | |||
1 | /* | ||
2 | ** $Id: lptree.h,v 1.3 2016/09/13 18:07:51 roberto Exp $ | ||
3 | */ | ||
4 | |||
5 | #if !defined(lptree_h) | ||
6 | #define lptree_h | ||
7 | |||
8 | |||
9 | #include "lptypes.h" | ||
10 | |||
11 | |||
12 | /* | ||
13 | ** types of trees (tags stored in field 'tag' of 'TTree') | ||
14 | */ | ||
15 | typedef enum TTag { | ||
16 | TChar = 0, /* 'n' = char */ | ||
17 | TSet, /* the set is stored in next CHARSETSIZE bytes */ | ||
18 | TAny, /* NOTE(review): presumably matches any single character; confirm */ | ||
19 | TTrue, /* NOTE(review): presumably always succeeds without consuming input; confirm */ | ||
20 | TFalse, /* NOTE(review): presumably always fails; confirm */ | ||
21 | TRep, /* 'sib1'* */ | ||
22 | TSeq, /* 'sib1' 'sib2' */ | ||
23 | TChoice, /* 'sib1' / 'sib2' */ | ||
24 | TNot, /* !'sib1' */ | ||
25 | TAnd, /* &'sib1' */ | ||
26 | TCall, /* ktable[key] is rule's key; 'sib2' is rule being called */ | ||
27 | TOpenCall, /* ktable[key] is rule's key */ | ||
28 | TRule, /* ktable[key] is rule's key (but key == 0 for unused rules); | ||
29 | 'sib1' is rule's pattern; | ||
30 | 'sib2' is next rule; 'cap' is rule's sequential number */ | ||
31 | TGrammar, /* 'sib1' is initial (and first) rule */ | ||
32 | TBehind, /* 'sib1' is pattern, 'n' is how much to go back */ | ||
33 | TCapture, /* captures: 'cap' is kind of capture (enum 'CapKind'); | ||
34 | ktable[key] is Lua value associated with capture; | ||
35 | 'sib1' is capture body */ | ||
36 | TRunTime /* run-time capture: 'key' is Lua function; | ||
37 | 'sib1' is capture body */ | ||
38 | } TTag; | ||
39 | |||
40 | |||
41 | /* | ||
42 | ** Tree trees (one 'TTree' is one node; a pattern is an array of nodes) | ||
43 | ** The first child of a tree (if there is one) is immediately after | ||
44 | ** the tree. A reference to a second child (ps) is its position | ||
45 | ** relative to the position of the tree itself (see macros 'sib1' and 'sib2'). | ||
46 | */ | ||
47 | typedef struct TTree { | ||
48 | byte tag; /* a 'TTag' value */ | ||
49 | byte cap; /* kind of capture (if it is a capture) */ | ||
50 | unsigned short key; /* key in ktable for Lua data (0 if no key) */ | ||
51 | union { | ||
52 | int ps; /* occasional second child */ | ||
53 | int n; /* occasional counter */ | ||
54 | } u; | ||
55 | } TTree; | ||
56 | |||
57 | |||
58 | /* | ||
59 | ** A complete pattern has its tree plus, if already compiled, | ||
60 | ** its corresponding code ('code' is NULL while the pattern is not compiled) | ||
61 | */ | ||
62 | typedef struct Pattern { | ||
63 | union Instruction *code; | ||
64 | int codesize; | ||
65 | TTree tree[1]; /* NOTE(review): declared with one element; presumably allocated with room for the whole tree; confirm in 'newtree' */ | ||
66 | } Pattern; | ||
67 | |||
68 | |||
69 | /* number of children for each tree (indexed by tag) */ | ||
70 | extern const byte numsiblings[]; | ||
71 | |||
72 | /* access to children: first child is the next element; second child is 'u.ps' elements after the node */ | ||
73 | #define sib1(t) ((t) + 1) | ||
74 | #define sib2(t) ((t) + (t)->u.ps) | ||
75 | |||
76 | |||
77 | |||
78 | |||
79 | |||
80 | |||
81 | #endif | ||
82 | |||
diff --git a/lptypes.h b/lptypes.h new file mode 100644 index 0000000..5226970 --- /dev/null +++ b/lptypes.h | |||
@@ -0,0 +1,145 @@ | |||
1 | /* | ||
2 | ** $Id: lptypes.h,v 1.17 2017/12/14 16:56:27 roberto Exp $ | ||
3 | ** LPeg - PEG pattern matching for Lua | ||
4 | ** Copyright 2007-2017, Lua.org & PUC-Rio (see 'lpeg.html' for license) | ||
5 | ** written by Roberto Ierusalimschy | ||
6 | */ | ||
7 | |||
8 | #if !defined(lptypes_h) | ||
9 | #define lptypes_h | ||
10 | |||
11 | |||
12 | #include <assert.h> | ||
13 | #include <limits.h> | ||
14 | |||
15 | #include "lua.h" | ||
16 | |||
17 | |||
18 | #define VERSION "1.0.1" | ||
19 | |||
20 | |||
21 | #define PATTERN_T "lpeg-pattern" | ||
22 | #define MAXSTACKIDX "lpeg-maxstack" | ||
23 | |||
24 | |||
25 | /* | ||
26 | ** compatibility with Lua 5.1 (maps the newer API names used in the code to their 5.1 equivalents) | ||
27 | */ | ||
28 | #if (LUA_VERSION_NUM == 501) | ||
29 | |||
30 | #define lp_equal lua_equal | ||
31 | |||
32 | #define lua_getuservalue lua_getfenv | ||
33 | #define lua_setuservalue lua_setfenv | ||
34 | |||
35 | #define lua_rawlen lua_objlen | ||
36 | |||
37 | #define luaL_setfuncs(L,f,n) luaL_register(L,NULL,f) | ||
38 | #define luaL_newlib(L,f) luaL_register(L,"lpeg",f) | ||
39 | |||
40 | #endif | ||
41 | |||
42 | |||
43 | #if !defined(lp_equal) /* not Lua 5.1: compare through 'lua_compare' */ | ||
44 | #define lp_equal(L,idx1,idx2) lua_compare(L,(idx1),(idx2),LUA_OPEQ) | ||
45 | #endif | ||
46 | |||
47 | |||
48 | /* default maximum size for call/backtrack stack */ | ||
49 | #if !defined(MAXBACK) | ||
50 | #define MAXBACK 400 | ||
51 | #endif | ||
52 | |||
53 | |||
54 | /* maximum number of rules in a grammar (limited by 'unsigned char') */ | ||
55 | #if !defined(MAXRULES) | ||
56 | #define MAXRULES 250 | ||
57 | #endif | ||
58 | |||
59 | |||
60 | |||
61 | /* initial size for capture's list */ | ||
62 | #define INITCAPSIZE 32 | ||
63 | |||
64 | |||
65 | /* index, on Lua stack, for subject */ | ||
66 | #define SUBJIDX 2 | ||
67 | |||
68 | /* number of fixed arguments to 'match' (before capture arguments) */ | ||
69 | #define FIXEDARGS 3 | ||
70 | |||
71 | /* index, on Lua stack, for capture list */ | ||
72 | #define caplistidx(ptop) ((ptop) + 2) | ||
73 | |||
74 | /* index, on Lua stack, for pattern's ktable */ | ||
75 | #define ktableidx(ptop) ((ptop) + 3) | ||
76 | |||
77 | /* index, on Lua stack, for backtracking stack */ | ||
78 | #define stackidx(ptop) ((ptop) + 4) | ||
79 | |||
80 | |||
81 | |||
82 | typedef unsigned char byte; | ||
83 | |||
84 | |||
85 | #define BITSPERCHAR 8 | ||
86 | |||
87 | #define CHARSETSIZE ((UCHAR_MAX/BITSPERCHAR) + 1) | ||
88 | |||
89 | |||
90 | |||
91 | typedef struct Charset { | ||
92 | byte cs[CHARSETSIZE]; /* bitmap: one bit per possible byte value */ | ||
93 | } Charset; | ||
94 | |||
95 | |||
96 | |||
97 | #define loopset(v,b) { int v; for (v = 0; v < CHARSETSIZE; v++) {b;} } | ||
98 | |||
99 | /* access to charset (the bytes right after tree node 't') */ | ||
100 | #define treebuffer(t) ((byte *)((t) + 1)) | ||
101 | |||
102 | /* number of slots needed for 'n' bytes (rounded up to whole 'TTree' elements) */ | ||
103 | #define bytes2slots(n) (((n) - 1) / sizeof(TTree) + 1) | ||
104 | |||
105 | /* set 'b' bit in charset 'cs' (the shift by 3 and the mask 7 assume BITSPERCHAR == 8) */ | ||
106 | #define setchar(cs,b) ((cs)[(b) >> 3] |= (1 << ((b) & 7))) | ||
107 | |||
108 | |||
109 | /* | ||
110 | ** in capture instructions, 'kind' of capture and its offset are | ||
111 | ** packed in field 'aux', 4 bits for each (kind in the low 4 bits, offset in the high 4 bits) | ||
112 | */ | ||
113 | #define getkind(op) ((op)->i.aux & 0xF) | ||
114 | #define getoff(op) (((op)->i.aux >> 4) & 0xF) | ||
115 | #define joinkindoff(k,o) ((k) | ((o) << 4)) | ||
116 | |||
117 | #define MAXOFF 0xF | ||
118 | #define MAXAUX 0xFF | ||
119 | |||
120 | |||
121 | /* maximum number of bytes to look behind */ | ||
122 | #define MAXBEHIND MAXAUX | ||
123 | |||
124 | |||
125 | /* maximum size (in elements) for a pattern */ | ||
126 | #define MAXPATTSIZE (SHRT_MAX - 10) | ||
127 | |||
128 | |||
129 | /* size (in elements) for an instruction plus extra l bytes */ | ||
130 | #define instsize(l) (((l) + sizeof(Instruction) - 1)/sizeof(Instruction) + 1) | ||
131 | |||
132 | |||
133 | /* size (in elements) for a ISet instruction */ | ||
134 | #define CHARSETINSTSIZE instsize(CHARSETSIZE) | ||
135 | |||
136 | /* size (in elements) for a IFunc instruction */ | ||
137 | #define funcinstsize(p) ((p)->i.aux + 2) | ||
138 | |||
139 | |||
140 | |||
141 | #define testchar(st,c) (((int)(st)[((c) >> 3)] & (1 << ((c) & 7)))) | ||
142 | |||
143 | |||
144 | #endif | ||
145 | |||
@@ -0,0 +1,364 @@ | |||
1 | /* | ||
2 | ** $Id: lpvm.c,v 1.9 2016/06/03 20:11:18 roberto Exp $ | ||
3 | ** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license) | ||
4 | */ | ||
5 | |||
6 | #include <limits.h> | ||
7 | #include <string.h> | ||
8 | |||
9 | |||
10 | #include "lua.h" | ||
11 | #include "lauxlib.h" | ||
12 | |||
13 | #include "lpcap.h" | ||
14 | #include "lptypes.h" | ||
15 | #include "lpvm.h" | ||
16 | #include "lpprint.h" | ||
17 | |||
18 | |||
19 | /* initial size for call/backtrack stack (MAXBACK unless INITBACK is already defined) */ | ||
20 | #if !defined(INITBACK) | ||
21 | #define INITBACK MAXBACK | ||
22 | #endif | ||
23 | |||
24 | |||
25 | #define getoffset(p) (((p) + 1)->offset) /* jump offset, read from the slot right after instruction 'p' */ | ||
26 | |||
27 | static const Instruction giveup = {{IGiveup, 0, 0}}; /* 'match' installs this as the bottom entry of the backtrack stack */ | ||
28 | |||
29 | |||
30 | /* | ||
31 | ** {====================================================== | ||
32 | ** Virtual Machine | ||
33 | ** ======================================================= | ||
34 | */ | ||
35 | |||
36 | |||
37 | typedef struct Stack { /* one entry of the call/backtrack stack */ | ||
38 | const char *s; /* saved position (or NULL for calls) */ | ||
39 | const Instruction *p; /* next instruction */ | ||
40 | int caplevel; /* value of 'captop' saved with a choice entry (not set for calls) */ | ||
41 | } Stack; | ||
42 | |||
43 | |||
44 | #define getstackbase(L, ptop) ((Stack *)lua_touserdata(L, stackidx(ptop))) /* base of the backtrack stack, kept in Lua stack slot stackidx(ptop) */ | ||
45 | |||
46 | |||
47 | /* | ||
48 | ** Make the size of the array of captures 'cap' twice as large as needed | ||
49 | ** (which is 'captop'). ('n' is the number of new elements.) | ||
50 | */ | ||
51 | static Capture *doublecap (lua_State *L, Capture *cap, int captop, | ||
52 | int n, int ptop) { | ||
53 | Capture *newc; | ||
54 | if (captop >= INT_MAX/((int)sizeof(Capture) * 2)) | ||
55 | luaL_error(L, "too many captures"); | ||
56 | newc = (Capture *)lua_newuserdata(L, captop * 2 * sizeof(Capture)); | ||
57 | memcpy(newc, cap, (captop - n) * sizeof(Capture)); | ||
58 | lua_replace(L, caplistidx(ptop)); | ||
59 | return newc; | ||
60 | } | ||
61 | |||
62 | |||
63 | /* | ||
64 | ** Double the size of the stack | ||
65 | */ | ||
66 | static Stack *doublestack (lua_State *L, Stack **stacklimit, int ptop) { | ||
67 | Stack *stack = getstackbase(L, ptop); | ||
68 | Stack *newstack; | ||
69 | int n = *stacklimit - stack; /* current stack size */ | ||
70 | int max, newn; | ||
71 | lua_getfield(L, LUA_REGISTRYINDEX, MAXSTACKIDX); | ||
72 | max = lua_tointeger(L, -1); /* maximum allowed size */ | ||
73 | lua_pop(L, 1); | ||
74 | if (n >= max) /* already at maximum size? */ | ||
75 | luaL_error(L, "backtrack stack overflow (current limit is %d)", max); | ||
76 | newn = 2 * n; /* new size */ | ||
77 | if (newn > max) newn = max; | ||
78 | newstack = (Stack *)lua_newuserdata(L, newn * sizeof(Stack)); | ||
79 | memcpy(newstack, stack, n * sizeof(Stack)); | ||
80 | lua_replace(L, stackidx(ptop)); | ||
81 | *stacklimit = newstack + newn; | ||
82 | return newstack + n; /* return next position */ | ||
83 | } | ||
84 | |||
85 | |||
86 | /* | ||
87 | ** Interpret the result of a dynamic capture: false -> fail; | ||
88 | ** true -> keep current position; number -> next position. | ||
89 | ** Return new subject position. 'fr' is stack index where | ||
90 | ** is the result; 'curr' is current subject position; 'limit' | ||
91 | ** is subject's size. | ||
92 | */ | ||
93 | static int resdyncaptures (lua_State *L, int fr, int curr, int limit) { | ||
94 | lua_Integer res; | ||
95 | if (!lua_toboolean(L, fr)) { /* false value? */ | ||
96 | lua_settop(L, fr - 1); /* remove results */ | ||
97 | return -1; /* and fail */ | ||
98 | } | ||
99 | else if (lua_isboolean(L, fr)) /* true? */ | ||
100 | res = curr; /* keep current position */ | ||
101 | else { | ||
102 | res = lua_tointeger(L, fr) - 1; /* new position */ | ||
103 | if (res < curr || res > limit) | ||
104 | luaL_error(L, "invalid position returned by match-time capture"); | ||
105 | } | ||
106 | lua_remove(L, fr); /* remove first result (offset) */ | ||
107 | return res; | ||
108 | } | ||
109 | |||
110 | |||
111 | /* | ||
112 | ** Add capture values returned by a dynamic capture to the capture list | ||
113 | ** 'base', nested inside a group capture. 'fd' indexes the first capture | ||
114 | ** value, 'n' is the number of values (at least 1). | ||
115 | */ | ||
116 | static void adddyncaptures (const char *s, Capture *base, int n, int fd) { | ||
117 | int i; | ||
118 | base[0].kind = Cgroup; /* create group capture */ | ||
119 | base[0].siz = 0; | ||
120 | base[0].idx = 0; /* make it an anonymous group */ | ||
121 | for (i = 1; i <= n; i++) { /* add runtime captures */ | ||
122 | base[i].kind = Cruntime; | ||
123 | base[i].siz = 1; /* mark it as closed */ | ||
124 | base[i].idx = fd + i - 1; /* stack index of capture value */ | ||
125 | base[i].s = s; | ||
126 | } | ||
127 | base[i].kind = Cclose; /* close group */ | ||
128 | base[i].siz = 1; | ||
129 | base[i].s = s; | ||
130 | } | ||
131 | |||
132 | |||
133 | /* | ||
134 | ** Remove dynamic captures from the Lua stack (called in case of failure) | ||
135 | */ | ||
136 | static int removedyncap (lua_State *L, Capture *capture, | ||
137 | int level, int last) { | ||
138 | int id = finddyncap(capture + level, capture + last); /* index of 1st cap. */ | ||
139 | int top = lua_gettop(L); | ||
140 | if (id == 0) return 0; /* no dynamic captures? */ | ||
141 | lua_settop(L, id - 1); /* remove captures */ | ||
142 | return top - id + 1; /* number of values removed */ | ||
143 | } | ||
144 | |||
145 | |||
146 | /* | ||
147 | ** Opcode interpreter | ||
148 | */ | ||
149 | const char *match (lua_State *L, const char *o, const char *s, const char *e, | ||
150 | Instruction *op, Capture *capture, int ptop) { | ||
151 | Stack stackbase[INITBACK]; | ||
152 | Stack *stacklimit = stackbase + INITBACK; | ||
153 | Stack *stack = stackbase; /* point to first empty slot in stack */ | ||
154 | int capsize = INITCAPSIZE; | ||
155 | int captop = 0; /* point to first empty slot in captures */ | ||
156 | int ndyncap = 0; /* number of dynamic captures (in Lua stack) */ | ||
157 | const Instruction *p = op; /* current instruction */ | ||
158 | stack->p = &giveup; stack->s = s; stack->caplevel = 0; stack++; | ||
159 | lua_pushlightuserdata(L, stackbase); | ||
160 | for (;;) { | ||
161 | #if defined(DEBUG) | ||
162 | printf("-------------------------------------\n"); | ||
163 | printcaplist(capture, capture + captop); | ||
164 | printf("s: |%s| stck:%d, dyncaps:%d, caps:%d ", | ||
165 | s, (int)(stack - getstackbase(L, ptop)), ndyncap, captop); | ||
166 | printinst(op, p); | ||
167 | #endif | ||
168 | assert(stackidx(ptop) + ndyncap == lua_gettop(L) && ndyncap <= captop); | ||
169 | switch ((Opcode)p->i.code) { | ||
170 | case IEnd: { | ||
171 | assert(stack == getstackbase(L, ptop) + 1); | ||
172 | capture[captop].kind = Cclose; | ||
173 | capture[captop].s = NULL; | ||
174 | return s; | ||
175 | } | ||
176 | case IGiveup: { | ||
177 | assert(stack == getstackbase(L, ptop)); | ||
178 | return NULL; | ||
179 | } | ||
180 | case IRet: { | ||
181 | assert(stack > getstackbase(L, ptop) && (stack - 1)->s == NULL); | ||
182 | p = (--stack)->p; | ||
183 | continue; | ||
184 | } | ||
185 | case IAny: { | ||
186 | if (s < e) { p++; s++; } | ||
187 | else goto fail; | ||
188 | continue; | ||
189 | } | ||
190 | case ITestAny: { | ||
191 | if (s < e) p += 2; | ||
192 | else p += getoffset(p); | ||
193 | continue; | ||
194 | } | ||
195 | case IChar: { | ||
196 | if ((byte)*s == p->i.aux && s < e) { p++; s++; } | ||
197 | else goto fail; | ||
198 | continue; | ||
199 | } | ||
200 | case ITestChar: { | ||
201 | if ((byte)*s == p->i.aux && s < e) p += 2; | ||
202 | else p += getoffset(p); | ||
203 | continue; | ||
204 | } | ||
205 | case ISet: { | ||
206 | int c = (byte)*s; | ||
207 | if (testchar((p+1)->buff, c) && s < e) | ||
208 | { p += CHARSETINSTSIZE; s++; } | ||
209 | else goto fail; | ||
210 | continue; | ||
211 | } | ||
212 | case ITestSet: { | ||
213 | int c = (byte)*s; | ||
214 | if (testchar((p + 2)->buff, c) && s < e) | ||
215 | p += 1 + CHARSETINSTSIZE; | ||
216 | else p += getoffset(p); | ||
217 | continue; | ||
218 | } | ||
219 | case IBehind: { | ||
220 | int n = p->i.aux; | ||
221 | if (n > s - o) goto fail; | ||
222 | s -= n; p++; | ||
223 | continue; | ||
224 | } | ||
225 | case ISpan: { | ||
226 | for (; s < e; s++) { | ||
227 | int c = (byte)*s; | ||
228 | if (!testchar((p+1)->buff, c)) break; | ||
229 | } | ||
230 | p += CHARSETINSTSIZE; | ||
231 | continue; | ||
232 | } | ||
233 | case IJmp: { | ||
234 | p += getoffset(p); | ||
235 | continue; | ||
236 | } | ||
237 | case IChoice: { | ||
238 | if (stack == stacklimit) | ||
239 | stack = doublestack(L, &stacklimit, ptop); | ||
240 | stack->p = p + getoffset(p); | ||
241 | stack->s = s; | ||
242 | stack->caplevel = captop; | ||
243 | stack++; | ||
244 | p += 2; | ||
245 | continue; | ||
246 | } | ||
247 | case ICall: { | ||
248 | if (stack == stacklimit) | ||
249 | stack = doublestack(L, &stacklimit, ptop); | ||
250 | stack->s = NULL; | ||
251 | stack->p = p + 2; /* save return address */ | ||
252 | stack++; | ||
253 | p += getoffset(p); | ||
254 | continue; | ||
255 | } | ||
256 | case ICommit: { | ||
257 | assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL); | ||
258 | stack--; | ||
259 | p += getoffset(p); | ||
260 | continue; | ||
261 | } | ||
262 | case IPartialCommit: { | ||
263 | assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL); | ||
264 | (stack - 1)->s = s; | ||
265 | (stack - 1)->caplevel = captop; | ||
266 | p += getoffset(p); | ||
267 | continue; | ||
268 | } | ||
269 | case IBackCommit: { | ||
270 | assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL); | ||
271 | s = (--stack)->s; | ||
272 | captop = stack->caplevel; | ||
273 | p += getoffset(p); | ||
274 | continue; | ||
275 | } | ||
276 | case IFailTwice: | ||
277 | assert(stack > getstackbase(L, ptop)); | ||
278 | stack--; | ||
279 | /* go through */ | ||
280 | case IFail: | ||
281 | fail: { /* pattern failed: try to backtrack */ | ||
282 | do { /* remove pending calls */ | ||
283 | assert(stack > getstackbase(L, ptop)); | ||
284 | s = (--stack)->s; | ||
285 | } while (s == NULL); | ||
286 | if (ndyncap > 0) /* is there matchtime captures? */ | ||
287 | ndyncap -= removedyncap(L, capture, stack->caplevel, captop); | ||
288 | captop = stack->caplevel; | ||
289 | p = stack->p; | ||
290 | #if defined(DEBUG) | ||
291 | printf("**FAIL**\n"); | ||
292 | #endif | ||
293 | continue; | ||
294 | } | ||
295 | case ICloseRunTime: { | ||
296 | CapState cs; | ||
297 | int rem, res, n; | ||
298 | int fr = lua_gettop(L) + 1; /* stack index of first result */ | ||
299 | cs.s = o; cs.L = L; cs.ocap = capture; cs.ptop = ptop; | ||
300 | n = runtimecap(&cs, capture + captop, s, &rem); /* call function */ | ||
301 | captop -= n; /* remove nested captures */ | ||
302 | ndyncap -= rem; /* update number of dynamic captures */ | ||
303 | fr -= rem; /* 'rem' items were popped from Lua stack */ | ||
304 | res = resdyncaptures(L, fr, s - o, e - o); /* get result */ | ||
305 | if (res == -1) /* fail? */ | ||
306 | goto fail; | ||
307 | s = o + res; /* else update current position */ | ||
308 | n = lua_gettop(L) - fr + 1; /* number of new captures */ | ||
309 | ndyncap += n; /* update number of dynamic captures */ | ||
310 | if (n > 0) { /* any new capture? */ | ||
311 | if (fr + n >= SHRT_MAX) | ||
312 | luaL_error(L, "too many results in match-time capture"); | ||
313 | if ((captop += n + 2) >= capsize) { | ||
314 | capture = doublecap(L, capture, captop, n + 2, ptop); | ||
315 | capsize = 2 * captop; | ||
316 | } | ||
317 | /* add new captures to 'capture' list */ | ||
318 | adddyncaptures(s, capture + captop - n - 2, n, fr); | ||
319 | } | ||
320 | p++; | ||
321 | continue; | ||
322 | } | ||
323 | case ICloseCapture: { | ||
324 | const char *s1 = s; | ||
325 | assert(captop > 0); | ||
326 | /* if possible, turn capture into a full capture */ | ||
327 | if (capture[captop - 1].siz == 0 && | ||
328 | s1 - capture[captop - 1].s < UCHAR_MAX) { | ||
329 | capture[captop - 1].siz = s1 - capture[captop - 1].s + 1; | ||
330 | p++; | ||
331 | continue; | ||
332 | } | ||
333 | else { | ||
334 | capture[captop].siz = 1; /* mark entry as closed */ | ||
335 | capture[captop].s = s; | ||
336 | goto pushcapture; | ||
337 | } | ||
338 | } | ||
339 | case IOpenCapture: | ||
340 | capture[captop].siz = 0; /* mark entry as open */ | ||
341 | capture[captop].s = s; | ||
342 | goto pushcapture; | ||
343 | case IFullCapture: | ||
344 | capture[captop].siz = getoff(p) + 1; /* save capture size */ | ||
345 | capture[captop].s = s - getoff(p); | ||
346 | /* goto pushcapture; */ | ||
347 | pushcapture: { | ||
348 | capture[captop].idx = p->i.key; | ||
349 | capture[captop].kind = getkind(p); | ||
350 | if (++captop >= capsize) { | ||
351 | capture = doublecap(L, capture, captop, 0, ptop); | ||
352 | capsize = 2 * captop; | ||
353 | } | ||
354 | p++; | ||
355 | continue; | ||
356 | } | ||
357 | default: assert(0); return NULL; | ||
358 | } | ||
359 | } | ||
360 | } | ||
361 | |||
362 | /* }====================================================== */ | ||
363 | |||
364 | |||
@@ -0,0 +1,58 @@ | |||
1 | /* | ||
2 | ** $Id: lpvm.h,v 1.3 2014/02/21 13:06:41 roberto Exp $ | ||
3 | */ | ||
4 | |||
5 | #if !defined(lpvm_h) | ||
6 | #define lpvm_h | ||
7 | |||
8 | #include "lpcap.h" | ||
9 | |||
10 | |||
11 | /* Virtual Machine's instructions */ | ||
12 | typedef enum Opcode { | ||
13 | IAny, /* if no char, fail */ | ||
14 | IChar, /* if char != aux, fail */ | ||
15 | ISet, /* if char not in buff, fail */ | ||
16 | ITestAny, /* if no char, jump to 'offset' */ | ||
17 | ITestChar, /* if char != aux, jump to 'offset' */ | ||
18 | ITestSet, /* if char not in buff, jump to 'offset' */ | ||
19 | ISpan, /* read a span of chars in buff */ | ||
20 | IBehind, /* walk back 'aux' characters (fail if not possible) */ | ||
21 | IRet, /* return from a rule */ | ||
22 | IEnd, /* end of pattern */ | ||
23 | IChoice, /* stack a choice; next fail will jump to 'offset' */ | ||
24 | IJmp, /* jump to 'offset' */ | ||
25 | ICall, /* call rule at 'offset' */ | ||
26 | IOpenCall, /* call rule number 'key' (must be closed to a ICall) */ | ||
27 | ICommit, /* pop choice and jump to 'offset' */ | ||
28 | IPartialCommit, /* update top choice to current position and jump */ | ||
29 | IBackCommit, /* "fails" but jump to its own 'offset' */ | ||
30 | IFailTwice, /* pop one choice and then fail */ | ||
31 | IFail, /* go back to saved state on choice and jump to saved offset */ | ||
32 | IGiveup, /* internal use */ | ||
33 | IFullCapture, /* complete capture of last 'off' chars */ | ||
34 | IOpenCapture, /* start a capture */ | ||
35 | ICloseCapture, /* close the most recent open capture */ | ||
36 | ICloseRunTime /* close a match-time capture: call its function and apply the result */ | ||
37 | } Opcode; | ||
38 | |||
39 | |||
40 | |||
41 | typedef union Instruction { /* one slot of a compiled pattern; which member is valid depends on the slot's role */ | ||
42 | struct Inst { | ||
43 | byte code; /* opcode (an 'Opcode' value) */ | ||
44 | byte aux; /* small operand: the char for IChar/ITestChar, the count for IBehind, packed kind/offset for captures */ | ||
45 | short key; /* copied into the 'idx' field of the capture entry by capture instructions; rule number for IOpenCall */ | ||
46 | } i; | ||
47 | int offset; /* jump offset, stored in the slot that follows a jumping instruction */ | ||
48 | byte buff[1]; /* first byte of a charset bitmap that continues over the following slots */ | ||
49 | } Instruction; | ||
50 | |||
51 | |||
52 | void printpatt (Instruction *p, int n); /* NOTE(review): not defined in lpvm.c; presumably prints 'n' instructions of 'p' -- confirm */ | ||
53 | const char *match (lua_State *L, const char *o, const char *s, const char *e, /* run program 'op' over subject [s, e) with origin 'o'; NULL on failure */ | ||
54 | Instruction *op, Capture *capture, int ptop); | ||
55 | |||
56 | |||
57 | #endif | ||
58 | |||
diff --git a/makefile b/makefile new file mode 100644 index 0000000..d803c12 --- /dev/null +++ b/makefile | |||
@@ -0,0 +1,55 @@ | |||
# Build file for the LPeg shared library (lpeg.so).
# Use 'make linux' or 'make macosx' to build, 'make test' to run the
# test suite, and 'make clean' to remove the build products.

LIBNAME = lpeg
LUADIR = ../lua/

# COPT = -O2 -DNDEBUG
COPT = -g

CWARNS = -Wall -Wextra -pedantic \
	-Waggregate-return \
	-Wcast-align \
	-Wcast-qual \
	-Wdisabled-optimization \
	-Wpointer-arith \
	-Wshadow \
	-Wsign-compare \
	-Wundef \
	-Wwrite-strings \
	-Wbad-function-cast \
	-Wdeclaration-after-statement \
	-Wmissing-prototypes \
	-Wnested-externs \
	-Wstrict-prototypes \
	# -Wunreachable-code \


CFLAGS = $(CWARNS) $(COPT) -std=c99 -I$(LUADIR) -fPIC
CC = gcc

FILES = lpvm.o lpcap.o lptree.o lpcode.o lpprint.o

# These targets name actions, not files: always run them, even when a
# file called 'test' or 'clean' happens to exist.
.PHONY: linux macosx test clean

# For Linux
# ($(MAKE) rather than 'make' so the sub-make is the same program and
# inherits the command-line flags)
linux:
	$(MAKE) lpeg.so "DLLFLAGS = -shared -fPIC"

# For Mac OS
macosx:
	$(MAKE) lpeg.so "DLLFLAGS = -bundle -undefined dynamic_lookup"

lpeg.so: $(FILES)
	env $(CC) $(DLLFLAGS) $(FILES) -o lpeg.so

# rebuild every object when the build rules change
$(FILES): makefile

test: test.lua re.lua lpeg.so
	./test.lua

clean:
	rm -f $(FILES) lpeg.so


lpcap.o: lpcap.c lpcap.h lptypes.h
lpcode.o: lpcode.c lptypes.h lpcode.h lptree.h lpvm.h lpcap.h
lpprint.o: lpprint.c lptypes.h lpprint.h lptree.h lpvm.h lpcap.h
lptree.o: lptree.c lptypes.h lpcap.h lpcode.h lptree.h lpvm.h lpprint.h
lpvm.o: lpvm.c lpcap.h lptypes.h lpvm.h lpprint.h lptree.h
55 | |||
@@ -0,0 +1,15 @@ | |||
#!/bin/bash
# Package a release: check the distribution files out with 'co', move
# them into versions/lpeg-<version>, and build lpeg-<version>.tar.gz
# inside 'versions'.
#
# usage: pack <version>

if [ -z "$1" ]; then
  echo "usage: $0 <version>" >&2
  exit 1
fi

FILES="makefile HISTORY test.lua re.lua lpeg.html re.html lpeg-128.gif \
  lptypes.h lpcap.h lpcap.c lpcode.h lpcode.c lpprint.h lpprint.c \
  lptree.h lptree.c lpvm.h lpvm.c"
NAME="lpeg-$1"
DIRN="versions/$NAME"

# stop here rather than pack from the wrong place
mkdir -p "$DIRN" || exit 1

# $FILES is left unquoted on purpose: it must split into one word per file
co $FILES
mv $FILES "$DIRN"

cd versions || exit 1
tar --create --gzip --file="$NAME.tar.gz" "$NAME"
# scp $NAME.tar.gz obaluae:public_html/lpeg/
# ssh obaluae "rm public_html/lpeg/*.html"
cd "$NAME" || exit 1
# scp *.html obaluae:public_html/lpeg/
@@ -0,0 +1,500 @@ | |||
1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | ||
2 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | ||
3 | <html> | ||
4 | <head> | ||
5 | <title>LPeg.re - Regex syntax for LPEG</title> | ||
6 | <link rel="stylesheet" | ||
7 | href="http://www.inf.puc-rio.br/~roberto/lpeg/doc.css" | ||
8 | type="text/css"/> | ||
9 | <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/> | ||
10 | </head> | ||
11 | <body> | ||
12 | |||
13 | <!-- $Id: re.html,v 1.25 2018/06/04 16:21:19 roberto Exp $ --> | ||
14 | |||
15 | <div id="container"> | ||
16 | |||
17 | <div id="product"> | ||
18 | <div id="product_logo"> | ||
19 | <a href="http://www.inf.puc-rio.br/~roberto/lpeg/"> | ||
20 | <img alt="LPeg logo" src="lpeg-128.gif"/> | ||
21 | </a> | ||
22 | </div> | ||
23 | <div id="product_name"><big><strong>LPeg.re</strong></big></div> | ||
24 | <div id="product_description"> | ||
25 | Regex syntax for LPEG | ||
26 | </div> | ||
27 | </div> <!-- id="product" --> | ||
28 | |||
29 | <div id="main"> | ||
30 | |||
31 | <div id="navigation"> | ||
32 | <h1>re</h1> | ||
33 | |||
34 | <ul> | ||
35 | <li><a href="#basic">Basic Constructions</a></li> | ||
36 | <li><a href="#func">Functions</a></li> | ||
37 | <li><a href="#ex">Some Examples</a></li> | ||
38 | <li><a href="#license">License</a></li> | ||
39 | </ul> | ||
40 | </li> | ||
41 | </ul> | ||
42 | </div> <!-- id="navigation" --> | ||
43 | |||
44 | <div id="content"> | ||
45 | |||
46 | <h2><a name="basic"></a>The <code>re</code> Module</h2> | ||
47 | |||
48 | <p> | ||
49 | The <code>re</code> module | ||
50 | (provided by file <code>re.lua</code> in the distribution) | ||
51 | supports a somewhat conventional regex syntax | ||
52 | for pattern usage within <a href="lpeg.html">LPeg</a>. | ||
53 | </p> | ||
54 | |||
55 | <p> | ||
56 | The next table summarizes <code>re</code>'s syntax. | ||
57 | A <code>p</code> represents an arbitrary pattern; | ||
58 | <code>num</code> represents a number (<code>[0-9]+</code>); | ||
59 | <code>name</code> represents an identifier | ||
60 | (<code>[a-zA-Z][a-zA-Z0-9_]*</code>). | ||
61 | Constructions are listed in order of decreasing precedence.</p> | ||
62 | <table border="1"> | ||
63 | <tbody><tr><td><b>Syntax</b></td><td><b>Description</b></td></tr> | ||
64 | <tr><td><code>( p )</code></td> <td>grouping</td></tr> | ||
65 | <tr><td><code>'string'</code></td> <td>literal string</td></tr> | ||
66 | <tr><td><code>"string"</code></td> <td>literal string</td></tr> | ||
67 | <tr><td><code>[class]</code></td> <td>character class</td></tr> | ||
68 | <tr><td><code>.</code></td> <td>any character</td></tr> | ||
69 | <tr><td><code>%name</code></td> | ||
70 | <td>pattern <code>defs[name]</code> or a pre-defined pattern</td></tr> | ||
71 | <tr><td><code>name</code></td><td>non terminal</td></tr> | ||
72 | <tr><td><code><name></code></td><td>non terminal</td></tr> | ||
73 | <tr><td><code>{}</code></td> <td>position capture</td></tr> | ||
74 | <tr><td><code>{ p }</code></td> <td>simple capture</td></tr> | ||
75 | <tr><td><code>{: p :}</code></td> <td>anonymous group capture</td></tr> | ||
76 | <tr><td><code>{:name: p :}</code></td> <td>named group capture</td></tr> | ||
77 | <tr><td><code>{~ p ~}</code></td> <td>substitution capture</td></tr> | ||
78 | <tr><td><code>{| p |}</code></td> <td>table capture</td></tr> | ||
79 | <tr><td><code>=name</code></td> <td>back reference | ||
80 | </td></tr> | ||
81 | <tr><td><code>p ?</code></td> <td>optional match</td></tr> | ||
82 | <tr><td><code>p *</code></td> <td>zero or more repetitions</td></tr> | ||
83 | <tr><td><code>p +</code></td> <td>one or more repetitions</td></tr> | ||
84 | <tr><td><code>p^num</code></td> <td>exactly <code>num</code> repetitions</td></tr> | ||
85 | <tr><td><code>p^+num</code></td> | ||
86 | <td>at least <code>num</code> repetitions</td></tr> | ||
87 | <tr><td><code>p^-num</code></td> | ||
88 | <td>at most <code>num</code> repetitions</td></tr> | ||
89 | <tr><td><code>p -> 'string'</code></td> <td>string capture</td></tr> | ||
90 | <tr><td><code>p -> "string"</code></td> <td>string capture</td></tr> | ||
91 | <tr><td><code>p -> num</code></td> <td>numbered capture</td></tr> | ||
92 | <tr><td><code>p -> name</code></td> <td>function/query/string capture | ||
93 | equivalent to <code>p / defs[name]</code></td></tr> | ||
94 | <tr><td><code>p => name</code></td> <td>match-time capture | ||
95 | equivalent to <code>lpeg.Cmt(p, defs[name])</code></td></tr> | ||
96 | <tr><td><code>p ~> name</code></td> <td>fold capture | ||
97 | equivalent to <code>lpeg.Cf(p, defs[name])</code></td></tr> | ||
98 | <tr><td><code>& p</code></td> <td>and predicate</td></tr> | ||
99 | <tr><td><code>! p</code></td> <td>not predicate</td></tr> | ||
100 | <tr><td><code>p1 p2</code></td> <td>concatenation</td></tr> | ||
101 | <tr><td><code>p1 / p2</code></td> <td>ordered choice</td></tr> | ||
102 | <tr><td>(<code>name <- p</code>)<sup>+</sup></td> <td>grammar</td></tr> | ||
103 | </tbody></table> | ||
104 | <p> | ||
105 | Any space appearing in a syntax description can be | ||
106 | replaced by zero or more space characters and Lua-style comments | ||
107 | (<code>--</code> until end of line). | ||
108 | </p> | ||
109 | |||
110 | <p> | ||
111 | Character classes define sets of characters. | ||
112 | An initial <code>^</code> complements the resulting set. | ||
113 | A range <em>x</em><code>-</code><em>y</em> includes in the set | ||
114 | all characters with codes between the codes of <em>x</em> and <em>y</em>. | ||
115 | A pre-defined class <code>%</code><em>name</em> includes all | ||
116 | characters of that class. | ||
117 | A simple character includes itself in the set. | ||
118 | The only special characters inside a class are <code>^</code> | ||
119 | (special only if it is the first character); | ||
120 | <code>]</code> | ||
121 | (can be included in the set as the first character, | ||
122 | after the optional <code>^</code>); | ||
123 | <code>%</code> (special only if followed by a letter); | ||
124 | and <code>-</code> | ||
125 | (can be included in the set as the first or the last character). | ||
126 | </p> | ||
127 | |||
128 | <p> | ||
129 | Currently the pre-defined classes are similar to those from | ||
130 | Lua's string library | ||
131 | (<code>%a</code> for letters, | ||
132 | <code>%A</code> for non letters, etc.). | ||
133 | There is also a class <code>%nl</code> | ||
134 | containing only the newline character, | ||
135 | which is particularly handy for grammars written inside long strings, | ||
136 | as long strings do not interpret escape sequences like <code>\n</code>. | ||
137 | </p> | ||
138 | |||
139 | |||
140 | <h2><a name="func">Functions</a></h2> | ||
141 | |||
142 | <h3><code>re.compile (string [, defs])</code></h3> | ||
143 | <p> | ||
144 | Compiles the given string and | ||
145 | returns an equivalent LPeg pattern. | ||
146 | The given string may define either an expression or a grammar. | ||
147 | The optional <code>defs</code> table provides extra Lua values | ||
148 | to be used by the pattern. | ||
149 | </p> | ||
150 | |||
151 | <h3><code>re.find (subject, pattern [, init])</code></h3> | ||
152 | <p> | ||
153 | Searches for the given pattern in the given subject. | ||
154 | If it finds a match, | ||
155 | returns the index where this occurrence starts and | ||
156 | the index where it ends. | ||
157 | Otherwise, returns nil. | ||
158 | </p> | ||
159 | |||
160 | <p> | ||
161 | An optional numeric argument <code>init</code> makes the search | ||
162 | start at that position in the subject string. | ||
163 | As usual in Lua libraries, | ||
164 | a negative value counts from the end. | ||
165 | </p> | ||
166 | |||
167 | <h3><code>re.gsub (subject, pattern, replacement)</code></h3> | ||
168 | <p> | ||
169 | Does a <em>global substitution</em>, | ||
170 | replacing all occurrences of <code>pattern</code> | ||
171 | in the given <code>subject</code> by <code>replacement</code>.</p> | ||
172 | |||
173 | <h3><code>re.match (subject, pattern)</code></h3> | ||
174 | <p> | ||
175 | Matches the given pattern against the given subject, | ||
176 | returning all captures. | ||
177 | </p> | ||
178 | |||
179 | <h3><code>re.updatelocale ()</code></h3> | ||
180 | <p> | ||
181 | Updates the pre-defined character classes to the current locale. | ||
182 | </p> | ||
183 | |||
184 | |||
185 | <h2><a name="ex">Some Examples</a></h2> | ||
186 | |||
187 | <h3>A complete simple program</h3> | ||
188 | <p> | ||
189 | The next code shows a simple complete Lua program using | ||
190 | the <code>re</code> module: | ||
191 | </p> | ||
192 | <pre class="example"> | ||
193 | local re = require"re" | ||
194 | |||
195 | -- find the position of the first numeral in a string | ||
196 | print(re.find("the number 423 is odd", "[0-9]+")) --> 12 14 | ||
197 | |||
198 | -- returns all words in a string | ||
199 | print(re.match("the number 423 is odd", "({%a+} / .)*")) | ||
200 | --> the number is odd | ||
201 | |||
202 | -- returns the first numeral in a string | ||
203 | print(re.match("the number 423 is odd", "s <- {%d+} / . s")) | ||
204 | --> 423 | ||
205 | |||
206 | print(re.gsub("hello World", "[aeiou]", ".")) | ||
207 | --> h.ll. W.rld | ||
208 | </pre> | ||
209 | |||
210 | |||
211 | <h3>Balanced parentheses</h3> | ||
212 | <p> | ||
213 | The following call will produce the same pattern produced by the | ||
214 | Lua expression in the | ||
215 | <a href="lpeg.html#balanced">balanced parentheses</a> example: | ||
216 | </p> | ||
217 | <pre class="example"> | ||
218 | b = re.compile[[ balanced <- "(" ([^()] / balanced)* ")" ]] | ||
219 | </pre> | ||
220 | |||
221 | <h3>String reversal</h3> | ||
222 | <p> | ||
223 | The next example reverses a string: | ||
224 | </p> | ||
225 | <pre class="example"> | ||
226 | rev = re.compile[[ R <- (!.) -> '' / ({.} R) -> '%2%1']] | ||
227 | print(rev:match"0123456789") --> 9876543210 | ||
228 | </pre> | ||
229 | |||
230 | <h3>CSV decoder</h3> | ||
231 | <p> | ||
232 | The next example replicates the <a href="lpeg.html#CSV">CSV decoder</a>: | ||
233 | </p> | ||
234 | <pre class="example"> | ||
235 | record = re.compile[[ | ||
236 | record <- {| field (',' field)* |} (%nl / !.) | ||
237 | field <- escaped / nonescaped | ||
238 | nonescaped <- { [^,"%nl]* } | ||
239 | escaped <- '"' {~ ([^"] / '""' -> '"')* ~} '"' | ||
240 | ]] | ||
241 | </pre> | ||
242 | |||
243 | <h3>Lua's long strings</h3> | ||
244 | <p> | ||
245 | The next example matches Lua long strings: | ||
246 | </p> | ||
247 | <pre class="example"> | ||
248 | c = re.compile([[ | ||
249 | longstring <- ('[' {:eq: '='* :} '[' close) | ||
250 | close <- ']' =eq ']' / . close | ||
251 | ]]) | ||
252 | |||
253 | print(c:match'[==[]]===]]]]==]===[]') --> 17 | ||
254 | </pre> | ||
255 | |||
256 | <h3>Abstract Syntax Trees</h3> | ||
257 | <p> | ||
258 | This example shows a simple way to build an | ||
259 | abstract syntax tree (AST) for a given grammar. | ||
260 | To keep our example simple, | ||
261 | let us consider the following grammar | ||
262 | for lists of names: | ||
263 | </p> | ||
264 | <pre class="example"> | ||
265 | p = re.compile[[ | ||
266 | listname <- (name s)* | ||
267 | name <- [a-z][a-z]* | ||
268 | s <- %s* | ||
269 | ]] | ||
270 | </pre> | ||
271 | <p> | ||
272 | Now, we will add captures to build a corresponding AST. | ||
273 | As a first step, the pattern will build a table to | ||
274 | represent each non terminal; | ||
275 | terminals will be represented by their corresponding strings: | ||
276 | </p> | ||
277 | <pre class="example"> | ||
278 | c = re.compile[[ | ||
279 | listname <- {| (name s)* |} | ||
280 | name <- {| {[a-z][a-z]*} |} | ||
281 | s <- %s* | ||
282 | ]] | ||
283 | </pre> | ||
284 | <p> | ||
285 | Now, a match against <code>"hi hello bye"</code> | ||
286 | results in the table | ||
287 | <code>{{"hi"}, {"hello"}, {"bye"}}</code>. | ||
288 | </p> | ||
289 | <p> | ||
290 | For such a simple grammar, | ||
291 | this AST is more than enough; | ||
292 | actually, the tables around each single name | ||
293 | are already overkill. | ||
294 | More complex grammars, | ||
295 | however, may need some more structure. | ||
296 | Specifically, | ||
297 | it would be useful if each table had | ||
298 | a <code>tag</code> field telling what non terminal | ||
299 | that table represents. | ||
300 | We can add such a tag using | ||
301 | <a href="lpeg.html#cap-g">named group captures</a>: | ||
302 | </p> | ||
303 | <pre class="example"> | ||
304 | x = re.compile[[ | ||
305 | listname <- {| {:tag: '' -> 'list':} (name s)* |} | ||
306 | name <- {| {:tag: '' -> 'id':} {[a-z][a-z]*} |} | ||
307 | s <- ' '* | ||
308 | ]] | ||
309 | </pre> | ||
310 | <p> | ||
311 | With these group captures, | ||
312 | a match against <code>"hi hello bye"</code> | ||
313 | results in the following table: | ||
314 | </p> | ||
315 | <pre class="example"> | ||
316 | {tag="list", | ||
317 | {tag="id", "hi"}, | ||
318 | {tag="id", "hello"}, | ||
319 | {tag="id", "bye"} | ||
320 | } | ||
321 | </pre> | ||
322 | |||
323 | |||
324 | <h3>Indented blocks</h3> | ||
325 | <p> | ||
326 | This example breaks indented blocks into tables, | ||
327 | respecting the indentation: | ||
328 | </p> | ||
329 | <pre class="example"> | ||
330 | p = re.compile[[ | ||
331 | block <- {| {:ident:' '*:} line | ||
332 | ((=ident !' ' line) / &(=ident ' ') block)* |} | ||
333 | line <- {[^%nl]*} %nl | ||
334 | ]] | ||
335 | </pre> | ||
336 | <p> | ||
337 | As an example, | ||
338 | consider the following text: | ||
339 | </p> | ||
340 | <pre class="example"> | ||
341 | t = p:match[[ | ||
342 | first line | ||
343 | subline 1 | ||
344 | subline 2 | ||
345 | second line | ||
346 | third line | ||
347 | subline 3.1 | ||
348 | subline 3.1.1 | ||
349 | subline 3.2 | ||
350 | ]] | ||
351 | </pre> | ||
352 | <p> | ||
353 | The resulting table <code>t</code> will be like this: | ||
354 | </p> | ||
355 | <pre class="example"> | ||
356 | {'first line'; {'subline 1'; 'subline 2'; ident = ' '}; | ||
357 | 'second line'; | ||
358 | 'third line'; { 'subline 3.1'; {'subline 3.1.1'; ident = ' '}; | ||
359 | 'subline 3.2'; ident = ' '}; | ||
360 | ident = ''} | ||
361 | </pre> | ||
362 | |||
363 | <h3>Macro expander</h3> | ||
364 | <p> | ||
365 | This example implements a simple macro expander. | ||
366 | Macros must be defined as part of the pattern, | ||
367 | following some simple rules: | ||
368 | </p> | ||
369 | <pre class="example"> | ||
370 | p = re.compile[[ | ||
371 | text <- {~ item* ~} | ||
372 | item <- macro / [^()] / '(' item* ')' | ||
373 | arg <- ' '* {~ (!',' item)* ~} | ||
374 | args <- '(' arg (',' arg)* ')' | ||
375 | -- now we define some macros | ||
376 | macro <- ('apply' args) -> '%1(%2)' | ||
377 | / ('add' args) -> '%1 + %2' | ||
378 | / ('mul' args) -> '%1 * %2' | ||
379 | ]] | ||
380 | |||
381 | print(p:match"add(mul(a,b), apply(f,x))") --> a * b + f(x) | ||
382 | </pre> | ||
383 | <p> | ||
384 | A <code>text</code> is a sequence of items, | ||
385 | wherein we apply a substitution capture to expand any macros. | ||
386 | An <code>item</code> is either a macro, | ||
387 | any character different from parentheses, | ||
388 | or a parenthesized expression. | ||
389 | A macro argument (<code>arg</code>) is a sequence | ||
390 | of items different from a comma. | ||
391 | (Note that a comma may appear inside an item, | ||
392 | e.g., inside a parenthesized expression.) | ||
393 | Again we do a substitution capture to expand any macro | ||
394 | in the argument before expanding the outer macro. | ||
395 | <code>args</code> is a list of arguments separated by commas. | ||
396 | Finally we define the macros. | ||
397 | Each macro is a string substitution; | ||
398 | it replaces the macro name and its arguments by its corresponding string, | ||
399 | with each <code>%</code><em>n</em> replaced by the <em>n</em>-th argument. | ||
400 | </p> | ||
401 | |||
402 | <h3>Patterns</h3> | ||
403 | <p> | ||
404 | This example shows the complete syntax | ||
405 | of patterns accepted by <code>re</code>. | ||
406 | </p> | ||
407 | <pre class="example"> | ||
408 | p = [=[ | ||
409 | |||
410 | pattern <- exp !. | ||
411 | exp <- S (grammar / alternative) | ||
412 | |||
413 | alternative <- seq ('/' S seq)* | ||
414 | seq <- prefix* | ||
415 | prefix <- '&' S prefix / '!' S prefix / suffix | ||
416 | suffix <- primary S (([+*?] | ||
417 | / '^' [+-]? num | ||
418 | / '->' S (string / num / '{}' / name) | ||
419 | / '=>' S name / '~>' S name) S)* | ||
420 | |||
421 | primary <- '(' exp ')' / string / class / defined | ||
422 | / '{:' (name ':')? exp ':}' | ||
423 | / '=' name | ||
424 | / '{}' | ||
425 | / '{~' exp '~}' / '{|' exp '|}' | ||
426 | / '{' exp '}' | ||
427 | / '.' | ||
428 | / name S !arrow | ||
429 | / '<' name '>' -- old-style non terminals | ||
430 | |||
431 | grammar <- definition+ | ||
432 | definition <- name S arrow exp | ||
433 | |||
434 | class <- '[' '^'? item (!']' item)* ']' | ||
435 | item <- defined / range / . | ||
436 | range <- . '-' [^]] | ||
437 | |||
438 | S <- (%s / '--' [^%nl]*)* -- spaces and comments | ||
439 | name <- [A-Za-z_][A-Za-z0-9_]* | ||
440 | arrow <- '<-' | ||
441 | num <- [0-9]+ | ||
442 | string <- '"' [^"]* '"' / "'" [^']* "'" | ||
443 | defined <- '%' name | ||
444 | |||
445 | ]=] | ||
446 | |||
447 | print(re.match(p, p)) -- a self description must match itself | ||
448 | </pre> | ||
449 | |||
450 | |||
451 | |||
452 | <h2><a name="license">License</a></h2> | ||
453 | |||
454 | <p> | ||
455 | Copyright © 2008-2015 Lua.org, PUC-Rio. | ||
456 | </p> | ||
457 | <p> | ||
458 | Permission is hereby granted, free of charge, | ||
459 | to any person obtaining a copy of this software and | ||
460 | associated documentation files (the "Software"), | ||
461 | to deal in the Software without restriction, | ||
462 | including without limitation the rights to use, | ||
463 | copy, modify, merge, publish, distribute, sublicense, | ||
464 | and/or sell copies of the Software, | ||
465 | and to permit persons to whom the Software is | ||
466 | furnished to do so, | ||
467 | subject to the following conditions: | ||
468 | </p> | ||
469 | |||
470 | <p> | ||
471 | The above copyright notice and this permission notice | ||
472 | shall be included in all copies or substantial portions of the Software. | ||
473 | </p> | ||
474 | |||
475 | <p> | ||
476 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
477 | EXPRESS OR IMPLIED, | ||
478 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
479 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | ||
480 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, | ||
481 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | ||
482 | TORT OR OTHERWISE, ARISING FROM, | ||
483 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
484 | THE SOFTWARE. | ||
485 | </p> | ||
486 | |||
487 | </div> <!-- id="content" --> | ||
488 | |||
489 | </div> <!-- id="main" --> | ||
490 | |||
491 | <div id="about"> | ||
492 | <p><small> | ||
493 | $Id: re.html,v 1.25 2018/06/04 16:21:19 roberto Exp $ | ||
494 | </small></p> | ||
495 | </div> <!-- id="about" --> | ||
496 | |||
497 | </div> <!-- id="container" --> | ||
498 | |||
499 | </body> | ||
500 | </html> | ||
@@ -0,0 +1,267 @@ | |||
1 | -- $Id: re.lua,v 1.46 2018/06/04 16:21:19 roberto Exp $ | ||
2 | |||
3 | -- imported functions and modules | ||
4 | local tonumber, type, print, error = tonumber, type, print, error | ||
5 | local setmetatable = setmetatable | ||
6 | local m = require"lpeg" | ||
7 | |||
8 | -- 'm' will be used to parse expressions, and 'mm' will be used to | ||
9 | -- create expressions; that is, 're' runs on 'm', creating patterns | ||
10 | -- on 'mm' | ||
11 | local mm = m | ||
12 | |||
13 | -- pattern's metatable | ||
14 | local mt = getmetatable(mm.P(0)) | ||
15 | |||
16 | |||
17 | |||
18 | -- No more global accesses after this point | ||
19 | local version = _VERSION | ||
20 | if version == "Lua 5.2" then _ENV = nil end | ||
21 | |||
22 | |||
23 | local any = m.P(1) | ||
24 | |||
25 | |||
26 | -- Pre-defined names | ||
27 | local Predef = { nl = m.P"\n" } | ||
28 | |||
29 | |||
30 | local mem | ||
31 | local fmem | ||
32 | local gmem | ||
33 | |||
34 | |||
-- Refresh every locale-dependent entry of 'Predef' and reset the caches.
-- 'mm.locale' fills 'Predef' with the long class names; each class is then
-- also stored under a one-letter alias, and its complement ('any' minus the
-- class) under the corresponding upper-case letter.
-- Finally 'mem', 'fmem' and 'gmem' are replaced by fresh, empty tables
-- with weak values.
local function updatelocale ()
  mm.locale(Predef)
  -- {alias, complement alias, long name}; a plain array with a numeric
  -- loop is used because this file avoids global accesses (e.g. 'pairs')
  local classes = {
    {"a", "A", "alpha"},
    {"c", "C", "cntrl"},
    {"d", "D", "digit"},
    {"g", "G", "graph"},
    {"l", "L", "lower"},
    {"p", "P", "punct"},
    {"s", "S", "space"},
    {"u", "U", "upper"},
    {"w", "W", "alnum"},
    {"x", "X", "xdigit"},
  }
  for idx = 1, #classes do
    local entry = classes[idx]
    local class = Predef[entry[3]]
    Predef[entry[1]] = class
    Predef[entry[2]] = any - class
  end
  -- restart memoization; one metatable is shared by the three caches
  local weakvalues = {__mode = "v"}
  mem = setmetatable({}, weakvalues)
  fmem = setmetatable({}, weakvalues)
  gmem = setmetatable({}, weakvalues)
end
65 | |||
66 | |||
67 | updatelocale() | ||
68 | |||
69 | |||
70 | |||
71 | local I = m.P(function (s,i) print(i, s:sub(1, i-1)); return i end) | ||
72 | |||
73 | |||
-- Raise a "pattern error" for pattern string 's' at position 'i'.
-- The message quotes the pattern text starting at 'i', cut to at most 21
-- characters; "..." is appended only when text was actually cut off
-- (previously it was also appended when exactly 21 characters remained).
-- The error is raised with level 2, so any position information refers to
-- the caller of this function.
local function patt_error (s, i)
  local near = s:sub(i)
  if #near > 21 then
    near = near:sub(1, 21) .. "..."
  end
  error(("pattern error near '%s'"):format(near), 2)
end
80 | |||
-- Build the pattern that matches 'p' repeated exactly 'n' times, using
-- binary exponentiation: 'p' is squared once per binary digit of 'n' and
-- multiplied into the result for every digit that is 1.
-- 'n' is halved with '/', so it may become fractional; that is why the
-- tests compare against 1 instead of testing for equality.
-- Returns 'mm.P(true)' when 'n' is less than 1.
local function mult (p, n)
  local np = mm.P(true)
  while n >= 1 do
    if n % 2 >= 1 then np = np * p end
    n = n / 2
    -- square only when another round will use it: the last squaring is
    -- the largest product and its result would simply be discarded
    if n >= 1 then p = p * p end
  end
  return np
end
90 | |||
-- Match-time check used for back references: succeeds when the text of
-- subject 's' starting at position 'i' is equal to the captured value 'c'.
-- Returns the position right after that text, or nil when 'c' is not a
-- string or the text differs.
local function equalcap (s, i, c)
  if type(c) == "string" then
    local stop = i + #c
    if s:sub(i, stop - 1) == c then
      return stop
    end
  end
  return nil
end
96 | |||
97 | |||
98 | local S = (Predef.space + "--" * (any - Predef.nl)^0)^0 | ||
99 | |||
100 | local name = m.R("AZ", "az", "__") * m.R("AZ", "az", "__", "09")^0 | ||
101 | |||
102 | local arrow = S * "<-" | ||
103 | |||
104 | local seq_follow = m.P"/" + ")" + "}" + ":}" + "~}" + "|}" + (name * arrow) + -1 | ||
105 | |||
106 | name = m.C(name) | ||
107 | |||
108 | |||
109 | -- a defined name only has meaning in a given environment | ||
110 | local Def = name * m.Carg(1) | ||
111 | |||
112 | |||
-- Look up the name 'id' in the definitions table 'defs'.
-- Returns the value found; raises "undefined name: <id>" when 'defs' is
-- absent or holds no true value under 'id'.
local function getdef (id, defs)
  local def = defs and defs[id]
  if def then
    return def
  end
  error("undefined name: " .. id)
end
118 | |||
-- Build a pattern that matches a name, replaces it by its definition
-- (looked up with 'getdef'), and groups that definition together with the
-- constant 'f'; the pair is later folded in 'Suffix'.
local function defwithfunc (f)
  local definition = Def / getdef
  local operation = m.Cc(f)
  return m.Cg(definition * operation)
end
124 | |||
125 | |||
126 | local num = m.C(m.R"09"^1) * S / tonumber | ||
127 | |||
128 | local String = "'" * m.C((any - "'")^0) * "'" + | ||
129 | '"' * m.C((any - '"')^0) * '"' | ||
130 | |||
131 | |||
132 | local defined = "%" * Def / function (c,Defs) | ||
133 | local cat = Defs and Defs[c] or Predef[c] | ||
134 | if not cat then error ("name '" .. c .. "' undefined") end | ||
135 | return cat | ||
136 | end | ||
137 | |||
138 | local Range = m.Cs(any * (m.P"-"/"") * (any - "]")) / mm.R | ||
139 | |||
140 | local item = (defined + Range + m.C(any)) / m.P | ||
141 | |||
142 | local Class = | ||
143 | "[" | ||
144 | * (m.C(m.P"^"^-1)) -- optional complement symbol | ||
145 | * m.Cf(item * (item - "]")^0, mt.__add) / | ||
146 | function (c, p) return c == "^" and any - p or p end | ||
147 | * "]" | ||
148 | |||
-- Store rule 'exp' under name 'k' in grammar table 't' and return 't'.
-- Raises an error when 't' already has a true value under 'k', i.e. when
-- the rule is defined twice.
local function adddef (t, k, exp)
  if t[k] then
    error("'"..k.."' already defined as a rule")
  end
  t[k] = exp
  return t
end
157 | |||
-- Start a new grammar table from its first rule: the table gets the rule
-- name 'n' at index 1 and the rule 'r' stored under 'n' (through 'adddef').
local function firstdef (n, r)
  local grammar = {n}
  return adddef(grammar, n, r)
end
159 | |||
160 | |||
-- Create a reference to the non-terminal named 'n'.
-- 'b' tells whether the reference appears inside a grammar; when it is
-- false (or nil) an error is raised instead.
local function NT (n, b)
  if b then
    return mm.V(n)
  end
  error("rule '"..n.."' used outside a grammar")
end
167 | |||
168 | |||
-- Grammar for the textual syntax of 're'. It is written with 'm' and its
-- captures build the resulting pattern with 'mm' and the pattern
-- metamethods in 'mt'. The initial rule is "Exp".
local exp = m.P{ "Exp",
  -- an expression is either a whole grammar or a list of sequences
  -- separated by "/", folded with the pattern '+' metamethod
  Exp = S * ( m.V"Grammar"
            + m.Cf(m.V"Seq" * ("/" * S * m.V"Seq")^0, mt.__add) );
  -- a sequence folds zero or more prefixes with the '*' metamethod,
  -- starting from the empty pattern; it must be followed by something in
  -- 'seq_follow', otherwise 'patt_error' is called
  Seq = m.Cf(m.Cc(m.P"") * m.V"Prefix"^0 , mt.__mul)
        * (#seq_follow + patt_error);
  -- "&p" applies the '#' metamethod and "!p" the unary minus metamethod
  Prefix = "&" * S * m.V"Prefix" / mt.__len
         + "!" * S * m.V"Prefix" / mt.__unm
         + m.V"Suffix";
  -- each suffix produces a pair (argument, function); the fold applies
  -- 'function(pattern so far, argument)' for every pair, left to right
  Suffix = m.Cf(m.V"Primary" * S *
          ( ( m.P"+" * m.Cc(1, mt.__pow)    -- p+  is  p^1
            + m.P"*" * m.Cc(0, mt.__pow)    -- p*  is  p^0
            + m.P"?" * m.Cc(-1, mt.__pow)   -- p?  is  p^-1
            -- "^n" goes through 'mult'; "^+n" and "^-n" pass the captured
            -- signed text to the '^' metamethod
            + "^" * ( m.Cg(num * m.Cc(mult))
                    + m.Cg(m.C(m.S"+-" * m.R"09"^1) * m.Cc(mt.__pow))
                    )
            -- "->" followed by a string or number uses the '/' metamethod;
            -- followed by "{}" it calls 'm.Ct' on the pattern; followed by
            -- a name it uses '/' with that name's definition
            + "->" * S * ( m.Cg((String + num) * m.Cc(mt.__div))
                         + m.P"{}" * m.Cc(nil, m.Ct)
                         + defwithfunc(mt.__div)
                         )
            + "=>" * S * defwithfunc(m.Cmt)   -- m.Cmt(p, definition)
            + "~>" * S * defwithfunc(m.Cf)    -- m.Cf(p, definition)
            ) * S
          )^0, function (a,b,f) return f(a,b) end );
  Primary = "(" * m.V"Exp" * ")"
            + String / mm.P
            + Class
            + defined
            -- "{:name: p :}" or "{: p :}": group capture; the name is nil
            -- when it is not given
            + "{:" * (name * ":" + m.Cc(nil)) * m.V"Exp" * ":}" /
                     function (n, p) return mm.Cg(p, n) end
            -- "=name": match-time check (through 'equalcap') against the
            -- value of the group captured earlier under that name
            + "=" * name / function (n) return mm.Cmt(mm.Cb(n), equalcap) end
            + m.P"{}" / mm.Cp
            + "{~" * m.V"Exp" * "~}" / mm.Cs
            + "{|" * m.V"Exp" * "|}" / mm.Ct
            + "{" * m.V"Exp" * "}" / mm.C
            + m.P"." * m.Cc(any)
            -- a name not followed by an arrow (or written as "<name>") is
            -- a reference to a non-terminal; the value of group "G" is
            -- passed to 'NT' as its second argument
            + (name * -arrow + "<" * name * ">") * m.Cb("G") / NT;
  Definition = name * arrow * m.V"Exp";
  -- a grammar sets group "G" to true and folds its definitions into one
  -- table: 'firstdef' creates the table from the first definition and
  -- 'adddef' adds each following one; the table is then given to 'mm.P'
  Grammar = m.Cg(m.Cc(true), "G") *
            m.Cf(m.V"Definition" / firstdef * m.Cg(m.V"Definition")^0,
              adddef) / mm.P
}
210 | |||
211 | local pattern = S * m.Cg(m.Cc(false), "G") * exp / mm.P * (-any + patt_error) | ||
212 | |||
213 | |||
-- Compile the pattern description 'p' into a pattern.
-- A value that already is a pattern is returned unchanged. Otherwise 'p'
-- is matched against the 're' syntax, with 'defs' passed as the extra
-- match argument. Raises "incorrect pattern" (error level 3) when that
-- match fails.
local function compile (p, defs)
  if mm.type(p) ~= "pattern" then
    p = pattern:match(p, 1, defs)
    if not p then error("incorrect pattern", 3) end
  end
  return p
end
220 | |||
-- Match pattern 'p' against subject 's', starting at position 'i'
-- (default 1). 'p' is compiled with 'compile' the first time it is seen
-- and the compiled pattern is memoized in 'mem'. Returns whatever the
-- compiled pattern's 'match' returns.
local function match (s, p, i)
  local compiled = mem[p]
  if not compiled then
    compiled = compile(p)
    mem[p] = compiled
  end
  local init = i or 1
  return compiled:match(s, init)
end
229 | |||
-- Search subject 's' for the first match of pattern 'p', starting at
-- position 'i' (default 1).
-- Returns the position where the match starts and the position where it
-- ends, or nil when the search pattern returns no start position.
-- The search pattern built for 'p' is memoized in 'fmem'.
local function find (s, p, i)
  local searcher = fmem[p]
  if not searcher then
    local body = compile(p) / 0
    -- try 'body' here, recording the positions before and after it;
    -- otherwise skip one character and try again
    searcher = mm.P{ mm.Cp() * body * mm.Cp() + 1 * mm.V(1) }
    fmem[p] = searcher
  end
  local first, after = searcher:match(s, i or 1)
  if not first then
    return first
  end
  -- 'after' is the position following the match
  return first, after - 1
end
242 | |||
-- Substitute 'rep' for the matches of pattern 'p' in subject 's'.
-- The substitution pattern tries 'p' at each position (replacing a match
-- by 'rep') and otherwise skips one character, all inside a substitution
-- capture. It is memoized per pattern and per replacement value:
-- 'gmem[p]' is a table indexed by 'rep'.
local function gsub (s, p, rep)
  -- keep the per-pattern table in a local so that it is not collected
  -- while this function runs
  local byrep = gmem[p] or {}
  gmem[p] = byrep
  local subst = byrep[rep]
  if not subst then
    local item = compile(p) / rep + 1
    subst = mm.Cs(item^0)
    byrep[rep] = subst
  end
  return subst:match(s)
end
254 | |||
255 | |||
256 | -- exported names | ||
257 | local re = { | ||
258 | compile = compile, | ||
259 | match = match, | ||
260 | find = find, | ||
261 | gsub = gsub, | ||
262 | updatelocale = updatelocale, | ||
263 | } | ||
264 | |||
265 | if version == "Lua 5.1" then _G.re = re end | ||
266 | |||
267 | return re | ||
diff --git a/test.lua b/test.lua new file mode 100755 index 0000000..51c5204 --- /dev/null +++ b/test.lua | |||
@@ -0,0 +1,1513 @@ | |||
1 | #!/usr/bin/env lua | ||
2 | |||
3 | -- $Id: test.lua,v 1.114 2018/06/04 16:21:19 roberto Exp $ | ||
4 | |||
5 | -- require"strict" -- just to be pedantic | ||
6 | |||
7 | local m = require"lpeg" | ||
8 | |||
9 | |||
10 | -- for general use | ||
11 | local a, b, c, d, e, f, g, p, t | ||
12 | |||
13 | |||
14 | -- compatibility with Lua 5.2 | ||
15 | local unpack = rawget(table, "unpack") or unpack | ||
16 | local loadstring = rawget(_G, "loadstring") or load | ||
17 | |||
18 | |||
19 | local any = m.P(1) | ||
20 | local space = m.S" \t\n"^0 | ||
21 | |||
-- Assert that 'x' and 'y' are equal. Tables are compared recursively, key
-- by key, in both directions (every key of 'x' against 'y' and every key
-- of 'y' against 'x'). When 'p' is true, each compared pair is printed.
local function checkeq (x, y, p)
  if p then print(x,y) end
  if type(x) == "table" then
    for k,v in pairs(x) do checkeq(v, y[k], p) end
    for k,v in pairs(y) do checkeq(v, x[k], p) end
  else
    assert(x == y)
  end
end
30 | |||
31 | |||
32 | local mt = getmetatable(m.P(1)) | ||
33 | |||
34 | |||
35 | local allchar = {} | ||
36 | for i=0,255 do allchar[i + 1] = i end | ||
37 | allchar = string.char(unpack(allchar)) | ||
38 | assert(#allchar == 256) | ||
39 | |||
-- Convert the character set 'c' into a string: run a substitution capture
-- over 'allchar' in which every character matched by 'c' is kept and any
-- other character is replaced by the empty string.
local function cs2str (c)
  local dropped = m.P(1) / ""
  local keepset = m.Cs((c + dropped)^0)
  return m.match(keepset, allchar)
end
43 | |||
-- Assert that the character sets 'c1' and 'c2' are the same set, by
-- comparing their string forms as produced by 'cs2str'.
local function eqcharset (c1, c2)
  local first = cs2str(c1)
  local second = cs2str(c2)
  assert(first == second)
end
47 | |||
48 | |||
49 | print"General tests for LPeg library" | ||
50 | |||
51 | assert(type(m.version()) == "string") | ||
52 | print("version " .. m.version()) | ||
53 | assert(m.type("alo") ~= "pattern") | ||
54 | assert(m.type(io.input) ~= "pattern") | ||
55 | assert(m.type(m.P"alo") == "pattern") | ||
56 | |||
57 | -- tests for some basic optimizations | ||
58 | assert(m.match(m.P(false) + "a", "a") == 2) | ||
59 | assert(m.match(m.P(true) + "a", "a") == 1) | ||
60 | assert(m.match("a" + m.P(false), "b") == nil) | ||
61 | assert(m.match("a" + m.P(true), "b") == 1) | ||
62 | |||
63 | assert(m.match(m.P(false) * "a", "a") == nil) | ||
64 | assert(m.match(m.P(true) * "a", "a") == 2) | ||
65 | assert(m.match("a" * m.P(false), "a") == nil) | ||
66 | assert(m.match("a" * m.P(true), "a") == 2) | ||
67 | |||
68 | assert(m.match(#m.P(false) * "a", "a") == nil) | ||
69 | assert(m.match(#m.P(true) * "a", "a") == 2) | ||
70 | assert(m.match("a" * #m.P(false), "a") == nil) | ||
71 | assert(m.match("a" * #m.P(true), "a") == 2) | ||
72 | |||
73 | |||
74 | -- tests for locale | ||
75 | do | ||
76 | assert(m.locale(m) == m) | ||
77 | local t = {} | ||
78 | assert(m.locale(t, m) == t) | ||
79 | local x = m.locale() | ||
80 | for n,v in pairs(x) do | ||
81 | assert(type(n) == "string") | ||
82 | eqcharset(v, m[n]) | ||
83 | end | ||
84 | end | ||
85 | |||
86 | |||
87 | assert(m.match(3, "aaaa")) | ||
88 | assert(m.match(4, "aaaa")) | ||
89 | assert(not m.match(5, "aaaa")) | ||
90 | assert(m.match(-3, "aa")) | ||
91 | assert(not m.match(-3, "aaa")) | ||
92 | assert(not m.match(-3, "aaaa")) | ||
93 | assert(not m.match(-4, "aaaa")) | ||
94 | assert(m.P(-5):match"aaaa") | ||
95 | |||
96 | assert(m.match("a", "alo") == 2) | ||
97 | assert(m.match("al", "alo") == 3) | ||
98 | assert(not m.match("alu", "alo")) | ||
99 | assert(m.match(true, "") == 1) | ||
100 | |||
101 | local digit = m.S"0123456789" | ||
102 | local upper = m.S"ABCDEFGHIJKLMNOPQRSTUVWXYZ" | ||
103 | local lower = m.S"abcdefghijklmnopqrstuvwxyz" | ||
104 | local letter = m.S"" + upper + lower | ||
105 | local alpha = letter + digit + m.R() | ||
106 | |||
107 | eqcharset(m.S"", m.P(false)) | ||
108 | eqcharset(upper, m.R("AZ")) | ||
109 | eqcharset(lower, m.R("az")) | ||
110 | eqcharset(upper + lower, m.R("AZ", "az")) | ||
111 | eqcharset(upper + lower, m.R("AZ", "cz", "aa", "bb", "90")) | ||
112 | eqcharset(digit, m.S"01234567" + "8" + "9") | ||
113 | eqcharset(upper, letter - lower) | ||
114 | eqcharset(m.S(""), m.R()) | ||
115 | assert(cs2str(m.S("")) == "") | ||
116 | |||
117 | eqcharset(m.S"\0", "\0") | ||
118 | eqcharset(m.S"\1\0\2", m.R"\0\2") | ||
119 | eqcharset(m.S"\1\0\2", m.R"\1\2" + "\0") | ||
120 | eqcharset(m.S"\1\0\2" - "\0", m.R"\1\2") | ||
121 | |||
122 | local word = alpha^1 * (1 - alpha)^0 | ||
123 | |||
124 | assert((word^0 * -1):match"alo alo") | ||
125 | assert(m.match(word^1 * -1, "alo alo")) | ||
126 | assert(m.match(word^2 * -1, "alo alo")) | ||
127 | assert(not m.match(word^3 * -1, "alo alo")) | ||
128 | |||
129 | assert(not m.match(word^-1 * -1, "alo alo")) | ||
130 | assert(m.match(word^-2 * -1, "alo alo")) | ||
131 | assert(m.match(word^-3 * -1, "alo alo")) | ||
132 | |||
133 | local eos = m.P(-1) | ||
134 | |||
135 | assert(m.match(digit^0 * letter * digit * eos, "1298a1")) | ||
136 | assert(not m.match(digit^0 * letter * eos, "1257a1")) | ||
137 | |||
138 | b = { | ||
139 | [1] = "(" * (((1 - m.S"()") + #m.P"(" * m.V(1))^0) * ")" | ||
140 | } | ||
141 | |||
142 | assert(m.match(b, "(al())()")) | ||
143 | assert(not m.match(b * eos, "(al())()")) | ||
144 | assert(m.match(b * eos, "((al())()(é))")) | ||
145 | assert(not m.match(b, "(al()()")) | ||
146 | |||
147 | assert(not m.match(letter^1 - "for", "foreach")) | ||
148 | assert(m.match(letter^1 - ("for" * eos), "foreach")) | ||
149 | assert(not m.match(letter^1 - ("for" * eos), "for")) | ||
150 | |||
-- Return a grammar that searches for pattern 'p': its only rule tries 'p'
-- and, failing that, consumes one character and calls itself again.
function basiclookfor (p)
  local grammar = {}
  grammar[1] = p + (1 * m.V(1))
  return m.P(grammar)
end
156 | |||
-- Same as 'basiclookfor', but searching for the capturing version of 'p'
-- (the result of its 'C' method).
function caplookfor (p)
  local captured = p:C()
  return basiclookfor(captured)
end
160 | |||
161 | assert(m.match(caplookfor(letter^1), " 4achou123...") == "achou") | ||
162 | a = {m.match(caplookfor(letter^1)^0, " two words, one more ")} | ||
163 | checkeq(a, {"two", "words", "one", "more"}) | ||
164 | |||
165 | assert(m.match( basiclookfor((#m.P(b) * 1) * m.Cp()), " ( (a)") == 7) | ||
166 | |||
167 | a = {m.match(m.C(digit^1 * m.Cc"d") + m.C(letter^1 * m.Cc"l"), "123")} | ||
168 | checkeq(a, {"123", "d"}) | ||
169 | |||
170 | -- bug in LPeg 0.12 (nil value does not create a 'ktable') | ||
171 | assert(m.match(m.Cc(nil), "") == nil) | ||
172 | |||
173 | a = {m.match(m.C(digit^1 * m.Cc"d") + m.C(letter^1 * m.Cc"l"), "abcd")} | ||
174 | checkeq(a, {"abcd", "l"}) | ||
175 | |||
176 | a = {m.match(m.Cc(10,20,30) * 'a' * m.Cp(), 'aaa')} | ||
177 | checkeq(a, {10,20,30,2}) | ||
178 | a = {m.match(m.Cp() * m.Cc(10,20,30) * 'a' * m.Cp(), 'aaa')} | ||
179 | checkeq(a, {1,10,20,30,2}) | ||
180 | a = m.match(m.Ct(m.Cp() * m.Cc(10,20,30) * 'a' * m.Cp()), 'aaa') | ||
181 | checkeq(a, {1,10,20,30,2}) | ||
182 | a = m.match(m.Ct(m.Cp() * m.Cc(7,8) * m.Cc(10,20,30) * 'a' * m.Cp()), 'aaa') | ||
183 | checkeq(a, {1,7,8,10,20,30,2}) | ||
184 | a = {m.match(m.Cc() * m.Cc() * m.Cc(1) * m.Cc(2,3,4) * m.Cc() * 'a', 'aaa')} | ||
185 | checkeq(a, {1,2,3,4}) | ||
186 | |||
187 | a = {m.match(m.Cp() * letter^1 * m.Cp(), "abcd")} | ||
188 | checkeq(a, {1, 5}) | ||
189 | |||
190 | |||
191 | t = {m.match({[1] = m.C(m.C(1) * m.V(1) + -1)}, "abc")} | ||
192 | checkeq(t, {"abc", "a", "bc", "b", "c", "c", ""}) | ||
193 | |||
194 | -- bug in 0.12 ('hascapture' did not check for captures inside a rule) | ||
195 | do | ||
196 | local pat = m.P{ | ||
197 | 'S'; | ||
198 | S1 = m.C('abc') + 3, | ||
199 | S = #m.V('S1') -- rule has capture, but '#' must ignore it | ||
200 | } | ||
201 | assert(pat:match'abc' == 1) | ||
202 | end | ||
203 | |||
204 | |||
205 | -- bug: loop in 'hascaptures' | ||
206 | do | ||
207 | local p = m.C(-m.P{m.P'x' * m.V(1) + m.P'y'}) | ||
208 | assert(p:match("xxx") == "") | ||
209 | end | ||
210 | |||
211 | |||
212 | |||
213 | -- test for small capture boundary | ||
214 | for i = 250,260 do | ||
215 | assert(#m.match(m.C(i), string.rep('a', i)) == i) | ||
216 | assert(#m.match(m.C(m.C(i)), string.rep('a', i)) == i) | ||
217 | end | ||
218 | |||
219 | -- tests for any*n and any*-n | ||
220 | for n = 1, 550, 13 do | ||
221 | local x_1 = string.rep('x', n - 1) | ||
222 | local x = x_1 .. 'a' | ||
223 | assert(not m.P(n):match(x_1)) | ||
224 | assert(m.P(n):match(x) == n + 1) | ||
225 | assert(n < 4 or m.match(m.P(n) + "xxx", x_1) == 4) | ||
226 | assert(m.C(n):match(x) == x) | ||
227 | assert(m.C(m.C(n)):match(x) == x) | ||
228 | assert(m.P(-n):match(x_1) == 1) | ||
229 | assert(not m.P(-n):match(x)) | ||
230 | assert(n < 13 or m.match(m.Cc(20) * ((n - 13) * m.P(10)) * 3, x) == 20) | ||
231 | local n3 = math.floor(n/3) | ||
232 | assert(m.match(n3 * m.Cp() * n3 * n3, x) == n3 + 1) | ||
233 | end | ||
234 | |||
235 | -- true values | ||
236 | assert(m.P(0):match("x") == 1) | ||
237 | assert(m.P(0):match("") == 1) | ||
238 | assert(m.C(0):match("x") == "") | ||
239 | |||
240 | assert(m.match(m.Cc(0) * m.P(10) + m.Cc(1) * "xuxu", "xuxu") == 1) | ||
241 | assert(m.match(m.Cc(0) * m.P(10) + m.Cc(1) * "xuxu", "xuxuxuxuxu") == 0) | ||
242 | assert(m.match(m.C(m.P(2)^1), "abcde") == "abcd") | ||
243 | p = m.Cc(0) * 1 + m.Cc(1) * 2 + m.Cc(2) * 3 + m.Cc(3) * 4 | ||
244 | |||
245 | |||
246 | -- test for alternation optimization | ||
247 | assert(m.match(m.P"a"^1 + "ab" + m.P"x"^0, "ab") == 2) | ||
248 | assert(m.match((m.P"a"^1 + "ab" + m.P"x"^0 * 1)^0, "ab") == 3) | ||
249 | assert(m.match(m.P"ab" + "cd" + "" + "cy" + "ak", "98") == 1) | ||
250 | assert(m.match(m.P"ab" + "cd" + "ax" + "cy", "ax") == 3) | ||
251 | assert(m.match("a" * m.P"b"^0 * "c" + "cd" + "ax" + "cy", "ax") == 3) | ||
252 | assert(m.match((m.P"ab" + "cd" + "ax" + "cy")^0, "ax") == 3) | ||
253 | assert(m.match(m.P(1) * "x" + m.S"" * "xu" + "ay", "ay") == 3) | ||
254 | assert(m.match(m.P"abc" + "cde" + "aka", "aka") == 4) | ||
255 | assert(m.match(m.S"abc" * "x" + "cde" + "aka", "ax") == 3) | ||
256 | assert(m.match(m.S"abc" * "x" + "cde" + "aka", "aka") == 4) | ||
257 | assert(m.match(m.S"abc" * "x" + "cde" + "aka", "cde") == 4) | ||
258 | assert(m.match(m.S"abc" * "x" + "ide" + m.S"ab" * "ka", "aka") == 4) | ||
259 | assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "cde" + "aka", "ax") == 3) | ||
260 | assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "cde" + "aka", "aka") == 4) | ||
261 | assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "cde" + "aka", "cde") == 4) | ||
262 | assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "ide" + m.S"ab" * "ka", "aka") == 4) | ||
263 | assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "ide" + m.S"ab" * "ka", "ax") == 3) | ||
264 | assert(m.match(m.P(1) * "x" + "cde" + m.S"ab" * "ka", "aka") == 4) | ||
265 | assert(m.match(m.P(1) * "x" + "cde" + m.P(1) * "ka", "aka") == 4) | ||
266 | assert(m.match(m.P(1) * "x" + "cde" + m.P(1) * "ka", "cde") == 4) | ||
267 | assert(m.match(m.P"eb" + "cd" + m.P"e"^0 + "x", "ee") == 3) | ||
268 | assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x", "abcd") == 3) | ||
269 | assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x", "eeex") == 4) | ||
270 | assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x", "cd") == 3) | ||
271 | assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x", "x") == 1) | ||
272 | assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x" + "", "zee") == 1) | ||
273 | assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x", "abcd") == 3) | ||
274 | assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x", "eeex") == 4) | ||
275 | assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x", "cd") == 3) | ||
276 | assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x", "x") == 2) | ||
277 | assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x" + "", "zee") == 1) | ||
278 | assert(not m.match(("aa" * m.P"bc"^-1 + "aab") * "e", "aabe")) | ||
279 | |||
280 | assert(m.match("alo" * (m.P"\n" + -1), "alo") == 4) | ||
281 | |||
282 | |||
283 | -- bug in 0.12 (rc1) | ||
284 | assert(m.match((m.P"\128\187\191" + m.S"abc")^0, "\128\187\191") == 4) | ||
285 | |||
286 | assert(m.match(m.S"\0\128\255\127"^0, string.rep("\0\128\255\127", 10)) == | ||
287 | 4*10 + 1) | ||
288 | |||
289 | -- optimizations with optional parts | ||
290 | assert(m.match(("ab" * -m.P"c")^-1, "abc") == 1) | ||
291 | assert(m.match(("ab" * #m.P"c")^-1, "abd") == 1) | ||
292 | assert(m.match(("ab" * m.B"c")^-1, "ab") == 1) | ||
293 | assert(m.match(("ab" * m.P"cd"^0)^-1, "abcdcdc") == 7) | ||
294 | |||
295 | assert(m.match(m.P"ab"^-1 - "c", "abcd") == 3) | ||
296 | |||
297 | p = ('Aa' * ('Bb' * ('Cc' * m.P'Dd'^0)^0)^0)^-1 | ||
298 | assert(p:match("AaBbCcDdBbCcDdDdDdBb") == 21) | ||
299 | |||
300 | |||
301 | -- bug in 0.12.2 | ||
302 | -- p = { ('ab' ('c' 'ef'?)*)? } | ||
303 | p = m.C(('ab' * ('c' * m.P'ef'^-1)^0)^-1) | ||
304 | s = "abcefccefc" | ||
305 | assert(s == p:match(s)) | ||
306 | |||
307 | |||
308 | pi = "3.14159 26535 89793 23846 26433 83279 50288 41971 69399 37510" | ||
309 | assert(m.match(m.Cs((m.P"1" / "a" + m.P"5" / "b" + m.P"9" / "c" + 1)^0), pi) == | ||
310 | m.match(m.Cs((m.P(1) / {["1"] = "a", ["5"] = "b", ["9"] = "c"})^0), pi)) | ||
311 | print"+" | ||
312 | |||
313 | |||
314 | -- tests for capture optimizations | ||
315 | assert(m.match((m.P(3) + 4 * m.Cp()) * "a", "abca") == 5) | ||
316 | t = {m.match(((m.P"a" + m.Cp()) * m.P"x")^0, "axxaxx")} | ||
317 | checkeq(t, {3, 6}) | ||
318 | |||
319 | |||
320 | -- tests for numbered captures | ||
321 | p = m.C(1) | ||
322 | assert(m.match(m.C(m.C(p * m.C(2)) * m.C(3)) / 3, "abcdefgh") == "a") | ||
323 | assert(m.match(m.C(m.C(p * m.C(2)) * m.C(3)) / 1, "abcdefgh") == "abcdef") | ||
324 | assert(m.match(m.C(m.C(p * m.C(2)) * m.C(3)) / 4, "abcdefgh") == "bc") | ||
325 | assert(m.match(m.C(m.C(p * m.C(2)) * m.C(3)) / 0, "abcdefgh") == 7) | ||
326 | |||
327 | a, b, c = m.match(p * (m.C(p * m.C(2)) * m.C(3) / 4) * p, "abcdefgh") | ||
328 | assert(a == "a" and b == "efg" and c == "h") | ||
329 | |||
330 | -- test for table captures | ||
331 | t = m.match(m.Ct(letter^1), "alo") | ||
332 | checkeq(t, {}) | ||
333 | |||
334 | t, n = m.match(m.Ct(m.C(letter)^1) * m.Cc"t", "alo") | ||
335 | assert(n == "t" and table.concat(t) == "alo") | ||
336 | |||
337 | t = m.match(m.Ct(m.C(m.C(letter)^1)), "alo") | ||
338 | assert(table.concat(t, ";") == "alo;a;l;o") | ||
339 | |||
340 | t = m.match(m.Ct(m.C(m.C(letter)^1)), "alo") | ||
341 | assert(table.concat(t, ";") == "alo;a;l;o") | ||
342 | |||
343 | t = m.match(m.Ct(m.Ct((m.Cp() * letter * m.Cp())^1)), "alo") | ||
344 | assert(table.concat(t[1], ";") == "1;2;2;3;3;4") | ||
345 | |||
346 | t = m.match(m.Ct(m.C(m.C(1) * 1 * m.C(1))), "alo") | ||
347 | checkeq(t, {"alo", "a", "o"}) | ||
348 | |||
349 | |||
350 | -- tests for groups | ||
351 | p = m.Cg(1) -- no capture | ||
352 | assert(p:match('x') == 'x') | ||
353 | p = m.Cg(m.P(true)/function () end * 1) -- no value | ||
354 | assert(p:match('x') == 'x') | ||
355 | p = m.Cg(m.Cg(m.Cg(m.C(1)))) | ||
356 | assert(p:match('x') == 'x') | ||
357 | p = m.Cg(m.Cg(m.Cg(m.C(1))^0) * m.Cg(m.Cc(1) * m.Cc(2))) | ||
358 | t = {p:match'abc'} | ||
359 | checkeq(t, {'a', 'b', 'c', 1, 2}) | ||
360 | |||
361 | p = m.Ct(m.Cg(m.Cc(10), "hi") * m.C(1)^0 * m.Cg(m.Cc(20), "ho")) | ||
362 | t = p:match'' | ||
363 | checkeq(t, {hi = 10, ho = 20}) | ||
364 | t = p:match'abc' | ||
365 | checkeq(t, {hi = 10, ho = 20, 'a', 'b', 'c'}) | ||
366 | |||
367 | -- non-string group names | ||
368 | p = m.Ct(m.Cg(1, print) * m.Cg(1, 23.5) * m.Cg(1, io)) | ||
369 | t = p:match('abcdefghij') | ||
370 | assert(t[print] == 'a' and t[23.5] == 'b' and t[io] == 'c') | ||
371 | |||
372 | |||
373 | -- test for error messages | ||
-- Assert that calling 'f' with the given arguments raises an error and
-- that the text 'msg' occurs somewhere in the error message.
local function checkerr (msg, f, ...)
  local ok, err = pcall(f, ...)
  assert(not ok)
  -- grammar that looks for 'msg' at any position of the subject
  assert(m.match({ m.P(msg) + 1 * m.V(1) }, err))
end
378 | |||
379 | checkerr("rule '1' may be left recursive", m.match, { m.V(1) * 'a' }, "a") | ||
380 | checkerr("rule '1' used outside a grammar", m.match, m.V(1), "") | ||
381 | checkerr("rule 'hiii' used outside a grammar", m.match, m.V('hiii'), "") | ||
382 | checkerr("rule 'hiii' undefined in given grammar", m.match, { m.V('hiii') }, "") | ||
383 | checkerr("undefined in given grammar", m.match, { m.V{} }, "") | ||
384 | |||
385 | checkerr("rule 'A' is not a pattern", m.P, { m.P(1), A = {} }) | ||
386 | checkerr("grammar has no initial rule", m.P, { [print] = {} }) | ||
387 | |||
388 | -- grammar with a long call chain before left recursion | ||
389 | p = {'a', | ||
390 | a = m.V'b' * m.V'c' * m.V'd' * m.V'a', | ||
391 | b = m.V'c', | ||
392 | c = m.V'd', | ||
393 | d = m.V'e', | ||
394 | e = m.V'f', | ||
395 | f = m.V'g', | ||
396 | g = m.P'' | ||
397 | } | ||
398 | checkerr("rule 'a' may be left recursive", m.match, p, "a") | ||
399 | |||
400 | -- Bug in peephole optimization of LPeg 0.12 (IJmp -> ICommit) | ||
401 | -- the next grammar has an original sequence IJmp -> ICommit -> IJmp L1 | ||
402 | -- that is optimized to ICommit L1 | ||
403 | |||
404 | p = m.P { (m.P {m.P'abc'} + 'ayz') * m.V'y'; y = m.P'x' } | ||
405 | assert(p:match('abcx') == 5 and p:match('ayzx') == 5 and not p:match'abc') | ||
406 | |||
407 | |||
408 | do | ||
409 | -- large dynamic Cc | ||
410 | local lim = 2^16 - 1 | ||
411 | local c = 0 | ||
412 | local function seq (n) | ||
413 | if n == 1 then c = c + 1; return m.Cc(c) | ||
414 | else | ||
415 | local m = math.floor(n / 2) | ||
416 | return seq(m) * seq(n - m) | ||
417 | end | ||
418 | end | ||
419 | p = m.Ct(seq(lim)) | ||
420 | t = p:match('') | ||
421 | assert(t[lim] == lim) | ||
422 | checkerr("too many", function () p = p / print end) | ||
423 | checkerr("too many", seq, lim + 1) | ||
424 | end | ||
425 | |||
426 | |||
427 | -- tests for non-pattern as arguments to pattern functions | ||
428 | |||
429 | p = { ('a' * m.V(1))^-1 } * m.P'b' * { 'a' * m.V(2); m.V(1)^-1 } | ||
430 | assert(m.match(p, "aaabaac") == 7) | ||
431 | |||
432 | p = m.P'abc' * 2 * -5 * true * 'de' -- mix of numbers and strings and booleans | ||
433 | |||
434 | assert(p:match("abc01de") == 8) | ||
435 | assert(p:match("abc01de3456") == nil) | ||
436 | |||
437 | p = 'abc' * (2 * (-5 * (true * m.P'de'))) | ||
438 | |||
439 | assert(p:match("abc01de") == 8) | ||
440 | assert(p:match("abc01de3456") == nil) | ||
441 | |||
442 | p = { m.V(2), m.P"abc" } * | ||
443 | (m.P{ "xx", xx = m.P"xx" } + { "x", x = m.P"a" * m.V"x" + "" }) | ||
444 | assert(p:match("abcaaaxx") == 7) | ||
445 | assert(p:match("abcxx") == 6) | ||
446 | |||
447 | |||
448 | -- a large table capture | ||
449 | t = m.match(m.Ct(m.C('a')^0), string.rep("a", 10000)) | ||
450 | assert(#t == 10000 and t[1] == 'a' and t[#t] == 'a') | ||
451 | |||
452 | print('+') | ||
453 | |||
454 | |||
455 | -- bug in 0.10 (rechecking a grammar, after tail-call optimization) | ||
456 | m.P{ m.P { (m.P(3) + "xuxu")^0 * m.V"xuxu", xuxu = m.P(1) } } | ||
457 | |||
458 | local V = m.V | ||
459 | |||
460 | local Space = m.S(" \n\t")^0 | ||
461 | local Number = m.C(m.R("09")^1) * Space | ||
462 | local FactorOp = m.C(m.S("+-")) * Space | ||
463 | local TermOp = m.C(m.S("*/")) * Space | ||
464 | local Open = "(" * Space | ||
465 | local Close = ")" * Space | ||
466 | |||
467 | |||
468 | local function f_factor (v1, op, v2, d) | ||
469 | assert(d == nil) | ||
470 | if op == "+" then return v1 + v2 | ||
471 | else return v1 - v2 | ||
472 | end | ||
473 | end | ||
474 | |||
475 | |||
476 | local function f_term (v1, op, v2, d) | ||
477 | assert(d == nil) | ||
478 | if op == "*" then return v1 * v2 | ||
479 | else return v1 / v2 | ||
480 | end | ||
481 | end | ||
482 | |||
483 | G = m.P{ "Exp", | ||
484 | Exp = m.Cf(V"Factor" * m.Cg(FactorOp * V"Factor")^0, f_factor); | ||
485 | Factor = m.Cf(V"Term" * m.Cg(TermOp * V"Term")^0, f_term); | ||
486 | Term = Number / tonumber + Open * V"Exp" * Close; | ||
487 | } | ||
488 | |||
489 | G = Space * G * -1 | ||
490 | |||
491 | for _, s in ipairs{" 3 + 5*9 / (1+1) ", "3+4/2", "3+3-3- 9*2+3*9/1- 8"} do | ||
492 | assert(m.match(G, s) == loadstring("return "..s)()) | ||
493 | end | ||
494 | |||
495 | |||
496 | -- test for grammars (errors deep in calling non-terminals) | ||
497 | g = m.P{ | ||
498 | [1] = m.V(2) + "a", | ||
499 | [2] = "a" * m.V(3) * "x", | ||
500 | [3] = "b" * m.V(3) + "c" | ||
501 | } | ||
502 | |||
503 | assert(m.match(g, "abbbcx") == 7) | ||
504 | assert(m.match(g, "abbbbx") == 2) | ||
505 | |||
506 | |||
507 | -- tests for \0 | ||
508 | assert(m.match(m.R("\0\1")^1, "\0\1\0") == 4) | ||
509 | assert(m.match(m.S("\0\1ab")^1, "\0\1\0a") == 5) | ||
510 | assert(m.match(m.P(1)^3, "\0\1\0a") == 5) | ||
511 | assert(not m.match(-4, "\0\1\0a")) | ||
512 | assert(m.match("\0\1\0a", "\0\1\0a") == 5) | ||
513 | assert(m.match("\0\0\0", "\0\0\0") == 4) | ||
514 | assert(not m.match("\0\0\0", "\0\0")) | ||
515 | |||
516 | |||
517 | -- tests for predicates | ||
518 | assert(not m.match(-m.P("a") * 2, "alo")) | ||
519 | assert(m.match(- -m.P("a") * 2, "alo") == 3) | ||
520 | assert(m.match(#m.P("a") * 2, "alo") == 3) | ||
521 | assert(m.match(##m.P("a") * 2, "alo") == 3) | ||
522 | assert(not m.match(##m.P("c") * 2, "alo")) | ||
523 | assert(m.match(m.Cs((##m.P("a") * 1 + m.P(1)/".")^0), "aloal") == "a..a.") | ||
524 | assert(m.match(m.Cs((#((#m.P"a")/"") * 1 + m.P(1)/".")^0), "aloal") == "a..a.") | ||
525 | assert(m.match(m.Cs((- -m.P("a") * 1 + m.P(1)/".")^0), "aloal") == "a..a.") | ||
526 | assert(m.match(m.Cs((-((-m.P"a")/"") * 1 + m.P(1)/".")^0), "aloal") == "a..a.") | ||
527 | |||
528 | |||
529 | -- fixed length | ||
530 | do | ||
531 | -- 'and' predicate using fixed length | ||
532 | local p = m.C(#("a" * (m.P("bd") + "cd")) * 2) | ||
533 | assert(p:match("acd") == "ac") | ||
534 | |||
535 | p = #m.P{ "a" * m.V(2), m.P"b" } * 2 | ||
536 | assert(p:match("abc") == 3) | ||
537 | |||
538 | p = #(m.P"abc" * m.B"c") | ||
539 | assert(p:match("abc") == 1 and not p:match("ab")) | ||
540 | |||
541 | p = m.P{ "a" * m.V(2), m.P"b"^1 } | ||
542 | checkerr("pattern may not have fixed length", m.B, p) | ||
543 | |||
544 | p = "abc" * (m.P"b"^1 + m.P"a"^0) | ||
545 | checkerr("pattern may not have fixed length", m.B, p) | ||
546 | end | ||
547 | |||
548 | |||
549 | p = -m.P'a' * m.Cc(1) + -m.P'b' * m.Cc(2) + -m.P'c' * m.Cc(3) | ||
550 | assert(p:match('a') == 2 and p:match('') == 1 and p:match('b') == 1) | ||
551 | |||
552 | p = -m.P'a' * m.Cc(10) + #m.P'a' * m.Cc(20) | ||
553 | assert(p:match('a') == 20 and p:match('') == 10 and p:match('b') == 10) | ||
554 | |||
555 | |||
556 | |||
557 | -- look-behind predicate | ||
558 | assert(not m.match(m.B'a', 'a')) | ||
559 | assert(m.match(1 * m.B'a', 'a') == 2) | ||
560 | assert(not m.match(m.B(1), 'a')) | ||
561 | assert(m.match(1 * m.B(1), 'a') == 2) | ||
562 | assert(m.match(-m.B(1), 'a') == 1) | ||
563 | assert(m.match(m.B(250), string.rep('a', 250)) == nil) | ||
564 | assert(m.match(250 * m.B(250), string.rep('a', 250)) == 251) | ||
565 | |||
566 | -- look-behind with an open call | ||
567 | checkerr("pattern may not have fixed length", m.B, m.V'S1') | ||
568 | checkerr("too long to look behind", m.B, 260) | ||
569 | |||
570 | B = #letter * -m.B(letter) + -letter * m.B(letter) | ||
571 | x = m.Ct({ (B * m.Cp())^-1 * (1 * m.V(1) + m.P(true)) }) | ||
572 | checkeq(m.match(x, 'ar cal c'), {1,3,4,7,9,10}) | ||
573 | checkeq(m.match(x, ' ar cal '), {2,4,5,8}) | ||
574 | checkeq(m.match(x, ' '), {}) | ||
575 | checkeq(m.match(x, 'aloalo'), {1,7}) | ||
576 | |||
577 | assert(m.match(B, "a") == 1) | ||
578 | assert(m.match(1 * B, "a") == 2) | ||
579 | assert(not m.B(1 - letter):match("")) | ||
580 | assert((-m.B(letter)):match("") == 1) | ||
581 | |||
582 | assert((4 * m.B(letter, 4)):match("aaaaaaaa") == 5) | ||
583 | assert(not (4 * m.B(#letter * 5)):match("aaaaaaaa")) | ||
584 | assert((4 * -m.B(#letter * 5)):match("aaaaaaaa") == 5) | ||
585 | |||
586 | -- look-behind with grammars | ||
587 | assert(m.match('a' * m.B{'x', x = m.P(3)}, 'aaa') == nil) | ||
588 | assert(m.match('aa' * m.B{'x', x = m.P('aaa')}, 'aaaa') == nil) | ||
589 | assert(m.match('aaa' * m.B{'x', x = m.P('aaa')}, 'aaaaa') == 4) | ||
590 | |||
591 | |||
592 | |||
593 | -- bug in 0.9 | ||
594 | assert(m.match(('a' * #m.P'b'), "ab") == 2) | ||
595 | assert(not m.match(('a' * #m.P'b'), "a")) | ||
596 | |||
597 | assert(not m.match(#m.S'567', "")) | ||
598 | assert(m.match(#m.S'567' * 1, "6") == 2) | ||
599 | |||
600 | |||
601 | -- tests for Tail Calls | ||
602 | |||
603 | p = m.P{ 'a' * m.V(1) + '' } | ||
604 | assert(p:match(string.rep('a', 1000)) == 1001) | ||
605 | |||
606 | -- create a grammar for a simple DFA for an even number of 0s and 1s | ||
607 | -- | ||
608 | -- ->1 <---0---> 2 | ||
609 | -- ^ ^ | ||
610 | -- | | | ||
611 | -- 1 1 | ||
612 | -- | | | ||
613 | -- V V | ||
614 | -- 3 <---0---> 4 | ||
615 | -- | ||
616 | -- this grammar should keep no backtracking information | ||
617 | |||
618 | p = m.P{ | ||
619 | [1] = '0' * m.V(2) + '1' * m.V(3) + -1, | ||
620 | [2] = '0' * m.V(1) + '1' * m.V(4), | ||
621 | [3] = '0' * m.V(4) + '1' * m.V(1), | ||
622 | [4] = '0' * m.V(3) + '1' * m.V(2), | ||
623 | } | ||
624 | |||
625 | assert(p:match(string.rep("00", 10000))) | ||
626 | assert(p:match(string.rep("01", 10000))) | ||
627 | assert(p:match(string.rep("011", 10000))) | ||
628 | assert(not p:match(string.rep("011", 10000) .. "1")) | ||
629 | assert(not p:match(string.rep("011", 10001))) | ||
630 | |||
631 | |||
632 | -- this grammar does need backtracking info. | ||
633 | local lim = 10000 | ||
634 | p = m.P{ '0' * m.V(1) + '0' } | ||
635 | checkerr("stack overflow", m.match, p, string.rep("0", lim)) | ||
636 | m.setmaxstack(2*lim) | ||
637 | checkerr("stack overflow", m.match, p, string.rep("0", lim)) | ||
638 | m.setmaxstack(2*lim + 4) | ||
639 | assert(m.match(p, string.rep("0", lim)) == lim + 1) | ||
640 | |||
641 | -- this repetition should not need stack space (only the call does) | ||
642 | p = m.P{ ('a' * m.V(1))^0 * 'b' + 'c' } | ||
643 | m.setmaxstack(200) | ||
644 | assert(p:match(string.rep('a', 180) .. 'c' .. string.rep('b', 180)) == 362) | ||
645 | |||
646 | m.setmaxstack(100) -- restore low limit | ||
647 | |||
648 | -- tests for optional start position | ||
649 | assert(m.match("a", "abc", 1)) | ||
650 | assert(m.match("b", "abc", 2)) | ||
651 | assert(m.match("c", "abc", 3)) | ||
652 | assert(not m.match(1, "abc", 4)) | ||
653 | assert(m.match("a", "abc", -3)) | ||
654 | assert(m.match("b", "abc", -2)) | ||
655 | assert(m.match("c", "abc", -1)) | ||
656 | assert(m.match("abc", "abc", -4)) -- truncate to position 1 | ||
657 | |||
658 | assert(m.match("", "abc", 10)) -- empty string is everywhere! | ||
659 | assert(m.match("", "", 10)) | ||
660 | assert(not m.match(1, "", 1)) | ||
661 | assert(not m.match(1, "", -1)) | ||
662 | assert(not m.match(1, "", 0)) | ||
663 | |||
664 | print("+") | ||
665 | |||
666 | |||
667 | -- tests for argument captures | ||
668 | checkerr("invalid argument", m.Carg, 0) | ||
669 | checkerr("invalid argument", m.Carg, -1) | ||
670 | checkerr("invalid argument", m.Carg, 2^18) | ||
671 | checkerr("absent extra argument #1", m.match, m.Carg(1), 'a', 1) | ||
672 | assert(m.match(m.Carg(1), 'a', 1, print) == print) | ||
673 | x = {m.match(m.Carg(1) * m.Carg(2), '', 1, 10, 20)} | ||
674 | checkeq(x, {10, 20}) | ||
675 | |||
676 | assert(m.match(m.Cmt(m.Cg(m.Carg(3), "a") * | ||
677 | m.Cmt(m.Cb("a"), function (s,i,x) | ||
678 | assert(s == "a" and i == 1); | ||
679 | return i, x+1 | ||
680 | end) * | ||
681 | m.Carg(2), function (s,i,a,b,c) | ||
682 | assert(s == "a" and i == 1 and c == nil); | ||
683 | return i, 2*a + 3*b | ||
684 | end) * "a", | ||
685 | "a", 1, false, 100, 1000) == 2*1001 + 3*100) | ||
686 | |||
687 | |||
688 | -- tests for Lua functions | ||
689 | |||
690 | t = {} | ||
691 | s = "" | ||
692 | p = m.P(function (s1, i) assert(s == s1); t[#t + 1] = i; return nil end) * false | ||
693 | s = "hi, this is a test" | ||
694 | assert(m.match(((p - m.P(-1)) + 2)^0, s) == string.len(s) + 1) | ||
695 | assert(#t == string.len(s)/2 and t[1] == 1 and t[2] == 3) | ||
696 | |||
697 | assert(not m.match(p, s)) | ||
698 | |||
699 | p = mt.__add(function (s, i) return i end, function (s, i) return nil end) | ||
700 | assert(m.match(p, "alo")) | ||
701 | |||
702 | p = mt.__mul(function (s, i) return i end, function (s, i) return nil end) | ||
703 | assert(not m.match(p, "alo")) | ||
704 | |||
705 | |||
706 | t = {} | ||
707 | p = function (s1, i) assert(s == s1); t[#t + 1] = i; return i end | ||
708 | s = "hi, this is a test" | ||
709 | assert(m.match((m.P(1) * p)^0, s) == string.len(s) + 1) | ||
710 | assert(#t == string.len(s) and t[1] == 2 and t[2] == 3) | ||
711 | |||
712 | t = {} | ||
713 | p = m.P(function (s1, i) assert(s == s1); t[#t + 1] = i; | ||
714 | return i <= s1:len() and i end) * 1 | ||
715 | s = "hi, this is a test" | ||
716 | assert(m.match(p^0, s) == string.len(s) + 1) | ||
717 | assert(#t == string.len(s) + 1 and t[1] == 1 and t[2] == 2) | ||
718 | |||
719 | p = function (s1, i) return m.match(m.P"a"^1, s1, i) end | ||
720 | assert(m.match(p, "aaaa") == 5) | ||
721 | assert(m.match(p, "abaa") == 2) | ||
722 | assert(not m.match(p, "baaa")) | ||
723 | |||
724 | checkerr("invalid position", m.match, function () return 2^20 end, s) | ||
725 | checkerr("invalid position", m.match, function () return 0 end, s) | ||
726 | checkerr("invalid position", m.match, function (s, i) return i - 1 end, s) | ||
727 | checkerr("invalid position", m.match, | ||
728 | m.P(1)^0 * function (_, i) return i - 1 end, s) | ||
729 | assert(m.match(m.P(1)^0 * function (_, i) return i end * -1, s)) | ||
730 | checkerr("invalid position", m.match, | ||
731 | m.P(1)^0 * function (_, i) return i + 1 end, s) | ||
732 | assert(m.match(m.P(function (s, i) return s:len() + 1 end) * -1, s)) | ||
733 | checkerr("invalid position", m.match, m.P(function (s, i) return s:len() + 2 end) * -1, s) | ||
734 | assert(not m.match(m.P(function (s, i) return s:len() end) * -1, s)) | ||
735 | assert(m.match(m.P(1)^0 * function (_, i) return true end, s) == | ||
736 | string.len(s) + 1) | ||
737 | for i = 1, string.len(s) + 1 do | ||
738 | assert(m.match(function (_, _) return i end, s) == i) | ||
739 | end | ||
740 | |||
741 | p = (m.P(function (s, i) return i%2 == 0 and i end) * 1 | ||
742 | + m.P(function (s, i) return i%2 ~= 0 and i + 2 <= s:len() and i end) * 3)^0 | ||
743 | * -1 | ||
744 | assert(p:match(string.rep('a', 14000))) | ||
745 | |||
746 | -- tests for Function Replacements | ||
747 | f = function (a, ...) if a ~= "x" then return {a, ...} end end | ||
748 | |||
749 | t = m.match(m.C(1)^0/f, "abc") | ||
750 | checkeq(t, {"a", "b", "c"}) | ||
751 | |||
752 | t = m.match(m.C(1)^0/f/f, "abc") | ||
753 | checkeq(t, {{"a", "b", "c"}}) | ||
754 | |||
755 | t = m.match(m.P(1)^0/f/f, "abc") -- no capture | ||
756 | checkeq(t, {{"abc"}}) | ||
757 | |||
758 | t = m.match((m.P(1)^0/f * m.Cp())/f, "abc") | ||
759 | checkeq(t, {{"abc"}, 4}) | ||
760 | |||
761 | t = m.match((m.C(1)^0/f * m.Cp())/f, "abc") | ||
762 | checkeq(t, {{"a", "b", "c"}, 4}) | ||
763 | |||
764 | t = m.match((m.C(1)^0/f * m.Cp())/f, "xbc") | ||
765 | checkeq(t, {4}) | ||
766 | |||
767 | t = m.match(m.C(m.C(1)^0)/f, "abc") | ||
768 | checkeq(t, {"abc", "a", "b", "c"}) | ||
769 | |||
770 | g = function (...) return 1, ... end | ||
771 | t = {m.match(m.C(1)^0/g/g, "abc")} | ||
772 | checkeq(t, {1, 1, "a", "b", "c"}) | ||
773 | |||
774 | t = {m.match(m.Cc(nil,nil,4) * m.Cc(nil,3) * m.Cc(nil, nil) / g / g, "")} | ||
775 | t1 = {1,1,nil,nil,4,nil,3,nil,nil} | ||
776 | for i=1,10 do assert(t[i] == t1[i]) end | ||
777 | |||
778 | -- bug in 0.12.2: ktable with only nil could be eliminated when joining | ||
779 | -- with a pattern without ktable | ||
780 | assert((m.P"aaa" * m.Cc(nil)):match"aaa" == nil) | ||
781 | |||
782 | t = {m.match((m.C(1) / function (x) return x, x.."x" end)^0, "abc")} | ||
783 | checkeq(t, {"a", "ax", "b", "bx", "c", "cx"}) | ||
784 | |||
785 | t = m.match(m.Ct((m.C(1) / function (x,y) return y, x end * m.Cc(1))^0), "abc") | ||
786 | checkeq(t, {nil, "a", 1, nil, "b", 1, nil, "c", 1}) | ||
787 | |||
788 | -- tests for Query Replacements | ||
789 | |||
790 | assert(m.match(m.C(m.C(1)^0)/{abc = 10}, "abc") == 10) | ||
791 | assert(m.match(m.C(1)^0/{a = 10}, "abc") == 10) | ||
792 | assert(m.match(m.S("ba")^0/{ab = 40}, "abc") == 40) | ||
793 | t = m.match(m.Ct((m.S("ba")/{a = 40})^0), "abc") | ||
794 | checkeq(t, {40}) | ||
795 | |||
796 | assert(m.match(m.Cs((m.C(1)/{a=".", d=".."})^0), "abcdde") == ".bc....e") | ||
797 | assert(m.match(m.Cs((m.C(1)/{f="."})^0), "abcdde") == "abcdde") | ||
798 | assert(m.match(m.Cs((m.C(1)/{d="."})^0), "abcdde") == "abc..e") | ||
799 | assert(m.match(m.Cs((m.C(1)/{e="."})^0), "abcdde") == "abcdd.") | ||
800 | assert(m.match(m.Cs((m.C(1)/{e=".", f="+"})^0), "eefef") == "..+.+") | ||
801 | assert(m.match(m.Cs((m.C(1))^0), "abcdde") == "abcdde") | ||
802 | assert(m.match(m.Cs(m.C(m.C(1)^0)), "abcdde") == "abcdde") | ||
803 | assert(m.match(1 * m.Cs(m.P(1)^0), "abcdde") == "bcdde") | ||
804 | assert(m.match(m.Cs((m.C('0')/'x' + 1)^0), "abcdde") == "abcdde") | ||
805 | assert(m.match(m.Cs((m.C('0')/'x' + 1)^0), "0ab0b0") == "xabxbx") | ||
806 | assert(m.match(m.Cs((m.C('0')/'x' + m.P(1)/{b=3})^0), "b0a0b") == "3xax3") | ||
807 | assert(m.match(m.P(1)/'%0%0'/{aa = -3} * 'x', 'ax') == -3) | ||
808 | assert(m.match(m.C(1)/'%0%1'/{aa = 'z'}/{z = -3} * 'x', 'ax') == -3) | ||
809 | |||
810 | assert(m.match(m.Cs(m.Cc(0) * (m.P(1)/"")), "4321") == "0") | ||
811 | |||
812 | assert(m.match(m.Cs((m.P(1) / "%0")^0), "abcd") == "abcd") | ||
813 | assert(m.match(m.Cs((m.P(1) / "%0.%0")^0), "abcd") == "a.ab.bc.cd.d") | ||
814 | assert(m.match(m.Cs((m.P("a") / "%0.%0" + 1)^0), "abcad") == "a.abca.ad") | ||
815 | assert(m.match(m.C("a") / "%1%%%0", "a") == "a%a") | ||
816 | assert(m.match(m.Cs((m.P(1) / ".xx")^0), "abcd") == ".xx.xx.xx.xx") | ||
817 | assert(m.match(m.Cp() * m.P(3) * m.Cp()/"%2%1%1 - %0 ", "abcde") == | ||
818 | "411 - abc ") | ||
819 | |||
820 | assert(m.match(m.P(1)/"%0", "abc") == "a") | ||
821 | checkerr("invalid capture index", m.match, m.P(1)/"%1", "abc") | ||
822 | checkerr("invalid capture index", m.match, m.P(1)/"%9", "abc") | ||
823 | |||
824 | p = m.C(1) | ||
825 | p = p * p; p = p * p; p = p * p * m.C(1) / "%9 - %1" | ||
826 | assert(p:match("1234567890") == "9 - 1") | ||
827 | |||
828 | assert(m.match(m.Cc(print), "") == print) | ||
829 | |||
830 | -- too many captures (just ignore extra ones) | ||
831 | p = m.C(1)^0 / "%2-%9-%0-%9" | ||
832 | assert(p:match"01234567890123456789" == "1-8-01234567890123456789-8") | ||
833 | s = string.rep("12345678901234567890", 20) | ||
834 | assert(m.match(m.C(1)^0 / "%9-%1-%0-%3", s) == "9-1-" .. s .. "-3") | ||
835 | |||
836 | -- string captures with non-string subcaptures | ||
837 | p = m.Cc('alo') * m.C(1) / "%1 - %2 - %1" | ||
838 | assert(p:match'x' == 'alo - x - alo') | ||
839 | |||
840 | checkerr("invalid capture value (a boolean)", m.match, m.Cc(true) / "%1", "a") | ||
841 | |||
842 | -- long strings for string capture | ||
843 | l = 10000 | ||
844 | s = string.rep('a', l) .. string.rep('b', l) .. string.rep('c', l) | ||
845 | |||
846 | p = (m.C(m.P'a'^1) * m.C(m.P'b'^1) * m.C(m.P'c'^1)) / '%3%2%1' | ||
847 | |||
848 | assert(p:match(s) == string.rep('c', l) .. | ||
849 | string.rep('b', l) .. | ||
850 | string.rep('a', l)) | ||
851 | |||
852 | print"+" | ||
853 | |||
854 | -- accumulator capture | ||
855 | function f (x) return x + 1 end | ||
856 | assert(m.match(m.Cf(m.Cc(0) * m.C(1)^0, f), "alo alo") == 7) | ||
857 | |||
858 | t = {m.match(m.Cf(m.Cc(1,2,3), error), "")} | ||
859 | checkeq(t, {1}) | ||
860 | p = m.Cf(m.Ct(true) * m.Cg(m.C(m.R"az"^1) * "=" * m.C(m.R"az"^1) * ";")^0, | ||
861 | rawset) | ||
862 | t = p:match("a=b;c=du;xux=yuy;") | ||
863 | checkeq(t, {a="b", c="du", xux="yuy"}) | ||
864 | |||
865 | |||
866 | -- errors in accumulator capture | ||
867 | |||
868 | -- no initial capture | ||
869 | checkerr("no initial value", m.match, m.Cf(m.P(5), print), 'aaaaaa') | ||
870 | -- no initial capture (very long match forces fold to be an open-close pair) | ||
871 | checkerr("no initial value", m.match, m.Cf(m.P(500), print), | ||
872 | string.rep('a', 600)) | ||
873 | |||
874 | -- nested capture produces no initial value | ||
875 | checkerr("no initial value", m.match, m.Cf(m.P(1) / {}, print), "alo") | ||
876 | |||
877 | |||
878 | -- tests for loop checker | ||
879 | |||
880 | local function isnullable (p) | ||
881 | checkerr("may accept empty string", function (p) return p^0 end, m.P(p)) | ||
882 | end | ||
883 | |||
884 | isnullable(m.P("x")^-4) | ||
885 | assert(m.match(((m.P(0) + 1) * m.S"al")^0, "alo") == 3) | ||
886 | assert(m.match((("x" + #m.P(1))^-4 * m.S"al")^0, "alo") == 3) | ||
887 | isnullable("") | ||
888 | isnullable(m.P("x")^0) | ||
889 | isnullable(m.P("x")^-1) | ||
890 | isnullable(m.P("x") + 1 + 2 + m.P("a")^-1) | ||
891 | isnullable(-m.P("ab")) | ||
892 | isnullable(- -m.P("ab")) | ||
893 | isnullable(# #(m.P("ab") + "xy")) | ||
894 | isnullable(- #m.P("ab")^0) | ||
895 | isnullable(# -m.P("ab")^1) | ||
896 | isnullable(#m.V(3)) | ||
897 | isnullable(m.V(3) + m.V(1) + m.P('a')^-1) | ||
898 | isnullable({[1] = m.V(2) * m.V(3), [2] = m.V(3), [3] = m.P(0)}) | ||
899 | assert(m.match(m.P{[1] = m.V(2) * m.V(3), [2] = m.V(3), [3] = m.P(1)}^0, "abc") | ||
900 | == 3) | ||
901 | assert(m.match(m.P""^-3, "a") == 1) | ||
902 | |||
903 | local function find (p, s) | ||
904 | return m.match(basiclookfor(p), s) | ||
905 | end | ||
906 | |||
907 | |||
908 | local function badgrammar (g, expected) | ||
909 | local stat, msg = pcall(m.P, g) | ||
910 | assert(not stat) | ||
911 | if expected then assert(find(expected, msg)) end | ||
912 | end | ||
913 | |||
914 | badgrammar({[1] = m.V(1)}, "rule '1'") | ||
915 | badgrammar({[1] = m.V(2)}, "rule '2'") -- invalid non-terminal | ||
916 | badgrammar({[1] = m.V"x"}, "rule 'x'") -- invalid non-terminal | ||
917 | badgrammar({[1] = m.V{}}, "rule '(a table)'") -- invalid non-terminal | ||
918 | badgrammar({[1] = #m.P("a") * m.V(1)}, "rule '1'") -- left-recursive | ||
919 | badgrammar({[1] = -m.P("a") * m.V(1)}, "rule '1'") -- left-recursive | ||
920 | badgrammar({[1] = -1 * m.V(1)}, "rule '1'") -- left-recursive | ||
921 | badgrammar({[1] = -1 + m.V(1)}, "rule '1'") -- left-recursive | ||
922 | badgrammar({[1] = 1 * m.V(2), [2] = m.V(2)}, "rule '2'") -- left-recursive | ||
923 | badgrammar({[1] = 1 * m.V(2)^0, [2] = m.P(0)}, "rule '1'") -- inf. loop | ||
924 | badgrammar({ m.V(2), m.V(3)^0, m.P"" }, "rule '2'") -- inf. loop | ||
925 | badgrammar({ m.V(2) * m.V(3)^0, m.V(3)^0, m.P"" }, "rule '1'") -- inf. loop | ||
926 | badgrammar({"x", x = #(m.V(1) * 'a') }, "rule '1'") -- inf. loop | ||
927 | badgrammar({ -(m.V(1) * 'a') }, "rule '1'") -- inf. loop | ||
928 | badgrammar({"x", x = m.P'a'^-1 * m.V"x"}, "rule 'x'") -- left recursive | ||
929 | badgrammar({"x", x = m.P'a' * m.V"y"^1, y = #m.P(1)}, "rule 'x'") | ||
930 | |||
931 | assert(m.match({'a' * -m.V(1)}, "aaa") == 2) | ||
932 | assert(m.match({'a' * -m.V(1)}, "aaaa") == nil) | ||
933 | |||
934 | |||
935 | -- good vs. bad grammars | ||
936 | m.P{ ('a' * m.V(1))^-1 } | ||
937 | m.P{ -('a' * m.V(1)) } | ||
938 | m.P{ ('abc' * m.V(1))^-1 } | ||
939 | m.P{ -('abc' * m.V(1)) } | ||
940 | badgrammar{ #m.P('abc') * m.V(1) } | ||
941 | badgrammar{ -('a' + m.V(1)) } | ||
942 | m.P{ #('a' * m.V(1)) } | ||
943 | badgrammar{ #('a' + m.V(1)) } | ||
944 | m.P{ m.B{ m.P'abc' } * 'a' * m.V(1) } | ||
945 | badgrammar{ m.B{ m.P'abc' } * m.V(1) } | ||
946 | badgrammar{ ('a' + m.P'bcd')^-1 * m.V(1) } | ||
947 | |||
948 | |||
949 | -- simple tests for maximum sizes: | ||
950 | local p = m.P"a" | ||
951 | for i=1,14 do p = p * p end | ||
952 | |||
953 | p = {} | ||
954 | for i=1,100 do p[i] = m.P"a" end | ||
955 | p = m.P(p) | ||
956 | |||
957 | |||
958 | -- strange values for rule labels | ||
959 | |||
960 | p = m.P{ "print", | ||
961 | print = m.V(print), | ||
962 | [print] = m.V(_G), | ||
963 | [_G] = m.P"a", | ||
964 | } | ||
965 | |||
966 | assert(p:match("a")) | ||
967 | |||
968 | -- initial rule | ||
969 | g = {} | ||
970 | for i = 1, 10 do g["i"..i] = "a" * m.V("i"..i+1) end | ||
971 | g.i11 = m.P"" | ||
972 | for i = 1, 10 do | ||
973 | g[1] = "i"..i | ||
974 | local p = m.P(g) | ||
975 | assert(p:match("aaaaaaaaaaa") == 11 - i + 1) | ||
976 | end | ||
977 | |||
978 | print"+" | ||
979 | |||
980 | |||
981 | -- tests for back references | ||
982 | checkerr("back reference 'x' not found", m.match, m.Cb('x'), '') | ||
983 | checkerr("back reference 'b' not found", m.match, m.Cg(1, 'a') * m.Cb('b'), 'a') | ||
984 | |||
985 | p = m.Cg(m.C(1) * m.C(1), "k") * m.Ct(m.Cb("k")) | ||
986 | t = p:match("ab") | ||
987 | checkeq(t, {"a", "b"}) | ||
988 | |||
989 | p = m.P(true) | ||
990 | for i = 1, 10 do p = p * m.Cg(1, i) end | ||
991 | for i = 1, 10 do | ||
992 | local p = p * m.Cb(i) | ||
993 | assert(p:match('abcdefghij') == string.sub('abcdefghij', i, i)) | ||
994 | end | ||
995 | |||
996 | |||
997 | t = {} | ||
998 | function foo (p) t[#t + 1] = p; return p .. "x" end | ||
999 | |||
1000 | p = m.Cg(m.C(2) / foo, "x") * m.Cb"x" * | ||
1001 | m.Cg(m.Cb('x') / foo, "x") * m.Cb"x" * | ||
1002 | m.Cg(m.Cb('x') / foo, "x") * m.Cb"x" * | ||
1003 | m.Cg(m.Cb('x') / foo, "x") * m.Cb"x" | ||
1004 | x = {p:match'ab'} | ||
1005 | checkeq(x, {'abx', 'abxx', 'abxxx', 'abxxxx'}) | ||
1006 | checkeq(t, {'ab', | ||
1007 | 'ab', 'abx', | ||
1008 | 'ab', 'abx', 'abxx', | ||
1009 | 'ab', 'abx', 'abxx', 'abxxx'}) | ||
1010 | |||
1011 | |||
1012 | |||
1013 | -- tests for match-time captures | ||
1014 | |||
1015 | p = m.P'a' * (function (s, i) return (s:sub(i, i) == 'b') and i + 1 end) | ||
1016 | + 'acd' | ||
1017 | |||
1018 | assert(p:match('abc') == 3) | ||
1019 | assert(p:match('acd') == 4) | ||
1020 | |||
1021 | local function id (s, i, ...) | ||
1022 | return true, ... | ||
1023 | end | ||
1024 | |||
1025 | assert(m.Cmt(m.Cs((m.Cmt(m.S'abc' / { a = 'x', c = 'y' }, id) + | ||
1026 | m.R'09'^1 / string.char + | ||
1027 | m.P(1))^0), id):match"acb98+68c" == "xyb\98+\68y") | ||
1028 | |||
1029 | p = m.P{'S', | ||
1030 | S = m.V'atom' * space | ||
1031 | + m.Cmt(m.Ct("(" * space * (m.Cmt(m.V'S'^1, id) + m.P(true)) * ")" * space), id), | ||
1032 | atom = m.Cmt(m.C(m.R("AZ", "az", "09")^1), id) | ||
1033 | } | ||
1034 | x = p:match"(a g () ((b) c) (d (e)))" | ||
1035 | checkeq(x, {'a', 'g', {}, {{'b'}, 'c'}, {'d', {'e'}}}); | ||
1036 | |||
1037 | x = {(m.Cmt(1, id)^0):match(string.rep('a', 500))} | ||
1038 | assert(#x == 500) | ||
1039 | |||
1040 | local function id(s, i, x) | ||
1041 | if x == 'a' then return i, 1, 3, 7 | ||
1042 | else return nil, 2, 4, 6, 8 | ||
1043 | end | ||
1044 | end | ||
1045 | |||
1046 | p = ((m.P(id) * 1 + m.Cmt(2, id) * 1 + m.Cmt(1, id) * 1))^0 | ||
1047 | assert(table.concat{p:match('abababab')} == string.rep('137', 4)) | ||
1048 | |||
1049 | local function ref (s, i, x) | ||
1050 | return m.match(x, s, i - x:len()) | ||
1051 | end | ||
1052 | |||
1053 | assert(m.Cmt(m.P(1)^0, ref):match('alo') == 4) | ||
1054 | assert((m.P(1) * m.Cmt(m.P(1)^0, ref)):match('alo') == 4) | ||
1055 | assert(not (m.P(1) * m.Cmt(m.C(1)^0, ref)):match('alo')) | ||
1056 | |||
1057 | ref = function (s,i,x) return i == tonumber(x) and i, 'xuxu' end | ||
1058 | |||
1059 | assert(m.Cmt(1, ref):match'2') | ||
1060 | assert(not m.Cmt(1, ref):match'1') | ||
1061 | assert(m.Cmt(m.P(1)^0, ref):match'03') | ||
1062 | |||
1063 | function ref (s, i, a, b) | ||
1064 | if a == b then return i, a:upper() end | ||
1065 | end | ||
1066 | |||
1067 | p = m.Cmt(m.C(m.R"az"^1) * "-" * m.C(m.R"az"^1), ref) | ||
1068 | p = (any - p)^0 * p * any^0 * -1 | ||
1069 | |||
1070 | assert(p:match'abbbc-bc ddaa' == 'BC') | ||
1071 | |||
1072 | do -- match-time captures cannot be optimized away | ||
1073 | local touch = 0 | ||
1074 | f = m.P(function () touch = touch + 1; return true end) | ||
1075 | |||
1076 | local function check(n) n = n or 1; assert(touch == n); touch = 0 end | ||
1077 | |||
1078 | assert(m.match(f * false + 'b', 'a') == nil); check() | ||
1079 | assert(m.match(f * false + 'b', '') == nil); check() | ||
1080 | assert(m.match( (f * 'a')^0 * 'b', 'b') == 2); check() | ||
1081 | assert(m.match( (f * 'a')^0 * 'b', '') == nil); check() | ||
1082 | assert(m.match( (f * 'a')^-1 * 'b', 'b') == 2); check() | ||
1083 | assert(m.match( (f * 'a')^-1 * 'b', '') == nil); check() | ||
1084 | assert(m.match( ('b' + f * 'a')^-1 * 'b', '') == nil); check() | ||
1085 | assert(m.match( (m.P'b'^-1 * f * 'a')^-1 * 'b', '') == nil); check() | ||
1086 | assert(m.match( (-m.P(1) * m.P'b'^-1 * f * 'a')^-1 * 'b', '') == nil); | ||
1087 | check() | ||
1088 | assert(m.match( (f * 'a' + 'b')^-1 * 'b', '') == nil); check() | ||
1089 | assert(m.match(f * 'a' + f * 'b', 'b') == 2); check(2) | ||
1090 | assert(m.match(f * 'a' + f * 'b', 'a') == 2); check(1) | ||
1091 | assert(m.match(-f * 'a' + 'b', 'b') == 2); check(1) | ||
1092 | assert(m.match(-f * 'a' + 'b', '') == nil); check(1) | ||
1093 | end | ||
1094 | |||
1095 | c = '[' * m.Cg(m.P'='^0, "init") * '[' * | ||
1096 | { m.Cmt(']' * m.C(m.P'='^0) * ']' * m.Cb("init"), function (_, _, s1, s2) | ||
1097 | return s1 == s2 end) | ||
1098 | + 1 * m.V(1) } / 0 | ||
1099 | |||
1100 | assert(c:match'[==[]]====]]]]==]===[]' == 18) | ||
1101 | assert(c:match'[[]=]====]=]]]==]===[]' == 14) | ||
1102 | assert(not c:match'[[]=]====]=]=]==]===[]') | ||
1103 | |||
1104 | |||
1105 | -- old bug: optimization of concat with fail removed match-time capture | ||
1106 | p = m.Cmt(0, function (s) p = s end) * m.P(false) | ||
1107 | assert(not p:match('alo')) | ||
1108 | assert(p == 'alo') | ||
1109 | |||
1110 | |||
1111 | -- ensure that failed match-time captures are not kept on Lua stack | ||
1112 | do | ||
1113 | local t = {__mode = "kv"}; setmetatable(t,t) | ||
1114 | local c = 0 | ||
1115 | |||
1116 | local function foo (s,i) | ||
1117 | collectgarbage(); | ||
1118 | assert(next(t) == "__mode" and next(t, "__mode") == nil) | ||
1119 | local x = {} | ||
1120 | t[x] = true | ||
1121 | c = c + 1 | ||
1122 | return i, x | ||
1123 | end | ||
1124 | |||
1125 | local p = m.P{ m.Cmt(0, foo) * m.P(false) + m.P(1) * m.V(1) + m.P"" } | ||
1126 | p:match(string.rep('1', 10)) | ||
1127 | assert(c == 11) | ||
1128 | end | ||
1129 | |||
1130 | |||
1131 | -- Return a match-time capture that returns 'n' captures | ||
1132 | local function manyCmt (n) | ||
1133 | return m.Cmt("a", function () | ||
1134 | local a = {}; for i = 1, n do a[i] = n - i end | ||
1135 | return true, unpack(a) | ||
1136 | end) | ||
1137 | end | ||
1138 | |||
1139 | -- bug in 1.0: failed match-time capture that used previous match-time results | ||
1140 | do | ||
1141 | local x | ||
1142 | local function aux (...) x = #{...}; return false end | ||
1143 | local res = {m.match(m.Cmt(manyCmt(20), aux) + manyCmt(10), "a")} | ||
1144 | assert(#res == 10 and res[1] == 9 and res[10] == 0) | ||
1145 | end | ||
1146 | |||
1147 | |||
1148 | -- bug in 1.0: problems with match-time captures returning too many captures | ||
1149 | do | ||
1150 | local lim = 2^11 - 10 | ||
1151 | local res = {m.match(manyCmt(lim), "a")} | ||
1152 | assert(#res == lim and res[1] == lim - 1 and res[lim] == 0) | ||
1153 | checkerr("too many", m.match, manyCmt(2^15), "a") | ||
1154 | end | ||
1155 | |||
1156 | p = (m.P(function () return true, "a" end) * 'a' | ||
1157 | + m.P(function (s, i) return i, "aa", 20 end) * 'b' | ||
1158 | + m.P(function (s,i) if i <= #s then return i, "aaa" end end) * 1)^0 | ||
1159 | |||
1160 | t = {p:match('abacc')} | ||
1161 | checkeq(t, {'a', 'aa', 20, 'a', 'aaa', 'aaa'}) | ||
1162 | |||
1163 | |||
1164 | ------------------------------------------------------------------- | ||
1165 | -- Tests for 're' module | ||
1166 | ------------------------------------------------------------------- | ||
1167 | |||
1168 | local re = require "re" | ||
1169 | |||
1170 | local match, compile = re.match, re.compile | ||
1171 | |||
1172 | |||
1173 | |||
1174 | assert(match("a", ".") == 2) | ||
1175 | assert(match("a", "''") == 1) | ||
1176 | assert(match("", " ! . ") == 1) | ||
1177 | assert(not match("a", " ! . ")) | ||
1178 | assert(match("abcde", " ( . . ) * ") == 5) | ||
1179 | assert(match("abbcde", " [a-c] +") == 5) | ||
1180 | assert(match("0abbc1de", "'0' [a-c]+ '1'") == 7) | ||
1181 | assert(match("0zz1dda", "'0' [^a-c]+ 'a'") == 8) | ||
1182 | assert(match("abbc--", " [a-c] + +") == 5) | ||
1183 | assert(match("abbc--", " [ac-] +") == 2) | ||
1184 | assert(match("abbc--", " [-acb] + ") == 7) | ||
1185 | assert(not match("abbcde", " [b-z] + ")) | ||
1186 | assert(match("abb\"de", '"abb"["]"de"') == 7) | ||
1187 | assert(match("abceeef", "'ac' ? 'ab' * 'c' { 'e' * } / 'abceeef' ") == "eee") | ||
1188 | assert(match("abceeef", "'ac'? 'ab'* 'c' { 'f'+ } / 'abceeef' ") == 8) | ||
1189 | |||
1190 | assert(re.match("aaand", "[a]^2") == 3) | ||
1191 | |||
1192 | local t = {match("abceefe", "( ( & 'e' {} ) ? . ) * ")} | ||
1193 | checkeq(t, {4, 5, 7}) | ||
1194 | local t = {match("abceefe", "((&&'e' {})? .)*")} | ||
1195 | checkeq(t, {4, 5, 7}) | ||
1196 | local t = {match("abceefe", "( ( ! ! 'e' {} ) ? . ) *")} | ||
1197 | checkeq(t, {4, 5, 7}) | ||
1198 | local t = {match("abceefe", "(( & ! & ! 'e' {})? .)*")} | ||
1199 | checkeq(t, {4, 5, 7}) | ||
1200 | |||
1201 | assert(match("cccx" , "'ab'? ('ccc' / ('cde' / 'cd'*)? / 'ccc') 'x'+") == 5) | ||
1202 | assert(match("cdx" , "'ab'? ('ccc' / ('cde' / 'cd'*)? / 'ccc') 'x'+") == 4) | ||
1203 | assert(match("abcdcdx" , "'ab'? ('ccc' / ('cde' / 'cd'*)? / 'ccc') 'x'+") == 8) | ||
1204 | |||
1205 | assert(match("abc", "a <- (. a)?") == 4) | ||
1206 | b = "balanced <- '(' ([^()] / balanced)* ')'" | ||
1207 | assert(match("(abc)", b)) | ||
1208 | assert(match("(a(b)((c) (d)))", b)) | ||
1209 | assert(not match("(a(b ((c) (d)))", b)) | ||
1210 | |||
1211 | b = compile[[ balanced <- "(" ([^()] / balanced)* ")" ]] | ||
1212 | assert(b == m.P(b)) | ||
1213 | assert(b:match"((((a))(b)))") | ||
1214 | |||
1215 | local g = [[ | ||
1216 | S <- "0" B / "1" A / "" -- balanced strings | ||
1217 | A <- "0" S / "1" A A -- one more 0 | ||
1218 | B <- "1" S / "0" B B -- one more 1 | ||
1219 | ]] | ||
1220 | assert(match("00011011", g) == 9) | ||
1221 | |||
1222 | local g = [[ | ||
1223 | S <- ("0" B / "1" A)* | ||
1224 | A <- "0" / "1" A A | ||
1225 | B <- "1" / "0" B B | ||
1226 | ]] | ||
1227 | assert(match("00011011", g) == 9) | ||
1228 | assert(match("000110110", g) == 9) | ||
1229 | assert(match("011110110", g) == 3) | ||
1230 | assert(match("000110010", g) == 1) | ||
1231 | |||
-- Repetition operator '^' on a run of 'a's: ^n consumes exactly n items,
-- ^+n requires at least n (and here consumes the whole run), ^-n consumes
-- at most n. The run is shorter than 30 characters, which the ^+30 failure
-- relies on.
1232 | s = "aaaaaaaaaaaaaaaaaaaaaaaa" | ||
1233 | assert(match(s, "'a'^3") == 4) | ||
1234 | assert(match(s, "'a'^0") == 1) | ||
1235 | assert(match(s, "'a'^+3") == s:len() + 1) | ||
1236 | assert(not match(s, "'a'^+30")) | ||
1237 | assert(match(s, "'a'^-30") == s:len() + 1) | ||
1238 | assert(match(s, "'a'^-5") == 6) | ||
-- Same three forms checked for every count from 1 up to the length of s.
1239 | for i = 1, s:len() do | ||
1240 | assert(match(s, string.format("'a'^+%d", i)) >= i + 1) | ||
1241 | assert(match(s, string.format("'a'^-%d", i)) <= i + 1) | ||
1242 | assert(match(s, string.format("'a'^%d", i)) == i + 1) | ||
1243 | end | ||
-- '^3' followed by '+': groups of three digits, repeated; 18 of the 20
-- digits are consumed, hence the end position 19.
1244 | assert(match("01234567890123456789", "[0-9]^3+") == 19) | ||
1245 | |||
1246 | |||
-- String captures ('-> "..."') and table captures ({| |}). %1/%2 refer to the
-- inner captures and %0 to the whole match, as the expected values show.
1247 | assert(match("01234567890123456789", "({....}{...}) -> '%2%1'") == "4560123") | ||
1248 | t = match("0123456789", "{| {.}* |}") | ||
1249 | checkeq(t, {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}) | ||
1250 | assert(match("012345", "{| (..) -> '%0%0' |}")[1] == "0101") | ||
1251 | |||
-- Numbered captures ('-> n') select the n-th value. The named group {:x: :}
-- does not count as a value (so 3 selects "d" in the second case), and with
-- '-> 0' no value is produced, so the match returns the end position 6.
1252 | assert(match("abcdef", "( {.} {.} {.} {.} {.} ) -> 3") == "c") | ||
1253 | assert(match("abcdef", "( {:x: . :} {.} {.} {.} {.} ) -> 3") == "d") | ||
1254 | assert(match("abcdef", "( {:x: . :} {.} {.} {.} {.} ) -> 0") == 6) | ||
1255 | |||
-- Back reference: group x holds the second of three captured characters
-- ("b"), and =x must then match that same text at the fourth character.
1256 | assert(not match("abcdef", "{:x: ({.} {.} {.}) -> 2 :} =x")) | ||
1257 | assert(match("abcbef", "{:x: ({.} {.} {.}) -> 2 :} =x")) | ||
1258 | |||
-- Character classes in re: a ']' placed first in the class and a '-' at
-- either end of the class are literal characters; only 'a-z' between two
-- characters denotes a range.
-- NOTE(review): `eqcharset`, `compile`, `any` and `m` come from outside this chunk.
1259 | eqcharset(compile"[]]", "]") | ||
1260 | eqcharset(compile"[][]", m.S"[]") | ||
1261 | eqcharset(compile"[]-]", m.S"-]") | ||
1262 | eqcharset(compile"[-]", m.S"-") | ||
1263 | eqcharset(compile"[az-]", m.S"a-z") | ||
1264 | eqcharset(compile"[-az]", m.S"a-z") | ||
1265 | eqcharset(compile"[a-z]", m.R"az") | ||
1266 | eqcharset(compile"[]['\"]", m.S[[]['"]]) | ||
1267 | |||
-- The same classes complemented with '^'; each must equal `any` minus the
-- corresponding set.
1268 | eqcharset(compile"[^]]", any - "]") | ||
1269 | eqcharset(compile"[^][]", any - m.S"[]") | ||
1270 | eqcharset(compile"[^]-]", any - m.S"-]") | ||
-- NOTE(review): the next line repeats the previous check verbatim.
1271 | eqcharset(compile"[^]-]", any - m.S"-]") | ||
1272 | eqcharset(compile"[^-]", any - m.S"-") | ||
1273 | eqcharset(compile"[^az-]", any - m.S"a-z") | ||
1274 | eqcharset(compile"[^-az]", any - m.S"a-z") | ||
1275 | eqcharset(compile"[^a-z]", any - m.R"az") | ||
1276 | eqcharset(compile"[^]['\"]", any - m.S[[]['"]]) | ||
1277 | |||
1278 | -- tests for comments in 're' | ||
-- The grammar text below ends each rule with an re comment ('--' up to end
-- of line), including one that runs up to the closing ']]' of the Lua string.
1279 | e = compile[[ | ||
1280 | A <- _B -- \t \n %nl .<> <- -> -- | ||
1281 | _B <- 'x' --]] | ||
1282 | assert(e:match'xy' == 2) | ||
1283 | |||
1284 | -- tests for 're' with pre-definitions | ||
-- `defs` maps names to patterns; '%name' inside an re pattern refers to the
-- entry of that name (a lone underscore is accepted as a name too).
1285 | defs = {digits = m.R"09", letters = m.R"az", _=m.P"__"} | ||
1286 | e = compile("%letters (%letters / %digits)*", defs) | ||
1287 | assert(e:match"x123" == 5) | ||
1288 | e = compile("%_", defs) | ||
1289 | assert(e:match"__" == 3) | ||
1290 | |||
-- Pre-definitions used from inside a grammar. This pattern is only compiled
-- here; `e` is overwritten below without being matched.
1291 | e = compile([[ | ||
1292 | S <- A+ | ||
1293 | A <- %letters+ B | ||
1294 | B <- %digits+ | ||
1295 | ]], defs) | ||
1296 | |||
-- Any table can serve as the definitions table: here the `math` library
-- supplies `sin` as the function applied to the captured text.
1297 | e = compile("{[0-9]+'.'?[0-9]*} -> sin", math) | ||
1298 | assert(e:match("2.34") == math.sin(2.34)) | ||
1299 | |||
1300 | |||
-- Equality predicate taking four arguments: the first two are ignored and
-- the result tells whether the third and fourth compare equal with '=='.
function eq (_, _, lhs, rhs)
  local same = (lhs == rhs)
  return same
end
1302 | |||
-- Lua-style long strings: the run of '=' captured in the named group `init`
-- must reappear (back reference =init) between the two closing brackets.
1303 | c = re.compile([[ | ||
1304 | longstring <- '[' {:init: '='* :} '[' close | ||
1305 | close <- ']' =init ']' / . close | ||
1306 | ]]) | ||
1307 | |||
1308 | assert(c:match'[==[]]===]]]]==]===[]' == 17) | ||
1309 | assert(c:match'[[]=]====]=]]]==]===[]' == 14) | ||
1310 | assert(not c:match'[[]=]====]=]=]==]===[]') | ||
1311 | |||
-- The same language written with a repetition instead of recursion, and
-- anchored at the end of the subject by the final '!.'.
1312 | c = re.compile" '[' {:init: '='* :} '[' (!(']' =init ']') .)* ']' =init ']' !. " | ||
1313 | |||
1314 | assert(c:match'[==[]]===]]]]==]') | ||
1315 | assert(c:match'[[]=]====]=][]==]===[]]') | ||
1316 | assert(not c:match'[[]=]====]=]=]==]===[]') | ||
1317 | |||
-- re.find with back references; the optional third argument is the position
-- where the search starts.
1318 | assert(re.find("hi alalo", "{:x:..:} =x") == 4) | ||
1319 | assert(re.find("hi alalo", "{:x:..:} =x", 4) == 4) | ||
1320 | assert(not re.find("hi alalo", "{:x:..:} =x", 5)) | ||
1321 | assert(re.find("hi alalo", "{'al'}", 5) == 6) | ||
1322 | assert(re.find("hi aloalolo", "{:x:..:} =x") == 8) | ||
1323 | assert(re.find("alo alohi x x", "{:word:%w+:}%W*(=word)!%w") == 11) | ||
1324 | |||
1325 | -- re.find discards any captures | ||
-- Only the start and end positions come back; the third result is nil.
1326 | local a,b,c = re.find("alo", "{.}{'o'}") | ||
1327 | assert(a == 2 and b == 3 and c == nil) | ||
1328 | |||
-- Local helper that takes over the name `match` from here on: it returns
-- the substring of `s` located by re.find for pattern `p`, or nothing at
-- all when re.find reports no match.
local function match (s, p)
  local first, last = re.find(s, p)
  if not first then
    return
  end
  return s:sub(first, last)
end
-- Checks through the local `match` helper, which yields the matched
-- substring found by re.find (nil when nothing is found).
1333 | assert(match("alo alo", '[a-z]+') == "alo") | ||
1334 | assert(match("alo alo", '{:x: [a-z]+ :} =x') == nil) | ||
1335 | assert(match("alo alo", "{:x: [a-z]+ :} ' ' =x") == "alo alo") | ||
1336 | |||
-- re.gsub: the replacement may be a string or a function (string.upper here).
1337 | assert(re.gsub("alo alo", "[abc]", "x") == "xlo xlo") | ||
1338 | assert(re.gsub("alo alo", "%w+", ".") == ". .") | ||
1339 | assert(re.gsub("hi, how are you", "[aeiou]", string.upper) == | ||
1340 | "hI, hOw ArE yOU") | ||
1341 | |||
-- re.gsub with a pre-compiled pattern. In the replacement string "%2" is the
-- second capture of the pattern ({ =i }) and "%0" is the whole match, so
-- replacing with "%0" must leave the subject unchanged.
1342 | s = 'hi [[a comment[=]=] ending here]] and [=[another]]=]]' | ||
1343 | c = re.compile" '[' {:i: '='* :} '[' (!(']' =i ']') .)* ']' { =i } ']' " | ||
1344 | assert(re.gsub(s, c, "%2") == 'hi and =]') | ||
1345 | assert(re.gsub(s, c, "%0") == s) | ||
1346 | assert(re.gsub('[=[hi]=]', c, "%2") == '=') | ||
1347 | |||
-- '!.' only succeeds at the end of the subject, so re.find reports the
-- position one past the last character (1 for the empty string).
1348 | assert(re.find("", "!.") == 1) | ||
1349 | assert(re.find("alo", "!.") == 4) | ||
1350 | |||
-- Callback used with the re match-time operator ('=> addtag'): stores `tag`
-- in the table `t` under the key "tag", then returns the position `i`
-- unchanged together with `t`. The subject `s` is not used.
function addtag (s, i, t, tag)
  t["tag"] = tag
  return i, t
end
1352 | |||
-- Tag-structured document. The closing tag is optional ('end?'); when it is
-- present it must repeat the opening tag's name (back reference =tag) and
-- that name is handed to addtag, which records it in the block's table.
1353 | c = re.compile([[ | ||
1354 | doc <- block !. | ||
1355 | block <- (start {| (block / { [^<]+ })* |} end?) => addtag | ||
1356 | start <- '<' {:tag: [a-z]+ :} '>' | ||
1357 | end <- '</' { =tag } '>' | ||
1358 | ]], {addtag = addtag}) | ||
1359 | |||
-- The second <b> is never closed, so its table is expected to carry no
-- `tag` field.
1360 | x = c:match[[ | ||
1361 | <x>hi<b>hello</b>but<b>totheend</x>]] | ||
1362 | checkeq(x, {tag='x', 'hi', {tag = 'b', 'hello'}, 'but', | ||
1363 | {'totheend'}}) | ||
1364 | |||
1365 | |||
1366 | -- test for folding captures | ||
-- '~> add' folds the converted numbers with `add`, giving their sum.
1367 | c = re.compile([[ | ||
1368 | S <- (number (%s+ number)*) ~> add | ||
1369 | number <- %d+ -> tonumber | ||
1370 | ]], {tonumber = tonumber, add = function (a,b) return a + b end}) | ||
1371 | assert(c:match("3 401 50") == 3 + 401 + 50) | ||
1372 | |||
1373 | -- tests for look-ahead captures | ||
-- Captures made inside the '&' and '!' predicates are dropped: only the
-- three captures written outside a predicate appear in the result.
1374 | x = {re.match("alo", "&(&{.}) !{'b'} {&(...)} &{..} {...} {!.}")} | ||
1375 | checkeq(x, {"", "alo", ""}) | ||
1376 | |||
1377 | assert(re.match("aloalo", | ||
1378 | "{~ (((&'al' {.}) -> 'A%1' / (&%l {.}) -> '%1%1') / .)* ~}") | ||
1379 | == "AallooAalloo") | ||
1380 | |||
1381 | -- bug in 0.9 (and older versions), due to captures in look-aheads | ||
-- The '*' replacement sits inside the look-ahead and must not show up in
-- the substitution result.
1382 | x = re.compile[[ {~ (&(. ([a-z]* -> '*')) ([a-z]+ -> '+') ' '*)* ~} ]] | ||
1383 | assert(x:match"alo alo" == "+ +") | ||
1384 | |||
1385 | -- valid capture in look-ahead (used inside the look-ahead itself) | ||
1386 | x = re.compile[[ | ||
1387 | S <- &({:two: .. :} . =two) {[a-z]+} / . S | ||
1388 | ]] | ||
1389 | assert(x:match("hello aloaLo aloalo xuxu") == "aloalo") | ||
1390 | |||
1391 | |||
-- Indentation-sensitive parsing. Each block stores its own indentation in
-- the named group `ident`; a following line with exactly that indentation
-- joins the block, while a line indented further opens a nested block.
1392 | p = re.compile[[ | ||
1393 | block <- {| {:ident:space*:} line | ||
1394 | ((=ident !space line) / &(=ident space) block)* |} | ||
1395 | line <- {[^%nl]*} %nl | ||
1396 | space <- '_' -- should be ' ', but '_' is simpler for editors | ||
1397 | ]] | ||
1398 | |||
1399 | t= p:match[[ | ||
1400 | 1 | ||
1401 | __1.1 | ||
1402 | __1.2 | ||
1403 | ____1.2.1 | ||
1404 | ____ | ||
1405 | 2 | ||
1406 | __2.1 | ||
1407 | ]] | ||
-- Expected result: nested tables mirroring the indentation levels, each one
-- carrying its indentation string in the field `ident`.
1408 | checkeq(t, {"1", {"1.1", "1.2", {"1.2.1", "", ident = "____"}, ident = "__"}, | ||
1409 | "2", {"2.1", ident = "__"}, ident = ""}) | ||
1410 | |||
1411 | |||
1412 | -- nested grammars | ||
-- Rules `a` and `b` each contain a grammar of their own, and both inner
-- grammars define a rule named `x` without clashing.
1413 | p = re.compile[[ | ||
1414 | s <- a b !. | ||
1415 | b <- ( x <- ('b' x)? ) | ||
1416 | a <- ( x <- 'a' x? ) | ||
1417 | ]] | ||
1418 | |||
1419 | assert(p:match'aaabbb') | ||
1420 | assert(p:match'aaa') | ||
1421 | assert(not p:match'bbb') | ||
1422 | assert(not p:match'aaabbba') | ||
1423 | |||
1424 | -- testing groups | ||
1425 | t = {re.match("abc", "{:S <- {:.:} {S} / '':}")} | ||
1426 | checkeq(t, {"a", "bc", "b", "c", "c", ""}) | ||
1427 | |||
-- Inside a table capture, a named group ({:name: ... :}) becomes the field
-- `name` holding only the first value of the group, while an anonymous
-- group ({: ... :}) contributes all of its values positionally.
1428 | t = re.match("1234", "{| {:a:.:} {:b:.:} {:c:.{.}:} |}") | ||
1429 | checkeq(t, {a="1", b="2", c="4"}) | ||
1430 | t = re.match("1234", "{|{:a:.:} {:b:{.}{.}:} {:c:{.}:}|}") | ||
1431 | checkeq(t, {a="1", b="2", c="4"}) | ||
1432 | t = re.match("12345", "{| {:.:} {:b:{.}{.}:} {:{.}{.}:} |}") | ||
1433 | checkeq(t, {"1", b="2", "4", "5"}) | ||
1434 | t = re.match("12345", "{| {:.:} {:{:b:{.}{.}:}:} {:{.}{.}:} |}") | ||
1435 | checkeq(t, {"1", "23", "4", "5"}) | ||
1436 | t = re.match("12345", "{| {:.:} {{:b:{.}{.}:}} {:{.}{.}:} |}") | ||
1437 | checkeq(t, {"1", "23", "4", "5"}) | ||
1438 | |||
1439 | |||
1440 | -- testing pre-defined names | ||
-- Switch to the "C" locale and fail right away if that is not possible.
-- NOTE(review): presumably done so that re's classes and string.gsub's
-- classes agree in the checks that follow -- confirm.
1441 | assert(os.setlocale("C") == "C") | ||
1442 | |||
-- Asserts that the re pattern `p1` and the Lua character class `p2` select
-- the same characters: cs2str(re.compile(p1)) must equal `allchar` after
-- string.gsub has removed every character outside [p2].
-- NOTE(review): `cs2str` and `allchar` are defined outside this chunk;
-- presumably they give the characters of a charset pattern as a string and
-- the string of all characters, respectively -- confirm.
function eqlpeggsub (p1, p2)
  local fromre = cs2str(re.compile(p1))
  local outside = "[^" .. p2 .. "]"
  -- parentheses keep only the first result of gsub (the filtered string)
  local fromlua = (string.gsub(allchar, outside, ""))
  assert(fromre == fromlua)
end
1449 | |||
1450 | |||
-- Every predefined re class must select the same characters as the Lua
-- character class of the same name.
1451 | eqlpeggsub("%w", "%w") | ||
1452 | eqlpeggsub("%a", "%a") | ||
1453 | eqlpeggsub("%l", "%l") | ||
1454 | eqlpeggsub("%u", "%u") | ||
1455 | eqlpeggsub("%p", "%p") | ||
1456 | eqlpeggsub("%d", "%d") | ||
1457 | eqlpeggsub("%x", "%x") | ||
1458 | eqlpeggsub("%s", "%s") | ||
1459 | eqlpeggsub("%c", "%c") | ||
1460 | |||
-- Upper-case class names.
1461 | eqlpeggsub("%W", "%W") | ||
1462 | eqlpeggsub("%A", "%A") | ||
1463 | eqlpeggsub("%L", "%L") | ||
1464 | eqlpeggsub("%U", "%U") | ||
1465 | eqlpeggsub("%P", "%P") | ||
1466 | eqlpeggsub("%D", "%D") | ||
1467 | eqlpeggsub("%X", "%X") | ||
1468 | eqlpeggsub("%S", "%S") | ||
1469 | eqlpeggsub("%C", "%C") | ||
1470 | |||
-- Classes used inside a bracketed set, alone, mixed with a literal
-- character, complemented, and combined.
1471 | eqlpeggsub("[%w]", "%w") | ||
1472 | eqlpeggsub("[_%w]", "_%w") | ||
1473 | eqlpeggsub("[^%w]", "%W") | ||
1474 | eqlpeggsub("[%W%S]", "%W%S") | ||
1475 | |||
-- NOTE(review): per the re documentation this refreshes the predefined
-- classes for the current locale; nothing after this call checks its effect.
1476 | re.updatelocale() | ||
1477 | |||
1478 | |||
1479 | -- testing nested substitutions x string captures | ||
1480 | |||
-- Macro expander: each argument is itself a substitution capture, so inner
-- macros are expanded before the outer string capture (%1, %2) uses them.
1481 | p = re.compile[[ | ||
1482 | text <- {~ item* ~} | ||
1483 | item <- macro / [^()] / '(' item* ')' | ||
1484 | arg <- ' '* {~ (!',' item)* ~} | ||
1485 | args <- '(' arg (',' arg)* ')' | ||
1486 | macro <- ('apply' args) -> '%1(%2)' | ||
1487 | / ('add' args) -> '%1 + %2' | ||
1488 | / ('mul' args) -> '%1 * %2' | ||
1489 | ]] | ||
1490 | |||
1491 | assert(p:match"add(mul(a,b), apply(f,x))" == "a * b + f(x)") | ||
1492 | |||
-- String reversal by recursion: each step emits the result for the rest of
-- the subject (%2) followed by the current character (%1).
1493 | rev = re.compile[[ R <- (!.) -> '' / ({.} R) -> '%2%1']] | ||
1494 | |||
1495 | assert(rev:match"0123456789" == "9876543210") | ||
1496 | |||
1497 | |||
1498 | -- testing error messages in re | ||
1499 | |||
-- Hands re.compile and the pattern `p` to checkerr together with the
-- expected message fragment `err`.
-- NOTE(review): checkerr is defined outside this chunk; presumably it
-- asserts that the call fails with an error message matching `err`.
local function errmsg (p, err)
  local compiler = re.compile
  checkerr(err, compiler, p)
end
1503 | |||
-- Each call pairs an invalid re pattern with a fragment expected in the
-- error raised when compiling it.
1504 | errmsg('aaaa', "rule 'aaaa'") | ||
1505 | errmsg('a', 'outside') | ||
1506 | errmsg('b <- a', 'undefined') | ||
1507 | errmsg("x <- 'a' x <- 'b'", 'already defined') | ||
1508 | errmsg("'a' -", "near '-'") | ||
1509 | |||
1510 | |||
-- Printed once every assertion in the script has passed.
1511 | print"OK" | ||
1512 | |||
1513 | |||