aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRoberto Ierusalimschy <roberto@inf.puc-rio.br>2023-06-06 17:50:31 -0300
committerRoberto Ierusalimschy <roberto@inf.puc-rio.br>2023-06-06 17:50:31 -0300
commite31e13f59ef1a4df1698b15ff1fe0198553cc3c2 (patch)
treec16ea4f41bdd378734afd4e475d7d7973a5a2b5d
parent44fab2a44d06a956c3121ceba2b39ca7b00dc428 (diff)
downloadlpeg-e31e13f59ef1a4df1698b15ff1fe0198553cc3c2.tar.gz
lpeg-e31e13f59ef1a4df1698b15ff1fe0198553cc3c2.tar.bz2
lpeg-e31e13f59ef1a4df1698b15ff1fe0198553cc3c2.zip
First implementation for the accumulator capture
-rw-r--r--HISTORY3
-rw-r--r--lpcap.c24
-rw-r--r--lpcap.h4
-rw-r--r--lpeg.html89
-rw-r--r--lpprint.c4
-rw-r--r--lptree.c10
-rw-r--r--makefile2
-rwxr-xr-xtest.lua17
8 files changed, 131 insertions, 22 deletions
diff --git a/HISTORY b/HISTORY
index 7fa72e6..a250709 100644
--- a/HISTORY
+++ b/HISTORY
@@ -2,9 +2,12 @@ HISTORY for LPeg 1.1.0
2 2
3* Changes from version 1.0.2 to 1.1.0 3* Changes from version 1.0.2 to 1.1.0
4 --------------------------------- 4 ---------------------------------
5 + accumulator capture
5 + UTF-8 ranges 6 + UTF-8 ranges
6 + Larger limit for number of rules in a grammar 7 + Larger limit for number of rules in a grammar
8 + Larger limit for number of captures in a match
7 + bug fixes 9 + bug fixes
10 + other small improvements
8 11
9* Changes from version 1.0.1 to 1.0.2 12* Changes from version 1.0.1 to 1.0.2
10 --------------------------------- 13 ---------------------------------
diff --git a/lpcap.c b/lpcap.c
index ec17d23..4750f01 100644
--- a/lpcap.c
+++ b/lpcap.c
@@ -231,6 +231,22 @@ static int functioncap (CapState *cs) {
231 231
232 232
233/* 233/*
234** Accumulator capture
235*/
236static int accumulatorcap (CapState *cs) {
237 lua_State *L = cs->L;
238 int n;
239 if (lua_gettop(L) < cs->firstcap)
240 luaL_error(L, "no previous value for accumulator capture");
241 pushluaval(cs); /* push function */
242 lua_insert(L, -2); /* previous value becomes first argument */
243 n = pushnestedvalues(cs, 0); /* push nested captures */
244 lua_call(L, n + 1, 1); /* call function */
245 return 0; /* did not add any extra value */
246}
247
248
249/*
234** Select capture 250** Select capture
235*/ 251*/
236static int numcap (CapState *cs) { 252static int numcap (CapState *cs) {
@@ -422,13 +438,16 @@ static int addonestring (luaL_Buffer *b, CapState *cs, const char *what) {
422 case Csubst: 438 case Csubst:
423 substcap(b, cs); /* add capture directly to buffer */ 439 substcap(b, cs); /* add capture directly to buffer */
424 return 1; 440 return 1;
441 case Cacc: /* accumulator capture? */
442 return luaL_error(cs->L, "accumulator capture inside substitution capture");
425 default: { 443 default: {
426 lua_State *L = cs->L; 444 lua_State *L = cs->L;
427 int n = pushcapture(cs); 445 int n = pushcapture(cs);
428 if (n > 0) { 446 if (n > 0) {
429 if (n > 1) lua_pop(L, n - 1); /* only one result */ 447 if (n > 1) lua_pop(L, n - 1); /* only one result */
430 if (!lua_isstring(L, -1)) 448 if (!lua_isstring(L, -1))
431 luaL_error(L, "invalid %s value (a %s)", what, luaL_typename(L, -1)); 449 return luaL_error(L, "invalid %s value (a %s)",
450 what, luaL_typename(L, -1));
432 luaL_addvalue(b); 451 luaL_addvalue(b);
433 } 452 }
434 return n; 453 return n;
@@ -512,6 +531,7 @@ static int pushcapture (CapState *cs) {
512 case Cbackref: res = backrefcap(cs); break; 531 case Cbackref: res = backrefcap(cs); break;
513 case Ctable: res = tablecap(cs); break; 532 case Ctable: res = tablecap(cs); break;
514 case Cfunction: res = functioncap(cs); break; 533 case Cfunction: res = functioncap(cs); break;
534 case Cacc: res = accumulatorcap(cs); break;
515 case Cnum: res = numcap(cs); break; 535 case Cnum: res = numcap(cs); break;
516 case Cquery: res = querycap(cs); break; 536 case Cquery: res = querycap(cs); break;
517 case Cfold: res = foldcap(cs); break; 537 case Cfold: res = foldcap(cs); break;
@@ -537,9 +557,11 @@ int getcaptures (lua_State *L, const char *s, const char *r, int ptop) {
537 CapState cs; 557 CapState cs;
538 cs.ocap = cs.cap = capture; cs.L = L; cs.reclevel = 0; 558 cs.ocap = cs.cap = capture; cs.L = L; cs.reclevel = 0;
539 cs.s = s; cs.valuecached = 0; cs.ptop = ptop; 559 cs.s = s; cs.valuecached = 0; cs.ptop = ptop;
560 cs.firstcap = lua_gettop(L) + 1; /* where first value (if any) will go */
540 do { /* collect their values */ 561 do { /* collect their values */
541 n += pushcapture(&cs); 562 n += pushcapture(&cs);
542 } while (!isclosecap(cs.cap)); 563 } while (!isclosecap(cs.cap));
564 assert(lua_gettop(L) - cs.firstcap == n - 1);
543 } 565 }
544 if (n == 0) { /* no capture values? */ 566 if (n == 0) { /* no capture values? */
545 lua_pushinteger(L, r - s + 1); /* return only end position */ 567 lua_pushinteger(L, r - s + 1); /* return only end position */
diff --git a/lpcap.h b/lpcap.h
index 10539a0..e72d913 100644
--- a/lpcap.h
+++ b/lpcap.h
@@ -16,6 +16,7 @@ typedef enum CapKind {
16 Csimple, /* next node is pattern */ 16 Csimple, /* next node is pattern */
17 Ctable, /* next node is pattern */ 17 Ctable, /* next node is pattern */
18 Cfunction, /* ktable[key] is function; next node is pattern */ 18 Cfunction, /* ktable[key] is function; next node is pattern */
19 Cacc, /* ktable[key] is function; next node is pattern */
19 Cquery, /* ktable[key] is table; next node is pattern */ 20 Cquery, /* ktable[key] is table; next node is pattern */
20 Cstring, /* ktable[key] is string; next node is pattern */ 21 Cstring, /* ktable[key] is string; next node is pattern */
21 Cnum, /* numbered capture; 'key' is number of value to return */ 22 Cnum, /* numbered capture; 'key' is number of value to return */
@@ -38,7 +39,8 @@ typedef struct CapState {
38 Capture *cap; /* current capture */ 39 Capture *cap; /* current capture */
39 Capture *ocap; /* (original) capture list */ 40 Capture *ocap; /* (original) capture list */
40 lua_State *L; 41 lua_State *L;
41 int ptop; /* index of last argument to 'match' */ 42 int ptop; /* stack index of last argument to 'match' */
43 int firstcap; /* stack index of first capture pushed in the stack */
42 const char *s; /* original string */ 44 const char *s; /* original string */
43 int valuecached; /* value stored in cache slot */ 45 int valuecached; /* value stored in cache slot */
44 int reclevel; /* recursion level */ 46 int reclevel; /* recursion level */
diff --git a/lpeg.html b/lpeg.html
index f50d327..9faa1c7 100644
--- a/lpeg.html
+++ b/lpeg.html
@@ -638,6 +638,10 @@ or no value when <code>number</code> is zero.</td></tr>
638<tr><td><a href="#cap-func"><code>patt / function</code></a></td> 638<tr><td><a href="#cap-func"><code>patt / function</code></a></td>
639 <td>the returns of <code>function</code> applied to the captures 639 <td>the returns of <code>function</code> applied to the captures
640 of <code>patt</code></td></tr> 640 of <code>patt</code></td></tr>
641<tr><td><a href="#cap-rep"><code>patt % function</code></a></td>
642 <td>the return of <code>function</code> applied to the previous
643 capture plus the captures of <code>patt</code></td></tr>;
644 the returned value becomes the value of the previous capture
641<tr><td><a href="#matchtime"><code>lpeg.Cmt(patt, function)</code></a></td> 645<tr><td><a href="#matchtime"><code>lpeg.Cmt(patt, function)</code></a></td>
642 <td>the returns of <code>function</code> applied to the captures 646 <td>the returns of <code>function</code> applied to the captures
643 of <code>patt</code>; the application is done at match time</td></tr> 647 of <code>patt</code>; the application is done at match time</td></tr>
@@ -889,6 +893,75 @@ there is no captured value.
889</p> 893</p>
890 894
891 895
896<h3><a name="cap-rep"></a><code>patt % function</code></h3>
897<p>
898Creates an <em>accumulator capture</em>.
899This pattern behaves similarly to a
900<a href="cap-func">function capture</a>,
901with the following differences:
902The last captured value is added as a first argument to
903the call;
904the return of the function is adjusted to one single value;
905that value becomes the last captured value.
906</p>
907
908<p>
909As an example,
910consider the following code fragment:
911</p>
912<pre class="example">
913local name = lpeg.C(lpeg.R("az")^1)
914local p = name * (lpeg.P("^") % string.upper)^-1
915print(p:match("count")) --&gt; count
916print(p:match("count^")) --&gt; COUNT
917</pre>
918<p>
919In the first match,
920the accumulator capture does not match,
921and so the match results in its first capture, a name.
922In the second match,
923the accumulator capture matches,
924so the function <code>string.upper</code>
925is called with the previous capture (created by <code>name</code>)
926plus the string <code>"^"</code>;
927the function ignores its second argument and returns the first argument
928changed to upper case;
929that value then becomes the first and only
930capture value created by the match.
931</p>
932-- matches a numeral and captures its numerical value
933number = lpeg.R"09"^1 / tonumber
934
935-- auxiliary function to add two numbers
936function add (acc, newvalue) return acc + newvalue end
937
938-- matches a list of numbers, adding their values
939sum = number * ("," * number % add)^0
940
941-- example of use
942print(sum:match("10,30,43")) --&gt; 83
943</pre>
944<p>
945First, the initial <code>number</code> captures a number;
946that first capture will play the role of an accumulator.
947Then, each time <code>number</code> matches inside the loop
948there is a accumulator capture:
949It calls <code>add</code> with the current value of the accumulator
950and the value of the new number,
951and their sum replaces the value of the accumulator.
952At the end of the match,
953the accumulator with all sums is the final value.
954</p>
955
956<p>
957Due to the nature of this capture,
958you should avoid using it in places where it is not clear
959what is its "previous" capture.
960Due to implementation details,
961you should not use this capture inside a substitution capture.
962</p>
963
964
892<h3><a name="matchtime"></a><code>lpeg.Cmt(patt, function)</code></h3> 965<h3><a name="matchtime"></a><code>lpeg.Cmt(patt, function)</code></h3>
893<p> 966<p>
894Creates a <em>match-time capture</em>. 967Creates a <em>match-time capture</em>.
@@ -968,19 +1041,17 @@ lpeg.locale(lpeg) -- adds locale entries into 'lpeg' table
968local space = lpeg.space^0 1041local space = lpeg.space^0
969local name = lpeg.C(lpeg.alpha^1) * space 1042local name = lpeg.C(lpeg.alpha^1) * space
970local sep = lpeg.S(",;") * space 1043local sep = lpeg.S(",;") * space
971local pair = lpeg.Cg(name * "=" * space * name) * sep^-1 1044local pair = name * "=" * space * name * sep^-1
972local list = lpeg.Cf(lpeg.Ct("") * pair^0, rawset) 1045local list = lpeg.Ct("") * (pair % rawset)^0
973t = list:match("a=b, c = hi; next = pi") --> { a = "b", c = "hi", next = "pi" } 1046t = list:match("a=b, c = hi; next = pi") --> { a = "b", c = "hi", next = "pi" }
974</pre> 1047</pre>
975<p> 1048<p>
976Each pair has the format <code>name = name</code> followed by 1049Each pair has the format <code>name = name</code> followed by
977an optional separator (a comma or a semicolon). 1050an optional separator (a comma or a semicolon).
978The <code>pair</code> pattern encloses the pair in a group pattern, 1051The <code>list</code> pattern then <em>folds</em> these captures.
979so that the names become the values of a single capture.
980The <code>list</code> pattern then folds these captures.
981It starts with an empty table, 1052It starts with an empty table,
982created by a table capture matching an empty string; 1053created by a table capture matching an empty string;
983then for each capture (a pair of names) it applies <code>rawset</code> 1054then for each a pair of names it applies <code>rawset</code>
984over the accumulator (the table) and the capture values (the pair of names). 1055over the accumulator (the table) and the capture values (the pair of names).
985<code>rawset</code> returns the table itself, 1056<code>rawset</code> returns the table itself,
986so the accumulator is always the table. 1057so the accumulator is always the table.
@@ -1295,8 +1366,8 @@ end
1295-- Grammar 1366-- Grammar
1296local V = lpeg.V 1367local V = lpeg.V
1297G = lpeg.P{ "Exp", 1368G = lpeg.P{ "Exp",
1298 Exp = lpeg.Cf(V"Term" * lpeg.Cg(TermOp * V"Term")^0, eval); 1369 Exp = V"Term" * (TermOp * V"Term" % eval)^0;
1299 Term = lpeg.Cf(V"Factor" * lpeg.Cg(FactorOp * V"Factor")^0, eval); 1370 Term = V"Factor" * (FactorOp * V"Factor" % eval)^0;
1300 Factor = Number / tonumber + Open * V"Exp" * Close; 1371 Factor = Number / tonumber + Open * V"Exp" * Close;
1301} 1372}
1302 1373
@@ -1304,7 +1375,7 @@ G = lpeg.P{ "Exp",
1304print(lpeg.match(G, "3 + 5*9 / (1+1) - 12")) --> 13.5 1375print(lpeg.match(G, "3 + 5*9 / (1+1) - 12")) --> 13.5
1305</pre> 1376</pre>
1306<p> 1377<p>
1307Note the use of the fold (accumulator) capture. 1378Note the use of the accumulator capture.
1308To compute the value of an expression, 1379To compute the value of an expression,
1309the accumulator starts with the value of the first term, 1380the accumulator starts with the value of the first term,
1310and then applies <code>eval</code> over 1381and then applies <code>eval</code> over
diff --git a/lpprint.c b/lpprint.c
index 54a3da7..1142376 100644
--- a/lpprint.c
+++ b/lpprint.c
@@ -60,7 +60,7 @@ static void printTcharset (TTree *tree) {
60static const char *capkind (int kind) { 60static const char *capkind (int kind) {
61 const char *const modes[] = { 61 const char *const modes[] = {
62 "close", "position", "constant", "backref", 62 "close", "position", "constant", "backref",
63 "argument", "simple", "table", "function", 63 "argument", "simple", "table", "function", "replace",
64 "query", "string", "num", "substitution", "fold", 64 "query", "string", "num", "substitution", "fold",
65 "runtime", "group"}; 65 "runtime", "group"};
66 return modes[kind]; 66 return modes[kind];
@@ -147,7 +147,6 @@ void printpatt (Instruction *p) {
147} 147}
148 148
149 149
150#if defined(LPEG_DEBUG)
151static void printcap (Capture *cap) { 150static void printcap (Capture *cap) {
152 printf("%s (idx: %d - size: %d) -> %p\n", 151 printf("%s (idx: %d - size: %d) -> %p\n",
153 capkind(cap->kind), cap->idx, cap->siz, cap->s); 152 capkind(cap->kind), cap->idx, cap->siz, cap->s);
@@ -160,7 +159,6 @@ void printcaplist (Capture *cap, Capture *limit) {
160 printcap(cap); 159 printcap(cap);
161 printf("=======\n"); 160 printf("=======\n");
162} 161}
163#endif
164 162
165/* }====================================================== */ 163/* }====================================================== */
166 164
diff --git a/lptree.c b/lptree.c
index 0b2d824..330471a 100644
--- a/lptree.c
+++ b/lptree.c
@@ -19,7 +19,7 @@
19const byte numsiblings[] = { 19const byte numsiblings[] = {
20 0, 0, 0, /* char, set, any */ 20 0, 0, 0, /* char, set, any */
21 0, 0, 0, /* true, false, utf-range */ 21 0, 0, 0, /* true, false, utf-range */
22 1, /* rep */ 22 1, /* acc */
23 2, 2, /* seq, choice */ 23 2, 2, /* seq, choice */
24 1, 1, /* not, and */ 24 1, 1, /* not, and */
25 0, 0, 2, 1, 1, /* call, opencall, rule, prerule, grammar */ 25 0, 0, 2, 1, 1, /* call, opencall, rule, prerule, grammar */
@@ -850,6 +850,11 @@ static int lp_divcapture (lua_State *L) {
850} 850}
851 851
852 852
853static int lp_acccapture (lua_State *L) {
854 return capture_aux(L, Cacc, 2);
855}
856
857
853static int lp_substcapture (lua_State *L) { 858static int lp_substcapture (lua_State *L) {
854 return capture_aux(L, Csubst, 0); 859 return capture_aux(L, Csubst, 0);
855} 860}
@@ -1250,7 +1255,7 @@ static int lp_match (lua_State *L) {
1250 int ptop = lua_gettop(L); 1255 int ptop = lua_gettop(L);
1251 lua_pushnil(L); /* initialize subscache */ 1256 lua_pushnil(L); /* initialize subscache */
1252 lua_pushlightuserdata(L, capture); /* initialize caplistidx */ 1257 lua_pushlightuserdata(L, capture); /* initialize caplistidx */
1253 lua_getuservalue(L, 1); /* initialize penvidx */ 1258 lua_getuservalue(L, 1); /* initialize ktableidx */
1254 r = match(L, s, s + i, s + l, code, capture, ptop); 1259 r = match(L, s, s + i, s + l, code, capture, ptop);
1255 if (r == NULL) { 1260 if (r == NULL) {
1256 lua_pushnil(L); 1261 lua_pushnil(L);
@@ -1369,6 +1374,7 @@ static struct luaL_Reg metareg[] = {
1369 {"__gc", lp_gc}, 1374 {"__gc", lp_gc},
1370 {"__len", lp_and}, 1375 {"__len", lp_and},
1371 {"__div", lp_divcapture}, 1376 {"__div", lp_divcapture},
1377 {"__mod", lp_acccapture},
1372 {"__unm", lp_not}, 1378 {"__unm", lp_not},
1373 {"__sub", lp_sub}, 1379 {"__sub", lp_sub},
1374 {NULL, NULL} 1380 {NULL, NULL}
diff --git a/makefile b/makefile
index 2b3c12e..2aff497 100644
--- a/makefile
+++ b/makefile
@@ -3,7 +3,7 @@ LUADIR = ./lua/
3 3
4# COPT = -O2 -DNDEBUG 4# COPT = -O2 -DNDEBUG
5# COPT = -g 5# COPT = -g
6COPT = -DLPEG_DEBUG -g 6COPT = -O0 -DLPEG_DEBUG -g
7 7
8CWARNS = -Wall -Wextra -pedantic \ 8CWARNS = -Wall -Wextra -pedantic \
9 -Waggregate-return \ 9 -Waggregate-return \
diff --git a/test.lua b/test.lua
index d31a69f..cd85b31 100755
--- a/test.lua
+++ b/test.lua
@@ -493,8 +493,8 @@ local function f_term (v1, op, v2, d)
493end 493end
494 494
495G = m.P{ "Exp", 495G = m.P{ "Exp",
496 Exp = m.Cf(V"Factor" * m.Cg(FactorOp * V"Factor")^0, f_factor); 496 Exp = V"Factor" * (FactorOp * V"Factor" % f_factor)^0;
497 Factor = m.Cf(V"Term" * m.Cg(TermOp * V"Term")^0, f_term); 497 Factor = V"Term" * (TermOp * V"Term" % f_term)^0;
498 Term = Number / tonumber + Open * V"Exp" * Close; 498 Term = Number / tonumber + Open * V"Exp" * Close;
499} 499}
500 500
@@ -866,6 +866,7 @@ print"+"
866-- accumulator capture 866-- accumulator capture
867function f (x) return x + 1 end 867function f (x) return x + 1 end
868assert(m.match(m.Cf(m.Cc(0) * m.C(1)^0, f), "alo alo") == 7) 868assert(m.match(m.Cf(m.Cc(0) * m.C(1)^0, f), "alo alo") == 7)
869assert(m.match(m.Cc(0) * (m.C(1) % f)^0, "alo alo") == 7)
869 870
870t = {m.match(m.Cf(m.Cc(1,2,3), error), "")} 871t = {m.match(m.Cf(m.Cc(1,2,3), error), "")}
871checkeq(t, {1}) 872checkeq(t, {1})
@@ -875,7 +876,7 @@ t = p:match("a=b;c=du;xux=yuy;")
875checkeq(t, {a="b", c="du", xux="yuy"}) 876checkeq(t, {a="b", c="du", xux="yuy"})
876 877
877 878
878-- errors in accumulator capture 879-- errors in fold capture
879 880
880-- no initial capture 881-- no initial capture
881checkerr("no initial value", m.match, m.Cf(m.P(5), print), 'aaaaaa') 882checkerr("no initial value", m.match, m.Cf(m.P(5), print), 'aaaaaa')
@@ -883,8 +884,14 @@ checkerr("no initial value", m.match, m.Cf(m.P(5), print), 'aaaaaa')
883checkerr("no initial value", m.match, m.Cf(m.P(500), print), 884checkerr("no initial value", m.match, m.Cf(m.P(500), print),
884 string.rep('a', 600)) 885 string.rep('a', 600))
885 886
886-- nested capture produces no initial value 887
887checkerr("no initial value", m.match, m.Cf(m.P(1) / {}, print), "alo") 888-- errors in accumulator capture
889
890-- no initial capture
891checkerr("no previous value", m.match, m.P(5) % print, 'aaaaaa')
892-- no initial capture (very long match forces fold to be a pair open-close)
893checkerr("no previous value", m.match, m.P(500) % print,
894 string.rep('a', 600))
888 895
889 896
890-- tests for loop checker 897-- tests for loop checker