From 9a9ee3d9ab8ce435d743d293ec43075151370200 Mon Sep 17 00:00:00 2001 From: Roberto Ierusalimschy Date: Mon, 19 Jun 2023 11:14:02 -0300 Subject: Some fixes in vibibility check for back captures --- lpcap.c | 24 +++++++++++++------- lpcap.h | 6 +++++ lpeg.html | 75 +++++++++++++++++++++++++++++++++++++-------------------------- lpprint.c | 4 ++-- lpvm.c | 5 ----- test.lua | 29 ++++++++++++++++++++++++ 6 files changed, 97 insertions(+), 46 deletions(-) diff --git a/lpcap.c b/lpcap.c index 74a34db..fca8cbb 100644 --- a/lpcap.c +++ b/lpcap.c @@ -126,15 +126,23 @@ static void pushonenestedvalue (CapState *cs) { /* -** Checks whether group 'grp' is visible to 'ref', that is, -** 'grp' is not nested inside a capture that does not contain 'ref'. -** To do that, must find minimum capture that contains 'grp'. +** Checks whether group 'grp' is visible to 'ref', that is, 'grp' is +** not nested inside a full capture that does not contain 'ref'. (We +** only need to care for full captures because the search at 'findback' +** skips open-end blocks; so, if 'grp' is nested in a non-full capture, +** 'ref' is also inside it.) To check this, we search backward for the +** inner full capture enclosing 'grp'. A full capture cannot contain +** non-full captures, so a close capture means we cannot be inside a +** full capture anymore. */ static int capvisible (CapState *cs, Capture *grp, Capture *ref) { Capture *cap = grp; - while (cap-- > cs->ocap) { /* repeat until end of list */ + int i = MAXLOP; /* maximum distance for an 'open' */ + while (i-- > 0 && cap-- > cs->ocap) { if (isclosecap(cap)) - cap = findopen(cap); /* skip nested captures */ + return 1; /* can stop the search */ + else if (grp->index - cap->index >= UCHAR_MAX) + return 1; /* can stop the search */ else if (capinside(cap, grp)) /* is 'grp' inside cap? */ return capinside(cap, ref); /* ok iff cap also contains 'ref' */ } @@ -150,10 +158,10 @@ static Capture *findback (CapState *cs, Capture *ref) { lua_State *L = cs->L; Capture *cap = ref; while (cap-- > cs->ocap) { /* repeat until end of list */ - if (isopencap(cap)) - continue; /* enclosing captures are not visible to 'ref' */ - else if (isclosecap(cap)) + if (isclosecap(cap)) cap = findopen(cap); /* skip nested captures */ + else if (capinside(cap, ref)) + continue; /* enclosing captures are not visible to 'ref' */ if (captype(cap) == Cgroup && capvisible(cs, cap, ref)) { getfromktable(cs, cap->idx); /* get group name */ if (lp_equal(L, -2, -1)) { /* right group? */ diff --git a/lpcap.h b/lpcap.h index 30f3714..abbd553 100644 --- a/lpcap.h +++ b/lpcap.h @@ -70,6 +70,12 @@ typedef struct CapState { : (c2)->index < (c1)->index + (c1)->siz - 1) +/** +** Maximum number of captures to visit when looking for an 'open'. +*/ +#define MAXLOP 20 + + int runtimecap (CapState *cs, Capture *close, const char *s, int *rem); int getcaptures (lua_State *L, const char *s, const char *r, int ptop); diff --git a/lpeg.html b/lpeg.html index 9faa1c7..c9bd9f9 100644 --- a/lpeg.html +++ b/lpeg.html @@ -608,17 +608,17 @@ The following table summarizes the basic captures: lpeg.Carg(n) the value of the nth extra argument to lpeg.match (matches the empty string) -lpeg.Cb(name) +lpeg.Cb(key) the values produced by the previous - group capture named name + group capture named key (matches the empty string) lpeg.Cc(values) the given values (matches the empty string) lpeg.Cf(patt, func) a folding of the captures from patt -lpeg.Cg(patt [, name]) +lpeg.Cg(patt [, key]) the values produced by patt, - optionally tagged with name + optionally tagged with key lpeg.Cp() the current position (matches the empty string) lpeg.Cs(patt) @@ -639,9 +639,10 @@ or no value when number is zero. the returns of function applied to the captures of patt patt % function - the return of function applied to the previous - capture plus the captures of patt; - the returned value becomes the value of the previous capture + produces no value; + it accummulates the captures from patt + into the previous capture through function + lpeg.Cmt(patt, function) the returns of function applied to the captures of patt; the application is done at match time @@ -699,24 +700,25 @@ argument given in the call to lpeg.match.

-

lpeg.Cb (name)

+

lpeg.Cb (key)

Creates a back capture. This pattern matches the empty string and produces the values produced by the most recent -group capture named name -(where name can be any Lua value). +group capture named key +(where key can be any Lua value).

Most recent means the last complete outermost -group capture with the given name. +group capture with the given key. A Complete capture means that the entire pattern -corresponding to the capture has matched. +corresponding to the capture has matched; +in other words, the back capture is not nested inside the group. An Outermost capture means that the capture is not inside -another complete capture. +another complete capture that does not contain the back capture itself.

@@ -785,13 +787,13 @@ print(sum:match("10,30,43")) --> 83 -

lpeg.Cg (patt [, name])

+

lpeg.Cg (patt [, key])

Creates a group capture. It groups all values returned by patt into a single capture. -The group may be anonymous (if no name is given) -or named with the given name +The group may be anonymous (if no key is given) +or named with the given key (which can be any non-nil Lua value).

@@ -837,7 +839,7 @@ starting at 1. Moreover, for each named capture group created by patt, the first value of the group is put into the table -with the group name as its key. +with the group key as its key. The captured value is only the table.

@@ -897,12 +899,14 @@ there is no captured value.

Creates an accumulator capture. This pattern behaves similarly to a -function capture, +function capture, with the following differences: The last captured value is added as a first argument to the call; the return of the function is adjusted to one single value; -that value becomes the last captured value. +that value replaces the last captured value. +Note that the capture itself produces no values; +it only changes the value of its previous capture.

@@ -929,6 +933,12 @@ changed to upper case; that value then becomes the first and only capture value created by the match.

+ +

+As another example, +let us consider the problem of adding a list of numbers. +

+
 -- matches a numeral and captures its numerical value
 number = lpeg.R"09"^1 / tonumber
 
@@ -944,11 +954,11 @@ print(sum:match("10,30,43"))   --> 83
 

First, the initial number captures a number; that first capture will play the role of an accumulator. -Then, each time number matches inside the loop -there is a accumulator capture: +Then, each time the sequence comma-number +matches inside the loop there is an accumulator capture: It calls add with the current value of the accumulator and the value of the new number, -and their sum replaces the value of the accumulator. +and the result of the call (their sum) replaces the value of the accumulator. At the end of the match, the accumulator with all sums is the final value.

@@ -956,9 +966,12 @@ the accumulator with all sums is the final value.

Due to the nature of this capture, you should avoid using it in places where it is not clear -what is its "previous" capture. +what is its "previous" capture +(e.g., directly nested in a string capture +or a numbered capture). Due to implementation details, -you should not use this capture inside a substitution capture. +you should not use this capture directly nested in a +substitution capture.

@@ -1014,9 +1027,9 @@ local lpeg = require "lpeg" -- matches a word followed by end-of-string p = lpeg.R"az"^1 * -1 -print(p:match("hello")) --> 6 -print(lpeg.match(p, "hello")) --> 6 -print(p:match("1 hello")) --> nil +print(p:match("hello")) --> 6 +print(lpeg.match(p, "hello")) --> 6 +print(p:match("1 hello")) --> nil

The pattern is simply a sequence of one or more lower-case letters @@ -1043,7 +1056,7 @@ local name = lpeg.C(lpeg.alpha^1) * space local sep = lpeg.S(",;") * space local pair = name * "=" * space * name * sep^-1 local list = lpeg.Ct("") * (pair % rawset)^0 -t = list:match("a=b, c = hi; next = pi") --> { a = "b", c = "hi", next = "pi" } +t = list:match("a=b, c = hi; next = pi") --> { a = "b", c = "hi", next = "pi" }

Each pair has the format name = name followed by @@ -1135,7 +1148,7 @@ function anywhere (p) return lpeg.P{ I * p * I + 1 * lpeg.V(1) } end -print(anywhere("world"):match("hello world!")) -> 7 12 +print(anywhere("world"):match("hello world!")) --> 7 12

@@ -1344,7 +1357,7 @@ function evalExp (s) end -- small example -print(evalExp"3 + 5*9 / (1+1) - 12") --> 13.5 +print(evalExp"3 + 5*9 / (1+1) - 12") --> 13.5

@@ -1372,7 +1385,7 @@ G = lpeg.P{ "Exp", } -- small example -print(lpeg.match(G, "3 + 5*9 / (1+1) - 12")) --> 13.5 +print(lpeg.match(G, "3 + 5*9 / (1+1) - 12")) --> 13.5

Note the use of the accumulator capture. diff --git a/lpprint.c b/lpprint.c index 6349ac2..da902e6 100644 --- a/lpprint.c +++ b/lpprint.c @@ -149,8 +149,8 @@ void printpatt (Instruction *p) { static void printcap (Capture *cap, int ident) { while (ident--) printf(" "); - printf("%s (idx: %d - size: %d) -> %lu\n", - capkind(cap->kind), cap->idx, cap->siz, (long)cap->index); + printf("%s (idx: %d - size: %d) -> %lu (%p)\n", + capkind(cap->kind), cap->idx, cap->siz, (long)cap->index, (void*)cap); } diff --git a/lpvm.c b/lpvm.c index 5a30679..0a2fde4 100644 --- a/lpvm.c +++ b/lpvm.c @@ -198,11 +198,6 @@ static int removedyncap (lua_State *L, Capture *capture, } -/** -** Maximum number of captures to visit when looking for an 'open'. -*/ -#define MAXLOP 20 - /* ** Find the corresponding 'open' capture before 'cap', when that capture ** can become a full capture. If a full capture c1 is followed by an diff --git a/test.lua b/test.lua index cd85b31..7e61603 100755 --- a/test.lua +++ b/test.lua @@ -1005,6 +1005,35 @@ p = m.Cg(m.C(1) * m.C(1), "k") * m.Ct(m.Cb("k")) t = p:match("ab") checkeq(t, {"a", "b"}) + +do + -- some basic cases + assert(m.match(m.Cg(m.Cc(3), "a") * m.Cb("a"), "a") == 3) + assert(m.match(m.Cg(m.C(1), 133) * m.Cb(133), "X") == "X") + + -- first reference to 'x' should not see the group enclosing it + local p = m.Cg(m.Cb('x'), 'x') * m.Cb('x') + checkerr("back reference 'x' not found", m.match, p, '') + + local p = m.Cg(m.Cb('x') * m.C(1), 'x') * m.Cb('x') + checkerr("back reference 'x' not found", m.match, p, 'abc') + + -- reference to 'x' should not see the group enclosed in another capture + local s = string.rep("a", 30) + local p = (m.C(1)^-4 * m.Cg(m.C(1), 'x')) / {} * m.Cb('x') + checkerr("back reference 'x' not found", m.match, p, s) + + local p = (m.C(1)^-20 * m.Cg(m.C(1), 'x')) / {} * m.Cb('x') + checkerr("back reference 'x' not found", m.match, p, s) + + -- second reference 'k' should refer to 10 and first ref. 'k' + p = m.Cg(m.Cc(20), 'k') * m.Cg(m.Cc(10) * m.Cb('k') * m.C(1), 'k') + * (m.Cb('k') / function (a,b,c) return a*10 + b + tonumber(c) end) + -- 10 * 10 (Cc) + 20 (Cb) + 7 (C) == 127 + assert(p:match("756") == 127) + +end + p = m.P(true) for i = 1, 10 do p = p * m.Cg(1, i) end for i = 1, 10 do -- cgit v1.2.3-55-g6feb