Some fixes in visibility check for back captures

author: Roberto Ierusalimschy <roberto@inf.puc-rio.br> 2023-06-19 11:14:02 -0300
committer: Roberto Ierusalimschy <roberto@inf.puc-rio.br> 2023-06-19 11:14:02 -0300
commit: 9a9ee3d9ab8ce435d743d293ec43075151370200 (patch)
tree: 445290bfa04c2cd30f514b65b90b1d8b973f21f1
parent: a561630f17e61548193666abf9a8b20f20462558 (diff)
download: lpeg-9a9ee3d9ab8ce435d743d293ec43075151370200.tar.gz
lpeg-9a9ee3d9ab8ce435d743d293ec43075151370200.tar.bz2
lpeg-9a9ee3d9ab8ce435d743d293ec43075151370200.zip
6 files changed, 97 insertions, 46 deletions
diff --git a/lpcap.c b/lpcap.c
index 74a34db..fca8cbb 100644
--- a/lpcap.c
+++ b/lpcap.c
@@ -126,15 +126,23 @@ static void pushonenestedvalue (CapState *cs) {
 /*
-** Checks whether group 'grp' is visible to 'ref', that is,
+** Checks whether group 'grp' is visible to 'ref', that is, 'grp' is
-** 'grp' is not nested inside a capture that does not contain 'ref'.
+** not nested inside a full capture that does not contain 'ref'.  (We
-** To do that, must find minimum capture that contains 'grp'.
+** only need to care for full captures because the search at 'findback'
+** skips open-end blocks; so, if 'grp' is nested in a non-full capture,
+** 'ref' is also inside it.)  To check this, we search backward for the
+** innermost full capture enclosing 'grp'.  A full capture cannot contain
+** non-full captures, so a close capture means we cannot be inside a
+** full capture anymore.
 */
 static int capvisible (CapState *cs, Capture *grp, Capture *ref) {
  Capture *cap = grp;
-  while (cap-- > cs->ocap) {  /* repeat until end of list */
+  int i = MAXLOP;  /* maximum distance for an 'open' */
+  while (i-- > 0 && cap-- > cs->ocap) {
    if (isclosecap(cap))
-      cap = findopen(cap);  /* skip nested captures */
+      return 1;  /* can stop the search */
+    else if (grp->index - cap->index >= UCHAR_MAX)
+      return 1;  /* can stop the search */
    else if (capinside(cap, grp))  /* is 'grp' inside cap? */
      return capinside(cap, ref);  /* ok iff cap also contains 'ref' */
  }
@@ -150,10 +158,10 @@ static Capture *findback (CapState *cs, Capture *ref) {
  lua_State *L = cs->L;
  Capture *cap = ref;
  while (cap-- > cs->ocap) {  /* repeat until end of list */
-    if (isopencap(cap))
+    if (isclosecap(cap))
-      continue;  /* enclosing captures are not visible to 'ref' */
-    else if (isclosecap(cap))
      cap = findopen(cap);  /* skip nested captures */
+    else if (capinside(cap, ref))
+      continue;  /* enclosing captures are not visible to 'ref' */
    if (captype(cap) == Cgroup && capvisible(cs, cap, ref)) {
      getfromktable(cs, cap->idx);  /* get group name */
      if (lp_equal(L, -2, -1)) {  /* right group? */
diff --git a/lpcap.h b/lpcap.h
index 30f3714..abbd553 100644
--- a/lpcap.h
+++ b/lpcap.h
@@ -70,6 +70,12 @@ typedef struct CapState {
                       : (c2)->index < (c1)->index + (c1)->siz - 1)
+/**
+** Maximum number of captures to visit when looking for an 'open'.
+*/
+#define MAXLOP          20
 int runtimecap (CapState *cs, Capture *close, const char *s, int *rem);
 int getcaptures (lua_State *L, const char *s, const char *r, int ptop);
diff --git a/lpeg.html b/lpeg.html
index 9faa1c7..c9bd9f9 100644
--- a/lpeg.html
+++ b/lpeg.html
@@ -608,17 +608,17 @@ The following table summarizes the basic captures:
 <tr><td><a href="#cap-arg"><code>lpeg.Carg(n)</code></a></td>
    <td>the value of the n<sup>th</sup> extra argument to
        <code>lpeg.match</code> (matches the empty string)</td></tr>
-<tr><td><a href="#cap-b"><code>lpeg.Cb(name)</code></a></td>
+<tr><td><a href="#cap-b"><code>lpeg.Cb(key)</code></a></td>
    <td>the values produced by the previous
-        group capture named <code>name</code>
+        group capture named <code>key</code>
        (matches the empty string)</td></tr>
 <tr><td><a href="#cap-cc"><code>lpeg.Cc(values)</code></a></td>
    <td>the given values (matches the empty string)</td></tr>
 <tr><td><a href="#cap-f"><code>lpeg.Cf(patt, func)</code></a></td>
  <td>a <em>folding</em> of the captures from <code>patt</code></td></tr>
-<tr><td><a href="#cap-g"><code>lpeg.Cg(patt [, name])</code></a></td>
+<tr><td><a href="#cap-g"><code>lpeg.Cg(patt [, key])</code></a></td>
    <td>the values produced by <code>patt</code>,
-        optionally tagged with <code>name</code></td></tr>
+        optionally tagged with <code>key</code></td></tr>
 <tr><td><a href="#cap-p"><code>lpeg.Cp()</code></a></td>
    <td>the current position (matches the empty string)</td></tr>
 <tr><td><a href="#cap-s"><code>lpeg.Cs(patt)</code></a></td>
@@ -639,9 +639,10 @@ or no value when <code>number</code> is zero.</td></tr>
  <td>the returns of <code>function</code> applied to the captures
      of <code>patt</code></td></tr>
 <tr><td><a href="#cap-rep"><code>patt % function</code></a></td>
-  <td>the return of <code>function</code> applied to the previous
+  <td>produces no value;
-      capture plus the captures of <code>patt</code></td></tr>;
+      it <em>accumulates</em> the captures from <code>patt</code>
-      the returned value becomes the value of the previous capture
+      into the previous capture through <code>function</code>
+      </td></tr>
 <tr><td><a href="#matchtime"><code>lpeg.Cmt(patt, function)</code></a></td>
  <td>the returns of <code>function</code> applied to the captures
      of <code>patt</code>; the application is done at match time</td></tr>
@@ -699,24 +700,25 @@ argument given in the call to <code>lpeg.match</code>.
 </p>
-<h3><a name="cap-b"></a><code>lpeg.Cb (name)</code></h3>
+<h3><a name="cap-b"></a><code>lpeg.Cb (key)</code></h3>
 <p>
 Creates a <em>back capture</em>.
 This pattern matches the empty string and
 produces the values produced by the <em>most recent</em>
-<a href="#cap-g">group capture</a> named <code>name</code>
+<a href="#cap-g">group capture</a> named <code>key</code>
-(where <code>name</code> can be any Lua value).
+(where <code>key</code> can be any Lua value).
 </p>
 <p>
 <em>Most recent</em> means the last
 <em>complete</em>
 <em>outermost</em>
-group capture with the given name.
+group capture with the given key.
 A <em>Complete</em> capture means that the entire pattern
-corresponding to the capture has matched.
+corresponding to the capture has matched;
+in other words, the back capture is not nested inside the group.
 An <em>Outermost</em> capture means that the capture is not inside
-another complete capture.
+another complete capture that does not contain the back capture itself.
 </p>
 <p>
@@ -785,13 +787,13 @@ print(sum:match("10,30,43"))   --&gt; 83
 </pre>
-<h3><a name="cap-g"></a><code>lpeg.Cg (patt [, name])</code></h3>
+<h3><a name="cap-g"></a><code>lpeg.Cg (patt [, key])</code></h3>
 <p>
 Creates a <em>group capture</em>.
 It groups all values returned by <code>patt</code>
 into a single capture.
-The group may be anonymous (if no name is given)
+The group may be anonymous (if no key is given)
-or named with the given name
+or named with the given key
 (which can be any non-nil Lua value).
 </p>
@@ -837,7 +839,7 @@ starting at 1.
 Moreover,
 for each named capture group created by <code>patt</code>,
 the first value of the group is put into the table
-with the group name as its key.
+with the group key as its key.
 The captured value is only the table.
 </p>
@@ -897,12 +899,14 @@ there is no captured value.
 <p>
 Creates an <em>accumulator capture</em>.
 This pattern behaves similarly to a
-<a href="cap-func">function capture</a>,
+<a href="#cap-func">function capture</a>,
 with the following differences:
 The last captured value is added as a first argument to
 the call;
 the return of the function is adjusted to one single value;
-that value becomes the last captured value.
+that value replaces the last captured value.
+Note that the capture itself produces no values;
+it only changes the value of its previous capture.
 </p>
 <p>
@@ -929,6 +933,12 @@ changed to upper case;
 that value then becomes the first and only
 capture value created by the match.
 </p>
+<p>
+As another example,
+let us consider the problem of adding a list of numbers.
+</p>
+<pre class="example">
 -- matches a numeral and captures its numerical value
 number = lpeg.R"09"^1 / tonumber
@@ -944,11 +954,11 @@ print(sum:match("10,30,43"))   --&gt; 83
 <p>
 First, the initial <code>number</code> captures a number;
 that first capture will play the role of an accumulator.
-Then, each time <code>number</code> matches inside the loop
+Then, each time the sequence <code>comma-number</code>
-there is a accumulator capture:
+matches inside the loop there is an accumulator capture:
 It calls <code>add</code> with the current value of the accumulator
 and the value of the new number,
-and their sum replaces the value of the accumulator.
+and the result of the call (their sum) replaces the value of the accumulator.
 At the end of the match,
 the accumulator with all sums is the final value.
 </p>
@@ -956,9 +966,12 @@ the accumulator with all sums is the final value.
 <p>
 Due to the nature of this capture,
 you should avoid using it in places where it is not clear
-what is its "previous" capture.
+what is its "previous" capture
+(e.g., directly nested in a <a href="#cap-string">string capture</a>
+or a <a href="#cap-num">numbered capture</a>).
 Due to implementation details,
-you should not use this capture inside a substitution capture.
+you should not use this capture directly nested in a
+<a href="#cap-s">substitution capture</a>.
 </p>
@@ -1014,9 +1027,9 @@ local lpeg = require "lpeg"
 -- matches a word followed by end-of-string
 p = lpeg.R"az"^1 * -1
-print(p:match("hello"))        --> 6
+print(p:match("hello"))        --&gt; 6
-print(lpeg.match(p, "hello"))  --> 6
+print(lpeg.match(p, "hello"))  --&gt; 6
-print(p:match("1 hello"))      --> nil
+print(p:match("1 hello"))      --&gt; nil
 </pre>
 <p>
 The pattern is simply a sequence of one or more lower-case letters
@@ -1043,7 +1056,7 @@ local name = lpeg.C(lpeg.alpha^1) * space
 local sep = lpeg.S(",;") * space
 local pair = name * "=" * space * name * sep^-1
 local list = lpeg.Ct("") * (pair % rawset)^0
-t = list:match("a=b, c = hi; next = pi")  --> { a = "b", c = "hi", next = "pi" }
+t = list:match("a=b, c = hi; next = pi")  --&gt; { a = "b", c = "hi", next = "pi" }
 </pre>
 <p>
 Each pair has the format <code>name = name</code> followed by
@@ -1135,7 +1148,7 @@ function anywhere (p)
  return lpeg.P{ I * p * I + 1 * lpeg.V(1) }
 end
-print(anywhere("world"):match("hello world!"))   -> 7   12
+print(anywhere("world"):match("hello world!"))   --&gt; 7   12
 </pre>
 <p>
@@ -1344,7 +1357,7 @@ function evalExp (s)
 end
 -- small example
-print(evalExp"3 + 5*9 / (1+1) - 12")   --> 13.5
+print(evalExp"3 + 5*9 / (1+1) - 12")   --&gt; 13.5
 </pre>
 <p>
@@ -1372,7 +1385,7 @@ G = lpeg.P{ "Exp",
 }
 -- small example
-print(lpeg.match(G, "3 + 5*9 / (1+1) - 12"))   --> 13.5
+print(lpeg.match(G, "3 + 5*9 / (1+1) - 12"))   --&gt; 13.5
 </pre>
 <p>
 Note the use of the accumulator capture.
diff --git a/lpprint.c b/lpprint.c
index 6349ac2..da902e6 100644
--- a/lpprint.c
+++ b/lpprint.c
@@ -149,8 +149,8 @@ void printpatt (Instruction *p) {
 static void printcap (Capture *cap, int ident) {
  while (ident--) printf(" ");
-  printf("%s (idx: %d - size: %d) -> %lu\n",
+  printf("%s (idx: %d - size: %d) -> %lu  (%p)\n",
-         capkind(cap->kind), cap->idx, cap->siz, (long)cap->index);
+         capkind(cap->kind), cap->idx, cap->siz, (long)cap->index, (void*)cap);
 }
diff --git a/lpvm.c b/lpvm.c
index 5a30679..0a2fde4 100644
--- a/lpvm.c
+++ b/lpvm.c
@@ -198,11 +198,6 @@ static int removedyncap (lua_State *L, Capture *capture,
 }
-/**
-** Maximum number of captures to visit when looking for an 'open'.
-*/
-#define MAXLOP          20
 /*
 ** Find the corresponding 'open' capture before 'cap', when that capture
 ** can become a full capture. If a full capture c1 is followed by an
diff --git a/test.lua b/test.lua
index cd85b31..7e61603 100755
--- a/test.lua
+++ b/test.lua
@@ -1005,6 +1005,35 @@ p = m.Cg(m.C(1) * m.C(1), "k") * m.Ct(m.Cb("k"))
 t = p:match("ab")
 checkeq(t, {"a", "b"})
+do
+  -- some basic cases
+  assert(m.match(m.Cg(m.Cc(3), "a") * m.Cb("a"), "a") == 3)
+  assert(m.match(m.Cg(m.C(1), 133) * m.Cb(133), "X") == "X")
+  -- first reference to 'x' should not see the group enclosing it
+  local p = m.Cg(m.Cb('x'), 'x') * m.Cb('x')
+  checkerr("back reference 'x' not found", m.match, p, '')
+  local p = m.Cg(m.Cb('x') * m.C(1), 'x') * m.Cb('x')
+  checkerr("back reference 'x' not found", m.match, p, 'abc')
+  -- reference to 'x' should not see the group enclosed in another capture
+  local s = string.rep("a", 30)
+  local p = (m.C(1)^-4 * m.Cg(m.C(1), 'x')) / {} * m.Cb('x')
+  checkerr("back reference 'x' not found", m.match, p, s)
+  local p = (m.C(1)^-20 * m.Cg(m.C(1), 'x')) / {} * m.Cb('x')
+  checkerr("back reference 'x' not found", m.match, p, s)
+  -- second reference 'k' should refer to 10 and first ref. 'k'
+  p = m.Cg(m.Cc(20), 'k') * m.Cg(m.Cc(10) * m.Cb('k') * m.C(1), 'k')
+      * (m.Cb('k') / function (a,b,c) return a*10 + b + tonumber(c) end)
+  -- 10 * 10 (Cc) + 20 (Cb) + 7 (C) == 127
+  assert(p:match("756") == 127)
+end
 p = m.P(true)
 for i = 1, 10 do p = p * m.Cg(1, i) end
 for i = 1, 10 do
author	Roberto Ierusalimschy <roberto@inf.puc-rio.br>	2023-06-19 11:14:02 -0300
committer	Roberto Ierusalimschy <roberto@inf.puc-rio.br>	2023-06-19 11:14:02 -0300
commit	9a9ee3d9ab8ce435d743d293ec43075151370200 (patch)
tree	445290bfa04c2cd30f514b65b90b1d8b973f21f1
parent	a561630f17e61548193666abf9a8b20f20462558 (diff)
download	lpeg-9a9ee3d9ab8ce435d743d293ec43075151370200.tar.gz lpeg-9a9ee3d9ab8ce435d743d293ec43075151370200.tar.bz2 lpeg-9a9ee3d9ab8ce435d743d293ec43075151370200.zip