diff options
author | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2023-06-19 11:14:02 -0300 |
---|---|---|
committer | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2023-06-19 11:14:02 -0300 |
commit | 9a9ee3d9ab8ce435d743d293ec43075151370200 (patch) | |
tree | 445290bfa04c2cd30f514b65b90b1d8b973f21f1 | |
parent | a561630f17e61548193666abf9a8b20f20462558 (diff) | |
download | lpeg-9a9ee3d9ab8ce435d743d293ec43075151370200.tar.gz lpeg-9a9ee3d9ab8ce435d743d293ec43075151370200.tar.bz2 lpeg-9a9ee3d9ab8ce435d743d293ec43075151370200.zip |
Some fixes in vibibility check for back captures
-rw-r--r-- | lpcap.c | 24 | ||||
-rw-r--r-- | lpcap.h | 6 | ||||
-rw-r--r-- | lpeg.html | 75 | ||||
-rw-r--r-- | lpprint.c | 4 | ||||
-rw-r--r-- | lpvm.c | 5 | ||||
-rwxr-xr-x | test.lua | 29 |
6 files changed, 97 insertions, 46 deletions
@@ -126,15 +126,23 @@ static void pushonenestedvalue (CapState *cs) { | |||
126 | 126 | ||
127 | 127 | ||
128 | /* | 128 | /* |
129 | ** Checks whether group 'grp' is visible to 'ref', that is, | 129 | ** Checks whether group 'grp' is visible to 'ref', that is, 'grp' is |
130 | ** 'grp' is not nested inside a capture that does not contain 'ref'. | 130 | ** not nested inside a full capture that does not contain 'ref'. (We |
131 | ** To do that, must find minimum capture that contains 'grp'. | 131 | ** only need to care for full captures because the search at 'findback' |
132 | ** skips open-end blocks; so, if 'grp' is nested in a non-full capture, | ||
133 | ** 'ref' is also inside it.) To check this, we search backward for the | ||
134 | ** inner full capture enclosing 'grp'. A full capture cannot contain | ||
135 | ** non-full captures, so a close capture means we cannot be inside a | ||
136 | ** full capture anymore. | ||
132 | */ | 137 | */ |
133 | static int capvisible (CapState *cs, Capture *grp, Capture *ref) { | 138 | static int capvisible (CapState *cs, Capture *grp, Capture *ref) { |
134 | Capture *cap = grp; | 139 | Capture *cap = grp; |
135 | while (cap-- > cs->ocap) { /* repeat until end of list */ | 140 | int i = MAXLOP; /* maximum distance for an 'open' */ |
141 | while (i-- > 0 && cap-- > cs->ocap) { | ||
136 | if (isclosecap(cap)) | 142 | if (isclosecap(cap)) |
137 | cap = findopen(cap); /* skip nested captures */ | 143 | return 1; /* can stop the search */ |
144 | else if (grp->index - cap->index >= UCHAR_MAX) | ||
145 | return 1; /* can stop the search */ | ||
138 | else if (capinside(cap, grp)) /* is 'grp' inside cap? */ | 146 | else if (capinside(cap, grp)) /* is 'grp' inside cap? */ |
139 | return capinside(cap, ref); /* ok iff cap also contains 'ref' */ | 147 | return capinside(cap, ref); /* ok iff cap also contains 'ref' */ |
140 | } | 148 | } |
@@ -150,10 +158,10 @@ static Capture *findback (CapState *cs, Capture *ref) { | |||
150 | lua_State *L = cs->L; | 158 | lua_State *L = cs->L; |
151 | Capture *cap = ref; | 159 | Capture *cap = ref; |
152 | while (cap-- > cs->ocap) { /* repeat until end of list */ | 160 | while (cap-- > cs->ocap) { /* repeat until end of list */ |
153 | if (isopencap(cap)) | 161 | if (isclosecap(cap)) |
154 | continue; /* enclosing captures are not visible to 'ref' */ | ||
155 | else if (isclosecap(cap)) | ||
156 | cap = findopen(cap); /* skip nested captures */ | 162 | cap = findopen(cap); /* skip nested captures */ |
163 | else if (capinside(cap, ref)) | ||
164 | continue; /* enclosing captures are not visible to 'ref' */ | ||
157 | if (captype(cap) == Cgroup && capvisible(cs, cap, ref)) { | 165 | if (captype(cap) == Cgroup && capvisible(cs, cap, ref)) { |
158 | getfromktable(cs, cap->idx); /* get group name */ | 166 | getfromktable(cs, cap->idx); /* get group name */ |
159 | if (lp_equal(L, -2, -1)) { /* right group? */ | 167 | if (lp_equal(L, -2, -1)) { /* right group? */ |
@@ -70,6 +70,12 @@ typedef struct CapState { | |||
70 | : (c2)->index < (c1)->index + (c1)->siz - 1) | 70 | : (c2)->index < (c1)->index + (c1)->siz - 1) |
71 | 71 | ||
72 | 72 | ||
73 | /** | ||
74 | ** Maximum number of captures to visit when looking for an 'open'. | ||
75 | */ | ||
76 | #define MAXLOP 20 | ||
77 | |||
78 | |||
73 | 79 | ||
74 | int runtimecap (CapState *cs, Capture *close, const char *s, int *rem); | 80 | int runtimecap (CapState *cs, Capture *close, const char *s, int *rem); |
75 | int getcaptures (lua_State *L, const char *s, const char *r, int ptop); | 81 | int getcaptures (lua_State *L, const char *s, const char *r, int ptop); |
@@ -608,17 +608,17 @@ The following table summarizes the basic captures: | |||
608 | <tr><td><a href="#cap-arg"><code>lpeg.Carg(n)</code></a></td> | 608 | <tr><td><a href="#cap-arg"><code>lpeg.Carg(n)</code></a></td> |
609 | <td>the value of the n<sup>th</sup> extra argument to | 609 | <td>the value of the n<sup>th</sup> extra argument to |
610 | <code>lpeg.match</code> (matches the empty string)</td></tr> | 610 | <code>lpeg.match</code> (matches the empty string)</td></tr> |
611 | <tr><td><a href="#cap-b"><code>lpeg.Cb(name)</code></a></td> | 611 | <tr><td><a href="#cap-b"><code>lpeg.Cb(key)</code></a></td> |
612 | <td>the values produced by the previous | 612 | <td>the values produced by the previous |
613 | group capture named <code>name</code> | 613 | group capture named <code>key</code> |
614 | (matches the empty string)</td></tr> | 614 | (matches the empty string)</td></tr> |
615 | <tr><td><a href="#cap-cc"><code>lpeg.Cc(values)</code></a></td> | 615 | <tr><td><a href="#cap-cc"><code>lpeg.Cc(values)</code></a></td> |
616 | <td>the given values (matches the empty string)</td></tr> | 616 | <td>the given values (matches the empty string)</td></tr> |
617 | <tr><td><a href="#cap-f"><code>lpeg.Cf(patt, func)</code></a></td> | 617 | <tr><td><a href="#cap-f"><code>lpeg.Cf(patt, func)</code></a></td> |
618 | <td>a <em>folding</em> of the captures from <code>patt</code></td></tr> | 618 | <td>a <em>folding</em> of the captures from <code>patt</code></td></tr> |
619 | <tr><td><a href="#cap-g"><code>lpeg.Cg(patt [, name])</code></a></td> | 619 | <tr><td><a href="#cap-g"><code>lpeg.Cg(patt [, key])</code></a></td> |
620 | <td>the values produced by <code>patt</code>, | 620 | <td>the values produced by <code>patt</code>, |
621 | optionally tagged with <code>name</code></td></tr> | 621 | optionally tagged with <code>key</code></td></tr> |
622 | <tr><td><a href="#cap-p"><code>lpeg.Cp()</code></a></td> | 622 | <tr><td><a href="#cap-p"><code>lpeg.Cp()</code></a></td> |
623 | <td>the current position (matches the empty string)</td></tr> | 623 | <td>the current position (matches the empty string)</td></tr> |
624 | <tr><td><a href="#cap-s"><code>lpeg.Cs(patt)</code></a></td> | 624 | <tr><td><a href="#cap-s"><code>lpeg.Cs(patt)</code></a></td> |
@@ -639,9 +639,10 @@ or no value when <code>number</code> is zero.</td></tr> | |||
639 | <td>the returns of <code>function</code> applied to the captures | 639 | <td>the returns of <code>function</code> applied to the captures |
640 | of <code>patt</code></td></tr> | 640 | of <code>patt</code></td></tr> |
641 | <tr><td><a href="#cap-rep"><code>patt % function</code></a></td> | 641 | <tr><td><a href="#cap-rep"><code>patt % function</code></a></td> |
642 | <td>the return of <code>function</code> applied to the previous | 642 | <td>produces no value; |
643 | capture plus the captures of <code>patt</code></td></tr>; | 643 | it <em>accummulates</em> the captures from <code>patt</code> |
644 | the returned value becomes the value of the previous capture | 644 | into the previous capture through <code>function</code> |
645 | </td></tr> | ||
645 | <tr><td><a href="#matchtime"><code>lpeg.Cmt(patt, function)</code></a></td> | 646 | <tr><td><a href="#matchtime"><code>lpeg.Cmt(patt, function)</code></a></td> |
646 | <td>the returns of <code>function</code> applied to the captures | 647 | <td>the returns of <code>function</code> applied to the captures |
647 | of <code>patt</code>; the application is done at match time</td></tr> | 648 | of <code>patt</code>; the application is done at match time</td></tr> |
@@ -699,24 +700,25 @@ argument given in the call to <code>lpeg.match</code>. | |||
699 | </p> | 700 | </p> |
700 | 701 | ||
701 | 702 | ||
702 | <h3><a name="cap-b"></a><code>lpeg.Cb (name)</code></h3> | 703 | <h3><a name="cap-b"></a><code>lpeg.Cb (key)</code></h3> |
703 | <p> | 704 | <p> |
704 | Creates a <em>back capture</em>. | 705 | Creates a <em>back capture</em>. |
705 | This pattern matches the empty string and | 706 | This pattern matches the empty string and |
706 | produces the values produced by the <em>most recent</em> | 707 | produces the values produced by the <em>most recent</em> |
707 | <a href="#cap-g">group capture</a> named <code>name</code> | 708 | <a href="#cap-g">group capture</a> named <code>key</code> |
708 | (where <code>name</code> can be any Lua value). | 709 | (where <code>key</code> can be any Lua value). |
709 | </p> | 710 | </p> |
710 | 711 | ||
711 | <p> | 712 | <p> |
712 | <em>Most recent</em> means the last | 713 | <em>Most recent</em> means the last |
713 | <em>complete</em> | 714 | <em>complete</em> |
714 | <em>outermost</em> | 715 | <em>outermost</em> |
715 | group capture with the given name. | 716 | group capture with the given key. |
716 | A <em>Complete</em> capture means that the entire pattern | 717 | A <em>Complete</em> capture means that the entire pattern |
717 | corresponding to the capture has matched. | 718 | corresponding to the capture has matched; |
719 | in other words, the back capture is not nested inside the group. | ||
718 | An <em>Outermost</em> capture means that the capture is not inside | 720 | An <em>Outermost</em> capture means that the capture is not inside |
719 | another complete capture. | 721 | another complete capture that does not contain the back capture itself. |
720 | </p> | 722 | </p> |
721 | 723 | ||
722 | <p> | 724 | <p> |
@@ -785,13 +787,13 @@ print(sum:match("10,30,43")) --> 83 | |||
785 | </pre> | 787 | </pre> |
786 | 788 | ||
787 | 789 | ||
788 | <h3><a name="cap-g"></a><code>lpeg.Cg (patt [, name])</code></h3> | 790 | <h3><a name="cap-g"></a><code>lpeg.Cg (patt [, key])</code></h3> |
789 | <p> | 791 | <p> |
790 | Creates a <em>group capture</em>. | 792 | Creates a <em>group capture</em>. |
791 | It groups all values returned by <code>patt</code> | 793 | It groups all values returned by <code>patt</code> |
792 | into a single capture. | 794 | into a single capture. |
793 | The group may be anonymous (if no name is given) | 795 | The group may be anonymous (if no key is given) |
794 | or named with the given name | 796 | or named with the given key |
795 | (which can be any non-nil Lua value). | 797 | (which can be any non-nil Lua value). |
796 | </p> | 798 | </p> |
797 | 799 | ||
@@ -837,7 +839,7 @@ starting at 1. | |||
837 | Moreover, | 839 | Moreover, |
838 | for each named capture group created by <code>patt</code>, | 840 | for each named capture group created by <code>patt</code>, |
839 | the first value of the group is put into the table | 841 | the first value of the group is put into the table |
840 | with the group name as its key. | 842 | with the group key as its key. |
841 | The captured value is only the table. | 843 | The captured value is only the table. |
842 | </p> | 844 | </p> |
843 | 845 | ||
@@ -897,12 +899,14 @@ there is no captured value. | |||
897 | <p> | 899 | <p> |
898 | Creates an <em>accumulator capture</em>. | 900 | Creates an <em>accumulator capture</em>. |
899 | This pattern behaves similarly to a | 901 | This pattern behaves similarly to a |
900 | <a href="cap-func">function capture</a>, | 902 | <a href="#cap-func">function capture</a>, |
901 | with the following differences: | 903 | with the following differences: |
902 | The last captured value is added as a first argument to | 904 | The last captured value is added as a first argument to |
903 | the call; | 905 | the call; |
904 | the return of the function is adjusted to one single value; | 906 | the return of the function is adjusted to one single value; |
905 | that value becomes the last captured value. | 907 | that value replaces the last captured value. |
908 | Note that the capture itself produces no values; | ||
909 | it only changes the value of its previous capture. | ||
906 | </p> | 910 | </p> |
907 | 911 | ||
908 | <p> | 912 | <p> |
@@ -929,6 +933,12 @@ changed to upper case; | |||
929 | that value then becomes the first and only | 933 | that value then becomes the first and only |
930 | capture value created by the match. | 934 | capture value created by the match. |
931 | </p> | 935 | </p> |
936 | |||
937 | <p> | ||
938 | As another example, | ||
939 | let us consider the problem of adding a list of numbers. | ||
940 | </p> | ||
941 | <pre class="example"> | ||
932 | -- matches a numeral and captures its numerical value | 942 | -- matches a numeral and captures its numerical value |
933 | number = lpeg.R"09"^1 / tonumber | 943 | number = lpeg.R"09"^1 / tonumber |
934 | 944 | ||
@@ -944,11 +954,11 @@ print(sum:match("10,30,43")) --> 83 | |||
944 | <p> | 954 | <p> |
945 | First, the initial <code>number</code> captures a number; | 955 | First, the initial <code>number</code> captures a number; |
946 | that first capture will play the role of an accumulator. | 956 | that first capture will play the role of an accumulator. |
947 | Then, each time <code>number</code> matches inside the loop | 957 | Then, each time the sequence <code>comma-number</code> |
948 | there is a accumulator capture: | 958 | matches inside the loop there is an accumulator capture: |
949 | It calls <code>add</code> with the current value of the accumulator | 959 | It calls <code>add</code> with the current value of the accumulator |
950 | and the value of the new number, | 960 | and the value of the new number, |
951 | and their sum replaces the value of the accumulator. | 961 | and the result of the call (their sum) replaces the value of the accumulator. |
952 | At the end of the match, | 962 | At the end of the match, |
953 | the accumulator with all sums is the final value. | 963 | the accumulator with all sums is the final value. |
954 | </p> | 964 | </p> |
@@ -956,9 +966,12 @@ the accumulator with all sums is the final value. | |||
956 | <p> | 966 | <p> |
957 | Due to the nature of this capture, | 967 | Due to the nature of this capture, |
958 | you should avoid using it in places where it is not clear | 968 | you should avoid using it in places where it is not clear |
959 | what is its "previous" capture. | 969 | what is its "previous" capture |
970 | (e.g., directly nested in a <a href="#cap-string">string capture</a> | ||
971 | or a <a href="#cap-num">numbered capture</a>). | ||
960 | Due to implementation details, | 972 | Due to implementation details, |
961 | you should not use this capture inside a substitution capture. | 973 | you should not use this capture directly nested in a |
974 | <a href="#cap-s">substitution capture</a>. | ||
962 | </p> | 975 | </p> |
963 | 976 | ||
964 | 977 | ||
@@ -1014,9 +1027,9 @@ local lpeg = require "lpeg" | |||
1014 | -- matches a word followed by end-of-string | 1027 | -- matches a word followed by end-of-string |
1015 | p = lpeg.R"az"^1 * -1 | 1028 | p = lpeg.R"az"^1 * -1 |
1016 | 1029 | ||
1017 | print(p:match("hello")) --> 6 | 1030 | print(p:match("hello")) --> 6 |
1018 | print(lpeg.match(p, "hello")) --> 6 | 1031 | print(lpeg.match(p, "hello")) --> 6 |
1019 | print(p:match("1 hello")) --> nil | 1032 | print(p:match("1 hello")) --> nil |
1020 | </pre> | 1033 | </pre> |
1021 | <p> | 1034 | <p> |
1022 | The pattern is simply a sequence of one or more lower-case letters | 1035 | The pattern is simply a sequence of one or more lower-case letters |
@@ -1043,7 +1056,7 @@ local name = lpeg.C(lpeg.alpha^1) * space | |||
1043 | local sep = lpeg.S(",;") * space | 1056 | local sep = lpeg.S(",;") * space |
1044 | local pair = name * "=" * space * name * sep^-1 | 1057 | local pair = name * "=" * space * name * sep^-1 |
1045 | local list = lpeg.Ct("") * (pair % rawset)^0 | 1058 | local list = lpeg.Ct("") * (pair % rawset)^0 |
1046 | t = list:match("a=b, c = hi; next = pi") --> { a = "b", c = "hi", next = "pi" } | 1059 | t = list:match("a=b, c = hi; next = pi") --> { a = "b", c = "hi", next = "pi" } |
1047 | </pre> | 1060 | </pre> |
1048 | <p> | 1061 | <p> |
1049 | Each pair has the format <code>name = name</code> followed by | 1062 | Each pair has the format <code>name = name</code> followed by |
@@ -1135,7 +1148,7 @@ function anywhere (p) | |||
1135 | return lpeg.P{ I * p * I + 1 * lpeg.V(1) } | 1148 | return lpeg.P{ I * p * I + 1 * lpeg.V(1) } |
1136 | end | 1149 | end |
1137 | 1150 | ||
1138 | print(anywhere("world"):match("hello world!")) -> 7 12 | 1151 | print(anywhere("world"):match("hello world!")) --> 7 12 |
1139 | </pre> | 1152 | </pre> |
1140 | 1153 | ||
1141 | <p> | 1154 | <p> |
@@ -1344,7 +1357,7 @@ function evalExp (s) | |||
1344 | end | 1357 | end |
1345 | 1358 | ||
1346 | -- small example | 1359 | -- small example |
1347 | print(evalExp"3 + 5*9 / (1+1) - 12") --> 13.5 | 1360 | print(evalExp"3 + 5*9 / (1+1) - 12") --> 13.5 |
1348 | </pre> | 1361 | </pre> |
1349 | 1362 | ||
1350 | <p> | 1363 | <p> |
@@ -1372,7 +1385,7 @@ G = lpeg.P{ "Exp", | |||
1372 | } | 1385 | } |
1373 | 1386 | ||
1374 | -- small example | 1387 | -- small example |
1375 | print(lpeg.match(G, "3 + 5*9 / (1+1) - 12")) --> 13.5 | 1388 | print(lpeg.match(G, "3 + 5*9 / (1+1) - 12")) --> 13.5 |
1376 | </pre> | 1389 | </pre> |
1377 | <p> | 1390 | <p> |
1378 | Note the use of the accumulator capture. | 1391 | Note the use of the accumulator capture. |
@@ -149,8 +149,8 @@ void printpatt (Instruction *p) { | |||
149 | 149 | ||
150 | static void printcap (Capture *cap, int ident) { | 150 | static void printcap (Capture *cap, int ident) { |
151 | while (ident--) printf(" "); | 151 | while (ident--) printf(" "); |
152 | printf("%s (idx: %d - size: %d) -> %lu\n", | 152 | printf("%s (idx: %d - size: %d) -> %lu (%p)\n", |
153 | capkind(cap->kind), cap->idx, cap->siz, (long)cap->index); | 153 | capkind(cap->kind), cap->idx, cap->siz, (long)cap->index, (void*)cap); |
154 | } | 154 | } |
155 | 155 | ||
156 | 156 | ||
@@ -198,11 +198,6 @@ static int removedyncap (lua_State *L, Capture *capture, | |||
198 | } | 198 | } |
199 | 199 | ||
200 | 200 | ||
201 | /** | ||
202 | ** Maximum number of captures to visit when looking for an 'open'. | ||
203 | */ | ||
204 | #define MAXLOP 20 | ||
205 | |||
206 | /* | 201 | /* |
207 | ** Find the corresponding 'open' capture before 'cap', when that capture | 202 | ** Find the corresponding 'open' capture before 'cap', when that capture |
208 | ** can become a full capture. If a full capture c1 is followed by an | 203 | ** can become a full capture. If a full capture c1 is followed by an |
@@ -1005,6 +1005,35 @@ p = m.Cg(m.C(1) * m.C(1), "k") * m.Ct(m.Cb("k")) | |||
1005 | t = p:match("ab") | 1005 | t = p:match("ab") |
1006 | checkeq(t, {"a", "b"}) | 1006 | checkeq(t, {"a", "b"}) |
1007 | 1007 | ||
1008 | |||
1009 | do | ||
1010 | -- some basic cases | ||
1011 | assert(m.match(m.Cg(m.Cc(3), "a") * m.Cb("a"), "a") == 3) | ||
1012 | assert(m.match(m.Cg(m.C(1), 133) * m.Cb(133), "X") == "X") | ||
1013 | |||
1014 | -- first reference to 'x' should not see the group enclosing it | ||
1015 | local p = m.Cg(m.Cb('x'), 'x') * m.Cb('x') | ||
1016 | checkerr("back reference 'x' not found", m.match, p, '') | ||
1017 | |||
1018 | local p = m.Cg(m.Cb('x') * m.C(1), 'x') * m.Cb('x') | ||
1019 | checkerr("back reference 'x' not found", m.match, p, 'abc') | ||
1020 | |||
1021 | -- reference to 'x' should not see the group enclosed in another capture | ||
1022 | local s = string.rep("a", 30) | ||
1023 | local p = (m.C(1)^-4 * m.Cg(m.C(1), 'x')) / {} * m.Cb('x') | ||
1024 | checkerr("back reference 'x' not found", m.match, p, s) | ||
1025 | |||
1026 | local p = (m.C(1)^-20 * m.Cg(m.C(1), 'x')) / {} * m.Cb('x') | ||
1027 | checkerr("back reference 'x' not found", m.match, p, s) | ||
1028 | |||
1029 | -- second reference 'k' should refer to 10 and first ref. 'k' | ||
1030 | p = m.Cg(m.Cc(20), 'k') * m.Cg(m.Cc(10) * m.Cb('k') * m.C(1), 'k') | ||
1031 | * (m.Cb('k') / function (a,b,c) return a*10 + b + tonumber(c) end) | ||
1032 | -- 10 * 10 (Cc) + 20 (Cb) + 7 (C) == 127 | ||
1033 | assert(p:match("756") == 127) | ||
1034 | |||
1035 | end | ||
1036 | |||
1008 | p = m.P(true) | 1037 | p = m.P(true) |
1009 | for i = 1, 10 do p = p * m.Cg(1, i) end | 1038 | for i = 1, 10 do p = p * m.Cg(1, i) end |
1010 | for i = 1, 10 do | 1039 | for i = 1, 10 do |