diff options
author | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2023-06-22 10:51:31 -0300 |
---|---|---|
committer | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2023-06-22 10:51:31 -0300 |
commit | 7d43b367e7a89369c1302124677a305aa0d070c7 (patch) | |
tree | cc0a05cc02a417b9107fa68a9506f7afef667dcb | |
parent | 4eb4419163dd6c97665b9481e9581ff32496b392 (diff) | |
download | lpeg-7d43b367e7a89369c1302124677a305aa0d070c7.tar.gz lpeg-7d43b367e7a89369c1302124677a305aa0d070c7.tar.bz2 lpeg-7d43b367e7a89369c1302124677a305aa0d070c7.zip |
Improved documentation for accumulator captures
-rw-r--r-- | lpcap.c | 2 | ||||
-rw-r--r-- | lpeg.html | 94 | ||||
-rw-r--r-- | re.html | 38 |
3 files changed, 75 insertions, 59 deletions
@@ -477,7 +477,7 @@ static int addonestring (luaL_Buffer *b, CapState *cs, const char *what) { | |||
477 | substcap(b, cs); /* add capture directly to buffer */ | 477 | substcap(b, cs); /* add capture directly to buffer */ |
478 | return 1; | 478 | return 1; |
479 | case Cacc: /* accumulator capture? */ | 479 | case Cacc: /* accumulator capture? */ |
480 | return luaL_error(cs->L, "accumulator capture inside substitution capture"); | 480 | return luaL_error(cs->L, "invalid context for an accumulator capture"); |
481 | default: { | 481 | default: { |
482 | lua_State *L = cs->L; | 482 | lua_State *L = cs->L; |
483 | int n = pushcapture(cs); | 483 | int n = pushcapture(cs); |
@@ -901,8 +901,8 @@ Creates an <em>accumulator capture</em>. | |||
901 | This pattern behaves similarly to a | 901 | This pattern behaves similarly to a |
902 | <a href="#cap-func">function capture</a>, | 902 | <a href="#cap-func">function capture</a>, |
903 | with the following differences: | 903 | with the following differences: |
904 | The last captured value is added as a first argument to | 904 | The last captured value before <code>patt</code> |
905 | the call; | 905 | is added as a first argument to the call; |
906 | the return of the function is adjusted to one single value; | 906 | the return of the function is adjusted to one single value; |
907 | that value replaces the last captured value. | 907 | that value replaces the last captured value. |
908 | Note that the capture itself produces no values; | 908 | Note that the capture itself produces no values; |
@@ -911,31 +911,6 @@ it only changes the value of its previous capture. | |||
911 | 911 | ||
912 | <p> | 912 | <p> |
913 | As an example, | 913 | As an example, |
914 | consider the following code fragment: | ||
915 | </p> | ||
916 | <pre class="example"> | ||
917 | local name = lpeg.C(lpeg.R("az")^1) | ||
918 | local p = name * (lpeg.P("^") % string.upper)^-1 | ||
919 | print(p:match("count")) --> count | ||
920 | print(p:match("count^")) --> COUNT | ||
921 | </pre> | ||
922 | <p> | ||
923 | In the first match, | ||
924 | the accumulator capture does not match, | ||
925 | and so the match results in its first capture, a name. | ||
926 | In the second match, | ||
927 | the accumulator capture matches, | ||
928 | so the function <code>string.upper</code> | ||
929 | is called with the previous capture (created by <code>name</code>) | ||
930 | plus the string <code>"^"</code>; | ||
931 | the function ignores its second argument and returns the first argument | ||
932 | changed to upper case; | ||
933 | that value then becomes the first and only | ||
934 | capture value created by the match. | ||
935 | </p> | ||
936 | |||
937 | <p> | ||
938 | As another example, | ||
939 | let us consider the problem of adding a list of numbers. | 914 | let us consider the problem of adding a list of numbers. |
940 | </p> | 915 | </p> |
941 | <pre class="example"> | 916 | <pre class="example"> |
@@ -956,22 +931,56 @@ First, the initial <code>number</code> captures a number; | |||
956 | that first capture will play the role of an accumulator. | 931 | that first capture will play the role of an accumulator. |
957 | Then, each time the sequence <code>comma-number</code> | 932 | Then, each time the sequence <code>comma-number</code> |
958 | matches inside the loop there is an accumulator capture: | 933 | matches inside the loop there is an accumulator capture: |
959 | It calls <code>add</code> with the current value of the accumulator | 934 | It calls <code>add</code> with the current value of the |
960 | and the value of the new number, | 935 | accumulator—which is the last captured value, created by the |
961 | and the result of the call (their sum) replaces the value of the accumulator. | 936 | first <code>number</code>—and the value of the new number, |
937 | and the result of the call (the sum of the two numbers) | ||
938 | replaces the value of the accumulator. | ||
962 | At the end of the match, | 939 | At the end of the match, |
963 | the accumulator with all sums is the final value. | 940 | the accumulator with all sums is the final value. |
964 | </p> | 941 | </p> |
965 | 942 | ||
966 | <p> | 943 | <p> |
944 | As another example, | ||
945 | consider the following code fragment: | ||
946 | </p> | ||
947 | <pre class="example"> | ||
948 | local name = lpeg.C(lpeg.R("az")^1) | ||
949 | local p = name * (lpeg.P("^") % string.upper)^-1 | ||
950 | print(p:match("count")) --> count | ||
951 | print(p:match("count^")) --> COUNT | ||
952 | </pre> | ||
953 | <p> | ||
954 | In the match against <code>"count"</code>, | ||
955 | as there is no <code>"^"</code>, | ||
956 | the optional accumulator capture does not match; | ||
957 | so, the match results in its sole capture, a name. | ||
958 | In the match against <code>"count^"</code>, | ||
959 | the accumulator capture matches, | ||
960 | so the function <code>string.upper</code> | ||
961 | is called with the previous captured value (created by <code>name</code>) | ||
962 | plus the string <code>"^"</code>; | ||
963 | the function ignores its second argument and returns the first argument | ||
964 | changed to upper case; | ||
965 | that value then becomes the first and only | ||
966 | capture value created by the match. | ||
967 | </p> | ||
968 | |||
969 | <p> | ||
967 | Due to the nature of this capture, | 970 | Due to the nature of this capture, |
968 | you should avoid using it in places where it is not clear | 971 | you should avoid using it in places where it is not clear |
969 | what is its "previous" capture | 972 | what is the "previous" capture, |
970 | (e.g., directly nested in a <a href="#cap-string">string capture</a> | 973 | such as directly nested in a <a href="#cap-string">string capture</a> |
971 | or a <a href="#cap-num">numbered capture</a>). | 974 | or a <a href="#cap-num">numbered capture</a>. |
972 | Due to implementation details, | 975 | (Note that these captures may not need to evaluate |
976 | all their subcaptures to compute their results.) | ||
977 | Moreover, due to implementation details, | ||
973 | you should not use this capture directly nested in a | 978 | you should not use this capture directly nested in a |
974 | <a href="#cap-s">substitution capture</a>. | 979 | <a href="#cap-s">substitution capture</a>. |
980 | A simple and effective way to avoid these issues is | ||
981 | to enclose the whole accumulation composition | ||
982 | (including the capture that generates the initial value) | ||
983 | in an anonymous <a href="#cap-g">group capture</a>. | ||
975 | </p> | 984 | </p> |
976 | 985 | ||
977 | 986 | ||
@@ -1056,7 +1065,8 @@ local name = lpeg.C(lpeg.alpha^1) * space | |||
1056 | local sep = lpeg.S(",;") * space | 1065 | local sep = lpeg.S(",;") * space |
1057 | local pair = name * "=" * space * name * sep^-1 | 1066 | local pair = name * "=" * space * name * sep^-1 |
1058 | local list = lpeg.Ct("") * (pair % rawset)^0 | 1067 | local list = lpeg.Ct("") * (pair % rawset)^0 |
1059 | t = list:match("a=b, c = hi; next = pi") --> { a = "b", c = "hi", next = "pi" } | 1068 | t = list:match("a=b, c = hi; next = pi") |
1069 | --> { a = "b", c = "hi", next = "pi" } | ||
1060 | </pre> | 1070 | </pre> |
1061 | <p> | 1071 | <p> |
1062 | Each pair has the format <code>name = name</code> followed by | 1072 | Each pair has the format <code>name = name</code> followed by |
@@ -1098,7 +1108,7 @@ by <code>sep</code>. | |||
1098 | If the split results in too many values, | 1108 | If the split results in too many values, |
1099 | it may overflow the maximum number of values | 1109 | it may overflow the maximum number of values |
1100 | that can be returned by a Lua function. | 1110 | that can be returned by a Lua function. |
1101 | In this case, | 1111 | To avoid this problem, |
1102 | we can collect these values in a table: | 1112 | we can collect these values in a table: |
1103 | </p> | 1113 | </p> |
1104 | <pre class="example"> | 1114 | <pre class="example"> |
@@ -1134,7 +1144,7 @@ end | |||
1134 | </pre> | 1144 | </pre> |
1135 | <p> | 1145 | <p> |
1136 | This grammar has a straight reading: | 1146 | This grammar has a straight reading: |
1137 | it matches <code>p</code> or skips one character and tries again. | 1147 | its sole rule matches <code>p</code> or skips one character and tries again. |
1138 | </p> | 1148 | </p> |
1139 | 1149 | ||
1140 | <p> | 1150 | <p> |
@@ -1143,9 +1153,9 @@ If we want to know where the pattern is in the string | |||
1143 | we can add position captures to the pattern: | 1153 | we can add position captures to the pattern: |
1144 | </p> | 1154 | </p> |
1145 | <pre class="example"> | 1155 | <pre class="example"> |
1146 | local I = lpeg.Cp() | 1156 | local Cp = lpeg.Cp() |
1147 | function anywhere (p) | 1157 | function anywhere (p) |
1148 | return lpeg.P{ I * p * I + 1 * lpeg.V(1) } | 1158 | return lpeg.P{ Cp * p * Cp + 1 * lpeg.V(1) } |
1149 | end | 1159 | end |
1150 | 1160 | ||
1151 | print(anywhere("world"):match("hello world!")) --> 7 12 | 1161 | print(anywhere("world"):match("hello world!")) --> 7 12 |
@@ -1155,15 +1165,15 @@ print(anywhere("world"):match("hello world!")) --> 7 12 | |||
1155 | Another option for the search is like this: | 1165 | Another option for the search is like this: |
1156 | </p> | 1166 | </p> |
1157 | <pre class="example"> | 1167 | <pre class="example"> |
1158 | local I = lpeg.Cp() | 1168 | local Cp = lpeg.Cp() |
1159 | function anywhere (p) | 1169 | function anywhere (p) |
1160 | return (1 - lpeg.P(p))^0 * I * p * I | 1170 | return (1 - lpeg.P(p))^0 * Cp * p * Cp |
1161 | end | 1171 | end |
1162 | </pre> | 1172 | </pre> |
1163 | <p> | 1173 | <p> |
1164 | Again the pattern has a straight reading: | 1174 | Again the pattern has a straight reading: |
1165 | it skips as many characters as possible while not matching <code>p</code>, | 1175 | it skips as many characters as possible while not matching <code>p</code>, |
1166 | and then matches <code>p</code> (plus appropriate captures). | 1176 | and then matches <code>p</code> plus appropriate captures. |
1167 | </p> | 1177 | </p> |
1168 | 1178 | ||
1169 | <p> | 1179 | <p> |
@@ -61,6 +61,20 @@ Constructions are listed in order of decreasing precedence. | |||
61 | <table border="1"> | 61 | <table border="1"> |
62 | <tbody><tr><td><b>Syntax</b></td><td><b>Description</b></td></tr> | 62 | <tbody><tr><td><b>Syntax</b></td><td><b>Description</b></td></tr> |
63 | <tr><td><code>( p )</code></td> <td>grouping</td></tr> | 63 | <tr><td><code>( p )</code></td> <td>grouping</td></tr> |
64 | <tr><td><code>& p</code></td> <td>and predicate</td></tr> | ||
65 | <tr><td><code>! p</code></td> <td>not predicate</td></tr> | ||
66 | <tr><td><code>p1 p2</code></td> <td>concatenation</td></tr> | ||
67 | <tr><td><code>p1 / p2</code></td> <td>ordered choice</td></tr> | ||
68 | <tr><td><code>p ?</code></td> <td>optional match</td></tr> | ||
69 | <tr><td><code>p *</code></td> <td>zero or more repetitions</td></tr> | ||
70 | <tr><td><code>p +</code></td> <td>one or more repetitions</td></tr> | ||
71 | <tr><td><code>p^num</code></td> | ||
72 | <td>exactly <code>num</code> repetitions</td></tr> | ||
73 | <tr><td><code>p^+num</code></td> | ||
74 | <td>at least <code>num</code> repetitions</td></tr> | ||
75 | <tr><td><code>p^-num</code></td> | ||
76 | <td>at most <code>num</code> repetitions</td></tr> | ||
77 | <tr><td>(<code>name <- p</code>)<sup>+</sup></td> <td>grammar</td></tr> | ||
64 | <tr><td><code>'string'</code></td> <td>literal string</td></tr> | 78 | <tr><td><code>'string'</code></td> <td>literal string</td></tr> |
65 | <tr><td><code>"string"</code></td> <td>literal string</td></tr> | 79 | <tr><td><code>"string"</code></td> <td>literal string</td></tr> |
66 | <tr><td><code>[class]</code></td> <td>character class</td></tr> | 80 | <tr><td><code>[class]</code></td> <td>character class</td></tr> |
@@ -69,22 +83,15 @@ Constructions are listed in order of decreasing precedence. | |||
69 | <td>pattern <code>defs[name]</code> or a pre-defined pattern</td></tr> | 83 | <td>pattern <code>defs[name]</code> or a pre-defined pattern</td></tr> |
70 | <tr><td><code>name</code></td><td>non terminal</td></tr> | 84 | <tr><td><code>name</code></td><td>non terminal</td></tr> |
71 | <tr><td><code><name></code></td><td>non terminal</td></tr> | 85 | <tr><td><code><name></code></td><td>non terminal</td></tr> |
86 | |||
72 | <tr><td><code>{}</code></td> <td>position capture</td></tr> | 87 | <tr><td><code>{}</code></td> <td>position capture</td></tr> |
73 | <tr><td><code>{ p }</code></td> <td>simple capture</td></tr> | 88 | <tr><td><code>{ p }</code></td> <td>simple capture</td></tr> |
74 | <tr><td><code>{: p :}</code></td> <td>anonymous group capture</td></tr> | 89 | <tr><td><code>{: p :}</code></td> <td>anonymous group capture</td></tr> |
75 | <tr><td><code>{:name: p :}</code></td> <td>named group capture</td></tr> | 90 | <tr><td><code>{:name: p :}</code></td> <td>named group capture</td></tr> |
76 | <tr><td><code>{~ p ~}</code></td> <td>substitution capture</td></tr> | 91 | <tr><td><code>{~ p ~}</code></td> <td>substitution capture</td></tr> |
77 | <tr><td><code>{| p |}</code></td> <td>table capture</td></tr> | 92 | <tr><td><code>{| p |}</code></td> <td>table capture</td></tr> |
78 | <tr><td><code>=name</code></td> <td>back reference | 93 | <tr><td><code>=name</code></td> <td>back reference</td></tr> |
79 | </td></tr> | 94 | |
80 | <tr><td><code>p ?</code></td> <td>optional match</td></tr> | ||
81 | <tr><td><code>p *</code></td> <td>zero or more repetitions</td></tr> | ||
82 | <tr><td><code>p +</code></td> <td>one or more repetitions</td></tr> | ||
83 | <tr><td><code>p^num</code></td> <td>exactly <code>n</code> repetitions</td></tr> | ||
84 | <tr><td><code>p^+num</code></td> | ||
85 | <td>at least <code>n</code> repetitions</td></tr> | ||
86 | <tr><td><code>p^-num</code></td> | ||
87 | <td>at most <code>n</code> repetitions</td></tr> | ||
88 | <tr><td><code>p -> 'string'</code></td> <td>string capture</td></tr> | 95 | <tr><td><code>p -> 'string'</code></td> <td>string capture</td></tr> |
89 | <tr><td><code>p -> "string"</code></td> <td>string capture</td></tr> | 96 | <tr><td><code>p -> "string"</code></td> <td>string capture</td></tr> |
90 | <tr><td><code>p -> num</code></td> <td>numbered capture</td></tr> | 97 | <tr><td><code>p -> num</code></td> <td>numbered capture</td></tr> |
@@ -94,11 +101,8 @@ equivalent to <code>p / defs[name]</code></td></tr> | |||
94 | equivalent to <code>lpeg.Cmt(p, defs[name])</code></td></tr> | 101 | equivalent to <code>lpeg.Cmt(p, defs[name])</code></td></tr> |
95 | <tr><td><code>p ~> name</code></td> <td>fold capture | 102 | <tr><td><code>p ~> name</code></td> <td>fold capture |
96 | equivalent to <code>lpeg.Cf(p, defs[name])</code></td></tr> | 103 | equivalent to <code>lpeg.Cf(p, defs[name])</code></td></tr> |
97 | <tr><td><code>& p</code></td> <td>and predicate</td></tr> | 104 | <tr><td><code>p >> name</code></td> <td>accumulator capture |
98 | <tr><td><code>! p</code></td> <td>not predicate</td></tr> | 105 | equivalent to <code>(p % defs[name])</code></td></tr> |
99 | <tr><td><code>p1 p2</code></td> <td>concatenation</td></tr> | ||
100 | <tr><td><code>p1 / p2</code></td> <td>ordered choice</td></tr> | ||
101 | <tr><td>(<code>name <- p</code>)<sup>+</sup></td> <td>grammar</td></tr> | ||
102 | </tbody></table> | 106 | </tbody></table> |
103 | <p> | 107 | <p> |
104 | Any space appearing in a syntax description can be | 108 | Any space appearing in a syntax description can be |
@@ -199,9 +203,10 @@ print(re.match("the number 423 is odd", "({%a+} / .)*")) | |||
199 | --> the number is odd | 203 | --> the number is odd |
200 | 204 | ||
201 | -- returns the first numeral in a string | 205 | -- returns the first numeral in a string |
202 | print(re.match("the number 423 is odd", "s <- {%d+} / . s")) | 206 | print(re.match("the number 423 is odd", "s <- {%d+} / . s")) |
203 | --> 423 | 207 | --> 423 |
204 | 208 | ||
209 | -- substitutes a dot for each vowel in a string | ||
205 | print(re.gsub("hello World", "[aeiou]", ".")) | 210 | print(re.gsub("hello World", "[aeiou]", ".")) |
206 | --> h.ll. W.rld | 211 | --> h.ll. W.rld |
207 | </pre> | 212 | </pre> |
@@ -415,6 +420,7 @@ prefix <- '&' S prefix / '!' S prefix / suffix | |||
415 | suffix <- primary S (([+*?] | 420 | suffix <- primary S (([+*?] |
416 | / '^' [+-]? num | 421 | / '^' [+-]? num |
417 | / '->' S (string / '{}' / name) | 422 | / '->' S (string / '{}' / name) |
423 | / '>>' S name | ||
418 | / '=>' S name) S)* | 424 | / '=>' S name) S)* |
419 | 425 | ||
420 | primary <- '(' exp ')' / string / class / defined | 426 | primary <- '(' exp ')' / string / class / defined |