diff options
| author | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2023-06-22 10:51:31 -0300 |
|---|---|---|
| committer | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2023-06-22 10:51:31 -0300 |
| commit | 7d43b367e7a89369c1302124677a305aa0d070c7 (patch) | |
| tree | cc0a05cc02a417b9107fa68a9506f7afef667dcb | |
| parent | 4eb4419163dd6c97665b9481e9581ff32496b392 (diff) | |
| download | lpeg-7d43b367e7a89369c1302124677a305aa0d070c7.tar.gz lpeg-7d43b367e7a89369c1302124677a305aa0d070c7.tar.bz2 lpeg-7d43b367e7a89369c1302124677a305aa0d070c7.zip | |
Improved documentation for accumulator captures
| -rw-r--r-- | lpcap.c | 2 | ||||
| -rw-r--r-- | lpeg.html | 94 | ||||
| -rw-r--r-- | re.html | 38 |
3 files changed, 75 insertions, 59 deletions
| @@ -477,7 +477,7 @@ static int addonestring (luaL_Buffer *b, CapState *cs, const char *what) { | |||
| 477 | substcap(b, cs); /* add capture directly to buffer */ | 477 | substcap(b, cs); /* add capture directly to buffer */ |
| 478 | return 1; | 478 | return 1; |
| 479 | case Cacc: /* accumulator capture? */ | 479 | case Cacc: /* accumulator capture? */ |
| 480 | return luaL_error(cs->L, "accumulator capture inside substitution capture"); | 480 | return luaL_error(cs->L, "invalid context for an accumulator capture"); |
| 481 | default: { | 481 | default: { |
| 482 | lua_State *L = cs->L; | 482 | lua_State *L = cs->L; |
| 483 | int n = pushcapture(cs); | 483 | int n = pushcapture(cs); |
| @@ -901,8 +901,8 @@ Creates an <em>accumulator capture</em>. | |||
| 901 | This pattern behaves similarly to a | 901 | This pattern behaves similarly to a |
| 902 | <a href="#cap-func">function capture</a>, | 902 | <a href="#cap-func">function capture</a>, |
| 903 | with the following differences: | 903 | with the following differences: |
| 904 | The last captured value is added as a first argument to | 904 | The last captured value before <code>patt</code> |
| 905 | the call; | 905 | is added as a first argument to the call; |
| 906 | the return of the function is adjusted to one single value; | 906 | the return of the function is adjusted to one single value; |
| 907 | that value replaces the last captured value. | 907 | that value replaces the last captured value. |
| 908 | Note that the capture itself produces no values; | 908 | Note that the capture itself produces no values; |
| @@ -911,31 +911,6 @@ it only changes the value of its previous capture. | |||
| 911 | 911 | ||
| 912 | <p> | 912 | <p> |
| 913 | As an example, | 913 | As an example, |
| 914 | consider the following code fragment: | ||
| 915 | </p> | ||
| 916 | <pre class="example"> | ||
| 917 | local name = lpeg.C(lpeg.R("az")^1) | ||
| 918 | local p = name * (lpeg.P("^") % string.upper)^-1 | ||
| 919 | print(p:match("count")) --> count | ||
| 920 | print(p:match("count^")) --> COUNT | ||
| 921 | </pre> | ||
| 922 | <p> | ||
| 923 | In the first match, | ||
| 924 | the accumulator capture does not match, | ||
| 925 | and so the match results in its first capture, a name. | ||
| 926 | In the second match, | ||
| 927 | the accumulator capture matches, | ||
| 928 | so the function <code>string.upper</code> | ||
| 929 | is called with the previous capture (created by <code>name</code>) | ||
| 930 | plus the string <code>"^"</code>; | ||
| 931 | the function ignores its second argument and returns the first argument | ||
| 932 | changed to upper case; | ||
| 933 | that value then becomes the first and only | ||
| 934 | capture value created by the match. | ||
| 935 | </p> | ||
| 936 | |||
| 937 | <p> | ||
| 938 | As another example, | ||
| 939 | let us consider the problem of adding a list of numbers. | 914 | let us consider the problem of adding a list of numbers. |
| 940 | </p> | 915 | </p> |
| 941 | <pre class="example"> | 916 | <pre class="example"> |
| @@ -956,22 +931,56 @@ First, the initial <code>number</code> captures a number; | |||
| 956 | that first capture will play the role of an accumulator. | 931 | that first capture will play the role of an accumulator. |
| 957 | Then, each time the sequence <code>comma-number</code> | 932 | Then, each time the sequence <code>comma-number</code> |
| 958 | matches inside the loop there is an accumulator capture: | 933 | matches inside the loop there is an accumulator capture: |
| 959 | It calls <code>add</code> with the current value of the accumulator | 934 | It calls <code>add</code> with the current value of the |
| 960 | and the value of the new number, | 935 | accumulator—which is the last captured value, created by the |
| 961 | and the result of the call (their sum) replaces the value of the accumulator. | 936 | first <code>number</code>— and the value of the new number, |
| 937 | and the result of the call (the sum of the two numbers) | ||
| 938 | replaces the value of the accumulator. | ||
| 962 | At the end of the match, | 939 | At the end of the match, |
| 963 | the accumulator with all sums is the final value. | 940 | the accumulator with all sums is the final value. |
| 964 | </p> | 941 | </p> |
| 965 | 942 | ||
| 966 | <p> | 943 | <p> |
| 944 | As another example, | ||
| 945 | consider the following code fragment: | ||
| 946 | </p> | ||
| 947 | <pre class="example"> | ||
| 948 | local name = lpeg.C(lpeg.R("az")^1) | ||
| 949 | local p = name * (lpeg.P("^") % string.upper)^-1 | ||
| 950 | print(p:match("count")) --> count | ||
| 951 | print(p:match("count^")) --> COUNT | ||
| 952 | </pre> | ||
| 953 | <p> | ||
| 954 | In the match against <code>"count"</code>, | ||
| 955 | as there is no <code>"^"</code>, | ||
| 956 | the optional accumulator capture does not match; | ||
| 957 | so, the match results in its sole capture, a name. | ||
| 958 | In the match against <code>"count^"</code>, | ||
| 959 | the accumulator capture matches, | ||
| 960 | so the function <code>string.upper</code> | ||
| 961 | is called with the previous captured value (created by <code>name</code>) | ||
| 962 | plus the string <code>"^"</code>; | ||
| 963 | the function ignores its second argument and returns the first argument | ||
| 964 | changed to upper case; | ||
| 965 | that value then becomes the first and only | ||
| 966 | capture value created by the match. | ||
| 967 | </p> | ||
| 968 | |||
| 969 | <p> | ||
| 967 | Due to the nature of this capture, | 970 | Due to the nature of this capture, |
| 968 | you should avoid using it in places where it is not clear | 971 | you should avoid using it in places where it is not clear |
| 969 | what is its "previous" capture | 972 | what is the "previous" capture, |
| 970 | (e.g., directly nested in a <a href="#cap-string">string capture</a> | 973 | such as directly nested in a <a href="#cap-string">string capture</a> |
| 971 | or a <a href="#cap-num">numbered capture</a>). | 974 | or a <a href="#cap-num">numbered capture</a>. |
| 972 | Due to implementation details, | 975 | (Note that these captures may not need to evaluate |
| 976 | all their subcaptures to compute their results.) | ||
| 977 | Moreover, due to implementation details, | ||
| 973 | you should not use this capture directly nested in a | 978 | you should not use this capture directly nested in a |
| 974 | <a href="#cap-s">substitution capture</a>. | 979 | <a href="#cap-s">substitution capture</a>. |
| 980 | A simple and effective way to avoid these issues is | ||
| 981 | to enclose the whole accumulation composition | ||
| 982 | (including the capture that generates the initial value) | ||
| 983 | into an anonymous <a href="#cap-g">group capture</a>. | ||
| 975 | </p> | 984 | </p> |
| 976 | 985 | ||
| 977 | 986 | ||
| @@ -1056,7 +1065,8 @@ local name = lpeg.C(lpeg.alpha^1) * space | |||
| 1056 | local sep = lpeg.S(",;") * space | 1065 | local sep = lpeg.S(",;") * space |
| 1057 | local pair = name * "=" * space * name * sep^-1 | 1066 | local pair = name * "=" * space * name * sep^-1 |
| 1058 | local list = lpeg.Ct("") * (pair % rawset)^0 | 1067 | local list = lpeg.Ct("") * (pair % rawset)^0 |
| 1059 | t = list:match("a=b, c = hi; next = pi") --> { a = "b", c = "hi", next = "pi" } | 1068 | t = list:match("a=b, c = hi; next = pi") |
| 1069 | --> { a = "b", c = "hi", next = "pi" } | ||
| 1060 | </pre> | 1070 | </pre> |
| 1061 | <p> | 1071 | <p> |
| 1062 | Each pair has the format <code>name = name</code> followed by | 1072 | Each pair has the format <code>name = name</code> followed by |
| @@ -1098,7 +1108,7 @@ by <code>sep</code>. | |||
| 1098 | If the split results in too many values, | 1108 | If the split results in too many values, |
| 1099 | it may overflow the maximum number of values | 1109 | it may overflow the maximum number of values |
| 1100 | that can be returned by a Lua function. | 1110 | that can be returned by a Lua function. |
| 1101 | In this case, | 1111 | To avoid this problem, |
| 1102 | we can collect these values in a table: | 1112 | we can collect these values in a table: |
| 1103 | </p> | 1113 | </p> |
| 1104 | <pre class="example"> | 1114 | <pre class="example"> |
| @@ -1134,7 +1144,7 @@ end | |||
| 1134 | </pre> | 1144 | </pre> |
| 1135 | <p> | 1145 | <p> |
| 1136 | This grammar has a straight reading: | 1146 | This grammar has a straight reading: |
| 1137 | it matches <code>p</code> or skips one character and tries again. | 1147 | its sole rule matches <code>p</code> or skips one character and tries again. |
| 1138 | </p> | 1148 | </p> |
| 1139 | 1149 | ||
| 1140 | <p> | 1150 | <p> |
| @@ -1143,9 +1153,9 @@ If we want to know where the pattern is in the string | |||
| 1143 | we can add position captures to the pattern: | 1153 | we can add position captures to the pattern: |
| 1144 | </p> | 1154 | </p> |
| 1145 | <pre class="example"> | 1155 | <pre class="example"> |
| 1146 | local I = lpeg.Cp() | 1156 | local Cp = lpeg.Cp() |
| 1147 | function anywhere (p) | 1157 | function anywhere (p) |
| 1148 | return lpeg.P{ I * p * I + 1 * lpeg.V(1) } | 1158 | return lpeg.P{ Cp * p * Cp + 1 * lpeg.V(1) } |
| 1149 | end | 1159 | end |
| 1150 | 1160 | ||
| 1151 | print(anywhere("world"):match("hello world!")) --> 7 12 | 1161 | print(anywhere("world"):match("hello world!")) --> 7 12 |
| @@ -1155,15 +1165,15 @@ print(anywhere("world"):match("hello world!")) --> 7 12 | |||
| 1155 | Another option for the search is like this: | 1165 | Another option for the search is like this: |
| 1156 | </p> | 1166 | </p> |
| 1157 | <pre class="example"> | 1167 | <pre class="example"> |
| 1158 | local I = lpeg.Cp() | 1168 | local Cp = lpeg.Cp() |
| 1159 | function anywhere (p) | 1169 | function anywhere (p) |
| 1160 | return (1 - lpeg.P(p))^0 * I * p * I | 1170 | return (1 - lpeg.P(p))^0 * Cp * p * Cp |
| 1161 | end | 1171 | end |
| 1162 | </pre> | 1172 | </pre> |
| 1163 | <p> | 1173 | <p> |
| 1164 | Again the pattern has a straight reading: | 1174 | Again the pattern has a straight reading: |
| 1165 | it skips as many characters as possible while not matching <code>p</code>, | 1175 | it skips as many characters as possible while not matching <code>p</code>, |
| 1166 | and then matches <code>p</code> (plus appropriate captures). | 1176 | and then matches <code>p</code> plus appropriate captures. |
| 1167 | </p> | 1177 | </p> |
| 1168 | 1178 | ||
| 1169 | <p> | 1179 | <p> |
| @@ -61,6 +61,20 @@ Constructions are listed in order of decreasing precedence. | |||
| 61 | <table border="1"> | 61 | <table border="1"> |
| 62 | <tbody><tr><td><b>Syntax</b></td><td><b>Description</b></td></tr> | 62 | <tbody><tr><td><b>Syntax</b></td><td><b>Description</b></td></tr> |
| 63 | <tr><td><code>( p )</code></td> <td>grouping</td></tr> | 63 | <tr><td><code>( p )</code></td> <td>grouping</td></tr> |
| 64 | <tr><td><code>& p</code></td> <td>and predicate</td></tr> | ||
| 65 | <tr><td><code>! p</code></td> <td>not predicate</td></tr> | ||
| 66 | <tr><td><code>p1 p2</code></td> <td>concatenation</td></tr> | ||
| 67 | <tr><td><code>p1 / p2</code></td> <td>ordered choice</td></tr> | ||
| 68 | <tr><td><code>p ?</code></td> <td>optional match</td></tr> | ||
| 69 | <tr><td><code>p *</code></td> <td>zero or more repetitions</td></tr> | ||
| 70 | <tr><td><code>p +</code></td> <td>one or more repetitions</td></tr> | ||
| 71 | <tr><td><code>p^num</code></td> | ||
| 72 | <td>exactly <code>num</code> repetitions</td></tr> | ||
| 73 | <tr><td><code>p^+num</code></td> | ||
| 74 | <td>at least <code>num</code> repetitions</td></tr> | ||
| 75 | <tr><td><code>p^-num</code></td> | ||
| 76 | <td>at most <code>num</code> repetitions</td></tr> | ||
| 77 | <tr><td>(<code>name <- p</code>)<sup>+</sup></td> <td>grammar</td></tr> | ||
| 64 | <tr><td><code>'string'</code></td> <td>literal string</td></tr> | 78 | <tr><td><code>'string'</code></td> <td>literal string</td></tr> |
| 65 | <tr><td><code>"string"</code></td> <td>literal string</td></tr> | 79 | <tr><td><code>"string"</code></td> <td>literal string</td></tr> |
| 66 | <tr><td><code>[class]</code></td> <td>character class</td></tr> | 80 | <tr><td><code>[class]</code></td> <td>character class</td></tr> |
| @@ -69,22 +83,15 @@ Constructions are listed in order of decreasing precedence. | |||
| 69 | <td>pattern <code>defs[name]</code> or a pre-defined pattern</td></tr> | 83 | <td>pattern <code>defs[name]</code> or a pre-defined pattern</td></tr> |
| 70 | <tr><td><code>name</code></td><td>non terminal</td></tr> | 84 | <tr><td><code>name</code></td><td>non terminal</td></tr> |
| 71 | <tr><td><code>&lt;name&gt;</code></td><td>non terminal</td></tr> | 85 | <tr><td><code>&lt;name&gt;</code></td><td>non terminal</td></tr> |
| 86 | |||
| 72 | <tr><td><code>{}</code></td> <td>position capture</td></tr> | 87 | <tr><td><code>{}</code></td> <td>position capture</td></tr> |
| 73 | <tr><td><code>{ p }</code></td> <td>simple capture</td></tr> | 88 | <tr><td><code>{ p }</code></td> <td>simple capture</td></tr> |
| 74 | <tr><td><code>{: p :}</code></td> <td>anonymous group capture</td></tr> | 89 | <tr><td><code>{: p :}</code></td> <td>anonymous group capture</td></tr> |
| 75 | <tr><td><code>{:name: p :}</code></td> <td>named group capture</td></tr> | 90 | <tr><td><code>{:name: p :}</code></td> <td>named group capture</td></tr> |
| 76 | <tr><td><code>{~ p ~}</code></td> <td>substitution capture</td></tr> | 91 | <tr><td><code>{~ p ~}</code></td> <td>substitution capture</td></tr> |
| 77 | <tr><td><code>{| p |}</code></td> <td>table capture</td></tr> | 92 | <tr><td><code>{| p |}</code></td> <td>table capture</td></tr> |
| 78 | <tr><td><code>=name</code></td> <td>back reference | 93 | <tr><td><code>=name</code></td> <td>back reference</td></tr> |
| 79 | </td></tr> | 94 | |
| 80 | <tr><td><code>p ?</code></td> <td>optional match</td></tr> | ||
| 81 | <tr><td><code>p *</code></td> <td>zero or more repetitions</td></tr> | ||
| 82 | <tr><td><code>p +</code></td> <td>one or more repetitions</td></tr> | ||
| 83 | <tr><td><code>p^num</code></td> <td>exactly <code>n</code> repetitions</td></tr> | ||
| 84 | <tr><td><code>p^+num</code></td> | ||
| 85 | <td>at least <code>n</code> repetitions</td></tr> | ||
| 86 | <tr><td><code>p^-num</code></td> | ||
| 87 | <td>at most <code>n</code> repetitions</td></tr> | ||
| 88 | <tr><td><code>p -> 'string'</code></td> <td>string capture</td></tr> | 95 | <tr><td><code>p -> 'string'</code></td> <td>string capture</td></tr> |
| 89 | <tr><td><code>p -> "string"</code></td> <td>string capture</td></tr> | 96 | <tr><td><code>p -> "string"</code></td> <td>string capture</td></tr> |
| 90 | <tr><td><code>p -> num</code></td> <td>numbered capture</td></tr> | 97 | <tr><td><code>p -> num</code></td> <td>numbered capture</td></tr> |
| @@ -94,11 +101,8 @@ equivalent to <code>p / defs[name]</code></td></tr> | |||
| 94 | equivalent to <code>lpeg.Cmt(p, defs[name])</code></td></tr> | 101 | equivalent to <code>lpeg.Cmt(p, defs[name])</code></td></tr> |
| 95 | <tr><td><code>p ~> name</code></td> <td>fold capture | 102 | <tr><td><code>p ~> name</code></td> <td>fold capture |
| 96 | equivalent to <code>lpeg.Cf(p, defs[name])</code></td></tr> | 103 | equivalent to <code>lpeg.Cf(p, defs[name])</code></td></tr> |
| 97 | <tr><td><code>& p</code></td> <td>and predicate</td></tr> | 104 | <tr><td><code>p >> name</code></td> <td>accumulator capture |
| 98 | <tr><td><code>! p</code></td> <td>not predicate</td></tr> | 105 | equivalent to <code>(p % defs[name])</code></td></tr> |
| 99 | <tr><td><code>p1 p2</code></td> <td>concatenation</td></tr> | ||
| 100 | <tr><td><code>p1 / p2</code></td> <td>ordered choice</td></tr> | ||
| 101 | <tr><td>(<code>name <- p</code>)<sup>+</sup></td> <td>grammar</td></tr> | ||
| 102 | </tbody></table> | 106 | </tbody></table> |
| 103 | <p> | 107 | <p> |
| 104 | Any space appearing in a syntax description can be | 108 | Any space appearing in a syntax description can be |
| @@ -199,9 +203,10 @@ print(re.match("the number 423 is odd", "({%a+} / .)*")) | |||
| 199 | --> the number is odd | 203 | --> the number is odd |
| 200 | 204 | ||
| 201 | -- returns the first numeral in a string | 205 | -- returns the first numeral in a string |
| 202 | print(re.match("the number 423 is odd", "s <- {%d+} / . s")) | 206 | print(re.match("the number 423 is odd", "s <- {%d+} / . s")) |
| 203 | --> 423 | 207 | --> 423 |
| 204 | 208 | ||
| 209 | -- substitutes a dot for each vowel in a string | ||
| 205 | print(re.gsub("hello World", "[aeiou]", ".")) | 210 | print(re.gsub("hello World", "[aeiou]", ".")) |
| 206 | --> h.ll. W.rld | 211 | --> h.ll. W.rld |
| 207 | </pre> | 212 | </pre> |
| @@ -415,6 +420,7 @@ prefix <- '&' S prefix / '!' S prefix / suffix | |||
| 415 | suffix <- primary S (([+*?] | 420 | suffix <- primary S (([+*?] |
| 416 | / '^' [+-]? num | 421 | / '^' [+-]? num |
| 417 | / '->' S (string / '{}' / name) | 422 | / '->' S (string / '{}' / name) |
| 423 | / '>>' S name | ||
| 418 | / '=>' S name) S)* | 424 | / '=>' S name) S)* |
| 419 | 425 | ||
| 420 | primary <- '(' exp ')' / string / class / defined | 426 | primary <- '(' exp ')' / string / class / defined |
