diff options
author | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2019-02-20 10:13:46 -0300 |
---|---|---|
committer | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2019-02-20 10:13:46 -0300 |
commit | e08e5df853560de6482d84066a7accc6a18de545 (patch) | |
tree | ee19686bb35da90709a32ed24bf7855de1a3946a /re.html | |
download | lpeg-e08e5df853560de6482d84066a7accc6a18de545.tar.gz lpeg-e08e5df853560de6482d84066a7accc6a18de545.tar.bz2 lpeg-e08e5df853560de6482d84066a7accc6a18de545.zip |
First version of LPeg on GIT
LPeg repository is being moved to git. Past versions won't be moved;
they are still available in RCS.
Diffstat (limited to 're.html')
-rw-r--r-- | re.html | 500 |
1 files changed, 500 insertions, 0 deletions
@@ -0,0 +1,500 @@ | |||
1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | ||
2 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | ||
3 | <html> | ||
4 | <head> | ||
5 | <title>LPeg.re - Regex syntax for LPEG</title> | ||
6 | <link rel="stylesheet" | ||
7 | href="http://www.inf.puc-rio.br/~roberto/lpeg/doc.css" | ||
8 | type="text/css"/> | ||
9 | <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/> | ||
10 | </head> | ||
11 | <body> | ||
12 | |||
13 | <!-- $Id: re.html,v 1.25 2018/06/04 16:21:19 roberto Exp $ --> | ||
14 | |||
15 | <div id="container"> | ||
16 | |||
17 | <div id="product"> | ||
18 | <div id="product_logo"> | ||
19 | <a href="http://www.inf.puc-rio.br/~roberto/lpeg/"> | ||
20 | <img alt="LPeg logo" src="lpeg-128.gif"/> | ||
21 | </a> | ||
22 | </div> | ||
23 | <div id="product_name"><big><strong>LPeg.re</strong></big></div> | ||
24 | <div id="product_description"> | ||
25 | Regex syntax for LPEG | ||
26 | </div> | ||
27 | </div> <!-- id="product" --> | ||
28 | |||
29 | <div id="main"> | ||
30 | |||
31 | <div id="navigation"> | ||
32 | <h1>re</h1> | ||
33 | |||
34 | <ul> | ||
35 | <li><a href="#basic">Basic Constructions</a></li> | ||
36 | <li><a href="#func">Functions</a></li> | ||
37 | <li><a href="#ex">Some Examples</a></li> | ||
38 | <li><a href="#license">License</a></li> | ||
39 | </ul> | ||
40 | |||
41 | |||
42 | </div> <!-- id="navigation" --> | ||
43 | |||
44 | <div id="content"> | ||
45 | |||
46 | <h2><a name="basic"></a>The <code>re</code> Module</h2> | ||
47 | |||
48 | <p> | ||
49 | The <code>re</code> module | ||
50 | (provided by file <code>re.lua</code> in the distribution) | ||
51 | supports a somewhat conventional regex syntax | ||
52 | for pattern usage within <a href="lpeg.html">LPeg</a>. | ||
53 | </p> | ||
54 | |||
55 | <p> | ||
56 | The next table summarizes <code>re</code>'s syntax. | ||
57 | A <code>p</code> represents an arbitrary pattern; | ||
58 | <code>num</code> represents a number (<code>[0-9]+</code>); | ||
59 | <code>name</code> represents an identifier | ||
60 | (<code>[a-zA-Z][a-zA-Z0-9_]*</code>). | ||
61 | Constructions are listed in order of decreasing precedence.</p> | ||
62 | <table border="1"> | ||
63 | <tbody><tr><td><b>Syntax</b></td><td><b>Description</b></td></tr> | ||
64 | <tr><td><code>( p )</code></td> <td>grouping</td></tr> | ||
65 | <tr><td><code>'string'</code></td> <td>literal string</td></tr> | ||
66 | <tr><td><code>"string"</code></td> <td>literal string</td></tr> | ||
67 | <tr><td><code>[class]</code></td> <td>character class</td></tr> | ||
68 | <tr><td><code>.</code></td> <td>any character</td></tr> | ||
69 | <tr><td><code>%name</code></td> | ||
70 | <td>pattern <code>defs[name]</code> or a pre-defined pattern</td></tr> | ||
71 | <tr><td><code>name</code></td><td>non terminal</td></tr> | ||
72 | <tr><td><code>&lt;name&gt;</code></td><td>non terminal</td></tr> | ||
73 | <tr><td><code>{}</code></td> <td>position capture</td></tr> | ||
74 | <tr><td><code>{ p }</code></td> <td>simple capture</td></tr> | ||
75 | <tr><td><code>{: p :}</code></td> <td>anonymous group capture</td></tr> | ||
76 | <tr><td><code>{:name: p :}</code></td> <td>named group capture</td></tr> | ||
77 | <tr><td><code>{~ p ~}</code></td> <td>substitution capture</td></tr> | ||
78 | <tr><td><code>{| p |}</code></td> <td>table capture</td></tr> | ||
79 | <tr><td><code>=name</code></td> <td>back reference | ||
80 | </td></tr> | ||
81 | <tr><td><code>p ?</code></td> <td>optional match</td></tr> | ||
82 | <tr><td><code>p *</code></td> <td>zero or more repetitions</td></tr> | ||
83 | <tr><td><code>p +</code></td> <td>one or more repetitions</td></tr> | ||
84 | <tr><td><code>p^num</code></td> <td>exactly <code>num</code> repetitions</td></tr> | ||
85 | <tr><td><code>p^+num</code></td> | ||
86 | <td>at least <code>num</code> repetitions</td></tr> | ||
87 | <tr><td><code>p^-num</code></td> | ||
88 | <td>at most <code>num</code> repetitions</td></tr> | ||
89 | <tr><td><code>p -> 'string'</code></td> <td>string capture</td></tr> | ||
90 | <tr><td><code>p -> "string"</code></td> <td>string capture</td></tr> | ||
91 | <tr><td><code>p -> num</code></td> <td>numbered capture</td></tr> | ||
92 | <tr><td><code>p -> name</code></td> <td>function/query/string capture | ||
93 | equivalent to <code>p / defs[name]</code></td></tr> | ||
94 | <tr><td><code>p => name</code></td> <td>match-time capture | ||
95 | equivalent to <code>lpeg.Cmt(p, defs[name])</code></td></tr> | ||
96 | <tr><td><code>p ~> name</code></td> <td>fold capture | ||
97 | equivalent to <code>lpeg.Cf(p, defs[name])</code></td></tr> | ||
98 | <tr><td><code>& p</code></td> <td>and predicate</td></tr> | ||
99 | <tr><td><code>! p</code></td> <td>not predicate</td></tr> | ||
100 | <tr><td><code>p1 p2</code></td> <td>concatenation</td></tr> | ||
101 | <tr><td><code>p1 / p2</code></td> <td>ordered choice</td></tr> | ||
102 | <tr><td>(<code>name <- p</code>)<sup>+</sup></td> <td>grammar</td></tr> | ||
103 | </tbody></table> | ||
104 | <p> | ||
105 | Any space appearing in a syntax description can be | ||
106 | replaced by zero or more space characters and Lua-style comments | ||
107 | (<code>--</code> until end of line). | ||
108 | </p> | ||
109 | |||
110 | <p> | ||
111 | Character classes define sets of characters. | ||
112 | An initial <code>^</code> complements the resulting set. | ||
113 | A range <em>x</em><code>-</code><em>y</em> includes in the set | ||
114 | all characters with codes between the codes of <em>x</em> and <em>y</em>. | ||
115 | A pre-defined class <code>%</code><em>name</em> includes all | ||
116 | characters of that class. | ||
117 | A simple character includes itself in the set. | ||
118 | The only special characters inside a class are <code>^</code> | ||
119 | (special only if it is the first character); | ||
120 | <code>]</code> | ||
121 | (can be included in the set as the first character, | ||
122 | after the optional <code>^</code>); | ||
123 | <code>%</code> (special only if followed by a letter); | ||
124 | and <code>-</code> | ||
125 | (can be included in the set as the first or the last character). | ||
126 | </p> | ||
127 | |||
128 | <p> | ||
129 | Currently the pre-defined classes are similar to those from | ||
130 | Lua's string library | ||
131 | (<code>%a</code> for letters, | ||
132 | <code>%A</code> for non letters, etc.). | ||
133 | There is also a class <code>%nl</code> | ||
134 | containing only the newline character, | ||
135 | which is particularly handy for grammars written inside long strings, | ||
136 | as long strings do not interpret escape sequences like <code>\n</code>. | ||
137 | </p> | ||
138 | |||
139 | |||
140 | <h2><a name="func">Functions</a></h2> | ||
141 | |||
142 | <h3><code>re.compile (string [, defs])</code></h3> | ||
143 | <p> | ||
144 | Compiles the given string and | ||
145 | returns an equivalent LPeg pattern. | ||
146 | The given string may define either an expression or a grammar. | ||
147 | The optional <code>defs</code> table provides extra Lua values | ||
148 | to be used by the pattern. | ||
149 | </p> | ||
150 | |||
151 | <h3><code>re.find (subject, pattern [, init])</code></h3> | ||
152 | <p> | ||
153 | Searches the given pattern in the given subject. | ||
154 | If it finds a match, | ||
155 | returns the index where this occurrence starts and | ||
156 | the index where it ends. | ||
157 | Otherwise, returns nil. | ||
158 | </p> | ||
159 | |||
160 | <p> | ||
161 | An optional numeric argument <code>init</code> makes the search | ||
162 | start at that position in the subject string. | ||
163 | As usual in Lua libraries, | ||
164 | a negative value counts from the end. | ||
165 | </p> | ||
166 | |||
167 | <h3><code>re.gsub (subject, pattern, replacement)</code></h3> | ||
168 | <p> | ||
169 | Does a <em>global substitution</em>, | ||
170 | replacing all occurrences of <code>pattern</code> | ||
171 | in the given <code>subject</code> by <code>replacement</code>. | ||
172 | </p> | ||
173 | <h3><code>re.match (subject, pattern)</code></h3> | ||
174 | <p> | ||
175 | Matches the given pattern against the given subject, | ||
176 | returning all captures. | ||
177 | </p> | ||
178 | |||
179 | <h3><code>re.updatelocale ()</code></h3> | ||
180 | <p> | ||
181 | Updates the pre-defined character classes to the current locale. | ||
182 | </p> | ||
183 | |||
184 | |||
185 | <h2><a name="ex">Some Examples</a></h2> | ||
186 | |||
187 | <h3>A complete simple program</h3> | ||
188 | <p> | ||
189 | The next code shows a simple complete Lua program using | ||
190 | the <code>re</code> module: | ||
191 | </p> | ||
192 | <pre class="example"> | ||
193 | local re = require"re" | ||
194 | |||
195 | -- find the position of the first numeral in a string | ||
196 | print(re.find("the number 423 is odd", "[0-9]+")) --> 12 14 | ||
197 | |||
198 | -- returns all words in a string | ||
199 | print(re.match("the number 423 is odd", "({%a+} / .)*")) | ||
200 | --> the number is odd | ||
201 | |||
202 | -- returns the first numeral in a string | ||
203 | print(re.match("the number 423 is odd", "s <- {%d+} / . s")) | ||
204 | --> 423 | ||
205 | |||
206 | print(re.gsub("hello World", "[aeiou]", ".")) | ||
207 | --> h.ll. W.rld | ||
208 | </pre> | ||
209 | |||
210 | |||
211 | <h3>Balanced parentheses</h3> | ||
212 | <p> | ||
213 | The following call will produce the same pattern produced by the | ||
214 | Lua expression in the | ||
215 | <a href="lpeg.html#balanced">balanced parentheses</a> example: | ||
216 | </p> | ||
217 | <pre class="example"> | ||
218 | b = re.compile[[ balanced <- "(" ([^()] / balanced)* ")" ]] | ||
219 | </pre> | ||
220 | |||
221 | <h3>String reversal</h3> | ||
222 | <p> | ||
223 | The next example reverses a string: | ||
224 | </p> | ||
225 | <pre class="example"> | ||
226 | rev = re.compile[[ R <- (!.) -> '' / ({.} R) -> '%2%1']] | ||
227 | print(rev:match"0123456789") --> 9876543210 | ||
228 | </pre> | ||
229 | |||
230 | <h3>CSV decoder</h3> | ||
231 | <p> | ||
232 | The next example replicates the <a href="lpeg.html#CSV">CSV decoder</a>: | ||
233 | </p> | ||
234 | <pre class="example"> | ||
235 | record = re.compile[[ | ||
236 | record <- {| field (',' field)* |} (%nl / !.) | ||
237 | field <- escaped / nonescaped | ||
238 | nonescaped <- { [^,"%nl]* } | ||
239 | escaped <- '"' {~ ([^"] / '""' -> '"')* ~} '"' | ||
240 | ]] | ||
241 | </pre> | ||
242 | |||
243 | <h3>Lua's long strings</h3> | ||
244 | <p> | ||
245 | The next example matches Lua long strings: | ||
246 | </p> | ||
247 | <pre class="example"> | ||
248 | c = re.compile([[ | ||
249 | longstring <- ('[' {:eq: '='* :} '[' close) | ||
250 | close <- ']' =eq ']' / . close | ||
251 | ]]) | ||
252 | |||
253 | print(c:match'[==[]]===]]]]==]===[]') --> 17 | ||
254 | </pre> | ||
255 | |||
256 | <h3>Abstract Syntax Trees</h3> | ||
257 | <p> | ||
258 | This example shows a simple way to build an | ||
259 | abstract syntax tree (AST) for a given grammar. | ||
260 | To keep our example simple, | ||
261 | let us consider the following grammar | ||
262 | for lists of names: | ||
263 | </p> | ||
264 | <pre class="example"> | ||
265 | p = re.compile[[ | ||
266 | listname <- (name s)* | ||
267 | name <- [a-z][a-z]* | ||
268 | s <- %s* | ||
269 | ]] | ||
270 | </pre> | ||
271 | <p> | ||
272 | Now, we will add captures to build a corresponding AST. | ||
273 | As a first step, the pattern will build a table to | ||
274 | represent each non terminal; | ||
275 | terminals will be represented by their corresponding strings: | ||
276 | </p> | ||
277 | <pre class="example"> | ||
278 | c = re.compile[[ | ||
279 | listname <- {| (name s)* |} | ||
280 | name <- {| {[a-z][a-z]*} |} | ||
281 | s <- %s* | ||
282 | ]] | ||
283 | </pre> | ||
284 | <p> | ||
285 | Now, a match against <code>"hi hello bye"</code> | ||
286 | results in the table | ||
287 | <code>{{"hi"}, {"hello"}, {"bye"}}</code>. | ||
288 | </p> | ||
289 | <p> | ||
290 | For such a simple grammar, | ||
291 | this AST is more than enough; | ||
292 | actually, the tables around each single name | ||
293 | are already overkill. | ||
294 | More complex grammars, | ||
295 | however, may need some more structure. | ||
296 | Specifically, | ||
297 | it would be useful if each table had | ||
298 | a <code>tag</code> field telling what non terminal | ||
299 | that table represents. | ||
300 | We can add such a tag using | ||
301 | <a href="lpeg.html#cap-g">named group captures</a>: | ||
302 | </p> | ||
303 | <pre class="example"> | ||
304 | x = re.compile[[ | ||
305 | listname <- {| {:tag: '' -> 'list':} (name s)* |} | ||
306 | name <- {| {:tag: '' -> 'id':} {[a-z][a-z]*} |} | ||
307 | s <- ' '* | ||
308 | ]] | ||
309 | </pre> | ||
310 | <p> | ||
311 | With these group captures, | ||
312 | a match against <code>"hi hello bye"</code> | ||
313 | results in the following table: | ||
314 | </p> | ||
315 | <pre class="example"> | ||
316 | {tag="list", | ||
317 | {tag="id", "hi"}, | ||
318 | {tag="id", "hello"}, | ||
319 | {tag="id", "bye"} | ||
320 | } | ||
321 | </pre> | ||
322 | |||
323 | |||
324 | <h3>Indented blocks</h3> | ||
325 | <p> | ||
326 | This example breaks indented blocks into tables, | ||
327 | respecting the indentation: | ||
328 | </p> | ||
329 | <pre class="example"> | ||
330 | p = re.compile[[ | ||
331 | block <- {| {:ident:' '*:} line | ||
332 | ((=ident !' ' line) / &(=ident ' ') block)* |} | ||
333 | line <- {[^%nl]*} %nl | ||
334 | ]] | ||
335 | </pre> | ||
336 | <p> | ||
337 | As an example, | ||
338 | consider the following text: | ||
339 | </p> | ||
340 | <pre class="example"> | ||
341 | t = p:match[[ | ||
342 | first line | ||
343 | subline 1 | ||
344 | subline 2 | ||
345 | second line | ||
346 | third line | ||
347 | subline 3.1 | ||
348 | subline 3.1.1 | ||
349 | subline 3.2 | ||
350 | ]] | ||
351 | </pre> | ||
352 | <p> | ||
353 | The resulting table <code>t</code> will be like this: | ||
354 | </p> | ||
355 | <pre class="example"> | ||
356 | {'first line'; {'subline 1'; 'subline 2'; ident = ' '}; | ||
357 | 'second line'; | ||
358 | 'third line'; { 'subline 3.1'; {'subline 3.1.1'; ident = ' '}; | ||
359 | 'subline 3.2'; ident = ' '}; | ||
360 | ident = ''} | ||
361 | </pre> | ||
362 | |||
363 | <h3>Macro expander</h3> | ||
364 | <p> | ||
365 | This example implements a simple macro expander. | ||
366 | Macros must be defined as part of the pattern, | ||
367 | following some simple rules: | ||
368 | </p> | ||
369 | <pre class="example"> | ||
370 | p = re.compile[[ | ||
371 | text <- {~ item* ~} | ||
372 | item <- macro / [^()] / '(' item* ')' | ||
373 | arg <- ' '* {~ (!',' item)* ~} | ||
374 | args <- '(' arg (',' arg)* ')' | ||
375 | -- now we define some macros | ||
376 | macro <- ('apply' args) -> '%1(%2)' | ||
377 | / ('add' args) -> '%1 + %2' | ||
378 | / ('mul' args) -> '%1 * %2' | ||
379 | ]] | ||
380 | |||
381 | print(p:match"add(mul(a,b), apply(f,x))") --> a * b + f(x) | ||
382 | </pre> | ||
383 | <p> | ||
384 | A <code>text</code> is a sequence of items, | ||
385 | wherein we apply a substitution capture to expand any macros. | ||
386 | An <code>item</code> is either a macro, | ||
387 | any character different from parentheses, | ||
388 | or a parenthesized expression. | ||
389 | A macro argument (<code>arg</code>) is a sequence | ||
390 | of items different from a comma. | ||
391 | (Note that a comma may appear inside an item, | ||
392 | e.g., inside a parenthesized expression.) | ||
393 | Again we do a substitution capture to expand any macro | ||
394 | in the argument before expanding the outer macro. | ||
395 | <code>args</code> is a list of arguments separated by commas. | ||
396 | Finally we define the macros. | ||
397 | Each macro is a string substitution; | ||
398 | it replaces the macro name and its arguments by its corresponding string, | ||
399 | with each <code>%</code><em>n</em> replaced by the <em>n</em>-th argument. | ||
400 | </p> | ||
401 | |||
402 | <h3>Patterns</h3> | ||
403 | <p> | ||
404 | This example shows the complete syntax | ||
405 | of patterns accepted by <code>re</code>. | ||
406 | </p> | ||
407 | <pre class="example"> | ||
408 | p = [=[ | ||
409 | |||
410 | pattern <- exp !. | ||
411 | exp <- S (grammar / alternative) | ||
412 | |||
413 | alternative <- seq ('/' S seq)* | ||
414 | seq <- prefix* | ||
415 | prefix <- '&' S prefix / '!' S prefix / suffix | ||
416 | suffix <- primary S (([+*?] | ||
417 | / '^' [+-]? num | ||
418 | / '->' S (string / '{}' / name) | ||
419 | / '=>' S name) S)* | ||
420 | |||
421 | primary <- '(' exp ')' / string / class / defined | ||
422 | / '{:' (name ':')? exp ':}' | ||
423 | / '=' name | ||
424 | / '{}' | ||
425 | / '{~' exp '~}' | ||
426 | / '{' exp '}' | ||
427 | / '.' | ||
428 | / name S !arrow | ||
429 | / '<' name '>' -- old-style non terminals | ||
430 | |||
431 | grammar <- definition+ | ||
432 | definition <- name S arrow exp | ||
433 | |||
434 | class <- '[' '^'? item (!']' item)* ']' | ||
435 | item <- defined / range / . | ||
436 | range <- . '-' [^]] | ||
437 | |||
438 | S <- (%s / '--' [^%nl]*)* -- spaces and comments | ||
439 | name <- [A-Za-z][A-Za-z0-9_]* | ||
440 | arrow <- '<-' | ||
441 | num <- [0-9]+ | ||
442 | string <- '"' [^"]* '"' / "'" [^']* "'" | ||
443 | defined <- '%' name | ||
444 | |||
445 | ]=] | ||
446 | |||
447 | print(re.match(p, p)) -- a self description must match itself | ||
448 | </pre> | ||
449 | |||
450 | |||
451 | |||
452 | <h2><a name="license">License</a></h2> | ||
453 | |||
454 | <p> | ||
455 | Copyright © 2008-2015 Lua.org, PUC-Rio. | ||
456 | </p> | ||
457 | <p> | ||
458 | Permission is hereby granted, free of charge, | ||
459 | to any person obtaining a copy of this software and | ||
460 | associated documentation files (the "Software"), | ||
461 | to deal in the Software without restriction, | ||
462 | including without limitation the rights to use, | ||
463 | copy, modify, merge, publish, distribute, sublicense, | ||
464 | and/or sell copies of the Software, | ||
465 | and to permit persons to whom the Software is | ||
466 | furnished to do so, | ||
467 | subject to the following conditions: | ||
468 | </p> | ||
469 | |||
470 | <p> | ||
471 | The above copyright notice and this permission notice | ||
472 | shall be included in all copies or substantial portions of the Software. | ||
473 | </p> | ||
474 | |||
475 | <p> | ||
476 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
477 | EXPRESS OR IMPLIED, | ||
478 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
479 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | ||
480 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, | ||
481 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | ||
482 | TORT OR OTHERWISE, ARISING FROM, | ||
483 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
484 | THE SOFTWARE. | ||
485 | </p> | ||
486 | |||
487 | </div> <!-- id="content" --> | ||
488 | |||
489 | </div> <!-- id="main" --> | ||
490 | |||
491 | <div id="about"> | ||
492 | <p><small> | ||
493 | $Id: re.html,v 1.25 2018/06/04 16:21:19 roberto Exp $ | ||
494 | </small></p> | ||
495 | </div> <!-- id="about" --> | ||
496 | |||
497 | </div> <!-- id="container" --> | ||
498 | |||
499 | </body> | ||
500 | </html> | ||