diff options
| -rw-r--r-- | HISTORY | 8 | ||||
| -rw-r--r-- | lpcap.c | 4 | ||||
| -rw-r--r-- | lpcap.h | 3 | ||||
| -rw-r--r-- | lpcode.c | 4 | ||||
| -rw-r--r-- | lpcode.h | 3 | ||||
| -rw-r--r-- | lpeg.html | 92 | ||||
| -rw-r--r-- | lpprint.c | 4 | ||||
| -rw-r--r-- | lpprint.h | 4 | ||||
| -rw-r--r-- | lptree.c | 4 | ||||
| -rw-r--r-- | lptree.h | 3 | ||||
| -rw-r--r-- | lptypes.h | 3 | ||||
| -rw-r--r-- | lpvm.c | 4 | ||||
| -rw-r--r-- | lpvm.h | 3 | ||||
| -rw-r--r-- | re.html | 9 | ||||
| -rw-r--r-- | re.lua | 1 |
15 files changed, 15 insertions, 134 deletions
| @@ -1,4 +1,10 @@ | |||
| 1 | HISTORY for LPeg 1.0.2 | 1 | HISTORY for LPeg 1.1.0 |
| 2 | |||
| 3 | * Changes from version 1.0.2 to 1.1.0 | ||
| 4 | --------------------------------- | ||
| 5 | + UTF-8 ranges | ||
| 6 | + Larger limit for number of rules in a grammar | ||
| 7 | + bug fixes | ||
| 2 | 8 | ||
| 3 | * Changes from version 1.0.1 to 1.0.2 | 9 | * Changes from version 1.0.1 to 1.0.2 |
| 4 | --------------------------------- | 10 | --------------------------------- |
| @@ -1,7 +1,3 @@ | |||
| 1 | /* | ||
| 2 | ** $Id: lpcap.c $ | ||
| 3 | ** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license) | ||
| 4 | */ | ||
| 5 | 1 | ||
| 6 | #include "lua.h" | 2 | #include "lua.h" |
| 7 | #include "lauxlib.h" | 3 | #include "lauxlib.h" |
| @@ -1,6 +1,3 @@ | |||
| 1 | /* | ||
| 2 | ** $Id: lpcap.h $ | ||
| 3 | */ | ||
| 4 | 1 | ||
| 5 | #if !defined(lpcap_h) | 2 | #if !defined(lpcap_h) |
| 6 | #define lpcap_h | 3 | #define lpcap_h |
| @@ -1,7 +1,3 @@ | |||
| 1 | /* | ||
| 2 | ** $Id: lpcode.c $ | ||
| 3 | ** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license) | ||
| 4 | */ | ||
| 5 | 1 | ||
| 6 | #include <limits.h> | 2 | #include <limits.h> |
| 7 | 3 | ||
| @@ -1,6 +1,3 @@ | |||
| 1 | /* | ||
| 2 | ** $Id: lpcode.h $ | ||
| 3 | */ | ||
| 4 | 1 | ||
| 5 | #if !defined(lpcode_h) | 2 | #if !defined(lpcode_h) |
| 6 | #define lpcode_h | 3 | #define lpcode_h |
| @@ -10,7 +10,6 @@ | |||
| 10 | </head> | 10 | </head> |
| 11 | <body> | 11 | <body> |
| 12 | 12 | ||
| 13 | <!-- $Id: lpeg.html $ --> | ||
| 14 | 13 | ||
| 15 | <div id="container"> | 14 | <div id="container"> |
| 16 | 15 | ||
| @@ -664,10 +663,10 @@ LPeg does not specify when (and if) it evaluates its captures. | |||
| 664 | consider the pattern <code>lpeg.P"a" / func / 0</code>. | 663 | consider the pattern <code>lpeg.P"a" / func / 0</code>. |
| 665 | Because the "division" by 0 instructs LPeg to throw away the | 664 | Because the "division" by 0 instructs LPeg to throw away the |
| 666 | results from the pattern, | 665 | results from the pattern, |
| 667 | LPeg may or may not call <code>func</code>.) | 666 | it is not specified whether LPeg will call <code>func</code>.) |
| 668 | Therefore, captures should avoid side effects. | 667 | Therefore, captures should avoid side effects. |
| 669 | Moreover, | 668 | Moreover, |
| 670 | most captures cannot affect the way a pattern matches a subject. | 669 | captures cannot affect the way a pattern matches a subject. |
| 671 | The only exception to this rule is the | 670 | The only exception to this rule is the |
| 672 | so-called <a href="#matchtime"><em>match-time capture</em></a>. | 671 | so-called <a href="#matchtime"><em>match-time capture</em></a>. |
| 673 | When a match-time capture matches, | 672 | When a match-time capture matches, |
| @@ -1175,91 +1174,6 @@ local record = lpeg.Ct(field * (',' * field)^0) * (lpeg.P'\n' + -1) | |||
| 1175 | </pre> | 1174 | </pre> |
| 1176 | 1175 | ||
| 1177 | 1176 | ||
| 1178 | <h3>UTF-8 and Latin 1</h3> | ||
| 1179 | <p> | ||
| 1180 | It is not difficult to use LPeg to convert a string from | ||
| 1181 | UTF-8 encoding to Latin 1 (ISO 8859-1): | ||
| 1182 | </p> | ||
| 1183 | |||
| 1184 | <pre class="example"> | ||
| 1185 | -- convert a two-byte UTF-8 sequence to a Latin 1 character | ||
| 1186 | local function f2 (s) | ||
| 1187 | local c1, c2 = string.byte(s, 1, 2) | ||
| 1188 | return string.char(c1 * 64 + c2 - 12416) | ||
| 1189 | end | ||
| 1190 | |||
| 1191 | local utf8 = lpeg.R("\0\127") | ||
| 1192 | + lpeg.R("\194\195") * lpeg.R("\128\191") / f2 | ||
| 1193 | |||
| 1194 | local decode_pattern = lpeg.Cs(utf8^0) * -1 | ||
| 1195 | </pre> | ||
| 1196 | <p> | ||
| 1197 | In this code, | ||
| 1198 | the definition of UTF-8 is already restricted to the | ||
| 1199 | Latin 1 range (from 0 to 255). | ||
| 1200 | Any encoding outside this range (as well as any invalid encoding) | ||
| 1201 | will not match that pattern. | ||
| 1202 | </p> | ||
| 1203 | |||
| 1204 | <p> | ||
| 1205 | As the definition of <code>decode_pattern</code> demands that | ||
| 1206 | the pattern matches the whole input (because of the -1 at its end), | ||
| 1207 | any invalid string will simply fail to match, | ||
| 1208 | without any useful information about the problem. | ||
| 1209 | We can improve this situation by redefining <code>decode_pattern</code> | ||
| 1210 | as follows: | ||
| 1211 | </p> | ||
| 1212 | <pre class="example"> | ||
| 1213 | local function er (_, i) error("invalid encoding at position " .. i) end | ||
| 1214 | |||
| 1215 | local decode_pattern = lpeg.Cs(utf8^0) * (-1 + lpeg.P(er)) | ||
| 1216 | </pre> | ||
| 1217 | <p> | ||
| 1218 | Now, if the pattern <code>utf8^0</code> stops | ||
| 1219 | before the end of the string, | ||
| 1220 | an appropriate error function is called. | ||
| 1221 | </p> | ||
| 1222 | |||
| 1223 | |||
| 1224 | <h3>UTF-8 and Unicode</h3> | ||
| 1225 | <p> | ||
| 1226 | We can extend the previous patterns to handle all Unicode code points. | ||
| 1227 | Of course, | ||
| 1228 | we cannot translate them to Latin 1 or any other one-byte encoding. | ||
| 1229 | Instead, our translation results in an array with the code points | ||
| 1230 | represented as numbers. | ||
| 1231 | The full code is here: | ||
| 1232 | </p> | ||
| 1233 | <pre class="example"> | ||
| 1234 | -- decode a two-byte UTF-8 sequence | ||
| 1235 | local function f2 (s) | ||
| 1236 | local c1, c2 = string.byte(s, 1, 2) | ||
| 1237 | return c1 * 64 + c2 - 12416 | ||
| 1238 | end | ||
| 1239 | |||
| 1240 | -- decode a three-byte UTF-8 sequence | ||
| 1241 | local function f3 (s) | ||
| 1242 | local c1, c2, c3 = string.byte(s, 1, 3) | ||
| 1243 | return (c1 * 64 + c2) * 64 + c3 - 925824 | ||
| 1244 | end | ||
| 1245 | |||
| 1246 | -- decode a four-byte UTF-8 sequence | ||
| 1247 | local function f4 (s) | ||
| 1248 | local c1, c2, c3, c4 = string.byte(s, 1, 4) | ||
| 1249 | return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168 | ||
| 1250 | end | ||
| 1251 | |||
| 1252 | local cont = lpeg.R("\128\191") -- continuation byte | ||
| 1253 | |||
| 1254 | local utf8 = lpeg.R("\0\127") / string.byte | ||
| 1255 | + lpeg.R("\194\223") * cont / f2 | ||
| 1256 | + lpeg.R("\224\239") * cont * cont / f3 | ||
| 1257 | + lpeg.R("\240\244") * cont * cont * cont / f4 | ||
| 1258 | |||
| 1259 | local decode_pattern = lpeg.Ct(utf8^0) * -1 | ||
| 1260 | </pre> | ||
| 1261 | |||
| 1262 | |||
| 1263 | <h3>Lua's long strings</h3> | 1177 | <h3>Lua's long strings</h3> |
| 1264 | <p> | 1178 | <p> |
| 1265 | A long string in Lua starts with the pattern <code>[=*[</code> | 1179 | A long string in Lua starts with the pattern <code>[=*[</code> |
| @@ -1416,7 +1330,7 @@ the following command is all you need to install LPeg: | |||
| 1416 | <h2><a name="license">License</a></h2> | 1330 | <h2><a name="license">License</a></h2> |
| 1417 | 1331 | ||
| 1418 | <p> | 1332 | <p> |
| 1419 | Copyright © 2007-2019 Lua.org, PUC-Rio. | 1333 | Copyright © 2007-2023 Lua.org, PUC-Rio. |
| 1420 | </p> | 1334 | </p> |
| 1421 | <p> | 1335 | <p> |
| 1422 | Permission is hereby granted, free of charge, | 1336 | Permission is hereby granted, free of charge, |
| @@ -1,7 +1,3 @@ | |||
| 1 | /* | ||
| 2 | ** $Id: lpprint.c $ | ||
| 3 | ** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license) | ||
| 4 | */ | ||
| 5 | 1 | ||
| 6 | #include <ctype.h> | 2 | #include <ctype.h> |
| 7 | #include <limits.h> | 3 | #include <limits.h> |
| @@ -1,7 +1,3 @@ | |||
| 1 | /* | ||
| 2 | ** $Id: lpprint.h $ | ||
| 3 | */ | ||
| 4 | |||
| 5 | 1 | ||
| 6 | #if !defined(lpprint_h) | 2 | #if !defined(lpprint_h) |
| 7 | #define lpprint_h | 3 | #define lpprint_h |
| @@ -1,7 +1,3 @@ | |||
| 1 | /* | ||
| 2 | ** $Id: lptree.c $ | ||
| 3 | ** Copyright 2013, Lua.org & PUC-Rio (see 'lpeg.html' for license) | ||
| 4 | */ | ||
| 5 | 1 | ||
| 6 | #include <ctype.h> | 2 | #include <ctype.h> |
| 7 | #include <limits.h> | 3 | #include <limits.h> |
| @@ -1,6 +1,3 @@ | |||
| 1 | /* | ||
| 2 | ** $Id: lptree.h $ | ||
| 3 | */ | ||
| 4 | 1 | ||
| 5 | #if !defined(lptree_h) | 2 | #if !defined(lptree_h) |
| 6 | #define lptree_h | 3 | #define lptree_h |
| @@ -1,7 +1,6 @@ | |||
| 1 | /* | 1 | /* |
| 2 | ** $Id: lptypes.h $ | ||
| 3 | ** LPeg - PEG pattern matching for Lua | 2 | ** LPeg - PEG pattern matching for Lua |
| 4 | ** Copyright 2007-2019, Lua.org & PUC-Rio (see 'lpeg.html' for license) | 3 | ** Copyright 2007-2023, Lua.org & PUC-Rio (see 'lpeg.html' for license) |
| 5 | ** written by Roberto Ierusalimschy | 4 | ** written by Roberto Ierusalimschy |
| 6 | */ | 5 | */ |
| 7 | 6 | ||
| @@ -1,7 +1,3 @@ | |||
| 1 | /* | ||
| 2 | ** $Id: lpvm.c $ | ||
| 3 | ** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license) | ||
| 4 | */ | ||
| 5 | 1 | ||
| 6 | #include <limits.h> | 2 | #include <limits.h> |
| 7 | #include <string.h> | 3 | #include <string.h> |
| @@ -1,6 +1,3 @@ | |||
| 1 | /* | ||
| 2 | ** $Id: lpvm.h $ | ||
| 3 | */ | ||
| 4 | 1 | ||
| 5 | #if !defined(lpvm_h) | 2 | #if !defined(lpvm_h) |
| 6 | #define lpvm_h | 3 | #define lpvm_h |
| @@ -10,7 +10,6 @@ | |||
| 10 | </head> | 10 | </head> |
| 11 | <body> | 11 | <body> |
| 12 | 12 | ||
| 13 | <!-- $Id: re.html $ --> | ||
| 14 | 13 | ||
| 15 | <div id="container"> | 14 | <div id="container"> |
| 16 | 15 | ||
| @@ -95,7 +94,7 @@ equivalent to <code>p / defs[name]</code></td></tr> | |||
| 95 | equivalent to <code>lpeg.Cmt(p, defs[name])</code></td></tr> | 94 | equivalent to <code>lpeg.Cmt(p, defs[name])</code></td></tr> |
| 96 | <tr><td><code>p ~> name</code></td> <td>fold capture | 95 | <tr><td><code>p ~> name</code></td> <td>fold capture |
| 97 | equivalent to <code>lpeg.Cf(p, defs[name])</code></td></tr> | 96 | equivalent to <code>lpeg.Cf(p, defs[name])</code></td></tr> |
| 98 | <tr><td><code>& p</code></td> <td>and predicate</td></tr> | 97 | <tr><td><code>& p</code></td> <td>and predicate</td></tr> |
| 99 | <tr><td><code>! p</code></td> <td>not predicate</td></tr> | 98 | <tr><td><code>! p</code></td> <td>not predicate</td></tr> |
| 100 | <tr><td><code>p1 p2</code></td> <td>concatenation</td></tr> | 99 | <tr><td><code>p1 p2</code></td> <td>concatenation</td></tr> |
| 101 | <tr><td><code>p1 / p2</code></td> <td>ordered choice</td></tr> | 100 | <tr><td><code>p1 / p2</code></td> <td>ordered choice</td></tr> |
| @@ -103,7 +102,7 @@ equivalent to <code>lpeg.Cf(p, defs[name])</code></td></tr> | |||
| 103 | </tbody></table> | 102 | </tbody></table> |
| 104 | <p> | 103 | <p> |
| 105 | Any space appearing in a syntax description can be | 104 | Any space appearing in a syntax description can be |
| 106 | replaced by zero or more space characters and Lua-style comments | 105 | replaced by zero or more space characters and Lua-style short comments |
| 107 | (<code>--</code> until end of line). | 106 | (<code>--</code> until end of line). |
| 108 | </p> | 107 | </p> |
| 109 | 108 | ||
| @@ -329,7 +328,7 @@ respecting the indentation: | |||
| 329 | <pre class="example"> | 328 | <pre class="example"> |
| 330 | p = re.compile[[ | 329 | p = re.compile[[ |
| 331 | block <- {| {:ident:' '*:} line | 330 | block <- {| {:ident:' '*:} line |
| 332 | ((=ident !' ' line) / &(=ident ' ') block)* |} | 331 | ((=ident !' ' line) / &(=ident ' ') block)* |} |
| 333 | line <- {[^%nl]*} %nl | 332 | line <- {[^%nl]*} %nl |
| 334 | ]] | 333 | ]] |
| 335 | </pre> | 334 | </pre> |
| @@ -453,7 +452,7 @@ print(re.match(p, p)) -- a self description must match itself | |||
| 453 | <h2><a name="license">License</a></h2> | 452 | <h2><a name="license">License</a></h2> |
| 454 | 453 | ||
| 455 | <p> | 454 | <p> |
| 456 | Copyright © 2008-2015 Lua.org, PUC-Rio. | 455 | Copyright © 2008-2023 Lua.org, PUC-Rio. |
| 457 | </p> | 456 | </p> |
| 458 | <p> | 457 | <p> |
| 459 | Permission is hereby granted, free of charge, | 458 | Permission is hereby granted, free of charge, |
| @@ -1,4 +1,3 @@ | |||
| 1 | -- $Id: re.lua $ | ||
| 2 | 1 | ||
| 3 | -- imported functions and modules | 2 | -- imported functions and modules |
| 4 | local tonumber, type, print, error = tonumber, type, print, error | 3 | local tonumber, type, print, error = tonumber, type, print, error |
