From 05edfcff9b7eca52571b221e614b5cbf84e7d43d Mon Sep 17 00:00:00 2001 From: Roberto Ierusalimschy Date: Mon, 10 Apr 2023 13:47:45 -0300 Subject: Documentation Removed '$Id' from all files + updated copyright year + other changes in comments and documentation --- lpeg.html | 92 +++------------------------------------------------------------ 1 file changed, 3 insertions(+), 89 deletions(-) (limited to 'lpeg.html') diff --git a/lpeg.html b/lpeg.html index f4d8658..f50d327 100644 --- a/lpeg.html +++ b/lpeg.html @@ -10,7 +10,6 @@ -
@@ -664,10 +663,10 @@ LPeg does not specify when (and if) it evaluates its captures. consider the pattern lpeg.P"a" / func / 0. Because the "division" by 0 instructs LPeg to throw away the results from the pattern, -LPeg may or may not call func.) +it is not specified whether LPeg will call func.) Therefore, captures should avoid side effects. Moreover, -most captures cannot affect the way a pattern matches a subject. +captures cannot affect the way a pattern matches a subject. The only exception to this rule is the so-called match-time capture. When a match-time capture matches, @@ -1175,91 +1174,6 @@ local record = lpeg.Ct(field * (',' * field)^0) * (lpeg.P'\n' + -1) -

UTF-8 and Latin 1

-

-It is not difficult to use LPeg to convert a string from -UTF-8 encoding to Latin 1 (ISO 8859-1): -

- -
--- convert a two-byte UTF-8 sequence to a Latin 1 character
-local function f2 (s)
-  local c1, c2 = string.byte(s, 1, 2)
-  return string.char(c1 * 64 + c2 - 12416)
-end
-
-local utf8 = lpeg.R("\0\127")
-           + lpeg.R("\194\195") * lpeg.R("\128\191") / f2
-
-local decode_pattern = lpeg.Cs(utf8^0) * -1
-
-

-In this code, -the definition of UTF-8 is already restricted to the -Latin 1 range (from 0 to 255). -Any encoding outside this range (as well as any invalid encoding) -will not match that pattern. -

- -

-As the definition of decode_pattern demands that -the pattern matches the whole input (because of the -1 at its end), -any invalid string will simply fail to match, -without any useful information about the problem. -We can improve this situation redefining decode_pattern -as follows: -

-
-local function er (_, i) error("invalid encoding at position " .. i) end
-
-local decode_pattern = lpeg.Cs(utf8^0) * (-1 + lpeg.P(er))
-
-

-Now, if the pattern utf8^0 stops -before the end of the string, -an appropriate error function is called. -

- - -

UTF-8 and Unicode

-

-We can extend the previous patterns to handle all Unicode code points. -Of course, -we cannot translate them to Latin 1 or any other one-byte encoding. -Instead, our translation results in a array with the code points -represented as numbers. -The full code is here: -

-
--- decode a two-byte UTF-8 sequence
-local function f2 (s)
-  local c1, c2 = string.byte(s, 1, 2)
-  return c1 * 64 + c2 - 12416
-end
-
--- decode a three-byte UTF-8 sequence
-local function f3 (s)
-  local c1, c2, c3 = string.byte(s, 1, 3)
-  return (c1 * 64 + c2) * 64 + c3 - 925824
-end
-
--- decode a four-byte UTF-8 sequence
-local function f4 (s)
-  local c1, c2, c3, c4 = string.byte(s, 1, 4)
-  return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168
-end
-
-local cont = lpeg.R("\128\191")   -- continuation byte
-
-local utf8 = lpeg.R("\0\127") / string.byte
-           + lpeg.R("\194\223") * cont / f2
-           + lpeg.R("\224\239") * cont * cont / f3
-           + lpeg.R("\240\244") * cont * cont * cont / f4
-
-local decode_pattern = lpeg.Ct(utf8^0) * -1
-
- -

Lua's long strings

A long string in Lua starts with the pattern [=*[ @@ -1416,7 +1330,7 @@ the following command is all you need to install LPeg:

License

-Copyright © 2007-2019 Lua.org, PUC-Rio. +Copyright © 2007-2023 Lua.org, PUC-Rio.

Permission is hereby granted, free of charge, -- cgit v1.2.3-55-g6feb