@@ -664,10 +663,10 @@ LPeg does not specify when (and if) it evaluates its captures. consider the pattern lpeg.P"a" / func / 0. Because the "division" by 0 instructs LPeg to throw away the results from the pattern, -LPeg may or may not call func.) +it is not specified whether LPeg will call func.) Therefore, captures should avoid side effects. Moreover, -most captures cannot affect the way a pattern matches a subject. +captures cannot affect the way a pattern matches a subject. The only exception to this rule is the so-called match-time capture. When a match-time capture matches, @@ -1175,91 +1174,6 @@ local record = lpeg.Ct(field * (',' * field)^0) * (lpeg.P'\n' + -1) -

UTF-8 and Latin 1

-It is not difficult to use LPeg to convert a string from -UTF-8 encoding to Latin 1 (ISO 8859-1): -

- -

--- convert a two-byte UTF-8 sequence to a Latin 1 character
-local function f2 (s)
-  local c1, c2 = string.byte(s, 1, 2)
-  return string.char(c1 * 64 + c2 - 12416)
-end
-
-local utf8 = lpeg.R("\0\127")
-           + lpeg.R("\194\195") * lpeg.R("\128\191") / f2
-
-local decode_pattern = lpeg.Cs(utf8^0) * -1
-

-In this code, -the definition of UTF-8 is already restricted to the -Latin 1 range (from 0 to 255). -Any encoding outside this range (as well as any invalid encoding) -will not match that pattern. -

- -

-As the definition of decode_pattern demands that -the pattern matches the whole input (because of the -1 at its end), -any invalid string will simply fail to match, -without any useful information about the problem. -We can improve this situation by redefining decode_pattern -as follows: -

-local function er (_, i) error("invalid encoding at position " .. i) end
-
-local decode_pattern = lpeg.Cs(utf8^0) * (-1 + lpeg.P(er))
-

-Now, if the pattern utf8^0 stops -before the end of the string, -an appropriate error function is called. -

- - -

UTF-8 and Unicode

-We can extend the previous patterns to handle all Unicode code points. -Of course, -we cannot translate them to Latin 1 or any other one-byte encoding. -Instead, our translation results in an array with the code points -represented as numbers. -The full code is here: -

--- decode a two-byte UTF-8 sequence
-local function f2 (s)
-  local c1, c2 = string.byte(s, 1, 2)
-  return c1 * 64 + c2 - 12416
-end
-
--- decode a three-byte UTF-8 sequence
-local function f3 (s)
-  local c1, c2, c3 = string.byte(s, 1, 3)
-  return (c1 * 64 + c2) * 64 + c3 - 925824
-end
-
--- decode a four-byte UTF-8 sequence
-local function f4 (s)
-  local c1, c2, c3, c4 = string.byte(s, 1, 4)
-  return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168
-end
-
-local cont = lpeg.R("\128\191")   -- continuation byte
-
-local utf8 = lpeg.R("\0\127") / string.byte
-           + lpeg.R("\194\223") * cont / f2
-           + lpeg.R("\224\239") * cont * cont / f3
-           + lpeg.R("\240\244") * cont * cont * cont / f4
-
-local decode_pattern = lpeg.Ct(utf8^0) * -1
-

- -

Lua's long strings

A long string in Lua starts with the pattern [=*[ @@ -1416,7 +1330,7 @@ the following command is all you need to install LPeg:

License

Permission is hereby granted, free of charge, diff --git a/lpprint.c b/lpprint.c index 6893bb8..1c1b7b6 100644 --- a/lpprint.c +++ b/lpprint.c @@ -1,7 +1,3 @@ -/* -** $Id: lpprint.c $ -** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license) -*/ #include #include diff --git a/lpprint.h b/lpprint.h index 15ef121..42d7f98 100644 --- a/lpprint.h +++ b/lpprint.h @@ -1,7 +1,3 @@ -/* -** $Id: lpprint.h $ -*/ - #if !defined(lpprint_h) #define lpprint_h diff --git a/lptree.c b/lptree.c index 2318153..4affac9 100644 --- a/lptree.c +++ b/lptree.c @@ -1,7 +1,3 @@ -/* -** $Id: lptree.c $ -** Copyright 2013, Lua.org & PUC-Rio (see 'lpeg.html' for license) -*/ #include #include diff --git a/lptree.h b/lptree.h index 892e013..aa331d2 100644 --- a/lptree.h +++ b/lptree.h @@ -1,6 +1,3 @@ -/* -** $Id: lptree.h $ -*/ #if !defined(lptree_h) #define lptree_h diff --git a/lptypes.h b/lptypes.h index ccb4c18..98b9597 100644 --- a/lptypes.h +++ b/lptypes.h @@ -1,7 +1,6 @@ /* -** $Id: lptypes.h $ ** LPeg - PEG pattern matching for Lua -** Copyright 2007-2019, Lua.org & PUC-Rio (see 'lpeg.html' for license) +** Copyright 2007-2023, Lua.org & PUC-Rio (see 'lpeg.html' for license) ** written by Roberto Ierusalimschy */ diff --git a/lpvm.c b/lpvm.c index 72ac1dd..8c001fc 100644 --- a/lpvm.c +++ b/lpvm.c @@ -1,7 +1,3 @@ -/* -** $Id: lpvm.c $ -** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license) -*/ #include #include diff --git a/lpvm.h b/lpvm.h index ca625f9..cc79dcd 100644 --- a/lpvm.h +++ b/lpvm.h @@ -1,6 +1,3 @@ -/* -** $Id: lpvm.h $ -*/ #if !defined(lpvm_h) #define lpvm_h diff --git a/re.html b/re.html index 24a582a..ed4ccb1 100644 --- a/re.html +++ b/re.html @@ -10,7 +10,6 @@ -

@@ -95,7 +94,7 @@ equivalent to p / defs[name] equivalent to lpeg.Cmt(p, defs[name]) p ~> name fold capture equivalent to lpeg.Cf(p, defs[name]) -& p and predicate +& p and predicate ! p not predicate p1 p2 concatenation p1 / p2 ordered choice @@ -103,7 +102,7 @@ equivalent to lpeg.Cf(p, defs[name])

Any space appearing in a syntax description can be -replaced by zero or more space characters and Lua-style comments +replaced by zero or more space characters and Lua-style short comments (-- until end of line).

@@ -329,7 +328,7 @@ respecting the indentation:

 p = re.compile[[
   block <- {| {:ident:' '*:} line
-           ((=ident !' ' line) / &(=ident ' ') block)* |}
+           ((=ident !' ' line) / &(=ident ' ') block)* |}
   line <- {[^%nl]*} %nl
 ]]

@@ -453,7 +452,7 @@ print(re.match(p, p)) -- a self description must match itself

License

Permission is hereby granted, free of charge, diff --git a/re.lua b/re.lua index 3bb8af7..22cb8de 100644 --- a/re.lua +++ b/re.lua @@ -1,4 +1,3 @@ --- $Id: re.lua $ -- imported functions and modules local tonumber, type, print, error = tonumber, type, print, error -- cgit v1.2.3-55-g6feb