aboutsummaryrefslogtreecommitdiff
path: root/test.lua
diff options
context:
space:
mode:
author Roberto Ierusalimschy <roberto@inf.puc-rio.br> 2019-04-17 14:08:22 -0300
committer Roberto Ierusalimschy <roberto@inf.puc-rio.br> 2019-04-17 14:08:22 -0300
commit 24bf757183d8bd97f6f5b43d916814f3269c8347 (patch)
tree 646cd65d6e2dab57691f98f83f15f25c70685ef8 /test.lua
parent 3f7797419e4d7493e1364290a5b127d1cb45e3bf (diff)
download lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.gz
lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.bz2
lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.zip
Implementation of UTF-8 ranges
New constructor 'lpeg.utfR(from, to)' creates a pattern that matches UTF-8 byte sequences representing code points in the range [from, to].
Diffstat (limited to 'test.lua')
-rwxr-xr-x test.lua 58
1 files changed, 57 insertions, 1 deletions
diff --git a/test.lua b/test.lua
index f57cdec..e86c21a 100755
--- a/test.lua
+++ b/test.lua
@@ -48,7 +48,6 @@ end
48 48
49print"General tests for LPeg library" 49print"General tests for LPeg library"
50 50
51assert(type(m.version()) == "string")
52print("version " .. m.version()) 51print("version " .. m.version())
53assert(m.type("alo") ~= "pattern") 52assert(m.type("alo") ~= "pattern")
54assert(m.type(io.input) ~= "pattern") 53assert(m.type(io.input) ~= "pattern")
@@ -1189,6 +1188,63 @@ do print"testing large grammars"
1189end 1188end
1190 1189
1191 1190
print "testing UTF-8 ranges"

do  -- a few typical UTF-8 ranges
  -- Ordered choice: each of the first four alternatives consumes a maximal
  -- run of code points from one range and labels it with its name; the
  -- last alternative takes a single code point of any value, so whatever
  -- the earlier alternatives do not claim is labelled "other".
  local p = m.utfR(0x410, 0x44f)^1 / "cyr: %0"
          + m.utfR(0x4e00, 0x9fff)^1 / "cjk: %0"
          + m.utfR(0x1F600, 0x1F64F)^1 / "emot: %0"
          + m.utfR(0, 0x7f)^1 / "ascii: %0"
          + m.utfR(0, 0x10ffff) / "other: %0"

  -- collect every label in a table and require that the whole subject
  -- be consumed
  p = m.Ct(p^0) * -m.P(1)

  local cyr = "ждюя"
  local emot = "\240\159\152\128\240\159\153\128"   -- 😀🙀
  local cjk = "专举乸"
  local ascii = "alo"
  local last = "\244\143\191\191"    -- U+10FFFF

  local s = cyr .. "—" .. emot .. "—" .. cjk .. "—" .. ascii .. last

  -- 'local' keeps the result from leaking into the global environment
  local t = p:match(s)
  assert(t, "UTF-8 range pattern did not match the whole subject")

  -- expected labels, in subject order
  local expected = {
    "cyr: " .. cyr,     "other: —",
    "emot: " .. emot,   "other: —",
    "cjk: " .. cjk,     "other: —",
    "ascii: " .. ascii, "other: " .. last,
  }

  -- compare one capture at a time so a failure names the offending index
  for i = 1, #expected do
    assert(t[i] == expected[i],
           string.format("capture %d: expected '%s', got '%s'",
                         i, expected[i], tostring(t[i])))
  end
  assert(t[#expected + 1] == nil, "unexpected extra capture")
end
1217
1218
do  -- valid and invalid code points
  -- zero or more code points from the full Unicode range; the match
  -- stops in front of the first byte sequence that is not a valid one
  local any_cp = m.utfR(0, 0x10ffff)^0

  -- each entry: { subject, position where the match must stop }
  local cases = {
    { "汉字\128", #"汉字" + 1 },     -- lone continuation byte after valid text
    { "\244\159\191", 1 },          -- incomplete 4-byte sequence
    { "\244\159\191\191", 1 },      -- encodes a value above U+10FFFF
    { "\255", 1 },                  -- byte that never occurs in UTF-8
  }
  for i = 1, #cases do
    local subject, stop = cases[i][1], cases[i][2]
    assert(any_cp:match(subject) == stop)
  end

  -- basic errors
  checkerr("empty range", m.utfR, 1, 0)
  checkerr("invalid code point", m.utfR, 1, 0x10ffff + 1)
end
1230
1231
do  -- look-behind (lpeg.B) over fixed-width UTF-8 ranges
  -- match a byte after a CJK point
  local after_cjk = m.B(m.utfR(0x4e00, 0x9fff)) * m.C(1)
  -- wrap in a grammar that advances one byte at a time until it matches
  after_cjk = m.P{ after_cjk + m.P(1) * m.V(1) }
  assert(after_cjk:match("ab д 专X x") == "X")

  -- match a byte after a hebrew point
  -- (a separate local, instead of redeclaring the one above)
  local after_hebrew = m.B(m.utfR(0x5d0, 0x5ea)) * m.C(1)
  -- skip the bytes of the letter first, so the look-behind has it behind
  after_hebrew = m.P(#"ש") * after_hebrew
  assert(after_hebrew:match("שX") == "X")

  -- code points in the full range encode to different byte lengths, so
  -- the range cannot be used as a look-behind pattern
  checkerr("fixed length", m.B, m.utfR(0, 0x10ffff))
end
1245
1246
1247
1192------------------------------------------------------------------- 1248-------------------------------------------------------------------
1193-- Tests for 're' module 1249-- Tests for 're' module
1194------------------------------------------------------------------- 1250-------------------------------------------------------------------