diff options
author | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2019-04-17 14:08:22 -0300 |
---|---|---|
committer | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2019-04-17 14:08:22 -0300 |
commit | 24bf757183d8bd97f6f5b43d916814f3269c8347 (patch) | |
tree | 646cd65d6e2dab57691f98f83f15f25c70685ef8 /test.lua | |
parent | 3f7797419e4d7493e1364290a5b127d1cb45e3bf (diff) | |
download | lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.gz lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.bz2 lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.zip |
Implementation of UTF-8 ranges
New constructor 'lpeg.utfR(from, to)' creates a pattern that matches
UTF-8 byte sequences representing code points in the range [from, to].
Diffstat (limited to 'test.lua')
-rwxr-xr-x | test.lua | 58 |
1 file changed, 57 insertions, 1 deletion
@@ -48,7 +48,6 @@ end | |||
48 | 48 | ||
49 | print"General tests for LPeg library" | 49 | print"General tests for LPeg library" |
50 | 50 | ||
51 | assert(type(m.version()) == "string") | ||
52 | print("version " .. m.version()) | 51 | print("version " .. m.version()) |
53 | assert(m.type("alo") ~= "pattern") | 52 | assert(m.type("alo") ~= "pattern") |
54 | assert(m.type(io.input) ~= "pattern") | 53 | assert(m.type(io.input) ~= "pattern") |
@@ -1189,6 +1188,63 @@ do print"testing large grammars" | |||
1189 | end | 1188 | end |
1190 | 1189 | ||
1191 | 1190 | ||
1191 | print "testing UTF-8 ranges" | ||
1192 | |||
1193 | do -- a few typical UTF-8 ranges | ||
1194 | local p = m.utfR(0x410, 0x44f)^1 / "cyr: %0" | ||
1195 | + m.utfR(0x4e00, 0x9fff)^1 / "cjk: %0" | ||
1196 | + m.utfR(0x1F600, 0x1F64F)^1 / "emot: %0" | ||
1197 | + m.utfR(0, 0x7f)^1 / "ascii: %0" | ||
1198 | + m.utfR(0, 0x10ffff) / "other: %0" | ||
1199 | |||
1200 | p = m.Ct(p^0) * -m.P(1) | ||
1201 | |||
1202 | local cyr = "ждюя" | ||
1203 | local emot = "\240\159\152\128\240\159\153\128" -- 😀🙀 | ||
1204 | local cjk = "专举乸" | ||
1205 | local ascii = "alo" | ||
1206 | local last = "\244\143\191\191" -- U+10FFFF | ||
1207 | |||
1208 | local s = cyr .. "—" .. emot .. "—" .. cjk .. "—" .. ascii .. last | ||
1209 | t = (p:match(s)) | ||
1210 | |||
1211 | assert(t[1] == "cyr: " .. cyr and t[2] == "other: —" and | ||
1212 | t[3] == "emot: " .. emot and t[4] == "other: —" and | ||
1213 | t[5] == "cjk: " .. cjk and t[6] == "other: —" and | ||
1214 | t[7] == "ascii: " .. ascii and t[8] == "other: " .. last and | ||
1215 | t[9] == nil) | ||
1216 | end | ||
1217 | |||
1218 | |||
1219 | do -- valid and invalid code points | ||
1220 | local p = m.utfR(0, 0x10ffff)^0 | ||
1221 | assert(p:match("汉字\128") == #"汉字" + 1) | ||
1222 | assert(p:match("\244\159\191") == 1) | ||
1223 | assert(p:match("\244\159\191\191") == 1) | ||
1224 | assert(p:match("\255") == 1) | ||
1225 | |||
1226 | -- basic errors | ||
1227 | checkerr("empty range", m.utfR, 1, 0) | ||
1228 | checkerr("invalid code point", m.utfR, 1, 0x10ffff + 1) | ||
1229 | end | ||
1230 | |||
1231 | |||
1232 | do -- look-behind (fixed width) | |
1233 | -- match a byte after a CJK point | ||
1234 | local p = m.B(m.utfR(0x4e00, 0x9fff)) * m.C(1) | ||
1235 | p = m.P{ p + m.P(1) * m.V(1) } -- search for 'p' | ||
1236 | assert(p:match("ab д 专X x") == "X") | ||
1237 | |||
1238 | -- match a byte after a Hebrew point | |
1239 | local p = m.B(m.utfR(0x5d0, 0x5ea)) * m.C(1) | ||
1240 | p = m.P(#"ש") * p | ||
1241 | assert(p:match("שX") == "X") | ||
1242 | |||
1243 | checkerr("fixed length", m.B, m.utfR(0, 0x10ffff)) | ||
1244 | end | ||
1245 | |||
1246 | |||
1247 | |||
1192 | ------------------------------------------------------------------- | 1248 | ------------------------------------------------------------------- |
1193 | -- Tests for 're' module | 1249 | -- Tests for 're' module |
1194 | ------------------------------------------------------------------- | 1250 | ------------------------------------------------------------------- |