Implementation of UTF-8 ranges

New constructor 'lpeg.utfR(from, to)' creates a pattern that matches UTF-8 byte sequences representing code points in the range [from, to].
author: Roberto Ierusalimschy <roberto@inf.puc-rio.br> 2019-04-17 14:08:22 -0300
committer: Roberto Ierusalimschy <roberto@inf.puc-rio.br> 2019-04-17 14:08:22 -0300
commit: 24bf757183d8bd97f6f5b43d916814f3269c8347 (patch)
tree: 646cd65d6e2dab57691f98f83f15f25c70685ef8 /test.lua
parent: 3f7797419e4d7493e1364290a5b127d1cb45e3bf (diff)
download: lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.gz
lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.bz2
lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.zip
1 file changed, 57 insertions, 1 deletion
diff --git a/test.lua b/test.lua
index f57cdec..e86c21a 100755
--- a/test.lua
+++ b/test.lua
@@ -48,7 +48,6 @@ end
 print"General tests for LPeg library"
-assert(type(m.version()) == "string")
 print("version " .. m.version())
 assert(m.type("alo") ~= "pattern")
 assert(m.type(io.input) ~= "pattern")
@@ -1189,6 +1188,63 @@ do  print"testing large grammars"
 end
+print "testing UTF-8 ranges"
+do   -- typical UTF-8 ranges: split a mixed string into runs, labelling each run by the first range that matches
+  local p = m.utfR(0x410, 0x44f)^1 / "cyr: %0"       -- Cyrillic letters U+0410..U+044F; %0 is the whole matched run
+          + m.utfR(0x4e00, 0x9fff)^1 / "cjk: %0"     -- CJK ideographs U+4E00..U+9FFF
+          + m.utfR(0x1F600, 0x1F64F)^1 / "emot: %0"  -- emoticons U+1F600..U+1F64F
+          + m.utfR(0, 0x7f)^1 / "ascii: %0"          -- ASCII; listed before the catch-all so ordered choice prefers it
+          + m.utfR(0, 0x10ffff) / "other: %0"        -- catch-all: exactly one code point per capture (no ^1)
+  p = m.Ct(p^0) * -m.P(1)   -- gather every capture into a table; -m.P(1) succeeds only at the end of the subject
+  local cyr = "ждюя"
+  local emot = "\240\159\152\128\240\159\153\128"   --  😀🙀 (U+1F600, U+1F640)
+  local cjk = "专举乸"
+  local ascii = "alo"
+  local last = "\244\143\191\191"                -- U+10FFFF, the upper bound used in the catch-all range
+  local s = cyr .. "—" .. emot .. "—" .. cjk .. "—" .. ascii .. last   -- the em dashes fall outside the four named ranges
+  t = (p:match(s))   -- parentheses keep only the first result. NOTE(review): no 'local' for 't' in this hunk; confirm it is declared earlier in the file
+  assert(t[1] == "cyr: " .. cyr and t[2] == "other: —" and
+         t[3] == "emot: " .. emot and t[4] == "other: —" and
+         t[5] == "cjk: " .. cjk and t[6] == "other: —" and
+         t[7] == "ascii: " .. ascii and t[8] == "other: " .. last and
+         t[9] == nil)   -- exactly eight captures, nothing after the last one
+end
+do   -- valid and invalid code points: p consumes as many well-formed code points as it can
+  local p = m.utfR(0, 0x10ffff)^0
+  assert(p:match("汉字\128") == #"汉字" + 1)   -- stops before the stray continuation byte \128
+  assert(p:match("\244\159\191") == 1)         -- truncated 4-byte sequence: nothing consumed
+  assert(p:match("\244\159\191\191") == 1)     -- would decode to U+11FFFF, above U+10FFFF: nothing consumed
+  assert(p:match("\255") == 1)                 -- byte 0xFF never appears in UTF-8: nothing consumed
+   -- argument errors raised by the constructor itself
+  checkerr("empty range", m.utfR, 1, 0)   -- from > to; checkerr is defined outside this hunk, presumably it asserts the call fails with this message
+  checkerr("invalid code point", m.utfR, 1, 0x10ffff + 1)   -- upper bound one past U+10FFFF
+end
+do  -- look-behind with m.B (needs fixed-width patterns)
+  -- match a byte after a CJK point (every code point in U+4E00..U+9FFF is 3 bytes in UTF-8)
+  local p = m.B(m.utfR(0x4e00, 0x9fff)) * m.C(1)
+  p = m.P{ p + m.P(1) * m.V(1) }   -- search for 'p': try it here, otherwise skip one byte and retry
+  assert(p:match("ab д 专X x") == "X")
+  -- match a byte after a hebrew point (every code point in U+05D0..U+05EA is 2 bytes in UTF-8)
+  local p = m.B(m.utfR(0x5d0, 0x5ea)) * m.C(1)
+  p = m.P(#"ש") * p   -- first skip the letter's bytes (#"ש" is its byte length), then look behind at it
+  assert(p:match("שX") == "X")
+  checkerr("fixed length", m.B, m.utfR(0, 0x10ffff))   -- the full range spans 1 to 4 bytes, so m.B must reject it
+end
 -------------------------------------------------------------------
 -- Tests for 're' module
 -------------------------------------------------------------------
author	Roberto Ierusalimschy <roberto@inf.puc-rio.br>	2019-04-17 14:08:22 -0300
committer	Roberto Ierusalimschy <roberto@inf.puc-rio.br>	2019-04-17 14:08:22 -0300
commit	24bf757183d8bd97f6f5b43d916814f3269c8347 (patch)
tree	646cd65d6e2dab57691f98f83f15f25c70685ef8 /test.lua
parent	3f7797419e4d7493e1364290a5b127d1cb45e3bf (diff)
download	lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.gz lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.bz2 lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.zip

diff --git a/test.lua b/test.lua index f57cdec..e86c21a 100755 --- a/test.lua +++ b/test.lua
@@ -48,7 +48,6 @@ end
48		48
49	print"General tests for LPeg library"	49	print"General tests for LPeg library"
50		50
51	assert(type(m.version()) == "string")
52	print("version " .. m.version())	51	print("version " .. m.version())
53	assert(m.type("alo") ~= "pattern")	52	assert(m.type("alo") ~= "pattern")
54	assert(m.type(io.input) ~= "pattern")	53	assert(m.type(io.input) ~= "pattern")
@@ -1189,6 +1188,63 @@ do print"testing large grammars"
1189	end	1188	end
1190		1189
1191		1190
		1191	print "testing UTF-8 ranges"
		1192
		1193	do -- a few typical UTF-8 ranges
		1194	local p = m.utfR(0x410, 0x44f)^1 / "cyr: %0"
		1195	+ m.utfR(0x4e00, 0x9fff)^1 / "cjk: %0"
		1196	+ m.utfR(0x1F600, 0x1F64F)^1 / "emot: %0"
		1197	+ m.utfR(0, 0x7f)^1 / "ascii: %0"
		1198	+ m.utfR(0, 0x10ffff) / "other: %0"
		1199
		1200	p = m.Ct(p^0) * -m.P(1)
		1201
		1202	local cyr = "ждюя"
		1203	local emot = "\240\159\152\128\240\159\153\128" -- 😀🙀
		1204	local cjk = "专举乸"
		1205	local ascii = "alo"
		1206	local last = "\244\143\191\191" -- U+10FFFF
		1207
		1208	local s = cyr .. "—" .. emot .. "—" .. cjk .. "—" .. ascii .. last
		1209	t = (p:match(s))
		1210
		1211	assert(t[1] == "cyr: " .. cyr and t[2] == "other: —" and
		1212	t[3] == "emot: " .. emot and t[4] == "other: —" and
		1213	t[5] == "cjk: " .. cjk and t[6] == "other: —" and
		1214	t[7] == "ascii: " .. ascii and t[8] == "other: " .. last and
		1215	t[9] == nil)
		1216	end
		1217
		1218
		1219	do -- valid and invalid code points
		1220	local p = m.utfR(0, 0x10ffff)^0
		1221	assert(p:match("汉字\128") == #"汉字" + 1)
		1222	assert(p:match("\244\159\191") == 1)
		1223	assert(p:match("\244\159\191\191") == 1)
		1224	assert(p:match("\255") == 1)
		1225
		1226	-- basic errors
		1227	checkerr("empty range", m.utfR, 1, 0)
		1228	checkerr("invalid code point", m.utfR, 1, 0x10ffff + 1)
		1229	end
		1230
		1231
		1232	do -- back references (fixed width)
		1233	-- match a byte after a CJK point
		1234	local p = m.B(m.utfR(0x4e00, 0x9fff)) * m.C(1)
		1235	p = m.P{ p + m.P(1) * m.V(1) } -- search for 'p'
		1236	assert(p:match("ab д 专X x") == "X")
		1237
		1238	-- match a byte after a hebrew point
		1239	local p = m.B(m.utfR(0x5d0, 0x5ea)) * m.C(1)
		1240	p = m.P(#"ש") * p
		1241	assert(p:match("שX") == "X")
		1242
		1243	checkerr("fixed length", m.B, m.utfR(0, 0x10ffff))
		1244	end
		1245
		1246
		1247
1192	-------------------------------------------------------------------	1248	-------------------------------------------------------------------
1193	-- Tests for 're' module	1249	-- Tests for 're' module
1194	-------------------------------------------------------------------	1250	-------------------------------------------------------------------