Lua 5.3.5 ported to git (tag: v5.3.5)

This is the first commit for the branch Lua 5.3. All source files were copied from the official distribution of 5.3.5 on the Lua site. The test files are the same as those of 5.3.4. The manual came from the previous RCS repository, revision 1.167.1.2.
author: Roberto Ierusalimschy <roberto@inf.puc-rio.br> 2018-12-17 14:46:37 -0200
committer: Roberto Ierusalimschy <roberto@inf.puc-rio.br> 2018-12-17 14:46:37 -0200
commit: 063d4e4543088e7a21965bda8ee5a0f952a9029e (patch)
tree: 6c3f2f8e98c26f071a94a32f9f2754396a66a9de /testes/utf8.lua
parent: e354c6355e7f48e087678ec49e340ca0696725b1 (diff)
download: lua-5.3.5.tar.gz
lua-5.3.5.tar.bz2
lua-5.3.5.zip
1 file changed, 210 insertions, 0 deletions
diff --git a/testes/utf8.lua b/testes/utf8.lua
new file mode 100644
index 00000000..ebc190b7
--- /dev/null
+++ b/testes/utf8.lua
@@ -0,0 +1,210 @@
+-- $Id: utf8.lua,v 1.12 2016/11/07 13:11:28 roberto Exp $
+-- See Copyright Notice in file all.lua
+print "testing UTF-8 library"
+local utf8 = require'utf8'
+local function checkerror (msg, f, ...)  -- expects f(...) to raise an error whose message matches msg
+  local s, err = pcall(f, ...)  -- s is false when the call raised; err then holds the error value
+  assert(not s and string.find(err, msg))  -- msg is used as a Lua pattern, so callers escape the dash as %-
+end
+local function len (s)  -- reference character count computed without the utf8 library
+  return #string.gsub(s, "[\x80-\xBF]", "")  -- remove bytes 0x80-0xBF and take the length of the resulting string
+end
+local justone = "^" .. utf8.charpattern .. "$"  -- anchored charpattern: the whole subject must match it
+-- 't' is the list of codepoints of 's'
+local function checksyntax (s, t)  -- checks that a literal written with \u{XXX} escapes evaluates to s
+  local ts = {"return '"}  -- pieces of a chunk that returns one single-quoted string literal
+  for i = 1, #t do ts[i + 1] = string.format("\\u{%x}", t[i]) end  -- one \u{hex} escape per entry of t
+  ts[#t + 2] = "'"  -- closing quote placed right after the last escape
+  ts = table.concat(ts)  -- ts is now the chunk source as a single string
+  assert(assert(load(ts))() == s)  -- inner assert: the chunk compiles; outer assert: its result equals s
+end
+assert(utf8.offset("alo", 5) == nil)  -- asking for character 5 of a 3-character string yields nil
+assert(utf8.offset("alo", -4) == nil)  -- same expectation for the negative index -4
+-- 't' is the list of codepoints of 's'
+local function check (s, t)  -- cross-checks the utf8 functions on string s against its codepoint list t
+  local l = utf8.len(s)  -- character count according to the library
+  assert(#t == l and len(s) == l)  -- must agree with both the list size and the byte-based reference count
+  assert(utf8.char(table.unpack(t)) == s)  -- encoding the codepoints must rebuild s
+  assert(utf8.offset(s, 0) == 1)  -- with n = 0 and no start position the result must be 1
+  checksyntax(s, t)  -- the same string must also be expressible with \u{XXX} escapes
+  local t1 = {utf8.codepoint(s, 1, -1)}  -- decode the whole string in one call
+  assert(#t == #t1)
+  for i = 1, #t do assert(t[i] == t1[i]) end
+  for i = 1, l do  -- per-character consistency checks
+    local pi = utf8.offset(s, i)        -- position of i-th char
+    local pi1 = utf8.offset(s, 2, pi)   -- position of next char
+    assert(string.find(string.sub(s, pi, pi1 - 1), justone))  -- bytes pi .. pi1 - 1 must match charpattern exactly
+    assert(utf8.offset(s, -1, pi1) == pi)  -- one character back from pi1 returns to pi
+    assert(utf8.offset(s, i - l - 1) == pi)  -- the same character addressed through a negative index
+    assert(pi1 - pi == #utf8.char(utf8.codepoint(s, pi)))  -- byte width equals the length of the re-encoded codepoint
+    for j = pi, pi1 - 1 do  -- every byte position j that belongs to this character
+      assert(utf8.offset(s, 0, j) == pi)  -- n = 0 must map j back to the start of the character
+    end
+    for j = pi + 1, pi1 - 1 do  -- positions after the first byte of the character
+      assert(not utf8.len(s, j))  -- starting utf8.len there must report failure
+    end
+   assert(utf8.len(s, pi, pi) == 1)  -- a range starting and ending at pi counts one character
+   assert(utf8.len(s, pi, pi1 - 1) == 1)  -- the range covering all bytes of the character also counts one
+   assert(utf8.len(s, pi) == l - i + 1)  -- characters from the i-th one to the end
+   assert(utf8.len(s, pi1) == l - i)  -- characters after the i-th one
+   assert(utf8.len(s, 1, pi) == i)  -- characters from the start up to and including the i-th one
+  end
+  local i = 0
+  for p, c in utf8.codes(s) do  -- first pass: position p and codepoint c of each character
+    i = i + 1
+    assert(c == t[i] and p == utf8.offset(s, i))
+    assert(utf8.codepoint(s, p) == c)  -- decoding at p must give the same codepoint
+  end
+  assert(i == #t)  -- the iterator visited exactly #t characters
+  i = 0
+  for p, c in utf8.codes(s) do  -- second pass over the same iterator
+    i = i + 1
+    assert(c == t[i] and p == utf8.offset(s, i))  -- same check as the first pass, without the codepoint call
+  end
+  assert(i == #t)
+  i = 0
+  for c in string.gmatch(s, utf8.charpattern) do  -- split s into characters with the pattern alone
+    i = i + 1
+    assert(c == utf8.char(t[i]))  -- each match equals the encoding of the expected codepoint
+  end
+  assert(i == #t)
+  for i = 1, l do
+    assert(utf8.offset(s, i) == utf8.offset(s, i - l - 1, #s + 1))  -- index i - l - 1 taken from position #s + 1 must agree with index i
+  end
+end
+do    -- error indication in utf8.len
+  local function check (s, p)  -- local helper; shadows the outer check inside this block only
+    local a, b = utf8.len(s)
+    assert(not a and b == p)  -- failure shows as a falsy first result and a second result equal to p
+  end
+  check("abc\xE3def", 4)  -- the byte \xE3 sits at position 4
+  check("汉字\x80", #("汉字") + 1)  -- stray \x80 right after the bytes of the two leading characters
+  check("\xF4\x9F\xBF", 1)  -- expected to be reported at position 1
+  check("\xF4\x9F\xBF\xBF", 1)  -- same bytes as the previous case plus one more \xBF, also reported at 1
+end
+-- error in utf8.codes
+checkerror("invalid UTF%-8 code",
+  function ()
+    local s = "ab\xff"  -- two ASCII characters followed by the byte \xff
+    for c in utf8.codes(s) do assert(c) end  -- the iteration is expected to raise the error
+  end)
+-- error in initial position for offset
+checkerror("position out of range", utf8.offset, "abc", 1, 5)  -- start position 5 on a 3-byte string
+checkerror("position out of range", utf8.offset, "abc", 1, -4)  -- start position -4 on a 3-byte string
+checkerror("position out of range", utf8.offset, "", 1, 2)  -- start position 2 on the empty string
+checkerror("position out of range", utf8.offset, "", 1, -1)  -- start position -1 on the empty string
+checkerror("continuation byte", utf8.offset, "𦧺", 1, 2)  -- byte 2 of this string is reported as a continuation byte
+checkerror("continuation byte", utf8.offset, "𦧺", 1, 2)  -- identical to the previous line (duplicated check)
+checkerror("continuation byte", utf8.offset, "\x80", 1)  -- lone \x80 with no start position given
+local s = "hello World"
+local t = {string.byte(s, 1, -1)}  -- byte values of s; passed to check below as its codepoint list
+for i = 1, utf8.len(s) do assert(t[i] == string.byte(s, i)) end  -- i-th entry equals the i-th byte
+check(s, t)
+check("汉字/漢字", {27721, 23383, 47, 28450, 23383,})  -- codepoints given in decimal; 47 is the slash
+do
+  local s = "áéí\128"  -- three accented characters followed by a byte with value 128
+  local t = {utf8.codepoint(s,1,#s - 1)}  -- decode everything except the final byte
+  assert(#t == 3 and t[1] == 225 and t[2] == 233 and t[3] == 237)
+  checkerror("invalid UTF%-8 code", utf8.codepoint, s, 1, #s)  -- including the final byte must fail
+  checkerror("out of range", utf8.codepoint, s, #s + 1)  -- start position one past the end
+  t = {utf8.codepoint(s, 4, 3)}  -- range whose start (4) is after its end (3)
+  assert(#t == 0)  -- such a range yields no values
+  checkerror("out of range", utf8.codepoint, s, -(#s + 1), 1)  -- negative start whose magnitude exceeds the length
+  checkerror("out of range", utf8.codepoint, s, 1, #s + 1)  -- end position one past the end
+end
+assert(utf8.char() == "")  -- no arguments gives the empty string
+assert(utf8.char(97, 98, 99) == "abc")  -- several codepoints are concatenated in order
+assert(utf8.codepoint(utf8.char(0x10FFFF)) == 0x10FFFF)  -- 0x10FFFF survives an encode and decode round trip
+checkerror("value out of range", utf8.char, 0x10FFFF + 1)  -- one above 0x10FFFF is rejected
+local function invalid (s)  -- asserts that both utf8.codepoint and utf8.len reject the byte string s
+  checkerror("invalid UTF%-8 code", utf8.codepoint, s)  -- decoding must raise this error
+  assert(not utf8.len(s))  -- utf8.len must return a falsy first result
+end
+-- UTF-8 representation for 0x11ffff (value out of valid range)
+invalid("\xF4\x9F\xBF\xBF")
+-- overlong sequences
+invalid("\xC0\x80")          -- zero
+invalid("\xC1\xBF")          -- 0x7F (should be coded in 1 byte)
+invalid("\xE0\x9F\xBF")      -- 0x7FF (should be coded in 2 bytes)
+invalid("\xF0\x8F\xBF\xBF")  -- 0xFFFF (should be coded in 3 bytes)
+-- invalid bytes
+invalid("\x80")  -- continuation byte
+invalid("\xBF")  -- continuation byte
+invalid("\xFE")  -- invalid byte
+invalid("\xFF")  -- invalid byte
+-- empty string
+check("", {})
+-- minimum and maximum values for each sequence size
+s = "\0 \x7F\z
+     \xC2\x80 \xDF\xBF\z
+     \xE0\xA0\x80 \xEF\xBF\xBF\z
+     \xF0\x90\x80\x80  \xF4\x8F\xBF\xBF"  -- reuses the local s declared earlier in the file
+s = string.gsub(s, " ", "")  -- the spaces only separate the sequences visually; remove them
+check(s, {0,0x7F, 0x80,0x7FF, 0x800,0xFFFF, 0x10000,0x10FFFF})  -- one pair of codepoints per line of the literal
+x = "日本語a-4\0éó"  -- assigned without local, so x is a global; the loop at the end of the file reads it
+check(x, {26085, 26412, 35486, 97, 45, 52, 0, 233, 243})
+-- Supplementary Characters
+check("𣲷𠜎𠱓𡁻𠵼ab𠺢",
+      {0x23CB7, 0x2070E, 0x20C53, 0x2107B, 0x20D7C, 0x61, 0x62, 0x20EA2,})
+check("𨳊𩶘𦧺𨳒𥄫𤓓\xF4\x8F\xBF\xBF",
+      {0x28CCA, 0x29D98, 0x269FA, 0x28CD2, 0x2512B, 0x244D3, 0x10ffff})
+local i = 0
+for p, c in string.gmatch(x, "()(" .. utf8.charpattern .. ")") do  -- p: position capture, c: bytes of one character
+  i = i + 1
+  assert(utf8.offset(x, i) == p)  -- the i-th match starts where utf8.offset places the i-th character
+  assert(utf8.len(x, p) == utf8.len(x) - i + 1)  -- characters counted from p to the end
+  assert(utf8.len(c) == 1)
+  for j = 1, #c - 1 do
+    assert(utf8.offset(x, 0, p + j - 1) == p)  -- positions p .. p + #c - 2 all resolve back to the start p
+  end
+end
+print'ok'
author	Roberto Ierusalimschy <roberto@inf.puc-rio.br>	2018-12-17 14:46:37 -0200
committer	Roberto Ierusalimschy <roberto@inf.puc-rio.br>	2018-12-17 14:46:37 -0200
commit	063d4e4543088e7a21965bda8ee5a0f952a9029e (patch)
tree	6c3f2f8e98c26f071a94a32f9f2754396a66a9de /testes/utf8.lua
parent	e354c6355e7f48e087678ec49e340ca0696725b1 (diff)
download	lua-5.3.5.tar.gz lua-5.3.5.tar.bz2 lua-5.3.5.zip

diff --git a/testes/utf8.lua b/testes/utf8.lua new file mode 100644 index 00000000..ebc190b7 --- /dev/null +++ b/testes/utf8.lua
@@ -0,0 +1,210 @@
	1	-- $Id: utf8.lua,v 1.12 2016/11/07 13:11:28 roberto Exp $
	2	-- See Copyright Notice in file all.lua
	3
	4	print "testing UTF-8 library"
	5
	6	local utf8 = require'utf8'
	7
	8
	9	local function checkerror (msg, f, ...)
	10	local s, err = pcall(f, ...)
	11	assert(not s and string.find(err, msg))
	12	end
	13
	14
	15	local function len (s)
	16	return #string.gsub(s, "[\x80-\xBF]", "")
	17	end
	18
	19
	20	local justone = "^" .. utf8.charpattern .. "$"
	21
	22	-- 't' is the list of codepoints of 's'
	23	local function checksyntax (s, t)
	24	local ts = {"return '"}
	25	for i = 1, #t do ts[i + 1] = string.format("\\u{%x}", t[i]) end
	26	ts[#t + 2] = "'"
	27	ts = table.concat(ts)
	28	assert(assert(load(ts))() == s)
	29	end
	30
	31	assert(utf8.offset("alo", 5) == nil)
	32	assert(utf8.offset("alo", -4) == nil)
	33
	34	-- 't' is the list of codepoints of 's'
	35	local function check (s, t)
	36	local l = utf8.len(s)
	37	assert(#t == l and len(s) == l)
	38	assert(utf8.char(table.unpack(t)) == s)
	39
	40	assert(utf8.offset(s, 0) == 1)
	41
	42	checksyntax(s, t)
	43
	44	local t1 = {utf8.codepoint(s, 1, -1)}
	45	assert(#t == #t1)
	46	for i = 1, #t do assert(t[i] == t1[i]) end
	47
	48	for i = 1, l do
	49	local pi = utf8.offset(s, i) -- position of i-th char
	50	local pi1 = utf8.offset(s, 2, pi) -- position of next char
	51	assert(string.find(string.sub(s, pi, pi1 - 1), justone))
	52	assert(utf8.offset(s, -1, pi1) == pi)
	53	assert(utf8.offset(s, i - l - 1) == pi)
	54	assert(pi1 - pi == #utf8.char(utf8.codepoint(s, pi)))
	55	for j = pi, pi1 - 1 do
	56	assert(utf8.offset(s, 0, j) == pi)
	57	end
	58	for j = pi + 1, pi1 - 1 do
	59	assert(not utf8.len(s, j))
	60	end
	61	assert(utf8.len(s, pi, pi) == 1)
	62	assert(utf8.len(s, pi, pi1 - 1) == 1)
	63	assert(utf8.len(s, pi) == l - i + 1)
	64	assert(utf8.len(s, pi1) == l - i)
	65	assert(utf8.len(s, 1, pi) == i)
	66	end
	67
	68	local i = 0
	69	for p, c in utf8.codes(s) do
	70	i = i + 1
	71	assert(c == t[i] and p == utf8.offset(s, i))
	72	assert(utf8.codepoint(s, p) == c)
	73	end
	74	assert(i == #t)
	75
	76	i = 0
	77	for p, c in utf8.codes(s) do
	78	i = i + 1
	79	assert(c == t[i] and p == utf8.offset(s, i))
	80	end
	81	assert(i == #t)
	82
	83	i = 0
	84	for c in string.gmatch(s, utf8.charpattern) do
	85	i = i + 1
	86	assert(c == utf8.char(t[i]))
	87	end
	88	assert(i == #t)
	89
	90	for i = 1, l do
	91	assert(utf8.offset(s, i) == utf8.offset(s, i - l - 1, #s + 1))
	92	end
	93
	94	end
	95
	96
	97	do -- error indication in utf8.len
	98	local function check (s, p)
	99	local a, b = utf8.len(s)
	100	assert(not a and b == p)
	101	end
	102	check("abc\xE3def", 4)
	103	check("汉字\x80", #("汉字") + 1)
	104	check("\xF4\x9F\xBF", 1)
	105	check("\xF4\x9F\xBF\xBF", 1)
	106	end
	107
	108	-- error in utf8.codes
	109	checkerror("invalid UTF%-8 code",
	110	function ()
	111	local s = "ab\xff"
	112	for c in utf8.codes(s) do assert(c) end
	113	end)
	114
	115
	116	-- error in initial position for offset
	117	checkerror("position out of range", utf8.offset, "abc", 1, 5)
	118	checkerror("position out of range", utf8.offset, "abc", 1, -4)
	119	checkerror("position out of range", utf8.offset, "", 1, 2)
	120	checkerror("position out of range", utf8.offset, "", 1, -1)
	121	checkerror("continuation byte", utf8.offset, "𦧺", 1, 2)
	122	checkerror("continuation byte", utf8.offset, "𦧺", 1, 2)
	123	checkerror("continuation byte", utf8.offset, "\x80", 1)
	124
	125
	126
	127	local s = "hello World"
	128	local t = {string.byte(s, 1, -1)}
	129	for i = 1, utf8.len(s) do assert(t[i] == string.byte(s, i)) end
	130	check(s, t)
	131
	132	check("汉字/漢字", {27721, 23383, 47, 28450, 23383,})
	133
	134	do
	135	local s = "áéí\128"
	136	local t = {utf8.codepoint(s,1,#s - 1)}
	137	assert(#t == 3 and t[1] == 225 and t[2] == 233 and t[3] == 237)
	138	checkerror("invalid UTF%-8 code", utf8.codepoint, s, 1, #s)
	139	checkerror("out of range", utf8.codepoint, s, #s + 1)
	140	t = {utf8.codepoint(s, 4, 3)}
	141	assert(#t == 0)
	142	checkerror("out of range", utf8.codepoint, s, -(#s + 1), 1)
	143	checkerror("out of range", utf8.codepoint, s, 1, #s + 1)
	144	end
	145
	146	assert(utf8.char() == "")
	147	assert(utf8.char(97, 98, 99) == "abc")
	148
	149	assert(utf8.codepoint(utf8.char(0x10FFFF)) == 0x10FFFF)
	150
	151	checkerror("value out of range", utf8.char, 0x10FFFF + 1)
	152
	153	local function invalid (s)
	154	checkerror("invalid UTF%-8 code", utf8.codepoint, s)
	155	assert(not utf8.len(s))
	156	end
	157
	158	-- UTF-8 representation for 0x11ffff (value out of valid range)
	159	invalid("\xF4\x9F\xBF\xBF")
	160
	161	-- overlong sequences
	162	invalid("\xC0\x80") -- zero
	163	invalid("\xC1\xBF") -- 0x7F (should be coded in 1 byte)
	164	invalid("\xE0\x9F\xBF") -- 0x7FF (should be coded in 2 bytes)
	165	invalid("\xF0\x8F\xBF\xBF") -- 0xFFFF (should be coded in 3 bytes)
	166
	167
	168	-- invalid bytes
	169	invalid("\x80") -- continuation byte
	170	invalid("\xBF") -- continuation byte
	171	invalid("\xFE") -- invalid byte
	172	invalid("\xFF") -- invalid byte
	173
	174
	175	-- empty string
	176	check("", {})
	177
	178	-- minimum and maximum values for each sequence size
	179	s = "\0 \x7F\z
	180	\xC2\x80 \xDF\xBF\z
	181	\xE0\xA0\x80 \xEF\xBF\xBF\z
	182	\xF0\x90\x80\x80 \xF4\x8F\xBF\xBF"
	183	s = string.gsub(s, " ", "")
	184	check(s, {0,0x7F, 0x80,0x7FF, 0x800,0xFFFF, 0x10000,0x10FFFF})
	185
	186	x = "日本語a-4\0éó"
	187	check(x, {26085, 26412, 35486, 97, 45, 52, 0, 233, 243})
	188
	189
	190	-- Supplementary Characters
	191	check("𣲷𠜎𠱓𡁻𠵼ab𠺢",
	192	{0x23CB7, 0x2070E, 0x20C53, 0x2107B, 0x20D7C, 0x61, 0x62, 0x20EA2,})
	193
	194	check("𨳊𩶘𦧺𨳒𥄫𤓓\xF4\x8F\xBF\xBF",
	195	{0x28CCA, 0x29D98, 0x269FA, 0x28CD2, 0x2512B, 0x244D3, 0x10ffff})
	196
	197
	198	local i = 0
	199	for p, c in string.gmatch(x, "()(" .. utf8.charpattern .. ")") do
	200	i = i + 1
	201	assert(utf8.offset(x, i) == p)
	202	assert(utf8.len(x, p) == utf8.len(x) - i + 1)
	203	assert(utf8.len(c) == 1)
	204	for j = 1, #c - 1 do
	205	assert(utf8.offset(x, 0, p + j - 1) == p)
	206	end
	207	end
	208
	209	print'ok'
	210