summary | refs | log | tree | commit | diff
path: root/testes/utf8.lua
diff options
context:
space:
mode:
author: Roberto Ierusalimschy <roberto@inf.puc-rio.br> 2018-12-17 14:46:37 -0200
committer: Roberto Ierusalimschy <roberto@inf.puc-rio.br> 2018-12-17 14:46:37 -0200
commit: 063d4e4543088e7a21965bda8ee5a0f952a9029e (patch)
tree: 6c3f2f8e98c26f071a94a32f9f2754396a66a9de /testes/utf8.lua
parent: e354c6355e7f48e087678ec49e340ca0696725b1 (diff)
download: lua-5.3.5.tar.gz
lua-5.3.5.tar.bz2
lua-5.3.5.zip
Lua 5.3.5 ported to git (v5.3.5)
This is the first commit for the branch Lua 5.3. All source files were copied from the official distribution of 5.3.5 on the Lua site. The test files are the same as those of 5.3.4. The manual came from the previous RCS repository, revision 1.167.1.2.
Diffstat (limited to 'testes/utf8.lua')
-rw-r--r-- testes/utf8.lua 210
1 files changed, 210 insertions, 0 deletions
diff --git a/testes/utf8.lua b/testes/utf8.lua
new file mode 100644
index 00000000..ebc190b7
--- /dev/null
+++ b/testes/utf8.lua
@@ -0,0 +1,210 @@
1-- $Id: utf8.lua,v 1.12 2016/11/07 13:11:28 roberto Exp $
2-- See Copyright Notice in file all.lua
3
-- Banner printed when this test script starts.
print("testing UTF-8 library")
5
6local utf8 = require'utf8'
7
8
-- Call 'f' with the extra arguments in protected mode and require that it
-- raises an error whose message matches the Lua pattern 'msg'.
-- On failure, raises an error (blamed on the caller) saying whether the
-- call unexpectedly succeeded or which message was actually produced.
local function checkerror (msg, f, ...)
  local ok, err = pcall(f, ...)
  if ok then
    error("expected an error matching '" .. msg ..
          "', but the call succeeded", 2)
  end
  -- error values are not necessarily strings; convert before matching
  local text = tostring(err)
  if not string.find(text, msg) then
    error("expected an error matching '" .. msg .. "', got: " .. text, 2)
  end
end
13
14
-- Count the characters of 's' without the utf8 library: remove every
-- byte in the range 0x80-0xBF (UTF-8 continuation bytes) and measure
-- what is left.
local function len (s)
  local stripped = string.gsub(s, "[\x80-\xBF]", "")
  return #stripped
end
18
19
-- Anchored form of utf8.charpattern: matches a string holding exactly
-- one UTF-8 byte sequence and nothing else.
local justone = table.concat{"^", utf8.charpattern, "$"}
21
-- 't' is the list of codepoints of 's'.
-- Builds the source text "return '\u{..}\u{..}...'" with one escape per
-- entry of 't', compiles it, and checks that running it yields 's'.
local function checksyntax (s, t)
  local escapes = {}
  for i = 1, #t do
    escapes[i] = string.format("\\u{%x}", t[i])
  end
  local source = "return '" .. table.concat(escapes) .. "'"
  local chunk = assert(load(source))
  assert(chunk() == s)
end
30
-- asking for a character beyond either end of the string gives nil
for _, n in ipairs{5, -4} do
  assert(utf8.offset("alo", n) == nil)
end
33
-- 't' is the list of codepoints of 's'.
-- Cross-checks the functions of the utf8 library against each other and
-- against 't' for the string 's': utf8.len, utf8.char, utf8.offset,
-- utf8.codepoint, utf8.codes and utf8.charpattern must all agree.
-- Any disagreement fails an assertion.
local function check (s, t)
  local l = utf8.len(s)
  -- the library's length, the list size and the byte-level count agree
  assert(#t == l and len(s) == l)
  assert(utf8.char(table.unpack(t)) == s)

  assert(utf8.offset(s, 0) == 1)

  checksyntax(s, t)

  -- decoding the whole string gives back exactly 't'
  local t1 = {utf8.codepoint(s, 1, -1)}
  assert(#t == #t1)
  for i = 1, #t do assert(t[i] == t1[i]) end

  for i = 1, l do
    local pi = utf8.offset(s, i)        -- position of i-th char
    local pi1 = utf8.offset(s, 2, pi)   -- position of next char
    -- the bytes in [pi, pi1) form exactly one character
    assert(string.find(string.sub(s, pi, pi1 - 1), justone))
    -- stepping back one character from the next one returns to 'pi'
    assert(utf8.offset(s, -1, pi1) == pi)
    -- the same character, counted from the end of the string
    assert(utf8.offset(s, i - l - 1) == pi)
    -- re-encoding the decoded character gives the same number of bytes
    assert(pi1 - pi == #utf8.char(utf8.codepoint(s, pi)))
    -- from any byte of the character, offset 0 finds its first byte
    for j = pi, pi1 - 1 do
      assert(utf8.offset(s, 0, j) == pi)
    end
    -- utf8.len fails when it starts in the middle of the character
    for j = pi + 1, pi1 - 1 do
      assert(not utf8.len(s, j))
    end
    assert(utf8.len(s, pi, pi) == 1)
    assert(utf8.len(s, pi, pi1 - 1) == 1)
    assert(utf8.len(s, pi) == l - i + 1)
    assert(utf8.len(s, pi1) == l - i)
    assert(utf8.len(s, 1, pi) == i)
  end

  -- utf8.codes visits every character, in order, with the right position
  local i = 0
  for p, c in utf8.codes(s) do
    i = i + 1
    assert(c == t[i] and p == utf8.offset(s, i))
    assert(utf8.codepoint(s, p) == c)
  end
  assert(i == #t)

  -- utf8.charpattern splits the string into the same characters
  i = 0
  for c in string.gmatch(s, utf8.charpattern) do
    i = i + 1
    assert(c == utf8.char(t[i]))
  end
  assert(i == #t)

  -- counting backwards from just past the end reaches the same positions
  for i = 1, l do
    assert(utf8.offset(s, i) == utf8.offset(s, i - l - 1, #s + 1))
  end

end
95
96
do   -- error indication in utf8.len
  -- each entry: an invalid string and the position utf8.len must report
  -- as its second result
  local cases = {
    {"abc\xE3def", 4},
    {"汉字\x80", #("汉字") + 1},
    {"\xF4\x9F\xBF", 1},
    {"\xF4\x9F\xBF\xBF", 1},
  }
  for _, case in ipairs(cases) do
    local count, badpos = utf8.len(case[1])
    assert(not count and badpos == case[2])
  end
end
107
-- error in utf8.codes
do
  -- iterates a string whose third byte (0xFF) is checked to be rejected
  local function traverse ()
    local s = "ab\xff"
    for pos in utf8.codes(s) do assert(pos) end
  end
  checkerror("invalid UTF%-8 code", traverse)
end
114
115
-- error in initial position for offset
checkerror("position out of range", utf8.offset, "abc", 1, 5)
checkerror("position out of range", utf8.offset, "abc", 1, -4)
checkerror("position out of range", utf8.offset, "", 1, 2)
checkerror("position out of range", utf8.offset, "", 1, -1)
do
  -- every byte after the first of a multi-byte character is a
  -- continuation byte, so each of them must be rejected as a start
  local multibyte = "𦧺"
  for i = 2, #multibyte do
    checkerror("continuation byte", utf8.offset, multibyte, 1, i)
  end
end
checkerror("continuation byte", utf8.offset, "\x80", 1)
124
125
126
-- plain ASCII: every character is a single byte, so the list of
-- codepoints is the list of bytes
local s = "hello World"
local t = {s:byte(1, -1)}
for i = 1, utf8.len(s) do
  assert(s:byte(i) == t[i])
end
check(s, t)

-- three-byte characters mixed with an ASCII one
check("汉字/漢字", {27721, 23383, 47, 28450, 23383,})
133
do
  -- three two-byte characters followed by a lone continuation byte
  local str = "áéí\128"
  local size = #str

  -- decoding everything but the last byte works
  local cps = table.pack(utf8.codepoint(str, 1, size - 1))
  assert(cps.n == 3 and cps[1] == 225 and cps[2] == 233 and cps[3] == 237)

  -- including the last byte does not
  checkerror("invalid UTF%-8 code", utf8.codepoint, str, 1, size)
  checkerror("out of range", utf8.codepoint, str, size + 1)

  -- an empty interval produces no values
  assert(select("#", utf8.codepoint(str, 4, 3)) == 0)

  checkerror("out of range", utf8.codepoint, str, -(size + 1), 1)
  checkerror("out of range", utf8.codepoint, str, 1, size + 1)
end
145
-- utf8.char with no arguments, with ASCII arguments, and at the upper
-- limit of the accepted range
assert(utf8.char() == "")
assert(utf8.char(97, 98, 99) == "abc")

do
  local maxcp = 0x10FFFF
  assert(utf8.codepoint(utf8.char(maxcp)) == maxcp)

  checkerror("value out of range", utf8.char, maxcp + 1)
end
152
-- Require that 's' is rejected as UTF-8: utf8.codepoint must raise an
-- "invalid UTF-8 code" error and utf8.len must return a false value.
local function invalid (s)
  checkerror("invalid UTF%-8 code", utf8.codepoint, s)
  local count = utf8.len(s)
  assert(not count)
end
157
do
  -- byte strings that must be rejected as UTF-8
  local malformed = {
    -- UTF-8 representation for 0x11ffff (value out of valid range)
    "\xF4\x9F\xBF\xBF",

    -- overlong sequences
    "\xC0\x80",            -- zero
    "\xC1\xBF",            -- 0x7F (should be coded in 1 byte)
    "\xE0\x9F\xBF",        -- 0x7FF (should be coded in 2 bytes)
    "\xF0\x8F\xBF\xBF",    -- 0xFFFF (should be coded in 3 bytes)

    -- invalid bytes
    "\x80",                -- continuation byte
    "\xBF",                -- continuation byte
    "\xFE",                -- invalid byte
    "\xFF",                -- invalid byte
  }
  for i = 1, #malformed do
    invalid(malformed[i])
  end
end
173
174
-- empty string
check("", {})

-- minimum and maximum values for each sequence size
do
  -- the spaces only separate the sequences visually; they are removed
  -- before the string is checked
  local spaced = "\0 \x7F\z
                  \xC2\x80 \xDF\xBF\z
                  \xE0\xA0\x80 \xEF\xBF\xBF\z
                  \xF0\x90\x80\x80 \xF4\x8F\xBF\xBF"
  local bounds = {0,0x7F, 0x80,0x7FF, 0x800,0xFFFF, 0x10000,0x10FFFF}
  s = string.gsub(spaced, " ", "")
  check(s, bounds)
end
185
-- Mixed string (three-byte, ASCII, embedded zero and two-byte characters).
-- Declared local so the test does not create a global; it is used again
-- by the pattern-based loop further down in this chunk.
local x = "日本語a-4\0éó"
check(x, {26085, 26412, 35486, 97, 45, 52, 0, 233, 243})
188
189
-- Supplementary Characters
do
  -- each entry: a string and the list of its codepoints
  local cases = {
    {"𣲷𠜎𠱓𡁻𠵼ab𠺢",
     {0x23CB7, 0x2070E, 0x20C53, 0x2107B, 0x20D7C, 0x61, 0x62, 0x20EA2,}},

    {"𨳊𩶘𦧺𨳒𥄫𤓓\xF4\x8F\xBF\xBF",
     {0x28CCA, 0x29D98, 0x269FA, 0x28CD2, 0x2512B, 0x244D3, 0x10ffff}},
  }
  for _, case in ipairs(cases) do
    check(case[1], case[2])
  end
end
196
197
-- Split 'x' with a plain string pattern (position capture plus one
-- character) and compare each piece with what the utf8 library reports.
do
  local total = utf8.len(x)
  local pattern = "()(" .. utf8.charpattern .. ")"
  local count = 0
  for pos, ch in string.gmatch(x, pattern) do
    count = count + 1
    assert(utf8.offset(x, count) == pos)
    assert(utf8.len(x, pos) == total - count + 1)
    assert(utf8.len(ch) == 1)
    -- every byte of the character except its last maps back to 'pos'
    for b = pos, pos + #ch - 2 do
      assert(utf8.offset(x, 0, b) == pos)
    end
  end
end

print("ok")
210