diff options
author | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2019-03-15 13:14:17 -0300 |
---|---|---|
committer | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2019-03-15 13:14:17 -0300 |
commit | 1e0c73d5b643707335b06abd2546a83d9439d14c (patch) | |
tree | b80b7d5e2cfeeef888ddf98fcc6276832134c1bf /testes | |
parent | 8fa4f1380b9a203bfdf002c2e9e9e13ebb8384c1 (diff) | |
download | lua-1e0c73d5b643707335b06abd2546a83d9439d14c.tar.gz lua-1e0c73d5b643707335b06abd2546a83d9439d14c.tar.bz2 lua-1e0c73d5b643707335b06abd2546a83d9439d14c.zip |
Changes in the validation of UTF-8
All UTF-8 encoding functionality (including the escape
sequence '\u') accepts all values from the original UTF-8
specification (with sequences of up to six bytes).
By default, the decoding functions in the UTF-8 library do not
accept invalid Unicode code points, such as surrogates. A new
optional parameter 'nonstrict' (the last argument of 'utf8.len',
'utf8.codepoint', and 'utf8.codes') makes them accept all code points
up to (2^31)-1 (0x7FFFFFFF), as in the original UTF-8 specification.
Diffstat (limited to 'testes')
-rw-r--r-- | testes/literals.lua | 17 | ||||
-rw-r--r-- | testes/utf8.lua | 92 |
2 files changed, 72 insertions, 37 deletions
diff --git a/testes/literals.lua b/testes/literals.lua index 76c08f12..fc45d4ad 100644 --- a/testes/literals.lua +++ b/testes/literals.lua | |||
@@ -56,16 +56,23 @@ assert("abc\z | |||
56 | assert("\u{0}\u{00000000}\x00\0" == string.char(0, 0, 0, 0)) | 56 | assert("\u{0}\u{00000000}\x00\0" == string.char(0, 0, 0, 0)) |
57 | 57 | ||
58 | -- limits for 1-byte sequences | 58 | -- limits for 1-byte sequences |
59 | assert("\u{0}\u{7F}" == "\x00\z\x7F") | 59 | assert("\u{0}\u{7F}" == "\x00\x7F") |
60 | 60 | ||
61 | -- limits for 2-byte sequences | 61 | -- limits for 2-byte sequences |
62 | assert("\u{80}\u{7FF}" == "\xC2\x80\z\xDF\xBF") | 62 | assert("\u{80}\u{7FF}" == "\xC2\x80\xDF\xBF") |
63 | 63 | ||
64 | -- limits for 3-byte sequences | 64 | -- limits for 3-byte sequences |
65 | assert("\u{800}\u{FFFF}" == "\xE0\xA0\x80\z\xEF\xBF\xBF") | 65 | assert("\u{800}\u{FFFF}" == "\xE0\xA0\x80\xEF\xBF\xBF") |
66 | 66 | ||
67 | -- limits for 4-byte sequences | 67 | -- limits for 4-byte sequences |
68 | assert("\u{10000}\u{10FFFF}" == "\xF0\x90\x80\x80\z\xF4\x8F\xBF\xBF") | 68 | assert("\u{10000}\u{1FFFFF}" == "\xF0\x90\x80\x80\xF7\xBF\xBF\xBF") |
69 | |||
70 | -- limits for 5-byte sequences | ||
71 | assert("\u{200000}\u{3FFFFFF}" == "\xF8\x88\x80\x80\x80\xFB\xBF\xBF\xBF\xBF") | ||
72 | |||
73 | -- limits for 6-byte sequences | ||
74 | assert("\u{4000000}\u{7FFFFFFF}" == | ||
75 | "\xFC\x84\x80\x80\x80\x80\xFD\xBF\xBF\xBF\xBF\xBF") | ||
69 | 76 | ||
70 | 77 | ||
71 | -- Error in escape sequences | 78 | -- Error in escape sequences |
@@ -94,7 +101,7 @@ lexerror([["xyz\300"]], [[\300"]]) | |||
94 | lexerror([[" \256"]], [[\256"]]) | 101 | lexerror([[" \256"]], [[\256"]]) |
95 | 102 | ||
96 | -- errors in UTF-8 sequences | 103 | -- errors in UTF-8 sequences |
97 | lexerror([["abc\u{110000}"]], [[abc\u{110000]]) -- too large | 104 | lexerror([["abc\u{100000000}"]], [[abc\u{100000000]]) -- too large |
98 | lexerror([["abc\u11r"]], [[abc\u1]]) -- missing '{' | 105 | lexerror([["abc\u11r"]], [[abc\u1]]) -- missing '{' |
99 | lexerror([["abc\u"]], [[abc\u"]]) -- missing '{' | 106 | lexerror([["abc\u"]], [[abc\u"]]) -- missing '{' |
100 | lexerror([["abc\u{11r"]], [[abc\u{11r]]) -- missing '}' | 107 | lexerror([["abc\u{11r"]], [[abc\u{11r]]) -- missing '}' |
diff --git a/testes/utf8.lua b/testes/utf8.lua index 4b6a57fd..86ec1b00 100644 --- a/testes/utf8.lua +++ b/testes/utf8.lua | |||
@@ -21,62 +21,59 @@ local justone = "^" .. utf8.charpattern .. "$" | |||
21 | 21 | ||
22 | -- 't' is the list of codepoints of 's' | 22 | -- 't' is the list of codepoints of 's' |
23 | local function checksyntax (s, t) | 23 | local function checksyntax (s, t) |
24 | -- creates a string "return '\u{t[1]}...\u{t[n]}'" | ||
24 | local ts = {"return '"} | 25 | local ts = {"return '"} |
25 | for i = 1, #t do ts[i + 1] = string.format("\\u{%x}", t[i]) end | 26 | for i = 1, #t do ts[i + 1] = string.format("\\u{%x}", t[i]) end |
26 | ts[#t + 2] = "'" | 27 | ts[#t + 2] = "'" |
27 | ts = table.concat(ts) | 28 | ts = table.concat(ts) |
29 | -- its execution should result in 's' | ||
28 | assert(assert(load(ts))() == s) | 30 | assert(assert(load(ts))() == s) |
29 | end | 31 | end |
30 | 32 | ||
31 | assert(utf8.offset("alo", 5) == nil) | 33 | assert(utf8.offset("alo", 5) == nil) |
32 | assert(utf8.offset("alo", -4) == nil) | 34 | assert(utf8.offset("alo", -4) == nil) |
33 | 35 | ||
34 | -- 't' is the list of codepoints of 's' | 36 | -- 'check' makes several tests over the validity of string 's'. |
35 | local function check (s, t) | 37 | -- 't' is the list of codepoints of 's'. |
36 | local l = utf8.len(s) | 38 | local function check (s, t, nonstrict) |
39 | local l = utf8.len(s, 1, -1, nonstrict) | ||
37 | assert(#t == l and len(s) == l) | 40 | assert(#t == l and len(s) == l) |
38 | assert(utf8.char(table.unpack(t)) == s) | 41 | assert(utf8.char(table.unpack(t)) == s) -- 't' and 's' are equivalent |
39 | 42 | ||
40 | assert(utf8.offset(s, 0) == 1) | 43 | assert(utf8.offset(s, 0) == 1) |
41 | 44 | ||
42 | checksyntax(s, t) | 45 | checksyntax(s, t) |
43 | 46 | ||
44 | local t1 = {utf8.codepoint(s, 1, -1)} | 47 | -- creates new table with all codepoints of 's' |
48 | local t1 = {utf8.codepoint(s, 1, -1, nonstrict)} | ||
45 | assert(#t == #t1) | 49 | assert(#t == #t1) |
46 | for i = 1, #t do assert(t[i] == t1[i]) end | 50 | for i = 1, #t do assert(t[i] == t1[i]) end -- 't' is equal to 't1' |
47 | 51 | ||
48 | for i = 1, l do | 52 | for i = 1, l do -- for all codepoints |
49 | local pi = utf8.offset(s, i) -- position of i-th char | 53 | local pi = utf8.offset(s, i) -- position of i-th char |
50 | local pi1 = utf8.offset(s, 2, pi) -- position of next char | 54 | local pi1 = utf8.offset(s, 2, pi) -- position of next char |
51 | assert(string.find(string.sub(s, pi, pi1 - 1), justone)) | 55 | assert(string.find(string.sub(s, pi, pi1 - 1), justone)) |
52 | assert(utf8.offset(s, -1, pi1) == pi) | 56 | assert(utf8.offset(s, -1, pi1) == pi) |
53 | assert(utf8.offset(s, i - l - 1) == pi) | 57 | assert(utf8.offset(s, i - l - 1) == pi) |
54 | assert(pi1 - pi == #utf8.char(utf8.codepoint(s, pi))) | 58 | assert(pi1 - pi == #utf8.char(utf8.codepoint(s, pi, pi, nonstrict))) |
55 | for j = pi, pi1 - 1 do | 59 | for j = pi, pi1 - 1 do |
56 | assert(utf8.offset(s, 0, j) == pi) | 60 | assert(utf8.offset(s, 0, j) == pi) |
57 | end | 61 | end |
58 | for j = pi + 1, pi1 - 1 do | 62 | for j = pi + 1, pi1 - 1 do |
59 | assert(not utf8.len(s, j)) | 63 | assert(not utf8.len(s, j)) |
60 | end | 64 | end |
61 | assert(utf8.len(s, pi, pi) == 1) | 65 | assert(utf8.len(s, pi, pi, nonstrict) == 1) |
62 | assert(utf8.len(s, pi, pi1 - 1) == 1) | 66 | assert(utf8.len(s, pi, pi1 - 1, nonstrict) == 1) |
63 | assert(utf8.len(s, pi) == l - i + 1) | 67 | assert(utf8.len(s, pi, -1, nonstrict) == l - i + 1) |
64 | assert(utf8.len(s, pi1) == l - i) | 68 | assert(utf8.len(s, pi1, -1, nonstrict) == l - i) |
65 | assert(utf8.len(s, 1, pi) == i) | 69 | assert(utf8.len(s, 1, pi, -1, nonstrict) == i) |
66 | end | 70 | end |
67 | 71 | ||
68 | local i = 0 | 72 | local i = 0 |
69 | for p, c in utf8.codes(s) do | 73 | for p, c in utf8.codes(s, nonstrict) do |
70 | i = i + 1 | 74 | i = i + 1 |
71 | assert(c == t[i] and p == utf8.offset(s, i)) | 75 | assert(c == t[i] and p == utf8.offset(s, i)) |
72 | assert(utf8.codepoint(s, p) == c) | 76 | assert(utf8.codepoint(s, p, p, nonstrict) == c) |
73 | end | ||
74 | assert(i == #t) | ||
75 | |||
76 | i = 0 | ||
77 | for p, c in utf8.codes(s) do | ||
78 | i = i + 1 | ||
79 | assert(c == t[i] and p == utf8.offset(s, i)) | ||
80 | end | 77 | end |
81 | assert(i == #t) | 78 | assert(i == #t) |
82 | 79 | ||
@@ -105,13 +102,17 @@ do -- error indication in utf8.len | |||
105 | check("\xF4\x9F\xBF\xBF", 1) | 102 | check("\xF4\x9F\xBF\xBF", 1) |
106 | end | 103 | end |
107 | 104 | ||
108 | -- error in utf8.codes | 105 | -- errors in utf8.codes |
109 | checkerror("invalid UTF%-8 code", | 106 | do |
110 | function () | 107 | local function errorcodes (s) |
111 | local s = "ab\xff" | 108 | checkerror("invalid UTF%-8 code", |
112 | for c in utf8.codes(s) do assert(c) end | 109 | function () |
113 | end) | 110 | for c in utf8.codes(s) do assert(c) end |
114 | 111 | end) | |
112 | end | ||
113 | errorcodes("ab\xff") | ||
114 | errorcodes("\u{110000}") | ||
115 | end | ||
115 | 116 | ||
116 | -- error in initial position for offset | 117 | -- error in initial position for offset |
117 | checkerror("position out of range", utf8.offset, "abc", 1, 5) | 118 | checkerror("position out of range", utf8.offset, "abc", 1, 5) |
@@ -141,14 +142,22 @@ do | |||
141 | assert(#t == 0) | 142 | assert(#t == 0) |
142 | checkerror("out of range", utf8.codepoint, s, -(#s + 1), 1) | 143 | checkerror("out of range", utf8.codepoint, s, -(#s + 1), 1) |
143 | checkerror("out of range", utf8.codepoint, s, 1, #s + 1) | 144 | checkerror("out of range", utf8.codepoint, s, 1, #s + 1) |
145 | -- surrogates | ||
146 | assert(utf8.codepoint("\u{D7FF}") == 0xD800 - 1) | ||
147 | assert(utf8.codepoint("\u{E000}") == 0xDFFF + 1) | ||
148 | assert(utf8.codepoint("\u{D800}", 1, 1, true) == 0xD800) | ||
149 | assert(utf8.codepoint("\u{DFFF}", 1, 1, true) == 0xDFFF) | ||
150 | assert(utf8.codepoint("\u{7FFFFFFF}", 1, 1, true) == 0x7FFFFFFF) | ||
144 | end | 151 | end |
145 | 152 | ||
146 | assert(utf8.char() == "") | 153 | assert(utf8.char() == "") |
147 | assert(utf8.char(97, 98, 99) == "abc") | 154 | assert(utf8.char(0, 97, 98, 99, 1) == "\0abc\1") |
148 | 155 | ||
149 | assert(utf8.codepoint(utf8.char(0x10FFFF)) == 0x10FFFF) | 156 | assert(utf8.codepoint(utf8.char(0x10FFFF)) == 0x10FFFF) |
157 | assert(utf8.codepoint(utf8.char(0x7FFFFFFF), 1, 1, true) == (1<<31) - 1) | ||
150 | 158 | ||
151 | checkerror("value out of range", utf8.char, 0x10FFFF + 1) | 159 | checkerror("value out of range", utf8.char, 0x7FFFFFFF + 1) |
160 | checkerror("value out of range", utf8.char, -1) | ||
152 | 161 | ||
153 | local function invalid (s) | 162 | local function invalid (s) |
154 | checkerror("invalid UTF%-8 code", utf8.codepoint, s) | 163 | checkerror("invalid UTF%-8 code", utf8.codepoint, s) |
@@ -158,6 +167,10 @@ end | |||
158 | -- UTF-8 representation for 0x11ffff (value out of valid range) | 167 | -- UTF-8 representation for 0x11ffff (value out of valid range) |
159 | invalid("\xF4\x9F\xBF\xBF") | 168 | invalid("\xF4\x9F\xBF\xBF") |
160 | 169 | ||
170 | -- surrogates | ||
171 | invalid("\u{D800}") | ||
172 | invalid("\u{DFFF}") | ||
173 | |||
161 | -- overlong sequences | 174 | -- overlong sequences |
162 | invalid("\xC0\x80") -- zero | 175 | invalid("\xC0\x80") -- zero |
163 | invalid("\xC1\xBF") -- 0x7F (should be coded in 1 byte) | 176 | invalid("\xC1\xBF") -- 0x7F (should be coded in 1 byte) |
@@ -183,6 +196,21 @@ s = "\0 \x7F\z | |||
183 | s = string.gsub(s, " ", "") | 196 | s = string.gsub(s, " ", "") |
184 | check(s, {0,0x7F, 0x80,0x7FF, 0x800,0xFFFF, 0x10000,0x10FFFF}) | 197 | check(s, {0,0x7F, 0x80,0x7FF, 0x800,0xFFFF, 0x10000,0x10FFFF}) |
185 | 198 | ||
199 | do | ||
200 | -- original UTF-8 values | ||
201 | local s = "\u{4000000}\u{7FFFFFFF}" | ||
202 | assert(#s == 12) | ||
203 | check(s, {0x4000000, 0x7FFFFFFF}, true) | ||
204 | |||
205 | s = "\u{200000}\u{3FFFFFF}" | ||
206 | assert(#s == 10) | ||
207 | check(s, {0x200000, 0x3FFFFFF}, true) | ||
208 | |||
209 | s = "\u{10000}\u{1fffff}" | ||
210 | assert(#s == 8) | ||
211 | check(s, {0x10000, 0x1FFFFF}, true) | ||
212 | end | ||
213 | |||
186 | x = "日本語a-4\0éó" | 214 | x = "日本語a-4\0éó" |
187 | check(x, {26085, 26412, 35486, 97, 45, 52, 0, 233, 243}) | 215 | check(x, {26085, 26412, 35486, 97, 45, 52, 0, 233, 243}) |
188 | 216 | ||