diff options
| author | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2019-03-15 13:14:17 -0300 |
|---|---|---|
| committer | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2019-03-15 13:14:17 -0300 |
| commit | 1e0c73d5b643707335b06abd2546a83d9439d14c (patch) | |
| tree | b80b7d5e2cfeeef888ddf98fcc6276832134c1bf /testes | |
| parent | 8fa4f1380b9a203bfdf002c2e9e9e13ebb8384c1 (diff) | |
| download | lua-1e0c73d5b643707335b06abd2546a83d9439d14c.tar.gz lua-1e0c73d5b643707335b06abd2546a83d9439d14c.tar.bz2 lua-1e0c73d5b643707335b06abd2546a83d9439d14c.zip | |
Changes in the validation of UTF-8
All UTF-8 encoding functionality (including the escape
sequence '\u') accepts all values from the original UTF-8
specification (with sequences of up to six bytes).
By default, the decoding functions in the UTF-8 library do not
accept invalid Unicode code points, such as surrogates. A new
parameter 'nonstrict' makes them accept all code points up to
(2^31)-1, as in the original UTF-8 specification.
Diffstat (limited to 'testes')
| -rw-r--r-- | testes/literals.lua | 17 | ||||
| -rw-r--r-- | testes/utf8.lua | 92 |
2 files changed, 72 insertions, 37 deletions
diff --git a/testes/literals.lua b/testes/literals.lua index 76c08f12..fc45d4ad 100644 --- a/testes/literals.lua +++ b/testes/literals.lua | |||
| @@ -56,16 +56,23 @@ assert("abc\z | |||
| 56 | assert("\u{0}\u{00000000}\x00\0" == string.char(0, 0, 0, 0)) | 56 | assert("\u{0}\u{00000000}\x00\0" == string.char(0, 0, 0, 0)) |
| 57 | 57 | ||
| 58 | -- limits for 1-byte sequences | 58 | -- limits for 1-byte sequences |
| 59 | assert("\u{0}\u{7F}" == "\x00\z\x7F") | 59 | assert("\u{0}\u{7F}" == "\x00\x7F") |
| 60 | 60 | ||
| 61 | -- limits for 2-byte sequences | 61 | -- limits for 2-byte sequences |
| 62 | assert("\u{80}\u{7FF}" == "\xC2\x80\z\xDF\xBF") | 62 | assert("\u{80}\u{7FF}" == "\xC2\x80\xDF\xBF") |
| 63 | 63 | ||
| 64 | -- limits for 3-byte sequences | 64 | -- limits for 3-byte sequences |
| 65 | assert("\u{800}\u{FFFF}" == "\xE0\xA0\x80\z\xEF\xBF\xBF") | 65 | assert("\u{800}\u{FFFF}" == "\xE0\xA0\x80\xEF\xBF\xBF") |
| 66 | 66 | ||
| 67 | -- limits for 4-byte sequences | 67 | -- limits for 4-byte sequences |
| 68 | assert("\u{10000}\u{10FFFF}" == "\xF0\x90\x80\x80\z\xF4\x8F\xBF\xBF") | 68 | assert("\u{10000}\u{1FFFFF}" == "\xF0\x90\x80\x80\xF7\xBF\xBF\xBF") |
| 69 | |||
| 70 | -- limits for 5-byte sequences | ||
| 71 | assert("\u{200000}\u{3FFFFFF}" == "\xF8\x88\x80\x80\x80\xFB\xBF\xBF\xBF\xBF") | ||
| 72 | |||
| 73 | -- limits for 6-byte sequences | ||
| 74 | assert("\u{4000000}\u{7FFFFFFF}" == | ||
| 75 | "\xFC\x84\x80\x80\x80\x80\xFD\xBF\xBF\xBF\xBF\xBF") | ||
| 69 | 76 | ||
| 70 | 77 | ||
| 71 | -- Error in escape sequences | 78 | -- Error in escape sequences |
| @@ -94,7 +101,7 @@ lexerror([["xyz\300"]], [[\300"]]) | |||
| 94 | lexerror([[" \256"]], [[\256"]]) | 101 | lexerror([[" \256"]], [[\256"]]) |
| 95 | 102 | ||
| 96 | -- errors in UTF-8 sequences | 103 | -- errors in UTF-8 sequences |
| 97 | lexerror([["abc\u{110000}"]], [[abc\u{110000]]) -- too large | 104 | lexerror([["abc\u{100000000}"]], [[abc\u{100000000]]) -- too large |
| 98 | lexerror([["abc\u11r"]], [[abc\u1]]) -- missing '{' | 105 | lexerror([["abc\u11r"]], [[abc\u1]]) -- missing '{' |
| 99 | lexerror([["abc\u"]], [[abc\u"]]) -- missing '{' | 106 | lexerror([["abc\u"]], [[abc\u"]]) -- missing '{' |
| 100 | lexerror([["abc\u{11r"]], [[abc\u{11r]]) -- missing '}' | 107 | lexerror([["abc\u{11r"]], [[abc\u{11r]]) -- missing '}' |
diff --git a/testes/utf8.lua b/testes/utf8.lua index 4b6a57fd..86ec1b00 100644 --- a/testes/utf8.lua +++ b/testes/utf8.lua | |||
| @@ -21,62 +21,59 @@ local justone = "^" .. utf8.charpattern .. "$" | |||
| 21 | 21 | ||
| 22 | -- 't' is the list of codepoints of 's' | 22 | -- 't' is the list of codepoints of 's' |
| 23 | local function checksyntax (s, t) | 23 | local function checksyntax (s, t) |
| 24 | -- creates a string "return '\u{t[1]}...\u{t[n]}'" | ||
| 24 | local ts = {"return '"} | 25 | local ts = {"return '"} |
| 25 | for i = 1, #t do ts[i + 1] = string.format("\\u{%x}", t[i]) end | 26 | for i = 1, #t do ts[i + 1] = string.format("\\u{%x}", t[i]) end |
| 26 | ts[#t + 2] = "'" | 27 | ts[#t + 2] = "'" |
| 27 | ts = table.concat(ts) | 28 | ts = table.concat(ts) |
| 29 | -- its execution should result in 's' | ||
| 28 | assert(assert(load(ts))() == s) | 30 | assert(assert(load(ts))() == s) |
| 29 | end | 31 | end |
| 30 | 32 | ||
| 31 | assert(utf8.offset("alo", 5) == nil) | 33 | assert(utf8.offset("alo", 5) == nil) |
| 32 | assert(utf8.offset("alo", -4) == nil) | 34 | assert(utf8.offset("alo", -4) == nil) |
| 33 | 35 | ||
| 34 | -- 't' is the list of codepoints of 's' | 36 | -- 'check' makes several tests over the validity of string 's'. |
| 35 | local function check (s, t) | 37 | -- 't' is the list of codepoints of 's'. |
| 36 | local l = utf8.len(s) | 38 | local function check (s, t, nonstrict) |
| 39 | local l = utf8.len(s, 1, -1, nonstrict) | ||
| 37 | assert(#t == l and len(s) == l) | 40 | assert(#t == l and len(s) == l) |
| 38 | assert(utf8.char(table.unpack(t)) == s) | 41 | assert(utf8.char(table.unpack(t)) == s) -- 't' and 's' are equivalent |
| 39 | 42 | ||
| 40 | assert(utf8.offset(s, 0) == 1) | 43 | assert(utf8.offset(s, 0) == 1) |
| 41 | 44 | ||
| 42 | checksyntax(s, t) | 45 | checksyntax(s, t) |
| 43 | 46 | ||
| 44 | local t1 = {utf8.codepoint(s, 1, -1)} | 47 | -- creates new table with all codepoints of 's' |
| 48 | local t1 = {utf8.codepoint(s, 1, -1, nonstrict)} | ||
| 45 | assert(#t == #t1) | 49 | assert(#t == #t1) |
| 46 | for i = 1, #t do assert(t[i] == t1[i]) end | 50 | for i = 1, #t do assert(t[i] == t1[i]) end -- 't' is equal to 't1' |
| 47 | 51 | ||
| 48 | for i = 1, l do | 52 | for i = 1, l do -- for all codepoints |
| 49 | local pi = utf8.offset(s, i) -- position of i-th char | 53 | local pi = utf8.offset(s, i) -- position of i-th char |
| 50 | local pi1 = utf8.offset(s, 2, pi) -- position of next char | 54 | local pi1 = utf8.offset(s, 2, pi) -- position of next char |
| 51 | assert(string.find(string.sub(s, pi, pi1 - 1), justone)) | 55 | assert(string.find(string.sub(s, pi, pi1 - 1), justone)) |
| 52 | assert(utf8.offset(s, -1, pi1) == pi) | 56 | assert(utf8.offset(s, -1, pi1) == pi) |
| 53 | assert(utf8.offset(s, i - l - 1) == pi) | 57 | assert(utf8.offset(s, i - l - 1) == pi) |
| 54 | assert(pi1 - pi == #utf8.char(utf8.codepoint(s, pi))) | 58 | assert(pi1 - pi == #utf8.char(utf8.codepoint(s, pi, pi, nonstrict))) |
| 55 | for j = pi, pi1 - 1 do | 59 | for j = pi, pi1 - 1 do |
| 56 | assert(utf8.offset(s, 0, j) == pi) | 60 | assert(utf8.offset(s, 0, j) == pi) |
| 57 | end | 61 | end |
| 58 | for j = pi + 1, pi1 - 1 do | 62 | for j = pi + 1, pi1 - 1 do |
| 59 | assert(not utf8.len(s, j)) | 63 | assert(not utf8.len(s, j)) |
| 60 | end | 64 | end |
| 61 | assert(utf8.len(s, pi, pi) == 1) | 65 | assert(utf8.len(s, pi, pi, nonstrict) == 1) |
| 62 | assert(utf8.len(s, pi, pi1 - 1) == 1) | 66 | assert(utf8.len(s, pi, pi1 - 1, nonstrict) == 1) |
| 63 | assert(utf8.len(s, pi) == l - i + 1) | 67 | assert(utf8.len(s, pi, -1, nonstrict) == l - i + 1) |
| 64 | assert(utf8.len(s, pi1) == l - i) | 68 | assert(utf8.len(s, pi1, -1, nonstrict) == l - i) |
| 65 | assert(utf8.len(s, 1, pi) == i) | 69 | assert(utf8.len(s, 1, pi, -1, nonstrict) == i) |
| 66 | end | 70 | end |
| 67 | 71 | ||
| 68 | local i = 0 | 72 | local i = 0 |
| 69 | for p, c in utf8.codes(s) do | 73 | for p, c in utf8.codes(s, nonstrict) do |
| 70 | i = i + 1 | 74 | i = i + 1 |
| 71 | assert(c == t[i] and p == utf8.offset(s, i)) | 75 | assert(c == t[i] and p == utf8.offset(s, i)) |
| 72 | assert(utf8.codepoint(s, p) == c) | 76 | assert(utf8.codepoint(s, p, p, nonstrict) == c) |
| 73 | end | ||
| 74 | assert(i == #t) | ||
| 75 | |||
| 76 | i = 0 | ||
| 77 | for p, c in utf8.codes(s) do | ||
| 78 | i = i + 1 | ||
| 79 | assert(c == t[i] and p == utf8.offset(s, i)) | ||
| 80 | end | 77 | end |
| 81 | assert(i == #t) | 78 | assert(i == #t) |
| 82 | 79 | ||
| @@ -105,13 +102,17 @@ do -- error indication in utf8.len | |||
| 105 | check("\xF4\x9F\xBF\xBF", 1) | 102 | check("\xF4\x9F\xBF\xBF", 1) |
| 106 | end | 103 | end |
| 107 | 104 | ||
| 108 | -- error in utf8.codes | 105 | -- errors in utf8.codes |
| 109 | checkerror("invalid UTF%-8 code", | 106 | do |
| 110 | function () | 107 | local function errorcodes (s) |
| 111 | local s = "ab\xff" | 108 | checkerror("invalid UTF%-8 code", |
| 112 | for c in utf8.codes(s) do assert(c) end | 109 | function () |
| 113 | end) | 110 | for c in utf8.codes(s) do assert(c) end |
| 114 | 111 | end) | |
| 112 | end | ||
| 113 | errorcodes("ab\xff") | ||
| 114 | errorcodes("\u{110000}") | ||
| 115 | end | ||
| 115 | 116 | ||
| 116 | -- error in initial position for offset | 117 | -- error in initial position for offset |
| 117 | checkerror("position out of range", utf8.offset, "abc", 1, 5) | 118 | checkerror("position out of range", utf8.offset, "abc", 1, 5) |
| @@ -141,14 +142,22 @@ do | |||
| 141 | assert(#t == 0) | 142 | assert(#t == 0) |
| 142 | checkerror("out of range", utf8.codepoint, s, -(#s + 1), 1) | 143 | checkerror("out of range", utf8.codepoint, s, -(#s + 1), 1) |
| 143 | checkerror("out of range", utf8.codepoint, s, 1, #s + 1) | 144 | checkerror("out of range", utf8.codepoint, s, 1, #s + 1) |
| 145 | -- surrogates | ||
| 146 | assert(utf8.codepoint("\u{D7FF}") == 0xD800 - 1) | ||
| 147 | assert(utf8.codepoint("\u{E000}") == 0xDFFF + 1) | ||
| 148 | assert(utf8.codepoint("\u{D800}", 1, 1, true) == 0xD800) | ||
| 149 | assert(utf8.codepoint("\u{DFFF}", 1, 1, true) == 0xDFFF) | ||
| 150 | assert(utf8.codepoint("\u{7FFFFFFF}", 1, 1, true) == 0x7FFFFFFF) | ||
| 144 | end | 151 | end |
| 145 | 152 | ||
| 146 | assert(utf8.char() == "") | 153 | assert(utf8.char() == "") |
| 147 | assert(utf8.char(97, 98, 99) == "abc") | 154 | assert(utf8.char(0, 97, 98, 99, 1) == "\0abc\1") |
| 148 | 155 | ||
| 149 | assert(utf8.codepoint(utf8.char(0x10FFFF)) == 0x10FFFF) | 156 | assert(utf8.codepoint(utf8.char(0x10FFFF)) == 0x10FFFF) |
| 157 | assert(utf8.codepoint(utf8.char(0x7FFFFFFF), 1, 1, true) == (1<<31) - 1) | ||
| 150 | 158 | ||
| 151 | checkerror("value out of range", utf8.char, 0x10FFFF + 1) | 159 | checkerror("value out of range", utf8.char, 0x7FFFFFFF + 1) |
| 160 | checkerror("value out of range", utf8.char, -1) | ||
| 152 | 161 | ||
| 153 | local function invalid (s) | 162 | local function invalid (s) |
| 154 | checkerror("invalid UTF%-8 code", utf8.codepoint, s) | 163 | checkerror("invalid UTF%-8 code", utf8.codepoint, s) |
| @@ -158,6 +167,10 @@ end | |||
| 158 | -- UTF-8 representation for 0x11ffff (value out of valid range) | 167 | -- UTF-8 representation for 0x11ffff (value out of valid range) |
| 159 | invalid("\xF4\x9F\xBF\xBF") | 168 | invalid("\xF4\x9F\xBF\xBF") |
| 160 | 169 | ||
| 170 | -- surrogates | ||
| 171 | invalid("\u{D800}") | ||
| 172 | invalid("\u{DFFF}") | ||
| 173 | |||
| 161 | -- overlong sequences | 174 | -- overlong sequences |
| 162 | invalid("\xC0\x80") -- zero | 175 | invalid("\xC0\x80") -- zero |
| 163 | invalid("\xC1\xBF") -- 0x7F (should be coded in 1 byte) | 176 | invalid("\xC1\xBF") -- 0x7F (should be coded in 1 byte) |
| @@ -183,6 +196,21 @@ s = "\0 \x7F\z | |||
| 183 | s = string.gsub(s, " ", "") | 196 | s = string.gsub(s, " ", "") |
| 184 | check(s, {0,0x7F, 0x80,0x7FF, 0x800,0xFFFF, 0x10000,0x10FFFF}) | 197 | check(s, {0,0x7F, 0x80,0x7FF, 0x800,0xFFFF, 0x10000,0x10FFFF}) |
| 185 | 198 | ||
| 199 | do | ||
| 200 | -- original UTF-8 values | ||
| 201 | local s = "\u{4000000}\u{7FFFFFFF}" | ||
| 202 | assert(#s == 12) | ||
| 203 | check(s, {0x4000000, 0x7FFFFFFF}, true) | ||
| 204 | |||
| 205 | s = "\u{200000}\u{3FFFFFF}" | ||
| 206 | assert(#s == 10) | ||
| 207 | check(s, {0x200000, 0x3FFFFFF}, true) | ||
| 208 | |||
| 209 | s = "\u{10000}\u{1fffff}" | ||
| 210 | assert(#s == 8) | ||
| 211 | check(s, {0x10000, 0x1FFFFF}, true) | ||
| 212 | end | ||
| 213 | |||
| 186 | x = "日本語a-4\0éó" | 214 | x = "日本語a-4\0éó" |
| 187 | check(x, {26085, 26412, 35486, 97, 45, 52, 0, 233, 243}) | 215 | check(x, {26085, 26412, 35486, 97, 45, 52, 0, 233, 243}) |
| 188 | 216 | ||
