aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRoberto Ierusalimschy <roberto@inf.puc-rio.br>2019-03-15 13:14:17 -0300
committerRoberto Ierusalimschy <roberto@inf.puc-rio.br>2019-03-15 13:14:17 -0300
commit1e0c73d5b643707335b06abd2546a83d9439d14c (patch)
treeb80b7d5e2cfeeef888ddf98fcc6276832134c1bf
parent8fa4f1380b9a203bfdf002c2e9e9e13ebb8384c1 (diff)
downloadlua-1e0c73d5b643707335b06abd2546a83d9439d14c.tar.gz
lua-1e0c73d5b643707335b06abd2546a83d9439d14c.tar.bz2
lua-1e0c73d5b643707335b06abd2546a83d9439d14c.zip
Changes in the validation of UTF-8
All UTF-8 encoding functionality (including the escape sequence '\u') accepts all values from the original UTF-8 specification (with sequences of up to six bytes). By default, the decoding functions in the UTF-8 library do not accept invalid Unicode code points, such as surrogates. A new parameter 'nonstrict' makes them accept all code points up to (2^31)-1, as in the original UTF-8 specification.
-rw-r--r--llex.c2
-rw-r--r--lobject.c6
-rw-r--r--lutf8lib.c76
-rw-r--r--manual/manual.of43
-rw-r--r--testes/literals.lua17
-rw-r--r--testes/utf8.lua92
6 files changed, 164 insertions, 72 deletions
diff --git a/llex.c b/llex.c
index 38c6d92d..1539f525 100644
--- a/llex.c
+++ b/llex.c
@@ -335,7 +335,7 @@ static unsigned long readutf8esc (LexState *ls) {
335 while ((save_and_next(ls), lisxdigit(ls->current))) { 335 while ((save_and_next(ls), lisxdigit(ls->current))) {
336 i++; 336 i++;
337 r = (r << 4) + luaO_hexavalue(ls->current); 337 r = (r << 4) + luaO_hexavalue(ls->current);
338 esccheck(ls, r <= 0x10FFFF, "UTF-8 value too large"); 338 esccheck(ls, r <= 0x7FFFFFFFu, "UTF-8 value too large");
339 } 339 }
340 esccheck(ls, ls->current == '}', "missing '}'"); 340 esccheck(ls, ls->current == '}', "missing '}'");
341 next(ls); /* skip '}' */ 341 next(ls); /* skip '}' */
diff --git a/lobject.c b/lobject.c
index 3ce052c2..5d340de6 100644
--- a/lobject.c
+++ b/lobject.c
@@ -343,7 +343,7 @@ size_t luaO_str2num (const char *s, TValue *o) {
343 343
344int luaO_utf8esc (char *buff, unsigned long x) { 344int luaO_utf8esc (char *buff, unsigned long x) {
345 int n = 1; /* number of bytes put in buffer (backwards) */ 345 int n = 1; /* number of bytes put in buffer (backwards) */
346 lua_assert(x <= 0x10FFFF); 346 lua_assert(x <= 0x7FFFFFFFu);
347 if (x < 0x80) /* ascii? */ 347 if (x < 0x80) /* ascii? */
348 buff[UTF8BUFFSZ - 1] = cast_char(x); 348 buff[UTF8BUFFSZ - 1] = cast_char(x);
349 else { /* need continuation bytes */ 349 else { /* need continuation bytes */
@@ -435,9 +435,9 @@ const char *luaO_pushvfstring (lua_State *L, const char *fmt, va_list argp) {
435 pushstr(L, buff, l); 435 pushstr(L, buff, l);
436 break; 436 break;
437 } 437 }
438 case 'U': { /* an 'int' as a UTF-8 sequence */ 438 case 'U': { /* a 'long' as a UTF-8 sequence */
439 char buff[UTF8BUFFSZ]; 439 char buff[UTF8BUFFSZ];
440 int l = luaO_utf8esc(buff, cast(long, va_arg(argp, long))); 440 int l = luaO_utf8esc(buff, va_arg(argp, long));
441 pushstr(L, buff + UTF8BUFFSZ - l, l); 441 pushstr(L, buff + UTF8BUFFSZ - l, l);
442 break; 442 break;
443 } 443 }
diff --git a/lutf8lib.c b/lutf8lib.c
index dc95b285..ec711c9a 100644
--- a/lutf8lib.c
+++ b/lutf8lib.c
@@ -21,12 +21,14 @@
21#include "lualib.h" 21#include "lualib.h"
22 22
23 23
24#define MAXUNICODE 0x10FFFF 24#define MAXUNICODE 0x10FFFFu
25
26#define MAXUTF 0x7FFFFFFFu
25 27
26/* 28/*
27** Integer type for decoded UTF-8 values; MAXUNICODE needs 21 bits. 29** Integer type for decoded UTF-8 values; MAXUTF needs 31 bits.
28*/ 30*/
29#if LUAI_BITSINT >= 21 31#if LUAI_BITSINT >= 31
30typedef unsigned int utfint; 32typedef unsigned int utfint;
31#else 33#else
32typedef unsigned long utfint; 34typedef unsigned long utfint;
@@ -46,38 +48,46 @@ static lua_Integer u_posrelat (lua_Integer pos, size_t len) {
46 48
47 49
48/* 50/*
49** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid. 51** Decode one UTF-8 sequence, returning NULL if byte sequence is
52** invalid. The array 'limits' stores the minimum value for each
53** sequence length, to check for overlong representations. Its first
54** entry forces an error for non-ascii bytes with no continuation
55** bytes (count == 0).
50*/ 56*/
51static const char *utf8_decode (const char *o, utfint *val) { 57static const char *utf8_decode (const char *s, utfint *val, int strict) {
52 static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFF}; 58 static const utfint limits[] =
53 const unsigned char *s = (const unsigned char *)o; 59 {~(utfint)0, 0x80, 0x800, 0x10000u, 0x200000u, 0x4000000u};
54 unsigned int c = s[0]; 60 unsigned int c = (unsigned char)s[0];
55 utfint res = 0; /* final result */ 61 utfint res = 0; /* final result */
56 if (c < 0x80) /* ascii? */ 62 if (c < 0x80) /* ascii? */
57 res = c; 63 res = c;
58 else { 64 else {
59 int count = 0; /* to count number of continuation bytes */ 65 int count = 0; /* to count number of continuation bytes */
60 while (c & 0x40) { /* still have continuation bytes? */ 66 for (; c & 0x40; c <<= 1) { /* while it needs continuation bytes... */
61 int cc = s[++count]; /* read next byte */ 67 unsigned int cc = (unsigned char)s[++count]; /* read next byte */
62 if ((cc & 0xC0) != 0x80) /* not a continuation byte? */ 68 if ((cc & 0xC0) != 0x80) /* not a continuation byte? */
63 return NULL; /* invalid byte sequence */ 69 return NULL; /* invalid byte sequence */
64 res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */ 70 res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */
65 c <<= 1; /* to test next bit */
66 } 71 }
67 res |= ((utfint)(c & 0x7F) << (count * 5)); /* add first byte */ 72 res |= ((utfint)(c & 0x7F) << (count * 5)); /* add first byte */
68 if (count > 3 || res > MAXUNICODE || res <= limits[count]) 73 if (count > 5 || res > MAXUTF || res < limits[count])
69 return NULL; /* invalid byte sequence */ 74 return NULL; /* invalid byte sequence */
70 s += count; /* skip continuation bytes read */ 75 s += count; /* skip continuation bytes read */
71 } 76 }
77 if (strict) {
78 /* check for invalid code points; too large or surrogates */
79 if (res > MAXUNICODE || (0xD800u <= res && res <= 0xDFFFu))
80 return NULL;
81 }
72 if (val) *val = res; 82 if (val) *val = res;
73 return (const char *)s + 1; /* +1 to include first byte */ 83 return s + 1; /* +1 to include first byte */
74} 84}
75 85
76 86
77/* 87/*
78** utf8len(s [, i [, j]]) --> number of characters that start in the 88** utf8len(s [, i [, j [, nonstrict]]]) --> number of characters that
79** range [i,j], or nil + current position if 's' is not well formed in 89** start in the range [i,j], or nil + current position if 's' is not
80** that interval 90** well formed in that interval
81*/ 91*/
82static int utflen (lua_State *L) { 92static int utflen (lua_State *L) {
83 lua_Integer n = 0; /* counter for the number of characters */ 93 lua_Integer n = 0; /* counter for the number of characters */
@@ -85,12 +95,13 @@ static int utflen (lua_State *L) {
85 const char *s = luaL_checklstring(L, 1, &len); 95 const char *s = luaL_checklstring(L, 1, &len);
86 lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len); 96 lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
87 lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len); 97 lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len);
98 int nonstrict = lua_toboolean(L, 4);
88 luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2, 99 luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2,
89 "initial position out of string"); 100 "initial position out of string");
90 luaL_argcheck(L, --posj < (lua_Integer)len, 3, 101 luaL_argcheck(L, --posj < (lua_Integer)len, 3,
91 "final position out of string"); 102 "final position out of string");
92 while (posi <= posj) { 103 while (posi <= posj) {
93 const char *s1 = utf8_decode(s + posi, NULL); 104 const char *s1 = utf8_decode(s + posi, NULL, !nonstrict);
94 if (s1 == NULL) { /* conversion error? */ 105 if (s1 == NULL) { /* conversion error? */
95 lua_pushnil(L); /* return nil ... */ 106 lua_pushnil(L); /* return nil ... */
96 lua_pushinteger(L, posi + 1); /* ... and current position */ 107 lua_pushinteger(L, posi + 1); /* ... and current position */
@@ -105,14 +116,15 @@ static int utflen (lua_State *L) {
105 116
106 117
107/* 118/*
108** codepoint(s, [i, [j]]) -> returns codepoints for all characters 119** codepoint(s, [i, [j [, nonstrict]]]) -> returns codepoints for all
109** that start in the range [i,j] 120** characters that start in the range [i,j]
110*/ 121*/
111static int codepoint (lua_State *L) { 122static int codepoint (lua_State *L) {
112 size_t len; 123 size_t len;
113 const char *s = luaL_checklstring(L, 1, &len); 124 const char *s = luaL_checklstring(L, 1, &len);
114 lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len); 125 lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
115 lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len); 126 lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len);
127 int nonstrict = lua_toboolean(L, 4);
116 int n; 128 int n;
117 const char *se; 129 const char *se;
118 luaL_argcheck(L, posi >= 1, 2, "out of range"); 130 luaL_argcheck(L, posi >= 1, 2, "out of range");
@@ -126,7 +138,7 @@ static int codepoint (lua_State *L) {
126 se = s + pose; /* string end */ 138 se = s + pose; /* string end */
127 for (s += posi - 1; s < se;) { 139 for (s += posi - 1; s < se;) {
128 utfint code; 140 utfint code;
129 s = utf8_decode(s, &code); 141 s = utf8_decode(s, &code, !nonstrict);
130 if (s == NULL) 142 if (s == NULL)
131 return luaL_error(L, "invalid UTF-8 code"); 143 return luaL_error(L, "invalid UTF-8 code");
132 lua_pushinteger(L, code); 144 lua_pushinteger(L, code);
@@ -137,8 +149,8 @@ static int codepoint (lua_State *L) {
137 149
138 150
139static void pushutfchar (lua_State *L, int arg) { 151static void pushutfchar (lua_State *L, int arg) {
140 lua_Integer code = luaL_checkinteger(L, arg); 152 lua_Unsigned code = (lua_Unsigned)luaL_checkinteger(L, arg);
141 luaL_argcheck(L, 0 <= code && code <= MAXUNICODE, arg, "value out of range"); 153 luaL_argcheck(L, code <= MAXUTF, arg, "value out of range");
142 lua_pushfstring(L, "%U", (long)code); 154 lua_pushfstring(L, "%U", (long)code);
143} 155}
144 156
@@ -209,7 +221,7 @@ static int byteoffset (lua_State *L) {
209} 221}
210 222
211 223
212static int iter_aux (lua_State *L) { 224static int iter_aux (lua_State *L, int strict) {
213 size_t len; 225 size_t len;
214 const char *s = luaL_checklstring(L, 1, &len); 226 const char *s = luaL_checklstring(L, 1, &len);
215 lua_Integer n = lua_tointeger(L, 2) - 1; 227 lua_Integer n = lua_tointeger(L, 2) - 1;
@@ -223,8 +235,8 @@ static int iter_aux (lua_State *L) {
223 return 0; /* no more codepoints */ 235 return 0; /* no more codepoints */
224 else { 236 else {
225 utfint code; 237 utfint code;
226 const char *next = utf8_decode(s + n, &code); 238 const char *next = utf8_decode(s + n, &code, strict);
227 if (next == NULL || iscont(next)) 239 if (next == NULL)
228 return luaL_error(L, "invalid UTF-8 code"); 240 return luaL_error(L, "invalid UTF-8 code");
229 lua_pushinteger(L, n + 1); 241 lua_pushinteger(L, n + 1);
230 lua_pushinteger(L, code); 242 lua_pushinteger(L, code);
@@ -233,9 +245,19 @@ static int iter_aux (lua_State *L) {
233} 245}
234 246
235 247
248static int iter_auxstrict (lua_State *L) {
249 return iter_aux(L, 1);
250}
251
252static int iter_auxnostrict (lua_State *L) {
253 return iter_aux(L, 0);
254}
255
256
236static int iter_codes (lua_State *L) { 257static int iter_codes (lua_State *L) {
258 int nonstrict = lua_toboolean(L, 2);
237 luaL_checkstring(L, 1); 259 luaL_checkstring(L, 1);
238 lua_pushcfunction(L, iter_aux); 260 lua_pushcfunction(L, nonstrict ? iter_auxnostrict : iter_auxstrict);
239 lua_pushvalue(L, 1); 261 lua_pushvalue(L, 1);
240 lua_pushinteger(L, 0); 262 lua_pushinteger(L, 0);
241 return 3; 263 return 3;
@@ -243,7 +265,7 @@ static int iter_codes (lua_State *L) {
243 265
244 266
245/* pattern to match a single UTF-8 character */ 267/* pattern to match a single UTF-8 character */
246#define UTF8PATT "[\0-\x7F\xC2-\xF4][\x80-\xBF]*" 268#define UTF8PATT "[\0-\x7F\xC2-\xFD][\x80-\xBF]*"
247 269
248 270
249static const luaL_Reg funcs[] = { 271static const luaL_Reg funcs[] = {
diff --git a/manual/manual.of b/manual/manual.of
index 1e4ca857..8a8ebad5 100644
--- a/manual/manual.of
+++ b/manual/manual.of
@@ -1004,6 +1004,8 @@ the escape sequence @T{\u{@rep{XXX}}}
1004(note the mandatory enclosing brackets), 1004(note the mandatory enclosing brackets),
1005where @rep{XXX} is a sequence of one or more hexadecimal digits 1005where @rep{XXX} is a sequence of one or more hexadecimal digits
1006representing the character code point. 1006representing the character code point.
1007This code point can be any value smaller than @M{2@sp{31}}.
1008(Lua uses the original UTF-8 specification here.)
1007 1009
1008Literal strings can also be defined using a long format 1010Literal strings can also be defined using a long format
1009enclosed by @def{long brackets}. 1011enclosed by @def{long brackets}.
@@ -6899,6 +6901,7 @@ x = string.gsub("$name-$version.tar.gz", "%$(%w+)", t)
6899} 6901}
6900 6902
6901@LibEntry{string.len (s)| 6903@LibEntry{string.len (s)|
6904
6902Receives a string and returns its length. 6905Receives a string and returns its length.
6903The empty string @T{""} has length 0. 6906The empty string @T{""} has length 0.
6904Embedded zeros are counted, 6907Embedded zeros are counted,
@@ -6907,6 +6910,7 @@ so @T{"a\000bc\000"} has length 5.
6907} 6910}
6908 6911
6909@LibEntry{string.lower (s)| 6912@LibEntry{string.lower (s)|
6913
6910Receives a string and returns a copy of this string with all 6914Receives a string and returns a copy of this string with all
6911uppercase letters changed to lowercase. 6915uppercase letters changed to lowercase.
6912All other characters are left unchanged. 6916All other characters are left unchanged.
@@ -6915,6 +6919,7 @@ The definition of what an uppercase letter is depends on the current locale.
6915} 6919}
6916 6920
6917@LibEntry{string.match (s, pattern [, init])| 6921@LibEntry{string.match (s, pattern [, init])|
6922
6918Looks for the first @emph{match} of 6923Looks for the first @emph{match} of
6919@id{pattern} @see{pm} in the string @id{s}. 6924@id{pattern} @see{pm} in the string @id{s}.
6920If it finds one, then @id{match} returns 6925If it finds one, then @id{match} returns
@@ -6946,6 +6951,7 @@ The format string cannot have the variable-length options
6946} 6951}
6947 6952
6948@LibEntry{string.rep (s, n [, sep])| 6953@LibEntry{string.rep (s, n [, sep])|
6954
6949Returns a string that is the concatenation of @id{n} copies of 6955Returns a string that is the concatenation of @id{n} copies of
6950the string @id{s} separated by the string @id{sep}. 6956the string @id{s} separated by the string @id{sep}.
6951The default value for @id{sep} is the empty string 6957The default value for @id{sep} is the empty string
@@ -6958,11 +6964,13 @@ with a single call to this function.)
6958} 6964}
6959 6965
6960@LibEntry{string.reverse (s)| 6966@LibEntry{string.reverse (s)|
6967
6961Returns a string that is the string @id{s} reversed. 6968Returns a string that is the string @id{s} reversed.
6962 6969
6963} 6970}
6964 6971
6965@LibEntry{string.sub (s, i [, j])| 6972@LibEntry{string.sub (s, i [, j])|
6973
6966Returns the substring of @id{s} that 6974Returns the substring of @id{s} that
6967starts at @id{i} and continues until @id{j}; 6975starts at @id{i} and continues until @id{j};
6968@id{i} and @id{j} can be negative. 6976@id{i} and @id{j} can be negative.
@@ -6998,6 +7006,7 @@ this function also returns the index of the first unread byte in @id{s}.
6998} 7006}
6999 7007
7000@LibEntry{string.upper (s)| 7008@LibEntry{string.upper (s)|
7009
7001Receives a string and returns a copy of this string with all 7010Receives a string and returns a copy of this string with all
7002lowercase letters changed to uppercase. 7011lowercase letters changed to uppercase.
7003All other characters are left unchanged. 7012All other characters are left unchanged.
@@ -7318,8 +7327,24 @@ or one plus the length of the subject string.
7318As in the string library, 7327As in the string library,
7319negative indices count from the end of the string. 7328negative indices count from the end of the string.
7320 7329
7330Functions that create byte sequences
7331accept all values up to @T{0x7FFFFFFF},
7332as defined in the original UTF-8 specification;
7333that implies byte sequences of up to six bytes.
7334
7335Functions that interpret byte sequences only accept
7336valid sequences (well formed and not overlong).
7337By default, they only accept byte sequences
7338that result in valid Unicode code points,
7339rejecting values larger than @T{0x10FFFF} and surrogates.
7340A boolean argument @id{nonstrict}, when available,
7341lifts these checks,
7342so that all values up to @T{0x7FFFFFFF} are accepted.
7343(Sequences that are not well formed or that are overlong are still rejected.)
7344
7321 7345
7322@LibEntry{utf8.char (@Cdots)| 7346@LibEntry{utf8.char (@Cdots)|
7347
7323Receives zero or more integers, 7348Receives zero or more integers,
7324converts each one to its corresponding UTF-8 byte sequence 7349converts each one to its corresponding UTF-8 byte sequence
7325and returns a string with the concatenation of all these sequences. 7350and returns a string with the concatenation of all these sequences.
@@ -7327,14 +7352,15 @@ and returns a string with the concatenation of all these sequences.
7327} 7352}
7328 7353
7329@LibEntry{utf8.charpattern| 7354@LibEntry{utf8.charpattern|
7330The pattern (a string, not a function) @St{[\0-\x7F\xC2-\xF4][\x80-\xBF]*} 7355
7356The pattern (a string, not a function) @St{[\0-\x7F\xC2-\xFD][\x80-\xBF]*}
7331@see{pm}, 7357@see{pm},
7332which matches exactly one UTF-8 byte sequence, 7358which matches exactly one UTF-8 byte sequence,
7333assuming that the subject is a valid UTF-8 string. 7359assuming that the subject is a valid UTF-8 string.
7334 7360
7335} 7361}
7336 7362
7337@LibEntry{utf8.codes (s)| 7363@LibEntry{utf8.codes (s [, nonstrict])|
7338 7364
7339Returns values so that the construction 7365Returns values so that the construction
7340@verbatim{ 7366@verbatim{
@@ -7347,7 +7373,8 @@ It raises an error if it meets any invalid byte sequence.
7347 7373
7348} 7374}
7349 7375
7350@LibEntry{utf8.codepoint (s [, i [, j]])| 7376@LibEntry{utf8.codepoint (s [, i [, j [, nonstrict]]])|
7377
7351Returns the codepoints (as integers) from all characters in @id{s} 7378Returns the codepoints (as integers) from all characters in @id{s}
7352that start between byte position @id{i} and @id{j} (both included). 7379that start between byte position @id{i} and @id{j} (both included).
7353The default for @id{i} is 1 and for @id{j} is @id{i}. 7380The default for @id{i} is 1 and for @id{j} is @id{i}.
@@ -7355,7 +7382,8 @@ It raises an error if it meets any invalid byte sequence.
7355 7382
7356} 7383}
7357 7384
7358@LibEntry{utf8.len (s [, i [, j]])| 7385@LibEntry{utf8.len (s [, i [, j [, nonstrict]]])|
7386
7359Returns the number of UTF-8 characters in string @id{s} 7387Returns the number of UTF-8 characters in string @id{s}
7360that start between positions @id{i} and @id{j} (both inclusive). 7388that start between positions @id{i} and @id{j} (both inclusive).
7361The default for @id{i} is @num{1} and for @id{j} is @num{-1}. 7389The default for @id{i} is @num{1} and for @id{j} is @num{-1}.
@@ -7365,6 +7393,7 @@ returns a false value plus the position of the first invalid byte.
7365} 7393}
7366 7394
7367@LibEntry{utf8.offset (s, n [, i])| 7395@LibEntry{utf8.offset (s, n [, i])|
7396
7368Returns the position (in bytes) where the encoding of the 7397Returns the position (in bytes) where the encoding of the
7369@id{n}-th character of @id{s} 7398@id{n}-th character of @id{s}
7370(counting from position @id{i}) starts. 7399(counting from position @id{i}) starts.
@@ -8755,6 +8784,12 @@ You can enclose the call in parentheses if you need to
8755discard these extra results. 8784discard these extra results.
8756} 8785}
8757 8786
8787@item{
8788By default, the decoding functions in the @Lid{utf8} library
8789do not accept surrogates as valid code points.
8790An extra parameter in these functions makes them more permissive.
8791}
8792
8758} 8793}
8759 8794
8760} 8795}
diff --git a/testes/literals.lua b/testes/literals.lua
index 76c08f12..fc45d4ad 100644
--- a/testes/literals.lua
+++ b/testes/literals.lua
@@ -56,16 +56,23 @@ assert("abc\z
56assert("\u{0}\u{00000000}\x00\0" == string.char(0, 0, 0, 0)) 56assert("\u{0}\u{00000000}\x00\0" == string.char(0, 0, 0, 0))
57 57
58-- limits for 1-byte sequences 58-- limits for 1-byte sequences
59assert("\u{0}\u{7F}" == "\x00\z\x7F") 59assert("\u{0}\u{7F}" == "\x00\x7F")
60 60
61-- limits for 2-byte sequences 61-- limits for 2-byte sequences
62assert("\u{80}\u{7FF}" == "\xC2\x80\z\xDF\xBF") 62assert("\u{80}\u{7FF}" == "\xC2\x80\xDF\xBF")
63 63
64-- limits for 3-byte sequences 64-- limits for 3-byte sequences
65assert("\u{800}\u{FFFF}" == "\xE0\xA0\x80\z\xEF\xBF\xBF") 65assert("\u{800}\u{FFFF}" == "\xE0\xA0\x80\xEF\xBF\xBF")
66 66
67-- limits for 4-byte sequences 67-- limits for 4-byte sequences
68assert("\u{10000}\u{10FFFF}" == "\xF0\x90\x80\x80\z\xF4\x8F\xBF\xBF") 68assert("\u{10000}\u{1FFFFF}" == "\xF0\x90\x80\x80\xF7\xBF\xBF\xBF")
69
70-- limits for 5-byte sequences
71assert("\u{200000}\u{3FFFFFF}" == "\xF8\x88\x80\x80\x80\xFB\xBF\xBF\xBF\xBF")
72
73-- limits for 6-byte sequences
74assert("\u{4000000}\u{7FFFFFFF}" ==
75 "\xFC\x84\x80\x80\x80\x80\xFD\xBF\xBF\xBF\xBF\xBF")
69 76
70 77
71-- Error in escape sequences 78-- Error in escape sequences
@@ -94,7 +101,7 @@ lexerror([["xyz\300"]], [[\300"]])
94lexerror([[" \256"]], [[\256"]]) 101lexerror([[" \256"]], [[\256"]])
95 102
96-- errors in UTF-8 sequences 103-- errors in UTF-8 sequences
97lexerror([["abc\u{110000}"]], [[abc\u{110000]]) -- too large 104lexerror([["abc\u{100000000}"]], [[abc\u{100000000]]) -- too large
98lexerror([["abc\u11r"]], [[abc\u1]]) -- missing '{' 105lexerror([["abc\u11r"]], [[abc\u1]]) -- missing '{'
99lexerror([["abc\u"]], [[abc\u"]]) -- missing '{' 106lexerror([["abc\u"]], [[abc\u"]]) -- missing '{'
100lexerror([["abc\u{11r"]], [[abc\u{11r]]) -- missing '}' 107lexerror([["abc\u{11r"]], [[abc\u{11r]]) -- missing '}'
diff --git a/testes/utf8.lua b/testes/utf8.lua
index 4b6a57fd..86ec1b00 100644
--- a/testes/utf8.lua
+++ b/testes/utf8.lua
@@ -21,62 +21,59 @@ local justone = "^" .. utf8.charpattern .. "$"
21 21
22-- 't' is the list of codepoints of 's' 22-- 't' is the list of codepoints of 's'
23local function checksyntax (s, t) 23local function checksyntax (s, t)
24 -- creates a string "return '\u{t[1]}...\u{t[n]}'"
24 local ts = {"return '"} 25 local ts = {"return '"}
25 for i = 1, #t do ts[i + 1] = string.format("\\u{%x}", t[i]) end 26 for i = 1, #t do ts[i + 1] = string.format("\\u{%x}", t[i]) end
26 ts[#t + 2] = "'" 27 ts[#t + 2] = "'"
27 ts = table.concat(ts) 28 ts = table.concat(ts)
29 -- its execution should result in 's'
28 assert(assert(load(ts))() == s) 30 assert(assert(load(ts))() == s)
29end 31end
30 32
31assert(utf8.offset("alo", 5) == nil) 33assert(utf8.offset("alo", 5) == nil)
32assert(utf8.offset("alo", -4) == nil) 34assert(utf8.offset("alo", -4) == nil)
33 35
34-- 't' is the list of codepoints of 's' 36-- 'check' makes several tests over the validity of string 's'.
35local function check (s, t) 37-- 't' is the list of codepoints of 's'.
36 local l = utf8.len(s) 38local function check (s, t, nonstrict)
39 local l = utf8.len(s, 1, -1, nonstrict)
37 assert(#t == l and len(s) == l) 40 assert(#t == l and len(s) == l)
38 assert(utf8.char(table.unpack(t)) == s) 41 assert(utf8.char(table.unpack(t)) == s) -- 't' and 's' are equivalent
39 42
40 assert(utf8.offset(s, 0) == 1) 43 assert(utf8.offset(s, 0) == 1)
41 44
42 checksyntax(s, t) 45 checksyntax(s, t)
43 46
44 local t1 = {utf8.codepoint(s, 1, -1)} 47 -- creates new table with all codepoints of 's'
48 local t1 = {utf8.codepoint(s, 1, -1, nonstrict)}
45 assert(#t == #t1) 49 assert(#t == #t1)
46 for i = 1, #t do assert(t[i] == t1[i]) end 50 for i = 1, #t do assert(t[i] == t1[i]) end -- 't' is equal to 't1'
47 51
48 for i = 1, l do 52 for i = 1, l do -- for all codepoints
49 local pi = utf8.offset(s, i) -- position of i-th char 53 local pi = utf8.offset(s, i) -- position of i-th char
50 local pi1 = utf8.offset(s, 2, pi) -- position of next char 54 local pi1 = utf8.offset(s, 2, pi) -- position of next char
51 assert(string.find(string.sub(s, pi, pi1 - 1), justone)) 55 assert(string.find(string.sub(s, pi, pi1 - 1), justone))
52 assert(utf8.offset(s, -1, pi1) == pi) 56 assert(utf8.offset(s, -1, pi1) == pi)
53 assert(utf8.offset(s, i - l - 1) == pi) 57 assert(utf8.offset(s, i - l - 1) == pi)
54 assert(pi1 - pi == #utf8.char(utf8.codepoint(s, pi))) 58 assert(pi1 - pi == #utf8.char(utf8.codepoint(s, pi, pi, nonstrict)))
55 for j = pi, pi1 - 1 do 59 for j = pi, pi1 - 1 do
56 assert(utf8.offset(s, 0, j) == pi) 60 assert(utf8.offset(s, 0, j) == pi)
57 end 61 end
58 for j = pi + 1, pi1 - 1 do 62 for j = pi + 1, pi1 - 1 do
59 assert(not utf8.len(s, j)) 63 assert(not utf8.len(s, j))
60 end 64 end
61 assert(utf8.len(s, pi, pi) == 1) 65 assert(utf8.len(s, pi, pi, nonstrict) == 1)
62 assert(utf8.len(s, pi, pi1 - 1) == 1) 66 assert(utf8.len(s, pi, pi1 - 1, nonstrict) == 1)
63 assert(utf8.len(s, pi) == l - i + 1) 67 assert(utf8.len(s, pi, -1, nonstrict) == l - i + 1)
64 assert(utf8.len(s, pi1) == l - i) 68 assert(utf8.len(s, pi1, -1, nonstrict) == l - i)
65 assert(utf8.len(s, 1, pi) == i) 69 assert(utf8.len(s, 1, pi, -1, nonstrict) == i)
66 end 70 end
67 71
68 local i = 0 72 local i = 0
69 for p, c in utf8.codes(s) do 73 for p, c in utf8.codes(s, nonstrict) do
70 i = i + 1 74 i = i + 1
71 assert(c == t[i] and p == utf8.offset(s, i)) 75 assert(c == t[i] and p == utf8.offset(s, i))
72 assert(utf8.codepoint(s, p) == c) 76 assert(utf8.codepoint(s, p, p, nonstrict) == c)
73 end
74 assert(i == #t)
75
76 i = 0
77 for p, c in utf8.codes(s) do
78 i = i + 1
79 assert(c == t[i] and p == utf8.offset(s, i))
80 end 77 end
81 assert(i == #t) 78 assert(i == #t)
82 79
@@ -105,13 +102,17 @@ do -- error indication in utf8.len
105 check("\xF4\x9F\xBF\xBF", 1) 102 check("\xF4\x9F\xBF\xBF", 1)
106end 103end
107 104
108-- error in utf8.codes 105-- errors in utf8.codes
109checkerror("invalid UTF%-8 code", 106do
110 function () 107 local function errorcodes (s)
111 local s = "ab\xff" 108 checkerror("invalid UTF%-8 code",
112 for c in utf8.codes(s) do assert(c) end 109 function ()
113 end) 110 for c in utf8.codes(s) do assert(c) end
114 111 end)
112 end
113 errorcodes("ab\xff")
114 errorcodes("\u{110000}")
115end
115 116
116-- error in initial position for offset 117-- error in initial position for offset
117checkerror("position out of range", utf8.offset, "abc", 1, 5) 118checkerror("position out of range", utf8.offset, "abc", 1, 5)
@@ -141,14 +142,22 @@ do
141 assert(#t == 0) 142 assert(#t == 0)
142 checkerror("out of range", utf8.codepoint, s, -(#s + 1), 1) 143 checkerror("out of range", utf8.codepoint, s, -(#s + 1), 1)
143 checkerror("out of range", utf8.codepoint, s, 1, #s + 1) 144 checkerror("out of range", utf8.codepoint, s, 1, #s + 1)
145 -- surrogates
146 assert(utf8.codepoint("\u{D7FF}") == 0xD800 - 1)
147 assert(utf8.codepoint("\u{E000}") == 0xDFFF + 1)
148 assert(utf8.codepoint("\u{D800}", 1, 1, true) == 0xD800)
149 assert(utf8.codepoint("\u{DFFF}", 1, 1, true) == 0xDFFF)
150 assert(utf8.codepoint("\u{7FFFFFFF}", 1, 1, true) == 0x7FFFFFFF)
144end 151end
145 152
146assert(utf8.char() == "") 153assert(utf8.char() == "")
147assert(utf8.char(97, 98, 99) == "abc") 154assert(utf8.char(0, 97, 98, 99, 1) == "\0abc\1")
148 155
149assert(utf8.codepoint(utf8.char(0x10FFFF)) == 0x10FFFF) 156assert(utf8.codepoint(utf8.char(0x10FFFF)) == 0x10FFFF)
157assert(utf8.codepoint(utf8.char(0x7FFFFFFF), 1, 1, true) == (1<<31) - 1)
150 158
151checkerror("value out of range", utf8.char, 0x10FFFF + 1) 159checkerror("value out of range", utf8.char, 0x7FFFFFFF + 1)
160checkerror("value out of range", utf8.char, -1)
152 161
153local function invalid (s) 162local function invalid (s)
154 checkerror("invalid UTF%-8 code", utf8.codepoint, s) 163 checkerror("invalid UTF%-8 code", utf8.codepoint, s)
@@ -158,6 +167,10 @@ end
158-- UTF-8 representation for 0x11ffff (value out of valid range) 167-- UTF-8 representation for 0x11ffff (value out of valid range)
159invalid("\xF4\x9F\xBF\xBF") 168invalid("\xF4\x9F\xBF\xBF")
160 169
170-- surrogates
171invalid("\u{D800}")
172invalid("\u{DFFF}")
173
161-- overlong sequences 174-- overlong sequences
162invalid("\xC0\x80") -- zero 175invalid("\xC0\x80") -- zero
163invalid("\xC1\xBF") -- 0x7F (should be coded in 1 byte) 176invalid("\xC1\xBF") -- 0x7F (should be coded in 1 byte)
@@ -183,6 +196,21 @@ s = "\0 \x7F\z
183s = string.gsub(s, " ", "") 196s = string.gsub(s, " ", "")
184check(s, {0,0x7F, 0x80,0x7FF, 0x800,0xFFFF, 0x10000,0x10FFFF}) 197check(s, {0,0x7F, 0x80,0x7FF, 0x800,0xFFFF, 0x10000,0x10FFFF})
185 198
199do
200 -- original UTF-8 values
201 local s = "\u{4000000}\u{7FFFFFFF}"
202 assert(#s == 12)
203 check(s, {0x4000000, 0x7FFFFFFF}, true)
204
205 s = "\u{200000}\u{3FFFFFF}"
206 assert(#s == 10)
207 check(s, {0x200000, 0x3FFFFFF}, true)
208
209 s = "\u{10000}\u{1fffff}"
210 assert(#s == 8)
211 check(s, {0x10000, 0x1FFFFF}, true)
212end
213
186x = "日本語a-4\0éó" 214x = "日本語a-4\0éó"
187check(x, {26085, 26412, 35486, 97, 45, 52, 0, 233, 243}) 215check(x, {26085, 26412, 35486, 97, 45, 52, 0, 233, 243})
188 216