about summary refs log tree commit diff
diff options
context:
space:
mode:
author Roberto Ierusalimschy <roberto@inf.puc-rio.br> 2022-09-23 10:41:16 -0300
committer Roberto Ierusalimschy <roberto@inf.puc-rio.br> 2022-09-23 10:41:16 -0300
commit a1089b415a3f5c753aa1b40758ffdaf28d5701b0 (patch)
tree c2bc42dd8b83e33d3159e6cc7f28daa2d2204b53
parent f8c4c4fcf2b2fed00b3c5b71c19cd64e539dee51 (diff)
download lua-a1089b415a3f5c753aa1b40758ffdaf28d5701b0.tar.gz
lua-a1089b415a3f5c753aa1b40758ffdaf28d5701b0.tar.bz2
lua-a1089b415a3f5c753aa1b40758ffdaf28d5701b0.zip
Bug: 'utf8.codes' accepts spurious continuation bytes
-rw-r--r-- lutf8lib.c 27
-rw-r--r-- testes/utf8.lua 12
2 files changed, 27 insertions, 12 deletions
diff --git a/lutf8lib.c b/lutf8lib.c
index e7bf098f..3a5b9bc3 100644
--- a/lutf8lib.c
+++ b/lutf8lib.c
@@ -25,6 +25,9 @@
25 25
26#define MAXUTF 0x7FFFFFFFu 26#define MAXUTF 0x7FFFFFFFu
27 27
28
29#define MSGInvalid "invalid UTF-8 code"
30
28/* 31/*
29** Integer type for decoded UTF-8 values; MAXUTF needs 31 bits. 32** Integer type for decoded UTF-8 values; MAXUTF needs 31 bits.
30*/ 33*/
@@ -35,7 +38,8 @@ typedef unsigned long utfint;
35#endif 38#endif
36 39
37 40
38#define iscont(p) ((*(p) & 0xC0) == 0x80) 41#define iscont(c) (((c) & 0xC0) == 0x80)
42#define iscontp(p) iscont(*(p))
39 43
40 44
41/* from strlib */ 45/* from strlib */
@@ -65,7 +69,7 @@ static const char *utf8_decode (const char *s, utfint *val, int strict) {
65 int count = 0; /* to count number of continuation bytes */ 69 int count = 0; /* to count number of continuation bytes */
66 for (; c & 0x40; c <<= 1) { /* while it needs continuation bytes... */ 70 for (; c & 0x40; c <<= 1) { /* while it needs continuation bytes... */
67 unsigned int cc = (unsigned char)s[++count]; /* read next byte */ 71 unsigned int cc = (unsigned char)s[++count]; /* read next byte */
68 if ((cc & 0xC0) != 0x80) /* not a continuation byte? */ 72 if (!iscont(cc)) /* not a continuation byte? */
69 return NULL; /* invalid byte sequence */ 73 return NULL; /* invalid byte sequence */
70 res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */ 74 res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */
71 } 75 }
@@ -140,7 +144,7 @@ static int codepoint (lua_State *L) {
140 utfint code; 144 utfint code;
141 s = utf8_decode(s, &code, !lax); 145 s = utf8_decode(s, &code, !lax);
142 if (s == NULL) 146 if (s == NULL)
143 return luaL_error(L, "invalid UTF-8 code"); 147 return luaL_error(L, MSGInvalid);
144 lua_pushinteger(L, code); 148 lua_pushinteger(L, code);
145 n++; 149 n++;
146 } 150 }
@@ -190,16 +194,16 @@ static int byteoffset (lua_State *L) {
190 "position out of bounds"); 194 "position out of bounds");
191 if (n == 0) { 195 if (n == 0) {
192 /* find beginning of current byte sequence */ 196 /* find beginning of current byte sequence */
193 while (posi > 0 && iscont(s + posi)) posi--; 197 while (posi > 0 && iscontp(s + posi)) posi--;
194 } 198 }
195 else { 199 else {
196 if (iscont(s + posi)) 200 if (iscontp(s + posi))
197 return luaL_error(L, "initial position is a continuation byte"); 201 return luaL_error(L, "initial position is a continuation byte");
198 if (n < 0) { 202 if (n < 0) {
199 while (n < 0 && posi > 0) { /* move back */ 203 while (n < 0 && posi > 0) { /* move back */
200 do { /* find beginning of previous character */ 204 do { /* find beginning of previous character */
201 posi--; 205 posi--;
202 } while (posi > 0 && iscont(s + posi)); 206 } while (posi > 0 && iscontp(s + posi));
203 n++; 207 n++;
204 } 208 }
205 } 209 }
@@ -208,7 +212,7 @@ static int byteoffset (lua_State *L) {
208 while (n > 0 && posi < (lua_Integer)len) { 212 while (n > 0 && posi < (lua_Integer)len) {
209 do { /* find beginning of next character */ 213 do { /* find beginning of next character */
210 posi++; 214 posi++;
211 } while (iscont(s + posi)); /* (cannot pass final '\0') */ 215 } while (iscontp(s + posi)); /* (cannot pass final '\0') */
212 n--; 216 n--;
213 } 217 }
214 } 218 }
@@ -226,15 +230,15 @@ static int iter_aux (lua_State *L, int strict) {
226 const char *s = luaL_checklstring(L, 1, &len); 230 const char *s = luaL_checklstring(L, 1, &len);
227 lua_Unsigned n = (lua_Unsigned)lua_tointeger(L, 2); 231 lua_Unsigned n = (lua_Unsigned)lua_tointeger(L, 2);
228 if (n < len) { 232 if (n < len) {
229 while (iscont(s + n)) n++; /* skip continuation bytes */ 233 while (iscontp(s + n)) n++; /* go to next character */
230 } 234 }
231 if (n >= len) /* (also handles original 'n' being negative) */ 235 if (n >= len) /* (also handles original 'n' being negative) */
232 return 0; /* no more codepoints */ 236 return 0; /* no more codepoints */
233 else { 237 else {
234 utfint code; 238 utfint code;
235 const char *next = utf8_decode(s + n, &code, strict); 239 const char *next = utf8_decode(s + n, &code, strict);
236 if (next == NULL) 240 if (next == NULL || iscontp(next))
237 return luaL_error(L, "invalid UTF-8 code"); 241 return luaL_error(L, MSGInvalid);
238 lua_pushinteger(L, n + 1); 242 lua_pushinteger(L, n + 1);
239 lua_pushinteger(L, code); 243 lua_pushinteger(L, code);
240 return 2; 244 return 2;
@@ -253,7 +257,8 @@ static int iter_auxlax (lua_State *L) {
253 257
254static int iter_codes (lua_State *L) { 258static int iter_codes (lua_State *L) {
255 int lax = lua_toboolean(L, 2); 259 int lax = lua_toboolean(L, 2);
256 luaL_checkstring(L, 1); 260 const char *s = luaL_checkstring(L, 1);
261 luaL_argcheck(L, !iscontp(s), 1, MSGInvalid);
257 lua_pushcfunction(L, lax ? iter_auxlax : iter_auxstrict); 262 lua_pushcfunction(L, lax ? iter_auxlax : iter_auxstrict);
258 lua_pushvalue(L, 1); 263 lua_pushvalue(L, 1);
259 lua_pushinteger(L, 0); 264 lua_pushinteger(L, 0);
diff --git a/testes/utf8.lua b/testes/utf8.lua
index 461e223c..7472cfd0 100644
--- a/testes/utf8.lua
+++ b/testes/utf8.lua
@@ -97,9 +97,15 @@ do -- error indication in utf8.len
97 assert(not a and b == p) 97 assert(not a and b == p)
98 end 98 end
99 check("abc\xE3def", 4) 99 check("abc\xE3def", 4)
100 check("汉字\x80", #("汉字") + 1)
101 check("\xF4\x9F\xBF", 1) 100 check("\xF4\x9F\xBF", 1)
102 check("\xF4\x9F\xBF\xBF", 1) 101 check("\xF4\x9F\xBF\xBF", 1)
102 -- spurious continuation bytes
103 check("汉字\x80", #("汉字") + 1)
104 check("\x80hello", 1)
105 check("hel\x80lo", 4)
106 check("汉字\xBF", #("汉字") + 1)
107 check("\xBFhello", 1)
108 check("hel\xBFlo", 4)
103end 109end
104 110
105-- errors in utf8.codes 111-- errors in utf8.codes
@@ -112,12 +118,16 @@ do
112 end 118 end
113 errorcodes("ab\xff") 119 errorcodes("ab\xff")
114 errorcodes("\u{110000}") 120 errorcodes("\u{110000}")
121 errorcodes("in\x80valid")
122 errorcodes("\xbfinvalid")
123 errorcodes("αλφ\xBFα")
115 124
116 -- calling interation function with invalid arguments 125 -- calling interation function with invalid arguments
117 local f = utf8.codes("") 126 local f = utf8.codes("")
118 assert(f("", 2) == nil) 127 assert(f("", 2) == nil)
119 assert(f("", -1) == nil) 128 assert(f("", -1) == nil)
120 assert(f("", math.mininteger) == nil) 129 assert(f("", math.mininteger) == nil)
130
121end 131end
122 132
123-- error in initial position for offset 133-- error in initial position for offset