summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Roberto Ierusalimschy <roberto@inf.puc-rio.br> 2014-02-04 16:57:34 -0200
committer: Roberto Ierusalimschy <roberto@inf.puc-rio.br> 2014-02-04 16:57:34 -0200
commit: 2f3da00e514068ab301c72ae0f07383b8a609c71 (patch)
tree: e0a034fafe9f1dc2e7f159cb743d35eadf5499a8
parent: 5a73e3ad9e4ae07b70645dd2e9195475eaafae0a (diff)
download: lua-2f3da00e514068ab301c72ae0f07383b8a609c71.tar.gz
lua-2f3da00e514068ab301c72ae0f07383b8a609c71.tar.bz2
lua-2f3da00e514068ab301c72ae0f07383b8a609c71.zip
added support for UTF-8 escapes
-rw-r--r-- llex.c 56
1 files changed, 45 insertions, 11 deletions
diff --git a/llex.c b/llex.c
index 4d1a4423..818c0812 100644
--- a/llex.c
+++ b/llex.c
@@ -1,5 +1,5 @@
1/* 1/*
2** $Id: llex.c,v 2.70 2013/12/30 20:47:58 roberto Exp roberto $ 2** $Id: llex.c,v 2.71 2014/01/31 15:14:22 roberto Exp roberto $
3** Lexical Analyzer 3** Lexical Analyzer
4** See Copyright Notice in lua.h 4** See Copyright Notice in lua.h
5*/ 5*/
@@ -320,17 +320,18 @@ static void read_long_string (LexState *ls, SemInfo *seminfo, int sep) {
320} 320}
321 321
322 322
323static void escerror (LexState *ls, const char *msg) { 323static void esccheck (LexState *ls, int c, const char *msg) {
324 if (ls->current != EOZ) 324 if (!c) {
325 save_and_next(ls); /* add current to buffer for error message */ 325 if (ls->current != EOZ)
326 lexerror(ls, msg, TK_STRING); 326 save_and_next(ls); /* add current to buffer for error message */
327 lexerror(ls, msg, TK_STRING);
328 }
327} 329}
328 330
329 331
330static int gethexa (LexState *ls) { 332static int gethexa (LexState *ls) {
331 save_and_next(ls); 333 save_and_next(ls);
332 if (!lisxdigit(ls->current)) 334 esccheck (ls, lisxdigit(ls->current), "hexadecimal digit expected");
333 escerror(ls, "hexadecimal digit expected");
334 return luaO_hexavalue(ls->current); 335 return luaO_hexavalue(ls->current);
335} 336}
336 337
@@ -343,6 +344,40 @@ static int readhexaesc (LexState *ls) {
343} 344}
344 345
345 346
347static unsigned int readutf8esc (LexState *ls) {
348 int i = 3; /* chars to be removed: '\', 'u', and first digit */
349 unsigned int r = gethexa(ls); /* must have at least one digit */
350 while ((save_and_next(ls), lisxdigit(ls->current))) {
351 i++;
352 r = (r << 4) + luaO_hexavalue(ls->current);
353 esccheck(ls, r <= 0x10FFFF, "UTF-8 value too large");
354 }
355 esccheck(ls, ls->current == ';', "missing ';' in UTF-8 escape");
356 next(ls); /* skip ';' */
357 luaZ_buffremove(ls->buff, i); /* remove saved chars from buffer */
358 return r;
359}
360
361
362static void utf8esc (LexState *ls, unsigned int r) {
363 if (r < 0x80) /* ascii? */
364 save(ls, r);
365 else { /* need continuation bytes */
366 int buff[4]; /* to store continuation bytes */
367 int n = 0; /* number of continuation bytes */
368 unsigned int mfb = 0x3f; /* maximum that fits in first byte */
369 do {
370 buff[n++] = 0x80 | (r & 0x3f); /* add continuation byte */
371 r >>= 6; /* remove added bits */
372 mfb >>= 1; /* now there is one less bit in first byte */
373 } while (r > mfb); /* needs continuation byte? */
374 save(ls, (~mfb << 1) | r); /* add first byte */
375 while (n-- > 0) /* add 'buff' to string, reversed */
376 save(ls, buff[n]);
377 }
378}
379
380
346static int readdecesc (LexState *ls) { 381static int readdecesc (LexState *ls) {
347 int i; 382 int i;
348 int r = 0; /* result accumulator */ 383 int r = 0; /* result accumulator */
@@ -350,8 +385,7 @@ static int readdecesc (LexState *ls) {
350 r = 10*r + ls->current - '0'; 385 r = 10*r + ls->current - '0';
351 save_and_next(ls); 386 save_and_next(ls);
352 } 387 }
353 if (r > UCHAR_MAX) 388 esccheck(ls, r <= UCHAR_MAX, "decimal escape too large");
354 escerror(ls, "decimal escape too large");
355 luaZ_buffremove(ls->buff, i); /* remove read digits from buffer */ 389 luaZ_buffremove(ls->buff, i); /* remove read digits from buffer */
356 return r; 390 return r;
357} 391}
@@ -380,6 +414,7 @@ static void read_string (LexState *ls, int del, SemInfo *seminfo) {
380 case 't': c = '\t'; goto read_save; 414 case 't': c = '\t'; goto read_save;
381 case 'v': c = '\v'; goto read_save; 415 case 'v': c = '\v'; goto read_save;
382 case 'x': c = readhexaesc(ls); goto read_save; 416 case 'x': c = readhexaesc(ls); goto read_save;
417 case 'u': utf8esc(ls, readutf8esc(ls)); goto no_save;
383 case '\n': case '\r': 418 case '\n': case '\r':
384 inclinenumber(ls); c = '\n'; goto only_save; 419 inclinenumber(ls); c = '\n'; goto only_save;
385 case '\\': case '\"': case '\'': 420 case '\\': case '\"': case '\'':
@@ -395,8 +430,7 @@ static void read_string (LexState *ls, int del, SemInfo *seminfo) {
395 goto no_save; 430 goto no_save;
396 } 431 }
397 default: { 432 default: {
398 if (!lisdigit(ls->current)) 433 esccheck(ls, lisdigit(ls->current), "invalid escape sequence");
399 escerror(ls, "invalid escape sequence");
400 c = readdecesc(ls); /* digital escape \ddd */ 434 c = readdecesc(ls); /* digital escape \ddd */
401 goto only_save; 435 goto only_save;
402 } 436 }