diff options
| author | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2014-02-04 16:57:34 -0200 |
|---|---|---|
| committer | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2014-02-04 16:57:34 -0200 |
| commit | 2f3da00e514068ab301c72ae0f07383b8a609c71 (patch) | |
| tree | e0a034fafe9f1dc2e7f159cb743d35eadf5499a8 | |
| parent | 5a73e3ad9e4ae07b70645dd2e9195475eaafae0a (diff) | |
| download | lua-2f3da00e514068ab301c72ae0f07383b8a609c71.tar.gz lua-2f3da00e514068ab301c72ae0f07383b8a609c71.tar.bz2 lua-2f3da00e514068ab301c72ae0f07383b8a609c71.zip | |
added support for UTF-8 escapes
| -rw-r--r-- | llex.c | 56 |
1 file changed, 45 insertions, 11 deletions
| @@ -1,5 +1,5 @@ | |||
| 1 | /* | 1 | /* |
| 2 | ** $Id: llex.c,v 2.70 2013/12/30 20:47:58 roberto Exp roberto $ | 2 | ** $Id: llex.c,v 2.71 2014/01/31 15:14:22 roberto Exp roberto $ |
| 3 | ** Lexical Analyzer | 3 | ** Lexical Analyzer |
| 4 | ** See Copyright Notice in lua.h | 4 | ** See Copyright Notice in lua.h |
| 5 | */ | 5 | */ |
| @@ -320,17 +320,18 @@ static void read_long_string (LexState *ls, SemInfo *seminfo, int sep) { | |||
| 320 | } | 320 | } |
| 321 | 321 | ||
| 322 | 322 | ||
| 323 | static void escerror (LexState *ls, const char *msg) { | 323 | static void esccheck (LexState *ls, int c, const char *msg) { |
| 324 | if (ls->current != EOZ) | 324 | if (!c) { |
| 325 | save_and_next(ls); /* add current to buffer for error message */ | 325 | if (ls->current != EOZ) |
| 326 | lexerror(ls, msg, TK_STRING); | 326 | save_and_next(ls); /* add current to buffer for error message */ |
| 327 | lexerror(ls, msg, TK_STRING); | ||
| 328 | } | ||
| 327 | } | 329 | } |
| 328 | 330 | ||
| 329 | 331 | ||
| 330 | static int gethexa (LexState *ls) { | 332 | static int gethexa (LexState *ls) { |
| 331 | save_and_next(ls); | 333 | save_and_next(ls); |
| 332 | if (!lisxdigit(ls->current)) | 334 | esccheck (ls, lisxdigit(ls->current), "hexadecimal digit expected"); |
| 333 | escerror(ls, "hexadecimal digit expected"); | ||
| 334 | return luaO_hexavalue(ls->current); | 335 | return luaO_hexavalue(ls->current); |
| 335 | } | 336 | } |
| 336 | 337 | ||
| @@ -343,6 +344,40 @@ static int readhexaesc (LexState *ls) { | |||
| 343 | } | 344 | } |
| 344 | 345 | ||
| 345 | 346 | ||
| 347 | static unsigned int readutf8esc (LexState *ls) { | ||
| 348 | int i = 3; /* chars to be removed: '\', 'u', and first digit */ | ||
| 349 | unsigned int r = gethexa(ls); /* must have at least one digit */ | ||
| 350 | while ((save_and_next(ls), lisxdigit(ls->current))) { | ||
| 351 | i++; | ||
| 352 | r = (r << 4) + luaO_hexavalue(ls->current); | ||
| 353 | esccheck(ls, r <= 0x10FFFF, "UTF-8 value too large"); | ||
| 354 | } | ||
| 355 | esccheck(ls, ls->current == ';', "missing ';' in UTF-8 escape"); | ||
| 356 | next(ls); /* skip ';' */ | ||
| 357 | luaZ_buffremove(ls->buff, i); /* remove saved chars from buffer */ | ||
| 358 | return r; | ||
| 359 | } | ||
| 360 | |||
| 361 | |||
| 362 | static void utf8esc (LexState *ls, unsigned int r) { | ||
| 363 | if (r < 0x80) /* ascii? */ | ||
| 364 | save(ls, r); | ||
| 365 | else { /* need continuation bytes */ | ||
| 366 | int buff[4]; /* to store continuation bytes */ | ||
| 367 | int n = 0; /* number of continuation bytes */ | ||
| 368 | unsigned int mfb = 0x3f; /* maximum that fits in first byte */ | ||
| 369 | do { | ||
| 370 | buff[n++] = 0x80 | (r & 0x3f); /* add continuation byte */ | ||
| 371 | r >>= 6; /* remove added bits */ | ||
| 372 | mfb >>= 1; /* now there is one less bit in first byte */ | ||
| 373 | } while (r > mfb); /* needs continuation byte? */ | ||
| 374 | save(ls, (~mfb << 1) | r); /* add first byte */ | ||
| 375 | while (n-- > 0) /* add 'buff' to string, reversed */ | ||
| 376 | save(ls, buff[n]); | ||
| 377 | } | ||
| 378 | } | ||
| 379 | |||
| 380 | |||
| 346 | static int readdecesc (LexState *ls) { | 381 | static int readdecesc (LexState *ls) { |
| 347 | int i; | 382 | int i; |
| 348 | int r = 0; /* result accumulator */ | 383 | int r = 0; /* result accumulator */ |
| @@ -350,8 +385,7 @@ static int readdecesc (LexState *ls) { | |||
| 350 | r = 10*r + ls->current - '0'; | 385 | r = 10*r + ls->current - '0'; |
| 351 | save_and_next(ls); | 386 | save_and_next(ls); |
| 352 | } | 387 | } |
| 353 | if (r > UCHAR_MAX) | 388 | esccheck(ls, r <= UCHAR_MAX, "decimal escape too large"); |
| 354 | escerror(ls, "decimal escape too large"); | ||
| 355 | luaZ_buffremove(ls->buff, i); /* remove read digits from buffer */ | 389 | luaZ_buffremove(ls->buff, i); /* remove read digits from buffer */ |
| 356 | return r; | 390 | return r; |
| 357 | } | 391 | } |
| @@ -380,6 +414,7 @@ static void read_string (LexState *ls, int del, SemInfo *seminfo) { | |||
| 380 | case 't': c = '\t'; goto read_save; | 414 | case 't': c = '\t'; goto read_save; |
| 381 | case 'v': c = '\v'; goto read_save; | 415 | case 'v': c = '\v'; goto read_save; |
| 382 | case 'x': c = readhexaesc(ls); goto read_save; | 416 | case 'x': c = readhexaesc(ls); goto read_save; |
| 417 | case 'u': utf8esc(ls, readutf8esc(ls)); goto no_save; | ||
| 383 | case '\n': case '\r': | 418 | case '\n': case '\r': |
| 384 | inclinenumber(ls); c = '\n'; goto only_save; | 419 | inclinenumber(ls); c = '\n'; goto only_save; |
| 385 | case '\\': case '\"': case '\'': | 420 | case '\\': case '\"': case '\'': |
| @@ -395,8 +430,7 @@ static void read_string (LexState *ls, int del, SemInfo *seminfo) { | |||
| 395 | goto no_save; | 430 | goto no_save; |
| 396 | } | 431 | } |
| 397 | default: { | 432 | default: { |
| 398 | if (!lisdigit(ls->current)) | 433 | esccheck(ls, lisdigit(ls->current), "invalid escape sequence"); |
| 399 | escerror(ls, "invalid escape sequence"); | ||
| 400 | c = readdecesc(ls); /* digital escape \ddd */ | 434 | c = readdecesc(ls); /* digital escape \ddd */ |
| 401 | goto only_save; | 435 | goto only_save; |
| 402 | } | 436 | } |
