diff options
author | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2014-02-04 16:57:34 -0200 |
---|---|---|
committer | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2014-02-04 16:57:34 -0200 |
commit | 2f3da00e514068ab301c72ae0f07383b8a609c71 (patch) | |
tree | e0a034fafe9f1dc2e7f159cb743d35eadf5499a8 | |
parent | 5a73e3ad9e4ae07b70645dd2e9195475eaafae0a (diff) | |
download | lua-2f3da00e514068ab301c72ae0f07383b8a609c71.tar.gz lua-2f3da00e514068ab301c72ae0f07383b8a609c71.tar.bz2 lua-2f3da00e514068ab301c72ae0f07383b8a609c71.zip |
added support for UTF-8 escapes
-rw-r--r-- | llex.c | 56 |
1 file changed, 45 insertions, 11 deletions
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | ** $Id: llex.c,v 2.70 2013/12/30 20:47:58 roberto Exp roberto $ | 2 | ** $Id: llex.c,v 2.71 2014/01/31 15:14:22 roberto Exp roberto $ |
3 | ** Lexical Analyzer | 3 | ** Lexical Analyzer |
4 | ** See Copyright Notice in lua.h | 4 | ** See Copyright Notice in lua.h |
5 | */ | 5 | */ |
@@ -320,17 +320,18 @@ static void read_long_string (LexState *ls, SemInfo *seminfo, int sep) { | |||
320 | } | 320 | } |
321 | 321 | ||
322 | 322 | ||
323 | static void escerror (LexState *ls, const char *msg) { | 323 | static void esccheck (LexState *ls, int c, const char *msg) { |
324 | if (ls->current != EOZ) | 324 | if (!c) { |
325 | save_and_next(ls); /* add current to buffer for error message */ | 325 | if (ls->current != EOZ) |
326 | lexerror(ls, msg, TK_STRING); | 326 | save_and_next(ls); /* add current to buffer for error message */ |
327 | lexerror(ls, msg, TK_STRING); | ||
328 | } | ||
327 | } | 329 | } |
328 | 330 | ||
329 | 331 | ||
330 | static int gethexa (LexState *ls) { | 332 | static int gethexa (LexState *ls) { |
331 | save_and_next(ls); | 333 | save_and_next(ls); |
332 | if (!lisxdigit(ls->current)) | 334 | esccheck (ls, lisxdigit(ls->current), "hexadecimal digit expected"); |
333 | escerror(ls, "hexadecimal digit expected"); | ||
334 | return luaO_hexavalue(ls->current); | 335 | return luaO_hexavalue(ls->current); |
335 | } | 336 | } |
336 | 337 | ||
@@ -343,6 +344,40 @@ static int readhexaesc (LexState *ls) { | |||
343 | } | 344 | } |
344 | 345 | ||
345 | 346 | ||
347 | static unsigned int readutf8esc (LexState *ls) { | ||
348 | int i = 3; /* chars to be removed: '\', 'u', and first digit */ | ||
349 | unsigned int r = gethexa(ls); /* must have at least one digit */ | ||
350 | while ((save_and_next(ls), lisxdigit(ls->current))) { | ||
351 | i++; | ||
352 | r = (r << 4) + luaO_hexavalue(ls->current); | ||
353 | esccheck(ls, r <= 0x10FFFF, "UTF-8 value too large"); | ||
354 | } | ||
355 | esccheck(ls, ls->current == ';', "missing ';' in UTF-8 escape"); | ||
356 | next(ls); /* skip ';' */ | ||
357 | luaZ_buffremove(ls->buff, i); /* remove saved chars from buffer */ | ||
358 | return r; | ||
359 | } | ||
360 | |||
361 | |||
362 | static void utf8esc (LexState *ls, unsigned int r) { | ||
363 | if (r < 0x80) /* ascii? */ | ||
364 | save(ls, r); | ||
365 | else { /* need continuation bytes */ | ||
366 | int buff[4]; /* to store continuation bytes */ | ||
367 | int n = 0; /* number of continuation bytes */ | ||
368 | unsigned int mfb = 0x3f; /* maximum that fits in first byte */ | ||
369 | do { | ||
370 | buff[n++] = 0x80 | (r & 0x3f); /* add continuation byte */ | ||
371 | r >>= 6; /* remove added bits */ | ||
372 | mfb >>= 1; /* now there is one less bit in first byte */ | ||
373 | } while (r > mfb); /* needs continuation byte? */ | ||
374 | save(ls, (~mfb << 1) | r); /* add first byte */ | ||
375 | while (n-- > 0) /* add 'buff' to string, reversed */ | ||
376 | save(ls, buff[n]); | ||
377 | } | ||
378 | } | ||
379 | |||
380 | |||
346 | static int readdecesc (LexState *ls) { | 381 | static int readdecesc (LexState *ls) { |
347 | int i; | 382 | int i; |
348 | int r = 0; /* result accumulator */ | 383 | int r = 0; /* result accumulator */ |
@@ -350,8 +385,7 @@ static int readdecesc (LexState *ls) { | |||
350 | r = 10*r + ls->current - '0'; | 385 | r = 10*r + ls->current - '0'; |
351 | save_and_next(ls); | 386 | save_and_next(ls); |
352 | } | 387 | } |
353 | if (r > UCHAR_MAX) | 388 | esccheck(ls, r <= UCHAR_MAX, "decimal escape too large"); |
354 | escerror(ls, "decimal escape too large"); | ||
355 | luaZ_buffremove(ls->buff, i); /* remove read digits from buffer */ | 389 | luaZ_buffremove(ls->buff, i); /* remove read digits from buffer */ |
356 | return r; | 390 | return r; |
357 | } | 391 | } |
@@ -380,6 +414,7 @@ static void read_string (LexState *ls, int del, SemInfo *seminfo) { | |||
380 | case 't': c = '\t'; goto read_save; | 414 | case 't': c = '\t'; goto read_save; |
381 | case 'v': c = '\v'; goto read_save; | 415 | case 'v': c = '\v'; goto read_save; |
382 | case 'x': c = readhexaesc(ls); goto read_save; | 416 | case 'x': c = readhexaesc(ls); goto read_save; |
417 | case 'u': utf8esc(ls, readutf8esc(ls)); goto no_save; | ||
383 | case '\n': case '\r': | 418 | case '\n': case '\r': |
384 | inclinenumber(ls); c = '\n'; goto only_save; | 419 | inclinenumber(ls); c = '\n'; goto only_save; |
385 | case '\\': case '\"': case '\'': | 420 | case '\\': case '\"': case '\'': |
@@ -395,8 +430,7 @@ static void read_string (LexState *ls, int del, SemInfo *seminfo) { | |||
395 | goto no_save; | 430 | goto no_save; |
396 | } | 431 | } |
397 | default: { | 432 | default: { |
398 | if (!lisdigit(ls->current)) | 433 | esccheck(ls, lisdigit(ls->current), "invalid escape sequence"); |
399 | escerror(ls, "invalid escape sequence"); | ||
400 | c = readdecesc(ls); /* digital escape \ddd */ | 434 | c = readdecesc(ls); /* digital escape \ddd */ |
401 | goto only_save; | 435 | goto only_save; |
402 | } | 436 | } |