diff options
| -rw-r--r-- | lutf8lib.c | 5 | ||||
| -rw-r--r-- | makefile | 2 | ||||
| -rw-r--r-- | testes/utf8.lua | 14 |
3 files changed, 18 insertions, 3 deletions
| @@ -56,6 +56,8 @@ static const char *utf8_decode (const char *s, l_uint32 *val, int strict) { | |||
| 56 | l_uint32 res = 0; /* final result */ | 56 | l_uint32 res = 0; /* final result */ |
| 57 | if (c < 0x80) /* ASCII? */ | 57 | if (c < 0x80) /* ASCII? */ |
| 58 | res = c; | 58 | res = c; |
| 59 | else if (c >= 0xfe) /* c >= 1111 1110b ? */ | ||
| 60 | return NULL; /* would need six or more continuation bytes */ | ||
| 59 | else { | 61 | else { |
| 60 | int count = 0; /* to count number of continuation bytes */ | 62 | int count = 0; /* to count number of continuation bytes */ |
| 61 | for (; c & 0x40; c <<= 1) { /* while it needs continuation bytes... */ | 63 | for (; c & 0x40; c <<= 1) { /* while it needs continuation bytes... */ |
| @@ -64,8 +66,9 @@ static const char *utf8_decode (const char *s, l_uint32 *val, int strict) { | |||
| 64 | return NULL; /* invalid byte sequence */ | 66 | return NULL; /* invalid byte sequence */ |
| 65 | res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */ | 67 | res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */ |
| 66 | } | 68 | } |
| 69 | lua_assert(count <= 5); | ||
| 67 | res |= ((l_uint32)(c & 0x7F) << (count * 5)); /* add first byte */ | 70 | res |= ((l_uint32)(c & 0x7F) << (count * 5)); /* add first byte */ |
| 68 | if (count > 5 || res > MAXUTF || res < limits[count]) | 71 | if (res > MAXUTF || res < limits[count]) |
| 69 | return NULL; /* invalid byte sequence */ | 72 | return NULL; /* invalid byte sequence */ |
| 70 | s += count; /* skip continuation bytes read */ | 73 | s += count; /* skip continuation bytes read */ |
| 71 | } | 74 | } |
| @@ -60,7 +60,7 @@ CWARNS= $(CWARNSCPP) $(CWARNSC) $(CWARNGCC) | |||
| 60 | # create problems; some are only available in newer gcc versions. To | 60 | # create problems; some are only available in newer gcc versions. To |
| 61 | # use some of them, we also have to define an environment variable | 61 | # use some of them, we also have to define an environment variable |
| 62 | # ASAN_OPTIONS="detect_invalid_pointer_pairs=2". | 62 | # ASAN_OPTIONS="detect_invalid_pointer_pairs=2". |
| 63 | # -fsanitize=undefined | 63 | # -fsanitize=undefined (you may need to add "-lubsan" to libs) |
| 64 | # -fsanitize=pointer-subtract -fsanitize=address -fsanitize=pointer-compare | 64 | # -fsanitize=pointer-subtract -fsanitize=address -fsanitize=pointer-compare |
| 65 | # TESTS= -DLUA_USER_H='"ltests.h"' -Og -g | 65 | # TESTS= -DLUA_USER_H='"ltests.h"' -Og -g |
| 66 | 66 | ||
diff --git a/testes/utf8.lua b/testes/utf8.lua
index 028995a4..8a0213d6 100644
--- a/testes/utf8.lua
+++ b/testes/utf8.lua
| @@ -238,10 +238,18 @@ s = "\0 \x7F\z | |||
| 238 | s = string.gsub(s, " ", "") | 238 | s = string.gsub(s, " ", "") |
| 239 | check(s, {0,0x7F, 0x80,0x7FF, 0x800,0xFFFF, 0x10000,0x10FFFF}) | 239 | check(s, {0,0x7F, 0x80,0x7FF, 0x800,0xFFFF, 0x10000,0x10FFFF}) |
| 240 | 240 | ||
| 241 | |||
| 242 | -- again, without strictness | ||
| 243 | s = "\xF0\x90\x80\x80 \xF7\xBF\xBF\xBF\z | ||
| 244 | \xF8\x88\x80\x80\x80 \xFB\xBF\xBF\xBF\xBF\z | ||
| 245 | \xFC\x84\x80\x80\x80\x80 \xFD\xBF\xBF\xBF\xBF\xBF" | ||
| 246 | s = string.gsub(s, " ", "") | ||
| 247 | check(s, {0x10000,0x1FFFFF, 0x200000,0x3FFFFFF, 0x4000000,0x7FFFFFFF}, true) | ||
| 248 | |||
| 241 | do | 249 | do |
| 242 | -- original UTF-8 values | 250 | -- original UTF-8 values |
| 243 | local s = "\u{4000000}\u{7FFFFFFF}" | 251 | local s = "\u{4000000}\u{7FFFFFFF}" |
| 244 | assert(#s == 12) | 252 | assert(s == "\xFC\x84\x80\x80\x80\x80\xFD\xBF\xBF\xBF\xBF\xBF") |
| 245 | check(s, {0x4000000, 0x7FFFFFFF}, true) | 253 | check(s, {0x4000000, 0x7FFFFFFF}, true) |
| 246 | 254 | ||
| 247 | s = "\u{200000}\u{3FFFFFF}" | 255 | s = "\u{200000}\u{3FFFFFF}" |
| @@ -257,6 +265,10 @@ local x = "日本語a-4\0éó" | |||
| 257 | check(x, {26085, 26412, 35486, 97, 45, 52, 0, 233, 243}) | 265 | check(x, {26085, 26412, 35486, 97, 45, 52, 0, 233, 243}) |
| 258 | 266 | ||
| 259 | 267 | ||
| 268 | -- more than 5 continuation bytes | ||
| 269 | assert(not utf8.len("\xff\x8f\x8f\x8f\x8f\x8f\x8f\x8f")) | ||
| 270 | |||
| 271 | |||
| 260 | -- Supplementary Characters | 272 | -- Supplementary Characters |
| 261 | check("𣲷𠜎𠱓𡁻𠵼ab𠺢", | 273 | check("𣲷𠜎𠱓𡁻𠵼ab𠺢", |
| 262 | {0x23CB7, 0x2070E, 0x20C53, 0x2107B, 0x20D7C, 0x61, 0x62, 0x20EA2,}) | 274 | {0x23CB7, 0x2070E, 0x20C53, 0x2107B, 0x20D7C, 0x61, 0x62, 0x20EA2,}) |
