From 4dc56c6d362f2cd8a79d83369f0b852df07dae3f Mon Sep 17 00:00:00 2001 From: Mark Pulford Date: Sun, 8 May 2011 20:26:09 +0930 Subject: Add UTF-16 surrogate pair decode support - Add tests for UTF-16 decoding and failures - Add getutf8.pl to assist with UTF-16 decode testing - Re-add test_decode_cycle() which was accidentally removed earlier - Rename bytestring.dat to octets-escaped.dat --- lua_cjson.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 56 insertions(+), 9 deletions(-) (limited to 'lua_cjson.c') diff --git a/lua_cjson.c b/lua_cjson.c index 3af8157..52b259d 100644 --- a/lua_cjson.c +++ b/lua_cjson.c @@ -680,19 +680,24 @@ static int decode_hex4(const char *hex) digit[3]; } +/* Converts a Unicode codepoint to UTF-8. + * Returns UTF-8 string length, and up to 4 bytes in *utf8 */ static int codepoint_to_utf8(char *utf8, int codepoint) { + /* 0xxxxxxx */ if (codepoint <= 0x7F) { utf8[0] = codepoint; return 1; } + /* 110xxxxx 10xxxxxx */ if (codepoint <= 0x7FF) { utf8[0] = (codepoint >> 6) | 0xC0; utf8[1] = (codepoint & 0x3F) | 0x80; return 2; } + /* 1110xxxx 10xxxxxx 10xxxxxx */ if (codepoint <= 0xFFFF) { utf8[0] = (codepoint >> 12) | 0xE0; utf8[1] = ((codepoint >> 6) & 0x3F) | 0x80; @@ -700,11 +705,20 @@ static int codepoint_to_utf8(char *utf8, int codepoint) return 3; } + /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ + if (codepoint <= 0x1FFFFF) { + utf8[0] = (codepoint >> 18) | 0xF0; + utf8[1] = ((codepoint >> 12) & 0x3F) | 0x80; + utf8[2] = ((codepoint >> 6) & 0x3F) | 0x80; + utf8[3] = (codepoint & 0x3F) | 0x80; + return 4; + } + return 0; } -/* Called when index pointing to beginning of UCS-2 hex code: \uXXXX +/* Called when index is pointing to the beginning of a UTF-16 code unit escape: \uXXXX * \u is guaranteed to exist, but the remaining hex characters may be * missing. * Translate to UTF-8 and append to temporary token string. 
@@ -714,25 +728,58 @@ static int codepoint_to_utf8(char *utf8, int codepoint) */ static int json_append_unicode_escape(json_parse_t *json) { - char utf8[4]; /* 3 bytes of UTF-8 can handle UCS-2 */ + char utf8[4]; /* Surrogate pairs require 4 UTF-8 bytes */ int codepoint; + int surrogate_low; int len; + int escape_len = 6; - /* Fetch UCS-2 codepoint */ + /* Fetch UTF-16 code unit */ codepoint = decode_hex4(&json->data[json->index + 2]); - if (codepoint < 0) { + if (codepoint < 0) return -1; + + /* Each half of a UTF-16 surrogate pair takes the following 2 byte form: + * 11011 x yyyyyyyyyy + * When x = 0: y is the high 10 bits of the codepoint + * x = 1: y is the low 10 bits of the codepoint + * + * Check for a surrogate code unit (high or low) */ + if ((codepoint & 0xF800) == 0xD800) { + /* Error if the 1st surrogate is not high */ + if (codepoint & 0x400) + return -1; + + /* Ensure the next code is a unicode escape */ + if (json->data[json->index + escape_len] != '\\' || + json->data[json->index + escape_len + 1] != 'u') { + return -1; + } + + /* Fetch the next UTF-16 code unit (expected to be the low surrogate) */ + surrogate_low = decode_hex4(&json->data[json->index + 2 + escape_len]); + if (surrogate_low < 0) + return -1; + + /* Error if the 2nd code is not a low surrogate */ + if ((surrogate_low & 0xFC00) != 0xDC00) + return -1; + + /* Calculate Unicode codepoint */ + codepoint = (codepoint & 0x3FF) << 10; + surrogate_low &= 0x3FF; + codepoint = (codepoint | surrogate_low) + 0x10000; + escape_len = 12; } - /* Convert to UTF-8 */ + /* Convert codepoint to UTF-8 */ len = codepoint_to_utf8(utf8, codepoint); - if (!len) { + if (!len) return -1; - } - /* Append bytes and advance index */ + /* Append bytes and advance parse index */ strbuf_append_mem_unsafe(json->tmp, utf8, len); - json->index += 6; + json->index += escape_len; return 0; } -- cgit v1.2.3-55-g6feb