1 files changed, 56 insertions, 9 deletions
diff --git a/lua_cjson.c b/lua_cjson.c
index 3af8157..52b259d 100644
--- a/lua_cjson.c
+++ b/lua_cjson.c
@@ -680,19 +680,24 @@ static int decode_hex4(const char *hex)
            digit[3];
 }
+/* Converts a Unicode codepoint to UTF-8.
+ * Returns the UTF-8 length (0 if codepoint > 0x1FFFFF); writes up to 4 bytes to *utf8 */
 static int codepoint_to_utf8(char *utf8, int codepoint)
 {
+    /* 0xxxxxxx */
    if (codepoint <= 0x7F) {
        utf8[0] = codepoint;
        return 1;
    }
    
+    /* 110xxxxx 10xxxxxx */
    if (codepoint <= 0x7FF) {
        utf8[0] = (codepoint >> 6) | 0xC0;
        utf8[1] = (codepoint & 0x3F) | 0x80;
        return 2;
    }
+    /* 1110xxxx 10xxxxxx 10xxxxxx */
    if (codepoint <= 0xFFFF) {
        utf8[0] = (codepoint >> 12) | 0xE0;
        utf8[1] = ((codepoint >> 6) & 0x3F) | 0x80;
@@ -700,11 +705,20 @@ static int codepoint_to_utf8(char *utf8, int codepoint)
        return 3;
    }
+    /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+    if (codepoint <= 0x1FFFFF) {
+        utf8[0] = (codepoint >> 18) | 0xF0;
+        utf8[1] = ((codepoint >> 12) & 0x3F) | 0x80;
+        utf8[2] = ((codepoint >> 6) & 0x3F) | 0x80;
+        utf8[3] = (codepoint & 0x3F) | 0x80;
+        return 4;
+    }
    return 0;
 }
-/* Called when index pointing to beginning of UCS-2 hex code: \uXXXX
+/* Called when index is pointing to the beginning of a UTF-16 code escape: \uXXXX
 * \u is guaranteed to exist, but the remaining hex characters may be
 * missing.
 * Translate to UTF-8 and append to temporary token string.
@@ -714,25 +728,58 @@ static int codepoint_to_utf8(char *utf8, int codepoint)
 */
 static int json_append_unicode_escape(json_parse_t *json)
 {
-    char utf8[4];       /* 3 bytes of UTF-8 can handle UCS-2 */
+    char utf8[4];       /* Surrogate pairs require 4 UTF-8 bytes */
    int codepoint;
+    int surrogate_low;
    int len;
+    int escape_len = 6;
-    /* Fetch UCS-2 codepoint */
+    /* Fetch UTF-16 code unit */
    codepoint = decode_hex4(&json->data[json->index + 2]);
-    if (codepoint < 0) {
+    if (codepoint < 0)
        return -1;
+    /* Each UTF-16 surrogate code unit takes the following 2 byte form:
+     *      11011 x yyyyyyyyyy
+     * When x = 0: y is the high 10 bits of (codepoint - 0x10000)
+     *      x = 1: y is the low 10 bits of (codepoint - 0x10000)
+     *
+     * Check for a surrogate (high or low) */
+    if ((codepoint & 0xF800) == 0xD800) {
+        /* Error if the 1st surrogate is not high */
+        if (codepoint & 0x400)
+            return -1;
+        /* Ensure the next code is a unicode escape */
+        if (json->data[json->index + escape_len] != '\\' ||
+            json->data[json->index + escape_len + 1] != 'u') {
+            return -1;
+        }
+        /* Fetch the next UTF-16 code unit (expected to be the low surrogate) */
+        surrogate_low = decode_hex4(&json->data[json->index + 2 + escape_len]);
+        if (surrogate_low < 0)
+            return -1;
+        /* Error if the 2nd code is not a low surrogate */
+        if ((surrogate_low & 0xFC00) != 0xDC00)
+            return -1;
+        /* Calculate Unicode codepoint */
+        codepoint = (codepoint & 0x3FF) << 10;
+        surrogate_low &= 0x3FF;
+        codepoint = (codepoint | surrogate_low) + 0x10000;
+        escape_len = 12;
    }
-    /* Convert to UTF-8 */
+    /* Convert codepoint to UTF-8 */
    len = codepoint_to_utf8(utf8, codepoint);
-    if (!len) {
+    if (!len)
        return -1;
-    }
-    /* Append bytes and advance index */
+    /* Append bytes and advance parse index */
    strbuf_append_mem_unsafe(json->tmp, utf8, len);
-    json->index += 6;
+    json->index += escape_len;
    return 0;
 }