aboutsummaryrefslogtreecommitdiff
path: root/lua_cjson.c
diff options
context:
space:
mode:
Diffstat (limited to 'lua_cjson.c')
-rw-r--r--lua_cjson.c65
1 files changed, 56 insertions, 9 deletions
diff --git a/lua_cjson.c b/lua_cjson.c
index 3af8157..52b259d 100644
--- a/lua_cjson.c
+++ b/lua_cjson.c
@@ -680,19 +680,24 @@ static int decode_hex4(const char *hex)
680 digit[3]; 680 digit[3];
681} 681}
682 682
683/* Converts a Unicode codepoint to UTF-8.
684 * Returns UTF-8 string length, and up to 4 bytes in *utf8 */
683static int codepoint_to_utf8(char *utf8, int codepoint) 685static int codepoint_to_utf8(char *utf8, int codepoint)
684{ 686{
687 /* 0xxxxxxx */
685 if (codepoint <= 0x7F) { 688 if (codepoint <= 0x7F) {
686 utf8[0] = codepoint; 689 utf8[0] = codepoint;
687 return 1; 690 return 1;
688 } 691 }
689 692
693 /* 110xxxxx 10xxxxxx */
690 if (codepoint <= 0x7FF) { 694 if (codepoint <= 0x7FF) {
691 utf8[0] = (codepoint >> 6) | 0xC0; 695 utf8[0] = (codepoint >> 6) | 0xC0;
692 utf8[1] = (codepoint & 0x3F) | 0x80; 696 utf8[1] = (codepoint & 0x3F) | 0x80;
693 return 2; 697 return 2;
694 } 698 }
695 699
700 /* 1110xxxx 10xxxxxx 10xxxxxx */
696 if (codepoint <= 0xFFFF) { 701 if (codepoint <= 0xFFFF) {
697 utf8[0] = (codepoint >> 12) | 0xE0; 702 utf8[0] = (codepoint >> 12) | 0xE0;
698 utf8[1] = ((codepoint >> 6) & 0x3F) | 0x80; 703 utf8[1] = ((codepoint >> 6) & 0x3F) | 0x80;
@@ -700,11 +705,20 @@ static int codepoint_to_utf8(char *utf8, int codepoint)
700 return 3; 705 return 3;
701 } 706 }
702 707
708 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
709 if (codepoint <= 0x1FFFFF) {
710 utf8[0] = (codepoint >> 18) | 0xF0;
711 utf8[1] = ((codepoint >> 12) & 0x3F) | 0x80;
712 utf8[2] = ((codepoint >> 6) & 0x3F) | 0x80;
713 utf8[3] = (codepoint & 0x3F) | 0x80;
714 return 4;
715 }
716
703 return 0; 717 return 0;
704} 718}
705 719
706 720
707/* Called when index pointing to beginning of UCS-2 hex code: \uXXXX 721/* Called when index pointing to beginning of UTF-16 code escape: \uXXXX
708 * \u is guaranteed to exist, but the remaining hex characters may be 722 * \u is guaranteed to exist, but the remaining hex characters may be
709 * missing. 723 * missing.
710 * Translate to UTF-8 and append to temporary token string. 724 * Translate to UTF-8 and append to temporary token string.
@@ -714,25 +728,58 @@ static int codepoint_to_utf8(char *utf8, int codepoint)
714 */ 728 */
715static int json_append_unicode_escape(json_parse_t *json) 729static int json_append_unicode_escape(json_parse_t *json)
716{ 730{
717 char utf8[4]; /* 3 bytes of UTF-8 can handle UCS-2 */ 731 char utf8[4]; /* Surrogate pairs require 4 UTF-8 bytes */
718 int codepoint; 732 int codepoint;
733 int surrogate_low;
719 int len; 734 int len;
735 int escape_len = 6;
720 736
721 /* Fetch UCS-2 codepoint */ 737 /* Fetch UTF-16 code unit */
722 codepoint = decode_hex4(&json->data[json->index + 2]); 738 codepoint = decode_hex4(&json->data[json->index + 2]);
723 if (codepoint < 0) { 739 if (codepoint < 0)
724 return -1; 740 return -1;
741
742 /* UTF-16 surrogate pairs take the following 2 byte form:
743 * 11011 x yyyyyyyyyy
744 * When x = 0: y is the high 10 bits of the codepoint
745 * x = 1: y is the low 10 bits of the codepoint
746 *
747 * Check for a surrogate pair (high or low) */
748 if ((codepoint & 0xF800) == 0xD800) {
749 /* Error if the 1st surrogate is not high */
750 if (codepoint & 0x400)
751 return -1;
752
753 /* Ensure the next code is a unicode escape */
754 if (json->data[json->index + escape_len] != '\\' ||
755 json->data[json->index + escape_len + 1] != 'u') {
756 return -1;
757 }
758
759 /* Fetch the next codepoint */
760 surrogate_low = decode_hex4(&json->data[json->index + 2 + escape_len]);
761 if (surrogate_low < 0)
762 return -1;
763
764 /* Error if the 2nd code is not a low surrogate */
765 if ((surrogate_low & 0xFC00) != 0xDC00)
766 return -1;
767
768 /* Calculate Unicode codepoint */
769 codepoint = (codepoint & 0x3FF) << 10;
770 surrogate_low &= 0x3FF;
771 codepoint = (codepoint | surrogate_low) + 0x10000;
772 escape_len = 12;
725 } 773 }
726 774
727 /* Convert to UTF-8 */ 775 /* Convert codepoint to UTF-8 */
728 len = codepoint_to_utf8(utf8, codepoint); 776 len = codepoint_to_utf8(utf8, codepoint);
729 if (!len) { 777 if (!len)
730 return -1; 778 return -1;
731 }
732 779
733 /* Append bytes and advance index */ 780 /* Append bytes and advance parse index */
734 strbuf_append_mem_unsafe(json->tmp, utf8, len); 781 strbuf_append_mem_unsafe(json->tmp, utf8, len);
735 json->index += 6; 782 json->index += escape_len;
736 783
737 return 0; 784 return 0;
738} 785}