diff options
Diffstat (limited to 'lua_cjson.c')
-rw-r--r-- | lua_cjson.c | 65 |
1 files changed, 56 insertions, 9 deletions
diff --git a/lua_cjson.c b/lua_cjson.c index 3af8157..52b259d 100644 --- a/lua_cjson.c +++ b/lua_cjson.c | |||
@@ -680,19 +680,24 @@ static int decode_hex4(const char *hex) | |||
680 | digit[3]; | 680 | digit[3]; |
681 | } | 681 | } |
682 | 682 | ||
683 | /* Converts a Unicode codepoint to UTF-8. | ||
684 | * Returns UTF-8 string length, and up to 4 bytes in *utf8 */ | ||
683 | static int codepoint_to_utf8(char *utf8, int codepoint) | 685 | static int codepoint_to_utf8(char *utf8, int codepoint) |
684 | { | 686 | { |
687 | /* 0xxxxxxx */ | ||
685 | if (codepoint <= 0x7F) { | 688 | if (codepoint <= 0x7F) { |
686 | utf8[0] = codepoint; | 689 | utf8[0] = codepoint; |
687 | return 1; | 690 | return 1; |
688 | } | 691 | } |
689 | 692 | ||
693 | /* 110xxxxx 10xxxxxx */ | ||
690 | if (codepoint <= 0x7FF) { | 694 | if (codepoint <= 0x7FF) { |
691 | utf8[0] = (codepoint >> 6) | 0xC0; | 695 | utf8[0] = (codepoint >> 6) | 0xC0; |
692 | utf8[1] = (codepoint & 0x3F) | 0x80; | 696 | utf8[1] = (codepoint & 0x3F) | 0x80; |
693 | return 2; | 697 | return 2; |
694 | } | 698 | } |
695 | 699 | ||
700 | /* 1110xxxx 10xxxxxx 10xxxxxx */ | ||
696 | if (codepoint <= 0xFFFF) { | 701 | if (codepoint <= 0xFFFF) { |
697 | utf8[0] = (codepoint >> 12) | 0xE0; | 702 | utf8[0] = (codepoint >> 12) | 0xE0; |
698 | utf8[1] = ((codepoint >> 6) & 0x3F) | 0x80; | 703 | utf8[1] = ((codepoint >> 6) & 0x3F) | 0x80; |
@@ -700,11 +705,20 @@ static int codepoint_to_utf8(char *utf8, int codepoint) | |||
700 | return 3; | 705 | return 3; |
701 | } | 706 | } |
702 | 707 | ||
708 | /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ | ||
709 | if (codepoint <= 0x1FFFFF) { | ||
710 | utf8[0] = (codepoint >> 18) | 0xF0; | ||
711 | utf8[1] = ((codepoint >> 12) & 0x3F) | 0x80; | ||
712 | utf8[2] = ((codepoint >> 6) & 0x3F) | 0x80; | ||
713 | utf8[3] = (codepoint & 0x3F) | 0x80; | ||
714 | return 4; | ||
715 | } | ||
716 | |||
703 | return 0; | 717 | return 0; |
704 | } | 718 | } |
705 | 719 | ||
706 | 720 | ||
707 | /* Called when index pointing to beginning of UCS-2 hex code: \uXXXX | 721 | /* Called when index pointing to beginning of UTF-16 code escape: \uXXXX |
708 | * \u is guaranteed to exist, but the remaining hex characters may be | 722 | * \u is guaranteed to exist, but the remaining hex characters may be |
709 | * missing. | 723 | * missing. |
710 | * Translate to UTF-8 and append to temporary token string. | 724 | * Translate to UTF-8 and append to temporary token string. |
@@ -714,25 +728,58 @@ static int codepoint_to_utf8(char *utf8, int codepoint) | |||
714 | */ | 728 | */ |
715 | static int json_append_unicode_escape(json_parse_t *json) | 729 | static int json_append_unicode_escape(json_parse_t *json) |
716 | { | 730 | { |
717 | char utf8[4]; /* 3 bytes of UTF-8 can handle UCS-2 */ | 731 | char utf8[4]; /* Surrogate pairs require 4 UTF-8 bytes */ |
718 | int codepoint; | 732 | int codepoint; |
733 | int surrogate_low; | ||
719 | int len; | 734 | int len; |
735 | int escape_len = 6; | ||
720 | 736 | ||
721 | /* Fetch UCS-2 codepoint */ | 737 | /* Fetch UTF-16 code unit */ |
722 | codepoint = decode_hex4(&json->data[json->index + 2]); | 738 | codepoint = decode_hex4(&json->data[json->index + 2]); |
723 | if (codepoint < 0) { | 739 | if (codepoint < 0) |
724 | return -1; | 740 | return -1; |
741 | |||
742 | /* UTF-16 surrogate pairs take the following 2 byte form: | ||
743 | * 11011 x yyyyyyyyyy | ||
744 | * When x = 0: y is the high 10 bits of the codepoint | ||
745 | * x = 1: y is the low 10 bits of the codepoint | ||
746 | * | ||
747 | * Check for a surrogate pair (high or low) */ | ||
748 | if ((codepoint & 0xF800) == 0xD800) { | ||
749 | /* Error if the 1st surrogate is not high */ | ||
750 | if (codepoint & 0x400) | ||
751 | return -1; | ||
752 | |||
753 | /* Ensure the next code is a unicode escape */ | ||
754 | if (json->data[json->index + escape_len] != '\\' || | ||
755 | json->data[json->index + escape_len + 1] != 'u') { | ||
756 | return -1; | ||
757 | } | ||
758 | |||
759 | /* Fetch the next codepoint */ | ||
760 | surrogate_low = decode_hex4(&json->data[json->index + 2 + escape_len]); | ||
761 | if (surrogate_low < 0) | ||
762 | return -1; | ||
763 | |||
764 | /* Error if the 2nd code is not a low surrogate */ | ||
765 | if ((surrogate_low & 0xFC00) != 0xDC00) | ||
766 | return -1; | ||
767 | |||
768 | /* Calculate Unicode codepoint */ | ||
769 | codepoint = (codepoint & 0x3FF) << 10; | ||
770 | surrogate_low &= 0x3FF; | ||
771 | codepoint = (codepoint | surrogate_low) + 0x10000; | ||
772 | escape_len = 12; | ||
725 | } | 773 | } |
726 | 774 | ||
727 | /* Convert to UTF-8 */ | 775 | /* Convert codepoint to UTF-8 */ |
728 | len = codepoint_to_utf8(utf8, codepoint); | 776 | len = codepoint_to_utf8(utf8, codepoint); |
729 | if (!len) { | 777 | if (!len) |
730 | return -1; | 778 | return -1; |
731 | } | ||
732 | 779 | ||
733 | /* Append bytes and advance index */ | 780 | /* Append bytes and advance parse index */ |
734 | strbuf_append_mem_unsafe(json->tmp, utf8, len); | 781 | strbuf_append_mem_unsafe(json->tmp, utf8, len); |
735 | json->index += 6; | 782 | json->index += escape_len; |
736 | 783 | ||
737 | return 0; | 784 | return 0; |
738 | } | 785 | } |