Diffstat:
 -rw-r--r--   lua_cjson.c                                                    65
 -rw-r--r--   tests/common.lua                                                4
 -rwxr-xr-x   tests/genutf8.pl                                               25
 -rw-r--r--   tests/octets-escaped.dat (renamed from tests/bytestring.dat)    0
 -rwxr-xr-x   tests/test.lua                                                 66
 5 files changed, 149 insertions, 11 deletions
diff --git a/lua_cjson.c b/lua_cjson.c
index 3af8157..52b259d 100644
--- a/lua_cjson.c
+++ b/lua_cjson.c
@@ -680,19 +680,24 @@ static int decode_hex4(const char *hex)
            digit[3];
 }
 
+/* Converts a Unicode codepoint to UTF-8.
+ * Returns UTF-8 string length, and up to 4 bytes in *utf8 */
 static int codepoint_to_utf8(char *utf8, int codepoint)
 {
+    /* 0xxxxxxx */
     if (codepoint <= 0x7F) {
         utf8[0] = codepoint;
         return 1;
     }
 
+    /* 110xxxxx 10xxxxxx */
     if (codepoint <= 0x7FF) {
         utf8[0] = (codepoint >> 6) | 0xC0;
         utf8[1] = (codepoint & 0x3F) | 0x80;
         return 2;
     }
 
+    /* 1110xxxx 10xxxxxx 10xxxxxx */
     if (codepoint <= 0xFFFF) {
         utf8[0] = (codepoint >> 12) | 0xE0;
         utf8[1] = ((codepoint >> 6) & 0x3F) | 0x80;
@@ -700,11 +705,20 @@ static int codepoint_to_utf8(char *utf8, int codepoint)
         return 3;
     }
 
+    /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+    if (codepoint <= 0x1FFFFF) {
+        utf8[0] = (codepoint >> 18) | 0xF0;
+        utf8[1] = ((codepoint >> 12) & 0x3F) | 0x80;
+        utf8[2] = ((codepoint >> 6) & 0x3F) | 0x80;
+        utf8[3] = (codepoint & 0x3F) | 0x80;
+        return 4;
+    }
+
     return 0;
 }
 
 
-/* Called when index pointing to beginning of UCS-2 hex code: \uXXXX
+/* Called when index pointing to beginning of UTF-16 code escape: \uXXXX
  * \u is guaranteed to exist, but the remaining hex characters may be
  * missing.
  * Translate to UTF-8 and append to temporary token string.
@@ -714,25 +728,58 @@ static int codepoint_to_utf8(char *utf8, int codepoint)
  */
 static int json_append_unicode_escape(json_parse_t *json)
 {
-    char utf8[4];       /* 3 bytes of UTF-8 can handle UCS-2 */
+    char utf8[4];       /* Surrogate pairs require 4 UTF-8 bytes */
     int codepoint;
+    int surrogate_low;
     int len;
+    int escape_len = 6;
 
-    /* Fetch UCS-2 codepoint */
+    /* Fetch UTF-16 code unit */
     codepoint = decode_hex4(&json->data[json->index + 2]);
-    if (codepoint < 0) {
+    if (codepoint < 0)
         return -1;
+
+    /* UTF-16 surrogate pairs take the following 2 byte form:
+     *      11011 x yyyyyyyyyy
+     * When x = 0: y is the high 10 bits of the codepoint
+     *      x = 1: y is the low 10 bits of the codepoint
+     *
+     * Check for a surrogate pair (high or low) */
+    if ((codepoint & 0xF800) == 0xD800) {
+        /* Error if the 1st surrogate is not high */
+        if (codepoint & 0x400)
+            return -1;
+
+        /* Ensure the next code is a unicode escape */
+        if (json->data[json->index + escape_len] != '\\' ||
+            json->data[json->index + escape_len + 1] != 'u') {
+            return -1;
+        }
+
+        /* Fetch the next codepoint */
+        surrogate_low = decode_hex4(&json->data[json->index + 2 + escape_len]);
+        if (surrogate_low < 0)
+            return -1;
+
+        /* Error if the 2nd code is not a low surrogate */
+        if ((surrogate_low & 0xFC00) != 0xDC00)
+            return -1;
+
+        /* Calculate Unicode codepoint */
+        codepoint = (codepoint & 0x3FF) << 10;
+        surrogate_low &= 0x3FF;
+        codepoint = (codepoint | surrogate_low) + 0x10000;
+        escape_len = 12;
     }
 
-    /* Convert to UTF-8 */
+    /* Convert codepoint to UTF-8 */
     len = codepoint_to_utf8(utf8, codepoint);
-    if (!len) {
+    if (!len)
         return -1;
-    }
 
-    /* Append bytes and advance index */
+    /* Append bytes and advance parse index */
     strbuf_append_mem_unsafe(json->tmp, utf8, len);
-    json->index += 6;
+    json->index += escape_len;
 
     return 0;
 }
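As a quick illustration of the surrogate-pair handling added above, here is a minimal Lua sketch. It is not part of the commit and assumes the patched module builds and loads as "cjson"; the escape "\uD834\uDD1E" is the standard example pair for U+1D11E (MUSICAL SYMBOL G CLEF).

    local json = require "cjson"

    -- The pair combines as (0xD834 - 0xD800) * 0x400 + (0xDD1E - 0xDC00) + 0x10000
    -- = 0x1D11E, which codepoint_to_utf8() then emits as the four bytes F0 9D 84 9E.
    assert(json.decode('"\\uD834\\uDD1E"') == "\240\157\132\158")

    -- An inverted pair is rejected by json_append_unicode_escape(), matching
    -- the new escape_tests below.
    assert(pcall(json.decode, '"\\uDB00\\uD800"') == false)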
diff --git a/tests/common.lua b/tests/common.lua
index 9a7ed19..b8ce01d 100644
--- a/tests/common.lua
+++ b/tests/common.lua
@@ -99,6 +99,10 @@ function file_load(filename)
     local data = file:read("*a")
     file:close()
 
+    if data == nil then
+        error("Failed to read " .. filename)
+    end
+
     return data
 end
 
diff --git a/tests/genutf8.pl b/tests/genutf8.pl
new file mode 100755
index 0000000..4960663
--- /dev/null
+++ b/tests/genutf8.pl
@@ -0,0 +1,25 @@
+#!/usr/bin/perl -w
+
+# Create test comparison data using a different UTF-8 implementation.
+
+use strict;
+use Text::Iconv;
+use FileHandle;
+
+# 0xD800 - 0xDFFF are used to encode supplementary codepoints
+# 0x10000 - 0x10FFFF are supplementary codepoints
+my (@codepoints) = (0 .. 0xD7FF, 0xE000 .. 0x10FFFF);
+
+my ($utf32be) = pack("N*", @codepoints);
+my $iconv = Text::Iconv->new("UTF-32BE", "UTF-8");
+my ($utf8) = $iconv->convert($utf32be);
+defined($utf8) or die "Unable to create UTF-8 string\n";
+
+my $fh = FileHandle->new();
+$fh->open("utf8.dat", ">")
+    or die "Unable to open utf8.dat: $!\n";
+$fh->print($utf8)
+    or die "Unable to write utf8.dat\n";
+$fh->close();
+
+# vi:ai et sw=4 ts=4:
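genutf8.pl is intended to be run from the tests/ directory so that utf8.dat sits next to test.lua, which loads it via pcall(file_load, "utf8.dat") below. A rough Lua sanity check of the generated file, as a sketch only (not part of the commit); the expected size is derived from the UTF-8 width of each codepoint range, 128*1 + 1920*2 + 61440*3 + 1048576*4 bytes, so treat the constant as an assumption:

    -- Verify utf8.dat after running ./genutf8.pl in the tests/ directory.
    local f = assert(io.open("utf8.dat", "rb"))
    local data = f:read("*a")
    f:close()
    assert(#data == 4382592, "unexpected utf8.dat size")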
diff --git a/tests/bytestring.dat b/tests/octets-escaped.dat
index ee99a6b..ee99a6b 100644
--- a/tests/bytestring.dat
+++ b/tests/octets-escaped.dat
diff --git a/tests/test.lua b/tests/test.lua
index 9075bab..0e0aad8 100755
--- a/tests/test.lua
+++ b/tests/test.lua
@@ -3,6 +3,8 @@
 -- CJSON tests
 --
 -- Mark Pulford <mark@kyne.com.au>
+--
+-- Note: The output of this script is easier to read with "less -S"
 
 require "common"
 local json = require "cjson"
@@ -95,13 +97,73 @@ local function gen_ascii()
     return table.concat(chars)
 end
 
+-- Generate every UTF-16 codepoint, including supplementary codes
+local function gen_utf16_escaped()
+    -- Create raw table escapes
+    local utf16_escaped = {}
+    local count = 0
+
+    local function append_escape(code)
+        local esc = string.format('\\u%04X', code)
+        table.insert(utf16_escaped, esc)
+    end
+
+    table.insert(utf16_escaped, '"')
+    for i = 0, 0xD7FF do
+        append_escape(i)
+    end
+    -- Skip 0xD800 - 0xDFFF since they are used to encode supplementary
+    -- codepoints
+    for i = 0xE000, 0xFFFF do
+        append_escape(i)
+    end
+    -- Append surrogate pair for each supplementary codepoint
+    for high = 0xD800, 0xDBFF do
+        for low = 0xDC00, 0xDFFF do
+            append_escape(high)
+            append_escape(low)
+        end
+    end
+    table.insert(utf16_escaped, '"')
+
+    return table.concat(utf16_escaped)
+end
+
 local octets_raw = gen_ascii()
-local octets_escaped = file_load("bytestring.dat")
+local octets_escaped = file_load("octets-escaped.dat")
+local utf8_loaded, utf8_raw = pcall(file_load, "utf8.dat")
+if not utf8_loaded then
+    utf8_raw = "Failed to load utf8.dat"
+end
+local utf16_escaped = gen_utf16_escaped()
+
 local escape_tests = {
+    -- Test 8bit clean
     { json.encode, { octets_raw }, true, { octets_escaped } },
-    { json.decode, { octets_escaped }, true, { octets_raw } }
+    { json.decode, { octets_escaped }, true, { octets_raw } },
+    -- Ensure high bits are removed from surrogate codes
+    { json.decode, { '"\\uF800"' }, true, { "\239\160\128" } },
+    -- Test inverted surrogate pairs
+    { json.decode, { '"\\uDB00\\uD800"' },
+      false, { "Expected value but found invalid unicode escape code at character 2" } },
+    -- Test 2x high surrogate code units
+    { json.decode, { '"\\uDB00\\uDB00"' },
+      false, { "Expected value but found invalid unicode escape code at character 2" } },
+    -- Test invalid 2nd escape
+    { json.decode, { '"\\uDB00\\"' },
+      false, { "Expected value but found invalid unicode escape code at character 2" } },
+    { json.decode, { '"\\uDB00\\uD"' },
+      false, { "Expected value but found invalid unicode escape code at character 2" } },
+    -- Test decoding of all UTF-16 escapes
+    { json.decode, { utf16_escaped }, true, { utf8_raw } }
 }
 
+function test_decode_cycle(filename)
+    local obj1 = json.decode(file_load(filename))
+    local obj2 = json.decode(json.encode(obj1))
+    return compare_values(obj1, obj2)
+end
+
 run_test_group("decode simple value", simple_value_tests)
 run_test_group("decode numeric", numeric_tests)
 

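The new rejection cases in escape_tests can also be exercised interactively; a short sketch, again assuming the patched module is loadable as "cjson":

    local json = require "cjson"

    -- A high surrogate must be followed by a low surrogate; anything else
    -- fails with the message the tests above expect.
    local ok, err = pcall(json.decode, '"\\uDB00\\uDB00"')
    print(ok, err)
    -- false   Expected value but found invalid unicode escape code at character 2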