author     Mark Pulford <mark@kyne.com.au>    2011-05-08 20:26:09 +0930
committer  Mark Pulford <mark@kyne.com.au>    2011-05-08 20:26:09 +0930
commit     4dc56c6d362f2cd8a79d83369f0b852df07dae3f (patch)
tree       d51d3470a396c7981871b4f6fe4fd331e180db83
parent     eeebeda88e62fefa87c71d616d5719782bdaa45a (diff)
Add UTF-16 surrogate pair decode support
- Add tests for UTF-16 decoding and failures
- Add genutf8.pl to assist with UTF-16 decode testing
- Re-add test_decode_cycle() which was accidentally removed earlier
- Rename bytestring.dat to octets-escaped.dat
-rw-r--r--  lua_cjson.c               65
-rw-r--r--  tests/common.lua           4
-rwxr-xr-x  tests/genutf8.pl          25
-rw-r--r--  tests/octets-escaped.dat   0  (renamed from tests/bytestring.dat)
-rwxr-xr-x  tests/test.lua            66
5 files changed, 149 insertions, 11 deletions
diff --git a/lua_cjson.c b/lua_cjson.c
index 3af8157..52b259d 100644
--- a/lua_cjson.c
+++ b/lua_cjson.c
@@ -680,19 +680,24 @@ static int decode_hex4(const char *hex)
             digit[3];
 }
 
+/* Converts a Unicode codepoint to UTF-8.
+ * Returns UTF-8 string length, and up to 4 bytes in *utf8 */
 static int codepoint_to_utf8(char *utf8, int codepoint)
 {
+    /* 0xxxxxxx */
     if (codepoint <= 0x7F) {
         utf8[0] = codepoint;
         return 1;
     }
 
+    /* 110xxxxx 10xxxxxx */
     if (codepoint <= 0x7FF) {
         utf8[0] = (codepoint >> 6) | 0xC0;
         utf8[1] = (codepoint & 0x3F) | 0x80;
         return 2;
     }
 
+    /* 1110xxxx 10xxxxxx 10xxxxxx */
     if (codepoint <= 0xFFFF) {
         utf8[0] = (codepoint >> 12) | 0xE0;
         utf8[1] = ((codepoint >> 6) & 0x3F) | 0x80;
@@ -700,11 +705,20 @@ static int codepoint_to_utf8(char *utf8, int codepoint)
         return 3;
     }
 
+    /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+    if (codepoint <= 0x1FFFFF) {
+        utf8[0] = (codepoint >> 18) | 0xF0;
+        utf8[1] = ((codepoint >> 12) & 0x3F) | 0x80;
+        utf8[2] = ((codepoint >> 6) & 0x3F) | 0x80;
+        utf8[3] = (codepoint & 0x3F) | 0x80;
+        return 4;
+    }
+
     return 0;
 }
 
 
-/* Called when index pointing to beginning of UCS-2 hex code: \uXXXX
+/* Called when index pointing to beginning of UTF-16 code escape: \uXXXX
  * \u is guaranteed to exist, but the remaining hex characters may be
  * missing.
  * Translate to UTF-8 and append to temporary token string.
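
The added branch above completes the standard UTF-8 layout: the three existing cases handle codepoints up to U+FFFF, and the new 11110xxx case emits the 4-byte form needed for supplementary codepoints. A minimal standalone sketch of the same bit math, using U+1D11E (MUSICAL SYMBOL G CLEF) as an example value of my own choosing, not taken from the commit:

#include <stdio.h>

/* Mirrors the 4-byte branch added to codepoint_to_utf8() above. */
int main(void)
{
    int codepoint = 0x1D11E;   /* example supplementary codepoint */
    unsigned char utf8[4];

    utf8[0] = (codepoint >> 18) | 0xF0;            /* 11110xxx */
    utf8[1] = ((codepoint >> 12) & 0x3F) | 0x80;   /* 10xxxxxx */
    utf8[2] = ((codepoint >> 6) & 0x3F) | 0x80;    /* 10xxxxxx */
    utf8[3] = (codepoint & 0x3F) | 0x80;           /* 10xxxxxx */

    printf("%02X %02X %02X %02X\n", utf8[0], utf8[1], utf8[2], utf8[3]);
    return 0;   /* prints: F0 9D 84 9E */
}
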
@@ -714,25 +728,58 @@ static int codepoint_to_utf8(char *utf8, int codepoint)
  */
 static int json_append_unicode_escape(json_parse_t *json)
 {
-    char utf8[4];       /* 3 bytes of UTF-8 can handle UCS-2 */
+    char utf8[4];       /* Surrogate pairs require 4 UTF-8 bytes */
     int codepoint;
+    int surrogate_low;
     int len;
+    int escape_len = 6;
 
-    /* Fetch UCS-2 codepoint */
+    /* Fetch UTF-16 code unit */
     codepoint = decode_hex4(&json->data[json->index + 2]);
-    if (codepoint < 0) {
+    if (codepoint < 0)
         return -1;
+
+    /* UTF-16 surrogate pairs take the following 2 byte form:
+     *      11011 x yyyyyyyyyy
+     * When x = 0: y is the high 10 bits of the codepoint
+     *      x = 1: y is the low 10 bits of the codepoint
+     *
+     * Check for a surrogate pair (high or low) */
+    if ((codepoint & 0xF800) == 0xD800) {
+        /* Error if the 1st surrogate is not high */
+        if (codepoint & 0x400)
+            return -1;
+
+        /* Ensure the next code is a unicode escape */
+        if (json->data[json->index + escape_len] != '\\' ||
+            json->data[json->index + escape_len + 1] != 'u') {
+            return -1;
+        }
+
+        /* Fetch the next codepoint */
+        surrogate_low = decode_hex4(&json->data[json->index + 2 + escape_len]);
+        if (surrogate_low < 0)
+            return -1;
+
+        /* Error if the 2nd code is not a low surrogate */
+        if ((surrogate_low & 0xFC00) != 0xDC00)
+            return -1;
+
+        /* Calculate Unicode codepoint */
+        codepoint = (codepoint & 0x3FF) << 10;
+        surrogate_low &= 0x3FF;
+        codepoint = (codepoint | surrogate_low) + 0x10000;
+        escape_len = 12;
     }
 
-    /* Convert to UTF-8 */
+    /* Convert codepoint to UTF-8 */
     len = codepoint_to_utf8(utf8, codepoint);
-    if (!len) {
+    if (!len)
         return -1;
-    }
 
-    /* Append bytes and advance index */
+    /* Append bytes and advance parse index */
     strbuf_append_mem_unsafe(json->tmp, utf8, len);
-    json->index += 6;
+    json->index += escape_len;
 
     return 0;
 }
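
Worked through on a concrete pair, the surrogate arithmetic added to json_append_unicode_escape() above recovers the supplementary codepoint as follows. The values D834/DD1E are example code units of my own choosing, not taken from the commit:

#include <stdio.h>

int main(void)
{
    int high = 0xD834;   /* first \uXXXX: high surrogate, passes (x & 0xF800) == 0xD800 */
    int low  = 0xDD1E;   /* second \uXXXX: low surrogate, passes (x & 0xFC00) == 0xDC00 */

    /* Each code unit carries 10 payload bits; together they address
     * codepoints 0x10000..0x10FFFF, which is why escape_len becomes 12. */
    int codepoint = ((high & 0x3FF) << 10 | (low & 0x3FF)) + 0x10000;

    printf("U+%X\n", codepoint);   /* prints: U+1D11E */
    return 0;
}
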
diff --git a/tests/common.lua b/tests/common.lua
index 9a7ed19..b8ce01d 100644
--- a/tests/common.lua
+++ b/tests/common.lua
@@ -99,6 +99,10 @@ function file_load(filename)
     local data = file:read("*a")
     file:close()
 
+    if data == nil then
+        error("Failed to read " .. filename)
+    end
+
     return data
 end
 
diff --git a/tests/genutf8.pl b/tests/genutf8.pl
new file mode 100755
index 0000000..4960663
--- /dev/null
+++ b/tests/genutf8.pl
@@ -0,0 +1,25 @@
+#!/usr/bin/perl -w
+
+# Create test comparison data using a different UTF-8 implementation.
+
+use strict;
+use Text::Iconv;
+use FileHandle;
+
+# 0xD800 - 0xDFFF are used to encode supplementary codepoints
+# 0x10000 - 0x10FFFF are supplementary codepoints
+my (@codepoints) = (0 .. 0xD7FF, 0xE000 .. 0x10FFFF);
+
+my ($utf32be) = pack("N*", @codepoints);
+my $iconv = Text::Iconv->new("UTF-32BE", "UTF-8");
+my ($utf8) = $iconv->convert($utf32be);
+defined($utf8) or die "Unable to create UTF-8 string\n";
+
+my $fh = FileHandle->new();
+$fh->open("utf8.dat", ">")
+    or die "Unable to open utf8.dat: $!\n";
+$fh->print($utf8)
+    or die "Unable to write utf8.dat\n";
+$fh->close();
+
+# vi:ai et sw=4 ts=4:
diff --git a/tests/bytestring.dat b/tests/octets-escaped.dat
index ee99a6b..ee99a6b 100644
--- a/tests/bytestring.dat
+++ b/tests/octets-escaped.dat
diff --git a/tests/test.lua b/tests/test.lua
index 9075bab..0e0aad8 100755
--- a/tests/test.lua
+++ b/tests/test.lua
@@ -3,6 +3,8 @@
 -- CJSON tests
 --
 -- Mark Pulford <mark@kyne.com.au>
+--
+-- Note: The output of this script is easier to read with "less -S"
 
 require "common"
 local json = require "cjson"
@@ -95,13 +97,73 @@ local function gen_ascii()
     return table.concat(chars)
 end
 
+-- Generate every UTF-16 codepoint, including supplementary codes
+local function gen_utf16_escaped()
+    -- Create raw table escapes
+    local utf16_escaped = {}
+    local count = 0
+
+    local function append_escape(code)
+        local esc = string.format('\\u%04X', code)
+        table.insert(utf16_escaped, esc)
+    end
+
+    table.insert(utf16_escaped, '"')
+    for i = 0, 0xD7FF do
+        append_escape(i)
+    end
+    -- Skip 0xD800 - 0xDFFF since they are used to encode supplementary
+    -- codepoints
+    for i = 0xE000, 0xFFFF do
+        append_escape(i)
+    end
+    -- Append surrogate pair for each supplementary codepoint
+    for high = 0xD800, 0xDBFF do
+        for low = 0xDC00, 0xDFFF do
+            append_escape(high)
+            append_escape(low)
+        end
+    end
+    table.insert(utf16_escaped, '"')
+
+    return table.concat(utf16_escaped)
+end
+
 local octets_raw = gen_ascii()
-local octets_escaped = file_load("bytestring.dat")
+local octets_escaped = file_load("octets-escaped.dat")
+local utf8_loaded, utf8_raw = pcall(file_load, "utf8.dat")
+if not utf8_loaded then
+    utf8_raw = "Failed to load utf8.dat"
+end
+local utf16_escaped = gen_utf16_escaped()
+
 local escape_tests = {
+    -- Test 8bit clean
     { json.encode, { octets_raw }, true, { octets_escaped } },
-    { json.decode, { octets_escaped }, true, { octets_raw } }
+    { json.decode, { octets_escaped }, true, { octets_raw } },
+    -- Ensure high bits are removed from surrogate codes
+    { json.decode, { '"\\uF800"' }, true, { "\239\160\128" } },
+    -- Test inverted surrogate pairs
+    { json.decode, { '"\\uDB00\\uD800"' },
+      false, { "Expected value but found invalid unicode escape code at character 2" } },
+    -- Test 2x high surrogate code units
+    { json.decode, { '"\\uDB00\\uDB00"' },
+      false, { "Expected value but found invalid unicode escape code at character 2" } },
+    -- Test invalid 2nd escape
+    { json.decode, { '"\\uDB00\\"' },
+      false, { "Expected value but found invalid unicode escape code at character 2" } },
+    { json.decode, { '"\\uDB00\\uD"' },
+      false, { "Expected value but found invalid unicode escape code at character 2" } },
+    -- Test decoding of all UTF-16 escapes
+    { json.decode, { utf16_escaped }, true, { utf8_raw } }
 }
 
+function test_decode_cycle(filename)
+    local obj1 = json.decode(file_load(filename))
+    local obj2 = json.decode(json.encode(obj1))
+    return compare_values(obj1, obj2)
+end
+
 run_test_group("decode simple value", simple_value_tests)
 run_test_group("decode numeric", numeric_tests)
 
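
The final escape_tests entry above can compare json.decode(utf16_escaped) byte-for-byte against utf8.dat because the nested high/low loops in gen_utf16_escaped() enumerate surrogate pairs in the same order as the supplementary codepoints that genutf8.pl writes. A small standalone check of that ordering, a sketch of my own rather than part of the test suite:

#include <stdio.h>

int main(void)
{
    int expected = 0x10000;   /* first supplementary codepoint */
    int high, low;

    /* Same loop order as gen_utf16_escaped() in tests/test.lua. */
    for (high = 0xD800; high <= 0xDBFF; high++) {
        for (low = 0xDC00; low <= 0xDFFF; low++) {
            int codepoint = ((high & 0x3FF) << 10 | (low & 0x3FF)) + 0x10000;
            if (codepoint != expected++) {
                printf("gap at U+%X\n", codepoint);
                return 1;
            }
        }
    }

    printf("covered U+10000..U+%X\n", expected - 1);   /* prints: covered U+10000..U+10FFFF */
    return 0;
}
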