Add UTF-16 surrogate pair decode support

- Add tests for UTF-16 decoding and failures
- Add genutf8.pl to assist with UTF-16 decode testing
- Re-add test_decode_cycle() which was accidentally removed earlier
- Rename bytestring.dat to octets-escaped.dat
author: Mark Pulford <mark@kyne.com.au> 2011-05-08 20:26:09 +0930
committer: Mark Pulford <mark@kyne.com.au> 2011-05-08 20:26:09 +0930
commit: 4dc56c6d362f2cd8a79d83369f0b852df07dae3f (patch)
tree: d51d3470a396c7981871b4f6fe4fd331e180db83
parent: eeebeda88e62fefa87c71d616d5719782bdaa45a (diff)
download: lua-cjson-4dc56c6d362f2cd8a79d83369f0b852df07dae3f.tar.gz
lua-cjson-4dc56c6d362f2cd8a79d83369f0b852df07dae3f.tar.bz2
lua-cjson-4dc56c6d362f2cd8a79d83369f0b852df07dae3f.zip
5 files changed, 149 insertions, 11 deletions
diff --git a/lua_cjson.c b/lua_cjson.c
index 3af8157..52b259d 100644
--- a/lua_cjson.c
+++ b/lua_cjson.c
@@ -680,19 +680,24 @@ static int decode_hex4(const char *hex)
            digit[3];
 }
+/* Converts a Unicode codepoint to UTF-8.
+ * Returns UTF-8 string length, and up to 4 bytes in *utf8 */
 static int codepoint_to_utf8(char *utf8, int codepoint)
 {
+    /* 0xxxxxxx */
    if (codepoint <= 0x7F) {
        utf8[0] = codepoint;
        return 1;
    }
    
+    /* 110xxxxx 10xxxxxx */
    if (codepoint <= 0x7FF) {
        utf8[0] = (codepoint >> 6) | 0xC0;
        utf8[1] = (codepoint & 0x3F) | 0x80;
        return 2;
    }
+    /* 1110xxxx 10xxxxxx 10xxxxxx */
    if (codepoint <= 0xFFFF) {
        utf8[0] = (codepoint >> 12) | 0xE0;
        utf8[1] = ((codepoint >> 6) & 0x3F) | 0x80;
@@ -700,11 +705,20 @@ static int codepoint_to_utf8(char *utf8, int codepoint)
        return 3;
    }
+    /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+    if (codepoint <= 0x1FFFFF) {
+        utf8[0] = (codepoint >> 18) | 0xF0;
+        utf8[1] = ((codepoint >> 12) & 0x3F) | 0x80;
+        utf8[2] = ((codepoint >> 6) & 0x3F) | 0x80;
+        utf8[3] = (codepoint & 0x3F) | 0x80;
+        return 4;
+    }
    return 0;
 }
-/* Called when index pointing to beginning of UCS-2 hex code: \uXXXX
+/* Called when index pointing to beginning of UTF-16 code escape: \uXXXX
 * \u is guaranteed to exist, but the remaining hex characters may be
 * missing.
 * Translate to UTF-8 and append to temporary token string.
@@ -714,25 +728,58 @@ static int codepoint_to_utf8(char *utf8, int codepoint)
 */
 static int json_append_unicode_escape(json_parse_t *json)
 {
-    char utf8[4];       /* 3 bytes of UTF-8 can handle UCS-2 */
+    char utf8[4];       /* Surrogate pairs require 4 UTF-8 bytes */
    int codepoint;
+    int surrogate_low;
    int len;
+    int escape_len = 6;
-    /* Fetch UCS-2 codepoint */
+    /* Fetch UTF-16 code unit */
    codepoint = decode_hex4(&json->data[json->index + 2]);
-    if (codepoint < 0) {
+    if (codepoint < 0)
        return -1;
+    /* UTF-16 surrogate pairs take the following 2 byte form:
+     *      11011 x yyyyyyyyyy
+     * When x = 0: y is the high 10 bits of the codepoint
+     *      x = 1: y is the low 10 bits of the codepoint
+     *
+     * Check for a surrogate pair (high or low) */
+    if ((codepoint & 0xF800) == 0xD800) {
+        /* Error if the 1st surrogate is not high */
+        if (codepoint & 0x400)
+            return -1;
+        /* Ensure the next code is a unicode escape */
+        if (json->data[json->index + escape_len] != '\\' ||
+            json->data[json->index + escape_len + 1] != 'u') {
+            return -1;
+        }
+        /* Fetch the next codepoint */
+        surrogate_low = decode_hex4(&json->data[json->index + 2 + escape_len]);
+        if (surrogate_low < 0)
+            return -1;
+        /* Error if the 2nd code is not a low surrogate */
+        if ((surrogate_low & 0xFC00) != 0xDC00)
+            return -1;
+        /* Calculate Unicode codepoint */
+        codepoint = (codepoint & 0x3FF) << 10;
+        surrogate_low &= 0x3FF;
+        codepoint = (codepoint | surrogate_low) + 0x10000;
+        escape_len = 12;
    }
-    /* Convert to UTF-8 */
+    /* Convert codepoint to UTF-8 */
    len = codepoint_to_utf8(utf8, codepoint);
-    if (!len) {
+    if (!len)
        return -1;
-    }
-    /* Append bytes and advance index */
+    /* Append bytes and advance parse index */
    strbuf_append_mem_unsafe(json->tmp, utf8, len);
-    json->index += 6;
+    json->index += escape_len;
    return 0;
 }
diff --git a/tests/common.lua b/tests/common.lua
index 9a7ed19..b8ce01d 100644
--- a/tests/common.lua
+++ b/tests/common.lua
@@ -99,6 +99,10 @@ function file_load(filename)
    local data = file:read("*a")
    file:close()
+    if data == nil then
+        error("Failed to read " .. filename)
+    end
    return data
 end
diff --git a/tests/genutf8.pl b/tests/genutf8.pl
new file mode 100755
index 0000000..4960663
--- /dev/null
+++ b/tests/genutf8.pl
@@ -0,0 +1,25 @@
+#!/usr/bin/perl -w
+# Create test comparison data using a different UTF-8 implementation.
+use strict;
+use Text::Iconv;
+use FileHandle;
+# 0xD800 - 0xDFFF are used to encode supplementary codepoints
+# 0x10000 - 0x10FFFF are supplementary codepoints
+my (@codepoints) = (0 .. 0xD7FF, 0xE000 .. 0x10FFFF);
+my ($utf32be) = pack("N*", @codepoints);
+my $iconv = Text::Iconv->new("UTF-32BE", "UTF-8");
+my ($utf8) = $iconv->convert($utf32be);
+defined($utf8) or die "Unable create UTF-8 string\n";
+my $fh = FileHandle->new();
+$fh->open("utf8.dat", ">")
+    or die "Unable to open utf8.dat: $!\n";
+$fh->print($utf8)
+    or die "Unable to write utf.dat\n";
+$fh->close();
+# vi:ai et sw=4 ts=4:
diff --git a/tests/bytestring.dat b/tests/octets-escaped.dat
index ee99a6b..ee99a6b 100644
--- a/tests/bytestring.dat
+++ b/tests/octets-escaped.dat
diff --git a/tests/test.lua b/tests/test.lua
index 9075bab..0e0aad8 100755
--- a/tests/test.lua
+++ b/tests/test.lua
@@ -3,6 +3,8 @@
 -- CJSON tests
 --
 -- Mark Pulford <mark@kyne.com.au>
+--
+-- Note: The output of this script is easier to read with "less -S"
 require "common"
 local json = require "cjson"
@@ -95,13 +97,73 @@ local function gen_ascii()
    return table.concat(chars)
 end
+-- Generate every UTF-16 codepoint, including supplementary codes
+local function gen_utf16_escaped()
+    -- Create raw table escapes
+    local utf16_escaped = {}
+    local count = 0
+    local function append_escape(code)
+        local esc = string.format('\\u%04X', code)
+        table.insert(utf16_escaped, esc)
+    end
+    table.insert(utf16_escaped, '"')
+    for i = 0, 0xD7FF do
+        append_escape(i)
+    end
+    -- Skip 0xD800 - 0xDFFF since they are used to encode supplementary
+    -- codepoints
+    for i = 0xE000, 0xFFFF do
+        append_escape(i)
+    end
+    -- Append surrogate pair for each supplementary codepoint
+    for high = 0xD800, 0xDBFF do
+        for low = 0xDC00, 0xDFFF do
+            append_escape(high)
+            append_escape(low)
+        end
+    end
+    table.insert(utf16_escaped, '"')
+   
+    return table.concat(utf16_escaped)
+end
 local octets_raw = gen_ascii()
-local octets_escaped = file_load("bytestring.dat")
+local octets_escaped = file_load("octets-escaped.dat")
+local utf8_loaded, utf8_raw = pcall(file_load, "utf8.dat")
+if not utf8_loaded then
+    utf8_raw = "Failed to load utf8.dat"
+end
+local utf16_escaped = gen_utf16_escaped()
 local escape_tests = {
+    -- Test 8bit clean
    { json.encode, { octets_raw }, true, { octets_escaped } },
-    { json.decode, { octets_escaped }, true, { octets_raw } }
+    { json.decode, { octets_escaped }, true, { octets_raw } },
+    -- Ensure high bits are removed from surrogate codes
+    { json.decode, { '"\\uF800"' }, true, { "\239\160\128" } },
+    -- Test inverted surrogate pairs
+    { json.decode, { '"\\uDB00\\uD800"' },
+      false, { "Expected value but found invalid unicode escape code at character 2" } },
+    -- Test 2x high surrogate code units
+    { json.decode, { '"\\uDB00\\uDB00"' },
+      false, { "Expected value but found invalid unicode escape code at character 2" } },
+    -- Test invalid 2nd escape
+    { json.decode, { '"\\uDB00\\"' },
+      false, { "Expected value but found invalid unicode escape code at character 2" } },
+    { json.decode, { '"\\uDB00\\uD"' },
+      false, { "Expected value but found invalid unicode escape code at character 2" } },
+    -- Test decoding of all UTF-16 escapes
+    { json.decode, { utf16_escaped }, true, { utf8_raw } }
 }
+function test_decode_cycle(filename)
+    local obj1 = json.decode(file_load(filename))
+    local obj2 = json.decode(json.encode(obj1))
+    return compare_values(obj1, obj2)
+end
 run_test_group("decode simple value", simple_value_tests)
 run_test_group("decode numeric", numeric_tests)
author	Mark Pulford <mark@kyne.com.au>	2011-05-08 20:26:09 +0930
committer	Mark Pulford <mark@kyne.com.au>	2011-05-08 20:26:09 +0930
commit	4dc56c6d362f2cd8a79d83369f0b852df07dae3f (patch)
tree	d51d3470a396c7981871b4f6fe4fd331e180db83
parent	eeebeda88e62fefa87c71d616d5719782bdaa45a (diff)
download	lua-cjson-4dc56c6d362f2cd8a79d83369f0b852df07dae3f.tar.gz lua-cjson-4dc56c6d362f2cd8a79d83369f0b852df07dae3f.tar.bz2 lua-cjson-4dc56c6d362f2cd8a79d83369f0b852df07dae3f.zip

diff --git a/lua_cjson.c b/lua_cjson.c index 3af8157..52b259d 100644 --- a/lua_cjson.c +++ b/lua_cjson.c
@@ -680,19 +680,24 @@ static int decode_hex4(const char *hex)
680	digit[3];	680	digit[3];
681	}	681	}
682		682
		683	/* Converts a Unicode codepoint to UTF-8.
		684	* Returns UTF-8 string length, and up to 4 bytes in *utf8 */
683	static int codepoint_to_utf8(char *utf8, int codepoint)	685	static int codepoint_to_utf8(char *utf8, int codepoint)
684	{	686	{
		687	/* 0xxxxxxx */
685	if (codepoint <= 0x7F) {	688	if (codepoint <= 0x7F) {
686	utf8[0] = codepoint;	689	utf8[0] = codepoint;
687	return 1;	690	return 1;
688	}	691	}
689		692
		693	/* 110xxxxx 10xxxxxx */
690	if (codepoint <= 0x7FF) {	694	if (codepoint <= 0x7FF) {
691	utf8[0] = (codepoint >> 6) \| 0xC0;	695	utf8[0] = (codepoint >> 6) \| 0xC0;
692	utf8[1] = (codepoint & 0x3F) \| 0x80;	696	utf8[1] = (codepoint & 0x3F) \| 0x80;
693	return 2;	697	return 2;
694	}	698	}
695		699
		700	/* 1110xxxx 10xxxxxx 10xxxxxx */
696	if (codepoint <= 0xFFFF) {	701	if (codepoint <= 0xFFFF) {
697	utf8[0] = (codepoint >> 12) \| 0xE0;	702	utf8[0] = (codepoint >> 12) \| 0xE0;
698	utf8[1] = ((codepoint >> 6) & 0x3F) \| 0x80;	703	utf8[1] = ((codepoint >> 6) & 0x3F) \| 0x80;
@@ -700,11 +705,20 @@ static int codepoint_to_utf8(char *utf8, int codepoint)
700	return 3;	705	return 3;
701	}	706	}
702		707
		708	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		709	if (codepoint <= 0x1FFFFF) {
		710	utf8[0] = (codepoint >> 18) \| 0xF0;
		711	utf8[1] = ((codepoint >> 12) & 0x3F) \| 0x80;
		712	utf8[2] = ((codepoint >> 6) & 0x3F) \| 0x80;
		713	utf8[3] = (codepoint & 0x3F) \| 0x80;
		714	return 4;
		715	}
		716
703	return 0;	717	return 0;
704	}	718	}
705		719
706		720
707	/* Called when index pointing to beginning of UCS-2 hex code: \uXXXX	721	/* Called when index pointing to beginning of UTF-16 code escape: \uXXXX
708	* \u is guaranteed to exist, but the remaining hex characters may be	722	* \u is guaranteed to exist, but the remaining hex characters may be
709	* missing.	723	* missing.
710	* Translate to UTF-8 and append to temporary token string.	724	* Translate to UTF-8 and append to temporary token string.
@@ -714,25 +728,58 @@ static int codepoint_to_utf8(char *utf8, int codepoint)
714	*/	728	*/
715	static int json_append_unicode_escape(json_parse_t *json)	729	static int json_append_unicode_escape(json_parse_t *json)
716	{	730	{
717	char utf8[4]; /* 3 bytes of UTF-8 can handle UCS-2 */	731	char utf8[4]; /* Surrogate pairs require 4 UTF-8 bytes */
718	int codepoint;	732	int codepoint;
		733	int surrogate_low;
719	int len;	734	int len;
		735	int escape_len = 6;
720		736
721	/* Fetch UCS-2 codepoint */	737	/* Fetch UTF-16 code unit */
722	codepoint = decode_hex4(&json->data[json->index + 2]);	738	codepoint = decode_hex4(&json->data[json->index + 2]);
723	if (codepoint < 0) {	739	if (codepoint < 0)
724	return -1;	740	return -1;
		741
		742	/* UTF-16 surrogate pairs take the following 2 byte form:
		743	* 11011 x yyyyyyyyyy
		744	* When x = 0: y is the high 10 bits of the codepoint
		745	* x = 1: y is the low 10 bits of the codepoint
		746	*
		747	* Check for a surrogate pair (high or low) */
		748	if ((codepoint & 0xF800) == 0xD800) {
		749	/* Error if the 1st surrogate is not high */
		750	if (codepoint & 0x400)
		751	return -1;
		752
		753	/* Ensure the next code is a unicode escape */
		754	if (json->data[json->index + escape_len] != '\\' \|\|
		755	json->data[json->index + escape_len + 1] != 'u') {
		756	return -1;
		757	}
		758
		759	/* Fetch the next codepoint */
		760	surrogate_low = decode_hex4(&json->data[json->index + 2 + escape_len]);
		761	if (surrogate_low < 0)
		762	return -1;
		763
		764	/* Error if the 2nd code is not a low surrogate */
		765	if ((surrogate_low & 0xFC00) != 0xDC00)
		766	return -1;
		767
		768	/* Calculate Unicode codepoint */
		769	codepoint = (codepoint & 0x3FF) << 10;
		770	surrogate_low &= 0x3FF;
		771	codepoint = (codepoint \| surrogate_low) + 0x10000;
		772	escape_len = 12;
725	}	773	}
726		774
727	/* Convert to UTF-8 */	775	/* Convert codepoint to UTF-8 */
728	len = codepoint_to_utf8(utf8, codepoint);	776	len = codepoint_to_utf8(utf8, codepoint);
729	if (!len) {	777	if (!len)
730	return -1;	778	return -1;
731	}
732		779
733	/* Append bytes and advance index */	780	/* Append bytes and advance parse index */
734	strbuf_append_mem_unsafe(json->tmp, utf8, len);	781	strbuf_append_mem_unsafe(json->tmp, utf8, len);
735	json->index += 6;	782	json->index += escape_len;
736		783
737	return 0;	784	return 0;
738	}	785	}


diff --git a/tests/common.lua b/tests/common.lua index 9a7ed19..b8ce01d 100644 --- a/tests/common.lua +++ b/tests/common.lua
@@ -99,6 +99,10 @@ function file_load(filename)
99	local data = file:read("*a")	99	local data = file:read("*a")
100	file:close()	100	file:close()
101		101
		102	if data == nil then
		103	error("Failed to read " .. filename)
		104	end
		105
102	return data	106	return data
103	end	107	end
104		108


diff --git a/tests/genutf8.pl b/tests/genutf8.pl new file mode 100755 index 0000000..4960663 --- /dev/null +++ b/tests/genutf8.pl
@@ -0,0 +1,25 @@
		1	#!/usr/bin/perl -w
		2
		3	# Create test comparison data using a different UTF-8 implementation.
		4
		5	use strict;
		6	use Text::Iconv;
		7	use FileHandle;
		8
		9	# 0xD800 - 0xDFFF are used to encode supplementary codepoints
		10	# 0x10000 - 0x10FFFF are supplementary codepoints
		11	my (@codepoints) = (0 .. 0xD7FF, 0xE000 .. 0x10FFFF);
		12
		13	my ($utf32be) = pack("N*", @codepoints);
		14	my $iconv = Text::Iconv->new("UTF-32BE", "UTF-8");
		15	my ($utf8) = $iconv->convert($utf32be);
		16	defined($utf8) or die "Unable create UTF-8 string\n";
		17
		18	my $fh = FileHandle->new();
		19	$fh->open("utf8.dat", ">")
		20	or die "Unable to open utf8.dat: $!\n";
		21	$fh->print($utf8)
		22	or die "Unable to write utf.dat\n";
		23	$fh->close();
		24
		25	# vi:ai et sw=4 ts=4:


diff --git a/tests/bytestring.dat b/tests/octets-escaped.dat index ee99a6b..ee99a6b 100644 --- a/tests/bytestring.dat +++ b/tests/octets-escaped.dat


diff --git a/tests/test.lua b/tests/test.lua index 9075bab..0e0aad8 100755 --- a/tests/test.lua +++ b/tests/test.lua
@@ -3,6 +3,8 @@
3	-- CJSON tests	3	-- CJSON tests
4	--	4	--
5	-- Mark Pulford <mark@kyne.com.au>	5	-- Mark Pulford <mark@kyne.com.au>
		6	--
		7	-- Note: The output of this script is easier to read with "less -S"
6		8
7	require "common"	9	require "common"
8	local json = require "cjson"	10	local json = require "cjson"
@@ -95,13 +97,73 @@ local function gen_ascii()
95	return table.concat(chars)	97	return table.concat(chars)
96	end	98	end
97		99
		100	-- Generate every UTF-16 codepoint, including supplementary codes
		101	local function gen_utf16_escaped()
		102	-- Create raw table escapes
		103	local utf16_escaped = {}
		104	local count = 0
		105
		106	local function append_escape(code)
		107	local esc = string.format('\\u%04X', code)
		108	table.insert(utf16_escaped, esc)
		109	end
		110
		111	table.insert(utf16_escaped, '"')
		112	for i = 0, 0xD7FF do
		113	append_escape(i)
		114	end
		115	-- Skip 0xD800 - 0xDFFF since they are used to encode supplementary
		116	-- codepoints
		117	for i = 0xE000, 0xFFFF do
		118	append_escape(i)
		119	end
		120	-- Append surrogate pair for each supplementary codepoint
		121	for high = 0xD800, 0xDBFF do
		122	for low = 0xDC00, 0xDFFF do
		123	append_escape(high)
		124	append_escape(low)
		125	end
		126	end
		127	table.insert(utf16_escaped, '"')
		128
		129	return table.concat(utf16_escaped)
		130	end
		131
98	local octets_raw = gen_ascii()	132	local octets_raw = gen_ascii()
99	local octets_escaped = file_load("bytestring.dat")	133	local octets_escaped = file_load("octets-escaped.dat")
		134	local utf8_loaded, utf8_raw = pcall(file_load, "utf8.dat")
		135	if not utf8_loaded then
		136	utf8_raw = "Failed to load utf8.dat"
		137	end
		138	local utf16_escaped = gen_utf16_escaped()
		139
100	local escape_tests = {	140	local escape_tests = {
		141	-- Test 8bit clean
101	{ json.encode, { octets_raw }, true, { octets_escaped } },	142	{ json.encode, { octets_raw }, true, { octets_escaped } },
102	{ json.decode, { octets_escaped }, true, { octets_raw } }	143	{ json.decode, { octets_escaped }, true, { octets_raw } },
		144	-- Ensure high bits are removed from surrogate codes
		145	{ json.decode, { '"\\uF800"' }, true, { "\239\160\128" } },
		146	-- Test inverted surrogate pairs
		147	{ json.decode, { '"\\uDB00\\uD800"' },
		148	false, { "Expected value but found invalid unicode escape code at character 2" } },
		149	-- Test 2x high surrogate code units
		150	{ json.decode, { '"\\uDB00\\uDB00"' },
		151	false, { "Expected value but found invalid unicode escape code at character 2" } },
		152	-- Test invalid 2nd escape
		153	{ json.decode, { '"\\uDB00\\"' },
		154	false, { "Expected value but found invalid unicode escape code at character 2" } },
		155	{ json.decode, { '"\\uDB00\\uD"' },
		156	false, { "Expected value but found invalid unicode escape code at character 2" } },
		157	-- Test decoding of all UTF-16 escapes
		158	{ json.decode, { utf16_escaped }, true, { utf8_raw } }
103	}	159	}
104		160
		161	function test_decode_cycle(filename)
		162	local obj1 = json.decode(file_load(filename))
		163	local obj2 = json.decode(json.encode(obj1))
		164	return compare_values(obj1, obj2)
		165	end
		166
105	run_test_group("decode simple value", simple_value_tests)	167	run_test_group("decode simple value", simple_value_tests)
106	run_test_group("decode numeric", numeric_tests)	168	run_test_group("decode numeric", numeric_tests)
107		169