| author | Mark Pulford <mark@kyne.com.au> | 2011-05-08 20:26:09 +0930 |
|---|---|---|
| committer | Mark Pulford <mark@kyne.com.au> | 2011-05-08 20:26:09 +0930 |
| commit | 4dc56c6d362f2cd8a79d83369f0b852df07dae3f (patch) | |
| tree | d51d3470a396c7981871b4f6fe4fd331e180db83 /tests/test.lua | |
| parent | eeebeda88e62fefa87c71d616d5719782bdaa45a (diff) | |
Add UTF-16 surrogate pair decode support
- Add tests for UTF-16 decoding and failures
- Add getutf8.pl to assist with UTF-16 decode testing
- Re-add test_decode_cycle() which was accidentally removed earlier
- Rename bytestring.dat to octets-escaped.dat
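The surrogate-pair handling being tested works, in outline, like this: a supplementary codepoint (U+10000 and above) appears in JSON as a high surrogate escape (\uD800–\uDBFF) followed by a low surrogate escape (\uDC00–\uDFFF). The Lua sketch below is not taken from the lua-cjson C decoder; it is a minimal illustration of that arithmetic, with hypothetical helper names, to show what the new tests exercise.

```lua
-- Minimal sketch (not the lua-cjson implementation): combine a UTF-16
-- surrogate pair into a supplementary codepoint and encode it as UTF-8.
local function surrogate_pair_to_codepoint(high, low)
    -- high must be 0xD800-0xDBFF, low must be 0xDC00-0xDFFF
    return 0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00)
end

local function codepoint_to_utf8(cp)
    -- Supplementary codepoints always need a 4 octet UTF-8 sequence
    local f = math.floor
    return string.char(0xF0 + f(cp / 0x40000),
                       0x80 + f(cp / 0x1000) % 0x40,
                       0x80 + f(cp / 0x40) % 0x40,
                       0x80 + cp % 0x40)
end

-- '"\uD83D\uDE00"' should decode to U+1F600 (4 UTF-8 octets)
local cp = surrogate_pair_to_codepoint(0xD83D, 0xDE00)
print(string.format("U+%X", cp))        --> U+1F600
print(#codepoint_to_utf8(cp))           --> 4
```

The gen_utf16_escaped() helper added in this patch generates every such high/low combination (1024 × 1024 = 1,048,576 pairs), so the final test decodes the full supplementary range in a single string.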
Diffstat (limited to 'tests/test.lua')

| mode | path | lines changed |
|---|---|---|
| -rwxr-xr-x | tests/test.lua | 66 |

1 file changed, 64 insertions, 2 deletions
```diff
diff --git a/tests/test.lua b/tests/test.lua
index 9075bab..0e0aad8 100755
--- a/tests/test.lua
+++ b/tests/test.lua
@@ -3,6 +3,8 @@
 -- CJSON tests
 --
 -- Mark Pulford <mark@kyne.com.au>
+--
+-- Note: The output of this script is easier to read with "less -S"
 
 require "common"
 local json = require "cjson"
@@ -95,13 +97,73 @@ local function gen_ascii()
     return table.concat(chars)
 end
 
+-- Generate every UTF-16 codepoint, including supplementary codes
+local function gen_utf16_escaped()
+    -- Create raw table escapes
+    local utf16_escaped = {}
+    local count = 0
+
+    local function append_escape(code)
+        local esc = string.format('\\u%04X', code)
+        table.insert(utf16_escaped, esc)
+    end
+
+    table.insert(utf16_escaped, '"')
+    for i = 0, 0xD7FF do
+        append_escape(i)
+    end
+    -- Skip 0xD800 - 0xDFFF since they are used to encode supplementary
+    -- codepoints
+    for i = 0xE000, 0xFFFF do
+        append_escape(i)
+    end
+    -- Append surrogate pair for each supplementary codepoint
+    for high = 0xD800, 0xDBFF do
+        for low = 0xDC00, 0xDFFF do
+            append_escape(high)
+            append_escape(low)
+        end
+    end
+    table.insert(utf16_escaped, '"')
+
+    return table.concat(utf16_escaped)
+end
+
 local octets_raw = gen_ascii()
-local octets_escaped = file_load("bytestring.dat")
+local octets_escaped = file_load("octets-escaped.dat")
+local utf8_loaded, utf8_raw = pcall(file_load, "utf8.dat")
+if not utf8_loaded then
+    utf8_raw = "Failed to load utf8.dat"
+end
+local utf16_escaped = gen_utf16_escaped()
+
 local escape_tests = {
+    -- Test 8bit clean
     { json.encode, { octets_raw }, true, { octets_escaped } },
-    { json.decode, { octets_escaped }, true, { octets_raw } }
+    { json.decode, { octets_escaped }, true, { octets_raw } },
+    -- Ensure high bits are removed from surrogate codes
+    { json.decode, { '"\\uF800"' }, true, { "\239\160\128" } },
+    -- Test inverted surrogate pairs
+    { json.decode, { '"\\uDB00\\uD800"' },
+      false, { "Expected value but found invalid unicode escape code at character 2" } },
+    -- Test 2x high surrogate code units
+    { json.decode, { '"\\uDB00\\uDB00"' },
+      false, { "Expected value but found invalid unicode escape code at character 2" } },
+    -- Test invalid 2nd escape
+    { json.decode, { '"\\uDB00\\"' },
+      false, { "Expected value but found invalid unicode escape code at character 2" } },
+    { json.decode, { '"\\uDB00\\uD"' },
+      false, { "Expected value but found invalid unicode escape code at character 2" } },
+    -- Test decoding of all UTF-16 escapes
+    { json.decode, { utf16_escaped }, true, { utf8_raw } }
 }
 
+function test_decode_cycle(filename)
+    local obj1 = json.decode(file_load(filename))
+    local obj2 = json.decode(json.encode(obj1))
+    return compare_values(obj1, obj2)
+end
+
 run_test_group("decode simple value", simple_value_tests)
 run_test_group("decode numeric", numeric_tests)
 
```
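Assuming a build of the module that includes this commit, the new behaviour can be exercised directly from Lua; the expected error text in the comment below is the one the failure cases in escape_tests check for.

```lua
local json = require "cjson"

-- A well-formed surrogate pair decodes to one supplementary codepoint,
-- emitted as four UTF-8 octets (here U+1F600).
local s = json.decode('"\\uD83D\\uDE00"')
print(#s)   --> 4

-- Inverted, repeated or truncated surrogates are rejected, matching
-- the failure cases in escape_tests.
local ok, err = pcall(json.decode, '"\\uDB00\\uD800"')
print(ok, err)
--> false   Expected value but found invalid unicode escape code at character 2
```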
