From 4dc56c6d362f2cd8a79d83369f0b852df07dae3f Mon Sep 17 00:00:00 2001 From: Mark Pulford Date: Sun, 8 May 2011 20:26:09 +0930 Subject: Add UTF-16 surrogate pair decode support - Add tests for UTF-16 decoding and failures - Add genutf8.pl to assist with UTF-16 decode testing - Re-add test_decode_cycle() which was accidentally removed earlier - Rename bytestring.dat to octets-escaped.dat --- tests/bytestring.dat | 1 - tests/common.lua | 4 +++ tests/genutf8.pl | 25 ++++++++++++++++++ tests/octets-escaped.dat | 1 + tests/test.lua | 66 ++++++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 94 insertions(+), 3 deletions(-) delete mode 100644 tests/bytestring.dat create mode 100755 tests/genutf8.pl create mode 100644 tests/octets-escaped.dat (limited to 'tests') diff --git a/tests/bytestring.dat b/tests/bytestring.dat deleted file mode 100644 index ee99a6b..0000000 --- a/tests/bytestring.dat +++ /dev/null @@ -1 +0,0 @@ -"\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007\b\t\n\u000b\f\r\u000e\u000f\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001a\u001b\u001c\u001d\u001e\u001f !\"#$%&'()*+,-.\/0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007f" \ No newline at end of file diff --git a/tests/common.lua b/tests/common.lua index 9a7ed19..b8ce01d 100644 --- a/tests/common.lua +++ b/tests/common.lua @@ -99,6 +99,10 @@ function file_load(filename) local data = file:read("*a") file:close() + if data == nil then + error("Failed to read " .. filename) + end + return data end diff --git a/tests/genutf8.pl b/tests/genutf8.pl new file mode 100755 index 0000000..4960663 --- /dev/null +++ b/tests/genutf8.pl @@ -0,0 +1,25 @@ +#!/usr/bin/perl -w + +# Create test comparison data using a different UTF-8 implementation. + +use strict; +use Text::Iconv; +use FileHandle; + +# 0xD800 - 0xDFFF are used to encode supplementary codepoints +# 0x10000 - 0x10FFFF are supplementary codepoints +my (@codepoints) = (0 .. 0xD7FF, 0xE000 .. 
0x10FFFF); + +my ($utf32be) = pack("N*", @codepoints); +my $iconv = Text::Iconv->new("UTF-32BE", "UTF-8"); +my ($utf8) = $iconv->convert($utf32be); +defined($utf8) or die "Unable create UTF-8 string\n"; + +my $fh = FileHandle->new(); +$fh->open("utf8.dat", ">") + or die "Unable to open utf8.dat: $!\n"; +$fh->print($utf8) + or die "Unable to write utf.dat\n"; +$fh->close(); + +# vi:ai et sw=4 ts=4: diff --git a/tests/octets-escaped.dat b/tests/octets-escaped.dat new file mode 100644 index 0000000..ee99a6b --- /dev/null +++ b/tests/octets-escaped.dat @@ -0,0 +1 @@ +"\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007\b\t\n\u000b\f\r\u000e\u000f\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001a\u001b\u001c\u001d\u001e\u001f !\"#$%&'()*+,-.\/0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007f" \ No newline at end of file diff --git a/tests/test.lua b/tests/test.lua index 9075bab..0e0aad8 100755 --- a/tests/test.lua +++ b/tests/test.lua @@ -3,6 +3,8 @@ -- CJSON tests -- -- Mark Pulford +-- +-- Note: The output of this script is easier to read with "less -S" require "common" local json = require "cjson" @@ -95,13 +97,73 @@ local function gen_ascii() return table.concat(chars) end +-- Generate every UTF-16 codepoint, including supplementary codes +local function gen_utf16_escaped() + -- Create raw table escapes + local utf16_escaped = {} + local count = 0 + + local function append_escape(code) + local esc = string.format('\\u%04X', code) + table.insert(utf16_escaped, esc) + end + + table.insert(utf16_escaped, '"') + for i = 0, 0xD7FF do + append_escape(i) + end + -- Skip 0xD800 - 0xDFFF since they are used to encode supplementary + -- codepoints + for i = 0xE000, 0xFFFF do + append_escape(i) + end + -- Append surrogate pair for each supplementary codepoint + for high = 0xD800, 0xDBFF do + for low = 0xDC00, 0xDFFF do + append_escape(high) + append_escape(low) + end + end + table.insert(utf16_escaped, '"') + + return 
table.concat(utf16_escaped) +end + local octets_raw = gen_ascii() -local octets_escaped = file_load("bytestring.dat") +local octets_escaped = file_load("octets-escaped.dat") +local utf8_loaded, utf8_raw = pcall(file_load, "utf8.dat") +if not utf8_loaded then + utf8_raw = "Failed to load utf8.dat" +end +local utf16_escaped = gen_utf16_escaped() + local escape_tests = { + -- Test 8bit clean { json.encode, { octets_raw }, true, { octets_escaped } }, - { json.decode, { octets_escaped }, true, { octets_raw } } + { json.decode, { octets_escaped }, true, { octets_raw } }, + -- Ensure high bits are removed from surrogate codes + { json.decode, { '"\\uF800"' }, true, { "\239\160\128" } }, + -- Test inverted surrogate pairs + { json.decode, { '"\\uDB00\\uD800"' }, + false, { "Expected value but found invalid unicode escape code at character 2" } }, + -- Test 2x high surrogate code units + { json.decode, { '"\\uDB00\\uDB00"' }, + false, { "Expected value but found invalid unicode escape code at character 2" } }, + -- Test invalid 2nd escape + { json.decode, { '"\\uDB00\\"' }, + false, { "Expected value but found invalid unicode escape code at character 2" } }, + { json.decode, { '"\\uDB00\\uD"' }, + false, { "Expected value but found invalid unicode escape code at character 2" } }, + -- Test decoding of all UTF-16 escapes + { json.decode, { utf16_escaped }, true, { utf8_raw } } } +function test_decode_cycle(filename) + local obj1 = json.decode(file_load(filename)) + local obj2 = json.decode(json.encode(obj1)) + return compare_values(obj1, obj2) +end + run_test_group("decode simple value", simple_value_tests) run_test_group("decode numeric", numeric_tests) -- cgit v1.2.3-55-g6feb