exclude more invalid unicode chars

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
author: Denys Vlasenko <vda.linux@googlemail.com> 2010-01-31 16:04:30 +0100
committer: Denys Vlasenko <vda.linux@googlemail.com> 2010-01-31 16:04:30 +0100
commit: 40e4e88a28398c49d326b0fdf0d7f100f08b8f8d (patch)
tree: 89e7c1880d057393ee6a5596bee77d802b882c3f /libbb
parent: 344a44fbc5a236a06d840e7776ccbcc4702efa7f (diff)
download: busybox-w32-40e4e88a28398c49d326b0fdf0d7f100f08b8f8d.tar.gz
busybox-w32-40e4e88a28398c49d326b0fdf0d7f100f08b8f8d.tar.bz2
busybox-w32-40e4e88a28398c49d326b0fdf0d7f100f08b8f8d.zip
1 files changed, 63 insertions, 9 deletions
diff --git a/libbb/unicode_wcwidth.c b/libbb/unicode_wcwidth.c
index ab62b18f6..410c741ac 100644
--- a/libbb/unicode_wcwidth.c
+++ b/libbb/unicode_wcwidth.c
@@ -59,8 +59,39 @@
 * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
 */
-#if CONFIG_LAST_SUPPORTED_WCHAR == 0
+/* Assigned Unicode character ranges:
-# define LAST_SUPPORTED_WCHAR ((1 << 31) - 1)
+ * Plane Range
+ * 0       0000–FFFF   Basic Multilingual Plane
+ * 1      10000–1FFFF  Supplementary Multilingual Plane
+ * 2      20000–2FFFF  Supplementary Ideographic Plane
+ * 3      30000-3FFFF  Tertiary Ideographic Plane (no chars assigned yet)
+ * 4-13   40000–DFFFF  currently unassigned
+ * 14     E0000–EFFFF  Supplementary Special-purpose Plane
+ * 15     F0000–FFFFF  Supplementary Private Use Area-A
+ * 16    100000–10FFFF Supplementary Private Use Area-B
+ *
+ * "Supplementary Special-purpose Plane currently contains non-graphical
+ * characters in two blocks of 128 and 240 characters. The first block
+ * is for language tag characters for use when language cannot be indicated
+ * through other protocols (such as the xml:lang attribute in XML).
+ * The other block contains glyph variation selectors to indicate
+ * an alternate glyph for a character that cannot be determined by context."
+ *
+ * In simpler terms: it is a tool to fix the "Han unification" mess
+ * created by the Unicode committee, to select the Chinese/Japanese/Korean/Taiwanese
+ * version of a character. (They forgot that the whole purpose of Unicode
+ * was to be able to write all chars in one charset without such tricks).
+ * Until East Asian users say it is actually necessary to support these
+ * code points in console applications like busybox
+ * (i.e. do these chars ever appear in filenames, hostnames, text files
+ * and such?), we are treating these code points as invalid.
+ *
+ * The Tertiary Ideographic Plane is also ignored for now,
+ * until the Unicode committee assigns something there.
+ */
+#if CONFIG_LAST_SUPPORTED_WCHAR < 126 || CONFIG_LAST_SUPPORTED_WCHAR > 0x30000
+# define LAST_SUPPORTED_WCHAR 0x30000
 #else
 # define LAST_SUPPORTED_WCHAR CONFIG_LAST_SUPPORTED_WCHAR
 #endif
@@ -429,7 +460,8 @@ static int wcwidth(unsigned ucs)
 #undef BIG_
 #undef PAIR
        };
-# if LAST_SUPPORTED_WCHAR >= 0x1100
+# if LAST_SUPPORTED_WCHAR >= 0x10000
+        /* Combining chars in Supplementary Multilingual Plane 0x1xxxx */
        static const struct interval combining0x10000[] = {
                { 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F },
                { 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 },
@@ -462,12 +494,35 @@ static int wcwidth(unsigned ucs)
 # if LAST_SUPPORTED_WCHAR < 0x1100
        return -1;
 # else
-        /* binary search in table of non-spacing characters, cont. */
+        if (ucs >= LAST_SUPPORTED_WCHAR)
+                return -1;
+        /* High (d800..dbff) and low (dc00..dfff) surrogates are invalid (used only by UTF16) */
+        /* We also exclude Private Use Area (e000..f8ff) */
+        if (LAST_SUPPORTED_WCHAR >= 0xd800
+         && (ucs >= 0xd800 || ucs <= 0xf8ff)
+        ) {
+                return -1;
+        }
+        /* 0xfffe and 0xffff in every plane are invalid */
+        if (LAST_SUPPORTED_WCHAR >= 0xfffe
+         && (ucs & 0xfffe) == 0xfffe
+        ) {
+                return -1;
+        }
+#  if LAST_SUPPORTED_WCHAR >= 0x10000
+        /* binary search in table of non-spacing characters in Supplementary Multilingual Plane */
        if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1))
                return 0;
-        if (ucs == 0xE0001
+#  endif
-         || (ucs >= 0xE0020 && ucs <= 0xE007F)
+        /* Check a few non-spacing chars in Supplementary Special-purpose Plane 0xExxxx */
-         || (ucs >= 0xE0100 && ucs <= 0xE01EF)
+        if (LAST_SUPPORTED_WCHAR >= 0xE0001
+         && (  ucs == 0xE0001
+            || (ucs >= 0xE0020 && ucs <= 0xE007F)
+            || (ucs >= 0xE0100 && ucs <= 0xE01EF)
+            )
        ) {
                return 0;
        }
@@ -485,8 +540,7 @@ static int wcwidth(unsigned ucs)
                || (ucs >= 0xfe30 && ucs <= 0xfe6f) /* CJK Compatibility Forms */
                || (ucs >= 0xff00 && ucs <= 0xff60) /* Fullwidth Forms */
                || (ucs >= 0xffe0 && ucs <= 0xffe6)
-                || (ucs >= 0x20000 && ucs <= 0x2fffd)
+                || ((ucs >> 17) == (2 >> 1)) /* 20000..3ffff: Supplementary and Tertiary Ideographic Planes */
-                || (ucs >= 0x30000 && ucs <= 0x3fffd)
                );
 # endif
 #endif
author	Denys Vlasenko <vda.linux@googlemail.com>	2010-01-31 16:04:30 +0100
committer	Denys Vlasenko <vda.linux@googlemail.com>	2010-01-31 16:04:30 +0100
commit	40e4e88a28398c49d326b0fdf0d7f100f08b8f8d (patch)
tree	89e7c1880d057393ee6a5596bee77d802b882c3f /libbb
parent	344a44fbc5a236a06d840e7776ccbcc4702efa7f (diff)
download	busybox-w32-40e4e88a28398c49d326b0fdf0d7f100f08b8f8d.tar.gz busybox-w32-40e4e88a28398c49d326b0fdf0d7f100f08b8f8d.tar.bz2 busybox-w32-40e4e88a28398c49d326b0fdf0d7f100f08b8f8d.zip

diff --git a/libbb/unicode_wcwidth.c b/libbb/unicode_wcwidth.c index ab62b18f6..410c741ac 100644 --- a/libbb/unicode_wcwidth.c +++ b/libbb/unicode_wcwidth.c
@@ -59,8 +59,39 @@
59	* Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c	59	* Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
60	*/	60	*/
61		61
62	#if CONFIG_LAST_SUPPORTED_WCHAR == 0	62	/* Assigned Unicode character ranges:
63	# define LAST_SUPPORTED_WCHAR ((1 << 31) - 1)	63	* Plane Range
		64	* 0 0000–FFFF Basic Multilingual Plane
		65	* 1 10000–1FFFF Supplementary Multilingual Plane
		66	* 2 20000–2FFFF Supplementary Ideographic Plane
		67	* 3 30000-3FFFF Tertiary Ideographic Plane (no chars assigned yet)
		68	* 4-13 40000–DFFFF currently unassigned
		69	* 14 E0000–EFFFF Supplementary Special-purpose Plane
		70	* 15 F0000–FFFFF Supplementary Private Use Area-A
		71	* 16 100000–10FFFF Supplementary Private Use Area-B
		72	*
		73	* "Supplementary Special-purpose Plane currently contains non-graphical
		74	* characters in two blocks of 128 and 240 characters. The first block
		75	* is for language tag characters for use when language cannot be indicated
		76	* through other protocols (such as the xml:lang attribute in XML).
		77	* The other block contains glyph variation selectors to indicate
		78	* an alternate glyph for a character that cannot be determined by context."
		79	*
		80	* In simpler terms: it is a tool to fix the "Han unification" mess
		81	* created by the Unicode committee, to select the Chinese/Japanese/Korean/Taiwanese
		82	* version of a character. (They forgot that the whole purpose of Unicode
		83	* was to be able to write all chars in one charset without such tricks).
		84	* Until East Asian users say it is actually necessary to support these
		85	* code points in console applications like busybox
		86	* (i.e. do these chars ever appear in filenames, hostnames, text files
		87	* and such?), we are treating these code points as invalid.
		88	*
		89	* The Tertiary Ideographic Plane is also ignored for now,
		90	* until the Unicode committee assigns something there.
		91	*/
		92
		93	#if CONFIG_LAST_SUPPORTED_WCHAR < 126 || CONFIG_LAST_SUPPORTED_WCHAR > 0x30000
		94	# define LAST_SUPPORTED_WCHAR 0x30000
64	#else	95	#else
65	# define LAST_SUPPORTED_WCHAR CONFIG_LAST_SUPPORTED_WCHAR	96	# define LAST_SUPPORTED_WCHAR CONFIG_LAST_SUPPORTED_WCHAR
66	#endif	97	#endif
@@ -429,7 +460,8 @@ static int wcwidth(unsigned ucs)
429	#undef BIG_	460	#undef BIG_
430	#undef PAIR	461	#undef PAIR
431	};	462	};
432	# if LAST_SUPPORTED_WCHAR >= 0x1100	463	# if LAST_SUPPORTED_WCHAR >= 0x10000
		464	/* Combining chars in Supplementary Multilingual Plane 0x1xxxx */
433	static const struct interval combining0x10000[] = {	465	static const struct interval combining0x10000[] = {
434	{ 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F },	466	{ 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F },
435	{ 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 },	467	{ 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 },
@@ -462,12 +494,35 @@ static int wcwidth(unsigned ucs)
462	# if LAST_SUPPORTED_WCHAR < 0x1100	494	# if LAST_SUPPORTED_WCHAR < 0x1100
463	return -1;	495	return -1;
464	# else	496	# else
465	/* binary search in table of non-spacing characters, cont. */	497	if (ucs >= LAST_SUPPORTED_WCHAR)
		498	return -1;
		499
		500	/* High (d800..dbff) and low (dc00..dfff) surrogates are invalid (used only by UTF16) */
		501	/* We also exclude Private Use Area (e000..f8ff) */
		502	if (LAST_SUPPORTED_WCHAR >= 0xd800
		503	&& (ucs >= 0xd800 || ucs <= 0xf8ff)
		504	) {
		505	return -1;
		506	}
		507
		508	/* 0xfffe and 0xffff in every plane are invalid */
		509	if (LAST_SUPPORTED_WCHAR >= 0xfffe
		510	&& (ucs & 0xfffe) == 0xfffe
		511	) {
		512	return -1;
		513	}
		514
		515	# if LAST_SUPPORTED_WCHAR >= 0x10000
		516	/* binary search in table of non-spacing characters in Supplementary Multilingual Plane */
466	if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1))	517	if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1))
467	return 0;	518	return 0;
468	if (ucs == 0xE0001	519	# endif
469	|| (ucs >= 0xE0020 && ucs <= 0xE007F)	520	/* Check a few non-spacing chars in Supplementary Special-purpose Plane 0xExxxx */
470	|| (ucs >= 0xE0100 && ucs <= 0xE01EF)	521	if (LAST_SUPPORTED_WCHAR >= 0xE0001
		522	&& ( ucs == 0xE0001
		523	|| (ucs >= 0xE0020 && ucs <= 0xE007F)
		524	|| (ucs >= 0xE0100 && ucs <= 0xE01EF)
		525	)
471	) {	526	) {
472	return 0;	527	return 0;
473	}	528	}
@@ -485,8 +540,7 @@ static int wcwidth(unsigned ucs)
485	|| (ucs >= 0xfe30 && ucs <= 0xfe6f) /* CJK Compatibility Forms */	540	|| (ucs >= 0xfe30 && ucs <= 0xfe6f) /* CJK Compatibility Forms */
486	|| (ucs >= 0xff00 && ucs <= 0xff60) /* Fullwidth Forms */	541	|| (ucs >= 0xff00 && ucs <= 0xff60) /* Fullwidth Forms */
487	|| (ucs >= 0xffe0 && ucs <= 0xffe6)	542	|| (ucs >= 0xffe0 && ucs <= 0xffe6)
488	|| (ucs >= 0x20000 && ucs <= 0x2fffd)	543	|| ((ucs >> 17) == (2 >> 1)) /* 20000..3ffff: Supplementary and Tertiary Ideographic Planes */
489	|| (ucs >= 0x30000 && ucs <= 0x3fffd)
490	);	544	);
491	# endif	545	# endif
492	#endif	546	#endif