aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Denys Vlasenko <vda.linux@googlemail.com> 2010-01-31 16:04:30 +0100
committer Denys Vlasenko <vda.linux@googlemail.com> 2010-01-31 16:04:30 +0100
commit 40e4e88a28398c49d326b0fdf0d7f100f08b8f8d (patch)
tree 89e7c1880d057393ee6a5596bee77d802b882c3f
parent 344a44fbc5a236a06d840e7776ccbcc4702efa7f (diff)
downloadbusybox-w32-40e4e88a28398c49d326b0fdf0d7f100f08b8f8d.tar.gz
busybox-w32-40e4e88a28398c49d326b0fdf0d7f100f08b8f8d.tar.bz2
busybox-w32-40e4e88a28398c49d326b0fdf0d7f100f08b8f8d.zip
exclude more invalid unicode chars
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r-- libbb/unicode_wcwidth.c 72
-rwxr-xr-x testsuite/ls.tests 121
2 files changed, 183 insertions, 10 deletions
diff --git a/libbb/unicode_wcwidth.c b/libbb/unicode_wcwidth.c
index ab62b18f6..410c741ac 100644
--- a/libbb/unicode_wcwidth.c
+++ b/libbb/unicode_wcwidth.c
@@ -59,8 +59,39 @@
59 * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c 59 * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
60 */ 60 */
61 61
62#if CONFIG_LAST_SUPPORTED_WCHAR == 0 62/* Assigned Unicode character ranges:
63# define LAST_SUPPORTED_WCHAR ((1 << 31) - 1) 63 * Plane Range
64 * 0 0000–FFFF Basic Multilingual Plane
65 * 1 10000–1FFFF Supplementary Multilingual Plane
66 * 2 20000–2FFFF Supplementary Ideographic Plane
67 * 3 30000–3FFFF Tertiary Ideographic Plane (no chars assigned yet)
68 * 4-13 40000–DFFFF currently unassigned
69 * 14 E0000–EFFFF Supplementary Special-purpose Plane
70 * 15 F0000–FFFFF Supplementary Private Use Area-A
71 * 16 100000–10FFFF Supplementary Private Use Area-B
72 *
73 * "Supplementary Special-purpose Plane currently contains non-graphical
74 * characters in two blocks of 128 and 240 characters. The first block
75 * is for language tag characters for use when language cannot be indicated
76 * through other protocols (such as the xml:lang attribute in XML).
77 * The other block contains glyph variation selectors to indicate
78 * an alternate glyph for a character that cannot be determined by context."
79 *
80 * In simpler terms: it is a tool to fix the "Han unification" mess
81 * created by the Unicode committee, to select the Chinese/Japanese/Korean/Taiwanese
82 * version of a character. (They forgot that the whole purpose of Unicode
83 * was to be able to write all chars in one charset without such tricks.)
84 * Until East Asian users say it is actually necessary to support these
85 * code points in console applications like busybox
86 * (i.e. do these chars ever appear in filenames, hostnames, text files
87 * and such?), we are treating these code points as invalid.
88 *
89 * The Tertiary Ideographic Plane is also ignored for now,
90 * until the Unicode committee assigns something there.
91 */
92
93#if CONFIG_LAST_SUPPORTED_WCHAR < 126 || CONFIG_LAST_SUPPORTED_WCHAR > 0x30000
94# define LAST_SUPPORTED_WCHAR 0x30000
64#else 95#else
65# define LAST_SUPPORTED_WCHAR CONFIG_LAST_SUPPORTED_WCHAR 96# define LAST_SUPPORTED_WCHAR CONFIG_LAST_SUPPORTED_WCHAR
66#endif 97#endif
@@ -429,7 +460,8 @@ static int wcwidth(unsigned ucs)
429#undef BIG_ 460#undef BIG_
430#undef PAIR 461#undef PAIR
431 }; 462 };
432# if LAST_SUPPORTED_WCHAR >= 0x1100 463# if LAST_SUPPORTED_WCHAR >= 0x10000
464 /* Combining chars in Supplementary Multilingual Plane 0x1xxxx */
433 static const struct interval combining0x10000[] = { 465 static const struct interval combining0x10000[] = {
434 { 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F }, 466 { 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F },
435 { 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 }, 467 { 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 },
@@ -462,12 +494,35 @@ static int wcwidth(unsigned ucs)
462# if LAST_SUPPORTED_WCHAR < 0x1100 494# if LAST_SUPPORTED_WCHAR < 0x1100
463 return -1; 495 return -1;
464# else 496# else
465 /* binary search in table of non-spacing characters, cont. */ 497 if (ucs >= LAST_SUPPORTED_WCHAR)
498 return -1;
499
500 /* High (d800..dbff) and low (dc00..dfff) surrogates are invalid (used only by UTF-16) */
501 /* We also exclude Private Use Area (e000..f8ff) */
502 if (LAST_SUPPORTED_WCHAR >= 0xd800
503 && (ucs >= 0xd800 || ucs <= 0xf8ff)
504 ) {
505 return -1;
506 }
507
508 /* 0xfffe and 0xffff in every plane are invalid */
509 if (LAST_SUPPORTED_WCHAR >= 0xfffe
510 && (ucs & 0xfffe) == 0xfffe
511 ) {
512 return -1;
513 }
514
515# if LAST_SUPPORTED_WCHAR >= 0x10000
516 /* binary search in table of non-spacing characters in Supplementary Multilingual Plane */
466 if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1)) 517 if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1))
467 return 0; 518 return 0;
468 if (ucs == 0xE0001 519# endif
469 || (ucs >= 0xE0020 && ucs <= 0xE007F) 520 /* Check a few non-spacing chars in Supplementary Special-purpose Plane 0xExxxx */
470 || (ucs >= 0xE0100 && ucs <= 0xE01EF) 521 if (LAST_SUPPORTED_WCHAR >= 0xE0001
522 && ( ucs == 0xE0001
523 || (ucs >= 0xE0020 && ucs <= 0xE007F)
524 || (ucs >= 0xE0100 && ucs <= 0xE01EF)
525 )
471 ) { 526 ) {
472 return 0; 527 return 0;
473 } 528 }
@@ -485,8 +540,7 @@ static int wcwidth(unsigned ucs)
485 || (ucs >= 0xfe30 && ucs <= 0xfe6f) /* CJK Compatibility Forms */ 540 || (ucs >= 0xfe30 && ucs <= 0xfe6f) /* CJK Compatibility Forms */
486 || (ucs >= 0xff00 && ucs <= 0xff60) /* Fullwidth Forms */ 541 || (ucs >= 0xff00 && ucs <= 0xff60) /* Fullwidth Forms */
487 || (ucs >= 0xffe0 && ucs <= 0xffe6) 542 || (ucs >= 0xffe0 && ucs <= 0xffe6)
488 || (ucs >= 0x20000 && ucs <= 0x2fffd) 543 || ((ucs >> 17) == (2 >> 1)) /* 20000..3ffff: Supplementary and Tertiary Ideographic Planes */
489 || (ucs >= 0x30000 && ucs <= 0x3fffd)
490 ); 544 );
491# endif 545# endif
492#endif 546#endif
diff --git a/testsuite/ls.tests b/testsuite/ls.tests
index 60f3eb50f..e08249ea6 100755
--- a/testsuite/ls.tests
+++ b/testsuite/ls.tests
@@ -18,7 +18,7 @@ test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \
18&& test x"$CONFIG_LOCALE_SUPPORT" != x"y" \ 18&& test x"$CONFIG_LOCALE_SUPPORT" != x"y" \
19&& test x"$CONFIG_SUBST_WCHAR" = x"63" \ 19&& test x"$CONFIG_SUBST_WCHAR" = x"63" \
20&& test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"767" \ 20&& test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"767" \
21&& testing "ls unicode test" \ 21&& testing "ls unicode test with codepoints limited to 767" \
22"(cd ls.testdir && sh ../ls.mk_uni_tests) && ls -1 ls.testdir" \ 22"(cd ls.testdir && sh ../ls.mk_uni_tests) && ls -1 ls.testdir" \
23'0001_1__Some_correct_UTF-8_text___________________________________________| 23'0001_1__Some_correct_UTF-8_text___________________________________________|
240002_2__Boundary_condition_test_cases_____________________________________| 240002_2__Boundary_condition_test_cases_____________________________________|
@@ -132,6 +132,125 @@ test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \
1320110_5.3.2__U+FFFF_=_ef_bf_bf_=_"?"_______________________________________| 1320110_5.3.2__U+FFFF_=_ef_bf_bf_=_"?"_______________________________________|
133' "" "" 133' "" ""
134 134
135# Currently fails on the "0080_4.2.2__U-000007FF_=_e0_9f_bf" line
136test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \
137&& test x"$CONFIG_LOCALE_SUPPORT" != x"y" \
138&& test x"$CONFIG_SUBST_WCHAR" = x"63" \
139&& test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"0" \
140&& testing "ls unicode test with unlimited codepoints" \
141"(cd ls.testdir && sh ../ls.mk_uni_tests) && ls -1 ls.testdir" \
142'0001_1__Some_correct_UTF-8_text___________________________________________|
1430002_2__Boundary_condition_test_cases_____________________________________|
1440003_2.1__First_possible_sequence_of_a_certain_length_____________________|
1450004_2.1.2__2_bytes__U-00000080_:________"?"______________________________|
1460005_2.1.3__3_bytes__U-00000800_:________"ࠀ"______________________________|
1470006_2.1.4__4_bytes__U-00010000_:________"?"______________________________|
1480007_2.1.5__5_bytes__U-00200000_:________"?"______________________________|
1490008_2.1.6__6_bytes__U-04000000_:________"?"______________________________|
1500009_2.2__Last_possible_sequence_of_a_certain_length______________________|
1510010_2.2.1__1_byte___U-0000007F_:________"?"______________________________|
1520011_2.2.2__2_bytes__U-000007FF_:________"߿"______________________________|
1530012_2.2.3__3_bytes__U-0000FFFF_:________"?"______________________________|
1540013_2.2.4__4_bytes__U-001FFFFF_:________"?"______________________________|
1550014_2.2.5__5_bytes__U-03FFFFFF_:________"?"______________________________|
1560015_2.2.6__6_bytes__U-7FFFFFFF_:________"?"______________________________|
1570016_2.3__Other_boundary_conditions_______________________________________|
1580017_2.3.1__U-0000D7FF_=_ed_9f_bf_=_"?"___________________________________|
1590018_2.3.2__U-0000E000_=_ee_80_80_=_"?"___________________________________|
1600019_2.3.3__U-0000FFFD_=_ef_bf_bd_=_"?"___________________________________|
1610020_2.3.4__U-0010FFFF_=_f4_8f_bf_bf_=_"?"________________________________|
1620021_2.3.5__U-00110000_=_f4_90_80_80_=_"?"________________________________|
1630022_3__Malformed_sequences_______________________________________________|
1640023_3.1__Unexpected_continuation_bytes___________________________________|
1650024_3.1.1__First_continuation_byte_0x80:_"?"_____________________________|
1660025_3.1.2__Last__continuation_byte_0xbf:_"?"_____________________________|
1670026_3.1.3__2_continuation_bytes:_"??"____________________________________|
1680027_3.1.4__3_continuation_bytes:_"???"___________________________________|
1690028_3.1.5__4_continuation_bytes:_"????"__________________________________|
1700029_3.1.6__5_continuation_bytes:_"?????"_________________________________|
1710030_3.1.7__6_continuation_bytes:_"??????"________________________________|
1720031_3.1.8__7_continuation_bytes:_"???????"_______________________________|
1730032_3.1.9__Sequence_of_all_64_possible_continuation_bytes__0x80-0xbf_:___|
1740033____"????????????????_________________________________________________|
1750034_____????????????????_________________________________________________|
1760035_____????????????????_________________________________________________|
1770036_____????????????????"________________________________________________|
1780037_3.2__Lonely_start_characters_________________________________________|
1790038_3.2.1__All_32_first_bytes_of_2-byte_sequences__0xc0-0xdf_,___________|
1800039________each_followed_by_a_space_character:___________________________|
1810040____"?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_?__________________________________|
1820041_____?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_"________________________________|
1830042_3.2.2__All_16_first_bytes_of_3-byte_sequences__0xe0-0xef_,___________|
1840043________each_followed_by_a_space_character:___________________________|
1850044____"?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_"________________________________|
1860045_3.2.3__All_8_first_bytes_of_4-byte_sequences__0xf0-0xf7_,____________|
1870046________each_followed_by_a_space_character:___________________________|
1880047____"?_?_?_?_?_?_?_?_"________________________________________________|
1890048_3.2.4__All_4_first_bytes_of_5-byte_sequences__0xf8-0xfb_,____________|
1900049________each_followed_by_a_space_character:___________________________|
1910050____"?_?_?_?_"________________________________________________________|
1920051_3.2.5__All_2_first_bytes_of_6-byte_sequences__0xfc-0xfd_,____________|
1930052________each_followed_by_a_space_character:___________________________|
1940053____"?_?_"____________________________________________________________|
1950054_3.3__Sequences_with_last_continuation_byte_missing___________________|
1960055_3.3.1__2-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______|
1970056_3.3.2__3-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______|
1980057_3.3.3__4-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______|
1990058_3.3.4__5-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______|
2000059_3.3.5__6-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______|
2010060_3.3.6__2-byte_sequence_with_last_byte_missing__U-000007FF_:_"?"______|
2020061_3.3.7__3-byte_sequence_with_last_byte_missing__U-0000FFFF_:_"?"______|
2030062_3.3.8__4-byte_sequence_with_last_byte_missing__U-001FFFFF_:_"?"______|
2040063_3.3.9__5-byte_sequence_with_last_byte_missing__U-03FFFFFF_:_"?"______|
2050064_3.3.10_6-byte_sequence_with_last_byte_missing__U-7FFFFFFF_:_"?"______|
2060065_3.4__Concatenation_of_incomplete_sequences___________________________|
2070066____"??????????"______________________________________________________|
2080067_3.5__Impossible_bytes________________________________________________|
2090068_3.5.1__fe_=_"?"______________________________________________________|
2100069_3.5.2__ff_=_"?"______________________________________________________|
2110070_3.5.3__fe_fe_ff_ff_=_"????"__________________________________________|
2120071_4__Overlong_sequences________________________________________________|
2130072_4.1__Examples_of_an_overlong_ASCII_character_________________________|
2140073_4.1.1_U+002F_=_c0_af_____________=_"?"_______________________________|
2150074_4.1.2_U+002F_=_e0_80_af__________=_"?"_______________________________|
2160075_4.1.3_U+002F_=_f0_80_80_af_______=_"?"_______________________________|
2170076_4.1.4_U+002F_=_f8_80_80_80_af____=_"?"_______________________________|
2180077_4.1.5_U+002F_=_fc_80_80_80_80_af_=_"?"_______________________________|
2190078_4.2__Maximum_overlong_sequences______________________________________|
2200079_4.2.1__U-0000007F_=_c1_bf_____________=_"?"__________________________|
2210080_4.2.2__U-000007FF_=_e0_9f_bf__________=_"?"__________________________|
2220081_4.2.3__U-0000FFFF_=_f0_8f_bf_bf_______=_"?"__________________________|
2230082_4.2.4__U-001FFFFF_=_f8_87_bf_bf_bf____=_"?"__________________________|
2240083_4.2.5__U-03FFFFFF_=_fc_83_bf_bf_bf_bf_=_"?"__________________________|
2250084_4.3__Overlong_representation_of_the_NUL_character____________________|
2260085_4.3.1__U+0000_=_c0_80_____________=_"?"______________________________|
2270086_4.3.2__U+0000_=_e0_80_80__________=_"?"______________________________|
2280087_4.3.3__U+0000_=_f0_80_80_80_______=_"?"______________________________|
2290088_4.3.4__U+0000_=_f8_80_80_80_80____=_"?"______________________________|
2300089_4.3.5__U+0000_=_fc_80_80_80_80_80_=_"?"______________________________|
2310090_5__Illegal_code_positions____________________________________________|
2320091_5.1_Single_UTF-16_surrogates_________________________________________|
2330092_5.1.1__U+D800_=_ed_a0_80_=_"?"_______________________________________|
2340093_5.1.2__U+DB7F_=_ed_ad_bf_=_"?"_______________________________________|
2350094_5.1.3__U+DB80_=_ed_ae_80_=_"?"_______________________________________|
2360095_5.1.4__U+DBFF_=_ed_af_bf_=_"?"_______________________________________|
2370096_5.1.5__U+DC00_=_ed_b0_80_=_"?"_______________________________________|
2380097_5.1.6__U+DF80_=_ed_be_80_=_"?"_______________________________________|
2390098_5.1.7__U+DFFF_=_ed_bf_bf_=_"?"_______________________________________|
2400099_5.2_Paired_UTF-16_surrogates_________________________________________|
2410100_5.2.1__U+D800_U+DC00_=_ed_a0_80_ed_b0_80_=_"??"______________________|
2420101_5.2.2__U+D800_U+DFFF_=_ed_a0_80_ed_bf_bf_=_"??"______________________|
2430102_5.2.3__U+DB7F_U+DC00_=_ed_ad_bf_ed_b0_80_=_"??"______________________|
2440103_5.2.4__U+DB7F_U+DFFF_=_ed_ad_bf_ed_bf_bf_=_"??"______________________|
2450104_5.2.5__U+DB80_U+DC00_=_ed_ae_80_ed_b0_80_=_"??"______________________|
2460105_5.2.6__U+DB80_U+DFFF_=_ed_ae_80_ed_bf_bf_=_"??"______________________|
2470106_5.2.7__U+DBFF_U+DC00_=_ed_af_bf_ed_b0_80_=_"??"______________________|
2480107_5.2.8__U+DBFF_U+DFFF_=_ed_af_bf_ed_bf_bf_=_"??"______________________|
2490108_5.3_Other_illegal_code_positions_____________________________________|
2500109_5.3.1__U+FFFE_=_ef_bf_be_=_"?"_______________________________________|
2510110_5.3.2__U+FFFF_=_ef_bf_bf_=_"?"_______________________________________|
252' "" ""
253
135# Clean up 254# Clean up
136rm -rf ls.testdir 2>/dev/null 255rm -rf ls.testdir 2>/dev/null
137 256