diff options
| author | Denys Vlasenko <vda.linux@googlemail.com> | 2010-01-31 16:04:30 +0100 |
|---|---|---|
| committer | Denys Vlasenko <vda.linux@googlemail.com> | 2010-01-31 16:04:30 +0100 |
| commit | 40e4e88a28398c49d326b0fdf0d7f100f08b8f8d (patch) | |
| tree | 89e7c1880d057393ee6a5596bee77d802b882c3f | |
| parent | 344a44fbc5a236a06d840e7776ccbcc4702efa7f (diff) | |
| download | busybox-w32-40e4e88a28398c49d326b0fdf0d7f100f08b8f8d.tar.gz busybox-w32-40e4e88a28398c49d326b0fdf0d7f100f08b8f8d.tar.bz2 busybox-w32-40e4e88a28398c49d326b0fdf0d7f100f08b8f8d.zip | |
exclude more invalid unicode chars
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
| -rw-r--r-- | libbb/unicode_wcwidth.c | 72 | ||||
| -rwxr-xr-x | testsuite/ls.tests | 121 |
2 files changed, 183 insertions, 10 deletions
diff --git a/libbb/unicode_wcwidth.c b/libbb/unicode_wcwidth.c index ab62b18f6..410c741ac 100644 --- a/libbb/unicode_wcwidth.c +++ b/libbb/unicode_wcwidth.c | |||
| @@ -59,8 +59,39 @@ | |||
| 59 | * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c | 59 | * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c |
| 60 | */ | 60 | */ |
| 61 | 61 | ||
| 62 | #if CONFIG_LAST_SUPPORTED_WCHAR == 0 | 62 | /* Assigned Unicode character ranges: |
| 63 | # define LAST_SUPPORTED_WCHAR ((1 << 31) - 1) | 63 | * Plane Range |
| 64 | * 0 0000–FFFF Basic Multilingual Plane | ||
| 65 | * 1 10000–1FFFF Supplementary Multilingual Plane | ||
| 66 | * 2 20000–2FFFF Supplementary Ideographic Plane | ||
| 67 | * 3 30000–3FFFF Tertiary Ideographic Plane (no chars assigned yet) | ||
| 68 | * 4-13 40000–DFFFF currently unassigned | ||
| 69 | * 14 E0000–EFFFF Supplementary Special-purpose Plane | ||
| 70 | * 15 F0000–FFFFF Supplementary Private Use Area-A | ||
| 71 | * 16 100000–10FFFF Supplementary Private Use Area-B | ||
| 72 | * | ||
| 73 | * "Supplementary Special-purpose Plane currently contains non-graphical | ||
| 74 | * characters in two blocks of 128 and 240 characters. The first block | ||
| 75 | * is for language tag characters for use when language cannot be indicated | ||
| 76 | * through other protocols (such as the xml:lang attribute in XML). | ||
| 77 | * The other block contains glyph variation selectors to indicate | ||
| 78 | * an alternate glyph for a character that cannot be determined by context." | ||
| 79 | * | ||
| 80 | * In simpler terms: it is a tool to fix the "Han unification" mess | ||
| 81 | * created by the Unicode committee, to select Chinese/Japanese/Korean/Taiwan | ||
| 82 | * version of a character. (They forgot that the whole purpose of Unicode | ||
| 83 | * was to be able to write all chars in one charset without such tricks.) | ||
| 84 | * Until East Asian users say it is actually necessary to support these | ||
| 85 | * code points in console applications like busybox | ||
| 86 | * (i.e. do these chars ever appear in filenames, hostnames, text files | ||
| 87 | * and such?), we are treating these code points as invalid. | ||
| 88 | * | ||
| 89 | * Tertiary Ideographic Plane is also ignored for now, | ||
| 90 | * until the Unicode committee assigns something there. | ||
| 91 | */ | ||
| 92 | |||
| 93 | #if CONFIG_LAST_SUPPORTED_WCHAR < 126 || CONFIG_LAST_SUPPORTED_WCHAR > 0x30000 | ||
| 94 | # define LAST_SUPPORTED_WCHAR 0x30000 | ||
| 64 | #else | 95 | #else |
| 65 | # define LAST_SUPPORTED_WCHAR CONFIG_LAST_SUPPORTED_WCHAR | 96 | # define LAST_SUPPORTED_WCHAR CONFIG_LAST_SUPPORTED_WCHAR |
| 66 | #endif | 97 | #endif |
| @@ -429,7 +460,8 @@ static int wcwidth(unsigned ucs) | |||
| 429 | #undef BIG_ | 460 | #undef BIG_ |
| 430 | #undef PAIR | 461 | #undef PAIR |
| 431 | }; | 462 | }; |
| 432 | # if LAST_SUPPORTED_WCHAR >= 0x1100 | 463 | # if LAST_SUPPORTED_WCHAR >= 0x10000 |
| 464 | /* Combining chars in Supplementary Multilingual Plane 0x1xxxx */ | ||
| 433 | static const struct interval combining0x10000[] = { | 465 | static const struct interval combining0x10000[] = { |
| 434 | { 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F }, | 466 | { 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F }, |
| 435 | { 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 }, | 467 | { 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 }, |
| @@ -462,12 +494,35 @@ static int wcwidth(unsigned ucs) | |||
| 462 | # if LAST_SUPPORTED_WCHAR < 0x1100 | 494 | # if LAST_SUPPORTED_WCHAR < 0x1100 |
| 463 | return -1; | 495 | return -1; |
| 464 | # else | 496 | # else |
| 465 | /* binary search in table of non-spacing characters, cont. */ | 497 | if (ucs >= LAST_SUPPORTED_WCHAR) |
| 498 | return -1; | ||
| 499 | |||
| 500 | /* High (d800..dbff) and low (dc00..dfff) surrogates are invalid (used only by UTF16) */ | ||
| 501 | /* We also exclude Private Use Area (e000..f8ff) */ | ||
| 502 | if (LAST_SUPPORTED_WCHAR >= 0xd800 | ||
| 503 | && (ucs >= 0xd800 && ucs <= 0xf8ff) | ||
| 504 | ) { | ||
| 505 | return -1; | ||
| 506 | } | ||
| 507 | |||
| 508 | /* 0xfffe and 0xffff in every plane are invalid */ | ||
| 509 | if (LAST_SUPPORTED_WCHAR >= 0xfffe | ||
| 510 | && (ucs & 0xfffe) == 0xfffe | ||
| 511 | ) { | ||
| 512 | return -1; | ||
| 513 | } | ||
| 514 | |||
| 515 | # if LAST_SUPPORTED_WCHAR >= 0x10000 | ||
| 516 | /* binary search in table of non-spacing characters in Supplementary Multilingual Plane */ | ||
| 466 | if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1)) | 517 | if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1)) |
| 467 | return 0; | 518 | return 0; |
| 468 | if (ucs == 0xE0001 | 519 | # endif |
| 469 | || (ucs >= 0xE0020 && ucs <= 0xE007F) | 520 | /* Check a few non-spacing chars in Supplementary Special-purpose Plane 0xExxxx */ |
| 470 | || (ucs >= 0xE0100 && ucs <= 0xE01EF) | 521 | if (LAST_SUPPORTED_WCHAR >= 0xE0001 |
| 522 | && ( ucs == 0xE0001 | ||
| 523 | || (ucs >= 0xE0020 && ucs <= 0xE007F) | ||
| 524 | || (ucs >= 0xE0100 && ucs <= 0xE01EF) | ||
| 525 | ) | ||
| 471 | ) { | 526 | ) { |
| 472 | return 0; | 527 | return 0; |
| 473 | } | 528 | } |
| @@ -485,8 +540,7 @@ static int wcwidth(unsigned ucs) | |||
| 485 | || (ucs >= 0xfe30 && ucs <= 0xfe6f) /* CJK Compatibility Forms */ | 540 | || (ucs >= 0xfe30 && ucs <= 0xfe6f) /* CJK Compatibility Forms */ |
| 486 | || (ucs >= 0xff00 && ucs <= 0xff60) /* Fullwidth Forms */ | 541 | || (ucs >= 0xff00 && ucs <= 0xff60) /* Fullwidth Forms */ |
| 487 | || (ucs >= 0xffe0 && ucs <= 0xffe6) | 542 | || (ucs >= 0xffe0 && ucs <= 0xffe6) |
| 488 | || (ucs >= 0x20000 && ucs <= 0x2fffd) | 543 | || ((ucs >> 17) == (2 >> 1)) /* 20000..3ffff: Supplementary and Tertiary Ideographic Planes */ |
| 489 | || (ucs >= 0x30000 && ucs <= 0x3fffd) | ||
| 490 | ); | 544 | ); |
| 491 | # endif | 545 | # endif |
| 492 | #endif | 546 | #endif |
diff --git a/testsuite/ls.tests b/testsuite/ls.tests index 60f3eb50f..e08249ea6 100755 --- a/testsuite/ls.tests +++ b/testsuite/ls.tests | |||
| @@ -18,7 +18,7 @@ test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \ | |||
| 18 | && test x"$CONFIG_LOCALE_SUPPORT" != x"y" \ | 18 | && test x"$CONFIG_LOCALE_SUPPORT" != x"y" \ |
| 19 | && test x"$CONFIG_SUBST_WCHAR" = x"63" \ | 19 | && test x"$CONFIG_SUBST_WCHAR" = x"63" \ |
| 20 | && test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"767" \ | 20 | && test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"767" \ |
| 21 | && testing "ls unicode test" \ | 21 | && testing "ls unicode test with codepoints limited to 767" \ |
| 22 | "(cd ls.testdir && sh ../ls.mk_uni_tests) && ls -1 ls.testdir" \ | 22 | "(cd ls.testdir && sh ../ls.mk_uni_tests) && ls -1 ls.testdir" \ |
| 23 | '0001_1__Some_correct_UTF-8_text___________________________________________| | 23 | '0001_1__Some_correct_UTF-8_text___________________________________________| |
| 24 | 0002_2__Boundary_condition_test_cases_____________________________________| | 24 | 0002_2__Boundary_condition_test_cases_____________________________________| |
| @@ -132,6 +132,125 @@ test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \ | |||
| 132 | 0110_5.3.2__U+FFFF_=_ef_bf_bf_=_"?"_______________________________________| | 132 | 0110_5.3.2__U+FFFF_=_ef_bf_bf_=_"?"_______________________________________| |
| 133 | ' "" "" | 133 | ' "" "" |
| 134 | 134 | ||
| 135 | # Currently fails on "0080_4.2.2__U-000007FF_=_e0_9f_bf" line | ||
| 136 | test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \ | ||
| 137 | && test x"$CONFIG_LOCALE_SUPPORT" != x"y" \ | ||
| 138 | && test x"$CONFIG_SUBST_WCHAR" = x"63" \ | ||
| 139 | && test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"0" \ | ||
| 140 | && testing "ls unicode test with unlimited codepoints" \ | ||
| 141 | "(cd ls.testdir && sh ../ls.mk_uni_tests) && ls -1 ls.testdir" \ | ||
| 142 | '0001_1__Some_correct_UTF-8_text___________________________________________| | ||
| 143 | 0002_2__Boundary_condition_test_cases_____________________________________| | ||
| 144 | 0003_2.1__First_possible_sequence_of_a_certain_length_____________________| | ||
| 145 | 0004_2.1.2__2_bytes__U-00000080_:________"?"______________________________| | ||
| 146 | 0005_2.1.3__3_bytes__U-00000800_:________"ࠀ"______________________________| | ||
| 147 | 0006_2.1.4__4_bytes__U-00010000_:________"?"______________________________| | ||
| 148 | 0007_2.1.5__5_bytes__U-00200000_:________"?"______________________________| | ||
| 149 | 0008_2.1.6__6_bytes__U-04000000_:________"?"______________________________| | ||
| 150 | 0009_2.2__Last_possible_sequence_of_a_certain_length______________________| | ||
| 151 | 0010_2.2.1__1_byte___U-0000007F_:________"?"______________________________| | ||
| 152 | 0011_2.2.2__2_bytes__U-000007FF_:________"߿"______________________________| | ||
| 153 | 0012_2.2.3__3_bytes__U-0000FFFF_:________"?"______________________________| | ||
| 154 | 0013_2.2.4__4_bytes__U-001FFFFF_:________"?"______________________________| | ||
| 155 | 0014_2.2.5__5_bytes__U-03FFFFFF_:________"?"______________________________| | ||
| 156 | 0015_2.2.6__6_bytes__U-7FFFFFFF_:________"?"______________________________| | ||
| 157 | 0016_2.3__Other_boundary_conditions_______________________________________| | ||
| 158 | 0017_2.3.1__U-0000D7FF_=_ed_9f_bf_=_"?"___________________________________| | ||
| 159 | 0018_2.3.2__U-0000E000_=_ee_80_80_=_"?"___________________________________| | ||
| 160 | 0019_2.3.3__U-0000FFFD_=_ef_bf_bd_=_"?"___________________________________| | ||
| 161 | 0020_2.3.4__U-0010FFFF_=_f4_8f_bf_bf_=_"?"________________________________| | ||
| 162 | 0021_2.3.5__U-00110000_=_f4_90_80_80_=_"?"________________________________| | ||
| 163 | 0022_3__Malformed_sequences_______________________________________________| | ||
| 164 | 0023_3.1__Unexpected_continuation_bytes___________________________________| | ||
| 165 | 0024_3.1.1__First_continuation_byte_0x80:_"?"_____________________________| | ||
| 166 | 0025_3.1.2__Last__continuation_byte_0xbf:_"?"_____________________________| | ||
| 167 | 0026_3.1.3__2_continuation_bytes:_"??"____________________________________| | ||
| 168 | 0027_3.1.4__3_continuation_bytes:_"???"___________________________________| | ||
| 169 | 0028_3.1.5__4_continuation_bytes:_"????"__________________________________| | ||
| 170 | 0029_3.1.6__5_continuation_bytes:_"?????"_________________________________| | ||
| 171 | 0030_3.1.7__6_continuation_bytes:_"??????"________________________________| | ||
| 172 | 0031_3.1.8__7_continuation_bytes:_"???????"_______________________________| | ||
| 173 | 0032_3.1.9__Sequence_of_all_64_possible_continuation_bytes__0x80-0xbf_:___| | ||
| 174 | 0033____"????????????????_________________________________________________| | ||
| 175 | 0034_____????????????????_________________________________________________| | ||
| 176 | 0035_____????????????????_________________________________________________| | ||
| 177 | 0036_____????????????????"________________________________________________| | ||
| 178 | 0037_3.2__Lonely_start_characters_________________________________________| | ||
| 179 | 0038_3.2.1__All_32_first_bytes_of_2-byte_sequences__0xc0-0xdf_,___________| | ||
| 180 | 0039________each_followed_by_a_space_character:___________________________| | ||
| 181 | 0040____"?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_?__________________________________| | ||
| 182 | 0041_____?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_"________________________________| | ||
| 183 | 0042_3.2.2__All_16_first_bytes_of_3-byte_sequences__0xe0-0xef_,___________| | ||
| 184 | 0043________each_followed_by_a_space_character:___________________________| | ||
| 185 | 0044____"?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_"________________________________| | ||
| 186 | 0045_3.2.3__All_8_first_bytes_of_4-byte_sequences__0xf0-0xf7_,____________| | ||
| 187 | 0046________each_followed_by_a_space_character:___________________________| | ||
| 188 | 0047____"?_?_?_?_?_?_?_?_"________________________________________________| | ||
| 189 | 0048_3.2.4__All_4_first_bytes_of_5-byte_sequences__0xf8-0xfb_,____________| | ||
| 190 | 0049________each_followed_by_a_space_character:___________________________| | ||
| 191 | 0050____"?_?_?_?_"________________________________________________________| | ||
| 192 | 0051_3.2.5__All_2_first_bytes_of_6-byte_sequences__0xfc-0xfd_,____________| | ||
| 193 | 0052________each_followed_by_a_space_character:___________________________| | ||
| 194 | 0053____"?_?_"____________________________________________________________| | ||
| 195 | 0054_3.3__Sequences_with_last_continuation_byte_missing___________________| | ||
| 196 | 0055_3.3.1__2-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______| | ||
| 197 | 0056_3.3.2__3-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______| | ||
| 198 | 0057_3.3.3__4-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______| | ||
| 199 | 0058_3.3.4__5-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______| | ||
| 200 | 0059_3.3.5__6-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______| | ||
| 201 | 0060_3.3.6__2-byte_sequence_with_last_byte_missing__U-000007FF_:_"?"______| | ||
| 202 | 0061_3.3.7__3-byte_sequence_with_last_byte_missing__U-0000FFFF_:_"?"______| | ||
| 203 | 0062_3.3.8__4-byte_sequence_with_last_byte_missing__U-001FFFFF_:_"?"______| | ||
| 204 | 0063_3.3.9__5-byte_sequence_with_last_byte_missing__U-03FFFFFF_:_"?"______| | ||
| 205 | 0064_3.3.10_6-byte_sequence_with_last_byte_missing__U-7FFFFFFF_:_"?"______| | ||
| 206 | 0065_3.4__Concatenation_of_incomplete_sequences___________________________| | ||
| 207 | 0066____"??????????"______________________________________________________| | ||
| 208 | 0067_3.5__Impossible_bytes________________________________________________| | ||
| 209 | 0068_3.5.1__fe_=_"?"______________________________________________________| | ||
| 210 | 0069_3.5.2__ff_=_"?"______________________________________________________| | ||
| 211 | 0070_3.5.3__fe_fe_ff_ff_=_"????"__________________________________________| | ||
| 212 | 0071_4__Overlong_sequences________________________________________________| | ||
| 213 | 0072_4.1__Examples_of_an_overlong_ASCII_character_________________________| | ||
| 214 | 0073_4.1.1_U+002F_=_c0_af_____________=_"?"_______________________________| | ||
| 215 | 0074_4.1.2_U+002F_=_e0_80_af__________=_"?"_______________________________| | ||
| 216 | 0075_4.1.3_U+002F_=_f0_80_80_af_______=_"?"_______________________________| | ||
| 217 | 0076_4.1.4_U+002F_=_f8_80_80_80_af____=_"?"_______________________________| | ||
| 218 | 0077_4.1.5_U+002F_=_fc_80_80_80_80_af_=_"?"_______________________________| | ||
| 219 | 0078_4.2__Maximum_overlong_sequences______________________________________| | ||
| 220 | 0079_4.2.1__U-0000007F_=_c1_bf_____________=_"?"__________________________| | ||
| 221 | 0080_4.2.2__U-000007FF_=_e0_9f_bf__________=_"?"__________________________| | ||
| 222 | 0081_4.2.3__U-0000FFFF_=_f0_8f_bf_bf_______=_"?"__________________________| | ||
| 223 | 0082_4.2.4__U-001FFFFF_=_f8_87_bf_bf_bf____=_"?"__________________________| | ||
| 224 | 0083_4.2.5__U-03FFFFFF_=_fc_83_bf_bf_bf_bf_=_"?"__________________________| | ||
| 225 | 0084_4.3__Overlong_representation_of_the_NUL_character____________________| | ||
| 226 | 0085_4.3.1__U+0000_=_c0_80_____________=_"?"______________________________| | ||
| 227 | 0086_4.3.2__U+0000_=_e0_80_80__________=_"?"______________________________| | ||
| 228 | 0087_4.3.3__U+0000_=_f0_80_80_80_______=_"?"______________________________| | ||
| 229 | 0088_4.3.4__U+0000_=_f8_80_80_80_80____=_"?"______________________________| | ||
| 230 | 0089_4.3.5__U+0000_=_fc_80_80_80_80_80_=_"?"______________________________| | ||
| 231 | 0090_5__Illegal_code_positions____________________________________________| | ||
| 232 | 0091_5.1_Single_UTF-16_surrogates_________________________________________| | ||
| 233 | 0092_5.1.1__U+D800_=_ed_a0_80_=_"?"_______________________________________| | ||
| 234 | 0093_5.1.2__U+DB7F_=_ed_ad_bf_=_"?"_______________________________________| | ||
| 235 | 0094_5.1.3__U+DB80_=_ed_ae_80_=_"?"_______________________________________| | ||
| 236 | 0095_5.1.4__U+DBFF_=_ed_af_bf_=_"?"_______________________________________| | ||
| 237 | 0096_5.1.5__U+DC00_=_ed_b0_80_=_"?"_______________________________________| | ||
| 238 | 0097_5.1.6__U+DF80_=_ed_be_80_=_"?"_______________________________________| | ||
| 239 | 0098_5.1.7__U+DFFF_=_ed_bf_bf_=_"?"_______________________________________| | ||
| 240 | 0099_5.2_Paired_UTF-16_surrogates_________________________________________| | ||
| 241 | 0100_5.2.1__U+D800_U+DC00_=_ed_a0_80_ed_b0_80_=_"??"______________________| | ||
| 242 | 0101_5.2.2__U+D800_U+DFFF_=_ed_a0_80_ed_bf_bf_=_"??"______________________| | ||
| 243 | 0102_5.2.3__U+DB7F_U+DC00_=_ed_ad_bf_ed_b0_80_=_"??"______________________| | ||
| 244 | 0103_5.2.4__U+DB7F_U+DFFF_=_ed_ad_bf_ed_bf_bf_=_"??"______________________| | ||
| 245 | 0104_5.2.5__U+DB80_U+DC00_=_ed_ae_80_ed_b0_80_=_"??"______________________| | ||
| 246 | 0105_5.2.6__U+DB80_U+DFFF_=_ed_ae_80_ed_bf_bf_=_"??"______________________| | ||
| 247 | 0106_5.2.7__U+DBFF_U+DC00_=_ed_af_bf_ed_b0_80_=_"??"______________________| | ||
| 248 | 0107_5.2.8__U+DBFF_U+DFFF_=_ed_af_bf_ed_bf_bf_=_"??"______________________| | ||
| 249 | 0108_5.3_Other_illegal_code_positions_____________________________________| | ||
| 250 | 0109_5.3.1__U+FFFE_=_ef_bf_be_=_"?"_______________________________________| | ||
| 251 | 0110_5.3.2__U+FFFF_=_ef_bf_bf_=_"?"_______________________________________| | ||
| 252 | ' "" "" | ||
| 253 | |||
| 135 | # Clean up | 254 | # Clean up |
| 136 | rm -rf ls.testdir 2>/dev/null | 255 | rm -rf ls.testdir 2>/dev/null |
| 137 | 256 | ||
