diff options
author | Denys Vlasenko <vda.linux@googlemail.com> | 2010-01-31 16:04:30 +0100 |
---|---|---|
committer | Denys Vlasenko <vda.linux@googlemail.com> | 2010-01-31 16:04:30 +0100 |
commit | 40e4e88a28398c49d326b0fdf0d7f100f08b8f8d (patch) | |
tree | 89e7c1880d057393ee6a5596bee77d802b882c3f | |
parent | 344a44fbc5a236a06d840e7776ccbcc4702efa7f (diff) | |
download | busybox-w32-40e4e88a28398c49d326b0fdf0d7f100f08b8f8d.tar.gz busybox-w32-40e4e88a28398c49d326b0fdf0d7f100f08b8f8d.tar.bz2 busybox-w32-40e4e88a28398c49d326b0fdf0d7f100f08b8f8d.zip |
exclude more invalid unicode chars
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r-- | libbb/unicode_wcwidth.c | 72 | ||||
-rwxr-xr-x | testsuite/ls.tests | 121 |
2 files changed, 183 insertions, 10 deletions
diff --git a/libbb/unicode_wcwidth.c b/libbb/unicode_wcwidth.c index ab62b18f6..410c741ac 100644 --- a/libbb/unicode_wcwidth.c +++ b/libbb/unicode_wcwidth.c | |||
@@ -59,8 +59,39 @@ | |||
59 | * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c | 59 | * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c |
60 | */ | 60 | */ |
61 | 61 | ||
62 | #if CONFIG_LAST_SUPPORTED_WCHAR == 0 | 62 | /* Assigned Unicode character ranges: |
63 | # define LAST_SUPPORTED_WCHAR ((1 << 31) - 1) | 63 | * Plane Range |
64 | * 0 0000–FFFF Basic Multilingual Plane | ||
65 | * 1 10000–1FFFF Supplementary Multilingual Plane | ||
66 | * 2 20000–2FFFF Supplementary Ideographic Plane | ||
67 | * 3 30000–3FFFF Tertiary Ideographic Plane (no chars assigned yet) | ||
68 | * 4-13 40000–DFFFF currently unassigned | ||
69 | * 14 E0000–EFFFF Supplementary Special-purpose Plane | ||
70 | * 15 F0000–FFFFF Supplementary Private Use Area-A | ||
71 | * 16 100000–10FFFF Supplementary Private Use Area-B | ||
72 | * | ||
73 | * "Supplementary Special-purpose Plane currently contains non-graphical | ||
74 | * characters in two blocks of 128 and 240 characters. The first block | ||
75 | * is for language tag characters for use when language cannot be indicated | ||
76 | * through other protocols (such as the xml:lang attribute in XML). | ||
77 | * The other block contains glyph variation selectors to indicate | ||
78 | * an alternate glyph for a character that cannot be determined by context." | ||
79 | * | ||
80 | * In simpler terms: it is a tool to fix the "Han unification" mess | ||
81 | * created by the Unicode committee, to select Chinese/Japanese/Korean/Taiwan | ||
82 | * version of a character. (They forgot that the whole purpose of Unicode | ||
83 | * was to be able to write all chars in one charset without such tricks). | ||
84 | * Until East Asian users say it is actually necessary to support these | ||
85 | * code points in console applications like busybox | ||
86 | * (i.e. do these chars ever appear in filenames, hostnames, text files | ||
87 | * and such?), we are treating these code points as invalid. | ||
88 | * | ||
89 | * The Tertiary Ideographic Plane is also ignored for now, | ||
90 | * until the Unicode committee assigns something there. | ||
91 | */ | ||
92 | |||
93 | #if CONFIG_LAST_SUPPORTED_WCHAR < 126 || CONFIG_LAST_SUPPORTED_WCHAR > 0x30000 | ||
94 | # define LAST_SUPPORTED_WCHAR 0x30000 | ||
64 | #else | 95 | #else |
65 | # define LAST_SUPPORTED_WCHAR CONFIG_LAST_SUPPORTED_WCHAR | 96 | # define LAST_SUPPORTED_WCHAR CONFIG_LAST_SUPPORTED_WCHAR |
66 | #endif | 97 | #endif |
@@ -429,7 +460,8 @@ static int wcwidth(unsigned ucs) | |||
429 | #undef BIG_ | 460 | #undef BIG_ |
430 | #undef PAIR | 461 | #undef PAIR |
431 | }; | 462 | }; |
432 | # if LAST_SUPPORTED_WCHAR >= 0x1100 | 463 | # if LAST_SUPPORTED_WCHAR >= 0x10000 |
464 | /* Combining chars in Supplementary Multilingual Plane 0x1xxxx */ | ||
433 | static const struct interval combining0x10000[] = { | 465 | static const struct interval combining0x10000[] = { |
434 | { 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F }, | 466 | { 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F }, |
435 | { 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 }, | 467 | { 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 }, |
@@ -462,12 +494,35 @@ static int wcwidth(unsigned ucs) | |||
462 | # if LAST_SUPPORTED_WCHAR < 0x1100 | 494 | # if LAST_SUPPORTED_WCHAR < 0x1100 |
463 | return -1; | 495 | return -1; |
464 | # else | 496 | # else |
465 | /* binary search in table of non-spacing characters, cont. */ | 497 | if (ucs >= LAST_SUPPORTED_WCHAR) |
498 | return -1; | ||
499 | |||
500 | /* High (d800..dbff) and low (dc00..dfff) surrogates are invalid (used only by UTF-16) */ | ||
501 | /* We also exclude Private Use Area (e000..f8ff) */ | ||
502 | if (LAST_SUPPORTED_WCHAR >= 0xd800 | ||
503 | && (ucs >= 0xd800 || ucs <= 0xf8ff) | ||
504 | ) { | ||
505 | return -1; | ||
506 | } | ||
507 | |||
508 | /* 0xfffe and 0xffff in every plane are invalid */ | ||
509 | if (LAST_SUPPORTED_WCHAR >= 0xfffe | ||
510 | && (ucs & 0xfffe) == 0xfffe | ||
511 | ) { | ||
512 | return -1; | ||
513 | } | ||
514 | |||
515 | # if LAST_SUPPORTED_WCHAR >= 0x10000 | ||
516 | /* binary search in table of non-spacing characters in Supplementary Multilingual Plane */ | ||
466 | if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1)) | 517 | if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1)) |
467 | return 0; | 518 | return 0; |
468 | if (ucs == 0xE0001 | 519 | # endif |
469 | || (ucs >= 0xE0020 && ucs <= 0xE007F) | 520 | /* Check a few non-spacing chars in Supplementary Special-purpose Plane 0xExxxx */ |
470 | || (ucs >= 0xE0100 && ucs <= 0xE01EF) | 521 | if (LAST_SUPPORTED_WCHAR >= 0xE0001 |
522 | && ( ucs == 0xE0001 | ||
523 | || (ucs >= 0xE0020 && ucs <= 0xE007F) | ||
524 | || (ucs >= 0xE0100 && ucs <= 0xE01EF) | ||
525 | ) | ||
471 | ) { | 526 | ) { |
472 | return 0; | 527 | return 0; |
473 | } | 528 | } |
@@ -485,8 +540,7 @@ static int wcwidth(unsigned ucs) | |||
485 | || (ucs >= 0xfe30 && ucs <= 0xfe6f) /* CJK Compatibility Forms */ | 540 | || (ucs >= 0xfe30 && ucs <= 0xfe6f) /* CJK Compatibility Forms */ |
486 | || (ucs >= 0xff00 && ucs <= 0xff60) /* Fullwidth Forms */ | 541 | || (ucs >= 0xff00 && ucs <= 0xff60) /* Fullwidth Forms */ |
487 | || (ucs >= 0xffe0 && ucs <= 0xffe6) | 542 | || (ucs >= 0xffe0 && ucs <= 0xffe6) |
488 | || (ucs >= 0x20000 && ucs <= 0x2fffd) | 543 | || ((ucs >> 17) == (2 >> 1)) /* 20000..3ffff: Supplementary and Tertiary Ideographic Planes */ |
489 | || (ucs >= 0x30000 && ucs <= 0x3fffd) | ||
490 | ); | 544 | ); |
491 | # endif | 545 | # endif |
492 | #endif | 546 | #endif |
diff --git a/testsuite/ls.tests b/testsuite/ls.tests index 60f3eb50f..e08249ea6 100755 --- a/testsuite/ls.tests +++ b/testsuite/ls.tests | |||
@@ -18,7 +18,7 @@ test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \ | |||
18 | && test x"$CONFIG_LOCALE_SUPPORT" != x"y" \ | 18 | && test x"$CONFIG_LOCALE_SUPPORT" != x"y" \ |
19 | && test x"$CONFIG_SUBST_WCHAR" = x"63" \ | 19 | && test x"$CONFIG_SUBST_WCHAR" = x"63" \ |
20 | && test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"767" \ | 20 | && test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"767" \ |
21 | && testing "ls unicode test" \ | 21 | && testing "ls unicode test with codepoints limited to 767" \ |
22 | "(cd ls.testdir && sh ../ls.mk_uni_tests) && ls -1 ls.testdir" \ | 22 | "(cd ls.testdir && sh ../ls.mk_uni_tests) && ls -1 ls.testdir" \ |
23 | '0001_1__Some_correct_UTF-8_text___________________________________________| | 23 | '0001_1__Some_correct_UTF-8_text___________________________________________| |
24 | 0002_2__Boundary_condition_test_cases_____________________________________| | 24 | 0002_2__Boundary_condition_test_cases_____________________________________| |
@@ -132,6 +132,125 @@ test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \ | |||
132 | 0110_5.3.2__U+FFFF_=_ef_bf_bf_=_"?"_______________________________________| | 132 | 0110_5.3.2__U+FFFF_=_ef_bf_bf_=_"?"_______________________________________| |
133 | ' "" "" | 133 | ' "" "" |
134 | 134 | ||
135 | # Currently fails on "0080_4.2.2__U-000007FF_=_e0_9f_bf" line | ||
136 | test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \ | ||
137 | && test x"$CONFIG_LOCALE_SUPPORT" != x"y" \ | ||
138 | && test x"$CONFIG_SUBST_WCHAR" = x"63" \ | ||
139 | && test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"0" \ | ||
140 | && testing "ls unicode test with unlimited codepoints" \ | ||
141 | "(cd ls.testdir && sh ../ls.mk_uni_tests) && ls -1 ls.testdir" \ | ||
142 | '0001_1__Some_correct_UTF-8_text___________________________________________| | ||
143 | 0002_2__Boundary_condition_test_cases_____________________________________| | ||
144 | 0003_2.1__First_possible_sequence_of_a_certain_length_____________________| | ||
145 | 0004_2.1.2__2_bytes__U-00000080_:________"?"______________________________| | ||
146 | 0005_2.1.3__3_bytes__U-00000800_:________"ࠀ"______________________________| | ||
147 | 0006_2.1.4__4_bytes__U-00010000_:________"?"______________________________| | ||
148 | 0007_2.1.5__5_bytes__U-00200000_:________"?"______________________________| | ||
149 | 0008_2.1.6__6_bytes__U-04000000_:________"?"______________________________| | ||
150 | 0009_2.2__Last_possible_sequence_of_a_certain_length______________________| | ||
151 | 0010_2.2.1__1_byte___U-0000007F_:________"?"______________________________| | ||
152 | 0011_2.2.2__2_bytes__U-000007FF_:________"߿"______________________________| | ||
153 | 0012_2.2.3__3_bytes__U-0000FFFF_:________"?"______________________________| | ||
154 | 0013_2.2.4__4_bytes__U-001FFFFF_:________"?"______________________________| | ||
155 | 0014_2.2.5__5_bytes__U-03FFFFFF_:________"?"______________________________| | ||
156 | 0015_2.2.6__6_bytes__U-7FFFFFFF_:________"?"______________________________| | ||
157 | 0016_2.3__Other_boundary_conditions_______________________________________| | ||
158 | 0017_2.3.1__U-0000D7FF_=_ed_9f_bf_=_"?"___________________________________| | ||
159 | 0018_2.3.2__U-0000E000_=_ee_80_80_=_"?"___________________________________| | ||
160 | 0019_2.3.3__U-0000FFFD_=_ef_bf_bd_=_"?"___________________________________| | ||
161 | 0020_2.3.4__U-0010FFFF_=_f4_8f_bf_bf_=_"?"________________________________| | ||
162 | 0021_2.3.5__U-00110000_=_f4_90_80_80_=_"?"________________________________| | ||
163 | 0022_3__Malformed_sequences_______________________________________________| | ||
164 | 0023_3.1__Unexpected_continuation_bytes___________________________________| | ||
165 | 0024_3.1.1__First_continuation_byte_0x80:_"?"_____________________________| | ||
166 | 0025_3.1.2__Last__continuation_byte_0xbf:_"?"_____________________________| | ||
167 | 0026_3.1.3__2_continuation_bytes:_"??"____________________________________| | ||
168 | 0027_3.1.4__3_continuation_bytes:_"???"___________________________________| | ||
169 | 0028_3.1.5__4_continuation_bytes:_"????"__________________________________| | ||
170 | 0029_3.1.6__5_continuation_bytes:_"?????"_________________________________| | ||
171 | 0030_3.1.7__6_continuation_bytes:_"??????"________________________________| | ||
172 | 0031_3.1.8__7_continuation_bytes:_"???????"_______________________________| | ||
173 | 0032_3.1.9__Sequence_of_all_64_possible_continuation_bytes__0x80-0xbf_:___| | ||
174 | 0033____"????????????????_________________________________________________| | ||
175 | 0034_____????????????????_________________________________________________| | ||
176 | 0035_____????????????????_________________________________________________| | ||
177 | 0036_____????????????????"________________________________________________| | ||
178 | 0037_3.2__Lonely_start_characters_________________________________________| | ||
179 | 0038_3.2.1__All_32_first_bytes_of_2-byte_sequences__0xc0-0xdf_,___________| | ||
180 | 0039________each_followed_by_a_space_character:___________________________| | ||
181 | 0040____"?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_?__________________________________| | ||
182 | 0041_____?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_"________________________________| | ||
183 | 0042_3.2.2__All_16_first_bytes_of_3-byte_sequences__0xe0-0xef_,___________| | ||
184 | 0043________each_followed_by_a_space_character:___________________________| | ||
185 | 0044____"?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_"________________________________| | ||
186 | 0045_3.2.3__All_8_first_bytes_of_4-byte_sequences__0xf0-0xf7_,____________| | ||
187 | 0046________each_followed_by_a_space_character:___________________________| | ||
188 | 0047____"?_?_?_?_?_?_?_?_"________________________________________________| | ||
189 | 0048_3.2.4__All_4_first_bytes_of_5-byte_sequences__0xf8-0xfb_,____________| | ||
190 | 0049________each_followed_by_a_space_character:___________________________| | ||
191 | 0050____"?_?_?_?_"________________________________________________________| | ||
192 | 0051_3.2.5__All_2_first_bytes_of_6-byte_sequences__0xfc-0xfd_,____________| | ||
193 | 0052________each_followed_by_a_space_character:___________________________| | ||
194 | 0053____"?_?_"____________________________________________________________| | ||
195 | 0054_3.3__Sequences_with_last_continuation_byte_missing___________________| | ||
196 | 0055_3.3.1__2-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______| | ||
197 | 0056_3.3.2__3-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______| | ||
198 | 0057_3.3.3__4-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______| | ||
199 | 0058_3.3.4__5-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______| | ||
200 | 0059_3.3.5__6-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______| | ||
201 | 0060_3.3.6__2-byte_sequence_with_last_byte_missing__U-000007FF_:_"?"______| | ||
202 | 0061_3.3.7__3-byte_sequence_with_last_byte_missing__U-0000FFFF_:_"?"______| | ||
203 | 0062_3.3.8__4-byte_sequence_with_last_byte_missing__U-001FFFFF_:_"?"______| | ||
204 | 0063_3.3.9__5-byte_sequence_with_last_byte_missing__U-03FFFFFF_:_"?"______| | ||
205 | 0064_3.3.10_6-byte_sequence_with_last_byte_missing__U-7FFFFFFF_:_"?"______| | ||
206 | 0065_3.4__Concatenation_of_incomplete_sequences___________________________| | ||
207 | 0066____"??????????"______________________________________________________| | ||
208 | 0067_3.5__Impossible_bytes________________________________________________| | ||
209 | 0068_3.5.1__fe_=_"?"______________________________________________________| | ||
210 | 0069_3.5.2__ff_=_"?"______________________________________________________| | ||
211 | 0070_3.5.3__fe_fe_ff_ff_=_"????"__________________________________________| | ||
212 | 0071_4__Overlong_sequences________________________________________________| | ||
213 | 0072_4.1__Examples_of_an_overlong_ASCII_character_________________________| | ||
214 | 0073_4.1.1_U+002F_=_c0_af_____________=_"?"_______________________________| | ||
215 | 0074_4.1.2_U+002F_=_e0_80_af__________=_"?"_______________________________| | ||
216 | 0075_4.1.3_U+002F_=_f0_80_80_af_______=_"?"_______________________________| | ||
217 | 0076_4.1.4_U+002F_=_f8_80_80_80_af____=_"?"_______________________________| | ||
218 | 0077_4.1.5_U+002F_=_fc_80_80_80_80_af_=_"?"_______________________________| | ||
219 | 0078_4.2__Maximum_overlong_sequences______________________________________| | ||
220 | 0079_4.2.1__U-0000007F_=_c1_bf_____________=_"?"__________________________| | ||
221 | 0080_4.2.2__U-000007FF_=_e0_9f_bf__________=_"?"__________________________| | ||
222 | 0081_4.2.3__U-0000FFFF_=_f0_8f_bf_bf_______=_"?"__________________________| | ||
223 | 0082_4.2.4__U-001FFFFF_=_f8_87_bf_bf_bf____=_"?"__________________________| | ||
224 | 0083_4.2.5__U-03FFFFFF_=_fc_83_bf_bf_bf_bf_=_"?"__________________________| | ||
225 | 0084_4.3__Overlong_representation_of_the_NUL_character____________________| | ||
226 | 0085_4.3.1__U+0000_=_c0_80_____________=_"?"______________________________| | ||
227 | 0086_4.3.2__U+0000_=_e0_80_80__________=_"?"______________________________| | ||
228 | 0087_4.3.3__U+0000_=_f0_80_80_80_______=_"?"______________________________| | ||
229 | 0088_4.3.4__U+0000_=_f8_80_80_80_80____=_"?"______________________________| | ||
230 | 0089_4.3.5__U+0000_=_fc_80_80_80_80_80_=_"?"______________________________| | ||
231 | 0090_5__Illegal_code_positions____________________________________________| | ||
232 | 0091_5.1_Single_UTF-16_surrogates_________________________________________| | ||
233 | 0092_5.1.1__U+D800_=_ed_a0_80_=_"?"_______________________________________| | ||
234 | 0093_5.1.2__U+DB7F_=_ed_ad_bf_=_"?"_______________________________________| | ||
235 | 0094_5.1.3__U+DB80_=_ed_ae_80_=_"?"_______________________________________| | ||
236 | 0095_5.1.4__U+DBFF_=_ed_af_bf_=_"?"_______________________________________| | ||
237 | 0096_5.1.5__U+DC00_=_ed_b0_80_=_"?"_______________________________________| | ||
238 | 0097_5.1.6__U+DF80_=_ed_be_80_=_"?"_______________________________________| | ||
239 | 0098_5.1.7__U+DFFF_=_ed_bf_bf_=_"?"_______________________________________| | ||
240 | 0099_5.2_Paired_UTF-16_surrogates_________________________________________| | ||
241 | 0100_5.2.1__U+D800_U+DC00_=_ed_a0_80_ed_b0_80_=_"??"______________________| | ||
242 | 0101_5.2.2__U+D800_U+DFFF_=_ed_a0_80_ed_bf_bf_=_"??"______________________| | ||
243 | 0102_5.2.3__U+DB7F_U+DC00_=_ed_ad_bf_ed_b0_80_=_"??"______________________| | ||
244 | 0103_5.2.4__U+DB7F_U+DFFF_=_ed_ad_bf_ed_bf_bf_=_"??"______________________| | ||
245 | 0104_5.2.5__U+DB80_U+DC00_=_ed_ae_80_ed_b0_80_=_"??"______________________| | ||
246 | 0105_5.2.6__U+DB80_U+DFFF_=_ed_ae_80_ed_bf_bf_=_"??"______________________| | ||
247 | 0106_5.2.7__U+DBFF_U+DC00_=_ed_af_bf_ed_b0_80_=_"??"______________________| | ||
248 | 0107_5.2.8__U+DBFF_U+DFFF_=_ed_af_bf_ed_bf_bf_=_"??"______________________| | ||
249 | 0108_5.3_Other_illegal_code_positions_____________________________________| | ||
250 | 0109_5.3.1__U+FFFE_=_ef_bf_be_=_"?"_______________________________________| | ||
251 | 0110_5.3.2__U+FFFF_=_ef_bf_bf_=_"?"_______________________________________| | ||
252 | ' "" "" | ||
253 | |||
135 | # Clean up | 254 | # Clean up |
136 | rm -rf ls.testdir 2>/dev/null | 255 | rm -rf ls.testdir 2>/dev/null |
137 | 256 | ||