diff options
author | Denys Vlasenko <vda.linux@googlemail.com> | 2010-01-31 05:55:55 +0100 |
---|---|---|
committer | Denys Vlasenko <vda.linux@googlemail.com> | 2010-01-31 05:55:55 +0100 |
commit | 3d5b60693109dc5bdaa977b9a25d715a24a33133 (patch) | |
tree | 163e95806e5c94613abf50037012313a74705be6 | |
parent | d8528b8e56bab7643722e4453121882d23c23c07 (diff) | |
download | busybox-w32-3d5b60693109dc5bdaa977b9a25d715a24a33133.tar.gz busybox-w32-3d5b60693109dc5bdaa977b9a25d715a24a33133.tar.bz2 busybox-w32-3d5b60693109dc5bdaa977b9a25d715a24a33133.zip |
ls: fix handling of broken unicode sequences
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r-- | libbb/unicode.c | 47 | ||||
-rwxr-xr-x | testsuite/ls.tests | 46 |
2 files changed, 49 insertions, 44 deletions
diff --git a/libbb/unicode.c b/libbb/unicode.c index 4e7e3a96a..7c41ef30b 100644 --- a/libbb/unicode.c +++ b/libbb/unicode.c | |||
@@ -139,6 +139,8 @@ size_t FAST_FUNC wcstombs(char *dest, const wchar_t *src, size_t n) | |||
139 | return org_n - n; | 139 | return org_n - n; |
140 | } | 140 | } |
141 | 141 | ||
142 | #define ERROR_WCHAR (~(wchar_t)0) | ||
143 | |||
142 | static const char *mbstowc_internal(wchar_t *res, const char *src) | 144 | static const char *mbstowc_internal(wchar_t *res, const char *src) |
143 | { | 145 | { |
144 | int bytes; | 146 | int bytes; |
@@ -159,16 +161,22 @@ static const char *mbstowc_internal(wchar_t *res, const char *src) | |||
159 | c <<= 1; | 161 | c <<= 1; |
160 | bytes++; | 162 | bytes++; |
161 | } while ((c & 0x80) && bytes < 6); | 163 | } while ((c & 0x80) && bytes < 6); |
162 | if (bytes == 1) | 164 | if (bytes == 1) { |
163 | return NULL; | 165 | /* A bare "continuation" byte. Say, 80 */ |
166 | *res = ERROR_WCHAR; | ||
167 | return src; | ||
168 | } | ||
164 | c = (uint8_t)(c) >> bytes; | 169 | c = (uint8_t)(c) >> bytes; |
165 | 170 | ||
166 | while (--bytes) { | 171 | while (--bytes) { |
167 | unsigned ch = (unsigned char) *src++; | 172 | unsigned ch = (unsigned char) *src; |
168 | if ((ch & 0xc0) != 0x80) { | 173 | if ((ch & 0xc0) != 0x80) { |
169 | return NULL; | 174 | /* Missing "continuation" byte. Example: e0 80 */ |
175 | *res = ERROR_WCHAR; | ||
176 | return src; | ||
170 | } | 177 | } |
171 | c = (c << 6) + (ch & 0x3f); | 178 | c = (c << 6) + (ch & 0x3f); |
179 | src++; | ||
172 | } | 180 | } |
173 | 181 | ||
174 | /* TODO */ | 182 | /* TODO */ |
@@ -177,8 +185,8 @@ static const char *mbstowc_internal(wchar_t *res, const char *src) | |||
177 | /* 11110000 10000000 10000100 10000000 converts to 0x100 */ | 185 | /* 11110000 10000000 10000100 10000000 converts to 0x100 */ |
178 | /* correct encoding: 11000100 10000000 */ | 186 | /* correct encoding: 11000100 10000000 */ |
179 | if (c <= 0x7f) { /* crude check */ | 187 | if (c <= 0x7f) { /* crude check */ |
180 | return NULL; | 188 | *res = ERROR_WCHAR; |
181 | //or maybe 0xfffd; /* replacement character */ | 189 | return src; |
182 | } | 190 | } |
183 | 191 | ||
184 | *res = c; | 192 | *res = c; |
@@ -204,7 +212,7 @@ size_t FAST_FUNC mbstowcs(wchar_t *dest, const char *src, size_t n) | |||
204 | while (n) { | 212 | while (n) { |
205 | wchar_t wc; | 213 | wchar_t wc; |
206 | src = mbstowc_internal(&wc, src); | 214 | src = mbstowc_internal(&wc, src); |
207 | if (src == NULL) /* error */ | 215 | if (wc == ERROR_WCHAR) /* error */ |
208 | return (size_t) -1L; | 216 | return (size_t) -1L; |
209 | if (dest) | 217 | if (dest) |
210 | *dest++ = wc; | 218 | *dest++ = wc; |
@@ -312,20 +320,15 @@ static char* FAST_FUNC unicode_conv_to_printable2(uni_stat_t *stats, const char | |||
312 | goto subst; | 320 | goto subst; |
313 | } | 321 | } |
314 | #else | 322 | #else |
315 | { | 323 | src = mbstowc_internal(&wc, src); |
316 | const char *src1 = mbstowc_internal(&wc, src); | 324 | /* src is advanced to next mb char |
317 | /* src = NULL: invalid sequence is seen, | 325 | * wc == ERROR_WCHAR: invalid sequence is seen |
318 | * else: wc is set, src is advanced to next mb char | 326 | * else: wc is set |
319 | */ | 327 | */ |
320 | if (src1) { /* no error */ | 328 | if (wc == ERROR_WCHAR) /* error */ |
321 | if (wc == 0) /* end-of-string */ | 329 | goto subst; |
322 | break; | 330 | if (wc == 0) /* end-of-string */ |
323 | src = src1; | 331 | break; |
324 | } else { /* error */ | ||
325 | src++; | ||
326 | goto subst; | ||
327 | } | ||
328 | } | ||
329 | #endif | 332 | #endif |
330 | if (CONFIG_LAST_SUPPORTED_WCHAR && wc > CONFIG_LAST_SUPPORTED_WCHAR) | 333 | if (CONFIG_LAST_SUPPORTED_WCHAR && wc > CONFIG_LAST_SUPPORTED_WCHAR) |
331 | goto subst; | 334 | goto subst; |
@@ -411,7 +414,7 @@ unsigned FAST_FUNC unicode_padding_to_width(unsigned width, const char *src) | |||
411 | } | 414 | } |
412 | #else | 415 | #else |
413 | src = mbstowc_internal(&wc, src); | 416 | src = mbstowc_internal(&wc, src); |
414 | if (!src || wc == 0) /* error, or end-of-string */ | 417 | if (wc == ERROR_WCHAR || wc == 0) /* error, or end-of-string */ |
415 | return width; | 418 | return width; |
416 | #endif | 419 | #endif |
417 | w = wcwidth(wc); | 420 | w = wcwidth(wc); |
diff --git a/testsuite/ls.tests b/testsuite/ls.tests index b0c5da7f9..8d0f2c2ea 100755 --- a/testsuite/ls.tests +++ b/testsuite/ls.tests | |||
@@ -11,9 +11,11 @@ mkdir ls.testdir || exit 1 | |||
11 | 11 | ||
12 | # testing "test name" "command" "expected result" "file input" "stdin" | 12 | # testing "test name" "command" "expected result" "file input" "stdin" |
13 | 13 | ||
14 | # The test isn't passing correctly now - all | chars should line up | 14 | # With Unicode provided by libc locale, I'm not sure this test can pass. |
15 | # perfectly in the correctly passed test. | 15 | # I suspect we might fail to skip exactly the correct number of bytes |
16 | # over broken unicode sequences. | |
16 | test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \ | 17 | test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \ |
18 | && test x"$CONFIG_LOCALE_SUPPORT" != x"y" \ | ||
17 | && test x"$CONFIG_SUBST_WCHAR" = x"63" \ | 19 | && test x"$CONFIG_SUBST_WCHAR" = x"63" \ |
18 | && test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"767" \ | 20 | && test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"767" \ |
19 | && testing "ls unicode test" \ | 21 | && testing "ls unicode test" \ |
@@ -73,40 +75,40 @@ test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \ | |||
73 | 0053____"?_?_"____________________________________________________________| | 75 | 0053____"?_?_"____________________________________________________________| |
74 | 0054_3.3__Sequences_with_last_continuation_byte_missing___________________| | 76 | 0054_3.3__Sequences_with_last_continuation_byte_missing___________________| |
75 | 0055_3.3.1__2-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______| | 77 | 0055_3.3.1__2-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______| |
76 | 0056_3.3.2__3-byte_sequence_with_last_byte_missing__U+0000_:_____"??"______| | 78 | 0056_3.3.2__3-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______| |
77 | 0057_3.3.3__4-byte_sequence_with_last_byte_missing__U+0000_:_____"???"______| | 79 | 0057_3.3.3__4-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______| |
78 | 0058_3.3.4__5-byte_sequence_with_last_byte_missing__U+0000_:_____"????"______| | 80 | 0058_3.3.4__5-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______| |
79 | 0059_3.3.5__6-byte_sequence_with_last_byte_missing__U+0000_:_____"?????"______| | 81 | 0059_3.3.5__6-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______| |
80 | 0060_3.3.6__2-byte_sequence_with_last_byte_missing__U-000007FF_:_"?"______| | 82 | 0060_3.3.6__2-byte_sequence_with_last_byte_missing__U-000007FF_:_"?"______| |
81 | 0061_3.3.7__3-byte_sequence_with_last_byte_missing__U-0000FFFF_:_"??"______| | 83 | 0061_3.3.7__3-byte_sequence_with_last_byte_missing__U-0000FFFF_:_"?"______| |
82 | 0062_3.3.8__4-byte_sequence_with_last_byte_missing__U-001FFFFF_:_"???"______| | 84 | 0062_3.3.8__4-byte_sequence_with_last_byte_missing__U-001FFFFF_:_"?"______| |
83 | 0063_3.3.9__5-byte_sequence_with_last_byte_missing__U-03FFFFFF_:_"????"______| | 85 | 0063_3.3.9__5-byte_sequence_with_last_byte_missing__U-03FFFFFF_:_"?"______| |
84 | 0064_3.3.10_6-byte_sequence_with_last_byte_missing__U-7FFFFFFF_:_"?????"______| | 86 | 0064_3.3.10_6-byte_sequence_with_last_byte_missing__U-7FFFFFFF_:_"?"______| |
85 | 0065_3.4__Concatenation_of_incomplete_sequences___________________________| | 87 | 0065_3.4__Concatenation_of_incomplete_sequences___________________________| |
86 | 0066____"??????????????????????????????"______________________________________________________| | 88 | 0066____"??????????"______________________________________________________| |
87 | 0067_3.5__Impossible_bytes________________________________________________| | 89 | 0067_3.5__Impossible_bytes________________________________________________| |
88 | 0068_3.5.1__fe_=_"?"______________________________________________________| | 90 | 0068_3.5.1__fe_=_"?"______________________________________________________| |
89 | 0069_3.5.2__ff_=_"?"______________________________________________________| | 91 | 0069_3.5.2__ff_=_"?"______________________________________________________| |
90 | 0070_3.5.3__fe_fe_ff_ff_=_"????"__________________________________________| | 92 | 0070_3.5.3__fe_fe_ff_ff_=_"????"__________________________________________| |
91 | 0071_4__Overlong_sequences________________________________________________| | 93 | 0071_4__Overlong_sequences________________________________________________| |
92 | 0072_4.1__Examples_of_an_overlong_ASCII_character_________________________| | 94 | 0072_4.1__Examples_of_an_overlong_ASCII_character_________________________| |
93 | 0073_4.1.1_U+002F_=_c0_af_____________=_"??"_______________________________| | 95 | 0073_4.1.1_U+002F_=_c0_af_____________=_"?"_______________________________| |
94 | 0074_4.1.2_U+002F_=_e0_80_af__________=_"???"_______________________________| | 96 | 0074_4.1.2_U+002F_=_e0_80_af__________=_"?"_______________________________| |
95 | 0075_4.1.3_U+002F_=_f0_80_80_af_______=_"????"_______________________________| | 97 | 0075_4.1.3_U+002F_=_f0_80_80_af_______=_"?"_______________________________| |
96 | 0076_4.1.4_U+002F_=_f8_80_80_80_af____=_"?????"_______________________________| | 98 | 0076_4.1.4_U+002F_=_f8_80_80_80_af____=_"?"_______________________________| |
97 | 0077_4.1.5_U+002F_=_fc_80_80_80_80_af_=_"??????"_______________________________| | 99 | 0077_4.1.5_U+002F_=_fc_80_80_80_80_af_=_"?"_______________________________| |
98 | 0078_4.2__Maximum_overlong_sequences______________________________________| | 100 | 0078_4.2__Maximum_overlong_sequences______________________________________| |
99 | 0079_4.2.1__U-0000007F_=_c1_bf_____________=_"??"__________________________| | 101 | 0079_4.2.1__U-0000007F_=_c1_bf_____________=_"?"__________________________| |
100 | 0080_4.2.2__U-000007FF_=_e0_9f_bf__________=_"?"__________________________| | 102 | 0080_4.2.2__U-000007FF_=_e0_9f_bf__________=_"?"__________________________| |
101 | 0081_4.2.3__U-0000FFFF_=_f0_8f_bf_bf_______=_"?"__________________________| | 103 | 0081_4.2.3__U-0000FFFF_=_f0_8f_bf_bf_______=_"?"__________________________| |
102 | 0082_4.2.4__U-001FFFFF_=_f8_87_bf_bf_bf____=_"?"__________________________| | 104 | 0082_4.2.4__U-001FFFFF_=_f8_87_bf_bf_bf____=_"?"__________________________| |
103 | 0083_4.2.5__U-03FFFFFF_=_fc_83_bf_bf_bf_bf_=_"?"__________________________| | 105 | 0083_4.2.5__U-03FFFFFF_=_fc_83_bf_bf_bf_bf_=_"?"__________________________| |
104 | 0084_4.3__Overlong_representation_of_the_NUL_character____________________| | 106 | 0084_4.3__Overlong_representation_of_the_NUL_character____________________| |
105 | 0085_4.3.1__U+0000_=_c0_80_____________=_"??"______________________________| | 107 | 0085_4.3.1__U+0000_=_c0_80_____________=_"?"______________________________| |
106 | 0086_4.3.2__U+0000_=_e0_80_80__________=_"???"______________________________| | 108 | 0086_4.3.2__U+0000_=_e0_80_80__________=_"?"______________________________| |
107 | 0087_4.3.3__U+0000_=_f0_80_80_80_______=_"????"______________________________| | 109 | 0087_4.3.3__U+0000_=_f0_80_80_80_______=_"?"______________________________| |
108 | 0088_4.3.4__U+0000_=_f8_80_80_80_80____=_"?????"______________________________| | 110 | 0088_4.3.4__U+0000_=_f8_80_80_80_80____=_"?"______________________________| |
109 | 0089_4.3.5__U+0000_=_fc_80_80_80_80_80_=_"??????"______________________________| | 111 | 0089_4.3.5__U+0000_=_fc_80_80_80_80_80_=_"?"______________________________| |
110 | 0090_5__Illegal_code_positions____________________________________________| | 112 | 0090_5__Illegal_code_positions____________________________________________| |
111 | 0091_5.1_Single_UTF-16_surrogates_________________________________________| | 113 | 0091_5.1_Single_UTF-16_surrogates_________________________________________| |
112 | 0092_5.1.1__U+D800_=_ed_a0_80_=_"?"_______________________________________| | 114 | 0092_5.1.1__U+D800_=_ed_a0_80_=_"?"_______________________________________| |