aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDenys Vlasenko <vda.linux@googlemail.com>2010-01-31 05:55:55 +0100
committerDenys Vlasenko <vda.linux@googlemail.com>2010-01-31 05:55:55 +0100
commit3d5b60693109dc5bdaa977b9a25d715a24a33133 (patch)
tree163e95806e5c94613abf50037012313a74705be6
parentd8528b8e56bab7643722e4453121882d23c23c07 (diff)
downloadbusybox-w32-3d5b60693109dc5bdaa977b9a25d715a24a33133.tar.gz
busybox-w32-3d5b60693109dc5bdaa977b9a25d715a24a33133.tar.bz2
busybox-w32-3d5b60693109dc5bdaa977b9a25d715a24a33133.zip
ls: fix handling of broken unicode sequences
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r--libbb/unicode.c47
-rwxr-xr-xtestsuite/ls.tests46
2 files changed, 49 insertions, 44 deletions
diff --git a/libbb/unicode.c b/libbb/unicode.c
index 4e7e3a96a..7c41ef30b 100644
--- a/libbb/unicode.c
+++ b/libbb/unicode.c
@@ -139,6 +139,8 @@ size_t FAST_FUNC wcstombs(char *dest, const wchar_t *src, size_t n)
139 return org_n - n; 139 return org_n - n;
140} 140}
141 141
142#define ERROR_WCHAR (~(wchar_t)0)
143
142static const char *mbstowc_internal(wchar_t *res, const char *src) 144static const char *mbstowc_internal(wchar_t *res, const char *src)
143{ 145{
144 int bytes; 146 int bytes;
@@ -159,16 +161,22 @@ static const char *mbstowc_internal(wchar_t *res, const char *src)
159 c <<= 1; 161 c <<= 1;
160 bytes++; 162 bytes++;
161 } while ((c & 0x80) && bytes < 6); 163 } while ((c & 0x80) && bytes < 6);
162 if (bytes == 1) 164 if (bytes == 1) {
163 return NULL; 165 /* A bare "continuation" byte. Say, 80 */
166 *res = ERROR_WCHAR;
167 return src;
168 }
164 c = (uint8_t)(c) >> bytes; 169 c = (uint8_t)(c) >> bytes;
165 170
166 while (--bytes) { 171 while (--bytes) {
167 unsigned ch = (unsigned char) *src++; 172 unsigned ch = (unsigned char) *src;
168 if ((ch & 0xc0) != 0x80) { 173 if ((ch & 0xc0) != 0x80) {
169 return NULL; 174 /* Missing "continuation" byte. Example: e0 80 */
175 *res = ERROR_WCHAR;
176 return src;
170 } 177 }
171 c = (c << 6) + (ch & 0x3f); 178 c = (c << 6) + (ch & 0x3f);
179 src++;
172 } 180 }
173 181
174 /* TODO */ 182 /* TODO */
@@ -177,8 +185,8 @@ static const char *mbstowc_internal(wchar_t *res, const char *src)
177 /* 11110000 10000000 10000100 10000000 converts to 0x100 */ 185 /* 11110000 10000000 10000100 10000000 converts to 0x100 */
178 /* correct encoding: 11000100 10000000 */ 186 /* correct encoding: 11000100 10000000 */
179 if (c <= 0x7f) { /* crude check */ 187 if (c <= 0x7f) { /* crude check */
180 return NULL; 188 *res = ERROR_WCHAR;
181 //or maybe 0xfffd; /* replacement character */ 189 return src;
182 } 190 }
183 191
184 *res = c; 192 *res = c;
@@ -204,7 +212,7 @@ size_t FAST_FUNC mbstowcs(wchar_t *dest, const char *src, size_t n)
204 while (n) { 212 while (n) {
205 wchar_t wc; 213 wchar_t wc;
206 src = mbstowc_internal(&wc, src); 214 src = mbstowc_internal(&wc, src);
207 if (src == NULL) /* error */ 215 if (wc == ERROR_WCHAR) /* error */
208 return (size_t) -1L; 216 return (size_t) -1L;
209 if (dest) 217 if (dest)
210 *dest++ = wc; 218 *dest++ = wc;
@@ -312,20 +320,15 @@ static char* FAST_FUNC unicode_conv_to_printable2(uni_stat_t *stats, const char
312 goto subst; 320 goto subst;
313 } 321 }
314#else 322#else
315 { 323 src = mbstowc_internal(&wc, src);
316 const char *src1 = mbstowc_internal(&wc, src); 324 /* src is advanced to next mb char
317 /* src = NULL: invalid sequence is seen, 325 * wc == ERROR_WCHAR: invalid sequence is seen
318 * else: wc is set, src is advanced to next mb char 326 * else: wc is set
319 */ 327 */
320 if (src1) { /* no error */ 328 if (wc == ERROR_WCHAR) /* error */
321 if (wc == 0) /* end-of-string */ 329 goto subst;
322 break; 330 if (wc == 0) /* end-of-string */
323 src = src1; 331 break;
324 } else { /* error */
325 src++;
326 goto subst;
327 }
328 }
329#endif 332#endif
330 if (CONFIG_LAST_SUPPORTED_WCHAR && wc > CONFIG_LAST_SUPPORTED_WCHAR) 333 if (CONFIG_LAST_SUPPORTED_WCHAR && wc > CONFIG_LAST_SUPPORTED_WCHAR)
331 goto subst; 334 goto subst;
@@ -411,7 +414,7 @@ unsigned FAST_FUNC unicode_padding_to_width(unsigned width, const char *src)
411 } 414 }
412#else 415#else
413 src = mbstowc_internal(&wc, src); 416 src = mbstowc_internal(&wc, src);
414 if (!src || wc == 0) /* error, or end-of-string */ 417 if (wc == ERROR_WCHAR || wc == 0) /* error, or end-of-string */
415 return width; 418 return width;
416#endif 419#endif
417 w = wcwidth(wc); 420 w = wcwidth(wc);
diff --git a/testsuite/ls.tests b/testsuite/ls.tests
index b0c5da7f9..8d0f2c2ea 100755
--- a/testsuite/ls.tests
+++ b/testsuite/ls.tests
@@ -11,9 +11,11 @@ mkdir ls.testdir || exit 1
11 11
12# testing "test name" "command" "expected result" "file input" "stdin" 12# testing "test name" "command" "expected result" "file input" "stdin"
13 13
14# The test isn't passing correctly now - all | chars should line up 14# With Unicode provided by libc locale, I'm not sure this test can pass.
15# perfectly in the correctly passed test. 15# I suspect we might fail to skip exactly the correct number of bytes
16# over broken unicode sequences.
16test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \ 17test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \
18&& test x"$CONFIG_LOCALE_SUPPORT" != x"y" \
17&& test x"$CONFIG_SUBST_WCHAR" = x"63" \ 19&& test x"$CONFIG_SUBST_WCHAR" = x"63" \
18&& test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"767" \ 20&& test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"767" \
19&& testing "ls unicode test" \ 21&& testing "ls unicode test" \
@@ -73,40 +75,40 @@ test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \
730053____"?_?_"____________________________________________________________| 750053____"?_?_"____________________________________________________________|
740054_3.3__Sequences_with_last_continuation_byte_missing___________________| 760054_3.3__Sequences_with_last_continuation_byte_missing___________________|
750055_3.3.1__2-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______| 770055_3.3.1__2-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______|
760056_3.3.2__3-byte_sequence_with_last_byte_missing__U+0000_:_____"??"______| 780056_3.3.2__3-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______|
770057_3.3.3__4-byte_sequence_with_last_byte_missing__U+0000_:_____"???"______| 790057_3.3.3__4-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______|
780058_3.3.4__5-byte_sequence_with_last_byte_missing__U+0000_:_____"????"______| 800058_3.3.4__5-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______|
790059_3.3.5__6-byte_sequence_with_last_byte_missing__U+0000_:_____"?????"______| 810059_3.3.5__6-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______|
800060_3.3.6__2-byte_sequence_with_last_byte_missing__U-000007FF_:_"?"______| 820060_3.3.6__2-byte_sequence_with_last_byte_missing__U-000007FF_:_"?"______|
810061_3.3.7__3-byte_sequence_with_last_byte_missing__U-0000FFFF_:_"??"______| 830061_3.3.7__3-byte_sequence_with_last_byte_missing__U-0000FFFF_:_"?"______|
820062_3.3.8__4-byte_sequence_with_last_byte_missing__U-001FFFFF_:_"???"______| 840062_3.3.8__4-byte_sequence_with_last_byte_missing__U-001FFFFF_:_"?"______|
830063_3.3.9__5-byte_sequence_with_last_byte_missing__U-03FFFFFF_:_"????"______| 850063_3.3.9__5-byte_sequence_with_last_byte_missing__U-03FFFFFF_:_"?"______|
840064_3.3.10_6-byte_sequence_with_last_byte_missing__U-7FFFFFFF_:_"?????"______| 860064_3.3.10_6-byte_sequence_with_last_byte_missing__U-7FFFFFFF_:_"?"______|
850065_3.4__Concatenation_of_incomplete_sequences___________________________| 870065_3.4__Concatenation_of_incomplete_sequences___________________________|
860066____"??????????????????????????????"______________________________________________________| 880066____"??????????"______________________________________________________|
870067_3.5__Impossible_bytes________________________________________________| 890067_3.5__Impossible_bytes________________________________________________|
880068_3.5.1__fe_=_"?"______________________________________________________| 900068_3.5.1__fe_=_"?"______________________________________________________|
890069_3.5.2__ff_=_"?"______________________________________________________| 910069_3.5.2__ff_=_"?"______________________________________________________|
900070_3.5.3__fe_fe_ff_ff_=_"????"__________________________________________| 920070_3.5.3__fe_fe_ff_ff_=_"????"__________________________________________|
910071_4__Overlong_sequences________________________________________________| 930071_4__Overlong_sequences________________________________________________|
920072_4.1__Examples_of_an_overlong_ASCII_character_________________________| 940072_4.1__Examples_of_an_overlong_ASCII_character_________________________|
930073_4.1.1_U+002F_=_c0_af_____________=_"??"_______________________________| 950073_4.1.1_U+002F_=_c0_af_____________=_"?"_______________________________|
940074_4.1.2_U+002F_=_e0_80_af__________=_"???"_______________________________| 960074_4.1.2_U+002F_=_e0_80_af__________=_"?"_______________________________|
950075_4.1.3_U+002F_=_f0_80_80_af_______=_"????"_______________________________| 970075_4.1.3_U+002F_=_f0_80_80_af_______=_"?"_______________________________|
960076_4.1.4_U+002F_=_f8_80_80_80_af____=_"?????"_______________________________| 980076_4.1.4_U+002F_=_f8_80_80_80_af____=_"?"_______________________________|
970077_4.1.5_U+002F_=_fc_80_80_80_80_af_=_"??????"_______________________________| 990077_4.1.5_U+002F_=_fc_80_80_80_80_af_=_"?"_______________________________|
980078_4.2__Maximum_overlong_sequences______________________________________| 1000078_4.2__Maximum_overlong_sequences______________________________________|
990079_4.2.1__U-0000007F_=_c1_bf_____________=_"??"__________________________| 1010079_4.2.1__U-0000007F_=_c1_bf_____________=_"?"__________________________|
1000080_4.2.2__U-000007FF_=_e0_9f_bf__________=_"?"__________________________| 1020080_4.2.2__U-000007FF_=_e0_9f_bf__________=_"?"__________________________|
1010081_4.2.3__U-0000FFFF_=_f0_8f_bf_bf_______=_"?"__________________________| 1030081_4.2.3__U-0000FFFF_=_f0_8f_bf_bf_______=_"?"__________________________|
1020082_4.2.4__U-001FFFFF_=_f8_87_bf_bf_bf____=_"?"__________________________| 1040082_4.2.4__U-001FFFFF_=_f8_87_bf_bf_bf____=_"?"__________________________|
1030083_4.2.5__U-03FFFFFF_=_fc_83_bf_bf_bf_bf_=_"?"__________________________| 1050083_4.2.5__U-03FFFFFF_=_fc_83_bf_bf_bf_bf_=_"?"__________________________|
1040084_4.3__Overlong_representation_of_the_NUL_character____________________| 1060084_4.3__Overlong_representation_of_the_NUL_character____________________|
1050085_4.3.1__U+0000_=_c0_80_____________=_"??"______________________________| 1070085_4.3.1__U+0000_=_c0_80_____________=_"?"______________________________|
1060086_4.3.2__U+0000_=_e0_80_80__________=_"???"______________________________| 1080086_4.3.2__U+0000_=_e0_80_80__________=_"?"______________________________|
1070087_4.3.3__U+0000_=_f0_80_80_80_______=_"????"______________________________| 1090087_4.3.3__U+0000_=_f0_80_80_80_______=_"?"______________________________|
1080088_4.3.4__U+0000_=_f8_80_80_80_80____=_"?????"______________________________| 1100088_4.3.4__U+0000_=_f8_80_80_80_80____=_"?"______________________________|
1090089_4.3.5__U+0000_=_fc_80_80_80_80_80_=_"??????"______________________________| 1110089_4.3.5__U+0000_=_fc_80_80_80_80_80_=_"?"______________________________|
1100090_5__Illegal_code_positions____________________________________________| 1120090_5__Illegal_code_positions____________________________________________|
1110091_5.1_Single_UTF-16_surrogates_________________________________________| 1130091_5.1_Single_UTF-16_surrogates_________________________________________|
1120092_5.1.1__U+D800_=_ed_a0_80_=_"?"_______________________________________| 1140092_5.1.1__U+D800_=_ed_a0_80_=_"?"_______________________________________|