diff options
author | Denys Vlasenko <vda.linux@googlemail.com> | 2010-01-31 05:55:55 +0100 |
---|---|---|
committer | Denys Vlasenko <vda.linux@googlemail.com> | 2010-01-31 05:55:55 +0100 |
commit | 3d5b60693109dc5bdaa977b9a25d715a24a33133 (patch) | |
tree | 163e95806e5c94613abf50037012313a74705be6 /libbb/unicode.c | |
parent | d8528b8e56bab7643722e4453121882d23c23c07 (diff) | |
download | busybox-w32-3d5b60693109dc5bdaa977b9a25d715a24a33133.tar.gz busybox-w32-3d5b60693109dc5bdaa977b9a25d715a24a33133.tar.bz2 busybox-w32-3d5b60693109dc5bdaa977b9a25d715a24a33133.zip |
ls: fix handling of broken unicode sequences
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Diffstat (limited to 'libbb/unicode.c')
-rw-r--r-- | libbb/unicode.c | 47 |
1 file changed, 25 insertions, 22 deletions
diff --git a/libbb/unicode.c b/libbb/unicode.c index 4e7e3a96a..7c41ef30b 100644 --- a/libbb/unicode.c +++ b/libbb/unicode.c | |||
@@ -139,6 +139,8 @@ size_t FAST_FUNC wcstombs(char *dest, const wchar_t *src, size_t n) | |||
139 | return org_n - n; | 139 | return org_n - n; |
140 | } | 140 | } |
141 | 141 | ||
142 | #define ERROR_WCHAR (~(wchar_t)0) | ||
143 | |||
142 | static const char *mbstowc_internal(wchar_t *res, const char *src) | 144 | static const char *mbstowc_internal(wchar_t *res, const char *src) |
143 | { | 145 | { |
144 | int bytes; | 146 | int bytes; |
@@ -159,16 +161,22 @@ static const char *mbstowc_internal(wchar_t *res, const char *src) | |||
159 | c <<= 1; | 161 | c <<= 1; |
160 | bytes++; | 162 | bytes++; |
161 | } while ((c & 0x80) && bytes < 6); | 163 | } while ((c & 0x80) && bytes < 6); |
162 | if (bytes == 1) | 164 | if (bytes == 1) { |
163 | return NULL; | 165 | /* A bare "continuation" byte. Say, 80 */ |
166 | *res = ERROR_WCHAR; | ||
167 | return src; | ||
168 | } | ||
164 | c = (uint8_t)(c) >> bytes; | 169 | c = (uint8_t)(c) >> bytes; |
165 | 170 | ||
166 | while (--bytes) { | 171 | while (--bytes) { |
167 | unsigned ch = (unsigned char) *src++; | 172 | unsigned ch = (unsigned char) *src; |
168 | if ((ch & 0xc0) != 0x80) { | 173 | if ((ch & 0xc0) != 0x80) { |
169 | return NULL; | 174 | /* Missing "continuation" byte. Example: e0 80 */ |
175 | *res = ERROR_WCHAR; | ||
176 | return src; | ||
170 | } | 177 | } |
171 | c = (c << 6) + (ch & 0x3f); | 178 | c = (c << 6) + (ch & 0x3f); |
179 | src++; | ||
172 | } | 180 | } |
173 | 181 | ||
174 | /* TODO */ | 182 | /* TODO */ |
@@ -177,8 +185,8 @@ static const char *mbstowc_internal(wchar_t *res, const char *src) | |||
177 | /* 11110000 10000000 10000100 10000000 converts to 0x100 */ | 185 | /* 11110000 10000000 10000100 10000000 converts to 0x100 */ |
178 | /* correct encoding: 11000100 10000000 */ | 186 | /* correct encoding: 11000100 10000000 */ |
179 | if (c <= 0x7f) { /* crude check */ | 187 | if (c <= 0x7f) { /* crude check */ |
180 | return NULL; | 188 | *res = ERROR_WCHAR; |
181 | //or maybe 0xfffd; /* replacement character */ | 189 | return src; |
182 | } | 190 | } |
183 | 191 | ||
184 | *res = c; | 192 | *res = c; |
@@ -204,7 +212,7 @@ size_t FAST_FUNC mbstowcs(wchar_t *dest, const char *src, size_t n) | |||
204 | while (n) { | 212 | while (n) { |
205 | wchar_t wc; | 213 | wchar_t wc; |
206 | src = mbstowc_internal(&wc, src); | 214 | src = mbstowc_internal(&wc, src); |
207 | if (src == NULL) /* error */ | 215 | if (wc == ERROR_WCHAR) /* error */ |
208 | return (size_t) -1L; | 216 | return (size_t) -1L; |
209 | if (dest) | 217 | if (dest) |
210 | *dest++ = wc; | 218 | *dest++ = wc; |
@@ -312,20 +320,15 @@ static char* FAST_FUNC unicode_conv_to_printable2(uni_stat_t *stats, const char | |||
312 | goto subst; | 320 | goto subst; |
313 | } | 321 | } |
314 | #else | 322 | #else |
315 | { | 323 | src = mbstowc_internal(&wc, src); |
316 | const char *src1 = mbstowc_internal(&wc, src); | 324 | /* src is advanced to next mb char |
317 | /* src = NULL: invalid sequence is seen, | 325 | * wc == ERROR_WCHAR: invalid sequence is seen |
318 | * else: wc is set, src is advanced to next mb char | 326 | * else: wc is set |
319 | */ | 327 | */ |
320 | if (src1) { /* no error */ | 328 | if (wc == ERROR_WCHAR) /* error */ |
321 | if (wc == 0) /* end-of-string */ | 329 | goto subst; |
322 | break; | 330 | if (wc == 0) /* end-of-string */ |
323 | src = src1; | 331 | break; |
324 | } else { /* error */ | ||
325 | src++; | ||
326 | goto subst; | ||
327 | } | ||
328 | } | ||
329 | #endif | 332 | #endif |
330 | if (CONFIG_LAST_SUPPORTED_WCHAR && wc > CONFIG_LAST_SUPPORTED_WCHAR) | 333 | if (CONFIG_LAST_SUPPORTED_WCHAR && wc > CONFIG_LAST_SUPPORTED_WCHAR) |
331 | goto subst; | 334 | goto subst; |
@@ -411,7 +414,7 @@ unsigned FAST_FUNC unicode_padding_to_width(unsigned width, const char *src) | |||
411 | } | 414 | } |
412 | #else | 415 | #else |
413 | src = mbstowc_internal(&wc, src); | 416 | src = mbstowc_internal(&wc, src); |
414 | if (!src || wc == 0) /* error, or end-of-string */ | 417 | if (wc == ERROR_WCHAR || wc == 0) /* error, or end-of-string */ |
415 | return width; | 418 | return width; |
416 | #endif | 419 | #endif |
417 | w = wcwidth(wc); | 420 | w = wcwidth(wc); |