diff options
author | Denys Vlasenko <vda.linux@googlemail.com> | 2010-01-29 09:11:47 +0100 |
---|---|---|
committer | Denys Vlasenko <vda.linux@googlemail.com> | 2010-01-29 09:11:47 +0100 |
commit | 2edba21f4c59d071f2241c2f47021c7034ec7cb8 (patch) | |
tree | 6cf3de29bfbdafa26fddbc1cd3dc467a2d8263f6 | |
parent | 083e172641b64c564b7ec5478197dccbde43b421 (diff) | |
download | busybox-w32-2edba21f4c59d071f2241c2f47021c7034ec7cb8.tar.gz busybox-w32-2edba21f4c59d071f2241c2f47021c7034ec7cb8.tar.bz2 busybox-w32-2edba21f4c59d071f2241c2f47021c7034ec7cb8.zip |
more fine-grained Unicode support
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r-- | Config.in | 51 | ||||
-rw-r--r-- | libbb/unicode.c | 86 | ||||
-rw-r--r-- | libbb/unicode_wcwidth.c | 24 |
3 files changed, 143 insertions, 18 deletions
@@ -141,6 +141,57 @@ config FEATURE_CHECK_UNICODE_IN_ENV | |||
141 | 141 | ||
142 | Otherwise, Unicode support will be always enabled and active. | 142 | Otherwise, Unicode support will be always enabled and active. |
143 | 143 | ||
144 | config SUBST_WCHAR | ||
145 | int "Character code to substitute unprintable characters with" | ||
146 | range 1 4294967295 | ||
147 | depends on FEATURE_ASSUME_UNICODE | ||
148 | default 63 | ||
149 | help | ||
150 | Typical values are 63 for '?' (works with any output device), | ||
151 | 26 for ASCII substitute control code, | ||
152 | 65533 (0xfffd) for Unicode replacement character. | ||
153 | |||
154 | config LAST_SUPPORTED_WCHAR | ||
155 | int "Range of supported Unicode characters" | ||
156 | range 0 4294967295 | ||
157 | depends on FEATURE_ASSUME_UNICODE | ||
158 | default 767 | ||
159 | help | ||
160 | Any character with Unicode value bigger than this is assumed | ||
161 | to be non-printable on output device. Many applets replace | ||
162 | such chars with substitution character. | ||
163 | |||
164 | The idea is that many valid printable Unicode chars are | ||
165 | nevertheless not displayed correctly. Think about | ||
166 | combining characters, double-wide hieroglyphs and such. | ||
167 | Many terminals, xterms and such will fail to handle them | ||
168 | correctly. | ||
169 | |||
170 | Typical values are: | ||
171 | 126 - ASCII only | ||
172 | 767 (0x2ff) - there are no combining chars in [0..767] range | ||
173 | (the range includes Latin 1, Latin Ext. A and B), | ||
174 | code is ~700 bytes smaller for this case. | ||
175 | 4351 (0x10ff) - there are no double-wide chars in [0..4351] range, | ||
176 | code is ~300 bytes smaller for this case. | ||
177 | 0 - off, any valid printable Unicode character will be printed. | ||
178 | |||
179 | config UNICODE_COMBINING_WCHARS | ||
180 | bool "Allow zero-width Unicode characters on output" | ||
181 | default n | ||
182 | depends on FEATURE_ASSUME_UNICODE | ||
183 | help | ||
184 | With this option off, any Unicode char with width of 0 | ||
185 | is substituted on output. | ||
186 | |||
187 | config UNICODE_WIDE_WCHARS | ||
188 | bool "Allow wide Unicode characters on output" | ||
189 | default n | ||
190 | depends on FEATURE_ASSUME_UNICODE | ||
191 | help | ||
192 | With this option off, any Unicode char with width > 1 | ||
193 | is substituted on output. | ||
194 | |||
144 | config LONG_OPTS | 195 | config LONG_OPTS |
145 | bool "Support for --long-options" | 196 | bool "Support for --long-options" |
146 | default y | 197 | default y |
diff --git a/libbb/unicode.c b/libbb/unicode.c index 39b173e9c..878af84bc 100644 --- a/libbb/unicode.c +++ b/libbb/unicode.c | |||
@@ -216,8 +216,6 @@ size_t FAST_FUNC mbstowcs(wchar_t *dest, const char *src, size_t n) | |||
216 | return org_n - n; | 216 | return org_n - n; |
217 | } | 217 | } |
218 | 218 | ||
219 | #include "unicode_wcwidth.c" | ||
220 | |||
221 | int FAST_FUNC iswspace(wint_t wc) | 219 | int FAST_FUNC iswspace(wint_t wc) |
222 | { | 220 | { |
223 | return (unsigned)wc <= 0x7f && isspace(wc); | 221 | return (unsigned)wc <= 0x7f && isspace(wc); |
@@ -233,6 +231,8 @@ int FAST_FUNC iswpunct(wint_t wc) | |||
233 | return (unsigned)wc <= 0x7f && ispunct(wc); | 231 | return (unsigned)wc <= 0x7f && ispunct(wc); |
234 | } | 232 | } |
235 | 233 | ||
234 | #include "unicode_wcwidth.c" | ||
235 | |||
236 | #endif /* Homegrown Unicode support */ | 236 | #endif /* Homegrown Unicode support */ |
237 | 237 | ||
238 | 238 | ||
@@ -251,8 +251,22 @@ char* FAST_FUNC unicode_cut_nchars(unsigned width, const char *src) | |||
251 | char *dst; | 251 | char *dst; |
252 | unsigned dst_len; | 252 | unsigned dst_len; |
253 | 253 | ||
254 | if (unicode_status != UNICODE_ON) | 254 | if (unicode_status != UNICODE_ON) { |
255 | return xasprintf("%-*.*s", width, width, src); | 255 | char *d = dst = xmalloc(width + 1); |
256 | while ((int)--width >= 0) { | ||
257 | unsigned char c = *src; | ||
258 | if (c == '\0') { | ||
259 | do | ||
260 | *d++ = ' '; | ||
261 | while ((int)--width >= 0); | ||
262 | break; | ||
263 | } | ||
264 | *d++ = (c >= ' ' && c < 0x7f) ? c : '?'; | ||
265 | src++; | ||
266 | } | ||
267 | *d = '\0'; | ||
268 | return dst; | ||
269 | } | ||
256 | 270 | ||
257 | dst = NULL; | 271 | dst = NULL; |
258 | dst_len = 0; | 272 | dst_len = 0; |
@@ -260,31 +274,64 @@ char* FAST_FUNC unicode_cut_nchars(unsigned width, const char *src) | |||
260 | int w; | 274 | int w; |
261 | wchar_t wc; | 275 | wchar_t wc; |
262 | 276 | ||
263 | dst = xrealloc(dst, dst_len + 2 * MB_CUR_MAX); | ||
264 | #if ENABLE_LOCALE_SUPPORT | 277 | #if ENABLE_LOCALE_SUPPORT |
265 | { | 278 | { |
266 | mbstate_t mbst = { 0 }; | 279 | mbstate_t mbst = { 0 }; |
267 | ssize_t rc = mbsrtowcs(&wc, &src, 1, &mbst); | 280 | ssize_t rc = mbsrtowcs(&wc, &src, 1, &mbst); |
268 | if (rc <= 0) /* error, or end-of-string */ | 281 | /* If invalid sequence is seen: -1 is returned, |
282 | * src points to the invalid sequence, errno = EILSEQ. | ||
283 | * Else number of wchars (excluding terminating L'\0') | ||
284 | * written to dest is returned. | ||
285 | * If len (here: 1) non-L'\0' wchars stored at dest, | ||
286 | * src points to the next char to be converted. | ||
287 | * If string is completely converted: src = NULL. | ||
288 | */ | ||
289 | if (rc == 0) /* end-of-string */ | ||
269 | break; | 290 | break; |
291 | if (rc < 0) { /* error */ | ||
292 | src++; | ||
293 | goto subst; | ||
294 | } | ||
295 | if (!iswprint(wc)) | ||
296 | goto subst; | ||
270 | } | 297 | } |
271 | #else | 298 | #else |
272 | src = mbstowc_internal(&wc, src); | 299 | { |
273 | if (!src || wc == 0) /* error, or end-of-string */ | 300 | const char *src1 = mbstowc_internal(&wc, src); |
274 | break; | 301 | /* src = NULL: invalid sequence is seen, |
302 | * else: wc is set, src is advanced to next mb char | ||
303 | */ | ||
304 | if (src1) {/* no error */ | ||
305 | if (wc == 0) /* end-of-string */ | ||
306 | break; | ||
307 | src = src1; | ||
308 | } else { /* error */ | ||
309 | src++; | ||
310 | goto subst; | ||
311 | } | ||
312 | } | ||
275 | #endif | 313 | #endif |
314 | if (CONFIG_LAST_SUPPORTED_WCHAR && wc > CONFIG_LAST_SUPPORTED_WCHAR) | ||
315 | goto subst; | ||
276 | w = wcwidth(wc); | 316 | w = wcwidth(wc); |
277 | if (w < 0) /* non-printable wchar */ | 317 | if ((ENABLE_UNICODE_COMBINING_WCHARS && w < 0) /* non-printable wchar */ |
278 | break; | 318 | || (!ENABLE_UNICODE_COMBINING_WCHARS && wc <= 0) |
319 | || (!ENABLE_UNICODE_WIDE_WCHARS && wc > 1) | ||
320 | ) { | ||
321 | subst: | ||
322 | wc = CONFIG_SUBST_WCHAR; | ||
323 | w = 1; | ||
324 | } | ||
279 | width -= w; | 325 | width -= w; |
280 | if ((int)width < 0) { /* string is longer than width */ | 326 | /* Note: if width == 0, we still may add more chars, |
327 | * they may be zero-width or combining ones */ | ||
328 | if ((int)width < 0) { | ||
329 | /* can't add this wc, string would become longer than width */ | ||
281 | width += w; | 330 | width += w; |
282 | while (width) { | ||
283 | dst[dst_len++] = ' '; | ||
284 | width--; | ||
285 | } | ||
286 | break; | 331 | break; |
287 | } | 332 | } |
333 | |||
334 | dst = xrealloc(dst, dst_len + MB_CUR_MAX); | ||
288 | #if ENABLE_LOCALE_SUPPORT | 335 | #if ENABLE_LOCALE_SUPPORT |
289 | { | 336 | { |
290 | mbstate_t mbst = { 0 }; | 337 | mbstate_t mbst = { 0 }; |
@@ -294,7 +341,14 @@ char* FAST_FUNC unicode_cut_nchars(unsigned width, const char *src) | |||
294 | dst_len += wcrtomb_internal(&dst[dst_len], wc); | 341 | dst_len += wcrtomb_internal(&dst[dst_len], wc); |
295 | #endif | 342 | #endif |
296 | } | 343 | } |
344 | |||
345 | /* Pad to remaining width */ | ||
346 | dst = xrealloc(dst, dst_len + width + 1); | ||
347 | while ((int)--width >= 0) { | ||
348 | dst[dst_len++] = ' '; | ||
349 | } | ||
297 | dst[dst_len] = '\0'; | 350 | dst[dst_len] = '\0'; |
351 | |||
298 | return dst; | 352 | return dst; |
299 | } | 353 | } |
300 | 354 | ||
diff --git a/libbb/unicode_wcwidth.c b/libbb/unicode_wcwidth.c index 8d301f7c3..ab62b18f6 100644 --- a/libbb/unicode_wcwidth.c +++ b/libbb/unicode_wcwidth.c | |||
@@ -59,6 +59,13 @@ | |||
59 | * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c | 59 | * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c |
60 | */ | 60 | */ |
61 | 61 | ||
62 | #if CONFIG_LAST_SUPPORTED_WCHAR == 0 | ||
63 | # define LAST_SUPPORTED_WCHAR ((1 << 31) - 1) | ||
64 | #else | ||
65 | # define LAST_SUPPORTED_WCHAR CONFIG_LAST_SUPPORTED_WCHAR | ||
66 | #endif | ||
67 | |||
68 | #if LAST_SUPPORTED_WCHAR >= 0x0300 | ||
62 | struct interval { | 69 | struct interval { |
63 | uint16_t first; | 70 | uint16_t first; |
64 | uint16_t last; | 71 | uint16_t last; |
@@ -111,6 +118,7 @@ static int in_uint16_table(unsigned ucs, const uint16_t *table, unsigned max) | |||
111 | } | 118 | } |
112 | return 0; | 119 | return 0; |
113 | } | 120 | } |
121 | #endif | ||
114 | 122 | ||
115 | 123 | ||
116 | /* The following two functions define the column width of an ISO 10646 | 124 | /* The following two functions define the column width of an ISO 10646 |
@@ -146,6 +154,7 @@ static int in_uint16_table(unsigned ucs, const uint16_t *table, unsigned max) | |||
146 | */ | 154 | */ |
147 | static int wcwidth(unsigned ucs) | 155 | static int wcwidth(unsigned ucs) |
148 | { | 156 | { |
157 | #if LAST_SUPPORTED_WCHAR >= 0x0300 | ||
149 | /* sorted list of non-overlapping intervals of non-spacing characters */ | 158 | /* sorted list of non-overlapping intervals of non-spacing characters */ |
150 | /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */ | 159 | /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */ |
151 | static const struct interval combining[] = { | 160 | static const struct interval combining[] = { |
@@ -420,12 +429,15 @@ static int wcwidth(unsigned ucs) | |||
420 | #undef BIG_ | 429 | #undef BIG_ |
421 | #undef PAIR | 430 | #undef PAIR |
422 | }; | 431 | }; |
432 | # if LAST_SUPPORTED_WCHAR >= 0x1100 | ||
423 | static const struct interval combining0x10000[] = { | 433 | static const struct interval combining0x10000[] = { |
424 | { 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F }, | 434 | { 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F }, |
425 | { 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 }, | 435 | { 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 }, |
426 | { 0xD173, 0xD182 }, { 0xD185, 0xD18B }, { 0xD1AA, 0xD1AD }, | 436 | { 0xD173, 0xD182 }, { 0xD185, 0xD18B }, { 0xD1AA, 0xD1AD }, |
427 | { 0xD242, 0xD244 } | 437 | { 0xD242, 0xD244 } |
428 | }; | 438 | }; |
439 | # endif | ||
440 | #endif | ||
429 | 441 | ||
430 | if (ucs == 0) | 442 | if (ucs == 0) |
431 | return 0; | 443 | return 0; |
@@ -435,6 +447,9 @@ static int wcwidth(unsigned ucs) | |||
435 | if (ucs < 0x0300) /* optimization */ | 447 | if (ucs < 0x0300) /* optimization */ |
436 | return 1; | 448 | return 1; |
437 | 449 | ||
450 | #if LAST_SUPPORTED_WCHAR < 0x0300 | ||
451 | return -1; | ||
452 | #else | ||
438 | /* binary search in table of non-spacing characters */ | 453 | /* binary search in table of non-spacing characters */ |
439 | if (in_interval_table(ucs, combining, ARRAY_SIZE(combining) - 1)) | 454 | if (in_interval_table(ucs, combining, ARRAY_SIZE(combining) - 1)) |
440 | return 0; | 455 | return 0; |
@@ -444,6 +459,9 @@ static int wcwidth(unsigned ucs) | |||
444 | if (ucs < 0x1100) /* optimization */ | 459 | if (ucs < 0x1100) /* optimization */ |
445 | return 1; | 460 | return 1; |
446 | 461 | ||
462 | # if LAST_SUPPORTED_WCHAR < 0x1100 | ||
463 | return -1; | ||
464 | # else | ||
447 | /* binary search in table of non-spacing characters, cont. */ | 465 | /* binary search in table of non-spacing characters, cont. */ |
448 | if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1)) | 466 | if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1)) |
449 | return 0; | 467 | return 0; |
@@ -458,8 +476,8 @@ static int wcwidth(unsigned ucs) | |||
458 | 476 | ||
459 | return 1 + | 477 | return 1 + |
460 | ( (/*ucs >= 0x1100 &&*/ ucs <= 0x115f) /* Hangul Jamo init. consonants */ | 478 | ( (/*ucs >= 0x1100 &&*/ ucs <= 0x115f) /* Hangul Jamo init. consonants */ |
461 | || ucs == 0x2329 | 479 | || ucs == 0x2329 /* left-pointing angle bracket; also CJK punct. char */ |
462 | || ucs == 0x232a | 480 | || ucs == 0x232a /* right-pointing angle bracket; also CJK punct. char */ |
463 | || (ucs >= 0x2e80 && ucs <= 0xa4cf && ucs != 0x303f) /* CJK ... Yi */ | 481 | || (ucs >= 0x2e80 && ucs <= 0xa4cf && ucs != 0x303f) /* CJK ... Yi */ |
464 | || (ucs >= 0xac00 && ucs <= 0xd7a3) /* Hangul Syllables */ | 482 | || (ucs >= 0xac00 && ucs <= 0xd7a3) /* Hangul Syllables */ |
465 | || (ucs >= 0xf900 && ucs <= 0xfaff) /* CJK Compatibility Ideographs */ | 483 | || (ucs >= 0xf900 && ucs <= 0xfaff) /* CJK Compatibility Ideographs */ |
@@ -470,4 +488,6 @@ static int wcwidth(unsigned ucs) | |||
470 | || (ucs >= 0x20000 && ucs <= 0x2fffd) | 488 | || (ucs >= 0x20000 && ucs <= 0x2fffd) |
471 | || (ucs >= 0x30000 && ucs <= 0x3fffd) | 489 | || (ucs >= 0x30000 && ucs <= 0x3fffd) |
472 | ); | 490 | ); |
491 | # endif | ||
492 | #endif | ||
473 | } | 493 | } |