aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDenys Vlasenko <vda.linux@googlemail.com>2010-01-29 09:11:47 +0100
committerDenys Vlasenko <vda.linux@googlemail.com>2010-01-29 09:11:47 +0100
commit2edba21f4c59d071f2241c2f47021c7034ec7cb8 (patch)
tree6cf3de29bfbdafa26fddbc1cd3dc467a2d8263f6
parent083e172641b64c564b7ec5478197dccbde43b421 (diff)
downloadbusybox-w32-2edba21f4c59d071f2241c2f47021c7034ec7cb8.tar.gz
busybox-w32-2edba21f4c59d071f2241c2f47021c7034ec7cb8.tar.bz2
busybox-w32-2edba21f4c59d071f2241c2f47021c7034ec7cb8.zip
more fine-grained Unicode support
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r--Config.in51
-rw-r--r--libbb/unicode.c86
-rw-r--r--libbb/unicode_wcwidth.c24
3 files changed, 143 insertions, 18 deletions
diff --git a/Config.in b/Config.in
index 8e751530c..68444839d 100644
--- a/Config.in
+++ b/Config.in
@@ -141,6 +141,57 @@ config FEATURE_CHECK_UNICODE_IN_ENV
141 141
142 Otherwise, Unicode support will be always enabled and active. 142 Otherwise, Unicode support will be always enabled and active.
143 143
144config SUBST_WCHAR
145 int "Character code to substitute unprintable characters with"
146 range 1 4294967295
147 depends on FEATURE_ASSUME_UNICODE
148 default 63
149 help
150 Typical values are 63 for '?' (works with any output device),
151 26 for ASCII substitute control code,
152 65533 (0xfffd) for Unicode replacement character.
153
154config LAST_SUPPORTED_WCHAR
155 int "Range of supported Unicode characters"
156 range 0 4294967295
157 depends on FEATURE_ASSUME_UNICODE
158 default 767
159 help
160 Any character with Unicode value bigger than this is assumed
161 to be non-printable on output device. Many applets replace
162 such chars with substitution character.
163
164 The idea is that many valid printable Unicode chars are
165 nevertheless not displayed correctly. Think about
166 combining characters, double-wide hieroglyphs and such.
167 Many terminals, xterms and such will fail to handle them
168 correctly.
169
170 Typical values are:
171 126 - ASCII only
172 767 (0x2ff) - there are no combining chars in [0..767] range
173 (the range includes Latin 1, Latin Ext. A and B),
174 code is ~700 bytes smaller for this case.
175 4351 (0x10ff) - there are no double-wide chars in [0..4351] range,
176 code is ~300 bytes smaller for this case.
177 0 - off, any valid printable Unicode character will be printed.
178
179config UNICODE_COMBINING_WCHARS
180 bool "Allow zero-width Unicode characters on output"
181 default n
182 depends on FEATURE_ASSUME_UNICODE
183 help
184 With this option off, any Unicode char with width of 0
185 is substituted on output.
186
187config UNICODE_WIDE_WCHARS
188 bool "Allow wide Unicode characters on output"
189 default n
190 depends on FEATURE_ASSUME_UNICODE
191 help
192 With this option off, any Unicode char with width > 1
193 is substituted on output.
194
144config LONG_OPTS 195config LONG_OPTS
145 bool "Support for --long-options" 196 bool "Support for --long-options"
146 default y 197 default y
diff --git a/libbb/unicode.c b/libbb/unicode.c
index 39b173e9c..878af84bc 100644
--- a/libbb/unicode.c
+++ b/libbb/unicode.c
@@ -216,8 +216,6 @@ size_t FAST_FUNC mbstowcs(wchar_t *dest, const char *src, size_t n)
216 return org_n - n; 216 return org_n - n;
217} 217}
218 218
219#include "unicode_wcwidth.c"
220
221int FAST_FUNC iswspace(wint_t wc) 219int FAST_FUNC iswspace(wint_t wc)
222{ 220{
223 return (unsigned)wc <= 0x7f && isspace(wc); 221 return (unsigned)wc <= 0x7f && isspace(wc);
@@ -233,6 +231,8 @@ int FAST_FUNC iswpunct(wint_t wc)
233 return (unsigned)wc <= 0x7f && ispunct(wc); 231 return (unsigned)wc <= 0x7f && ispunct(wc);
234} 232}
235 233
234#include "unicode_wcwidth.c"
235
236#endif /* Homegrown Unicode support */ 236#endif /* Homegrown Unicode support */
237 237
238 238
@@ -251,8 +251,22 @@ char* FAST_FUNC unicode_cut_nchars(unsigned width, const char *src)
251 char *dst; 251 char *dst;
252 unsigned dst_len; 252 unsigned dst_len;
253 253
254 if (unicode_status != UNICODE_ON) 254 if (unicode_status != UNICODE_ON) {
255 return xasprintf("%-*.*s", width, width, src); 255 char *d = dst = xmalloc(width + 1);
256 while ((int)--width >= 0) {
257 unsigned char c = *src;
258 if (c == '\0') {
259 do
260 *d++ = ' ';
261 while ((int)--width >= 0);
262 break;
263 }
264 *d++ = (c >= ' ' && c < 0x7f) ? c : '?';
265 src++;
266 }
267 *d = '\0';
268 return dst;
269 }
256 270
257 dst = NULL; 271 dst = NULL;
258 dst_len = 0; 272 dst_len = 0;
@@ -260,31 +274,64 @@ char* FAST_FUNC unicode_cut_nchars(unsigned width, const char *src)
260 int w; 274 int w;
261 wchar_t wc; 275 wchar_t wc;
262 276
263 dst = xrealloc(dst, dst_len + 2 * MB_CUR_MAX);
264#if ENABLE_LOCALE_SUPPORT 277#if ENABLE_LOCALE_SUPPORT
265 { 278 {
266 mbstate_t mbst = { 0 }; 279 mbstate_t mbst = { 0 };
267 ssize_t rc = mbsrtowcs(&wc, &src, 1, &mbst); 280 ssize_t rc = mbsrtowcs(&wc, &src, 1, &mbst);
268 if (rc <= 0) /* error, or end-of-string */ 281 /* If invalid sequence is seen: -1 is returned,
282 * src points to the invalid sequence, errno = EILSEQ.
283 * Else number of wchars (excluding terminating L'\0')
284 * written to dest is returned.
285 * If len (here: 1) non-L'\0' wchars stored at dest,
286 * src points to the next char to be converted.
287 * If string is completely converted: src = NULL.
288 */
289 if (rc == 0) /* end-of-string */
269 break; 290 break;
291 if (rc < 0) { /* error */
292 src++;
293 goto subst;
294 }
295 if (!iswprint(wc))
296 goto subst;
270 } 297 }
271#else 298#else
272 src = mbstowc_internal(&wc, src); 299 {
273 if (!src || wc == 0) /* error, or end-of-string */ 300 const char *src1 = mbstowc_internal(&wc, src);
274 break; 301 /* src = NULL: invalid sequence is seen,
302 * else: wc is set, src is advanced to next mb char
303 */
304 if (src1) {/* no error */
305 if (wc == 0) /* end-of-string */
306 break;
307 src = src1;
308 } else { /* error */
309 src++;
310 goto subst;
311 }
312 }
275#endif 313#endif
314 if (CONFIG_LAST_SUPPORTED_WCHAR && wc > CONFIG_LAST_SUPPORTED_WCHAR)
315 goto subst;
276 w = wcwidth(wc); 316 w = wcwidth(wc);
277 if (w < 0) /* non-printable wchar */ 317 if ((ENABLE_UNICODE_COMBINING_WCHARS && w < 0) /* non-printable wchar */
278 break; 318 || (!ENABLE_UNICODE_COMBINING_WCHARS && wc <= 0)
319 || (!ENABLE_UNICODE_WIDE_WCHARS && wc > 1)
320 ) {
321 subst:
322 wc = CONFIG_SUBST_WCHAR;
323 w = 1;
324 }
279 width -= w; 325 width -= w;
280 if ((int)width < 0) { /* string is longer than width */ 326 /* Note: if width == 0, we still may add more chars,
327 * they may be zero-width or combining ones */
328 if ((int)width < 0) {
329 /* can't add this wc, string would become longer than width */
281 width += w; 330 width += w;
282 while (width) {
283 dst[dst_len++] = ' ';
284 width--;
285 }
286 break; 331 break;
287 } 332 }
333
334 dst = xrealloc(dst, dst_len + MB_CUR_MAX);
288#if ENABLE_LOCALE_SUPPORT 335#if ENABLE_LOCALE_SUPPORT
289 { 336 {
290 mbstate_t mbst = { 0 }; 337 mbstate_t mbst = { 0 };
@@ -294,7 +341,14 @@ char* FAST_FUNC unicode_cut_nchars(unsigned width, const char *src)
294 dst_len += wcrtomb_internal(&dst[dst_len], wc); 341 dst_len += wcrtomb_internal(&dst[dst_len], wc);
295#endif 342#endif
296 } 343 }
344
345 /* Pad to remaining width */
346 dst = xrealloc(dst, dst_len + width + 1);
347 while ((int)--width >= 0) {
348 dst[dst_len++] = ' ';
349 }
297 dst[dst_len] = '\0'; 350 dst[dst_len] = '\0';
351
298 return dst; 352 return dst;
299} 353}
300 354
diff --git a/libbb/unicode_wcwidth.c b/libbb/unicode_wcwidth.c
index 8d301f7c3..ab62b18f6 100644
--- a/libbb/unicode_wcwidth.c
+++ b/libbb/unicode_wcwidth.c
@@ -59,6 +59,13 @@
59 * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c 59 * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
60 */ 60 */
61 61
62#if CONFIG_LAST_SUPPORTED_WCHAR == 0
63# define LAST_SUPPORTED_WCHAR ((1 << 31) - 1)
64#else
65# define LAST_SUPPORTED_WCHAR CONFIG_LAST_SUPPORTED_WCHAR
66#endif
67
68#if LAST_SUPPORTED_WCHAR >= 0x0300
62struct interval { 69struct interval {
63 uint16_t first; 70 uint16_t first;
64 uint16_t last; 71 uint16_t last;
@@ -111,6 +118,7 @@ static int in_uint16_table(unsigned ucs, const uint16_t *table, unsigned max)
111 } 118 }
112 return 0; 119 return 0;
113} 120}
121#endif
114 122
115 123
116/* The following two functions define the column width of an ISO 10646 124/* The following two functions define the column width of an ISO 10646
@@ -146,6 +154,7 @@ static int in_uint16_table(unsigned ucs, const uint16_t *table, unsigned max)
146 */ 154 */
147static int wcwidth(unsigned ucs) 155static int wcwidth(unsigned ucs)
148{ 156{
157#if LAST_SUPPORTED_WCHAR >= 0x0300
149 /* sorted list of non-overlapping intervals of non-spacing characters */ 158 /* sorted list of non-overlapping intervals of non-spacing characters */
150 /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */ 159 /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
151 static const struct interval combining[] = { 160 static const struct interval combining[] = {
@@ -420,12 +429,15 @@ static int wcwidth(unsigned ucs)
420#undef BIG_ 429#undef BIG_
421#undef PAIR 430#undef PAIR
422 }; 431 };
432# if LAST_SUPPORTED_WCHAR >= 0x1100
423 static const struct interval combining0x10000[] = { 433 static const struct interval combining0x10000[] = {
424 { 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F }, 434 { 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F },
425 { 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 }, 435 { 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 },
426 { 0xD173, 0xD182 }, { 0xD185, 0xD18B }, { 0xD1AA, 0xD1AD }, 436 { 0xD173, 0xD182 }, { 0xD185, 0xD18B }, { 0xD1AA, 0xD1AD },
427 { 0xD242, 0xD244 } 437 { 0xD242, 0xD244 }
428 }; 438 };
439# endif
440#endif
429 441
430 if (ucs == 0) 442 if (ucs == 0)
431 return 0; 443 return 0;
@@ -435,6 +447,9 @@ static int wcwidth(unsigned ucs)
435 if (ucs < 0x0300) /* optimization */ 447 if (ucs < 0x0300) /* optimization */
436 return 1; 448 return 1;
437 449
450#if LAST_SUPPORTED_WCHAR < 0x0300
451 return -1;
452#else
438 /* binary search in table of non-spacing characters */ 453 /* binary search in table of non-spacing characters */
439 if (in_interval_table(ucs, combining, ARRAY_SIZE(combining) - 1)) 454 if (in_interval_table(ucs, combining, ARRAY_SIZE(combining) - 1))
440 return 0; 455 return 0;
@@ -444,6 +459,9 @@ static int wcwidth(unsigned ucs)
444 if (ucs < 0x1100) /* optimization */ 459 if (ucs < 0x1100) /* optimization */
445 return 1; 460 return 1;
446 461
462# if LAST_SUPPORTED_WCHAR < 0x1100
463 return -1;
464# else
447 /* binary search in table of non-spacing characters, cont. */ 465 /* binary search in table of non-spacing characters, cont. */
448 if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1)) 466 if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1))
449 return 0; 467 return 0;
@@ -458,8 +476,8 @@ static int wcwidth(unsigned ucs)
458 476
459 return 1 + 477 return 1 +
460 ( (/*ucs >= 0x1100 &&*/ ucs <= 0x115f) /* Hangul Jamo init. consonants */ 478 ( (/*ucs >= 0x1100 &&*/ ucs <= 0x115f) /* Hangul Jamo init. consonants */
461 || ucs == 0x2329 479 || ucs == 0x2329 /* left-pointing angle bracket; also CJK punct. char */
462 || ucs == 0x232a 480 || ucs == 0x232a /* right-pointing angle bracket; also CJK punct. char */
463 || (ucs >= 0x2e80 && ucs <= 0xa4cf && ucs != 0x303f) /* CJK ... Yi */ 481 || (ucs >= 0x2e80 && ucs <= 0xa4cf && ucs != 0x303f) /* CJK ... Yi */
464 || (ucs >= 0xac00 && ucs <= 0xd7a3) /* Hangul Syllables */ 482 || (ucs >= 0xac00 && ucs <= 0xd7a3) /* Hangul Syllables */
465 || (ucs >= 0xf900 && ucs <= 0xfaff) /* CJK Compatibility Ideographs */ 483 || (ucs >= 0xf900 && ucs <= 0xfaff) /* CJK Compatibility Ideographs */
@@ -470,4 +488,6 @@ static int wcwidth(unsigned ucs)
470 || (ucs >= 0x20000 && ucs <= 0x2fffd) 488 || (ucs >= 0x20000 && ucs <= 0x2fffd)
471 || (ucs >= 0x30000 && ucs <= 0x3fffd) 489 || (ucs >= 0x30000 && ucs <= 0x3fffd)
472 ); 490 );
491# endif
492#endif
473} 493}