aboutsummaryrefslogtreecommitdiff
path: root/libbb
diff options
context:
space:
mode:
author: Denys Vlasenko <vda.linux@googlemail.com> 2010-01-29 09:11:47 +0100
committer: Denys Vlasenko <vda.linux@googlemail.com> 2010-01-29 09:11:47 +0100
commit: 2edba21f4c59d071f2241c2f47021c7034ec7cb8 (patch)
tree: 6cf3de29bfbdafa26fddbc1cd3dc467a2d8263f6 /libbb
parent: 083e172641b64c564b7ec5478197dccbde43b421 (diff)
download: busybox-w32-2edba21f4c59d071f2241c2f47021c7034ec7cb8.tar.gz
busybox-w32-2edba21f4c59d071f2241c2f47021c7034ec7cb8.tar.bz2
busybox-w32-2edba21f4c59d071f2241c2f47021c7034ec7cb8.zip
more fine-grained Unicode support
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Diffstat (limited to 'libbb')
-rw-r--r-- libbb/unicode.c 86
-rw-r--r-- libbb/unicode_wcwidth.c 24
2 files changed, 92 insertions, 18 deletions
diff --git a/libbb/unicode.c b/libbb/unicode.c
index 39b173e9c..878af84bc 100644
--- a/libbb/unicode.c
+++ b/libbb/unicode.c
@@ -216,8 +216,6 @@ size_t FAST_FUNC mbstowcs(wchar_t *dest, const char *src, size_t n)
216 return org_n - n; 216 return org_n - n;
217} 217}
218 218
219#include "unicode_wcwidth.c"
220
221int FAST_FUNC iswspace(wint_t wc) 219int FAST_FUNC iswspace(wint_t wc)
222{ 220{
223 return (unsigned)wc <= 0x7f && isspace(wc); 221 return (unsigned)wc <= 0x7f && isspace(wc);
@@ -233,6 +231,8 @@ int FAST_FUNC iswpunct(wint_t wc)
233 return (unsigned)wc <= 0x7f && ispunct(wc); 231 return (unsigned)wc <= 0x7f && ispunct(wc);
234} 232}
235 233
234#include "unicode_wcwidth.c"
235
236#endif /* Homegrown Unicode support */ 236#endif /* Homegrown Unicode support */
237 237
238 238
@@ -251,8 +251,22 @@ char* FAST_FUNC unicode_cut_nchars(unsigned width, const char *src)
251 char *dst; 251 char *dst;
252 unsigned dst_len; 252 unsigned dst_len;
253 253
254 if (unicode_status != UNICODE_ON) 254 if (unicode_status != UNICODE_ON) {
255 return xasprintf("%-*.*s", width, width, src); 255 char *d = dst = xmalloc(width + 1);
256 while ((int)--width >= 0) {
257 unsigned char c = *src;
258 if (c == '\0') {
259 do
260 *d++ = ' ';
261 while ((int)--width >= 0);
262 break;
263 }
264 *d++ = (c >= ' ' && c < 0x7f) ? c : '?';
265 src++;
266 }
267 *d = '\0';
268 return dst;
269 }
256 270
257 dst = NULL; 271 dst = NULL;
258 dst_len = 0; 272 dst_len = 0;
@@ -260,31 +274,64 @@ char* FAST_FUNC unicode_cut_nchars(unsigned width, const char *src)
260 int w; 274 int w;
261 wchar_t wc; 275 wchar_t wc;
262 276
263 dst = xrealloc(dst, dst_len + 2 * MB_CUR_MAX);
264#if ENABLE_LOCALE_SUPPORT 277#if ENABLE_LOCALE_SUPPORT
265 { 278 {
266 mbstate_t mbst = { 0 }; 279 mbstate_t mbst = { 0 };
267 ssize_t rc = mbsrtowcs(&wc, &src, 1, &mbst); 280 ssize_t rc = mbsrtowcs(&wc, &src, 1, &mbst);
268 if (rc <= 0) /* error, or end-of-string */ 281 /* If invalid sequence is seen: -1 is returned,
282 * src points to the invalid sequence, errno = EILSEQ.
283 * Else number of wchars (excluding terminating L'\0')
284 * written to dest is returned.
285 * If len (here: 1) non-L'\0' wchars stored at dest,
286 * src points to the next char to be converted.
287 * If string is completely converted: src = NULL.
288 */
289 if (rc == 0) /* end-of-string */
269 break; 290 break;
291 if (rc < 0) { /* error */
292 src++;
293 goto subst;
294 }
295 if (!iswprint(wc))
296 goto subst;
270 } 297 }
271#else 298#else
272 src = mbstowc_internal(&wc, src); 299 {
273 if (!src || wc == 0) /* error, or end-of-string */ 300 const char *src1 = mbstowc_internal(&wc, src);
274 break; 301 /* src = NULL: invalid sequence is seen,
302 * else: wc is set, src is advanced to next mb char
303 */
304 if (src1) {/* no error */
305 if (wc == 0) /* end-of-string */
306 break;
307 src = src1;
308 } else { /* error */
309 src++;
310 goto subst;
311 }
312 }
275#endif 313#endif
314 if (CONFIG_LAST_SUPPORTED_WCHAR && wc > CONFIG_LAST_SUPPORTED_WCHAR)
315 goto subst;
276 w = wcwidth(wc); 316 w = wcwidth(wc);
277 if (w < 0) /* non-printable wchar */ 317 if ((ENABLE_UNICODE_COMBINING_WCHARS && w < 0) /* non-printable wchar */
278 break; 318 || (!ENABLE_UNICODE_COMBINING_WCHARS && wc <= 0)
319 || (!ENABLE_UNICODE_WIDE_WCHARS && wc > 1)
320 ) {
321 subst:
322 wc = CONFIG_SUBST_WCHAR;
323 w = 1;
324 }
279 width -= w; 325 width -= w;
280 if ((int)width < 0) { /* string is longer than width */ 326 /* Note: if width == 0, we still may add more chars,
327 * they may be zero-width or combining ones */
328 if ((int)width < 0) {
329 /* can't add this wc, string would become longer than width */
281 width += w; 330 width += w;
282 while (width) {
283 dst[dst_len++] = ' ';
284 width--;
285 }
286 break; 331 break;
287 } 332 }
333
334 dst = xrealloc(dst, dst_len + MB_CUR_MAX);
288#if ENABLE_LOCALE_SUPPORT 335#if ENABLE_LOCALE_SUPPORT
289 { 336 {
290 mbstate_t mbst = { 0 }; 337 mbstate_t mbst = { 0 };
@@ -294,7 +341,14 @@ char* FAST_FUNC unicode_cut_nchars(unsigned width, const char *src)
294 dst_len += wcrtomb_internal(&dst[dst_len], wc); 341 dst_len += wcrtomb_internal(&dst[dst_len], wc);
295#endif 342#endif
296 } 343 }
344
345 /* Pad to remaining width */
346 dst = xrealloc(dst, dst_len + width + 1);
347 while ((int)--width >= 0) {
348 dst[dst_len++] = ' ';
349 }
297 dst[dst_len] = '\0'; 350 dst[dst_len] = '\0';
351
298 return dst; 352 return dst;
299} 353}
300 354
diff --git a/libbb/unicode_wcwidth.c b/libbb/unicode_wcwidth.c
index 8d301f7c3..ab62b18f6 100644
--- a/libbb/unicode_wcwidth.c
+++ b/libbb/unicode_wcwidth.c
@@ -59,6 +59,13 @@
59 * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c 59 * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
60 */ 60 */
61 61
62#if CONFIG_LAST_SUPPORTED_WCHAR == 0
63# define LAST_SUPPORTED_WCHAR ((1 << 31) - 1)
64#else
65# define LAST_SUPPORTED_WCHAR CONFIG_LAST_SUPPORTED_WCHAR
66#endif
67
68#if LAST_SUPPORTED_WCHAR >= 0x0300
62struct interval { 69struct interval {
63 uint16_t first; 70 uint16_t first;
64 uint16_t last; 71 uint16_t last;
@@ -111,6 +118,7 @@ static int in_uint16_table(unsigned ucs, const uint16_t *table, unsigned max)
111 } 118 }
112 return 0; 119 return 0;
113} 120}
121#endif
114 122
115 123
116/* The following two functions define the column width of an ISO 10646 124/* The following two functions define the column width of an ISO 10646
@@ -146,6 +154,7 @@ static int in_uint16_table(unsigned ucs, const uint16_t *table, unsigned max)
146 */ 154 */
147static int wcwidth(unsigned ucs) 155static int wcwidth(unsigned ucs)
148{ 156{
157#if LAST_SUPPORTED_WCHAR >= 0x0300
149 /* sorted list of non-overlapping intervals of non-spacing characters */ 158 /* sorted list of non-overlapping intervals of non-spacing characters */
150 /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */ 159 /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
151 static const struct interval combining[] = { 160 static const struct interval combining[] = {
@@ -420,12 +429,15 @@ static int wcwidth(unsigned ucs)
420#undef BIG_ 429#undef BIG_
421#undef PAIR 430#undef PAIR
422 }; 431 };
432# if LAST_SUPPORTED_WCHAR >= 0x1100
423 static const struct interval combining0x10000[] = { 433 static const struct interval combining0x10000[] = {
424 { 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F }, 434 { 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F },
425 { 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 }, 435 { 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 },
426 { 0xD173, 0xD182 }, { 0xD185, 0xD18B }, { 0xD1AA, 0xD1AD }, 436 { 0xD173, 0xD182 }, { 0xD185, 0xD18B }, { 0xD1AA, 0xD1AD },
427 { 0xD242, 0xD244 } 437 { 0xD242, 0xD244 }
428 }; 438 };
439# endif
440#endif
429 441
430 if (ucs == 0) 442 if (ucs == 0)
431 return 0; 443 return 0;
@@ -435,6 +447,9 @@ static int wcwidth(unsigned ucs)
435 if (ucs < 0x0300) /* optimization */ 447 if (ucs < 0x0300) /* optimization */
436 return 1; 448 return 1;
437 449
450#if LAST_SUPPORTED_WCHAR < 0x0300
451 return -1;
452#else
438 /* binary search in table of non-spacing characters */ 453 /* binary search in table of non-spacing characters */
439 if (in_interval_table(ucs, combining, ARRAY_SIZE(combining) - 1)) 454 if (in_interval_table(ucs, combining, ARRAY_SIZE(combining) - 1))
440 return 0; 455 return 0;
@@ -444,6 +459,9 @@ static int wcwidth(unsigned ucs)
444 if (ucs < 0x1100) /* optimization */ 459 if (ucs < 0x1100) /* optimization */
445 return 1; 460 return 1;
446 461
462# if LAST_SUPPORTED_WCHAR < 0x1100
463 return -1;
464# else
447 /* binary search in table of non-spacing characters, cont. */ 465 /* binary search in table of non-spacing characters, cont. */
448 if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1)) 466 if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1))
449 return 0; 467 return 0;
@@ -458,8 +476,8 @@ static int wcwidth(unsigned ucs)
458 476
459 return 1 + 477 return 1 +
460 ( (/*ucs >= 0x1100 &&*/ ucs <= 0x115f) /* Hangul Jamo init. consonants */ 478 ( (/*ucs >= 0x1100 &&*/ ucs <= 0x115f) /* Hangul Jamo init. consonants */
461 || ucs == 0x2329 479 || ucs == 0x2329 /* left-pointing angle bracket; also CJK punct. char */
462 || ucs == 0x232a 480 || ucs == 0x232a /* right-pointing angle bracket; also CJK punct. char */
463 || (ucs >= 0x2e80 && ucs <= 0xa4cf && ucs != 0x303f) /* CJK ... Yi */ 481 || (ucs >= 0x2e80 && ucs <= 0xa4cf && ucs != 0x303f) /* CJK ... Yi */
464 || (ucs >= 0xac00 && ucs <= 0xd7a3) /* Hangul Syllables */ 482 || (ucs >= 0xac00 && ucs <= 0xd7a3) /* Hangul Syllables */
465 || (ucs >= 0xf900 && ucs <= 0xfaff) /* CJK Compatibility Ideographs */ 483 || (ucs >= 0xf900 && ucs <= 0xfaff) /* CJK Compatibility Ideographs */
@@ -470,4 +488,6 @@ static int wcwidth(unsigned ucs)
470 || (ucs >= 0x20000 && ucs <= 0x2fffd) 488 || (ucs >= 0x20000 && ucs <= 0x2fffd)
471 || (ucs >= 0x30000 && ucs <= 0x3fffd) 489 || (ucs >= 0x30000 && ucs <= 0x3fffd)
472 ); 490 );
491# endif
492#endif
473} 493}