aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDenys Vlasenko <vda.linux@googlemail.com>2010-01-31 16:34:37 +0100
committerDenys Vlasenko <vda.linux@googlemail.com>2010-01-31 16:34:37 +0100
commitb1edf20f1848cd741e8a8395afb4a4655a210906 (patch)
treeff6f99354d507ae1bb3bcf29ca99e1626cad0733
parent40e4e88a28398c49d326b0fdf0d7f100f08b8f8d (diff)
downloadbusybox-w32-b1edf20f1848cd741e8a8395afb4a4655a210906.tar.gz
busybox-w32-b1edf20f1848cd741e8a8395afb4a4655a210906.tar.bz2
busybox-w32-b1edf20f1848cd741e8a8395afb4a4655a210906.zip
unicode: exclude FDD0..FDEF range too
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r--libbb/unicode_wcwidth.c96
-rwxr-xr-xtestsuite/ls.tests8
2 files changed, 52 insertions, 52 deletions
diff --git a/libbb/unicode_wcwidth.c b/libbb/unicode_wcwidth.c
index 410c741ac..c7cc524a6 100644
--- a/libbb/unicode_wcwidth.c
+++ b/libbb/unicode_wcwidth.c
@@ -90,13 +90,13 @@
90 * until Unicode committee assigns something there. 90 * until Unicode committee assigns something there.
91 */ 91 */
92 92
93#if CONFIG_LAST_SUPPORTED_WCHAR < 126 || CONFIG_LAST_SUPPORTED_WCHAR > 0x30000 93#if CONFIG_LAST_SUPPORTED_WCHAR < 126 || CONFIG_LAST_SUPPORTED_WCHAR >= 0x30000
94# define LAST_SUPPORTED_WCHAR 0x30000 94# define LAST_SUPPORTED_WCHAR 0x2ffff
95#else 95#else
96# define LAST_SUPPORTED_WCHAR CONFIG_LAST_SUPPORTED_WCHAR 96# define LAST_SUPPORTED_WCHAR CONFIG_LAST_SUPPORTED_WCHAR
97#endif 97#endif
98 98
99#if LAST_SUPPORTED_WCHAR >= 0x0300 99#if LAST_SUPPORTED_WCHAR >= 0x300
100struct interval { 100struct interval {
101 uint16_t first; 101 uint16_t first;
102 uint16_t last; 102 uint16_t last;
@@ -185,7 +185,7 @@ static int in_uint16_table(unsigned ucs, const uint16_t *table, unsigned max)
185 */ 185 */
186static int wcwidth(unsigned ucs) 186static int wcwidth(unsigned ucs)
187{ 187{
188#if LAST_SUPPORTED_WCHAR >= 0x0300 188#if LAST_SUPPORTED_WCHAR >= 0x300
189 /* sorted list of non-overlapping intervals of non-spacing characters */ 189 /* sorted list of non-overlapping intervals of non-spacing characters */
190 /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */ 190 /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
191 static const struct interval combining[] = { 191 static const struct interval combining[] = {
@@ -460,75 +460,75 @@ static int wcwidth(unsigned ucs)
460#undef BIG_ 460#undef BIG_
461#undef PAIR 461#undef PAIR
462 }; 462 };
463# if LAST_SUPPORTED_WCHAR >= 0x10000
464 /* Combining chars in Supplementary Multilingual Plane 0x1xxxx */
465 static const struct interval combining0x10000[] = {
466 { 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F },
467 { 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 },
468 { 0xD173, 0xD182 }, { 0xD185, 0xD18B }, { 0xD1AA, 0xD1AD },
469 { 0xD242, 0xD244 }
470 };
471# endif
472#endif 463#endif
473 464
474 if (ucs == 0) 465 if (ucs == 0)
475 return 0; 466 return 0;
476 /* test for 8-bit control characters (00-1f, 80-9f, 7f) */ 467
468 /* Test for 8-bit control characters (00-1f, 80-9f, 7f) */
477 if ((ucs & ~0x80) < 0x20 || ucs == 0x7f) 469 if ((ucs & ~0x80) < 0x20 || ucs == 0x7f)
478 return -1; 470 return -1;
479 if (ucs < 0x0300) /* optimization */ 471 /* Quick abort if it is an obviously invalid char */
472 if (ucs > LAST_SUPPORTED_WCHAR)
473 return -1;
474
475 /* Optimization: no combining chars below 0x300 */
476 if (LAST_SUPPORTED_WCHAR < 0x300 || ucs < 0x300)
480 return 1; 477 return 1;
481 478
482#if LAST_SUPPORTED_WCHAR < 0x0300 479#if LAST_SUPPORTED_WCHAR >= 0x300
483 return -1; 480 /* Binary search in table of non-spacing characters */
484#else
485 /* binary search in table of non-spacing characters */
486 if (in_interval_table(ucs, combining, ARRAY_SIZE(combining) - 1)) 481 if (in_interval_table(ucs, combining, ARRAY_SIZE(combining) - 1))
487 return 0; 482 return 0;
488 if (in_uint16_table(ucs, combining1, ARRAY_SIZE(combining1) - 1)) 483 if (in_uint16_table(ucs, combining1, ARRAY_SIZE(combining1) - 1))
489 return 0; 484 return 0;
490 485
491 if (ucs < 0x1100) /* optimization */ 486 /* Optimization: all chars below 0x1100 are not double-width */
487 if (LAST_SUPPORTED_WCHAR < 0x1100 || ucs < 0x1100)
492 return 1; 488 return 1;
493 489
494# if LAST_SUPPORTED_WCHAR < 0x1100 490# if LAST_SUPPORTED_WCHAR >= 0x1100
495 return -1; 491 /* Invalid code points: */
496# else 492 /* High (d800..dbff) and low (dc00..dfff) surrogates (valid only in UTF16) */
497 if (ucs >= LAST_SUPPORTED_WCHAR) 493 /* Private Use Area (e000..f8ff) */
498 return -1; 494 /* Noncharacters fdd0..fdef */
499 495 if ((LAST_SUPPORTED_WCHAR >= 0xd800 && ucs >= 0xd800 && ucs <= 0xf8ff)
500 /* High (d800..dbff) and low (dc00..dfff) surrogates are invalid (used only by UTF16) */ 496 || (LAST_SUPPORTED_WCHAR >= 0xfdd0 && ucs >= 0xfdd0 && ucs <= 0xfdef)
501 /* We also exclude Private Use Area (e000..f8ff) */
502 if (LAST_SUPPORTED_WCHAR >= 0xd800
503 && (ucs >= 0xd800 || ucs <= 0xf8ff)
504 ) { 497 ) {
505 return -1; 498 return -1;
506 } 499 }
507
508 /* 0xfffe and 0xffff in every plane are invalid */ 500 /* 0xfffe and 0xffff in every plane are invalid */
509 if (LAST_SUPPORTED_WCHAR >= 0xfffe 501 if (LAST_SUPPORTED_WCHAR >= 0xfffe && ((ucs & 0xfffe) == 0xfffe)) {
510 && (ucs & 0xfffe) == 0xfffe
511 ) {
512 return -1; 502 return -1;
513 } 503 }
514 504
515# if LAST_SUPPORTED_WCHAR >= 0x10000 505# if LAST_SUPPORTED_WCHAR >= 0x10000
516 /* binary search in table of non-spacing characters in Supplementary Multilingual Plane */ 506 if (ucs >= 0x10000) {
517 if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1)) 507 /* Combining chars in Supplementary Multilingual Plane 0x1xxxx */
518 return 0; 508 static const struct interval combining0x10000[] = {
519# endif 509 { 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F },
520 /* Check a few non-spacing chars in Supplementary Special-purpose Plane 0xExxxx */ 510 { 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 },
521 if (LAST_SUPPORTED_WCHAR >= 0xE0001 511 { 0xD173, 0xD182 }, { 0xD185, 0xD18B }, { 0xD1AA, 0xD1AD },
522 && ( ucs == 0xE0001 512 { 0xD242, 0xD244 }
523 || (ucs >= 0xE0020 && ucs <= 0xE007F) 513 };
524 || (ucs >= 0xE0100 && ucs <= 0xE01EF) 514 /* Binary search in table of non-spacing characters in Supplementary Multilingual Plane */
525 ) 515 if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1))
526 ) { 516 return 0;
527 return 0; 517 /* Check a few non-spacing chars in Supplementary Special-purpose Plane 0xExxxx */
518 if (LAST_SUPPORTED_WCHAR >= 0xE0001
519 && ( ucs == 0xE0001
520 || (ucs >= 0xE0020 && ucs <= 0xE007F)
521 || (ucs >= 0xE0100 && ucs <= 0xE01EF)
522 )
523 ) {
524 return 0;
525 }
528 } 526 }
527# endif
529 528
530 /* if we arrive here, ucs is not a combining or C0/C1 control character */ 529 /* If we arrive here, ucs is not a combining or C0/C1 control character.
531 530 * Check whether it's 1 char or 2-char wide.
531 */
532 return 1 + 532 return 1 +
533 ( (/*ucs >= 0x1100 &&*/ ucs <= 0x115f) /* Hangul Jamo init. consonants */ 533 ( (/*ucs >= 0x1100 &&*/ ucs <= 0x115f) /* Hangul Jamo init. consonants */
534 || ucs == 0x2329 /* left-pointing angle bracket; also CJK punct. char */ 534 || ucs == 0x2329 /* left-pointing angle bracket; also CJK punct. char */
diff --git a/testsuite/ls.tests b/testsuite/ls.tests
index e08249ea6..169313a63 100755
--- a/testsuite/ls.tests
+++ b/testsuite/ls.tests
@@ -13,7 +13,7 @@ mkdir ls.testdir || exit 1
13 13
14# With Unicode provided by libc locale, I'm not sure this test can pass. 14# With Unicode provided by libc locale, I'm not sure this test can pass.
15# I suspect we might fail to skip exactly correct number of bytes 15# I suspect we might fail to skip exactly correct number of bytes
16# over broken unicode sequences. 16# over broken unicode sequences.
17test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \ 17test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \
18&& test x"$CONFIG_LOCALE_SUPPORT" != x"y" \ 18&& test x"$CONFIG_LOCALE_SUPPORT" != x"y" \
19&& test x"$CONFIG_SUBST_WCHAR" = x"63" \ 19&& test x"$CONFIG_SUBST_WCHAR" = x"63" \
@@ -144,7 +144,7 @@ test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \
1440003_2.1__First_possible_sequence_of_a_certain_length_____________________| 1440003_2.1__First_possible_sequence_of_a_certain_length_____________________|
1450004_2.1.2__2_bytes__U-00000080_:________"?"______________________________| 1450004_2.1.2__2_bytes__U-00000080_:________"?"______________________________|
1460005_2.1.3__3_bytes__U-00000800_:________"ࠀ"______________________________| 1460005_2.1.3__3_bytes__U-00000800_:________"ࠀ"______________________________|
1470006_2.1.4__4_bytes__U-00010000_:________"?"______________________________| 1470006_2.1.4__4_bytes__U-00010000_:________"𐀀"______________________________|
1480007_2.1.5__5_bytes__U-00200000_:________"?"______________________________| 1480007_2.1.5__5_bytes__U-00200000_:________"?"______________________________|
1490008_2.1.6__6_bytes__U-04000000_:________"?"______________________________| 1490008_2.1.6__6_bytes__U-04000000_:________"?"______________________________|
1500009_2.2__Last_possible_sequence_of_a_certain_length______________________| 1500009_2.2__Last_possible_sequence_of_a_certain_length______________________|
@@ -155,9 +155,9 @@ test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \
1550014_2.2.5__5_bytes__U-03FFFFFF_:________"?"______________________________| 1550014_2.2.5__5_bytes__U-03FFFFFF_:________"?"______________________________|
1560015_2.2.6__6_bytes__U-7FFFFFFF_:________"?"______________________________| 1560015_2.2.6__6_bytes__U-7FFFFFFF_:________"?"______________________________|
1570016_2.3__Other_boundary_conditions_______________________________________| 1570016_2.3__Other_boundary_conditions_______________________________________|
1580017_2.3.1__U-0000D7FF_=_ed_9f_bf_=_"?"___________________________________| 1580017_2.3.1__U-0000D7FF_=_ed_9f_bf_=_""___________________________________|
1590018_2.3.2__U-0000E000_=_ee_80_80_=_"?"___________________________________| 1590018_2.3.2__U-0000E000_=_ee_80_80_=_"?"___________________________________|
1600019_2.3.3__U-0000FFFD_=_ef_bf_bd_=_"?"___________________________________| 1600019_2.3.3__U-0000FFFD_=_ef_bf_bd_=_""___________________________________|
1610020_2.3.4__U-0010FFFF_=_f4_8f_bf_bf_=_"?"________________________________| 1610020_2.3.4__U-0010FFFF_=_f4_8f_bf_bf_=_"?"________________________________|
1620021_2.3.5__U-00110000_=_f4_90_80_80_=_"?"________________________________| 1620021_2.3.5__U-00110000_=_f4_90_80_80_=_"?"________________________________|
1630022_3__Malformed_sequences_______________________________________________| 1630022_3__Malformed_sequences_______________________________________________|