diff options
author | Denys Vlasenko <vda.linux@googlemail.com> | 2010-01-31 16:04:30 +0100 |
---|---|---|
committer | Denys Vlasenko <vda.linux@googlemail.com> | 2010-01-31 16:04:30 +0100 |
commit | 40e4e88a28398c49d326b0fdf0d7f100f08b8f8d (patch) | |
tree | 89e7c1880d057393ee6a5596bee77d802b882c3f /libbb | |
parent | 344a44fbc5a236a06d840e7776ccbcc4702efa7f (diff) | |
download | busybox-w32-40e4e88a28398c49d326b0fdf0d7f100f08b8f8d.tar.gz busybox-w32-40e4e88a28398c49d326b0fdf0d7f100f08b8f8d.tar.bz2 busybox-w32-40e4e88a28398c49d326b0fdf0d7f100f08b8f8d.zip |
exclude more invalid unicode chars
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Diffstat (limited to 'libbb')
-rw-r--r-- | libbb/unicode_wcwidth.c | 72 |
1 files changed, 63 insertions, 9 deletions
diff --git a/libbb/unicode_wcwidth.c b/libbb/unicode_wcwidth.c index ab62b18f6..410c741ac 100644 --- a/libbb/unicode_wcwidth.c +++ b/libbb/unicode_wcwidth.c | |||
@@ -59,8 +59,39 @@ | |||
59 | * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c | 59 | * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c |
60 | */ | 60 | */ |
61 | 61 | ||
62 | #if CONFIG_LAST_SUPPORTED_WCHAR == 0 | 62 | /* Assigned Unicode character ranges: |
63 | # define LAST_SUPPORTED_WCHAR ((1 << 31) - 1) | 63 | * Plane Range |
64 | * 0 0000–FFFF Basic Multilingual Plane | ||
65 | * 1 10000–1FFFF Supplementary Multilingual Plane | ||
66 | * 2 20000–2FFFF Supplementary Ideographic Plane | ||
67 | * 3 30000–3FFFF Tertiary Ideographic Plane (no chars assigned yet) | ||
68 | * 4-13 40000–DFFFF currently unassigned | ||
69 | * 14 E0000–EFFFF Supplementary Special-purpose Plane | ||
70 | * 15 F0000–FFFFF Supplementary Private Use Area-A | ||
71 | * 16 100000–10FFFF Supplementary Private Use Area-B | ||
72 | * | ||
73 | * "Supplementary Special-purpose Plane currently contains non-graphical | ||
74 | * characters in two blocks of 128 and 240 characters. The first block | ||
75 | * is for language tag characters for use when language cannot be indicated | ||
76 | * through other protocols (such as the xml:lang attribute in XML). | ||
77 | * The other block contains glyph variation selectors to indicate | ||
78 | * an alternate glyph for a character that cannot be determined by context." | ||
79 | * | ||
80 | * In simpler terms: it is a tool to fix the "Han unification" mess | ||
81 | * created by the Unicode committee, to select the Chinese/Japanese/Korean/Taiwanese | ||
82 | * version of a character. (They forgot that the whole purpose of Unicode | ||
83 | * was to be able to write all chars in one charset without such tricks). | ||
84 | * Until East Asian users say it is actually necessary to support these | ||
85 | * code points in console applications like busybox | ||
86 | * (i.e. do these chars ever appear in filenames, hostnames, text files | ||
87 | * and such?), we are treating these code points as invalid. | ||
88 | * | ||
89 | * Tertiary Ideographic Plane is also ignored for now, | ||
90 | * until Unicode committee assigns something there. | ||
91 | */ | ||
92 | |||
93 | #if CONFIG_LAST_SUPPORTED_WCHAR < 126 || CONFIG_LAST_SUPPORTED_WCHAR > 0x30000 | ||
94 | # define LAST_SUPPORTED_WCHAR 0x30000 | ||
64 | #else | 95 | #else |
65 | # define LAST_SUPPORTED_WCHAR CONFIG_LAST_SUPPORTED_WCHAR | 96 | # define LAST_SUPPORTED_WCHAR CONFIG_LAST_SUPPORTED_WCHAR |
66 | #endif | 97 | #endif |
@@ -429,7 +460,8 @@ static int wcwidth(unsigned ucs) | |||
429 | #undef BIG_ | 460 | #undef BIG_ |
430 | #undef PAIR | 461 | #undef PAIR |
431 | }; | 462 | }; |
432 | # if LAST_SUPPORTED_WCHAR >= 0x1100 | 463 | # if LAST_SUPPORTED_WCHAR >= 0x10000 |
464 | /* Combining chars in Supplementary Multilingual Plane 0x1xxxx */ | ||
433 | static const struct interval combining0x10000[] = { | 465 | static const struct interval combining0x10000[] = { |
434 | { 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F }, | 466 | { 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F }, |
435 | { 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 }, | 467 | { 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 }, |
@@ -462,12 +494,35 @@ static int wcwidth(unsigned ucs) | |||
462 | # if LAST_SUPPORTED_WCHAR < 0x1100 | 494 | # if LAST_SUPPORTED_WCHAR < 0x1100 |
463 | return -1; | 495 | return -1; |
464 | # else | 496 | # else |
465 | /* binary search in table of non-spacing characters, cont. */ | 497 | if (ucs >= LAST_SUPPORTED_WCHAR) |
498 | return -1; | ||
499 | |||
500 | /* High (d800..dbff) and low (dc00..dfff) surrogates are invalid (used only by UTF16) */ | ||
501 | /* We also exclude Private Use Area (e000..f8ff) */ | ||
502 | if (LAST_SUPPORTED_WCHAR >= 0xd800 | ||
503 | && (ucs >= 0xd800 || ucs <= 0xf8ff) | ||
504 | ) { | ||
505 | return -1; | ||
506 | } | ||
507 | |||
508 | /* 0xfffe and 0xffff in every plane are invalid */ | ||
509 | if (LAST_SUPPORTED_WCHAR >= 0xfffe | ||
510 | && (ucs & 0xfffe) == 0xfffe | ||
511 | ) { | ||
512 | return -1; | ||
513 | } | ||
514 | |||
515 | # if LAST_SUPPORTED_WCHAR >= 0x10000 | ||
516 | /* binary search in table of non-spacing characters in Supplementary Multilingual Plane */ | ||
466 | if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1)) | 517 | if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1)) |
467 | return 0; | 518 | return 0; |
468 | if (ucs == 0xE0001 | 519 | # endif |
469 | || (ucs >= 0xE0020 && ucs <= 0xE007F) | 520 | /* Check a few non-spacing chars in Supplementary Special-purpose Plane 0xExxxx */ |
470 | || (ucs >= 0xE0100 && ucs <= 0xE01EF) | 521 | if (LAST_SUPPORTED_WCHAR >= 0xE0001 |
522 | && ( ucs == 0xE0001 | ||
523 | || (ucs >= 0xE0020 && ucs <= 0xE007F) | ||
524 | || (ucs >= 0xE0100 && ucs <= 0xE01EF) | ||
525 | ) | ||
471 | ) { | 526 | ) { |
472 | return 0; | 527 | return 0; |
473 | } | 528 | } |
@@ -485,8 +540,7 @@ static int wcwidth(unsigned ucs) | |||
485 | || (ucs >= 0xfe30 && ucs <= 0xfe6f) /* CJK Compatibility Forms */ | 540 | || (ucs >= 0xfe30 && ucs <= 0xfe6f) /* CJK Compatibility Forms */ |
486 | || (ucs >= 0xff00 && ucs <= 0xff60) /* Fullwidth Forms */ | 541 | || (ucs >= 0xff00 && ucs <= 0xff60) /* Fullwidth Forms */ |
487 | || (ucs >= 0xffe0 && ucs <= 0xffe6) | 542 | || (ucs >= 0xffe0 && ucs <= 0xffe6) |
488 | || (ucs >= 0x20000 && ucs <= 0x2fffd) | 543 | || ((ucs >> 17) == (2 >> 1)) /* 20000..3ffff: Supplementary and Tertiary Ideographic Planes */ |
489 | || (ucs >= 0x30000 && ucs <= 0x3fffd) | ||
490 | ); | 544 | ); |
491 | # endif | 545 | # endif |
492 | #endif | 546 | #endif |