aboutsummaryrefslogtreecommitdiff
path: root/libbb
diff options
context:
space:
mode:
authorDenys Vlasenko <vda.linux@googlemail.com>2010-03-26 14:06:56 +0100
committerDenys Vlasenko <vda.linux@googlemail.com>2010-03-26 14:06:56 +0100
commit19158a837df5093a2d655536424412bac2b07467 (patch)
tree3f3ce9c808e05dbf8dd38292f4c2db52cb73b429 /libbb
parentaa167556cd2954bb9a9fb0a005178462087a4600 (diff)
downloadbusybox-w32-19158a837df5093a2d655536424412bac2b07467.tar.gz
busybox-w32-19158a837df5093a2d655536424412bac2b07467.tar.bz2
busybox-w32-19158a837df5093a2d655536424412bac2b07467.zip
unicode: s/FEATURE_ASSUME_UNICODE/UNICODE_SUPPORT, add UNICODE_USING_LOCALE
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Diffstat (limited to 'libbb')
-rw-r--r--libbb/Kbuild2
-rw-r--r--libbb/lineedit.c26
-rw-r--r--libbb/printable_string.c2
-rw-r--r--libbb/progress.c2
-rw-r--r--libbb/unicode.c434
-rw-r--r--libbb/unicode_wcwidth.c543
6 files changed, 443 insertions, 566 deletions
diff --git a/libbb/Kbuild b/libbb/Kbuild
index 49cf4b8ad..4606d5aa7 100644
--- a/libbb/Kbuild
+++ b/libbb/Kbuild
@@ -124,7 +124,7 @@ lib-y += xrealloc_vector.o
124# and objects which may fail to build (SELinux on selinux-less system) 124# and objects which may fail to build (SELinux on selinux-less system)
125lib-$(CONFIG_SELINUX) += selinux_common.o 125lib-$(CONFIG_SELINUX) += selinux_common.o
126lib-$(CONFIG_FEATURE_MTAB_SUPPORT) += mtab.o 126lib-$(CONFIG_FEATURE_MTAB_SUPPORT) += mtab.o
127lib-$(CONFIG_FEATURE_ASSUME_UNICODE) += unicode.o 127lib-$(CONFIG_UNICODE_SUPPORT) += unicode.o
128lib-$(CONFIG_FEATURE_CHECK_NAMES) += die_if_bad_username.o 128lib-$(CONFIG_FEATURE_CHECK_NAMES) += die_if_bad_username.o
129 129
130lib-$(CONFIG_LOSETUP) += loop.o 130lib-$(CONFIG_LOSETUP) += loop.o
diff --git a/libbb/lineedit.c b/libbb/lineedit.c
index 38a09cb26..dc90846f9 100644
--- a/libbb/lineedit.c
+++ b/libbb/lineedit.c
@@ -67,7 +67,7 @@
67 67
68 68
69#undef CHAR_T 69#undef CHAR_T
70#if ENABLE_FEATURE_ASSUME_UNICODE 70#if ENABLE_UNICODE_SUPPORT
71# define BB_NUL L'\0' 71# define BB_NUL L'\0'
72# define CHAR_T wchar_t 72# define CHAR_T wchar_t
73static bool BB_isspace(CHAR_T c) { return ((unsigned)c < 256 && isspace(c)); } 73static bool BB_isspace(CHAR_T c) { return ((unsigned)c < 256 && isspace(c)); }
@@ -202,7 +202,7 @@ static void deinit_S(void)
202#define DEINIT_S() deinit_S() 202#define DEINIT_S() deinit_S()
203 203
204 204
205#if ENABLE_FEATURE_ASSUME_UNICODE 205#if ENABLE_UNICODE_SUPPORT
206static size_t load_string(const char *src, int maxsize) 206static size_t load_string(const char *src, int maxsize)
207{ 207{
208 ssize_t len = mbstowcs(command_ps, src, maxsize - 1); 208 ssize_t len = mbstowcs(command_ps, src, maxsize - 1);
@@ -932,7 +932,7 @@ static void input_tab(smallint *lastWasTab)
932#define matchBuf (S.input_tab__matchBuf) 932#define matchBuf (S.input_tab__matchBuf)
933 int find_type; 933 int find_type;
934 int recalc_pos; 934 int recalc_pos;
935#if ENABLE_FEATURE_ASSUME_UNICODE 935#if ENABLE_UNICODE_SUPPORT
936 /* cursor pos in command converted to multibyte form */ 936 /* cursor pos in command converted to multibyte form */
937 int cursor_mb; 937 int cursor_mb;
938#endif 938#endif
@@ -942,7 +942,7 @@ static void input_tab(smallint *lastWasTab)
942 /* Make a local copy of the string -- 942 /* Make a local copy of the string --
943 * up to the position of the cursor */ 943 * up to the position of the cursor */
944 save_string(matchBuf, cursor + 1); 944 save_string(matchBuf, cursor + 1);
945#if ENABLE_FEATURE_ASSUME_UNICODE 945#if ENABLE_UNICODE_SUPPORT
946 cursor_mb = strlen(matchBuf); 946 cursor_mb = strlen(matchBuf);
947#endif 947#endif
948 tmp = matchBuf; 948 tmp = matchBuf;
@@ -1015,7 +1015,7 @@ static void input_tab(smallint *lastWasTab)
1015 } 1015 }
1016 1016
1017 len_found = strlen(tmp); 1017 len_found = strlen(tmp);
1018#if !ENABLE_FEATURE_ASSUME_UNICODE 1018#if !ENABLE_UNICODE_SUPPORT
1019 /* have space to place the match? */ 1019 /* have space to place the match? */
1020 /* The result consists of three parts with these lengths: */ 1020 /* The result consists of three parts with these lengths: */
1021 /* (cursor - recalc_pos) + len_found + (command_len - cursor) */ 1021 /* (cursor - recalc_pos) + len_found + (command_len - cursor) */
@@ -1088,7 +1088,7 @@ static void save_command_ps_at_cur_history(void)
1088 int cur = state->cur_history; 1088 int cur = state->cur_history;
1089 free(state->history[cur]); 1089 free(state->history[cur]);
1090 1090
1091# if ENABLE_FEATURE_ASSUME_UNICODE 1091# if ENABLE_UNICODE_SUPPORT
1092 { 1092 {
1093 char tbuf[MAX_LINELEN]; 1093 char tbuf[MAX_LINELEN];
1094 save_string(tbuf, sizeof(tbuf)); 1094 save_string(tbuf, sizeof(tbuf));
@@ -1659,7 +1659,7 @@ static int lineedit_read_key(char *read_key_buffer)
1659{ 1659{
1660 int64_t ic; 1660 int64_t ic;
1661 int timeout = -1; 1661 int timeout = -1;
1662#if ENABLE_FEATURE_ASSUME_UNICODE 1662#if ENABLE_UNICODE_SUPPORT
1663 char unicode_buf[MB_CUR_MAX + 1]; 1663 char unicode_buf[MB_CUR_MAX + 1];
1664 int unicode_idx = 0; 1664 int unicode_idx = 0;
1665#endif 1665#endif
@@ -1674,7 +1674,7 @@ static int lineedit_read_key(char *read_key_buffer)
1674 */ 1674 */
1675 ic = read_key(STDIN_FILENO, read_key_buffer, timeout); 1675 ic = read_key(STDIN_FILENO, read_key_buffer, timeout);
1676 if (errno) { 1676 if (errno) {
1677#if ENABLE_FEATURE_ASSUME_UNICODE 1677#if ENABLE_UNICODE_SUPPORT
1678 if (errno == EAGAIN && unicode_idx != 0) 1678 if (errno == EAGAIN && unicode_idx != 0)
1679 goto pushback; 1679 goto pushback;
1680#endif 1680#endif
@@ -1700,7 +1700,7 @@ static int lineedit_read_key(char *read_key_buffer)
1700 } 1700 }
1701#endif 1701#endif
1702 1702
1703#if ENABLE_FEATURE_ASSUME_UNICODE 1703#if ENABLE_UNICODE_SUPPORT
1704 if (unicode_status == UNICODE_ON) { 1704 if (unicode_status == UNICODE_ON) {
1705 wchar_t wc; 1705 wchar_t wc;
1706 1706
@@ -1817,7 +1817,7 @@ int FAST_FUNC read_line_input(const char *prompt, char *command, int maxsize, li
1817 /* prepare before init handlers */ 1817 /* prepare before init handlers */
1818 cmdedit_y = 0; /* quasireal y, not true if line > xt*yt */ 1818 cmdedit_y = 0; /* quasireal y, not true if line > xt*yt */
1819 command_len = 0; 1819 command_len = 0;
1820#if ENABLE_FEATURE_ASSUME_UNICODE 1820#if ENABLE_UNICODE_SUPPORT
1821 command_ps = xzalloc(maxsize * sizeof(command_ps[0])); 1821 command_ps = xzalloc(maxsize * sizeof(command_ps[0]));
1822#else 1822#else
1823 command_ps = command; 1823 command_ps = command;
@@ -2199,8 +2199,8 @@ int FAST_FUNC read_line_input(const char *prompt, char *command, int maxsize, li
2199// } 2199// }
2200// } 2200// }
2201 if (ic < ' ' 2201 if (ic < ' '
2202 || (!ENABLE_FEATURE_ASSUME_UNICODE && ic >= 256) 2202 || (!ENABLE_UNICODE_SUPPORT && ic >= 256)
2203 || (ENABLE_FEATURE_ASSUME_UNICODE && ic >= VI_CMDMODE_BIT) 2203 || (ENABLE_UNICODE_SUPPORT && ic >= VI_CMDMODE_BIT)
2204 ) { 2204 ) {
2205 /* If VI_CMDMODE_BIT is set, ic is >= 256 2205 /* If VI_CMDMODE_BIT is set, ic is >= 256
2206 * and vi mode ignores unexpected chars. 2206 * and vi mode ignores unexpected chars.
@@ -2268,7 +2268,7 @@ int FAST_FUNC read_line_input(const char *prompt, char *command, int maxsize, li
2268/* Stop bug catching using "command_must_not_be_used" trick */ 2268/* Stop bug catching using "command_must_not_be_used" trick */
2269#undef command 2269#undef command
2270 2270
2271#if ENABLE_FEATURE_ASSUME_UNICODE 2271#if ENABLE_UNICODE_SUPPORT
2272 command[0] = '\0'; 2272 command[0] = '\0';
2273 if (command_len > 0) 2273 if (command_len > 0)
2274 command_len = save_string(command, maxsize - 1); 2274 command_len = save_string(command, maxsize - 1);
diff --git a/libbb/printable_string.c b/libbb/printable_string.c
index 47565de0d..83a482196 100644
--- a/libbb/printable_string.c
+++ b/libbb/printable_string.c
@@ -36,7 +36,7 @@ const char* FAST_FUNC printable_string(uni_stat_t *stats, const char *str)
36 s++; 36 s++;
37 } 37 }
38 38
39#if ENABLE_FEATURE_ASSUME_UNICODE 39#if ENABLE_UNICODE_SUPPORT
40 dst = unicode_conv_to_printable(stats, str); 40 dst = unicode_conv_to_printable(stats, str);
41#else 41#else
42 { 42 {
diff --git a/libbb/progress.c b/libbb/progress.c
index 0e484da6c..e96039042 100644
--- a/libbb/progress.c
+++ b/libbb/progress.c
@@ -78,7 +78,7 @@ void FAST_FUNC bb_progress_update(bb_progress_t *p,
78 if (ratio > 100) ratio = 100; 78 if (ratio > 100) ratio = 100;
79 } 79 }
80 80
81#if ENABLE_FEATURE_ASSUME_UNICODE 81#if ENABLE_UNICODE_SUPPORT
82 init_unicode(); 82 init_unicode();
83 /* libbb candidate? */ 83 /* libbb candidate? */
84 { 84 {
diff --git a/libbb/unicode.c b/libbb/unicode.c
index bc9714562..83e70b412 100644
--- a/libbb/unicode.c
+++ b/libbb/unicode.c
@@ -14,12 +14,12 @@
14uint8_t unicode_status; 14uint8_t unicode_status;
15#endif 15#endif
16 16
17/* This file is compiled only if FEATURE_ASSUME_UNICODE is on. 17/* This file is compiled only if UNICODE_SUPPORT is on.
18 * We check other options and decide whether to use libc support 18 * We check other options and decide whether to use libc support
19 * via locale, or use our own logic: 19 * via locale, or use our own logic:
20 */ 20 */
21 21
22#if ENABLE_LOCALE_SUPPORT 22#if ENABLE_UNICODE_USING_LOCALE
23 23
24/* Unicode support using libc locale support. */ 24/* Unicode support using libc locale support. */
25 25
@@ -139,7 +139,7 @@ size_t FAST_FUNC wcstombs(char *dest, const wchar_t *src, size_t n)
139 return org_n - n; 139 return org_n - n;
140} 140}
141 141
142#define ERROR_WCHAR (~(wchar_t)0) 142# define ERROR_WCHAR (~(wchar_t)0)
143 143
144static const char *mbstowc_internal(wchar_t *res, const char *src) 144static const char *mbstowc_internal(wchar_t *res, const char *src)
145{ 145{
@@ -239,7 +239,427 @@ int FAST_FUNC iswpunct(wint_t wc)
239 return (unsigned)wc <= 0x7f && ispunct(wc); 239 return (unsigned)wc <= 0x7f && ispunct(wc);
240} 240}
241 241
242#include "unicode_wcwidth.c" 242
243# if LAST_SUPPORTED_WCHAR >= 0x300
/* Closed range [first, last] of 16-bit code points. */
struct interval {
	uint16_t first;
	uint16_t last;
};

/*
 * Binary search for ucs in a table of intervals.
 *
 * table: entries table[0] .. table[max], ordered by code point
 * max:   index of the last entry (not the entry count)
 *
 * Returns 1 if ucs lies inside one of the intervals, 0 otherwise.
 */
static int in_interval_table(unsigned ucs, const struct interval *table, unsigned max)
{
	unsigned lo = 0;
	unsigned hi = max;

	/* Anything outside the span covered by the whole table cannot match */
	if (ucs < table[0].first)
		return 0;
	if (ucs > table[hi].last)
		return 0;

	while (lo <= hi) {
		unsigned probe = (lo + hi) / 2;
		const struct interval *cur = &table[probe];

		if (ucs > cur->last) {
			/* target is to the right of this interval */
			lo = probe + 1;
			continue;
		}
		if (ucs >= cur->first)
			return 1;
		/* target is to the left of this interval */
		hi = probe - 1;
	}
	return 0;
}
270
/*
 * Binary search for ucs in a table of short ranges packed into uint16_t.
 *
 * Each entry is decoded as: first = entry >> 2, last = first + (entry & 3),
 * i.e. a range of at most 4 consecutive code points.
 *
 * table: entries table[0] .. table[max], ordered by code point
 * max:   index of the last entry (not the entry count)
 *
 * Returns 1 if ucs lies inside one of the ranges, 0 otherwise.
 */
static int in_uint16_table(unsigned ucs, const uint16_t *table, unsigned max)
{
	unsigned min;
	unsigned mid;
	unsigned first, last;

	/* Quick reject: below the first entry... */
	first = table[0] >> 2;
	if (ucs < first)
		return 0;
	/* ...or above the LAST entry. The upper bound must come from
	 * table[max]; taking it from table[0] rejects every code point
	 * beyond the first range before the search even starts.
	 */
	last = (table[max] >> 2) + (table[max] & 3);
	if (ucs > last)
		return 0;

	min = 0;
	while (max >= min) {
		mid = (min + max) / 2;
		first = table[mid] >> 2;
		last = first + (table[mid] & 3);
		if (ucs > last)
			min = mid + 1;
		else if (ucs < first)
			max = mid - 1; /* mid == 0 cannot get here: ucs >= table[0] start */
		else
			return 1;
	}
	return 0;
}
296# endif
297
298
299/*
300 * This is an implementation of wcwidth() and wcswidth() (defined in
301 * IEEE Std 1003.1-2001) for Unicode.
302 *
303 * http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html
304 * http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html
305 *
306 * In fixed-width output devices, Latin characters all occupy a single
307 * "cell" position of equal width, whereas ideographic CJK characters
308 * occupy two such cells. Interoperability between terminal-line
309 * applications and (teletype-style) character terminals using the
310 * UTF-8 encoding requires agreement on which character should advance
311 * the cursor by how many cell positions. No established formal
312 * standards exist at present on which Unicode character shall occupy
313 * how many cell positions on character terminals. These routines are
314 * a first attempt of defining such behavior based on simple rules
315 * applied to data provided by the Unicode Consortium.
316 *
317 * For some graphical characters, the Unicode standard explicitly
318 * defines a character-cell width via the definition of the East Asian
319 * FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes.
320 * In all these cases, there is no ambiguity about which width a
321 * terminal shall use. For characters in the East Asian Ambiguous (A)
322 * class, the width choice depends purely on a preference of backward
323 * compatibility with either historic CJK or Western practice.
324 * Choosing single-width for these characters is easy to justify as
325 * the appropriate long-term solution, as the CJK practice of
326 * displaying these characters as double-width comes from historic
327 * implementation simplicity (8-bit encoded characters were displayed
328 * single-width and 16-bit ones double-width, even for Greek,
329 * Cyrillic, etc.) and not any typographic considerations.
330 *
331 * Much less clear is the choice of width for the Not East Asian
332 * (Neutral) class. Existing practice does not dictate a width for any
333 * of these characters. It would nevertheless make sense
334 * typographically to allocate two character cells to characters such
335 * as for instance EM SPACE or VOLUME INTEGRAL, which cannot be
336 * represented adequately with a single-width glyph. The following
337 * routines at present merely assign a single-cell width to all
338 * neutral characters, in the interest of simplicity. This is not
339 * entirely satisfactory and should be reconsidered before
340 * establishing a formal standard in this area. At the moment, the
341 * decision which Not East Asian (Neutral) characters should be
342 * represented by double-width glyphs cannot yet be answered by
343 * applying a simple rule from the Unicode database content. Setting
344 * up a proper standard for the behavior of UTF-8 character terminals
345 * will require a careful analysis not only of each Unicode character,
346 * but also of each presentation form, something the author of these
347 * routines has so far avoided doing.
348 *
349 * http://www.unicode.org/unicode/reports/tr11/
350 *
351 * Markus Kuhn -- 2007-05-26 (Unicode 5.0)
352 *
353 * Permission to use, copy, modify, and distribute this software
354 * for any purpose and without fee is hereby granted. The author
355 * disclaims all warranties with regard to this software.
356 *
357 * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
358 */
359
360/* Assigned Unicode character ranges:
361 * Plane Range
362 * 0 0000–FFFF Basic Multilingual Plane
363 * 1 10000–1FFFF Supplementary Multilingual Plane
364 * 2 20000–2FFFF Supplementary Ideographic Plane
365 * 3 30000–3FFFF Tertiary Ideographic Plane (no chars assigned yet)
366 * 4-13 40000–DFFFF currently unassigned
367 * 14 E0000–EFFFF Supplementary Special-purpose Plane
368 * 15 F0000–FFFFF Supplementary Private Use Area-A
369 * 16 100000–10FFFF Supplementary Private Use Area-B
370 *
371 * "Supplementary Special-purpose Plane currently contains non-graphical
372 * characters in two blocks of 128 and 240 characters. The first block
373 * is for language tag characters for use when language cannot be indicated
374 * through other protocols (such as the xml:lang attribute in XML).
375 * The other block contains glyph variation selectors to indicate
376 * an alternate glyph for a character that cannot be determined by context."
377 *
378 * In simpler terms: it is a tool to fix the "Han unification" mess
379 * created by Unicode committee, to select Chinese/Japanese/Korean/Taiwan
380 * version of a character. (They forgot that the whole purpose of the Unicode
381 * was to be able to write all chars in one charset without such tricks).
382 * Until East Asian users say it is actually necessary to support these
383 * code points in console applications like busybox
384 * (i.e. do these chars ever appear in filenames, hostnames, text files
385 * and such?), we are treating these code points as invalid.
386 *
387 * Tertiary Ideographic Plane is also ignored for now,
388 * until Unicode committee assigns something there.
389 */
390/* The following two functions define the column width of an ISO 10646
391 * character as follows:
392 *
393 * - The null character (U+0000) has a column width of 0.
394 *
395 * - Other C0/C1 control characters and DEL will lead to a return
396 * value of -1.
397 *
398 * - Non-spacing and enclosing combining characters (general
399 * category code Mn or Me in the Unicode database) have a
400 * column width of 0.
401 *
402 * - SOFT HYPHEN (U+00AD) has a column width of 1.
403 *
404 * - Other format characters (general category code Cf in the Unicode
405 * database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
406 *
407 * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
408 * have a column width of 0.
409 *
410 * - Spacing characters in the East Asian Wide (W) or East Asian
411 * Full-width (F) category as defined in Unicode Technical
412 * Report #11 have a column width of 2.
413 *
414 * - All remaining characters (including all printable
415 * ISO 8859-1 and WGL4 characters, Unicode control characters,
416 * etc.) have a column width of 1.
417 *
418 * This implementation assumes that wchar_t characters are encoded
419 * in ISO 10646.
420 */
421static int wcwidth(unsigned ucs)
422{
 /* Returns the column width of code point ucs: 0 for NUL and for
  * combining characters, -1 for control characters and for code points
  * that are unsupported or treated as invalid, otherwise 1 or 2.
  */
423# if LAST_SUPPORTED_WCHAR >= 0x300
424 /* sorted list of non-overlapping intervals of non-spacing characters */
425 /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
 /* The ARRAY list below is expanded three times, each time with
  * different BIG_/PAIR definitions:
  *  1) into combining[]: only BIG_ entries, as { first, last } pairs;
  *  2) into combining1[]: only PAIR entries, each packed into one
  *     uint16_t as (first << 2) | (last - first);
  *  3) into struct CHECK: every entry becomes a char array member whose
  *     size is -1 (a compile error) if the entry is filed under the
  *     wrong macro, i.e. a BIG_ that would fit in a PAIR or a PAIR
  *     that would not.
  */
426 static const struct interval combining[] = {
427# define BIG_(a,b) { a, b },
428# define PAIR(a,b)
429# define ARRAY /* PAIR if < 0x4000 and no more than 4 chars big */ \
430 BIG_(0x0300, 0x036F) \
431 PAIR(0x0483, 0x0486) \
432 PAIR(0x0488, 0x0489) \
433 BIG_(0x0591, 0x05BD) \
434 PAIR(0x05BF, 0x05BF) \
435 PAIR(0x05C1, 0x05C2) \
436 PAIR(0x05C4, 0x05C5) \
437 PAIR(0x05C7, 0x05C7) \
438 PAIR(0x0600, 0x0603) \
439 BIG_(0x0610, 0x0615) \
440 BIG_(0x064B, 0x065E) \
441 PAIR(0x0670, 0x0670) \
442 BIG_(0x06D6, 0x06E4) \
443 PAIR(0x06E7, 0x06E8) \
444 PAIR(0x06EA, 0x06ED) \
445 PAIR(0x070F, 0x070F) \
446 PAIR(0x0711, 0x0711) \
447 BIG_(0x0730, 0x074A) \
448 BIG_(0x07A6, 0x07B0) \
449 BIG_(0x07EB, 0x07F3) \
450 PAIR(0x0901, 0x0902) \
451 PAIR(0x093C, 0x093C) \
452 BIG_(0x0941, 0x0948) \
453 PAIR(0x094D, 0x094D) \
454 PAIR(0x0951, 0x0954) \
455 PAIR(0x0962, 0x0963) \
456 PAIR(0x0981, 0x0981) \
457 PAIR(0x09BC, 0x09BC) \
458 PAIR(0x09C1, 0x09C4) \
459 PAIR(0x09CD, 0x09CD) \
460 PAIR(0x09E2, 0x09E3) \
461 PAIR(0x0A01, 0x0A02) \
462 PAIR(0x0A3C, 0x0A3C) \
463 PAIR(0x0A41, 0x0A42) \
464 PAIR(0x0A47, 0x0A48) \
465 PAIR(0x0A4B, 0x0A4D) \
466 PAIR(0x0A70, 0x0A71) \
467 PAIR(0x0A81, 0x0A82) \
468 PAIR(0x0ABC, 0x0ABC) \
469 BIG_(0x0AC1, 0x0AC5) \
470 PAIR(0x0AC7, 0x0AC8) \
471 PAIR(0x0ACD, 0x0ACD) \
472 PAIR(0x0AE2, 0x0AE3) \
473 PAIR(0x0B01, 0x0B01) \
474 PAIR(0x0B3C, 0x0B3C) \
475 PAIR(0x0B3F, 0x0B3F) \
476 PAIR(0x0B41, 0x0B43) \
477 PAIR(0x0B4D, 0x0B4D) \
478 PAIR(0x0B56, 0x0B56) \
479 PAIR(0x0B82, 0x0B82) \
480 PAIR(0x0BC0, 0x0BC0) \
481 PAIR(0x0BCD, 0x0BCD) \
482 PAIR(0x0C3E, 0x0C40) \
483 PAIR(0x0C46, 0x0C48) \
484 PAIR(0x0C4A, 0x0C4D) \
485 PAIR(0x0C55, 0x0C56) \
486 PAIR(0x0CBC, 0x0CBC) \
487 PAIR(0x0CBF, 0x0CBF) \
488 PAIR(0x0CC6, 0x0CC6) \
489 PAIR(0x0CCC, 0x0CCD) \
490 PAIR(0x0CE2, 0x0CE3) \
491 PAIR(0x0D41, 0x0D43) \
492 PAIR(0x0D4D, 0x0D4D) \
493 PAIR(0x0DCA, 0x0DCA) \
494 PAIR(0x0DD2, 0x0DD4) \
495 PAIR(0x0DD6, 0x0DD6) \
496 PAIR(0x0E31, 0x0E31) \
497 BIG_(0x0E34, 0x0E3A) \
498 BIG_(0x0E47, 0x0E4E) \
499 PAIR(0x0EB1, 0x0EB1) \
500 BIG_(0x0EB4, 0x0EB9) \
501 PAIR(0x0EBB, 0x0EBC) \
502 BIG_(0x0EC8, 0x0ECD) \
503 PAIR(0x0F18, 0x0F19) \
504 PAIR(0x0F35, 0x0F35) \
505 PAIR(0x0F37, 0x0F37) \
506 PAIR(0x0F39, 0x0F39) \
507 BIG_(0x0F71, 0x0F7E) \
508 BIG_(0x0F80, 0x0F84) \
509 PAIR(0x0F86, 0x0F87) \
510 PAIR(0x0FC6, 0x0FC6) \
511 BIG_(0x0F90, 0x0F97) \
512 BIG_(0x0F99, 0x0FBC) \
513 PAIR(0x102D, 0x1030) \
514 PAIR(0x1032, 0x1032) \
515 PAIR(0x1036, 0x1037) \
516 PAIR(0x1039, 0x1039) \
517 PAIR(0x1058, 0x1059) \
518 BIG_(0x1160, 0x11FF) \
519 PAIR(0x135F, 0x135F) \
520 PAIR(0x1712, 0x1714) \
521 PAIR(0x1732, 0x1734) \
522 PAIR(0x1752, 0x1753) \
523 PAIR(0x1772, 0x1773) \
524 PAIR(0x17B4, 0x17B5) \
525 BIG_(0x17B7, 0x17BD) \
526 PAIR(0x17C6, 0x17C6) \
527 BIG_(0x17C9, 0x17D3) \
528 PAIR(0x17DD, 0x17DD) \
529 PAIR(0x180B, 0x180D) \
530 PAIR(0x18A9, 0x18A9) \
531 PAIR(0x1920, 0x1922) \
532 PAIR(0x1927, 0x1928) \
533 PAIR(0x1932, 0x1932) \
534 PAIR(0x1939, 0x193B) \
535 PAIR(0x1A17, 0x1A18) \
536 PAIR(0x1B00, 0x1B03) \
537 PAIR(0x1B34, 0x1B34) \
538 BIG_(0x1B36, 0x1B3A) \
539 PAIR(0x1B3C, 0x1B3C) \
540 PAIR(0x1B42, 0x1B42) \
541 BIG_(0x1B6B, 0x1B73) \
542 BIG_(0x1DC0, 0x1DCA) \
543 PAIR(0x1DFE, 0x1DFF) \
544 BIG_(0x200B, 0x200F) \
545 BIG_(0x202A, 0x202E) \
546 PAIR(0x2060, 0x2063) \
547 BIG_(0x206A, 0x206F) \
548 BIG_(0x20D0, 0x20EF) \
549 BIG_(0x302A, 0x302F) \
550 PAIR(0x3099, 0x309A) \
551 /* Too big to be packed in PAIRs: */ \
552 BIG_(0xA806, 0xA806) \
553 BIG_(0xA80B, 0xA80B) \
554 BIG_(0xA825, 0xA826) \
555 BIG_(0xFB1E, 0xFB1E) \
556 BIG_(0xFE00, 0xFE0F) \
557 BIG_(0xFE20, 0xFE23) \
558 BIG_(0xFEFF, 0xFEFF) \
559 BIG_(0xFFF9, 0xFFFB)
560 ARRAY
561# undef BIG_
562# undef PAIR
563 };
564# define BIG_(a,b)
565# define PAIR(a,b) (a << 2) | (b-a),
566 static const uint16_t combining1[] = { ARRAY };
567# undef BIG_
568# undef PAIR
569# define BIG_(a,b) char big_##a[b < 0x4000 && b-a <= 3 ? -1 : 1];
570# define PAIR(a,b) char pair##a[b >= 0x4000 || b-a > 3 ? -1 : 1];
571 struct CHECK { ARRAY };
572# undef BIG_
573# undef PAIR
574# undef ARRAY
575# endif
576
577 if (ucs == 0)
578 return 0;
579
580 /* Test for 8-bit control characters (00-1f, 80-9f, 7f) */
581 if ((ucs & ~0x80) < 0x20 || ucs == 0x7f)
582 return -1;
583 /* Quick abort if it is an obviously invalid char */
584 if (ucs > LAST_SUPPORTED_WCHAR)
585 return -1;
586
587 /* Optimization: no combining chars below 0x300 */
588 if (LAST_SUPPORTED_WCHAR < 0x300 || ucs < 0x300)
589 return 1;
590
591# if LAST_SUPPORTED_WCHAR >= 0x300
592 /* Binary search in table of non-spacing characters */
593 if (in_interval_table(ucs, combining, ARRAY_SIZE(combining) - 1))
594 return 0;
595 if (in_uint16_table(ucs, combining1, ARRAY_SIZE(combining1) - 1))
596 return 0;
597
598 /* Optimization: all chars below 0x1100 are not double-width */
599 if (LAST_SUPPORTED_WCHAR < 0x1100 || ucs < 0x1100)
600 return 1;
601
602# if LAST_SUPPORTED_WCHAR >= 0x1100
603 /* Invalid code points: */
604 /* High (d800..dbff) and low (dc00..dfff) surrogates (valid only in UTF16) */
605 /* Private Use Area (e000..f8ff) */
606 /* Noncharacters fdd0..fdef */
607 if ((LAST_SUPPORTED_WCHAR >= 0xd800 && ucs >= 0xd800 && ucs <= 0xf8ff)
608 || (LAST_SUPPORTED_WCHAR >= 0xfdd0 && ucs >= 0xfdd0 && ucs <= 0xfdef)
609 ) {
610 return -1;
611 }
612 /* 0xfffe and 0xffff in every plane are invalid */
613 if (LAST_SUPPORTED_WCHAR >= 0xfffe && ((ucs & 0xfffe) == 0xfffe)) {
614 return -1;
615 }
616
617# if LAST_SUPPORTED_WCHAR >= 0x10000
618 if (ucs >= 0x10000) {
619 /* Combining chars in Supplementary Multilingual Plane 0x1xxxx */
620 static const struct interval combining0x10000[] = {
621 { 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F },
622 { 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 },
623 { 0xD173, 0xD182 }, { 0xD185, 0xD18B }, { 0xD1AA, 0xD1AD },
624 { 0xD242, 0xD244 }
625 };
626 /* Binary search in table of non-spacing characters in Supplementary Multilingual Plane */
627 if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1))
628 return 0;
629 /* Check a few non-spacing chars in Supplementary Special-purpose Plane 0xExxxx */
630 if (LAST_SUPPORTED_WCHAR >= 0xE0001
631 && ( ucs == 0xE0001
632 || (ucs >= 0xE0020 && ucs <= 0xE007F)
633 || (ucs >= 0xE0100 && ucs <= 0xE01EF)
634 )
635 ) {
636 return 0;
637 }
638 }
639# endif
640
641 /* If we arrive here, ucs is not a combining or C0/C1 control character.
642 * Check whether it's 1 char or 2-char wide.
643 */
644 return 1 +
645 ( (/*ucs >= 0x1100 &&*/ ucs <= 0x115f) /* Hangul Jamo init. consonants */
646 || ucs == 0x2329 /* left-pointing angle bracket; also CJK punct. char */
647 || ucs == 0x232a /* right-pointing angle bracket; also CJK punct. char */
648 || (ucs >= 0x2e80 && ucs <= 0xa4cf && ucs != 0x303f) /* CJK ... Yi */
649# if LAST_SUPPORTED_WCHAR >= 0xac00
650 || (ucs >= 0xac00 && ucs <= 0xd7a3) /* Hangul Syllables */
651 || (ucs >= 0xf900 && ucs <= 0xfaff) /* CJK Compatibility Ideographs */
652 || (ucs >= 0xfe10 && ucs <= 0xfe19) /* Vertical forms */
653 || (ucs >= 0xfe30 && ucs <= 0xfe6f) /* CJK Compatibility Forms */
654 || (ucs >= 0xff00 && ucs <= 0xff60) /* Fullwidth Forms */
655 || (ucs >= 0xffe0 && ucs <= 0xffe6)
656 || ((ucs >> 17) == (2 >> 1)) /* 20000..3ffff: Supplementary and Tertiary Ideographic Planes */
657# endif
658 );
659# endif /* >= 0x1100 */
660# endif /* >= 0x300 */
661}
662
243 663
244# if ENABLE_UNICODE_BIDI_SUPPORT 664# if ENABLE_UNICODE_BIDI_SUPPORT
245int FAST_FUNC unicode_bidi_isrtl(wint_t wc) 665int FAST_FUNC unicode_bidi_isrtl(wint_t wc)
@@ -592,7 +1012,7 @@ static char* FAST_FUNC unicode_conv_to_printable2(uni_stat_t *stats, const char
592 int w; 1012 int w;
593 wchar_t wc; 1013 wchar_t wc;
594 1014
595#if ENABLE_LOCALE_SUPPORT 1015#if ENABLE_UNICODE_USING_LOCALE
596 { 1016 {
597 mbstate_t mbst = { 0 }; 1017 mbstate_t mbst = { 0 };
598 ssize_t rc = mbsrtowcs(&wc, &src, 1, &mbst); 1018 ssize_t rc = mbsrtowcs(&wc, &src, 1, &mbst);
@@ -647,7 +1067,7 @@ static char* FAST_FUNC unicode_conv_to_printable2(uni_stat_t *stats, const char
647 uni_count++; 1067 uni_count++;
648 uni_width += w; 1068 uni_width += w;
649 dst = xrealloc(dst, dst_len + MB_CUR_MAX); 1069 dst = xrealloc(dst, dst_len + MB_CUR_MAX);
650#if ENABLE_LOCALE_SUPPORT 1070#if ENABLE_UNICODE_USING_LOCALE
651 { 1071 {
652 mbstate_t mbst = { 0 }; 1072 mbstate_t mbst = { 0 };
653 dst_len += wcrtomb(&dst[dst_len], wc, &mbst); 1073 dst_len += wcrtomb(&dst[dst_len], wc, &mbst);
@@ -699,7 +1119,7 @@ unsigned FAST_FUNC unicode_padding_to_width(unsigned width, const char *src)
699 int w; 1119 int w;
700 wchar_t wc; 1120 wchar_t wc;
701 1121
702#if ENABLE_LOCALE_SUPPORT 1122#if ENABLE_UNICODE_USING_LOCALE
703 { 1123 {
704 mbstate_t mbst = { 0 }; 1124 mbstate_t mbst = { 0 };
705 ssize_t rc = mbsrtowcs(&wc, &src, 1, &mbst); 1125 ssize_t rc = mbsrtowcs(&wc, &src, 1, &mbst);
diff --git a/libbb/unicode_wcwidth.c b/libbb/unicode_wcwidth.c
deleted file mode 100644
index 0bb622705..000000000
--- a/libbb/unicode_wcwidth.c
+++ /dev/null
@@ -1,543 +0,0 @@
1/*
2 * This is an implementation of wcwidth() and wcswidth() (defined in
3 * IEEE Std 1003.1-2001) for Unicode.
4 *
5 * http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html
6 * http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html
7 *
8 * In fixed-width output devices, Latin characters all occupy a single
9 * "cell" position of equal width, whereas ideographic CJK characters
10 * occupy two such cells. Interoperability between terminal-line
11 * applications and (teletype-style) character terminals using the
12 * UTF-8 encoding requires agreement on which character should advance
13 * the cursor by how many cell positions. No established formal
14 * standards exist at present on which Unicode character shall occupy
15 * how many cell positions on character terminals. These routines are
16 * a first attempt at defining such behavior based on simple rules
17 * applied to data provided by the Unicode Consortium.
18 *
19 * For some graphical characters, the Unicode standard explicitly
20 * defines a character-cell width via the definition of the East Asian
21 * FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes.
22 * In all these cases, there is no ambiguity about which width a
23 * terminal shall use. For characters in the East Asian Ambiguous (A)
24 * class, the width choice depends purely on a preference of backward
25 * compatibility with either historic CJK or Western practice.
26 * Choosing single-width for these characters is easy to justify as
27 * the appropriate long-term solution, as the CJK practice of
28 * displaying these characters as double-width comes from historic
29 * implementation simplicity (8-bit encoded characters were displayed
30 * single-width and 16-bit ones double-width, even for Greek,
31 * Cyrillic, etc.) and not any typographic considerations.
32 *
33 * Much less clear is the choice of width for the Not East Asian
34 * (Neutral) class. Existing practice does not dictate a width for any
35 * of these characters. It would nevertheless make sense
36 * typographically to allocate two character cells to characters such
37 * as for instance EM SPACE or VOLUME INTEGRAL, which cannot be
38 * represented adequately with a single-width glyph. The following
39 * routines at present merely assign a single-cell width to all
40 * neutral characters, in the interest of simplicity. This is not
41 * entirely satisfactory and should be reconsidered before
42 * establishing a formal standard in this area. At the moment, the
43 * decision which Not East Asian (Neutral) characters should be
44 * represented by double-width glyphs cannot yet be answered by
45 * applying a simple rule from the Unicode database content. Setting
46 * up a proper standard for the behavior of UTF-8 character terminals
47 * will require a careful analysis not only of each Unicode character,
48 * but also of each presentation form, something the author of these
49 * routines has so far avoided doing.
50 *
51 * http://www.unicode.org/unicode/reports/tr11/
52 *
53 * Markus Kuhn -- 2007-05-26 (Unicode 5.0)
54 *
55 * Permission to use, copy, modify, and distribute this software
56 * for any purpose and without fee is hereby granted. The author
57 * disclaims all warranties with regard to this software.
58 *
59 * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
60 */
61
62/* Assigned Unicode character ranges:
63 * Plane Range
64 * 0 0000–FFFF Basic Multilingual Plane
65 * 1 10000–1FFFF Supplementary Multilingual Plane
66 * 2 20000–2FFFF Supplementary Ideographic Plane
67 * 3 30000-3FFFF Tertiary Ideographic Plane (no chars assigned yet)
68 * 4-13 40000–DFFFF currently unassigned
69 * 14 E0000–EFFFF Supplementary Special-purpose Plane
70 * 15 F0000–FFFFF Supplementary Private Use Area-A
71 * 16 100000–10FFFF Supplementary Private Use Area-B
72 *
73 * "Supplementary Special-purpose Plane currently contains non-graphical
74 * characters in two blocks of 128 and 240 characters. The first block
75 * is for language tag characters for use when language cannot be indicated
76 * through other protocols (such as the xml:lang attribute in XML).
77 * The other block contains glyph variation selectors to indicate
78 * an alternate glyph for a character that cannot be determined by context."
79 *
80 * In simpler terms: it is a tool to fix the "Han unification" mess
81 * created by the Unicode committee, to select the Chinese/Japanese/Korean/Taiwanese
82 * version of a character. (They forgot that the whole purpose of Unicode
83 * was to be able to write all chars in one charset without such tricks).
84 * Until East Asian users say it is actually necessary to support these
85 * code points in console applications like busybox
86 * (i.e. do these chars ever appear in filenames, hostnames, text files
87 * and such?), we are treating these code points as invalid.
88 *
89 * The Tertiary Ideographic Plane is also ignored for now,
90 * until the Unicode committee assigns something there.
91 */
92
93#if LAST_SUPPORTED_WCHAR >= 0x300
/* One closed range [first, last] of 16-bit code point values */
struct interval {
	uint16_t first;
	uint16_t last;
};

/* Binary search helper for a sorted table of non-overlapping intervals.
 *
 * ucs   - value to look up
 * table - array of intervals in ascending order
 * max   - index of the LAST valid entry (entry count minus one)
 *
 * Returns 1 when ucs lies inside one of the intervals, 0 otherwise.
 */
static int in_interval_table(unsigned ucs, const struct interval *table, unsigned max)
{
	unsigned lo = 0;
	unsigned hi = max;

	/* Reject values outside the span covered by the whole table */
	if (ucs < table[0].first || ucs > table[hi].last)
		return 0;

	while (lo <= hi) {
		unsigned probe = (lo + hi) / 2;
		const struct interval *entry = &table[probe];

		if (ucs > entry->last) {
			lo = probe + 1;
			continue;
		}
		if (ucs < entry->first) {
			/* probe > 0 here: the guard above ensured ucs >= table[0].first */
			hi = probe - 1;
			continue;
		}
		return 1;
	}
	return 0;
}
120
/* Binary search in a sorted table of packed intervals.
 *
 * Each uint16_t entry encodes one interval: (entry >> 2) is the first
 * value and (entry & 3) is the distance to the last value, so one entry
 * covers at most 4 consecutive values.
 *
 * ucs   - value to look up
 * table - packed entries in ascending, non-overlapping order
 * max   - index of the LAST valid entry (entry count minus one)
 *
 * Returns 1 when ucs lies inside one of the intervals, 0 otherwise.
 */
static int in_uint16_table(unsigned ucs, const uint16_t *table, unsigned max)
{
	unsigned min;
	unsigned mid;
	unsigned first, last;

	/* Quick reject: below the start of the first interval... */
	first = table[0] >> 2;
	if (ucs < first)
		return 0;
	/* ...or above the end of the last interval. The upper bound must be
	 * taken from table[max]; bounding by table[0] would make every entry
	 * after the first one unreachable.
	 */
	last = (table[max] >> 2) + (table[max] & 3);
	if (ucs > last)
		return 0;

	min = 0;
	while (max >= min) {
		mid = (min + max) / 2;
		first = table[mid] >> 2;
		last = first + (table[mid] & 3);
		if (ucs > last)
			min = mid + 1;
		else if (ucs < first)
			/* mid > 0 here: ucs >= first value of table[0] was checked above */
			max = mid - 1;
		else
			return 1;
	}
	return 0;
}
146#endif
147
148
149/* The following function defines the column width of an ISO 10646
150 * character as follows:
151 *
152 * - The null character (U+0000) has a column width of 0.
153 *
154 * - Other C0/C1 control characters and DEL will lead to a return
155 * value of -1.
156 *
157 * - Non-spacing and enclosing combining characters (general
158 * category code Mn or Me in the Unicode database) have a
159 * column width of 0.
160 *
161 * - SOFT HYPHEN (U+00AD) has a column width of 1.
162 *
163 * - Other format characters (general category code Cf in the Unicode
164 * database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
165 *
166 * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
167 * have a column width of 0.
168 *
169 * - Spacing characters in the East Asian Wide (W) or East Asian
170 * Full-width (F) category as defined in Unicode Technical
171 * Report #11 have a column width of 2.
172 *
173 * - All remaining characters (including all printable
174 * ISO 8859-1 and WGL4 characters, Unicode control characters,
175 * etc.) have a column width of 1.
176 *
177 * This implementation assumes that wchar_t characters are encoded
178 * in ISO 10646.
179 */
180static int wcwidth(unsigned ucs)
181{
182#if LAST_SUPPORTED_WCHAR >= 0x300
183 /* Sorted list of non-overlapping intervals of non-spacing (zero-width) characters */
184 /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
185 static const struct interval combining[] = {
186#define BIG_(a,b) { a, b },
187#define PAIR(a,b)
188 /* Use PAIR if the range starts below 0x4000 and is no more than 4 chars big, else BIG_ */
189 BIG_(0x0300, 0x036F)
190 PAIR(0x0483, 0x0486)
191 PAIR(0x0488, 0x0489)
192 BIG_(0x0591, 0x05BD)
193 PAIR(0x05BF, 0x05BF)
194 PAIR(0x05C1, 0x05C2)
195 PAIR(0x05C4, 0x05C5)
196 PAIR(0x05C7, 0x05C7)
197 PAIR(0x0600, 0x0603)
198 BIG_(0x0610, 0x0615)
199 BIG_(0x064B, 0x065E)
200 PAIR(0x0670, 0x0670)
201 BIG_(0x06D6, 0x06E4)
202 PAIR(0x06E7, 0x06E8)
203 PAIR(0x06EA, 0x06ED)
204 PAIR(0x070F, 0x070F)
205 PAIR(0x0711, 0x0711)
206 BIG_(0x0730, 0x074A)
207 BIG_(0x07A6, 0x07B0)
208 BIG_(0x07EB, 0x07F3)
209 PAIR(0x0901, 0x0902)
210 PAIR(0x093C, 0x093C)
211 BIG_(0x0941, 0x0948)
212 PAIR(0x094D, 0x094D)
213 PAIR(0x0951, 0x0954)
214 PAIR(0x0962, 0x0963)
215 PAIR(0x0981, 0x0981)
216 PAIR(0x09BC, 0x09BC)
217 PAIR(0x09C1, 0x09C4)
218 PAIR(0x09CD, 0x09CD)
219 PAIR(0x09E2, 0x09E3)
220 PAIR(0x0A01, 0x0A02)
221 PAIR(0x0A3C, 0x0A3C)
222 PAIR(0x0A41, 0x0A42)
223 PAIR(0x0A47, 0x0A48)
224 PAIR(0x0A4B, 0x0A4D)
225 PAIR(0x0A70, 0x0A71)
226 PAIR(0x0A81, 0x0A82)
227 PAIR(0x0ABC, 0x0ABC)
228 BIG_(0x0AC1, 0x0AC5)
229 PAIR(0x0AC7, 0x0AC8)
230 PAIR(0x0ACD, 0x0ACD)
231 PAIR(0x0AE2, 0x0AE3)
232 PAIR(0x0B01, 0x0B01)
233 PAIR(0x0B3C, 0x0B3C)
234 PAIR(0x0B3F, 0x0B3F)
235 PAIR(0x0B41, 0x0B43)
236 PAIR(0x0B4D, 0x0B4D)
237 PAIR(0x0B56, 0x0B56)
238 PAIR(0x0B82, 0x0B82)
239 PAIR(0x0BC0, 0x0BC0)
240 PAIR(0x0BCD, 0x0BCD)
241 PAIR(0x0C3E, 0x0C40)
242 PAIR(0x0C46, 0x0C48)
243 PAIR(0x0C4A, 0x0C4D)
244 PAIR(0x0C55, 0x0C56)
245 PAIR(0x0CBC, 0x0CBC)
246 PAIR(0x0CBF, 0x0CBF)
247 PAIR(0x0CC6, 0x0CC6)
248 PAIR(0x0CCC, 0x0CCD)
249 PAIR(0x0CE2, 0x0CE3)
250 PAIR(0x0D41, 0x0D43)
251 PAIR(0x0D4D, 0x0D4D)
252 PAIR(0x0DCA, 0x0DCA)
253 PAIR(0x0DD2, 0x0DD4)
254 PAIR(0x0DD6, 0x0DD6)
255 PAIR(0x0E31, 0x0E31)
256 BIG_(0x0E34, 0x0E3A)
257 BIG_(0x0E47, 0x0E4E)
258 PAIR(0x0EB1, 0x0EB1)
259 BIG_(0x0EB4, 0x0EB9)
260 PAIR(0x0EBB, 0x0EBC)
261 BIG_(0x0EC8, 0x0ECD)
262 PAIR(0x0F18, 0x0F19)
263 PAIR(0x0F35, 0x0F35)
264 PAIR(0x0F37, 0x0F37)
265 PAIR(0x0F39, 0x0F39)
266 BIG_(0x0F71, 0x0F7E)
267 BIG_(0x0F80, 0x0F84)
268 PAIR(0x0F86, 0x0F87)
269 PAIR(0x0FC6, 0x0FC6)
270 BIG_(0x0F90, 0x0F97)
271 BIG_(0x0F99, 0x0FBC)
272 PAIR(0x102D, 0x1030)
273 PAIR(0x1032, 0x1032)
274 PAIR(0x1036, 0x1037)
275 PAIR(0x1039, 0x1039)
276 PAIR(0x1058, 0x1059)
277 BIG_(0x1160, 0x11FF)
278 PAIR(0x135F, 0x135F)
279 PAIR(0x1712, 0x1714)
280 PAIR(0x1732, 0x1734)
281 PAIR(0x1752, 0x1753)
282 PAIR(0x1772, 0x1773)
283 PAIR(0x17B4, 0x17B5)
284 BIG_(0x17B7, 0x17BD)
285 PAIR(0x17C6, 0x17C6)
286 BIG_(0x17C9, 0x17D3)
287 PAIR(0x17DD, 0x17DD)
288 PAIR(0x180B, 0x180D)
289 PAIR(0x18A9, 0x18A9)
290 PAIR(0x1920, 0x1922)
291 PAIR(0x1927, 0x1928)
292 PAIR(0x1932, 0x1932)
293 PAIR(0x1939, 0x193B)
294 PAIR(0x1A17, 0x1A18)
295 PAIR(0x1B00, 0x1B03)
296 PAIR(0x1B34, 0x1B34)
297 BIG_(0x1B36, 0x1B3A)
298 PAIR(0x1B3C, 0x1B3C)
299 PAIR(0x1B42, 0x1B42)
300 BIG_(0x1B6B, 0x1B73)
301 BIG_(0x1DC0, 0x1DCA)
302 PAIR(0x1DFE, 0x1DFF)
303 BIG_(0x200B, 0x200F)
304 BIG_(0x202A, 0x202E)
305 PAIR(0x2060, 0x2063)
306 BIG_(0x206A, 0x206F)
307 BIG_(0x20D0, 0x20EF)
308 BIG_(0x302A, 0x302F)
309 PAIR(0x3099, 0x309A)
310 /* These start at or above 0x4000, too big to be packed in PAIRs: */
311 { 0xA806, 0xA806 },
312 { 0xA80B, 0xA80B },
313 { 0xA825, 0xA826 },
314 { 0xFB1E, 0xFB1E },
315 { 0xFE00, 0xFE0F },
316 { 0xFE20, 0xFE23 },
317 { 0xFEFF, 0xFEFF },
318 { 0xFFF9, 0xFFFB }
319#undef BIG_
320#undef PAIR
321 };
322 static const uint16_t combining1[] = {
323#define BIG_(a,b)
324#define PAIR(a,b) (a << 2) | (b-a),
325 /* Exact copy-n-paste of the above; here only the PAIR entries expand, packed as (first << 2) | (last - first): */
326 BIG_(0x0300, 0x036F)
327 PAIR(0x0483, 0x0486)
328 PAIR(0x0488, 0x0489)
329 BIG_(0x0591, 0x05BD)
330 PAIR(0x05BF, 0x05BF)
331 PAIR(0x05C1, 0x05C2)
332 PAIR(0x05C4, 0x05C5)
333 PAIR(0x05C7, 0x05C7)
334 PAIR(0x0600, 0x0603)
335 BIG_(0x0610, 0x0615)
336 BIG_(0x064B, 0x065E)
337 PAIR(0x0670, 0x0670)
338 BIG_(0x06D6, 0x06E4)
339 PAIR(0x06E7, 0x06E8)
340 PAIR(0x06EA, 0x06ED)
341 PAIR(0x070F, 0x070F)
342 PAIR(0x0711, 0x0711)
343 BIG_(0x0730, 0x074A)
344 BIG_(0x07A6, 0x07B0)
345 BIG_(0x07EB, 0x07F3)
346 PAIR(0x0901, 0x0902)
347 PAIR(0x093C, 0x093C)
348 BIG_(0x0941, 0x0948)
349 PAIR(0x094D, 0x094D)
350 PAIR(0x0951, 0x0954)
351 PAIR(0x0962, 0x0963)
352 PAIR(0x0981, 0x0981)
353 PAIR(0x09BC, 0x09BC)
354 PAIR(0x09C1, 0x09C4)
355 PAIR(0x09CD, 0x09CD)
356 PAIR(0x09E2, 0x09E3)
357 PAIR(0x0A01, 0x0A02)
358 PAIR(0x0A3C, 0x0A3C)
359 PAIR(0x0A41, 0x0A42)
360 PAIR(0x0A47, 0x0A48)
361 PAIR(0x0A4B, 0x0A4D)
362 PAIR(0x0A70, 0x0A71)
363 PAIR(0x0A81, 0x0A82)
364 PAIR(0x0ABC, 0x0ABC)
365 BIG_(0x0AC1, 0x0AC5)
366 PAIR(0x0AC7, 0x0AC8)
367 PAIR(0x0ACD, 0x0ACD)
368 PAIR(0x0AE2, 0x0AE3)
369 PAIR(0x0B01, 0x0B01)
370 PAIR(0x0B3C, 0x0B3C)
371 PAIR(0x0B3F, 0x0B3F)
372 PAIR(0x0B41, 0x0B43)
373 PAIR(0x0B4D, 0x0B4D)
374 PAIR(0x0B56, 0x0B56)
375 PAIR(0x0B82, 0x0B82)
376 PAIR(0x0BC0, 0x0BC0)
377 PAIR(0x0BCD, 0x0BCD)
378 PAIR(0x0C3E, 0x0C40)
379 PAIR(0x0C46, 0x0C48)
380 PAIR(0x0C4A, 0x0C4D)
381 PAIR(0x0C55, 0x0C56)
382 PAIR(0x0CBC, 0x0CBC)
383 PAIR(0x0CBF, 0x0CBF)
384 PAIR(0x0CC6, 0x0CC6)
385 PAIR(0x0CCC, 0x0CCD)
386 PAIR(0x0CE2, 0x0CE3)
387 PAIR(0x0D41, 0x0D43)
388 PAIR(0x0D4D, 0x0D4D)
389 PAIR(0x0DCA, 0x0DCA)
390 PAIR(0x0DD2, 0x0DD4)
391 PAIR(0x0DD6, 0x0DD6)
392 PAIR(0x0E31, 0x0E31)
393 BIG_(0x0E34, 0x0E3A)
394 BIG_(0x0E47, 0x0E4E)
395 PAIR(0x0EB1, 0x0EB1)
396 BIG_(0x0EB4, 0x0EB9)
397 PAIR(0x0EBB, 0x0EBC)
398 BIG_(0x0EC8, 0x0ECD)
399 PAIR(0x0F18, 0x0F19)
400 PAIR(0x0F35, 0x0F35)
401 PAIR(0x0F37, 0x0F37)
402 PAIR(0x0F39, 0x0F39)
403 BIG_(0x0F71, 0x0F7E)
404 BIG_(0x0F80, 0x0F84)
405 PAIR(0x0F86, 0x0F87)
406 PAIR(0x0FC6, 0x0FC6)
407 BIG_(0x0F90, 0x0F97)
408 BIG_(0x0F99, 0x0FBC)
409 PAIR(0x102D, 0x1030)
410 PAIR(0x1032, 0x1032)
411 PAIR(0x1036, 0x1037)
412 PAIR(0x1039, 0x1039)
413 PAIR(0x1058, 0x1059)
414 BIG_(0x1160, 0x11FF)
415 PAIR(0x135F, 0x135F)
416 PAIR(0x1712, 0x1714)
417 PAIR(0x1732, 0x1734)
418 PAIR(0x1752, 0x1753)
419 PAIR(0x1772, 0x1773)
420 PAIR(0x17B4, 0x17B5)
421 BIG_(0x17B7, 0x17BD)
422 PAIR(0x17C6, 0x17C6)
423 BIG_(0x17C9, 0x17D3)
424 PAIR(0x17DD, 0x17DD)
425 PAIR(0x180B, 0x180D)
426 PAIR(0x18A9, 0x18A9)
427 PAIR(0x1920, 0x1922)
428 PAIR(0x1927, 0x1928)
429 PAIR(0x1932, 0x1932)
430 PAIR(0x1939, 0x193B)
431 PAIR(0x1A17, 0x1A18)
432 PAIR(0x1B00, 0x1B03)
433 PAIR(0x1B34, 0x1B34)
434 BIG_(0x1B36, 0x1B3A)
435 PAIR(0x1B3C, 0x1B3C)
436 PAIR(0x1B42, 0x1B42)
437 BIG_(0x1B6B, 0x1B73)
438 BIG_(0x1DC0, 0x1DCA)
439 PAIR(0x1DFE, 0x1DFF)
440 BIG_(0x200B, 0x200F)
441 BIG_(0x202A, 0x202E)
442 PAIR(0x2060, 0x2063)
443 BIG_(0x206A, 0x206F)
444 BIG_(0x20D0, 0x20EF)
445 BIG_(0x302A, 0x302F)
446 PAIR(0x3099, 0x309A)
447#undef BIG_
448#undef PAIR
449 };
450 struct CHECK {
451#define BIG_(a,b) char big##a[b-a <= 3 ? -1 : 1];
452#define PAIR(a,b) char pair##a[b-a > 3 ? -1 : 1];
453 /* Copy-n-paste the list here again to verify correctness: a misclassified BIG_/PAIR entry yields a negative array size */
454#undef BIG_
455#undef PAIR
456 };
457#endif
458
459 if (ucs == 0)
460 return 0;
461
462 /* Test for 8-bit control characters (00-1f, 80-9f, 7f) */
463 if ((ucs & ~0x80) < 0x20 || ucs == 0x7f)
464 return -1;
465 /* Quick abort if it is an obviously invalid char */
466 if (ucs > LAST_SUPPORTED_WCHAR)
467 return -1;
468
469 /* Optimization: no combining chars below 0x300 */
470 if (LAST_SUPPORTED_WCHAR < 0x300 || ucs < 0x300)
471 return 1;
472
473#if LAST_SUPPORTED_WCHAR >= 0x300
474 /* Binary search in both tables of non-spacing characters (BIG_ intervals, then packed PAIRs) */
475 if (in_interval_table(ucs, combining, ARRAY_SIZE(combining) - 1))
476 return 0;
477 if (in_uint16_table(ucs, combining1, ARRAY_SIZE(combining1) - 1))
478 return 0;
479
480 /* Optimization: all chars below 0x1100 are not double-width */
481 if (LAST_SUPPORTED_WCHAR < 0x1100 || ucs < 0x1100)
482 return 1;
483
484# if LAST_SUPPORTED_WCHAR >= 0x1100
485 /* Invalid code points: */
486 /* High (d800..dbff) and low (dc00..dfff) surrogates (valid only in UTF16) */
487 /* Private Use Area (e000..f8ff) */
488 /* Noncharacters fdd0..fdef */
489 if ((LAST_SUPPORTED_WCHAR >= 0xd800 && ucs >= 0xd800 && ucs <= 0xf8ff)
490 || (LAST_SUPPORTED_WCHAR >= 0xfdd0 && ucs >= 0xfdd0 && ucs <= 0xfdef)
491 ) {
492 return -1;
493 }
494 /* 0xfffe and 0xffff in every plane are invalid */
495 if (LAST_SUPPORTED_WCHAR >= 0xfffe && ((ucs & 0xfffe) == 0xfffe)) {
496 return -1;
497 }
498
499# if LAST_SUPPORTED_WCHAR >= 0x10000
500 if (ucs >= 0x10000) {
501 /* Combining chars in Supplementary Multilingual Plane 0x1xxxx (stored without the 0x10000 bit to fit uint16_t) */
502 static const struct interval combining0x10000[] = {
503 { 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F },
504 { 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 },
505 { 0xD173, 0xD182 }, { 0xD185, 0xD18B }, { 0xD1AA, 0xD1AD },
506 { 0xD242, 0xD244 }
507 };
508 /* Binary search in table of non-spacing characters in Supplementary Multilingual Plane */
509 if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1))
510 return 0;
511 /* Check a few non-spacing chars in Supplementary Special-purpose Plane 0xExxxx */
512 if (LAST_SUPPORTED_WCHAR >= 0xE0001
513 && ( ucs == 0xE0001
514 || (ucs >= 0xE0020 && ucs <= 0xE007F)
515 || (ucs >= 0xE0100 && ucs <= 0xE01EF)
516 )
517 ) {
518 return 0;
519 }
520 }
521# endif
522
523 /* If we arrive here, ucs is not a combining or C0/C1 control character.
524 * Check whether it's 1 char or 2-char wide.
525 */
526 return 1 +
527 ( (/*ucs >= 0x1100 &&*/ ucs <= 0x115f) /* Hangul Jamo init. consonants */
528 || ucs == 0x2329 /* left-pointing angle bracket; also CJK punct. char */
529 || ucs == 0x232a /* right-pointing angle bracket; also CJK punct. char */
530 || (ucs >= 0x2e80 && ucs <= 0xa4cf && ucs != 0x303f) /* CJK ... Yi */
531# if LAST_SUPPORTED_WCHAR >= 0xac00
532 || (ucs >= 0xac00 && ucs <= 0xd7a3) /* Hangul Syllables */
533 || (ucs >= 0xf900 && ucs <= 0xfaff) /* CJK Compatibility Ideographs */
534 || (ucs >= 0xfe10 && ucs <= 0xfe19) /* Vertical forms */
535 || (ucs >= 0xfe30 && ucs <= 0xfe6f) /* CJK Compatibility Forms */
536 || (ucs >= 0xff00 && ucs <= 0xff60) /* Fullwidth Forms */
537 || (ucs >= 0xffe0 && ucs <= 0xffe6)
538 || ((ucs >> 17) == (2 >> 1)) /* 20000..3ffff: Supplementary and Tertiary Ideographic Planes */
539# endif
540 );
541# endif /* >= 0x1100 */
542#endif /* >= 0x300 */
543}