libbb: better unicode width support. Hopefully fixes bug 839.

Also opens up a possibility to make other unicode stuff
smaller and more correct later. but:

function                                             old     new   delta
static.combining                                       -     516    +516
bb_wcwidth                                             -     328    +328
unicode_cut_nchars                                     -     141    +141
mbstowc_internal                                       -      93     +93
in_table                                               -      78     +78
cal_main                                             899     961     +62
static.combining0x10000                                -      40     +40
unicode_strlen                                         -      31     +31
bb_mbstrlen                                           31       -     -31
bb_mbstowcs                                          173     102     -71
------------------------------------------------------------------------------
(add/remove: 7/1 grow/shrink: 1/1 up/down: 1289/-102)        Total: 1187 bytes

Uses code of Markus Kuhn, which is in public domain:

http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
    "Permission to use, copy, modify, and distribute this software
    for any purpose and without fee is hereby granted. The author
    disclaims all warranties with regard to this software."

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
author: Denys Vlasenko <vda.linux@googlemail.com> 2010-01-24 07:44:03 +0100
committer: Denys Vlasenko <vda.linux@googlemail.com> 2010-01-24 07:44:03 +0100
commit: 9f93d621925966c22ee51fdcb5def8e131596f9b (patch)
tree: c1024fa92d6f422df6cf0991d5c9c8a4977b8625 /libbb/unicode.c
parent: 5da9f96ad85a2d9119d92c7a3d89deca7d904210 (diff)
download: busybox-w32-9f93d621925966c22ee51fdcb5def8e131596f9b.tar.gz
busybox-w32-9f93d621925966c22ee51fdcb5def8e131596f9b.tar.bz2
busybox-w32-9f93d621925966c22ee51fdcb5def8e131596f9b.zip
1 files changed, 153 insertions, 57 deletions
diff --git a/libbb/unicode.c b/libbb/unicode.c
index 80042957c..844c154e0 100644
--- a/libbb/unicode.c
+++ b/libbb/unicode.c
@@ -9,22 +9,19 @@
 #include "libbb.h"
 #include "unicode.h"
-/* If it's not a constant... */
+/* If it's not #defined as a constant in unicode.h... */
 #ifndef unicode_status
 uint8_t unicode_status;
 #endif
-size_t FAST_FUNC bb_mbstrlen(const char *string)
+/* This file is compiled only if FEATURE_ASSUME_UNICODE is on.
-{
+ * We check other options and decide whether to use libc support
-        size_t width = mbstowcs(NULL, string, INT_MAX);
+ * via locale, or use our own logic:
-        if (width == (size_t)-1L)
+ */
-                return strlen(string);
-        return width;
-}
 #if ENABLE_LOCALE_SUPPORT
-/* Unicode support using libc */
+/* Unicode support using libc locale support. */
 void FAST_FUNC init_unicode(void)
 {
@@ -34,12 +31,12 @@ void FAST_FUNC init_unicode(void)
        if (unicode_status != UNICODE_UNKNOWN)
                return;
-        unicode_status = bb_mbstrlen(unicode_0x394) == 1 ? UNICODE_ON : UNICODE_OFF;
+        unicode_status = unicode_strlen(unicode_0x394) == 1 ? UNICODE_ON : UNICODE_OFF;
 }
 #else
-/* Crude "locale support" which knows only C and Unicode locales */
+/* Homegrown Unicode support. It knows only C and Unicode locales. */
 # if ENABLE_FEATURE_CHECK_UNICODE_IN_ENV
 void FAST_FUNC init_unicode(void)
@@ -93,7 +90,6 @@ static size_t wcrtomb_internal(char *s, wchar_t wc)
        s[0] = wc | (uint8_t)(0x3f00 >> n);
        return n;
 }
 size_t FAST_FUNC wcrtomb(char *s, wchar_t wc, mbstate_t *ps UNUSED_PARAM)
 {
        if (unicode_status != UNICODE_ON) {
@@ -103,7 +99,6 @@ size_t FAST_FUNC wcrtomb(char *s, wchar_t wc, mbstate_t *ps UNUSED_PARAM)
        return wcrtomb_internal(s, wc);
 }
 size_t FAST_FUNC wcstombs(char *dest, const wchar_t *src, size_t n)
 {
        size_t org_n = n;
@@ -144,6 +139,51 @@ size_t FAST_FUNC wcstombs(char *dest, const wchar_t *src, size_t n)
        return org_n - n;
 }
+static const char *mbstowc_internal(wchar_t *res, const char *src)
+{
+        int bytes;
+        unsigned c = (unsigned char) *src++;
+        if (c <= 0x7f) {
+                *res = c;
+                return src;
+        }
+        /* 80-7FF -> 110yyyxx 10xxxxxx */
+        /* 800-FFFF -> 1110yyyy 10yyyyxx 10xxxxxx */
+        /* 10000-1FFFFF -> 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx */
+        /* 200000-3FFFFFF -> 111110tt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */
+        /* 4000000-FFFFFFFF -> 111111tt 10tttttt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */
+        bytes = 0;
+        do {
+                c <<= 1;
+                bytes++;
+        } while ((c & 0x80) && bytes < 6);
+        if (bytes == 1)
+                return NULL;
+        c = (uint8_t)(c) >> bytes;
+        while (--bytes) {
+                unsigned ch = (unsigned char) *src++;
+                if ((ch & 0xc0) != 0x80) {
+                        return NULL;
+                }
+                c = (c << 6) + (ch & 0x3f);
+        }
+        /* TODO */
+        /* Need to check that c isn't produced by overlong encoding */
+        /* Example: 11000000 10000000 converts to NUL */
+        /* 11110000 10000000 10000100 10000000 converts to 0x100 */
+        /* correct encoding: 11000100 10000000 */
+        if (c <= 0x7f) { /* crude check */
+                return NULL;
+                //or maybe 0xfffd; /* replacement character */
+        }
+        *res = c;
+        return src;
+}
 size_t FAST_FUNC mbstowcs(wchar_t *dest, const char *src, size_t n)
 {
        size_t org_n = n;
@@ -162,58 +202,20 @@ size_t FAST_FUNC mbstowcs(wchar_t *dest, const char *src, size_t n)
        }
        while (n) {
-                int bytes;
+                wchar_t wc;
-                unsigned c = (unsigned char) *src++;
+                const char *rc = mbstowc_internal(dest ? dest : &wc, src);
+                if (rc == NULL) /* error */
-                if (c <= 0x7f) {
-                        if (dest)
-                                *dest++ = c;
-                        if (c == '\0')
-                                break;
-                        n--;
-                        continue;
-                }
-                /* 80-7FF -> 110yyyxx 10xxxxxx */
-                /* 800-FFFF -> 1110yyyy 10yyyyxx 10xxxxxx */
-                /* 10000-1FFFFF -> 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx */
-                /* 200000-3FFFFFF -> 111110tt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */
-                /* 4000000-FFFFFFFF -> 111111tt 10tttttt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */
-                bytes = 0;
-                do {
-                        c <<= 1;
-                        bytes++;
-                } while ((c & 0x80) && bytes < 6);
-                if (bytes == 1)
                        return (size_t) -1L;
-                c = (uint8_t)(c) >> bytes;
-                while (--bytes) {
-                        unsigned ch = (unsigned char) *src++;
-                        if ((ch & 0xc0) != 0x80) {
-                                return (size_t) -1L;
-                        }
-                        c = (c << 6) + (ch & 0x3f);
-                }
-                /* TODO */
-                /* Need to check that c isn't produced by overlong encoding */
-                /* Example: 11000000 10000000 converts to NUL */
-                /* 11110000 10000000 10000100 10000000 converts to 0x100 */
-                /* correct encoding: 11000100 10000000 */
-                if (c <= 0x7f) { /* crude check */
-                        return (size_t) -1L;
-                        //or maybe: c = 0xfffd; /* replacement character */
-                }
                if (dest)
-                        *dest++ = c;
+                        dest++;
                n--;
        }
        return org_n - n;
 }
+#include "unicode_wcwidth.c"
 int FAST_FUNC iswspace(wint_t wc)
 {
        return (unsigned)wc <= 0x7f && isspace(wc);
@@ -229,4 +231,98 @@ int FAST_FUNC iswpunct(wint_t wc)
        return (unsigned)wc <= 0x7f && ispunct(wc);
 }
+#endif /* Homegrown Unicode support */
+/* The rest is mostly same for libc and for "homegrown" support */
+size_t FAST_FUNC unicode_strlen(const char *string)
+{
+        size_t width = mbstowcs(NULL, string, INT_MAX);
+        if (width == (size_t)-1L)
+                return strlen(string);
+        return width;
+}
+char* FAST_FUNC unicode_cut_nchars(unsigned width, const char *src)
+{
+        char *dst;
+        unsigned dst_len;
+        if (unicode_status != UNICODE_ON)
+                return xasprintf("%-*.*s", width, width, src);
+        dst = NULL;
+        dst_len = 0;
+        while (1) {
+                int w;
+                wchar_t wc;
+                dst = xrealloc(dst, dst_len + 2 * MB_CUR_MAX);
+#if ENABLE_LOCALE_SUPPORT
+                {
+                        mbstate_t mbst = { 0 };
+                        ssize_t rc = mbsrtowcs(&wc, &src, 1, &mbst);
+                        if (rc <= 0) /* error, or end-of-string */
+                                break;
+                }
+#else
+                src = mbstowc_internal(&wc, src);
+                if (!src || wc == 0) /* error, or end-of-string */
+                        break;
 #endif
+                w = wcwidth(wc);
+                if (w < 0) /* non-printable wchar */
+                        break;
+                width -= w;
+                if ((int)width < 0) { /* string is longer than width */
+                        width += w;
+                        while (width) {
+                                dst[dst_len++] = ' ';
+                                width--;
+                        }
+                        break;
+                }
+#if ENABLE_LOCALE_SUPPORT
+                {
+                        mbstate_t mbst = { 0 };
+                        dst_len += wcrtomb(&dst[dst_len], wc, &mbst);
+                }
+#else
+                dst_len += wcrtomb_internal(&dst[dst_len], wc);
+#endif
+        }
+        dst[dst_len] = '\0';
+        return dst;
+}
+unsigned FAST_FUNC unicode_padding_to_width(unsigned width, const char *src)
+{
+        if (unicode_status != UNICODE_ON) {
+                return width - strnlen(src, width);
+        }
+        while (1) {
+                int w;
+                wchar_t wc;
+#if ENABLE_LOCALE_SUPPORT
+                {
+                        mbstate_t mbst = { 0 };
+                        ssize_t rc = mbsrtowcs(&wc, &src, 1, &mbst);
+                        if (rc <= 0) /* error, or end-of-string */
+                                return width;
+                }
+#else
+                src = mbstowc_internal(&wc, src);
+                if (!src || wc == 0) /* error, or end-of-string */
+                        return width;
+#endif
+                w = wcwidth(wc);
+                if (w < 0) /* non-printable wchar */
+                        return width;
+                width -= w;
+                if ((int)width <= 0) /* string is longer than width */
+                        return 0;
+        }
+}
author	Denys Vlasenko <vda.linux@googlemail.com>	2010-01-24 07:44:03 +0100
committer	Denys Vlasenko <vda.linux@googlemail.com>	2010-01-24 07:44:03 +0100
commit	9f93d621925966c22ee51fdcb5def8e131596f9b (patch)
tree	c1024fa92d6f422df6cf0991d5c9c8a4977b8625 /libbb/unicode.c
parent	5da9f96ad85a2d9119d92c7a3d89deca7d904210 (diff)
download	busybox-w32-9f93d621925966c22ee51fdcb5def8e131596f9b.tar.gz busybox-w32-9f93d621925966c22ee51fdcb5def8e131596f9b.tar.bz2 busybox-w32-9f93d621925966c22ee51fdcb5def8e131596f9b.zip

diff --git a/libbb/unicode.c b/libbb/unicode.c
index 80042957c..844c154e0 100644
--- a/libbb/unicode.c
+++ b/libbb/unicode.c
@@ -9,22 +9,19 @@
9	#include "libbb.h"	9	#include "libbb.h"
10	#include "unicode.h"	10	#include "unicode.h"
11		11
12	/* If it's not a constant... */	12	/* If it's not #defined as a constant in unicode.h... */
13	#ifndef unicode_status	13	#ifndef unicode_status
14	uint8_t unicode_status;	14	uint8_t unicode_status;
15	#endif	15	#endif
16		16
17	size_t FAST_FUNC bb_mbstrlen(const char *string)	17	/* This file is compiled only if FEATURE_ASSUME_UNICODE is on.
18	{	18	* We check other options and decide whether to use libc support
19	size_t width = mbstowcs(NULL, string, INT_MAX);	19	* via locale, or use our own logic:
20	if (width == (size_t)-1L)	20	*/
21	return strlen(string);
22	return width;
23	}
24		21
25	#if ENABLE_LOCALE_SUPPORT	22	#if ENABLE_LOCALE_SUPPORT
26		23
27	/* Unicode support using libc */	24	/* Unicode support using libc locale support. */
28		25
29	void FAST_FUNC init_unicode(void)	26	void FAST_FUNC init_unicode(void)
30	{	27	{
@@ -34,12 +31,12 @@ void FAST_FUNC init_unicode(void)
34	if (unicode_status != UNICODE_UNKNOWN)	31	if (unicode_status != UNICODE_UNKNOWN)
35	return;	32	return;
36		33
37	unicode_status = bb_mbstrlen(unicode_0x394) == 1 ? UNICODE_ON : UNICODE_OFF;	34	unicode_status = unicode_strlen(unicode_0x394) == 1 ? UNICODE_ON : UNICODE_OFF;
38	}	35	}
39		36
40	#else	37	#else
41		38
42	/* Crude "locale support" which knows only C and Unicode locales */	39	/* Homegrown Unicode support. It knows only C and Unicode locales. */
43		40
44	# if ENABLE_FEATURE_CHECK_UNICODE_IN_ENV	41	# if ENABLE_FEATURE_CHECK_UNICODE_IN_ENV
45	void FAST_FUNC init_unicode(void)	42	void FAST_FUNC init_unicode(void)
@@ -93,7 +90,6 @@ static size_t wcrtomb_internal(char *s, wchar_t wc)
93	s[0] = wc | (uint8_t)(0x3f00 >> n);	90	s[0] = wc | (uint8_t)(0x3f00 >> n);
94	return n;	91	return n;
95	}	92	}
96
97	size_t FAST_FUNC wcrtomb(char *s, wchar_t wc, mbstate_t *ps UNUSED_PARAM)	93	size_t FAST_FUNC wcrtomb(char *s, wchar_t wc, mbstate_t *ps UNUSED_PARAM)
98	{	94	{
99	if (unicode_status != UNICODE_ON) {	95	if (unicode_status != UNICODE_ON) {
@@ -103,7 +99,6 @@ size_t FAST_FUNC wcrtomb(char *s, wchar_t wc, mbstate_t *ps UNUSED_PARAM)
103		99
104	return wcrtomb_internal(s, wc);	100	return wcrtomb_internal(s, wc);
105	}	101	}
106
107	size_t FAST_FUNC wcstombs(char *dest, const wchar_t *src, size_t n)	102	size_t FAST_FUNC wcstombs(char *dest, const wchar_t *src, size_t n)
108	{	103	{
109	size_t org_n = n;	104	size_t org_n = n;
@@ -144,6 +139,51 @@ size_t FAST_FUNC wcstombs(char *dest, const wchar_t *src, size_t n)
144	return org_n - n;	139	return org_n - n;
145	}	140	}
146		141
		142	static const char *mbstowc_internal(wchar_t *res, const char *src)
		143	{
		144	int bytes;
		145	unsigned c = (unsigned char) *src++;
		146
		147	if (c <= 0x7f) {
		148	*res = c;
		149	return src;
		150	}
		151
		152	/* 80-7FF -> 110yyyxx 10xxxxxx */
		153	/* 800-FFFF -> 1110yyyy 10yyyyxx 10xxxxxx */
		154	/* 10000-1FFFFF -> 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx */
		155	/* 200000-3FFFFFF -> 111110tt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */
		156	/* 4000000-FFFFFFFF -> 111111tt 10tttttt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */
		157	bytes = 0;
		158	do {
		159	c <<= 1;
		160	bytes++;
		161	} while ((c & 0x80) && bytes < 6);
		162	if (bytes == 1)
		163	return NULL;
		164	c = (uint8_t)(c) >> bytes;
		165
		166	while (--bytes) {
		167	unsigned ch = (unsigned char) *src++;
		168	if ((ch & 0xc0) != 0x80) {
		169	return NULL;
		170	}
		171	c = (c << 6) + (ch & 0x3f);
		172	}
		173
		174	/* TODO */
		175	/* Need to check that c isn't produced by overlong encoding */
		176	/* Example: 11000000 10000000 converts to NUL */
		177	/* 11110000 10000000 10000100 10000000 converts to 0x100 */
		178	/* correct encoding: 11000100 10000000 */
		179	if (c <= 0x7f) { /* crude check */
		180	return NULL;
		181	//or maybe 0xfffd; /* replacement character */
		182	}
		183
		184	*res = c;
		185	return src;
		186	}
147	size_t FAST_FUNC mbstowcs(wchar_t *dest, const char *src, size_t n)	187	size_t FAST_FUNC mbstowcs(wchar_t *dest, const char *src, size_t n)
148	{	188	{
149	size_t org_n = n;	189	size_t org_n = n;
@@ -162,58 +202,20 @@ size_t FAST_FUNC mbstowcs(wchar_t *dest, const char *src, size_t n)
162	}	202	}
163		203
164	while (n) {	204	while (n) {
165	int bytes;	205	wchar_t wc;
166	unsigned c = (unsigned char) *src++;	206	const char *rc = mbstowc_internal(dest ? dest : &wc, src);
167		207	if (rc == NULL) /* error */
168	if (c <= 0x7f) {
169	if (dest)
170	*dest++ = c;
171	if (c == '\0')
172	break;
173	n--;
174	continue;
175	}
176
177	/* 80-7FF -> 110yyyxx 10xxxxxx */
178	/* 800-FFFF -> 1110yyyy 10yyyyxx 10xxxxxx */
179	/* 10000-1FFFFF -> 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx */
180	/* 200000-3FFFFFF -> 111110tt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */
181	/* 4000000-FFFFFFFF -> 111111tt 10tttttt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */
182	bytes = 0;
183	do {
184	c <<= 1;
185	bytes++;
186	} while ((c & 0x80) && bytes < 6);
187	if (bytes == 1)
188	return (size_t) -1L;	208	return (size_t) -1L;
189	c = (uint8_t)(c) >> bytes;
190
191	while (--bytes) {
192	unsigned ch = (unsigned char) *src++;
193	if ((ch & 0xc0) != 0x80) {
194	return (size_t) -1L;
195	}
196	c = (c << 6) + (ch & 0x3f);
197	}
198
199	/* TODO */
200	/* Need to check that c isn't produced by overlong encoding */
201	/* Example: 11000000 10000000 converts to NUL */
202	/* 11110000 10000000 10000100 10000000 converts to 0x100 */
203	/* correct encoding: 11000100 10000000 */
204	if (c <= 0x7f) { /* crude check */
205	return (size_t) -1L;
206	//or maybe: c = 0xfffd; /* replacement character */
207	}
208
209	if (dest)	209	if (dest)
210	*dest++ = c;	210	dest++;
211	n--;	211	n--;
212	}	212	}
213		213
214	return org_n - n;	214	return org_n - n;
215	}	215	}
216		216
		217	#include "unicode_wcwidth.c"
		218
217	int FAST_FUNC iswspace(wint_t wc)	219	int FAST_FUNC iswspace(wint_t wc)
218	{	220	{
219	return (unsigned)wc <= 0x7f && isspace(wc);	221	return (unsigned)wc <= 0x7f && isspace(wc);
@@ -229,4 +231,98 @@ int FAST_FUNC iswpunct(wint_t wc)
229	return (unsigned)wc <= 0x7f && ispunct(wc);	231	return (unsigned)wc <= 0x7f && ispunct(wc);
230	}	232	}
231		233
		234	#endif /* Homegrown Unicode support */
		235
		236
		237	/* The rest is mostly same for libc and for "homegrown" support */
		238
		239	size_t FAST_FUNC unicode_strlen(const char *string)
		240	{
		241	size_t width = mbstowcs(NULL, string, INT_MAX);
		242	if (width == (size_t)-1L)
		243	return strlen(string);
		244	return width;
		245	}
		246
		247	char* FAST_FUNC unicode_cut_nchars(unsigned width, const char *src)
		248	{
		249	char *dst;
		250	unsigned dst_len;
		251
		252	if (unicode_status != UNICODE_ON)
		253	return xasprintf("%-*.*s", width, width, src);
		254
		255	dst = NULL;
		256	dst_len = 0;
		257	while (1) {
		258	int w;
		259	wchar_t wc;
		260
		261	dst = xrealloc(dst, dst_len + 2 * MB_CUR_MAX);
		262	#if ENABLE_LOCALE_SUPPORT
		263	{
		264	mbstate_t mbst = { 0 };
		265	ssize_t rc = mbsrtowcs(&wc, &src, 1, &mbst);
		266	if (rc <= 0) /* error, or end-of-string */
		267	break;
		268	}
		269	#else
		270	src = mbstowc_internal(&wc, src);
		271	if (!src || wc == 0) /* error, or end-of-string */
		272	break;
232	#endif	273	#endif
		274	w = wcwidth(wc);
		275	if (w < 0) /* non-printable wchar */
		276	break;
		277	width -= w;
		278	if ((int)width < 0) { /* string is longer than width */
		279	width += w;
		280	while (width) {
		281	dst[dst_len++] = ' ';
		282	width--;
		283	}
		284	break;
		285	}
		286	#if ENABLE_LOCALE_SUPPORT
		287	{
		288	mbstate_t mbst = { 0 };
		289	dst_len += wcrtomb(&dst[dst_len], wc, &mbst);
		290	}
		291	#else
		292	dst_len += wcrtomb_internal(&dst[dst_len], wc);
		293	#endif
		294	}
		295	dst[dst_len] = '\0';
		296	return dst;
		297	}
		298
		299	unsigned FAST_FUNC unicode_padding_to_width(unsigned width, const char *src)
		300	{
		301	if (unicode_status != UNICODE_ON) {
		302	return width - strnlen(src, width);
		303	}
		304
		305	while (1) {
		306	int w;
		307	wchar_t wc;
		308
		309	#if ENABLE_LOCALE_SUPPORT
		310	{
		311	mbstate_t mbst = { 0 };
		312	ssize_t rc = mbsrtowcs(&wc, &src, 1, &mbst);
		313	if (rc <= 0) /* error, or end-of-string */
		314	return width;
		315	}
		316	#else
		317	src = mbstowc_internal(&wc, src);
		318	if (!src \|\| wc == 0) /* error, or end-of-string */
		319	return width;
		320	#endif
		321	w = wcwidth(wc);
		322	if (w < 0) /* non-printable wchar */
		323	return width;
		324	width -= w;
		325	if ((int)width <= 0) /* string is longer than width */
		326	return 0;
		327	}
		328	}