aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Denys Vlasenko <vda.linux@googlemail.com> 2009-07-11 22:26:48 +0200
committer Denys Vlasenko <vda.linux@googlemail.com> 2009-07-11 22:26:48 +0200
commit fda8f57360aaf24dba3784aae4818f5a351f5c7d (patch)
tree b8de1ad1663efc5270ae2e5e1067d84a1530474f
parent 42a8fd0db08ab8b45fec6eab4af841f99576b260 (diff)
download busybox-w32-fda8f57360aaf24dba3784aae4818f5a351f5c7d.tar.gz
busybox-w32-fda8f57360aaf24dba3784aae4818f5a351f5c7d.tar.bz2
busybox-w32-fda8f57360aaf24dba3784aae4818f5a351f5c7d.zip
tweaking Unicode support
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r-- coreutils/ls.c 20
-rw-r--r-- include/unicode.h 9
-rw-r--r-- libbb/unicode.c 107
3 files changed, 57 insertions, 79 deletions
diff --git a/coreutils/ls.c b/coreutils/ls.c
index 20b979db6..827b35089 100644
--- a/coreutils/ls.c
+++ b/coreutils/ls.c
@@ -289,20 +289,6 @@ enum {
289} while (0) 289} while (0)
290 290
291 291
292#if ENABLE_FEATURE_ASSUME_UNICODE
293/* libbb candidate */
294static size_t mbstrlen(const char *string)
295{
296 size_t width = mbstowcs(NULL, string, INT_MAX);
297 if (width == (size_t)-1L)
298 return strlen(string);
299 return width;
300}
301#else
302#define mbstrlen(string) strlen(string)
303#endif
304
305
306static struct dnode *my_stat(const char *fullname, const char *name, int force_follow) 292static struct dnode *my_stat(const char *fullname, const char *name, int force_follow)
307{ 293{
308 struct stat dstat; 294 struct stat dstat;
@@ -570,7 +556,7 @@ static void showfiles(struct dnode **dn, int nfiles)
570 } else { 556 } else {
571 /* find the longest file name, use that as the column width */ 557 /* find the longest file name, use that as the column width */
572 for (i = 0; i < nfiles; i++) { 558 for (i = 0; i < nfiles; i++) {
573 int len = mbstrlen(dn[i]->name); 559 int len = bb_mbstrlen(dn[i]->name);
574 if (column_width < len) 560 if (column_width < len)
575 column_width = len; 561 column_width = len;
576 } 562 }
@@ -717,7 +703,7 @@ static int print_name(const char *name)
717{ 703{
718 if (option_mask32 & OPT_Q) { 704 if (option_mask32 & OPT_Q) {
719#if ENABLE_FEATURE_ASSUME_UNICODE 705#if ENABLE_FEATURE_ASSUME_UNICODE
720 int len = 2 + mbstrlen(name); 706 int len = 2 + bb_mbstrlen(name);
721#else 707#else
722 int len = 2; 708 int len = 2;
723#endif 709#endif
@@ -737,7 +723,7 @@ static int print_name(const char *name)
737 /* No -Q: */ 723 /* No -Q: */
738#if ENABLE_FEATURE_ASSUME_UNICODE 724#if ENABLE_FEATURE_ASSUME_UNICODE
739 fputs(name, stdout); 725 fputs(name, stdout);
740 return mbstrlen(name); 726 return bb_mbstrlen(name);
741#else 727#else
742 return printf("%s", name); 728 return printf("%s", name);
743#endif 729#endif
diff --git a/include/unicode.h b/include/unicode.h
index be64a50e2..e0061478d 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -7,10 +7,13 @@
7 7
8#if !ENABLE_FEATURE_ASSUME_UNICODE 8#if !ENABLE_FEATURE_ASSUME_UNICODE
9 9
10# define bb_mbstrlen(string) strlen(string)
10# define check_unicode_in_env() ((void)0) 11# define check_unicode_in_env() ((void)0)
11 12
12#else 13#else
13 14
15size_t bb_mbstrlen(const char *string) FAST_FUNC;
16
14# if ENABLE_LOCALE_SUPPORT 17# if ENABLE_LOCALE_SUPPORT
15 18
16# include <wchar.h> 19# include <wchar.h>
@@ -19,6 +22,8 @@
19 22
20# else 23# else
21 24
25/* Crude "locale support" which knows only C and Unicode locales */
26
22# if !ENABLE_FEATURE_CHECK_UNICODE_IN_ENV 27# if !ENABLE_FEATURE_CHECK_UNICODE_IN_ENV
23# define check_unicode_in_env() ((void)0) 28# define check_unicode_in_env() ((void)0)
24# else 29# else
@@ -50,8 +55,8 @@ int iswspace(wint_t wc) FAST_FUNC;
50int iswalnum(wint_t wc) FAST_FUNC; 55int iswalnum(wint_t wc) FAST_FUNC;
51int iswpunct(wint_t wc) FAST_FUNC; 56int iswpunct(wint_t wc) FAST_FUNC;
52 57
53# endif 58# endif /* !LOCALE_SUPPORT */
54 59
55#endif 60#endif /* FEATURE_ASSUME_UNICODE */
56 61
57#endif 62#endif
diff --git a/libbb/unicode.c b/libbb/unicode.c
index a99f5ede1..b977437ef 100644
--- a/libbb/unicode.c
+++ b/libbb/unicode.c
@@ -7,13 +7,22 @@
7 * Licensed under GPL version 2, see file LICENSE in this tarball for details. 7 * Licensed under GPL version 2, see file LICENSE in this tarball for details.
8 */ 8 */
9#include "libbb.h" 9#include "libbb.h"
10# include "unicode.h"
10 11
11/* if LOCALE_SUPPORT, libc locale stuff takes care of it, else: */ 12size_t FAST_FUNC bb_mbstrlen(const char *string)
13{
14 size_t width = mbstowcs(NULL, string, INT_MAX);
15 if (width == (size_t)-1L)
16 return strlen(string);
17 return width;
18}
12 19
13#if !ENABLE_LOCALE_SUPPORT 20#if !ENABLE_LOCALE_SUPPORT
14#include "unicode.h"
15 21
16/* 0: not known yet, 22/* Crude "locale support" which knows only C and Unicode locales */
23
24/* unicode_is_enabled:
25 * 0: not known yet,
17 * 1: not unicode (IOW: assuming one char == one byte) 26 * 1: not unicode (IOW: assuming one char == one byte)
18 * 2: unicode 27 * 2: unicode
19 */ 28 */
@@ -39,6 +48,7 @@ void FAST_FUNC check_unicode_in_env(void)
39 48
40static size_t wcrtomb_internal(char *s, wchar_t wc) 49static size_t wcrtomb_internal(char *s, wchar_t wc)
41{ 50{
51 int n;
42 uint32_t v = wc; 52 uint32_t v = wc;
43 53
44 if (v <= 0x7f) { 54 if (v <= 0x7f) {
@@ -46,65 +56,38 @@ static size_t wcrtomb_internal(char *s, wchar_t wc)
46 return 1; 56 return 1;
47 } 57 }
48 58
49 /* 80-7FF -> 110yyyxx 10xxxxxx */ 59 /* RFC 3629 says that Unicode ends at 10FFFF,
50 if (v <= 0x7ff) { 60 * but we cover entire 32 bits */
51 s[1] = (v & 0x3f) | 0x80;
52 v >>= 6;
53 s[0] = v | 0xc0;
54 return 2;
55 }
56 61
57 /* 800-FFFF -> 1110yyyy 10yyyyxx 10xxxxxx */ 62 n = 2;
58 if (v <= 0xffff) { 63 /* 4000000-FFFFFFFF -> 111111tt 10tttttt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */
59 s[2] = (v & 0x3f) | 0x80; 64 if (v >= 0x4000000) {
60 v >>= 6; 65 s[5] = (wc & 0x3f) | 0x80;
61 s[1] = (v & 0x3f) | 0x80; 66 wc = (uint32_t)wc >> 6; /* ensuring that high bits are 0 */
62 v >>= 6; 67 n++;
63 s[0] = v | 0xe0; 68 }
64 return 3; 69 /* 200000-3FFFFFF -> 111110tt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */
70 if (v >= 0x200000) {
71 s[4] = (wc & 0x3f) | 0x80;
72 wc >>= 6;
73 n++;
65 } 74 }
66
67 /* RFC 3629 says that Unicode ends at 10FFFF */
68
69 /* 10000-1FFFFF -> 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx */ 75 /* 10000-1FFFFF -> 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx */
70 if (v <= 0x1fffff) { 76 if (v >= 0x10000) {
71 s[3] = (v & 0x3f) | 0x80; 77 s[3] = (wc & 0x3f) | 0x80;
72 v >>= 6; 78 wc >>= 6;
73 s[2] = (v & 0x3f) | 0x80; 79 n++;
74 v >>= 6;
75 s[1] = (v & 0x3f) | 0x80;
76 v >>= 6;
77 s[0] = v | 0xf0;
78 return 4;
79 } 80 }
80 81 /* 800-FFFF -> 1110yyyy 10yyyyxx 10xxxxxx */
81 /* 200000-3FFFFFF -> 111110tt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */ 82 if (v >= 0x800) {
82 if (v <= 0x3ffffff) { 83 s[2] = (wc & 0x3f) | 0x80;
83 s[4] = (v & 0x3f) | 0x80; 84 wc >>= 6;
84 v >>= 6; 85 n++;
85 s[3] = (v & 0x3f) | 0x80;
86 v >>= 6;
87 s[2] = (v & 0x3f) | 0x80;
88 v >>= 6;
89 s[1] = (v & 0x3f) | 0x80;
90 v >>= 6;
91 s[0] = v | 0xf8;
92 return 5;
93 } 86 }
94 87 s[1] = (wc & 0x3f) | 0x80;
95 /* 4000000-FFFFFFFF -> 111111tt 10tttttt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */ 88 wc >>= 6;
96 s[5] = (v & 0x3f) | 0x80; 89 s[0] = wc | (uint8_t)(0x3f00 >> n);
97 v >>= 6; 90 return n;
98 s[4] = (v & 0x3f) | 0x80;
99 v >>= 6;
100 s[3] = (v & 0x3f) | 0x80;
101 v >>= 6;
102 s[2] = (v & 0x3f) | 0x80;
103 v >>= 6;
104 s[1] = (v & 0x3f) | 0x80;
105 v >>= 6;
106 s[0] = v | 0xfc;
107 return 6;
108} 91}
109 92
110size_t FAST_FUNC wcrtomb(char *s, wchar_t wc, mbstate_t *ps UNUSED_PARAM) 93size_t FAST_FUNC wcrtomb(char *s, wchar_t wc, mbstate_t *ps UNUSED_PARAM)
@@ -164,7 +147,9 @@ size_t FAST_FUNC mbstowcs(wchar_t *dest, const char *src, size_t n)
164 if (unicode_is_enabled != 2) { 147 if (unicode_is_enabled != 2) {
165 while (n) { 148 while (n) {
166 unsigned char c = *src++; 149 unsigned char c = *src++;
167 *dest++ = c; 150
151 if (dest)
152 *dest++ = c;
168 if (c == 0) 153 if (c == 0)
169 break; 154 break;
170 n--; 155 n--;
@@ -177,7 +162,8 @@ size_t FAST_FUNC mbstowcs(wchar_t *dest, const char *src, size_t n)
177 unsigned c = (unsigned char) *src++; 162 unsigned c = (unsigned char) *src++;
178 163
179 if (c <= 0x7f) { 164 if (c <= 0x7f) {
180 *dest++ = c; 165 if (dest)
166 *dest++ = c;
181 if (c == '\0') 167 if (c == '\0')
182 break; 168 break;
183 n--; 169 n--;
@@ -216,7 +202,8 @@ size_t FAST_FUNC mbstowcs(wchar_t *dest, const char *src, size_t n)
216 //or maybe: c = 0xfffd; /* replacement character */ 202 //or maybe: c = 0xfffd; /* replacement character */
217 } 203 }
218 204
219 *dest++ = c; 205 if (dest)
206 *dest++ = c;
220 n--; 207 n--;
221 } 208 }
222 209