diff options
author | Denys Vlasenko <vda.linux@googlemail.com> | 2009-07-11 21:36:13 +0200 |
---|---|---|
committer | Denys Vlasenko <vda.linux@googlemail.com> | 2009-07-11 21:36:13 +0200 |
commit | 42a8fd0db08ab8b45fec6eab4af841f99576b260 (patch) | |
tree | 55f0600298da0c83c638c985d0c8b6d803be926b | |
parent | 883cea47518a171ab83f8e41def3aec92207519e (diff) | |
download | busybox-w32-42a8fd0db08ab8b45fec6eab4af841f99576b260.tar.gz busybox-w32-42a8fd0db08ab8b45fec6eab4af841f99576b260.tar.bz2 busybox-w32-42a8fd0db08ab8b45fec6eab4af841f99576b260.zip |
added simplified Unicode support for non-locale-enabled builds
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r-- | Config.in | 34 | ||||
-rw-r--r-- | coreutils/ls.c | 12 | ||||
-rw-r--r-- | include/unicode.h | 57 | ||||
-rw-r--r-- | libbb/Kbuild | 2 | ||||
-rw-r--r-- | libbb/lineedit.c | 9 | ||||
-rw-r--r-- | libbb/unicode.c | 241 |
6 files changed, 331 insertions, 24 deletions
@@ -30,18 +30,6 @@ config EXTRA_COMPAT | |||
30 | some GNU extensions in libc. You probably only need this option | 30 | some GNU extensions in libc. You probably only need this option |
31 | if you plan to run busybox on desktop. | 31 | if you plan to run busybox on desktop. |
32 | 32 | ||
33 | config FEATURE_ASSUME_UNICODE | ||
34 | bool "Assume that 1:1 char/glyph correspondence is not true" | ||
35 | default n | ||
36 | help | ||
37 | This makes various applets aware that one byte is not | ||
38 | one character on screen. | ||
39 | |||
40 | Busybox aims to eventually work correctly with Unicode displays. | ||
41 | Any older encodings are not guaranteed to work. | ||
42 | Probably by the time when busybox will be fully Unicode-clean, | ||
43 | other encodings will be mainly of historic interest. | ||
44 | |||
45 | choice | 33 | choice |
46 | prompt "Buffer allocation policy" | 34 | prompt "Buffer allocation policy" |
47 | default FEATURE_BUFFERS_USE_MALLOC | 35 | default FEATURE_BUFFERS_USE_MALLOC |
@@ -114,6 +102,28 @@ config LOCALE_SUPPORT | |||
114 | Enable this if your system has locale support and you would like | 102 | Enable this if your system has locale support and you would like |
115 | busybox to support locale settings. | 103 | busybox to support locale settings. |
116 | 104 | ||
105 | config FEATURE_ASSUME_UNICODE | ||
106 | bool "Support Unicode" | ||
107 | default n | ||
108 | help | ||
109 | This makes various applets aware that one byte is not | ||
110 | one character on screen. | ||
111 | |||
112 | Busybox aims to eventually work correctly with Unicode displays. | ||
113 | Any older encodings are not guaranteed to work. | ||
114 | Probably by the time when busybox will be fully Unicode-clean, | ||
115 | other encodings will be mainly of historic interest. | ||
116 | |||
117 | config FEATURE_CHECK_UNICODE_IN_ENV | ||
118 | bool "Check $LANG environment variable" | ||
119 | default y | ||
120 | depends on FEATURE_ASSUME_UNICODE && !LOCALE_SUPPORT | ||
121 | help | ||
122 | With this option on, Unicode support is activated | ||
123 | only if LANG variable has the value of the form "xxxx.utf8" | ||
124 | |||
125 | Otherwise, Unicode support will be always enabled and active. | ||
126 | |||
117 | config LONG_OPTS | 127 | config LONG_OPTS |
118 | bool "Support for --long-options" | 128 | bool "Support for --long-options" |
119 | default y | 129 | default y |
diff --git a/coreutils/ls.c b/coreutils/ls.c index 8a6faf23f..20b979db6 100644 --- a/coreutils/ls.c +++ b/coreutils/ls.c | |||
@@ -30,12 +30,9 @@ | |||
30 | * [2009-03] | 30 | * [2009-03] |
31 | * ls sorts listing now, and supports almost all options. | 31 | * ls sorts listing now, and supports almost all options. |
32 | */ | 32 | */ |
33 | |||
34 | #include "libbb.h" | 33 | #include "libbb.h" |
34 | #include "unicode.h" | ||
35 | 35 | ||
36 | #if ENABLE_FEATURE_ASSUME_UNICODE | ||
37 | #include <wchar.h> | ||
38 | #endif | ||
39 | 36 | ||
40 | /* This is a NOEXEC applet. Be very careful! */ | 37 | /* This is a NOEXEC applet. Be very careful! */ |
41 | 38 | ||
@@ -296,9 +293,8 @@ enum { | |||
296 | /* libbb candidate */ | 293 | /* libbb candidate */ |
297 | static size_t mbstrlen(const char *string) | 294 | static size_t mbstrlen(const char *string) |
298 | { | 295 | { |
299 | size_t width = mbsrtowcs(NULL /*dest*/, &string, | 296 | size_t width = mbstowcs(NULL, string, INT_MAX); |
300 | MAXINT(size_t) /*len*/, NULL /*state*/); | 297 | if (width == (size_t)-1L) |
301 | if (width == (size_t)-1) | ||
302 | return strlen(string); | 298 | return strlen(string); |
303 | return width; | 299 | return width; |
304 | } | 300 | } |
@@ -932,6 +928,8 @@ int ls_main(int argc UNUSED_PARAM, char **argv) | |||
932 | 928 | ||
933 | INIT_G(); | 929 | INIT_G(); |
934 | 930 | ||
931 | check_unicode_in_env(); | ||
932 | |||
935 | all_fmt = LIST_SHORT | | 933 | all_fmt = LIST_SHORT | |
936 | (ENABLE_FEATURE_LS_SORTFILES * (SORT_NAME | SORT_FORWARD)); | 934 | (ENABLE_FEATURE_LS_SORTFILES * (SORT_NAME | SORT_FORWARD)); |
937 | 935 | ||
diff --git a/include/unicode.h b/include/unicode.h new file mode 100644 index 000000000..be64a50e2 --- /dev/null +++ b/include/unicode.h | |||
@@ -0,0 +1,57 @@ | |||
1 | /* vi: set sw=4 ts=4: */ | ||
2 | /* | ||
3 | * Licensed under the GPL version 2, see the file LICENSE in this tarball. | ||
4 | */ | ||
/*
 * Unicode glue for applets.
 *
 * Three configurations are handled here:
 *  - FEATURE_ASSUME_UNICODE off: only a no-op check_unicode_in_env()
 *    is provided.
 *  - FEATURE_ASSUME_UNICODE on, LOCALE_SUPPORT on: the libc wide-char
 *    headers are pulled in and check_unicode_in_env() is a no-op.
 *  - FEATURE_ASSUME_UNICODE on, LOCALE_SUPPORT off: a small set of
 *    wide-char routines is declared under bb_-prefixed names.
 */
#ifndef UNICODE_H
#define UNICODE_H 1

#if !ENABLE_FEATURE_ASSUME_UNICODE

/* No Unicode support at all: callers may still call this unconditionally */
# define check_unicode_in_env() ((void)0)

#else

# if ENABLE_LOCALE_SUPPORT

/* libc provides the wide-char API; nothing to detect on our side */
# include <wchar.h>
# include <wctype.h>
# define check_unicode_in_env() ((void)0)

# else

/* Without the env check, the call expands to nothing;
 * with it, a real function is declared. */
# if !ENABLE_FEATURE_CHECK_UNICODE_IN_ENV
# define check_unicode_in_env() ((void)0)
# else
void check_unicode_in_env(void) FAST_FUNC;
# endif

/* Override any earlier MB_CUR_MAX with a fixed constant */
# undef MB_CUR_MAX
# define MB_CUR_MAX 6

/* Prevent name collisions */
/* (each standard name used below is remapped to a bb_ prefixed one) */
# define wint_t bb_wint_t
# define mbstate_t bb_mbstate_t
# define mbstowcs bb_mbstowcs
# define wcstombs bb_wcstombs
# define wcrtomb bb_wcrtomb
# define iswspace bb_iswspace
# define iswalnum bb_iswalnum
# define iswpunct bb_iswpunct

typedef int32_t wint_t;
/* Placeholder type: it has a single dummy member */
typedef struct {
char bogus;
} mbstate_t;

/* Because of the defines above, these declare bb_mbstowcs() etc. */
size_t mbstowcs(wchar_t *dest, const char *src, size_t n) FAST_FUNC;
size_t wcstombs(char *dest, const wchar_t *src, size_t n) FAST_FUNC;
size_t wcrtomb(char *s, wchar_t wc, mbstate_t *ps) FAST_FUNC;
int iswspace(wint_t wc) FAST_FUNC;
int iswalnum(wint_t wc) FAST_FUNC;
int iswpunct(wint_t wc) FAST_FUNC;

# endif

#endif

#endif
diff --git a/libbb/Kbuild b/libbb/Kbuild index 70dc48dcb..efd04e322 100644 --- a/libbb/Kbuild +++ b/libbb/Kbuild | |||
@@ -139,6 +139,8 @@ lib-$(CONFIG_HWCLOCK) += rtc.o | |||
139 | lib-$(CONFIG_RTCWAKE) += rtc.o | 139 | lib-$(CONFIG_RTCWAKE) += rtc.o |
140 | lib-$(CONFIG_FEATURE_CHECK_NAMES) += die_if_bad_username.o | 140 | lib-$(CONFIG_FEATURE_CHECK_NAMES) += die_if_bad_username.o |
141 | 141 | ||
142 | lib-$(CONFIG_FEATURE_ASSUME_UNICODE) += unicode.o | ||
143 | |||
142 | # We shouldn't build xregcomp.c if we don't need it - this ensures we don't | 144 | # We shouldn't build xregcomp.c if we don't need it - this ensures we don't |
143 | # require regex.h to be in the include dir even if we don't need it thereby | 145 | # require regex.h to be in the include dir even if we don't need it thereby |
144 | # allowing us to build busybox even if uclibc regex support is disabled. | 146 | # allowing us to build busybox even if uclibc regex support is disabled. |
diff --git a/libbb/lineedit.c b/libbb/lineedit.c index e5d0c1b6c..ab3297220 100644 --- a/libbb/lineedit.c +++ b/libbb/lineedit.c | |||
@@ -34,10 +34,7 @@ | |||
34 | * PS1='\[\033[01;32m\]\u@\h\[\033[01;34m\] \w \$\[\033[00m\] ' | 34 | * PS1='\[\033[01;32m\]\u@\h\[\033[01;34m\] \w \$\[\033[00m\] ' |
35 | */ | 35 | */ |
36 | #include "libbb.h" | 36 | #include "libbb.h" |
37 | #if ENABLE_FEATURE_ASSUME_UNICODE | 37 | #include "unicode.h" |
38 | # include <wchar.h> | ||
39 | # include <wctype.h> | ||
40 | #endif | ||
41 | 38 | ||
42 | /* FIXME: obsolete CONFIG item? */ | 39 | /* FIXME: obsolete CONFIG item? */ |
43 | #define ENABLE_FEATURE_NONPRINTABLE_INVERSE_PUT 0 | 40 | #define ENABLE_FEATURE_NONPRINTABLE_INVERSE_PUT 0 |
@@ -1581,7 +1578,7 @@ static int lineedit_read_key(char *read_key_buffer) | |||
1581 | return ic; | 1578 | return ic; |
1582 | unicode_buf[unicode_idx++] = ic; | 1579 | unicode_buf[unicode_idx++] = ic; |
1583 | unicode_buf[unicode_idx] = '\0'; | 1580 | unicode_buf[unicode_idx] = '\0'; |
1584 | if (mbstowcs(&wc, unicode_buf, 1) < 1 && unicode_idx < MB_CUR_MAX) { | 1581 | if (mbstowcs(&wc, unicode_buf, 1) != 1 && unicode_idx < MB_CUR_MAX) { |
1585 | delay = 50; | 1582 | delay = 50; |
1586 | goto poll_again; | 1583 | goto poll_again; |
1587 | } | 1584 | } |
@@ -1636,6 +1633,8 @@ int FAST_FUNC read_line_input(const char *prompt, char *command, int maxsize, li | |||
1636 | return len; | 1633 | return len; |
1637 | } | 1634 | } |
1638 | 1635 | ||
1636 | check_unicode_in_env(); | ||
1637 | |||
1639 | // FIXME: audit & improve this | 1638 | // FIXME: audit & improve this |
1640 | if (maxsize > MAX_LINELEN) | 1639 | if (maxsize > MAX_LINELEN) |
1641 | maxsize = MAX_LINELEN; | 1640 | maxsize = MAX_LINELEN; |
diff --git a/libbb/unicode.c b/libbb/unicode.c new file mode 100644 index 000000000..a99f5ede1 --- /dev/null +++ b/libbb/unicode.c | |||
@@ -0,0 +1,241 @@ | |||
1 | /* vi: set sw=4 ts=4: */ | ||
2 | /* | ||
3 | * Unicode support routines. | ||
4 | * | ||
5 | * Copyright (C) 2008 Denys Vlasenko | ||
6 | * | ||
7 | * Licensed under GPL version 2, see file LICENSE in this tarball for details. | ||
8 | */ | ||
9 | #include "libbb.h" | ||
10 | |||
11 | /* if LOCALE_SUPPORT, libc locale stuff takes care of it, else: */ | ||
12 | |||
13 | #if !ENABLE_LOCALE_SUPPORT | ||
14 | #include "unicode.h" | ||
15 | |||
16 | /* 0: not known yet, | ||
17 | * 1: not unicode (IOW: assuming one char == one byte) | ||
18 | * 2: unicode | ||
19 | */ | ||
20 | # if !ENABLE_FEATURE_CHECK_UNICODE_IN_ENV | ||
21 | # define unicode_is_enabled 2 | ||
22 | # else | ||
23 | static smallint unicode_is_enabled; | ||
24 | void FAST_FUNC check_unicode_in_env(void) | ||
25 | { | ||
26 | char *lang; | ||
27 | |||
28 | if (unicode_is_enabled) | ||
29 | return; | ||
30 | unicode_is_enabled = 1; | ||
31 | |||
32 | lang = getenv("LANG"); | ||
33 | if (!lang || !strstr(lang, ".utf8")) | ||
34 | return; | ||
35 | |||
36 | unicode_is_enabled = 2; | ||
37 | } | ||
38 | # endif | ||
39 | |||
/*
 * Encode one wide character as UTF-8 into s[] and return the number
 * of bytes written (1..6). No NUL terminator is appended.
 * The caller must provide room for up to 6 bytes.
 *
 * Length is chosen by value range:
 *   0-7F             -> 1 byte
 *   80-7FF           -> 2 bytes, lead 110xxxxx
 *   800-FFFF         -> 3 bytes, lead 1110xxxx
 *   10000-1FFFFF     -> 4 bytes, lead 11110xxx
 *   200000-3FFFFFF   -> 5 bytes, lead 111110xx
 *   4000000-FFFFFFFF -> 6 bytes, lead 111111xx
 * (RFC 3629 says that Unicode ends at 10FFFF, but larger values
 * are still encoded here.)
 */
static size_t wcrtomb_internal(char *s, wchar_t wc)
{
	uint32_t code = wc;
	unsigned lead;
	size_t len;
	size_t pos;

	if (code <= 0x7f) {
		s[0] = code;
		return 1;
	}

	if (code <= 0x7ff) {
		len = 2;
		lead = 0xc0;
	} else if (code <= 0xffff) {
		len = 3;
		lead = 0xe0;
	} else if (code <= 0x1fffff) {
		len = 4;
		lead = 0xf0;
	} else if (code <= 0x3ffffff) {
		len = 5;
		lead = 0xf8;
	} else {
		len = 6;
		lead = 0xfc;
	}

	/* Continuation bytes (10xxxxxx) are filled from the last one backwards */
	for (pos = len - 1; pos > 0; pos--) {
		s[pos] = (code & 0x3f) | 0x80;
		code >>= 6;
	}
	/* Whatever bits remain go into the lead byte */
	s[0] = code | lead;

	return len;
}
109 | |||
110 | size_t FAST_FUNC wcrtomb(char *s, wchar_t wc, mbstate_t *ps UNUSED_PARAM) | ||
111 | { | ||
112 | if (unicode_is_enabled != 2) { | ||
113 | *s = wc; | ||
114 | return 1; | ||
115 | } | ||
116 | |||
117 | return wcrtomb_internal(s, wc); | ||
118 | } | ||
119 | |||
120 | size_t FAST_FUNC wcstombs(char *dest, const wchar_t *src, size_t n) | ||
121 | { | ||
122 | size_t org_n = n; | ||
123 | |||
124 | if (unicode_is_enabled != 2) { | ||
125 | while (n) { | ||
126 | wchar_t c = *src++; | ||
127 | *dest++ = c; | ||
128 | if (c == 0) | ||
129 | break; | ||
130 | n--; | ||
131 | } | ||
132 | return org_n - n; | ||
133 | } | ||
134 | |||
135 | while (n >= MB_CUR_MAX) { | ||
136 | wchar_t wc = *src++; | ||
137 | size_t len = wcrtomb_internal(dest, wc); | ||
138 | |||
139 | if (wc == L'\0') | ||
140 | return org_n - n; | ||
141 | dest += len; | ||
142 | n -= len; | ||
143 | } | ||
144 | while (n) { | ||
145 | char tbuf[MB_CUR_MAX]; | ||
146 | wchar_t wc = *src++; | ||
147 | size_t len = wcrtomb_internal(tbuf, wc); | ||
148 | |||
149 | if (len > n) | ||
150 | len = n; | ||
151 | memcpy(dest, tbuf, len); | ||
152 | if (wc == L'\0') | ||
153 | return org_n - n; | ||
154 | dest += len; | ||
155 | n -= len; | ||
156 | } | ||
157 | return org_n - n; | ||
158 | } | ||
159 | |||
160 | size_t FAST_FUNC mbstowcs(wchar_t *dest, const char *src, size_t n) | ||
161 | { | ||
162 | size_t org_n = n; | ||
163 | |||
164 | if (unicode_is_enabled != 2) { | ||
165 | while (n) { | ||
166 | unsigned char c = *src++; | ||
167 | *dest++ = c; | ||
168 | if (c == 0) | ||
169 | break; | ||
170 | n--; | ||
171 | } | ||
172 | return org_n - n; | ||
173 | } | ||
174 | |||
175 | while (n) { | ||
176 | int bytes; | ||
177 | unsigned c = (unsigned char) *src++; | ||
178 | |||
179 | if (c <= 0x7f) { | ||
180 | *dest++ = c; | ||
181 | if (c == '\0') | ||
182 | break; | ||
183 | n--; | ||
184 | continue; | ||
185 | } | ||
186 | |||
187 | /* 80-7FF -> 110yyyxx 10xxxxxx */ | ||
188 | /* 800-FFFF -> 1110yyyy 10yyyyxx 10xxxxxx */ | ||
189 | /* 10000-1FFFFF -> 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx */ | ||
190 | /* 200000-3FFFFFF -> 111110tt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */ | ||
191 | /* 4000000-FFFFFFFF -> 111111tt 10tttttt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */ | ||
192 | bytes = 0; | ||
193 | do { | ||
194 | c <<= 1; | ||
195 | bytes++; | ||
196 | } while ((c & 0x80) && bytes < 6); | ||
197 | if (bytes == 1) | ||
198 | return (size_t) -1L; | ||
199 | c = (uint8_t)(c) >> bytes; | ||
200 | |||
201 | while (--bytes) { | ||
202 | unsigned ch = (unsigned char) *src++; | ||
203 | if ((ch & 0xc0) != 0x80) { | ||
204 | return (size_t) -1L; | ||
205 | } | ||
206 | c = (c << 6) + (ch & 0x3f); | ||
207 | } | ||
208 | |||
209 | /* TODO */ | ||
210 | /* Need to check that c isn't produced by overlong encoding */ | ||
211 | /* Example: 11000000 10000000 converts to NUL */ | ||
212 | /* 11110000 10000000 10000100 10000000 converts to 0x100 */ | ||
213 | /* correct encoding: 11000100 10000000 */ | ||
214 | if (c <= 0x7f) { /* crude check */ | ||
215 | return (size_t) -1L; | ||
216 | //or maybe: c = 0xfffd; /* replacement character */ | ||
217 | } | ||
218 | |||
219 | *dest++ = c; | ||
220 | n--; | ||
221 | } | ||
222 | |||
223 | return org_n - n; | ||
224 | } | ||
225 | |||
226 | int FAST_FUNC iswspace(wint_t wc) | ||
227 | { | ||
228 | return (unsigned)wc <= 0x7f && isspace(wc); | ||
229 | } | ||
230 | |||
231 | int FAST_FUNC iswalnum(wint_t wc) | ||
232 | { | ||
233 | return (unsigned)wc <= 0x7f && isalnum(wc); | ||
234 | } | ||
235 | |||
236 | int FAST_FUNC iswpunct(wint_t wc) | ||
237 | { | ||
238 | return (unsigned)wc <= 0x7f && ispunct(wc); | ||
239 | } | ||
240 | |||
241 | #endif | ||