aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDenys Vlasenko <vda.linux@googlemail.com>2009-07-11 21:36:13 +0200
committerDenys Vlasenko <vda.linux@googlemail.com>2009-07-11 21:36:13 +0200
commit42a8fd0db08ab8b45fec6eab4af841f99576b260 (patch)
tree55f0600298da0c83c638c985d0c8b6d803be926b
parent883cea47518a171ab83f8e41def3aec92207519e (diff)
downloadbusybox-w32-42a8fd0db08ab8b45fec6eab4af841f99576b260.tar.gz
busybox-w32-42a8fd0db08ab8b45fec6eab4af841f99576b260.tar.bz2
busybox-w32-42a8fd0db08ab8b45fec6eab4af841f99576b260.zip
added simplified Unicode support for non-locale-enabled builds
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r--Config.in34
-rw-r--r--coreutils/ls.c12
-rw-r--r--include/unicode.h57
-rw-r--r--libbb/Kbuild2
-rw-r--r--libbb/lineedit.c9
-rw-r--r--libbb/unicode.c241
6 files changed, 331 insertions, 24 deletions
diff --git a/Config.in b/Config.in
index ee706eb0d..99f814e5a 100644
--- a/Config.in
+++ b/Config.in
@@ -30,18 +30,6 @@ config EXTRA_COMPAT
30 some GNU extensions in libc. You probably only need this option 30 some GNU extensions in libc. You probably only need this option
31 if you plan to run busybox on desktop. 31 if you plan to run busybox on desktop.
32 32
33config FEATURE_ASSUME_UNICODE
34 bool "Assume that 1:1 char/glyph correspondence is not true"
35 default n
36 help
37 This makes various applets aware that one byte is not
38 one character on screen.
39
40 Busybox aims to eventually work correctly with Unicode displays.
41 Any older encodings are not guaranteed to work.
42 Probably by the time when busybox will be fully Unicode-clean,
43 other encodings will be mainly of historic interest.
44
45choice 33choice
46 prompt "Buffer allocation policy" 34 prompt "Buffer allocation policy"
47 default FEATURE_BUFFERS_USE_MALLOC 35 default FEATURE_BUFFERS_USE_MALLOC
@@ -114,6 +102,28 @@ config LOCALE_SUPPORT
114 Enable this if your system has locale support and you would like 102 Enable this if your system has locale support and you would like
115 busybox to support locale settings. 103 busybox to support locale settings.
116 104
105config FEATURE_ASSUME_UNICODE
106 bool "Support Unicode"
107 default n
108 help
109 This makes various applets aware that one byte is not
110 one character on screen.
111
112 Busybox aims to eventually work correctly with Unicode displays.
113 Any older encodings are not guaranteed to work.
114 Probably by the time when busybox will be fully Unicode-clean,
115 other encodings will be mainly of historic interest.
116
117config FEATURE_CHECK_UNICODE_IN_ENV
118 bool "Check $LANG environment variable"
119 default y
120 depends on FEATURE_ASSUME_UNICODE && !LOCALE_SUPPORT
121 help
122 With this option on, Unicode support is activated
123 only if the LANG variable has a value of the form "xxxx.utf8".
124
125 Otherwise, Unicode support will be always enabled and active.
126
117config LONG_OPTS 127config LONG_OPTS
118 bool "Support for --long-options" 128 bool "Support for --long-options"
119 default y 129 default y
diff --git a/coreutils/ls.c b/coreutils/ls.c
index 8a6faf23f..20b979db6 100644
--- a/coreutils/ls.c
+++ b/coreutils/ls.c
@@ -30,12 +30,9 @@
30 * [2009-03] 30 * [2009-03]
31 * ls sorts listing now, and supports almost all options. 31 * ls sorts listing now, and supports almost all options.
32 */ 32 */
33
34#include "libbb.h" 33#include "libbb.h"
34#include "unicode.h"
35 35
36#if ENABLE_FEATURE_ASSUME_UNICODE
37#include <wchar.h>
38#endif
39 36
40/* This is a NOEXEC applet. Be very careful! */ 37/* This is a NOEXEC applet. Be very careful! */
41 38
@@ -296,9 +293,8 @@ enum {
296/* libbb candidate */ 293/* libbb candidate */
297static size_t mbstrlen(const char *string) 294static size_t mbstrlen(const char *string)
298{ 295{
299 size_t width = mbsrtowcs(NULL /*dest*/, &string, 296 size_t width = mbstowcs(NULL, string, INT_MAX);
300 MAXINT(size_t) /*len*/, NULL /*state*/); 297 if (width == (size_t)-1L)
301 if (width == (size_t)-1)
302 return strlen(string); 298 return strlen(string);
303 return width; 299 return width;
304} 300}
@@ -932,6 +928,8 @@ int ls_main(int argc UNUSED_PARAM, char **argv)
932 928
933 INIT_G(); 929 INIT_G();
934 930
931 check_unicode_in_env();
932
935 all_fmt = LIST_SHORT | 933 all_fmt = LIST_SHORT |
936 (ENABLE_FEATURE_LS_SORTFILES * (SORT_NAME | SORT_FORWARD)); 934 (ENABLE_FEATURE_LS_SORTFILES * (SORT_NAME | SORT_FORWARD));
937 935
diff --git a/include/unicode.h b/include/unicode.h
new file mode 100644
index 000000000..be64a50e2
--- /dev/null
+++ b/include/unicode.h
@@ -0,0 +1,57 @@
1/* vi: set sw=4 ts=4: */
2/*
3 * Licensed under the GPL version 2, see the file LICENSE in this tarball.
4 */
5#ifndef UNICODE_H
6#define UNICODE_H 1
7
8#if !ENABLE_FEATURE_ASSUME_UNICODE
9
10# define check_unicode_in_env() ((void)0)
11
12#else
13
14# if ENABLE_LOCALE_SUPPORT
15
16# include <wchar.h>
17# include <wctype.h>
18# define check_unicode_in_env() ((void)0)
19
20# else
21
22# if !ENABLE_FEATURE_CHECK_UNICODE_IN_ENV
23# define check_unicode_in_env() ((void)0)
24# else
25void check_unicode_in_env(void) FAST_FUNC;
26# endif
27
28# undef MB_CUR_MAX
29# define MB_CUR_MAX 6
30
31/* Prevent name collisions */
32# define wint_t bb_wint_t
33# define mbstate_t bb_mbstate_t
34# define mbstowcs bb_mbstowcs
35# define wcstombs bb_wcstombs
36# define wcrtomb bb_wcrtomb
37# define iswspace bb_iswspace
38# define iswalnum bb_iswalnum
39# define iswpunct bb_iswpunct
40
41typedef int32_t wint_t;
42typedef struct {
43 char bogus;
44} mbstate_t;
45
46size_t mbstowcs(wchar_t *dest, const char *src, size_t n) FAST_FUNC;
47size_t wcstombs(char *dest, const wchar_t *src, size_t n) FAST_FUNC;
48size_t wcrtomb(char *s, wchar_t wc, mbstate_t *ps) FAST_FUNC;
49int iswspace(wint_t wc) FAST_FUNC;
50int iswalnum(wint_t wc) FAST_FUNC;
51int iswpunct(wint_t wc) FAST_FUNC;
52
53# endif
54
55#endif
56
57#endif
diff --git a/libbb/Kbuild b/libbb/Kbuild
index 70dc48dcb..efd04e322 100644
--- a/libbb/Kbuild
+++ b/libbb/Kbuild
@@ -139,6 +139,8 @@ lib-$(CONFIG_HWCLOCK) += rtc.o
139lib-$(CONFIG_RTCWAKE) += rtc.o 139lib-$(CONFIG_RTCWAKE) += rtc.o
140lib-$(CONFIG_FEATURE_CHECK_NAMES) += die_if_bad_username.o 140lib-$(CONFIG_FEATURE_CHECK_NAMES) += die_if_bad_username.o
141 141
142lib-$(CONFIG_FEATURE_ASSUME_UNICODE) += unicode.o
143
142# We shouldn't build xregcomp.c if we don't need it - this ensures we don't 144# We shouldn't build xregcomp.c if we don't need it - this ensures we don't
143# require regex.h to be in the include dir even if we don't need it thereby 145# require regex.h to be in the include dir even if we don't need it thereby
144# allowing us to build busybox even if uclibc regex support is disabled. 146# allowing us to build busybox even if uclibc regex support is disabled.
diff --git a/libbb/lineedit.c b/libbb/lineedit.c
index e5d0c1b6c..ab3297220 100644
--- a/libbb/lineedit.c
+++ b/libbb/lineedit.c
@@ -34,10 +34,7 @@
34 * PS1='\[\033[01;32m\]\u@\h\[\033[01;34m\] \w \$\[\033[00m\] ' 34 * PS1='\[\033[01;32m\]\u@\h\[\033[01;34m\] \w \$\[\033[00m\] '
35 */ 35 */
36#include "libbb.h" 36#include "libbb.h"
37#if ENABLE_FEATURE_ASSUME_UNICODE 37#include "unicode.h"
38# include <wchar.h>
39# include <wctype.h>
40#endif
41 38
42/* FIXME: obsolete CONFIG item? */ 39/* FIXME: obsolete CONFIG item? */
43#define ENABLE_FEATURE_NONPRINTABLE_INVERSE_PUT 0 40#define ENABLE_FEATURE_NONPRINTABLE_INVERSE_PUT 0
@@ -1581,7 +1578,7 @@ static int lineedit_read_key(char *read_key_buffer)
1581 return ic; 1578 return ic;
1582 unicode_buf[unicode_idx++] = ic; 1579 unicode_buf[unicode_idx++] = ic;
1583 unicode_buf[unicode_idx] = '\0'; 1580 unicode_buf[unicode_idx] = '\0';
1584 if (mbstowcs(&wc, unicode_buf, 1) < 1 && unicode_idx < MB_CUR_MAX) { 1581 if (mbstowcs(&wc, unicode_buf, 1) != 1 && unicode_idx < MB_CUR_MAX) {
1585 delay = 50; 1582 delay = 50;
1586 goto poll_again; 1583 goto poll_again;
1587 } 1584 }
@@ -1636,6 +1633,8 @@ int FAST_FUNC read_line_input(const char *prompt, char *command, int maxsize, li
1636 return len; 1633 return len;
1637 } 1634 }
1638 1635
1636 check_unicode_in_env();
1637
1639// FIXME: audit & improve this 1638// FIXME: audit & improve this
1640 if (maxsize > MAX_LINELEN) 1639 if (maxsize > MAX_LINELEN)
1641 maxsize = MAX_LINELEN; 1640 maxsize = MAX_LINELEN;
diff --git a/libbb/unicode.c b/libbb/unicode.c
new file mode 100644
index 000000000..a99f5ede1
--- /dev/null
+++ b/libbb/unicode.c
@@ -0,0 +1,241 @@
1/* vi: set sw=4 ts=4: */
2/*
3 * Unicode support routines.
4 *
5 * Copyright (C) 2008 Denys Vlasenko
6 *
7 * Licensed under GPL version 2, see file LICENSE in this tarball for details.
8 */
9#include "libbb.h"
10
11/* if LOCALE_SUPPORT, libc locale stuff takes care of it, else: */
12
13#if !ENABLE_LOCALE_SUPPORT
14#include "unicode.h"
15
16/* 0: not known yet,
17 * 1: not unicode (IOW: assuming one char == one byte)
18 * 2: unicode
19 */
20# if !ENABLE_FEATURE_CHECK_UNICODE_IN_ENV
21# define unicode_is_enabled 2
22# else
23static smallint unicode_is_enabled;
24void FAST_FUNC check_unicode_in_env(void)
25{
26 char *lang;
27
28 if (unicode_is_enabled)
29 return;
30 unicode_is_enabled = 1;
31
32 lang = getenv("LANG");
33 if (!lang || !strstr(lang, ".utf8"))
34 return;
35
36 unicode_is_enabled = 2;
37}
38# endif
39
/*
 * Encode one wide character as a UTF-8 style byte sequence.
 *
 * s  - output buffer; up to 6 bytes are written, no NUL is appended
 * wc - value to encode (treated as an unsigned 32-bit number)
 *
 * Returns the number of bytes stored (1..6).
 *
 * Layout per range:
 *   0-7F             -> 0xxxxxxx
 *   80-7FF           -> 110yyyxx 10xxxxxx
 *   800-FFFF         -> 1110yyyy 10yyyyxx 10xxxxxx
 *   10000-1FFFFF     -> 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
 *   200000-3FFFFFF   -> 111110tt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
 *   4000000-FFFFFFFF -> 111111tt 10tttttt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
 * (RFC 3629 says that Unicode ends at 10FFFF; the longer forms are
 * still produced for larger values.)
 */
static size_t wcrtomb_internal(char *s, wchar_t wc)
{
	uint32_t code = wc;
	unsigned lead;
	size_t total;
	size_t idx;

	if (code <= 0x7f) {
		*s = code;
		return 1;
	}

	/* Pick sequence length and the marker bits of the first byte */
	if (code <= 0x7ff) {
		total = 2;
		lead = 0xc0;
	} else if (code <= 0xffff) {
		total = 3;
		lead = 0xe0;
	} else if (code <= 0x1fffff) {
		total = 4;
		lead = 0xf0;
	} else if (code <= 0x3ffffff) {
		total = 5;
		lead = 0xf8;
	} else {
		total = 6;
		lead = 0xfc;
	}

	/* Fill continuation bytes from the last one backwards,
	 * consuming six payload bits per byte */
	for (idx = total - 1; idx > 0; idx--) {
		s[idx] = (code & 0x3f) | 0x80;
		code >>= 6;
	}
	/* Whatever bits remain go into the lead byte */
	s[0] = code | lead;

	return total;
}
109
110size_t FAST_FUNC wcrtomb(char *s, wchar_t wc, mbstate_t *ps UNUSED_PARAM)
111{
112 if (unicode_is_enabled != 2) {
113 *s = wc;
114 return 1;
115 }
116
117 return wcrtomb_internal(s, wc);
118}
119
120size_t FAST_FUNC wcstombs(char *dest, const wchar_t *src, size_t n)
121{
122 size_t org_n = n;
123
124 if (unicode_is_enabled != 2) {
125 while (n) {
126 wchar_t c = *src++;
127 *dest++ = c;
128 if (c == 0)
129 break;
130 n--;
131 }
132 return org_n - n;
133 }
134
135 while (n >= MB_CUR_MAX) {
136 wchar_t wc = *src++;
137 size_t len = wcrtomb_internal(dest, wc);
138
139 if (wc == L'\0')
140 return org_n - n;
141 dest += len;
142 n -= len;
143 }
144 while (n) {
145 char tbuf[MB_CUR_MAX];
146 wchar_t wc = *src++;
147 size_t len = wcrtomb_internal(tbuf, wc);
148
149 if (len > n)
150 len = n;
151 memcpy(dest, tbuf, len);
152 if (wc == L'\0')
153 return org_n - n;
154 dest += len;
155 n -= len;
156 }
157 return org_n - n;
158}
159
160size_t FAST_FUNC mbstowcs(wchar_t *dest, const char *src, size_t n)
161{
162 size_t org_n = n;
163
164 if (unicode_is_enabled != 2) {
165 while (n) {
166 unsigned char c = *src++;
167 *dest++ = c;
168 if (c == 0)
169 break;
170 n--;
171 }
172 return org_n - n;
173 }
174
175 while (n) {
176 int bytes;
177 unsigned c = (unsigned char) *src++;
178
179 if (c <= 0x7f) {
180 *dest++ = c;
181 if (c == '\0')
182 break;
183 n--;
184 continue;
185 }
186
187 /* 80-7FF -> 110yyyxx 10xxxxxx */
188 /* 800-FFFF -> 1110yyyy 10yyyyxx 10xxxxxx */
189 /* 10000-1FFFFF -> 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx */
190 /* 200000-3FFFFFF -> 111110tt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */
191 /* 4000000-FFFFFFFF -> 111111tt 10tttttt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */
192 bytes = 0;
193 do {
194 c <<= 1;
195 bytes++;
196 } while ((c & 0x80) && bytes < 6);
197 if (bytes == 1)
198 return (size_t) -1L;
199 c = (uint8_t)(c) >> bytes;
200
201 while (--bytes) {
202 unsigned ch = (unsigned char) *src++;
203 if ((ch & 0xc0) != 0x80) {
204 return (size_t) -1L;
205 }
206 c = (c << 6) + (ch & 0x3f);
207 }
208
209 /* TODO */
210 /* Need to check that c isn't produced by overlong encoding */
211 /* Example: 11000000 10000000 converts to NUL */
212 /* 11110000 10000000 10000100 10000000 converts to 0x100 */
213 /* correct encoding: 11000100 10000000 */
214 if (c <= 0x7f) { /* crude check */
215 return (size_t) -1L;
216 //or maybe: c = 0xfffd; /* replacement character */
217 }
218
219 *dest++ = c;
220 n--;
221 }
222
223 return org_n - n;
224}
225
226int FAST_FUNC iswspace(wint_t wc)
227{
228 return (unsigned)wc <= 0x7f && isspace(wc);
229}
230
231int FAST_FUNC iswalnum(wint_t wc)
232{
233 return (unsigned)wc <= 0x7f && isalnum(wc);
234}
235
236int FAST_FUNC iswpunct(wint_t wc)
237{
238 return (unsigned)wc <= 0x7f && ispunct(wc);
239}
240
241#endif