aboutsummaryrefslogtreecommitdiff
path: root/libbb/unicode.c
diff options
context:
space:
mode:
authorDenys Vlasenko <vda.linux@googlemail.com>2009-07-11 21:36:13 +0200
committerDenys Vlasenko <vda.linux@googlemail.com>2009-07-11 21:36:13 +0200
commit42a8fd0db08ab8b45fec6eab4af841f99576b260 (patch)
tree55f0600298da0c83c638c985d0c8b6d803be926b /libbb/unicode.c
parent883cea47518a171ab83f8e41def3aec92207519e (diff)
downloadbusybox-w32-42a8fd0db08ab8b45fec6eab4af841f99576b260.tar.gz
busybox-w32-42a8fd0db08ab8b45fec6eab4af841f99576b260.tar.bz2
busybox-w32-42a8fd0db08ab8b45fec6eab4af841f99576b260.zip
added simplified Unicode support for non-locale-enabled builds
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Diffstat (limited to 'libbb/unicode.c')
-rw-r--r--libbb/unicode.c241
1 files changed, 241 insertions, 0 deletions
diff --git a/libbb/unicode.c b/libbb/unicode.c
new file mode 100644
index 000000000..a99f5ede1
--- /dev/null
+++ b/libbb/unicode.c
@@ -0,0 +1,241 @@
1/* vi: set sw=4 ts=4: */
2/*
3 * Unicode support routines.
4 *
5 * Copyright (C) 2008 Denys Vlasenko
6 *
7 * Licensed under GPL version 2, see file LICENSE in this tarball for details.
8 */
9#include "libbb.h"
10
/* If LOCALE_SUPPORT is enabled, libc's locale machinery provides these functions; otherwise the simplified versions below are used: */
12
13#if !ENABLE_LOCALE_SUPPORT
14#include "unicode.h"
15
/* State of unicode_is_enabled:
 * 0: not known yet,
 * 1: not unicode (IOW: assuming one char == one byte)
 * 2: unicode
 */
# if !ENABLE_FEATURE_CHECK_UNICODE_IN_ENV
/* No run-time detection configured: Unicode mode is always on */
#  define unicode_is_enabled 2
# else
static smallint unicode_is_enabled;

/*
 * Decide once, from the LANG environment variable, whether multibyte
 * (UTF-8) handling should be used. The result is cached in
 * unicode_is_enabled, so subsequent calls return immediately.
 */
void FAST_FUNC check_unicode_in_env(void)
{
	char *lang;

	if (unicode_is_enabled)
		return; /* already decided */
	unicode_is_enabled = 1;

	lang = getenv("LANG");
	/* The charset suffix is spelled in several ways in practice:
	 * ".utf8", ".utf-8", ".UTF8", ".UTF-8". Matching only ".utf8"
	 * misses the very common "en_US.UTF-8", so look for the
	 * ".utf" / ".UTF" prefix of the suffix instead.
	 */
	if (!lang || !(strstr(lang, ".utf") || strstr(lang, ".UTF")))
		return;

	unicode_is_enabled = 2;
}
# endif
39
/*
 * Encode one wide character as a UTF-8 style multibyte sequence.
 *
 * s:  output buffer; up to 6 bytes are written, no NUL terminator is added.
 * wc: the character to encode (converted to a 32-bit unsigned value).
 *
 * Returns the number of bytes stored in s (1..6).
 *
 * Layout by value range:
 *        0-7F       -> 0xxxxxxx
 *       80-7FF      -> 110yyyxx 10xxxxxx
 *      800-FFFF     -> 1110yyyy 10yyyyxx 10xxxxxx
 *    10000-1FFFFF   -> 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
 *   200000-3FFFFFF  -> 111110tt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
 *  4000000-FFFFFFFF -> 111111tt 10tttttt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
 *
 * (RFC 3629 says that Unicode ends at 10FFFF; the longer forms are
 * still produced for larger input values.)
 */
static size_t wcrtomb_internal(char *s, wchar_t wc)
{
	/* Marker bits OR-ed into the first byte, indexed by (length - 1) */
	static const unsigned char lead_marker[6] = {
		0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
	};
	uint32_t code = wc;
	size_t len;
	size_t pos;

	if (code <= 0x7f)
		len = 1;
	else if (code <= 0x7ff)
		len = 2;
	else if (code <= 0xffff)
		len = 3;
	else if (code <= 0x1fffff)
		len = 4;
	else if (code <= 0x3ffffff)
		len = 5;
	else
		len = 6;

	/* Fill continuation bytes from the last one backwards,
	 * consuming 6 payload bits per byte.
	 */
	for (pos = len - 1; pos > 0; pos--) {
		s[pos] = (code & 0x3f) | 0x80;
		code >>= 6;
	}
	/* Whatever bits remain go into the lead byte */
	s[0] = code | lead_marker[len - 1];

	return len;
}
109
110size_t FAST_FUNC wcrtomb(char *s, wchar_t wc, mbstate_t *ps UNUSED_PARAM)
111{
112 if (unicode_is_enabled != 2) {
113 *s = wc;
114 return 1;
115 }
116
117 return wcrtomb_internal(s, wc);
118}
119
120size_t FAST_FUNC wcstombs(char *dest, const wchar_t *src, size_t n)
121{
122 size_t org_n = n;
123
124 if (unicode_is_enabled != 2) {
125 while (n) {
126 wchar_t c = *src++;
127 *dest++ = c;
128 if (c == 0)
129 break;
130 n--;
131 }
132 return org_n - n;
133 }
134
135 while (n >= MB_CUR_MAX) {
136 wchar_t wc = *src++;
137 size_t len = wcrtomb_internal(dest, wc);
138
139 if (wc == L'\0')
140 return org_n - n;
141 dest += len;
142 n -= len;
143 }
144 while (n) {
145 char tbuf[MB_CUR_MAX];
146 wchar_t wc = *src++;
147 size_t len = wcrtomb_internal(tbuf, wc);
148
149 if (len > n)
150 len = n;
151 memcpy(dest, tbuf, len);
152 if (wc == L'\0')
153 return org_n - n;
154 dest += len;
155 n -= len;
156 }
157 return org_n - n;
158}
159
160size_t FAST_FUNC mbstowcs(wchar_t *dest, const char *src, size_t n)
161{
162 size_t org_n = n;
163
164 if (unicode_is_enabled != 2) {
165 while (n) {
166 unsigned char c = *src++;
167 *dest++ = c;
168 if (c == 0)
169 break;
170 n--;
171 }
172 return org_n - n;
173 }
174
175 while (n) {
176 int bytes;
177 unsigned c = (unsigned char) *src++;
178
179 if (c <= 0x7f) {
180 *dest++ = c;
181 if (c == '\0')
182 break;
183 n--;
184 continue;
185 }
186
187 /* 80-7FF -> 110yyyxx 10xxxxxx */
188 /* 800-FFFF -> 1110yyyy 10yyyyxx 10xxxxxx */
189 /* 10000-1FFFFF -> 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx */
190 /* 200000-3FFFFFF -> 111110tt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */
191 /* 4000000-FFFFFFFF -> 111111tt 10tttttt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */
192 bytes = 0;
193 do {
194 c <<= 1;
195 bytes++;
196 } while ((c & 0x80) && bytes < 6);
197 if (bytes == 1)
198 return (size_t) -1L;
199 c = (uint8_t)(c) >> bytes;
200
201 while (--bytes) {
202 unsigned ch = (unsigned char) *src++;
203 if ((ch & 0xc0) != 0x80) {
204 return (size_t) -1L;
205 }
206 c = (c << 6) + (ch & 0x3f);
207 }
208
209 /* TODO */
210 /* Need to check that c isn't produced by overlong encoding */
211 /* Example: 11000000 10000000 converts to NUL */
212 /* 11110000 10000000 10000100 10000000 converts to 0x100 */
213 /* correct encoding: 11000100 10000000 */
214 if (c <= 0x7f) { /* crude check */
215 return (size_t) -1L;
216 //or maybe: c = 0xfffd; /* replacement character */
217 }
218
219 *dest++ = c;
220 n--;
221 }
222
223 return org_n - n;
224}
225
226int FAST_FUNC iswspace(wint_t wc)
227{
228 return (unsigned)wc <= 0x7f && isspace(wc);
229}
230
231int FAST_FUNC iswalnum(wint_t wc)
232{
233 return (unsigned)wc <= 0x7f && isalnum(wc);
234}
235
236int FAST_FUNC iswpunct(wint_t wc)
237{
238 return (unsigned)wc <= 0x7f && ispunct(wc);
239}
240
241#endif