aboutsummaryrefslogtreecommitdiff
path: root/miscutils/iconv.c
diff options
context:
space:
mode:
Diffstat (limited to 'miscutils/iconv.c')
-rw-r--r--miscutils/iconv.c1771
1 files changed, 1771 insertions, 0 deletions
diff --git a/miscutils/iconv.c b/miscutils/iconv.c
new file mode 100644
index 000000000..bedbb718d
--- /dev/null
+++ b/miscutils/iconv.c
@@ -0,0 +1,1771 @@
1/*
2 * iconv implementation using Win32 API to convert.
3 *
4 * This file is placed in the public domain.
5 */
6
7/*
8 * This code was obtained from:
9 *
10 * https://github.com/win-iconv/win-iconv
11 *
12 * Modified for busybox-w32 by Ronald M Yorston. These modifications
13 * are also dedicated to the public domain.
14 */
15
16//config:config ICONV
17//config: bool "iconv (11.4 kb)"
18//config: default y
19//config: depends on PLATFORM_MINGW32
20//config: help
21//config: 'iconv' converts text between character encodings.
22
23//applet:IF_ICONV(APPLET(iconv, BB_DIR_USR_BIN, BB_SUID_DROP))
24
25//kbuild:lib-$(CONFIG_ICONV) += iconv.o
26
27//usage:#define iconv_trivial_usage
28//usage: "[-lc] [-o outfile] [-f from-enc] [-t to-enc] [FILE]..."
29//usage:#define iconv_full_usage "\n\n"
30//usage: "Convert text between character encodings\n"
31//usage: "\n -l List all known character encodings"
32//usage: "\n -c Silently discard characters that cannot be converted"
33//usage: "\n -o Use outfile for output"
34//usage: "\n -f Use from-enc for input characters"
35//usage: "\n -t Use to-enc for output characters"
36
37#include "libbb.h"
38
39/* WORKAROUND: */
40#define GetProcAddressA GetProcAddress
41
42#define MB_CHAR_MAX 16
43
44#define UNICODE_MODE_BOM_DONE 1
45#define UNICODE_MODE_SWAPPED 2
46
47#define FLAG_USE_BOM 1
48#define FLAG_TRANSLIT 2 /* //TRANSLIT */
49#define FLAG_IGNORE 4 /* //IGNORE */
50
51typedef unsigned char uchar;
52typedef unsigned short ushort;
53typedef unsigned int uint;
54
55typedef void* iconv_t;
56
57static iconv_t iconv_open(const char *tocode, const char *fromcode);
58static int iconv_close(iconv_t cd);
59static size_t iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft);
60
61typedef struct compat_t compat_t;
62typedef struct csconv_t csconv_t;
63typedef struct rec_iconv_t rec_iconv_t;
64
/*
 * Per-encoding conversion callbacks stored in csconv_t.
 *
 * f_mbtowc: decode bytes from buf (bufsize bytes available) into UTF-16
 *   units in wbuf.  iconv() stores the capacity of wbuf in *wbufsize
 *   before the call and reads the number of units produced from it
 *   afterwards (0 means "input consumed, nothing to emit").  Returns the
 *   number of input bytes consumed, or -1 on failure.
 * f_wctomb: encode wbufsize UTF-16 units from wbuf into buf (bufsize bytes
 *   available).  Returns the number of bytes written, or -1 on failure;
 *   iconv() inspects errno (E2BIG) after a failure.
 * f_mblen:  NOTE(review): presumably returns the byte length of the
 *   character at buf; no caller is visible in this part of the file.
 * f_flush:  write any pending output into buf.  Returns the number of
 *   bytes written, or -1 on failure.
 */
typedef int (*f_mbtowc)(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize);
typedef int (*f_wctomb)(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize);
typedef int (*f_mblen)(csconv_t *cv, const uchar *buf, int bufsize);
typedef int (*f_flush)(csconv_t *cv, uchar *buf, int bufsize);
69
70#define COMPAT_IN 1
71#define COMPAT_OUT 2
72
/*
 * One entry of a Unicode compatibility mapping, used to stay compatible
 * with other conversion tables.  A table is an array of these entries
 * terminated by an entry whose `in` is 0.
 */
struct compat_t {
	uint in;	/* code point replaced by `out` when encoding (COMPAT_OUT) */
	uint out;	/* code point replaced by `in` when decoding (COMPAT_IN) */
	uint flag;	/* COMPAT_IN and/or COMPAT_OUT: directions the entry applies to */
};
79
/* Converter for one encoding: callbacks plus per-conversion state. */
struct csconv_t {
	int codepage;		/* code page number returned by name_to_codepage() */
	int flags;		/* FLAG_USE_BOM / FLAG_TRANSLIT / FLAG_IGNORE bits */
	f_mbtowc mbtowc;	/* decode from this encoding to UTF-16 */
	f_wctomb wctomb;	/* encode UTF-16 into this encoding */
	f_mblen mblen;		/* may be NULL */
	f_flush flush;		/* may be NULL; iconv() checks before calling */
	DWORD mode;		/* converter state; iconv() saves/restores it around failed calls */
	compat_t *compat;	/* compatibility table, or NULL when none applies */
};
90
/* The object behind an iconv_t handle: one converter per direction. */
struct rec_iconv_t {
	iconv_t cd;		/* set by iconv_open() to point at this record itself */
	csconv_t from;		/* source encoding */
	csconv_t to;		/* destination encoding */
};
96
97static int load_mlang(void);
98static int make_csconv(const char *name, csconv_t *cv);
99static int name_to_codepage(const char *name);
100static uint utf16_to_ucs4(const ushort *wbuf);
101static void ucs4_to_utf16(uint wc, ushort *wbuf, int *wbufsize);
102static int mbtowc_flags(int codepage);
103static int must_use_null_useddefaultchar(int codepage);
104static int seterror(int err);
105
106static int sbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize);
107static int dbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize);
108static int mbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize);
109static int utf8_mblen(csconv_t *cv, const uchar *buf, int bufsize);
110static int eucjp_mblen(csconv_t *cv, const uchar *buf, int bufsize);
111
112static int kernel_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize);
113static int kernel_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize);
114static int mlang_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize);
115static int mlang_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize);
116static int utf16_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize);
117static int utf16_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize);
118static int utf32_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize);
119static int utf32_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize);
120static int iso2022jp_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize);
121static int iso2022jp_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize);
122static int iso2022jp_flush(csconv_t *cv, uchar *buf, int bufsize);
123
124#define CP_ALIAS_LIST \
125 CP_ALIAS(65001, "CP65001") \
126 CP_ALIAS(65001, "UTF8") \
127 CP_ALIAS(65001, "UTF-8") \
128\
129 CP_ALIAS(1200, "CP1200") \
130 CP_ALIAS(1200, "UTF16LE") \
131 CP_ALIAS(1200, "UTF-16LE") \
132 CP_ALIAS(1200, "UCS2LE") \
133 CP_ALIAS(1200, "UCS-2LE") \
134 CP_ALIAS(1200, "UCS-2-INTERNAL") \
135\
136 CP_ALIAS(1201, "CP1201") \
137 CP_ALIAS(1201, "UTF16BE") \
138 CP_ALIAS(1201, "UTF-16BE") \
139 CP_ALIAS(1201, "UCS2BE") \
140 CP_ALIAS(1201, "UCS-2BE") \
141 CP_ALIAS(1201, "unicodeFFFE") \
142\
143 CP_ALIAS(12000, "CP12000") \
144 CP_ALIAS(12000, "UTF32LE") \
145 CP_ALIAS(12000, "UTF-32LE") \
146 CP_ALIAS(12000, "UCS4LE") \
147 CP_ALIAS(12000, "UCS-4LE") \
148\
149 CP_ALIAS(12001, "CP12001") \
150 CP_ALIAS(12001, "UTF32BE") \
151 CP_ALIAS(12001, "UTF-32BE") \
152 CP_ALIAS(12001, "UCS4BE") \
153 CP_ALIAS(12001, "UCS-4BE") \
154\
155 /* Default is little endian, because the platform is */ \
156 CP_ALIAS(1200, "UTF16") \
157 CP_ALIAS(1200, "UTF-16") \
158 CP_ALIAS(1200, "UCS2") \
159 CP_ALIAS(1200, "UCS-2") \
160 CP_ALIAS(12000, "UTF32") \
161 CP_ALIAS(12000, "UTF-32") \
162 CP_ALIAS(12000, "UCS4") \
163 CP_ALIAS(12000, "UCS-4") \
164\
165 /* copy from libiconv `iconv -l` */ \
166 /* !IsValidCodePage(367) */ \
167 CP_ALIAS(20127, "ANSI_X3.4-1968") \
168 CP_ALIAS(20127, "ANSI_X3.4-1986") \
169 CP_ALIAS(20127, "ASCII") \
170 CP_ALIAS(20127, "CP367") \
171 CP_ALIAS(20127, "IBM367") \
172 CP_ALIAS(20127, "ISO-IR-6") \
173 CP_ALIAS(20127, "ISO646-US") \
174 CP_ALIAS(20127, "ISO_646.IRV:1991") \
175 CP_ALIAS(20127, "US") \
176 CP_ALIAS(20127, "US-ASCII") \
177 CP_ALIAS(20127, "CSASCII") \
178\
179 /* !IsValidCodePage(819) */ \
180 CP_ALIAS(1252, "CP819") \
181 CP_ALIAS(1252, "IBM819") \
182 CP_ALIAS(28591, "ISO-8859-1") \
183 CP_ALIAS(28591, "ISO-IR-100") \
184 CP_ALIAS(28591, "ISO8859-1") \
185 CP_ALIAS(28591, "ISO_8859-1") \
186 CP_ALIAS(28591, "ISO_8859-1:1987") \
187 CP_ALIAS(28591, "L1") \
188 CP_ALIAS(28591, "LATIN1") \
189 CP_ALIAS(28591, "CSISOLATIN1") \
190\
191 CP_ALIAS(1250, "CP1250") \
192 CP_ALIAS(1250, "MS-EE") \
193 CP_ALIAS(1250, "WINDOWS-1250") \
194\
195 CP_ALIAS(1251, "CP1251") \
196 CP_ALIAS(1251, "MS-CYRL") \
197 CP_ALIAS(1251, "WINDOWS-1251") \
198\
199 CP_ALIAS(1252, "CP1252") \
200 CP_ALIAS(1252, "MS-ANSI") \
201 CP_ALIAS(1252, "WINDOWS-1252") \
202\
203 CP_ALIAS(1253, "CP1253") \
204 CP_ALIAS(1253, "MS-GREEK") \
205 CP_ALIAS(1253, "WINDOWS-1253") \
206\
207 CP_ALIAS(1254, "CP1254") \
208 CP_ALIAS(1254, "MS-TURK") \
209 CP_ALIAS(1254, "WINDOWS-1254") \
210\
211 CP_ALIAS(1255, "CP1255") \
212 CP_ALIAS(1255, "MS-HEBR") \
213 CP_ALIAS(1255, "WINDOWS-1255") \
214\
215 CP_ALIAS(1256, "CP1256") \
216 CP_ALIAS(1256, "MS-ARAB") \
217 CP_ALIAS(1256, "WINDOWS-1256") \
218\
219 CP_ALIAS(1257, "CP1257") \
220 CP_ALIAS(1257, "WINBALTRIM") \
221 CP_ALIAS(1257, "WINDOWS-1257") \
222\
223 CP_ALIAS(1258, "CP1258") \
224 CP_ALIAS(1258, "WINDOWS-1258") \
225\
226 CP_ALIAS(850, "850") \
227 CP_ALIAS(850, "CP850") \
228 CP_ALIAS(850, "IBM850") \
229 CP_ALIAS(850, "CSPC850MULTILINGUAL") \
230\
231 /* !IsValidCodePage(862) */ \
232 CP_ALIAS(862, "862") \
233 CP_ALIAS(862, "CP862") \
234 CP_ALIAS(862, "IBM862") \
235 CP_ALIAS(862, "CSPC862LATINHEBREW") \
236\
237 CP_ALIAS(866, "866") \
238 CP_ALIAS(866, "CP866") \
239 CP_ALIAS(866, "IBM866") \
240 CP_ALIAS(866, "CSIBM866") \
241\
242 /* !IsValidCodePage(154) */ \
243 CP_ALIAS(154, "CP154") \
244 CP_ALIAS(154, "CYRILLIC-ASIAN") \
245 CP_ALIAS(154, "PT154") \
246 CP_ALIAS(154, "PTCP154") \
247 CP_ALIAS(154, "CSPTCP154") \
248\
249 /* !IsValidCodePage(1133) */ \
250 CP_ALIAS(1133, "CP1133") \
251 CP_ALIAS(1133, "IBM-CP1133") \
252\
253 CP_ALIAS(874, "CP874") \
254 CP_ALIAS(874, "WINDOWS-874") \
255\
256 /* !IsValidCodePage(51932) */ \
257 CP_ALIAS(51932, "CP51932") \
258 CP_ALIAS(51932, "MS51932") \
259 CP_ALIAS(51932, "WINDOWS-51932") \
260 CP_ALIAS(51932, "EUC-JP") \
261\
262 CP_ALIAS(932, "CP932") \
263 CP_ALIAS(932, "MS932") \
264 CP_ALIAS(932, "SHIFFT_JIS") \
265 CP_ALIAS(932, "SHIFFT_JIS-MS") \
266 CP_ALIAS(932, "SJIS") \
267 CP_ALIAS(932, "SJIS-MS") \
268 CP_ALIAS(932, "SJIS-OPEN") \
269 CP_ALIAS(932, "SJIS-WIN") \
270 CP_ALIAS(932, "WINDOWS-31J") \
271 CP_ALIAS(932, "WINDOWS-932") \
272 CP_ALIAS(932, "CSWINDOWS31J") \
273\
274 CP_ALIAS(50221, "CP50221") \
275 CP_ALIAS(50221, "ISO-2022-JP") \
276 CP_ALIAS(50221, "ISO-2022-JP-MS") \
277 CP_ALIAS(50221, "ISO2022-JP") \
278 CP_ALIAS(50221, "ISO2022-JP-MS") \
279 CP_ALIAS(50221, "MS50221") \
280 CP_ALIAS(50221, "WINDOWS-50221") \
281\
282 CP_ALIAS(936, "CP936") \
283 CP_ALIAS(936, "GBK") \
284 CP_ALIAS(936, "MS936") \
285 CP_ALIAS(936, "WINDOWS-936") \
286\
287 CP_ALIAS(950, "CP950") \
288 CP_ALIAS(950, "BIG5") \
289 CP_ALIAS(950, "BIG5HKSCS") \
290 CP_ALIAS(950, "BIG5-HKSCS") \
291\
292 CP_ALIAS(949, "CP949") \
293 CP_ALIAS(949, "UHC") \
294 CP_ALIAS(949, "EUC-KR") \
295\
296 CP_ALIAS(1361, "CP1361") \
297 CP_ALIAS(1361, "JOHAB") \
298\
299 CP_ALIAS(437, "437") \
300 CP_ALIAS(437, "CP437") \
301 CP_ALIAS(437, "IBM437") \
302 CP_ALIAS(437, "CSPC8CODEPAGE437") \
303\
304 CP_ALIAS(737, "CP737") \
305\
306 CP_ALIAS(775, "CP775") \
307 CP_ALIAS(775, "IBM775") \
308 CP_ALIAS(775, "CSPC775BALTIC") \
309\
310 CP_ALIAS(852, "852") \
311 CP_ALIAS(852, "CP852") \
312 CP_ALIAS(852, "IBM852") \
313 CP_ALIAS(852, "CSPCP852") \
314\
315 /* !IsValidCodePage(853) */ \
316 CP_ALIAS(853, "CP853") \
317\
318 CP_ALIAS(855, "855") \
319 CP_ALIAS(855, "CP855") \
320 CP_ALIAS(855, "IBM855") \
321 CP_ALIAS(855, "CSIBM855") \
322\
323 CP_ALIAS(857, "857") \
324 CP_ALIAS(857, "CP857") \
325 CP_ALIAS(857, "IBM857") \
326 CP_ALIAS(857, "CSIBM857") \
327\
328 /* !IsValidCodePage(858) */ \
329 CP_ALIAS(858, "CP858") \
330\
331 CP_ALIAS(860, "860") \
332 CP_ALIAS(860, "CP860") \
333 CP_ALIAS(860, "IBM860") \
334 CP_ALIAS(860, "CSIBM860") \
335\
336 CP_ALIAS(861, "861") \
337 CP_ALIAS(861, "CP-IS") \
338 CP_ALIAS(861, "CP861") \
339 CP_ALIAS(861, "IBM861") \
340 CP_ALIAS(861, "CSIBM861") \
341\
342 CP_ALIAS(863, "863") \
343 CP_ALIAS(863, "CP863") \
344 CP_ALIAS(863, "IBM863") \
345 CP_ALIAS(863, "CSIBM863") \
346\
347 CP_ALIAS(864, "CP864") \
348 CP_ALIAS(864, "IBM864") \
349 CP_ALIAS(864, "CSIBM864") \
350\
351 CP_ALIAS(865, "865") \
352 CP_ALIAS(865, "CP865") \
353 CP_ALIAS(865, "IBM865") \
354 CP_ALIAS(865, "CSIBM865") \
355\
356 CP_ALIAS(869, "869") \
357 CP_ALIAS(869, "CP-GR") \
358 CP_ALIAS(869, "CP869") \
359 CP_ALIAS(869, "IBM869") \
360 CP_ALIAS(869, "CSIBM869") \
361\
362 /* !IsValidCodePage(1125) */ \
363 CP_ALIAS(1125, "CP1125") \
364\
365 /* \
366 * Code Page Identifiers \
367 * http://msdn2.microsoft.com/en-us/library/ms776446.aspx \
368 */ \
369 CP_ALIAS(37, "IBM037") /* IBM EBCDIC US-Canada */ \
370 CP_ALIAS(437, "IBM437") /* OEM United States */ \
371 CP_ALIAS(500, "IBM500") /* IBM EBCDIC International */ \
372 CP_ALIAS(708, "ASMO-708") /* Arabic (ASMO 708) */ \
373 /* 709 Arabic (ASMO-449+, BCON V4) */ \
374 /* 710 Arabic - Transparent Arabic */ \
375 CP_ALIAS(720, "DOS-720") /* Arabic (Transparent ASMO); Arabic (DOS) */ \
376 CP_ALIAS(737, "ibm737") /* OEM Greek (formerly 437G); Greek (DOS) */ \
377 CP_ALIAS(775, "ibm775") /* OEM Baltic; Baltic (DOS) */ \
378 CP_ALIAS(850, "ibm850") /* OEM Multilingual Latin 1; Western European (DOS) */ \
379 CP_ALIAS(852, "ibm852") /* OEM Latin 2; Central European (DOS) */ \
380 CP_ALIAS(855, "IBM855") /* OEM Cyrillic (primarily Russian) */ \
381 CP_ALIAS(857, "ibm857") /* OEM Turkish; Turkish (DOS) */ \
382 CP_ALIAS(858, "IBM00858") /* OEM Multilingual Latin 1 + Euro symbol */ \
383 CP_ALIAS(860, "IBM860") /* OEM Portuguese; Portuguese (DOS) */ \
384 CP_ALIAS(861, "ibm861") /* OEM Icelandic; Icelandic (DOS) */ \
385 CP_ALIAS(862, "DOS-862") /* OEM Hebrew; Hebrew (DOS) */ \
386 CP_ALIAS(863, "IBM863") /* OEM French Canadian; French Canadian (DOS) */ \
387 CP_ALIAS(864, "IBM864") /* OEM Arabic; Arabic (864) */ \
388 CP_ALIAS(865, "IBM865") /* OEM Nordic; Nordic (DOS) */ \
389 CP_ALIAS(866, "cp866") /* OEM Russian; Cyrillic (DOS) */ \
390 CP_ALIAS(869, "ibm869") /* OEM Modern Greek; Greek, Modern (DOS) */ \
391 CP_ALIAS(870, "IBM870") /* IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2 */ \
392 CP_ALIAS(874, "windows-874") /* ANSI/OEM Thai (same as 28605, ISO 8859-15); Thai (Windows) */ \
393 CP_ALIAS(875, "cp875") /* IBM EBCDIC Greek Modern */ \
394 CP_ALIAS(932, "shift_jis") /* ANSI/OEM Japanese; Japanese (Shift-JIS) */ \
395 CP_ALIAS(932, "shift-jis") /* alternative name for it */ \
396 CP_ALIAS(936, "gb2312") /* ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312) */ \
397 CP_ALIAS(949, "ks_c_5601-1987") /* ANSI/OEM Korean (Unified Hangul Code) */ \
398 CP_ALIAS(950, "big5") /* ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5) */ \
399 CP_ALIAS(950, "big5hkscs") /* ANSI/OEM Traditional Chinese (Hong Kong SAR); Chinese Traditional (Big5-HKSCS) */ \
400 CP_ALIAS(950, "big5-hkscs") /* alternative name for it */ \
401 CP_ALIAS(1026, "IBM1026") /* IBM EBCDIC Turkish (Latin 5) */ \
402 CP_ALIAS(1047, "IBM01047") /* IBM EBCDIC Latin 1/Open System */ \
403 CP_ALIAS(1140, "IBM01140") /* IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro) */ \
404 CP_ALIAS(1141, "IBM01141") /* IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro) */ \
405 CP_ALIAS(1142, "IBM01142") /* IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro) */ \
406 CP_ALIAS(1143, "IBM01143") /* IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro) */ \
407 CP_ALIAS(1144, "IBM01144") /* IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro) */ \
408 CP_ALIAS(1145, "IBM01145") /* IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro) */ \
409 CP_ALIAS(1146, "IBM01146") /* IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro) */ \
410 CP_ALIAS(1147, "IBM01147") /* IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro) */ \
411 CP_ALIAS(1148, "IBM01148") /* IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro) */ \
412 CP_ALIAS(1149, "IBM01149") /* IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro) */ \
413 CP_ALIAS(1250, "windows-1250") /* ANSI Central European; Central European (Windows) */ \
414 CP_ALIAS(1251, "windows-1251") /* ANSI Cyrillic; Cyrillic (Windows) */ \
415 CP_ALIAS(1252, "windows-1252") /* ANSI Latin 1; Western European (Windows) */ \
416 CP_ALIAS(1253, "windows-1253") /* ANSI Greek; Greek (Windows) */ \
417 CP_ALIAS(1254, "windows-1254") /* ANSI Turkish; Turkish (Windows) */ \
418 CP_ALIAS(1255, "windows-1255") /* ANSI Hebrew; Hebrew (Windows) */ \
419 CP_ALIAS(1256, "windows-1256") /* ANSI Arabic; Arabic (Windows) */ \
420 CP_ALIAS(1257, "windows-1257") /* ANSI Baltic; Baltic (Windows) */ \
421 CP_ALIAS(1258, "windows-1258") /* ANSI/OEM Vietnamese; Vietnamese (Windows) */ \
422 CP_ALIAS(1361, "Johab") /* Korean (Johab) */ \
423 CP_ALIAS(10000, "macintosh") /* MAC Roman; Western European (Mac) */ \
424 CP_ALIAS(10001, "x-mac-japanese") /* Japanese (Mac) */ \
425 CP_ALIAS(10002, "x-mac-chinesetrad") /* MAC Traditional Chinese (Big5); Chinese Traditional (Mac) */ \
426 CP_ALIAS(10003, "x-mac-korean") /* Korean (Mac) */ \
427 CP_ALIAS(10004, "x-mac-arabic") /* Arabic (Mac) */ \
428 CP_ALIAS(10005, "x-mac-hebrew") /* Hebrew (Mac) */ \
429 CP_ALIAS(10006, "x-mac-greek") /* Greek (Mac) */ \
430 CP_ALIAS(10007, "x-mac-cyrillic") /* Cyrillic (Mac) */ \
431 CP_ALIAS(10008, "x-mac-chinesesimp") /* MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac) */ \
432 CP_ALIAS(10010, "x-mac-romanian") /* Romanian (Mac) */ \
433 CP_ALIAS(10017, "x-mac-ukrainian") /* Ukrainian (Mac) */ \
434 CP_ALIAS(10021, "x-mac-thai") /* Thai (Mac) */ \
435 CP_ALIAS(10029, "x-mac-ce") /* MAC Latin 2; Central European (Mac) */ \
436 CP_ALIAS(10079, "x-mac-icelandic") /* Icelandic (Mac) */ \
437 CP_ALIAS(10081, "x-mac-turkish") /* Turkish (Mac) */ \
438 CP_ALIAS(10082, "x-mac-croatian") /* Croatian (Mac) */ \
439 CP_ALIAS(20000, "x-Chinese_CNS") /* CNS Taiwan; Chinese Traditional (CNS) */ \
440 CP_ALIAS(20001, "x-cp20001") /* TCA Taiwan */ \
441 CP_ALIAS(20002, "x_Chinese-Eten") /* Eten Taiwan; Chinese Traditional (Eten) */ \
442 CP_ALIAS(20003, "x-cp20003") /* IBM5550 Taiwan */ \
443 CP_ALIAS(20004, "x-cp20004") /* TeleText Taiwan */ \
444 CP_ALIAS(20005, "x-cp20005") /* Wang Taiwan */ \
445 CP_ALIAS(20105, "x-IA5") /* IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5) */ \
446 CP_ALIAS(20106, "x-IA5-German") /* IA5 German (7-bit) */ \
447 CP_ALIAS(20107, "x-IA5-Swedish") /* IA5 Swedish (7-bit) */ \
448 CP_ALIAS(20108, "x-IA5-Norwegian") /* IA5 Norwegian (7-bit) */ \
449 CP_ALIAS(20127, "us-ascii") /* US-ASCII (7-bit) */ \
450 CP_ALIAS(20261, "x-cp20261") /* T.61 */ \
451 CP_ALIAS(20269, "x-cp20269") /* ISO 6937 Non-Spacing Accent */ \
452 CP_ALIAS(20273, "IBM273") /* IBM EBCDIC Germany */ \
453 CP_ALIAS(20277, "IBM277") /* IBM EBCDIC Denmark-Norway */ \
454 CP_ALIAS(20278, "IBM278") /* IBM EBCDIC Finland-Sweden */ \
455 CP_ALIAS(20280, "IBM280") /* IBM EBCDIC Italy */ \
456 CP_ALIAS(20284, "IBM284") /* IBM EBCDIC Latin America-Spain */ \
457 CP_ALIAS(20285, "IBM285") /* IBM EBCDIC United Kingdom */ \
458 CP_ALIAS(20290, "IBM290") /* IBM EBCDIC Japanese Katakana Extended */ \
459 CP_ALIAS(20297, "IBM297") /* IBM EBCDIC France */ \
460 CP_ALIAS(20420, "IBM420") /* IBM EBCDIC Arabic */ \
461 CP_ALIAS(20423, "IBM423") /* IBM EBCDIC Greek */ \
462 CP_ALIAS(20424, "IBM424") /* IBM EBCDIC Hebrew */ \
463 CP_ALIAS(20833, "x-EBCDIC-KoreanExtended") /* IBM EBCDIC Korean Extended */ \
464 CP_ALIAS(20838, "IBM-Thai") /* IBM EBCDIC Thai */ \
465 CP_ALIAS(20866, "koi8-r") /* Russian (KOI8-R); Cyrillic (KOI8-R) */ \
466 CP_ALIAS(20871, "IBM871") /* IBM EBCDIC Icelandic */ \
467 CP_ALIAS(20880, "IBM880") /* IBM EBCDIC Cyrillic Russian */ \
468 CP_ALIAS(20905, "IBM905") /* IBM EBCDIC Turkish */ \
469 CP_ALIAS(20924, "IBM00924") /* IBM EBCDIC Latin 1/Open System (1047 + Euro symbol) */ \
470 CP_ALIAS(20932, "EUC-JP") /* Japanese (JIS 0208-1990 and 0121-1990) */ \
471 CP_ALIAS(20936, "x-cp20936") /* Simplified Chinese (GB2312); Chinese Simplified (GB2312-80) */ \
472 CP_ALIAS(20949, "x-cp20949") /* Korean Wansung */ \
473 CP_ALIAS(21025, "cp1025") /* IBM EBCDIC Cyrillic Serbian-Bulgarian */ \
474 /* 21027 (deprecated) */ \
475 CP_ALIAS(21866, "koi8-u") /* Ukrainian (KOI8-U); Cyrillic (KOI8-U) */ \
476 CP_ALIAS(28591, "iso-8859-1") /* ISO 8859-1 Latin 1; Western European (ISO) */ \
477 CP_ALIAS(28591, "iso8859-1") /* ISO 8859-1 Latin 1; Western European (ISO) */ \
478 CP_ALIAS(28591, "iso_8859-1") \
479 CP_ALIAS(28591, "iso_8859_1") \
480 CP_ALIAS(28592, "iso-8859-2") /* ISO 8859-2 Central European; Central European (ISO) */ \
481 CP_ALIAS(28592, "iso8859-2") /* ISO 8859-2 Central European; Central European (ISO) */ \
482 CP_ALIAS(28592, "iso_8859-2") \
483 CP_ALIAS(28592, "iso_8859_2") \
484 CP_ALIAS(28593, "iso-8859-3") /* ISO 8859-3 Latin 3 */ \
485 CP_ALIAS(28593, "iso8859-3") /* ISO 8859-3 Latin 3 */ \
486 CP_ALIAS(28593, "iso_8859-3") \
487 CP_ALIAS(28593, "iso_8859_3") \
488 CP_ALIAS(28594, "iso-8859-4") /* ISO 8859-4 Baltic */ \
489 CP_ALIAS(28594, "iso8859-4") /* ISO 8859-4 Baltic */ \
490 CP_ALIAS(28594, "iso_8859-4") \
491 CP_ALIAS(28594, "iso_8859_4") \
492 CP_ALIAS(28595, "iso-8859-5") /* ISO 8859-5 Cyrillic */ \
493 CP_ALIAS(28595, "iso8859-5") /* ISO 8859-5 Cyrillic */ \
494 CP_ALIAS(28595, "iso_8859-5") \
495 CP_ALIAS(28595, "iso_8859_5") \
496 CP_ALIAS(28596, "iso-8859-6") /* ISO 8859-6 Arabic */ \
497 CP_ALIAS(28596, "iso8859-6") /* ISO 8859-6 Arabic */ \
498 CP_ALIAS(28596, "iso_8859-6") \
499 CP_ALIAS(28596, "iso_8859_6") \
500 CP_ALIAS(28597, "iso-8859-7") /* ISO 8859-7 Greek */ \
501 CP_ALIAS(28597, "iso8859-7") /* ISO 8859-7 Greek */ \
502 CP_ALIAS(28597, "iso_8859-7") \
503 CP_ALIAS(28597, "iso_8859_7") \
504 CP_ALIAS(28598, "iso-8859-8") /* ISO 8859-8 Hebrew; Hebrew (ISO-Visual) */ \
505 CP_ALIAS(28598, "iso8859-8") /* ISO 8859-8 Hebrew; Hebrew (ISO-Visual) */ \
506 CP_ALIAS(28598, "iso_8859-8") \
507 CP_ALIAS(28598, "iso_8859_8") \
508 CP_ALIAS(28599, "iso-8859-9") /* ISO 8859-9 Turkish */ \
509 CP_ALIAS(28599, "iso8859-9") /* ISO 8859-9 Turkish */ \
510 CP_ALIAS(28599, "iso_8859-9") \
511 CP_ALIAS(28599, "iso_8859_9") \
512 CP_ALIAS(28603, "iso-8859-13") /* ISO 8859-13 Estonian */ \
513 CP_ALIAS(28603, "iso8859-13") /* ISO 8859-13 Estonian */ \
514 CP_ALIAS(28603, "iso_8859-13") \
515 CP_ALIAS(28603, "iso_8859_13") \
516 CP_ALIAS(28605, "iso-8859-15") /* ISO 8859-15 Latin 9 */ \
517 CP_ALIAS(28605, "iso8859-15") /* ISO 8859-15 Latin 9 */ \
518 CP_ALIAS(28605, "iso_8859-15") \
519 CP_ALIAS(28605, "iso_8859_15") \
520 CP_ALIAS(29001, "x-Europa") /* Europa 3 */ \
521 CP_ALIAS(38598, "iso-8859-8-i") /* ISO 8859-8 Hebrew; Hebrew (ISO-Logical) */ \
522 CP_ALIAS(38598, "iso8859-8-i") /* ISO 8859-8 Hebrew; Hebrew (ISO-Logical) */ \
523 CP_ALIAS(38598, "iso_8859-8-i") \
524 CP_ALIAS(38598, "iso_8859_8-i") \
525 CP_ALIAS(50220, "iso-2022-jp") /* ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS) */ \
526 CP_ALIAS(50221, "csISO2022JP") /* ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana) */ \
527 CP_ALIAS(50222, "iso-2022-jp") /* ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI) */ \
528 CP_ALIAS(50225, "iso-2022-kr") /* ISO 2022 Korean */ \
529 CP_ALIAS(50225, "iso2022-kr") /* ISO 2022 Korean */ \
530 CP_ALIAS(50227, "x-cp50227") /* ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022) */ \
531 /* 50229 ISO 2022 Traditional Chinese */ \
532 /* 50930 EBCDIC Japanese (Katakana) Extended */ \
533 /* 50931 EBCDIC US-Canada and Japanese */ \
534 /* 50933 EBCDIC Korean Extended and Korean */ \
535 /* 50935 EBCDIC Simplified Chinese Extended and Simplified Chinese */ \
536 /* 50936 EBCDIC Simplified Chinese */ \
537 /* 50937 EBCDIC US-Canada and Traditional Chinese */ \
538 /* 50939 EBCDIC Japanese (Latin) Extended and Japanese */ \
539 CP_ALIAS(51932, "euc-jp") /* EUC Japanese */ \
540 CP_ALIAS(51936, "EUC-CN") /* EUC Simplified Chinese; Chinese Simplified (EUC) */ \
541 CP_ALIAS(51949, "euc-kr") /* EUC Korean */ \
542 /* 51950 EUC Traditional Chinese */ \
543 CP_ALIAS(52936, "hz-gb-2312") /* HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ) */ \
544 CP_ALIAS(54936, "GB18030") /* Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030) */ \
545 CP_ALIAS(57002, "x-iscii-de") /* ISCII Devanagari */ \
546 CP_ALIAS(57003, "x-iscii-be") /* ISCII Bengali */ \
547 CP_ALIAS(57004, "x-iscii-ta") /* ISCII Tamil */ \
548 CP_ALIAS(57005, "x-iscii-te") /* ISCII Telugu */ \
549 CP_ALIAS(57006, "x-iscii-as") /* ISCII Assamese */ \
550 CP_ALIAS(57007, "x-iscii-or") /* ISCII Oriya */ \
551 CP_ALIAS(57008, "x-iscii-ka") /* ISCII Kannada */ \
552 CP_ALIAS(57009, "x-iscii-ma") /* ISCII Malayalam */ \
553 CP_ALIAS(57010, "x-iscii-gu") /* ISCII Gujarati */ \
554 CP_ALIAS(57011, "x-iscii-pa") /* ISCII Punjabi */
555
/*
 * Code page numbers of CP_ALIAS_LIST, in list order.  Entry i belongs to
 * the i-th name stored in cp_alias.
 */
#define CP_ALIAS(codepage, alias) codepage,
static const int cp_codepage[] = {
	CP_ALIAS_LIST
};
#undef CP_ALIAS
561
/*
 * Alias names of CP_ALIAS_LIST, in list order, packed into one string.
 * Each name is followed by a NUL; the string literal's own terminating
 * NUL then yields an empty name that marks the end of the list.
 */
#define CP_ALIAS(codepage, alias) alias"\0"
static const char cp_alias[] ALIGN1 =
	CP_ALIAS_LIST;
#undef CP_ALIAS
566
567/*
568 * SJIS SHIFTJIS table CP932 table
569 * ---- --------------------------- --------------------------------
570 * 5C U+00A5 YEN SIGN U+005C REVERSE SOLIDUS
571 * 7E U+203E OVERLINE U+007E TILDE
572 * 815C U+2014 EM DASH U+2015 HORIZONTAL BAR
573 * 815F U+005C REVERSE SOLIDUS U+FF3C FULLWIDTH REVERSE SOLIDUS
574 * 8160 U+301C WAVE DASH U+FF5E FULLWIDTH TILDE
575 * 8161 U+2016 DOUBLE VERTICAL LINE U+2225 PARALLEL TO
576 * 817C U+2212 MINUS SIGN U+FF0D FULLWIDTH HYPHEN-MINUS
577 * 8191 U+00A2 CENT SIGN U+FFE0 FULLWIDTH CENT SIGN
578 * 8192 U+00A3 POUND SIGN U+FFE1 FULLWIDTH POUND SIGN
579 * 81CA U+00AC NOT SIGN U+FFE2 FULLWIDTH NOT SIGN
580 *
581 * EUC-JP and ISO-2022-JP should be compatible with CP932.
582 *
583 * The kernel and MLang APIs use different Unicode mapping tables, so
584 * check which API is in use.
585 */
/*
 * Compatibility table for CP932: {in, out, flag}.  All entries are
 * COMPAT_OUT only, so they take effect when CP932 is the destination:
 * iconv() replaces `in` with `out` before encoding.
 */
static compat_t cp932_compat[] = {
	{0x00A5, 0x005C, COMPAT_OUT},
	{0x203E, 0x007E, COMPAT_OUT},
	{0x2014, 0x2015, COMPAT_OUT},
	{0x301C, 0xFF5E, COMPAT_OUT},
	{0x2016, 0x2225, COMPAT_OUT},
	{0x2212, 0xFF0D, COMPAT_OUT},
	{0x00A2, 0xFFE0, COMPAT_OUT},
	{0x00A3, 0xFFE1, COMPAT_OUT},
	{0x00AC, 0xFFE2, COMPAT_OUT},
	{0, 0, 0}	/* terminator: in == 0 */
};
598
/*
 * Compatibility table for code page 20932: {in, out, flag}.  Entries
 * flagged COMPAT_OUT|COMPAT_IN apply in both directions: when decoding,
 * iconv() replaces `out` with `in`; when encoding, `in` with `out`.
 */
static compat_t cp20932_compat[] = {
	{0x00A5, 0x005C, COMPAT_OUT},
	{0x203E, 0x007E, COMPAT_OUT},
	{0x2014, 0x2015, COMPAT_OUT},
	{0xFF5E, 0x301C, COMPAT_OUT|COMPAT_IN},
	{0x2225, 0x2016, COMPAT_OUT|COMPAT_IN},
	{0xFF0D, 0x2212, COMPAT_OUT|COMPAT_IN},
	{0xFFE0, 0x00A2, COMPAT_OUT|COMPAT_IN},
	{0xFFE1, 0x00A3, COMPAT_OUT|COMPAT_IN},
	{0xFFE2, 0x00AC, COMPAT_OUT|COMPAT_IN},
	{0, 0, 0}	/* terminator: in == 0 */
};
611
/* Code page 51932 (EUC-JP) shares the CP932 compatibility table. */
static compat_t *cp51932_compat = cp932_compat;

/*
 * Table for code pages 50220/50221/50222.
 * cp20932_compat for kernel. cp932_compat for mlang.
 */
static compat_t *cp5022x_compat = cp932_compat;
616
/* Signature of mlang.dll's ConvertINetMultiByteToUnicode entry point. */
typedef HRESULT (WINAPI *CONVERTINETMULTIBYTETOUNICODE)(
	LPDWORD lpdwMode,
	DWORD dwSrcEncoding,
	LPCSTR lpSrcStr,
	LPINT lpnMultiCharCount,
	LPWSTR lpDstStr,
	LPINT lpnWideCharCount
);

/* Signature of mlang.dll's ConvertINetUnicodeToMultiByte entry point. */
typedef HRESULT (WINAPI *CONVERTINETUNICODETOMULTIBYTE)(
	LPDWORD lpdwMode,
	DWORD dwEncoding,
	LPCWSTR lpSrcStr,
	LPINT lpnWideCharCount,
	LPSTR lpDstStr,
	LPINT lpnMultiCharCount
);

/* Resolved at run time by load_mlang(); NULL until then. */
static CONVERTINETMULTIBYTETOUNICODE ConvertINetMultiByteToUnicode;
static CONVERTINETUNICODETOMULTIBYTE ConvertINetUnicodeToMultiByte;
637
638static int
639load_mlang(void)
640{
641 HMODULE h;
642 if (ConvertINetMultiByteToUnicode != NULL)
643 return TRUE;
644 h = LoadLibrary(TEXT("mlang.dll"));
645 if (!h)
646 return FALSE;
647 ConvertINetMultiByteToUnicode = (CONVERTINETMULTIBYTETOUNICODE)GetProcAddressA(h, "ConvertINetMultiByteToUnicode");
648 ConvertINetUnicodeToMultiByte = (CONVERTINETUNICODETOMULTIBYTE)GetProcAddressA(h, "ConvertINetUnicodeToMultiByte");
649 return TRUE;
650}
651
652static iconv_t
653iconv_open(const char *tocode, const char *fromcode)
654{
655 rec_iconv_t *cd;
656
657 cd = (rec_iconv_t *)xzalloc(sizeof(rec_iconv_t));
658
659 /* reset the errno to prevent reporting wrong error code.
660 * 0 for unsorted error. */
661 errno = 0;
662 if (make_csconv(fromcode, &cd->from) && make_csconv(tocode, &cd->to)) {
663 cd->cd = (iconv_t)cd;
664 return (iconv_t)cd;
665 }
666
667 free(cd);
668 return (iconv_t)(-1);
669}
670
671static int
672iconv_close(iconv_t _cd)
673{
674 free(_cd);
675 return 0;
676}
677
678static size_t
679iconv(iconv_t _cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft)
680{
681 rec_iconv_t *cd = (rec_iconv_t *)_cd;
682 ushort wbuf[MB_CHAR_MAX]; /* enough room for one character */
683 int insize;
684 int outsize;
685 int wsize;
686 DWORD frommode;
687 DWORD tomode;
688 uint wc;
689 compat_t *cp;
690 int i;
691
692 if (inbuf == NULL || *inbuf == NULL)
693 {
694 if (outbuf != NULL && *outbuf != NULL && cd->to.flush != NULL)
695 {
696 tomode = cd->to.mode;
697 outsize = cd->to.flush(&cd->to, (uchar *)*outbuf, *outbytesleft);
698 if (outsize == -1)
699 {
700 if ((cd->to.flags & FLAG_IGNORE) && errno != E2BIG)
701 {
702 outsize = 0;
703 }
704 else
705 {
706 cd->to.mode = tomode;
707 return (size_t)(-1);
708 }
709 }
710 *outbuf += outsize;
711 *outbytesleft -= outsize;
712 }
713 cd->from.mode = 0;
714 cd->to.mode = 0;
715 return 0;
716 }
717
718 while (*inbytesleft != 0)
719 {
720 frommode = cd->from.mode;
721 tomode = cd->to.mode;
722 wsize = MB_CHAR_MAX;
723
724 insize = cd->from.mbtowc(&cd->from, (const uchar *)*inbuf, *inbytesleft, wbuf, &wsize);
725 if (insize == -1)
726 {
727 if (cd->to.flags & FLAG_IGNORE)
728 {
729 cd->from.mode = frommode;
730 insize = 1;
731 wsize = 0;
732 }
733 else
734 {
735 cd->from.mode = frommode;
736 return (size_t)(-1);
737 }
738 }
739
740 if (wsize == 0)
741 {
742 *inbuf += insize;
743 *inbytesleft -= insize;
744 continue;
745 }
746
747 if (cd->from.compat != NULL)
748 {
749 wc = utf16_to_ucs4(wbuf);
750 cp = cd->from.compat;
751 for (i = 0; cp[i].in != 0; ++i)
752 {
753 if ((cp[i].flag & COMPAT_IN) && cp[i].out == wc)
754 {
755 ucs4_to_utf16(cp[i].in, wbuf, &wsize);
756 break;
757 }
758 }
759 }
760
761 if (cd->to.compat != NULL)
762 {
763 wc = utf16_to_ucs4(wbuf);
764 cp = cd->to.compat;
765 for (i = 0; cp[i].in != 0; ++i)
766 {
767 if ((cp[i].flag & COMPAT_OUT) && cp[i].in == wc)
768 {
769 ucs4_to_utf16(cp[i].out, wbuf, &wsize);
770 break;
771 }
772 }
773 }
774
775 outsize = cd->to.wctomb(&cd->to, wbuf, wsize, (uchar *)*outbuf, *outbytesleft);
776 if (outsize == -1)
777 {
778 if ((cd->to.flags & FLAG_IGNORE) && errno != E2BIG)
779 {
780 cd->to.mode = tomode;
781 outsize = 0;
782 }
783 else
784 {
785 cd->from.mode = frommode;
786 cd->to.mode = tomode;
787 return (size_t)(-1);
788 }
789 }
790
791 *inbuf += insize;
792 *outbuf += outsize;
793 *inbytesleft -= insize;
794 *outbytesleft -= outsize;
795 }
796
797 return 0;
798}
799
800static int
801make_csconv(const char *_name, csconv_t *cv)
802{
803 CPINFO cpinfo;
804 int use_compat = TRUE;
805 int flag = 0;
806 char *name;
807 char *p, *s;
808
809 name = xstrdup(_name);
810
811 /* check for option "enc_name//opt1//opt2" */
812 while ((p = strrstr(name, "//")) != NULL)
813 {
814 for (s = p + 2; *s; ++s)
815 *s = tolower(*s);
816 switch (index_in_strings("nocompat\0translit\0ignore\0", p + 2)) {
817 case 0:
818 use_compat = FALSE;
819 break;
820 case 1:
821 flag |= FLAG_TRANSLIT;
822 break;
823 case 2:
824 flag |= FLAG_IGNORE;
825 break;
826 }
827 *p = 0;
828 }
829
830 cv->mode = 0;
831 cv->flags = flag;
832 cv->mblen = NULL;
833 cv->flush = NULL;
834 cv->compat = NULL;
835 cv->codepage = name_to_codepage(name);
836 if (cv->codepage == 1200 || cv->codepage == 1201)
837 {
838 cv->mbtowc = utf16_mbtowc;
839 cv->wctomb = utf16_wctomb;
840 if (_stricmp(name, "UTF-16") == 0 || _stricmp(name, "UTF16") == 0 ||
841 _stricmp(name, "UCS-2") == 0 || _stricmp(name, "UCS2") == 0 ||
842 _stricmp(name,"UCS-2-INTERNAL") == 0)
843 cv->flags |= FLAG_USE_BOM;
844 }
845 else if (cv->codepage == 12000 || cv->codepage == 12001)
846 {
847 cv->mbtowc = utf32_mbtowc;
848 cv->wctomb = utf32_wctomb;
849 if (_stricmp(name, "UTF-32") == 0 || _stricmp(name, "UTF32") == 0 ||
850 _stricmp(name, "UCS-4") == 0 || _stricmp(name, "UCS4") == 0)
851 cv->flags |= FLAG_USE_BOM;
852 }
853 else if (cv->codepage == 65001)
854 {
855 cv->mbtowc = kernel_mbtowc;
856 cv->wctomb = kernel_wctomb;
857 cv->mblen = utf8_mblen;
858 }
859 else if ((cv->codepage == 50220 || cv->codepage == 50221 || cv->codepage == 50222) && load_mlang())
860 {
861 cv->mbtowc = iso2022jp_mbtowc;
862 cv->wctomb = iso2022jp_wctomb;
863 cv->flush = iso2022jp_flush;
864 }
865 else if (cv->codepage == 51932 && load_mlang())
866 {
867 cv->mbtowc = mlang_mbtowc;
868 cv->wctomb = mlang_wctomb;
869 cv->mblen = eucjp_mblen;
870 }
871 else if (IsValidCodePage(cv->codepage)
872 && GetCPInfo(cv->codepage, &cpinfo) != 0)
873 {
874 cv->mbtowc = kernel_mbtowc;
875 cv->wctomb = kernel_wctomb;
876 if (cpinfo.MaxCharSize == 1)
877 cv->mblen = sbcs_mblen;
878 else if (cpinfo.MaxCharSize == 2)
879 cv->mblen = dbcs_mblen;
880 else
881 cv->mblen = mbcs_mblen;
882 }
883 else
884 {
885 /* not supported */
886 free(name);
887 errno = EINVAL;
888 return FALSE;
889 }
890
891 if (use_compat)
892 {
893 switch (cv->codepage)
894 {
895 case 932: cv->compat = cp932_compat; break;
896 case 20932: cv->compat = cp20932_compat; break;
897 case 51932: cv->compat = cp51932_compat; break;
898 case 50220: case 50221: case 50222: cv->compat = cp5022x_compat; break;
899 }
900 }
901
902 free(name);
903
904 return TRUE;
905}
906
907static int
908name_to_codepage(const char *name)
909{
910 int i;
911 const char *alias;
912
913 if (*name == '\0' || strcmp(name, "char") == 0)
914 return GetACP();
915 else if (strcmp(name, "wchar_t") == 0)
916 return 1200;
917 else if (_strnicmp(name, "cp", 2) == 0)
918 return atoi(name + 2); /* CP123 */
919 else if ('0' <= name[0] && name[0] <= '9')
920 return atoi(name); /* 123 */
921 else if (_strnicmp(name, "xx", 2) == 0)
922 return atoi(name + 2); /* XX123 for debug */
923
924 i = 0;
925 alias = cp_alias;
926 while (*alias) {
927 if (_stricmp(alias, name) == 0) {
928 return cp_codepage[i];
929 }
930 alias += strlen(alias) + 1;
931 ++i;
932 }
933 return -1;
934}
935
/*
 * Decode one code point from UTF-16 code units.
 * See http://www.faqs.org/rfcs/rfc2781.html
 *
 * When wbuf[0] is a high surrogate (0xD800..0xDBFF) it is combined with
 * wbuf[1] into a supplementary-plane value; otherwise wbuf[0] is
 * returned unchanged and wbuf[1] is not read.  wbuf[1] is not checked
 * for being a low surrogate.
 */
static uint
utf16_to_ucs4(const ushort *wbuf)
{
    const uint lead = wbuf[0];

    if (lead < 0xD800 || lead > 0xDBFF)
        return lead;

    return ((lead & 0x3FF) << 10) + (wbuf[1] & 0x3FF) + 0x10000;
}
947
/*
 * Encode the code point wc as UTF-16 code units in wbuf.
 *
 * Values below 0x10000 are stored as a single unit; larger values are
 * stored as a high/low surrogate pair.  *wbufsize receives the number
 * of units written (1 or 2).  wc is not range-checked here.
 */
static void
ucs4_to_utf16(uint wc, ushort *wbuf, int *wbufsize)
{
    uint offset;

    if (wc >= 0x10000)
    {
        offset = wc - 0x10000;
        wbuf[0] = 0xD800 | ((offset >> 10) & 0x3FF);
        wbuf[1] = 0xDC00 | (offset & 0x3FF);
        *wbufsize = 2;
        return;
    }

    wbuf[0] = wc;
    *wbufsize = 1;
}
964
965/*
966 * Check if codepage is one of those for which the dwFlags parameter
967 * to MultiByteToWideChar() must be zero. Return zero or
968 * MB_ERR_INVALID_CHARS. The docs in Platform SDK for Windows
969 * Server 2003 R2 claims that also codepage 65001 is one of these, but
970 * that doesn't seem to be the case. The MSDN docs for MSVS2008 leave
971 * out 65001 (UTF-8), and that indeed seems to be the case on XP, it
972 * works fine to pass MB_ERR_INVALID_CHARS in dwFlags when converting
973 * from UTF-8.
974 */
975static int
976mbtowc_flags(int codepage)
977{
978 return (codepage == 50220 || codepage == 50221 ||
979 codepage == 50222 || codepage == 50225 ||
980 codepage == 50227 || codepage == 50229 ||
981 codepage == 52936 || codepage == 54936 ||
982 (codepage >= 57002 && codepage <= 57011) ||
983 codepage == 65000 || codepage == 42) ? 0 : MB_ERR_INVALID_CHARS;
984}
985
/*
 * Return 1 when the lpUsedDefaultChar parameter of
 * WideCharToMultiByte() must be NULL for the given code page, else 0.
 *
 * The docs in Platform SDK for Windows Server 2003 R2 give the list
 * used below, while the MSDN docs for MSVS2008 claim the restriction
 * applies only to 65000 (UTF-7) and 65001 (UTF-8).  This time the
 * earlier Platform SDK seems to be correct, at least for XP.
 */
static int
must_use_null_useddefaultchar(int codepage)
{
    if (codepage >= 57002 && codepage <= 57011)
        return 1;

    switch (codepage)
    {
    case 42:
    case 50220:
    case 50221:
    case 50222:
    case 50225:
    case 50227:
    case 50229:
    case 52936:
    case 54936:
    case 65000:
    case 65001:
        return 1;
    default:
        return 0;
    }
}
1005
/*
 * Store err in errno and return -1, so a failing converter can write
 * "return seterror(EILSEQ);" as a single statement.
 */
static int
seterror(int err)
{
    const int failure = -1;

    errno = err;
    return failure;
}
1012
/*
 * Character-length callback that always reports one byte.
 * make_csconv() installs it for code pages whose CPINFO MaxCharSize
 * is 1.  None of the arguments is examined.
 */
static int
sbcs_mblen(csconv_t *cv UNUSED_PARAM, const uchar *buf UNUSED_PARAM,
        int bufsize UNUSED_PARAM)
{
    return 1;
}
1019
1020static int
1021dbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize)
1022{
1023 int len = IsDBCSLeadByteEx(cv->codepage, buf[0]) ? 2 : 1;
1024 if (bufsize < len)
1025 return seterror(EINVAL);
1026 return len;
1027}
1028
1029static int
1030mbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize)
1031{
1032 int len = 0;
1033
1034 if (cv->codepage == 54936) {
1035 if (buf[0] <= 0x7F)
1036 len = 1;
1037 else if (buf[0] >= 0x81 && buf[0] <= 0xFE &&
1038 bufsize >= 2 &&
1039 ((buf[1] >= 0x40 && buf[1] <= 0x7E) ||
1040 (buf[1] >= 0x80 && buf[1] <= 0xFE)))
1041 len = 2;
1042 else if (buf[0] >= 0x81 && buf[0] <= 0xFE &&
1043 bufsize >= 4 &&
1044 buf[1] >= 0x30 && buf[1] <= 0x39)
1045 len = 4;
1046 else
1047 return seterror(EINVAL);
1048 return len;
1049 }
1050 else
1051 return seterror(EINVAL);
1052}
1053
1054static int
1055utf8_mblen(csconv_t *cv UNUSED_PARAM, const uchar *buf, int bufsize)
1056{
1057 int len = 0;
1058
1059 if (buf[0] < 0x80) len = 1;
1060 else if ((buf[0] & 0xE0) == 0xC0) len = 2;
1061 else if ((buf[0] & 0xF0) == 0xE0) len = 3;
1062 else if ((buf[0] & 0xF8) == 0xF0) len = 4;
1063 else if ((buf[0] & 0xFC) == 0xF8) len = 5;
1064 else if ((buf[0] & 0xFE) == 0xFC) len = 6;
1065
1066 if (len == 0)
1067 return seterror(EILSEQ);
1068 else if (bufsize < len)
1069 return seterror(EINVAL);
1070 return len;
1071}
1072
1073static int
1074eucjp_mblen(csconv_t *cv UNUSED_PARAM, const uchar *buf, int bufsize)
1075{
1076 if (buf[0] < 0x80) /* ASCII */
1077 return 1;
1078 else if (buf[0] == 0x8E) /* JIS X 0201 */
1079 {
1080 if (bufsize < 2)
1081 return seterror(EINVAL);
1082 else if (!(0xA1 <= buf[1] && buf[1] <= 0xDF))
1083 return seterror(EILSEQ);
1084 return 2;
1085 }
1086 else if (buf[0] == 0x8F) /* JIS X 0212 */
1087 {
1088 if (bufsize < 3)
1089 return seterror(EINVAL);
1090 else if (!(0xA1 <= buf[1] && buf[1] <= 0xFE)
1091 || !(0xA1 <= buf[2] && buf[2] <= 0xFE))
1092 return seterror(EILSEQ);
1093 return 3;
1094 }
1095 else /* JIS X 0208 */
1096 {
1097 if (bufsize < 2)
1098 return seterror(EINVAL);
1099 else if (!(0xA1 <= buf[0] && buf[0] <= 0xFE)
1100 || !(0xA1 <= buf[1] && buf[1] <= 0xFE))
1101 return seterror(EILSEQ);
1102 return 2;
1103 }
1104}
1105
1106static int
1107kernel_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize)
1108{
1109 int len;
1110
1111 len = cv->mblen(cv, buf, bufsize);
1112 if (len == -1)
1113 return -1;
1114 /* If converting from ASCII, reject 8bit
1115 * chars. MultiByteToWideChar() doesn't. Note that for ASCII we
1116 * know that the mblen function is sbcs_mblen() so len is 1.
1117 */
1118 if (cv->codepage == 20127 && buf[0] >= 0x80)
1119 return seterror(EILSEQ);
1120 *wbufsize = MultiByteToWideChar(cv->codepage, mbtowc_flags (cv->codepage),
1121 (const char *)buf, len, (wchar_t *)wbuf, *wbufsize);
1122 if (*wbufsize == 0)
1123 return seterror(EILSEQ);
1124 return len;
1125}
1126
1127static int
1128kernel_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize)
1129{
1130 BOOL usedDefaultChar = 0;
1131 BOOL *p = NULL;
1132 int flags = 0;
1133 int len;
1134
1135 if (bufsize == 0)
1136 return seterror(E2BIG);
1137 if (!must_use_null_useddefaultchar(cv->codepage))
1138 {
1139 p = &usedDefaultChar;
1140#ifdef WC_NO_BEST_FIT_CHARS
1141 if (!(cv->flags & FLAG_TRANSLIT))
1142 flags |= WC_NO_BEST_FIT_CHARS;
1143#endif
1144 }
1145 len = WideCharToMultiByte(cv->codepage, flags,
1146 (const wchar_t *)wbuf, wbufsize, (char *)buf, bufsize, NULL, p);
1147 if (len == 0)
1148 {
1149 if (GetLastError() == ERROR_INSUFFICIENT_BUFFER)
1150 return seterror(E2BIG);
1151 return seterror(EILSEQ);
1152 }
1153 else if (usedDefaultChar && !(cv->flags & FLAG_TRANSLIT))
1154 return seterror(EILSEQ);
1155 else if (cv->mblen(cv, buf, len) != len) /* validate result */
1156 return seterror(EILSEQ);
1157 return len;
1158}
1159
1160/*
1161 * It seems that the mode (cv->mode) is fixnum.
1162 * For example, when converting iso-2022-jp(cp50221) to unicode:
1163 * in ascii sequence: mode=0xC42C0000
1164 * in jisx0208 sequence: mode=0xC42C0001
1165 * "C42C" is same for each convert session.
1166 * It should be: ((codepage-1)<<16)|state
1167 */
1168static int
1169mlang_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize)
1170{
1171 int len;
1172 int insize;
1173 HRESULT hr;
1174
1175 len = cv->mblen(cv, buf, bufsize);
1176 if (len == -1)
1177 return -1;
1178 insize = len;
1179 hr = ConvertINetMultiByteToUnicode(&cv->mode, cv->codepage,
1180 (const char *)buf, &insize, (wchar_t *)wbuf, wbufsize);
1181 if (hr != S_OK || insize != len)
1182 return seterror(EILSEQ);
1183 return len;
1184}
1185
1186static int
1187mlang_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize)
1188{
1189 char tmpbuf[MB_CHAR_MAX]; /* enough room for one character */
1190 int tmpsize = MB_CHAR_MAX;
1191 int insize = wbufsize;
1192 HRESULT hr;
1193
1194 hr = ConvertINetUnicodeToMultiByte(&cv->mode, cv->codepage,
1195 (const wchar_t *)wbuf, &wbufsize, tmpbuf, &tmpsize);
1196 if (hr != S_OK || insize != wbufsize)
1197 return seterror(EILSEQ);
1198 else if (bufsize < tmpsize)
1199 return seterror(E2BIG);
1200 else if (cv->mblen(cv, (uchar *)tmpbuf, tmpsize) != tmpsize)
1201 return seterror(EILSEQ);
1202 memcpy(buf, tmpbuf, tmpsize);
1203 return tmpsize;
1204}
1205
1206static int
1207utf16_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize)
1208{
1209 int codepage = cv->codepage;
1210
1211 /* swap endian: 1200 <-> 1201 */
1212 if (cv->mode & UNICODE_MODE_SWAPPED)
1213 codepage ^= 1;
1214
1215 if (bufsize < 2)
1216 return seterror(EINVAL);
1217 if (codepage == 1200) /* little endian */
1218 wbuf[0] = (buf[1] << 8) | buf[0];
1219 else if (codepage == 1201) /* big endian */
1220 wbuf[0] = (buf[0] << 8) | buf[1];
1221
1222 if ((cv->flags & FLAG_USE_BOM) && !(cv->mode & UNICODE_MODE_BOM_DONE))
1223 {
1224 cv->mode |= UNICODE_MODE_BOM_DONE;
1225 if (wbuf[0] == 0xFFFE)
1226 {
1227 cv->mode |= UNICODE_MODE_SWAPPED;
1228 *wbufsize = 0;
1229 return 2;
1230 }
1231 else if (wbuf[0] == 0xFEFF)
1232 {
1233 *wbufsize = 0;
1234 return 2;
1235 }
1236 }
1237
1238 if (0xDC00 <= wbuf[0] && wbuf[0] <= 0xDFFF)
1239 return seterror(EILSEQ);
1240 if (0xD800 <= wbuf[0] && wbuf[0] <= 0xDBFF)
1241 {
1242 if (bufsize < 4)
1243 return seterror(EINVAL);
1244 if (codepage == 1200) /* little endian */
1245 wbuf[1] = (buf[3] << 8) | buf[2];
1246 else if (codepage == 1201) /* big endian */
1247 wbuf[1] = (buf[2] << 8) | buf[3];
1248 if (!(0xDC00 <= wbuf[1] && wbuf[1] <= 0xDFFF))
1249 return seterror(EILSEQ);
1250 *wbufsize = 2;
1251 return 4;
1252 }
1253 *wbufsize = 1;
1254 return 2;
1255}
1256
1257static int
1258utf16_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize)
1259{
1260 if ((cv->flags & FLAG_USE_BOM) && !(cv->mode & UNICODE_MODE_BOM_DONE))
1261 {
1262 int r;
1263
1264 cv->mode |= UNICODE_MODE_BOM_DONE;
1265 if (bufsize < 2)
1266 return seterror(E2BIG);
1267 if (cv->codepage == 1200) /* little endian */
1268 memcpy(buf, "\xFF\xFE", 2);
1269 else if (cv->codepage == 1201) /* big endian */
1270 memcpy(buf, "\xFE\xFF", 2);
1271
1272 r = utf16_wctomb(cv, wbuf, wbufsize, buf + 2, bufsize - 2);
1273 if (r == -1)
1274 return -1;
1275 return r + 2;
1276 }
1277
1278 if (bufsize < 2)
1279 return seterror(E2BIG);
1280 if (cv->codepage == 1200) /* little endian */
1281 {
1282 buf[0] = (wbuf[0] & 0x00FF);
1283 buf[1] = (wbuf[0] & 0xFF00) >> 8;
1284 }
1285 else if (cv->codepage == 1201) /* big endian */
1286 {
1287 buf[0] = (wbuf[0] & 0xFF00) >> 8;
1288 buf[1] = (wbuf[0] & 0x00FF);
1289 }
1290 if (0xD800 <= wbuf[0] && wbuf[0] <= 0xDBFF)
1291 {
1292 if (bufsize < 4)
1293 return seterror(E2BIG);
1294 if (cv->codepage == 1200) /* little endian */
1295 {
1296 buf[2] = (wbuf[1] & 0x00FF);
1297 buf[3] = (wbuf[1] & 0xFF00) >> 8;
1298 }
1299 else if (cv->codepage == 1201) /* big endian */
1300 {
1301 buf[2] = (wbuf[1] & 0xFF00) >> 8;
1302 buf[3] = (wbuf[1] & 0x00FF);
1303 }
1304 return 4;
1305 }
1306 return 2;
1307}
1308
1309static int
1310utf32_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize)
1311{
1312 int codepage = cv->codepage;
1313 uint wc = 0xD800;
1314
1315 /* swap endian: 12000 <-> 12001 */
1316 if (cv->mode & UNICODE_MODE_SWAPPED)
1317 codepage ^= 1;
1318
1319 if (bufsize < 4)
1320 return seterror(EINVAL);
1321 if (codepage == 12000) /* little endian */
1322 wc = (buf[3] << 24) | (buf[2] << 16) | (buf[1] << 8) | buf[0];
1323 else if (codepage == 12001) /* big endian */
1324 wc = (buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3];
1325
1326 if ((cv->flags & FLAG_USE_BOM) && !(cv->mode & UNICODE_MODE_BOM_DONE))
1327 {
1328 cv->mode |= UNICODE_MODE_BOM_DONE;
1329 if (wc == 0xFFFE0000)
1330 {
1331 cv->mode |= UNICODE_MODE_SWAPPED;
1332 *wbufsize = 0;
1333 return 4;
1334 }
1335 else if (wc == 0x0000FEFF)
1336 {
1337 *wbufsize = 0;
1338 return 4;
1339 }
1340 }
1341
1342 if ((0xD800 <= wc && wc <= 0xDFFF) || 0x10FFFF < wc)
1343 return seterror(EILSEQ);
1344 ucs4_to_utf16(wc, wbuf, wbufsize);
1345 return 4;
1346}
1347
1348static int
1349utf32_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize)
1350{
1351 uint wc;
1352
1353 if ((cv->flags & FLAG_USE_BOM) && !(cv->mode & UNICODE_MODE_BOM_DONE))
1354 {
1355 int r;
1356
1357 cv->mode |= UNICODE_MODE_BOM_DONE;
1358 if (bufsize < 4)
1359 return seterror(E2BIG);
1360 if (cv->codepage == 12000) /* little endian */
1361 memcpy(buf, "\xFF\xFE\x00\x00", 4);
1362 else if (cv->codepage == 12001) /* big endian */
1363 memcpy(buf, "\x00\x00\xFE\xFF", 4);
1364
1365 r = utf32_wctomb(cv, wbuf, wbufsize, buf + 4, bufsize - 4);
1366 if (r == -1)
1367 return -1;
1368 return r + 4;
1369 }
1370
1371 if (bufsize < 4)
1372 return seterror(E2BIG);
1373 wc = utf16_to_ucs4(wbuf);
1374 if (cv->codepage == 12000) /* little endian */
1375 {
1376 buf[0] = wc & 0x000000FF;
1377 buf[1] = (wc & 0x0000FF00) >> 8;
1378 buf[2] = (wc & 0x00FF0000) >> 16;
1379 buf[3] = (wc & 0xFF000000) >> 24;
1380 }
1381 else if (cv->codepage == 12001) /* big endian */
1382 {
1383 buf[0] = (wc & 0xFF000000) >> 24;
1384 buf[1] = (wc & 0x00FF0000) >> 16;
1385 buf[2] = (wc & 0x0000FF00) >> 8;
1386 buf[3] = wc & 0x000000FF;
1387 }
1388 return 4;
1389}
1390
1391/*
1392 * 50220: ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)
1393 * 50221: ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow
1394 * 1 byte Kana)
1395 * 50222: ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte
1396 * Kana - SO/SI)
1397 *
1398 * MultiByteToWideChar() and WideCharToMultiByte() behave differently
1399 * depending on Windows version. On XP, WideCharToMultiByte() doesn't
1400 * terminate result sequence with ascii escape. But Vista does.
1401 * Use MLang instead.
1402 */
1403
/*
 * The ISO-2022 converter state kept in cv->mode packs two fields:
 * the active character set in bits 8..15 and the shift state in
 * bits 0..7.
 */
#define ISO2022_MODE(cs, shift) (((cs) << 8) | (shift))
#define ISO2022_MODE_CS(mode) (((mode) >> 8) & 0xFF)
#define ISO2022_MODE_SHIFT(mode) ((mode) & 0xFF)

/* shift states */
#define ISO2022_SI 0
#define ISO2022_SO 1

/* shift in */
static const char iso2022_SI_seq[] = "\x0F";
/* shift out */
static const char iso2022_SO_seq[] = "\x0E";

/* One escape sequence and the character set it selects. */
typedef struct iso2022_esc_t iso2022_esc_t;
struct iso2022_esc_t {
    const char *esc;    /* bytes of the escape sequence */
    int esc_len;        /* number of bytes in esc */
    int len;            /* bytes per character in this character set */
    int cs;             /* ISO2022JP_CS_* value stored in the mode */
};

/* Character set identifiers for ISO-2022-JP. */
#define ISO2022JP_CS_ASCII 0
#define ISO2022JP_CS_JISX0201_ROMAN 1
#define ISO2022JP_CS_JISX0201_KANA 2
#define ISO2022JP_CS_JISX0208_1978 3
#define ISO2022JP_CS_JISX0208_1983 4
#define ISO2022JP_CS_JISX0212 5

/*
 * Escape sequence table, terminated by an entry with esc == NULL.
 * The converters also index this table directly with an
 * ISO2022JP_CS_* value (iesc[cs]), so the row order must stay aligned
 * with the identifiers above.
 */
static iso2022_esc_t iso2022jp_esc[] = {
    {"\x1B\x28\x42", 3, 1, ISO2022JP_CS_ASCII},
    {"\x1B\x28\x4A", 3, 1, ISO2022JP_CS_JISX0201_ROMAN},
    {"\x1B\x28\x49", 3, 1, ISO2022JP_CS_JISX0201_KANA},
    {"\x1B\x24\x40", 3, 2, ISO2022JP_CS_JISX0208_1983}, /* unify 1978 with 1983 */
    {"\x1B\x24\x42", 3, 2, ISO2022JP_CS_JISX0208_1983},
    {"\x1B\x24\x28\x44", 4, 2, ISO2022JP_CS_JISX0212},
    {NULL, 0, 0, 0}
};
1440
/*
 * Convert the next item of an ISO-2022-JP byte stream to UTF-16.
 *
 * An item is either a control sequence (escape sequence, shift-out or
 * shift-in), which only updates cv->mode and yields *wbufsize = 0, or
 * one character in the currently selected character set.
 *
 * Each character is converted on its own: it is copied into a scratch
 * buffer behind the escape sequence of the current character set and
 * handed to MLang with a throw-away mode word, so MLang never sees
 * state from a previous call.
 *
 * Returns the number of input bytes consumed, or -1 with errno set to
 *   EINVAL  the input ends inside an escape sequence or a character
 *   EILSEQ  unknown escape sequence, a byte >= 0x80, or a failed
 *           conversion.
 */
static int
iso2022jp_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize)
{
    iso2022_esc_t *iesc = iso2022jp_esc;
    char tmp[MB_CHAR_MAX];
    int insize;
    HRESULT hr;
    DWORD dummy = 0;
    int len;
    int esc_len;
    int cs;
    int shift;
    int i;

    if (buf[0] == 0x1B)
    {
        /* escape sequence: look it up and switch character set */
        for (i = 0; iesc[i].esc != NULL; ++i)
        {
            esc_len = iesc[i].esc_len;
            if (bufsize < esc_len)
            {
                /* input is a proper prefix of this sequence: need more */
                if (strncmp((char *)buf, iesc[i].esc, bufsize) == 0)
                    return seterror(EINVAL);
            }
            else
            {
                if (strncmp((char *)buf, iesc[i].esc, esc_len) == 0)
                {
                    /* a new character set always starts shifted in */
                    cv->mode = ISO2022_MODE(iesc[i].cs, ISO2022_SI);
                    *wbufsize = 0;
                    return esc_len;
                }
            }
        }
        /* not supported escape sequence */
        return seterror(EILSEQ);
    }
    else if (buf[0] == iso2022_SO_seq[0])
    {
        /* shift out: keep the character set, change the shift state */
        cv->mode = ISO2022_MODE(ISO2022_MODE_CS(cv->mode), ISO2022_SO);
        *wbufsize = 0;
        return 1;
    }
    else if (buf[0] == iso2022_SI_seq[0])
    {
        /* shift in: keep the character set, change the shift state */
        cv->mode = ISO2022_MODE(ISO2022_MODE_CS(cv->mode), ISO2022_SI);
        *wbufsize = 0;
        return 1;
    }

    cs = ISO2022_MODE_CS(cv->mode);
    shift = ISO2022_MODE_SHIFT(cv->mode);

    /* reset the mode for informal sequence */
    if (buf[0] < 0x20)
    {
        cs = ISO2022JP_CS_ASCII;
        shift = ISO2022_SI;
    }

    /* the table is indexed by character set id to get the char length */
    len = iesc[cs].len;
    if (bufsize < len)
        return seterror(EINVAL);
    /* every byte of a character must be 7-bit */
    for (i = 0; i < len; ++i)
        if (!(buf[i] < 0x80))
            return seterror(EILSEQ);
    /* build "<escape>[<SO>]<character>" in tmp */
    esc_len = iesc[cs].esc_len;
    memcpy(tmp, iesc[cs].esc, esc_len);
    if (shift == ISO2022_SO)
    {
        memcpy(tmp + esc_len, iso2022_SO_seq, 1);
        esc_len += 1;
    }
    memcpy(tmp + esc_len, buf, len);

    if ((cv->codepage == 50220 || cv->codepage == 50221
                || cv->codepage == 50222) && shift == ISO2022_SO)
    {
        /* XXX: shift-out cannot be used for mbtowc (both kernel and
         * mlang) */
        /* rebuild tmp with the JIS X 0201 Kana escape instead */
        esc_len = iesc[ISO2022JP_CS_JISX0201_KANA].esc_len;
        memcpy(tmp, iesc[ISO2022JP_CS_JISX0201_KANA].esc, esc_len);
        memcpy(tmp + esc_len, buf, len);
    }

    insize = len + esc_len;
    hr = ConvertINetMultiByteToUnicode(&dummy, cv->codepage,
            (const char *)tmp, &insize, (wchar_t *)wbuf, wbufsize);
    if (hr != S_OK || insize != len + esc_len)
        return seterror(EILSEQ);

    /* Check for conversion error.  Assuming defaultChar is 0x3F. */
    /* ascii should be converted from ascii */
    if (wbuf[0] == buf[0]
            && cv->mode != ISO2022_MODE(ISO2022JP_CS_ASCII, ISO2022_SI))
        return seterror(EILSEQ);

    /* reset the mode for informal sequence */
    if (cv->mode != ISO2022_MODE(cs, shift))
        cv->mode = ISO2022_MODE(cs, shift);

    /* only the character bytes count as consumed input */
    return len;
}
1544
/*
 * Convert UTF-16 code units to one ISO-2022-JP character, emitting an
 * escape or shift sequence in front of it only when the character set
 * or shift state differs from the one recorded in cv->mode.
 *
 * MLang is called with a throw-away mode word, so its output always
 * starts from the initial state; the leading escape sequence it
 * produces identifies the character set of the result.
 *
 * Returns the number of bytes stored in buf and updates cv->mode, or
 * returns -1 with errno set to
 *   EILSEQ  conversion failed, the escape sequence is unknown, or the
 *           result is inconsistent
 *   E2BIG   buf is too small.
 */
static int
iso2022jp_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize)
{
    iso2022_esc_t *iesc = iso2022jp_esc;
    char tmp[MB_CHAR_MAX];
    int tmpsize = MB_CHAR_MAX;
    int insize = wbufsize;
    HRESULT hr;
    DWORD dummy = 0;
    int len;
    int esc_len;
    int cs;
    int shift;
    int i;

    /*
     * MultiByte = [escape sequence] + character + [escape sequence]
     *
     * Whether trailing escape sequence is added depends on which API is
     * used (kernel or MLang, and its version).
     */
    hr = ConvertINetUnicodeToMultiByte(&dummy, cv->codepage,
            (const wchar_t *)wbuf, &wbufsize, tmp, &tmpsize);
    if (hr != S_OK || insize != wbufsize)
        return seterror(EILSEQ);
    else if (bufsize < tmpsize)
        return seterror(E2BIG);

    if (tmpsize == 1)
    {
        /* a single output byte has no escape sequence: treat as ASCII */
        cs = ISO2022JP_CS_ASCII;
        esc_len = 0;
    }
    else
    {
        /* identify the character set from the leading escape sequence;
         * entry 0 (ASCII) is not searched */
        for (i = 1; iesc[i].esc != NULL; ++i)
        {
            esc_len = iesc[i].esc_len;
            if (strncmp(tmp, iesc[i].esc, esc_len) == 0)
            {
                cs = iesc[i].cs;
                break;
            }
        }
        if (iesc[i].esc == NULL)
            /* not supported escape sequence */
            return seterror(EILSEQ);
    }

    /* a shift-out byte may follow the escape sequence */
    shift = ISO2022_SI;
    if (tmp[esc_len] == iso2022_SO_seq[0])
    {
        shift = ISO2022_SO;
        esc_len += 1;
    }

    /* the table is indexed by character set id to get the char length */
    len = iesc[cs].len;

    /* Check for converting error.  Assuming defaultChar is 0x3F. */
    /* ascii should be converted from ascii */
    if (cs == ISO2022JP_CS_ASCII && !(wbuf[0] < 0x80))
        return seterror(EILSEQ);
    else if (tmpsize < esc_len + len)
        return seterror(EILSEQ);

    if (cv->mode == ISO2022_MODE(cs, shift))
    {
        /* remove escape sequence */
        if (esc_len != 0)
            memmove(tmp, tmp + esc_len, len);
        esc_len = 0;
    }
    else
    {
        if (cs == ISO2022JP_CS_ASCII)
        {
            /* going back to ASCII: put the ASCII escape in front */
            esc_len = iesc[ISO2022JP_CS_ASCII].esc_len;
            memmove(tmp + esc_len, tmp, len);
            memcpy(tmp, iesc[ISO2022JP_CS_ASCII].esc, esc_len);
        }
        if (ISO2022_MODE_SHIFT(cv->mode) == ISO2022_SO)
        {
            /* shift-in before changing to other mode */
            memmove(tmp + 1, tmp, len + esc_len);
            memcpy(tmp, iso2022_SI_seq, 1);
            esc_len += 1;
        }
    }

    /* re-check: the prefix may have grown since the first size test */
    if (bufsize < len + esc_len)
        return seterror(E2BIG);
    memcpy(buf, tmp, len + esc_len);
    cv->mode = ISO2022_MODE(cs, shift);
    return len + esc_len;
}
1640
1641static int
1642iso2022jp_flush(csconv_t *cv, uchar *buf, int bufsize)
1643{
1644 iso2022_esc_t *iesc = iso2022jp_esc;
1645 int esc_len;
1646
1647 if (cv->mode != ISO2022_MODE(ISO2022JP_CS_ASCII, ISO2022_SI))
1648 {
1649 esc_len = 0;
1650 if (ISO2022_MODE_SHIFT(cv->mode) != ISO2022_SI)
1651 esc_len += 1;
1652 if (ISO2022_MODE_CS(cv->mode) != ISO2022JP_CS_ASCII)
1653 esc_len += iesc[ISO2022JP_CS_ASCII].esc_len;
1654 if (bufsize < esc_len)
1655 return seterror(E2BIG);
1656
1657 esc_len = 0;
1658 if (ISO2022_MODE_SHIFT(cv->mode) != ISO2022_SI)
1659 {
1660 memcpy(buf, iso2022_SI_seq, 1);
1661 esc_len += 1;
1662 }
1663 if (ISO2022_MODE_CS(cv->mode) != ISO2022JP_CS_ASCII)
1664 {
1665 memcpy(buf + esc_len, iesc[ISO2022JP_CS_ASCII].esc,
1666 iesc[ISO2022JP_CS_ASCII].esc_len);
1667 esc_len += iesc[ISO2022JP_CS_ASCII].esc_len;
1668 }
1669 return esc_len;
1670 }
1671 return 0;
1672}
1673
1674static void process_file(iconv_t cd, FILE *in, FILE *out)
1675{
1676 char inbuf[BUFSIZ];
1677 char outbuf[BUFSIZ];
1678 const char *pin;
1679 char *pout;
1680 size_t inbytesleft;
1681 size_t outbytesleft;
1682 size_t rest = 0;
1683 size_t r;
1684
1685 while ((inbytesleft=fread(inbuf+rest, 1, sizeof(inbuf)-rest, in)) != 0
1686 || rest != 0) {
1687 inbytesleft += rest;
1688 pin = inbuf;
1689 pout = outbuf;
1690 outbytesleft = sizeof(outbuf);
1691 r = iconv(cd, &pin, &inbytesleft, &pout, &outbytesleft);
1692 fwrite(outbuf, 1, sizeof(outbuf) - outbytesleft, out);
1693 if (r == (size_t)(-1) && errno != E2BIG &&
1694 (errno != EINVAL || feof(in)))
1695 bb_perror_msg_and_die("conversion error");
1696 memmove(inbuf, pin, inbytesleft);
1697 rest = inbytesleft;
1698 if (rest == 0 && feof(in))
1699 break;
1700 }
1701 pout = outbuf;
1702 outbytesleft = sizeof(outbuf);
1703 r = iconv(cd, NULL, NULL, &pout, &outbytesleft);
1704 fwrite(outbuf, 1, sizeof(outbuf) - outbytesleft, out);
1705 if (r == (size_t)(-1))
1706 bb_perror_msg_and_die("conversion error");
1707}
1708
/*
 * Bits of the option mask returned by getopt32() in iconv_main().
 * The bit positions follow the order of the letters in the option
 * string "f:t:lco:".
 */
enum {
    OPT_f = (1 << 0),   /* -f from-enc */
    OPT_t = (1 << 1),   /* -t to-enc */
    OPT_l = (1 << 2),   /* -l list known encodings */
    OPT_c = (1 << 3),   /* -c discard unconvertible characters */
    OPT_o = (1 << 4),   /* -o outfile */
};
1716
1717int iconv_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
1718int iconv_main(int argc, char **argv)
1719{
1720 const char *fromcode = "", *tocode = "", *outfile;
1721 char *tmpname = NULL;
1722 int i, opt;
1723 iconv_t cd;
1724 FILE *in;
1725 FILE *out = stdout;
1726
1727 opt = getopt32(argv, "f:t:lco:", &fromcode, &tocode, &outfile);
1728
1729 if (opt & OPT_l) {
1730 const char *alias = cp_alias;
1731 while (*alias) {
1732 printf("%s\n", alias);
1733 alias += strlen(alias) + 1;
1734 }
1735 return 0;
1736 }
1737
1738 if (opt & OPT_o) {
1739 tmpname = xasprintf("%sXXXXXX", outfile);
1740 mktemp(tmpname);
1741 out = xfopen(tmpname, "wb");
1742 }
1743
1744 if (opt & OPT_c)
1745 tocode = xasprintf("%s//IGNORE", tocode);
1746
1747 cd = iconv_open(tocode, fromcode);
1748 if (cd == (iconv_t)(-1))
1749 bb_perror_msg_and_die("iconv_open error");
1750
1751 if (optind == argc)
1752 argv[argc++] = (char *)"-";
1753
1754 for (i=optind; i<argc; ++i) {
1755 if (argv[i][0] == '-' && argv[i][1] == '\0')
1756 in = stdin;
1757 else
1758 in = xfopen(argv[optind], "rb");
1759 process_file(cd, in, out);
1760 fclose(in);
1761 }
1762
1763 if (tmpname) {
1764 fclose(out);
1765 xrename(tmpname, outfile);
1766 }
1767
1768 if (ENABLE_FEATURE_CLEAN_UP)
1769 iconv_close(cd);
1770 return 0;
1771}