diff options
Diffstat (limited to 'miscutils/iconv.c')
| -rw-r--r-- | miscutils/iconv.c | 1771 |
1 files changed, 1771 insertions, 0 deletions
diff --git a/miscutils/iconv.c b/miscutils/iconv.c new file mode 100644 index 000000000..bedbb718d --- /dev/null +++ b/miscutils/iconv.c | |||
| @@ -0,0 +1,1771 @@ | |||
| 1 | /* | ||
| 2 | * iconv implementation using Win32 API to convert. | ||
| 3 | * | ||
| 4 | * This file is placed in the public domain. | ||
| 5 | */ | ||
| 6 | |||
| 7 | /* | ||
| 8 | * This code was obtained from: | ||
| 9 | * | ||
| 10 | * https://github.com/win-iconv/win-iconv | ||
| 11 | * | ||
| 12 | * Modified for busybox-w32 by Ronald M Yorston. These modifications | ||
| 13 | * are also dedicated to the public domain. | ||
| 14 | */ | ||
| 15 | |||
| 16 | //config:config ICONV | ||
| 17 | //config: bool "iconv (11.4 kb)" | ||
| 18 | //config: default y | ||
| 19 | //config: depends on PLATFORM_MINGW32 | ||
| 20 | //config: help | ||
| 21 | //config: 'iconv' converts text between character encodings. | ||
| 22 | |||
| 23 | //applet:IF_ICONV(APPLET(iconv, BB_DIR_USR_BIN, BB_SUID_DROP)) | ||
| 24 | |||
| 25 | //kbuild:lib-$(CONFIG_ICONV) += iconv.o | ||
| 26 | |||
| 27 | //usage:#define iconv_trivial_usage | ||
| 28 | //usage: "[-lc] [-o outfile] [-f from-enc] [-t to-enc] [FILE]..." | ||
| 29 | //usage:#define iconv_full_usage "\n\n" | ||
| 30 | //usage: "Convert text between character encodings\n" | ||
| 31 | //usage: "\n -l List all known character encodings" | ||
| 32 | //usage: "\n -c Silently discard characters that cannot be converted" | ||
| 33 | //usage: "\n -o Use outfile for output" | ||
| 34 | //usage: "\n -f Use from-enc for input characters" | ||
| 35 | //usage: "\n -t Use to-enc for output characters" | ||
| 36 | |||
| 37 | #include "libbb.h" | ||
| 38 | |||
| 39 | /* WORKAROUND: */ | ||
| 40 | #define GetProcAddressA GetProcAddress | ||
| 41 | |||
| 42 | #define MB_CHAR_MAX 16 | ||
| 43 | |||
| 44 | #define UNICODE_MODE_BOM_DONE 1 | ||
| 45 | #define UNICODE_MODE_SWAPPED 2 | ||
| 46 | |||
| 47 | #define FLAG_USE_BOM 1 | ||
| 48 | #define FLAG_TRANSLIT 2 /* //TRANSLIT */ | ||
| 49 | #define FLAG_IGNORE 4 /* //IGNORE */ | ||
| 50 | |||
| 51 | typedef unsigned char uchar; | ||
| 52 | typedef unsigned short ushort; | ||
| 53 | typedef unsigned int uint; | ||
| 54 | |||
| 55 | typedef void* iconv_t; | ||
| 56 | |||
| 57 | static iconv_t iconv_open(const char *tocode, const char *fromcode); | ||
| 58 | static int iconv_close(iconv_t cd); | ||
| 59 | static size_t iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft); | ||
| 60 | |||
| 61 | typedef struct compat_t compat_t; | ||
| 62 | typedef struct csconv_t csconv_t; | ||
| 63 | typedef struct rec_iconv_t rec_iconv_t; | ||
| 64 | |||
| 65 | typedef int (*f_mbtowc)(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize); | ||
| 66 | typedef int (*f_wctomb)(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize); | ||
| 67 | typedef int (*f_mblen)(csconv_t *cv, const uchar *buf, int bufsize); | ||
| 68 | typedef int (*f_flush)(csconv_t *cv, uchar *buf, int bufsize); | ||
| 69 | |||
| 70 | #define COMPAT_IN 1 | ||
| 71 | #define COMPAT_OUT 2 | ||
| 72 | |||
| 73 | /* unicode mapping for compatibility with other conversion table. */ | ||
| 74 | struct compat_t { | ||
| 75 | uint in; | ||
| 76 | uint out; | ||
| 77 | uint flag; | ||
| 78 | }; | ||
| 79 | |||
| 80 | struct csconv_t { | ||
| 81 | int codepage; | ||
| 82 | int flags; | ||
| 83 | f_mbtowc mbtowc; | ||
| 84 | f_wctomb wctomb; | ||
| 85 | f_mblen mblen; | ||
| 86 | f_flush flush; | ||
| 87 | DWORD mode; | ||
| 88 | compat_t *compat; | ||
| 89 | }; | ||
| 90 | |||
| 91 | struct rec_iconv_t { | ||
| 92 | iconv_t cd; | ||
| 93 | csconv_t from; | ||
| 94 | csconv_t to; | ||
| 95 | }; | ||
| 96 | |||
| 97 | static int load_mlang(void); | ||
| 98 | static int make_csconv(const char *name, csconv_t *cv); | ||
| 99 | static int name_to_codepage(const char *name); | ||
| 100 | static uint utf16_to_ucs4(const ushort *wbuf); | ||
| 101 | static void ucs4_to_utf16(uint wc, ushort *wbuf, int *wbufsize); | ||
| 102 | static int mbtowc_flags(int codepage); | ||
| 103 | static int must_use_null_useddefaultchar(int codepage); | ||
| 104 | static int seterror(int err); | ||
| 105 | |||
| 106 | static int sbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize); | ||
| 107 | static int dbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize); | ||
| 108 | static int mbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize); | ||
| 109 | static int utf8_mblen(csconv_t *cv, const uchar *buf, int bufsize); | ||
| 110 | static int eucjp_mblen(csconv_t *cv, const uchar *buf, int bufsize); | ||
| 111 | |||
| 112 | static int kernel_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize); | ||
| 113 | static int kernel_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize); | ||
| 114 | static int mlang_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize); | ||
| 115 | static int mlang_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize); | ||
| 116 | static int utf16_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize); | ||
| 117 | static int utf16_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize); | ||
| 118 | static int utf32_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize); | ||
| 119 | static int utf32_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize); | ||
| 120 | static int iso2022jp_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize); | ||
| 121 | static int iso2022jp_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize); | ||
| 122 | static int iso2022jp_flush(csconv_t *cv, uchar *buf, int bufsize); | ||
| 123 | |||
| 124 | #define CP_ALIAS_LIST \ | ||
| 125 | CP_ALIAS(65001, "CP65001") \ | ||
| 126 | CP_ALIAS(65001, "UTF8") \ | ||
| 127 | CP_ALIAS(65001, "UTF-8") \ | ||
| 128 | \ | ||
| 129 | CP_ALIAS(1200, "CP1200") \ | ||
| 130 | CP_ALIAS(1200, "UTF16LE") \ | ||
| 131 | CP_ALIAS(1200, "UTF-16LE") \ | ||
| 132 | CP_ALIAS(1200, "UCS2LE") \ | ||
| 133 | CP_ALIAS(1200, "UCS-2LE") \ | ||
| 134 | CP_ALIAS(1200, "UCS-2-INTERNAL") \ | ||
| 135 | \ | ||
| 136 | CP_ALIAS(1201, "CP1201") \ | ||
| 137 | CP_ALIAS(1201, "UTF16BE") \ | ||
| 138 | CP_ALIAS(1201, "UTF-16BE") \ | ||
| 139 | CP_ALIAS(1201, "UCS2BE") \ | ||
| 140 | CP_ALIAS(1201, "UCS-2BE") \ | ||
| 141 | CP_ALIAS(1201, "unicodeFFFE") \ | ||
| 142 | \ | ||
| 143 | CP_ALIAS(12000, "CP12000") \ | ||
| 144 | CP_ALIAS(12000, "UTF32LE") \ | ||
| 145 | CP_ALIAS(12000, "UTF-32LE") \ | ||
| 146 | CP_ALIAS(12000, "UCS4LE") \ | ||
| 147 | CP_ALIAS(12000, "UCS-4LE") \ | ||
| 148 | \ | ||
| 149 | CP_ALIAS(12001, "CP12001") \ | ||
| 150 | CP_ALIAS(12001, "UTF32BE") \ | ||
| 151 | CP_ALIAS(12001, "UTF-32BE") \ | ||
| 152 | CP_ALIAS(12001, "UCS4BE") \ | ||
| 153 | CP_ALIAS(12001, "UCS-4BE") \ | ||
| 154 | \ | ||
| 155 | /* Default is little endian, because the platform is */ \ | ||
| 156 | CP_ALIAS(1200, "UTF16") \ | ||
| 157 | CP_ALIAS(1200, "UTF-16") \ | ||
| 158 | CP_ALIAS(1200, "UCS2") \ | ||
| 159 | CP_ALIAS(1200, "UCS-2") \ | ||
| 160 | CP_ALIAS(12000, "UTF32") \ | ||
| 161 | CP_ALIAS(12000, "UTF-32") \ | ||
| 162 | CP_ALIAS(12000, "UCS4") \ | ||
| 163 | CP_ALIAS(12000, "UCS-4") \ | ||
| 164 | \ | ||
| 165 | /* copy from libiconv `iconv -l` */ \ | ||
| 166 | /* !IsValidCodePage(367) */ \ | ||
| 167 | CP_ALIAS(20127, "ANSI_X3.4-1968") \ | ||
| 168 | CP_ALIAS(20127, "ANSI_X3.4-1986") \ | ||
| 169 | CP_ALIAS(20127, "ASCII") \ | ||
| 170 | CP_ALIAS(20127, "CP367") \ | ||
| 171 | CP_ALIAS(20127, "IBM367") \ | ||
| 172 | CP_ALIAS(20127, "ISO-IR-6") \ | ||
| 173 | CP_ALIAS(20127, "ISO646-US") \ | ||
| 174 | CP_ALIAS(20127, "ISO_646.IRV:1991") \ | ||
| 175 | CP_ALIAS(20127, "US") \ | ||
| 176 | CP_ALIAS(20127, "US-ASCII") \ | ||
| 177 | CP_ALIAS(20127, "CSASCII") \ | ||
| 178 | \ | ||
| 179 | /* !IsValidCodePage(819) */ \ | ||
| 180 | CP_ALIAS(1252, "CP819") \ | ||
| 181 | CP_ALIAS(1252, "IBM819") \ | ||
| 182 | CP_ALIAS(28591, "ISO-8859-1") \ | ||
| 183 | CP_ALIAS(28591, "ISO-IR-100") \ | ||
| 184 | CP_ALIAS(28591, "ISO8859-1") \ | ||
| 185 | CP_ALIAS(28591, "ISO_8859-1") \ | ||
| 186 | CP_ALIAS(28591, "ISO_8859-1:1987") \ | ||
| 187 | CP_ALIAS(28591, "L1") \ | ||
| 188 | CP_ALIAS(28591, "LATIN1") \ | ||
| 189 | CP_ALIAS(28591, "CSISOLATIN1") \ | ||
| 190 | \ | ||
| 191 | CP_ALIAS(1250, "CP1250") \ | ||
| 192 | CP_ALIAS(1250, "MS-EE") \ | ||
| 193 | CP_ALIAS(1250, "WINDOWS-1250") \ | ||
| 194 | \ | ||
| 195 | CP_ALIAS(1251, "CP1251") \ | ||
| 196 | CP_ALIAS(1251, "MS-CYRL") \ | ||
| 197 | CP_ALIAS(1251, "WINDOWS-1251") \ | ||
| 198 | \ | ||
| 199 | CP_ALIAS(1252, "CP1252") \ | ||
| 200 | CP_ALIAS(1252, "MS-ANSI") \ | ||
| 201 | CP_ALIAS(1252, "WINDOWS-1252") \ | ||
| 202 | \ | ||
| 203 | CP_ALIAS(1253, "CP1253") \ | ||
| 204 | CP_ALIAS(1253, "MS-GREEK") \ | ||
| 205 | CP_ALIAS(1253, "WINDOWS-1253") \ | ||
| 206 | \ | ||
| 207 | CP_ALIAS(1254, "CP1254") \ | ||
| 208 | CP_ALIAS(1254, "MS-TURK") \ | ||
| 209 | CP_ALIAS(1254, "WINDOWS-1254") \ | ||
| 210 | \ | ||
| 211 | CP_ALIAS(1255, "CP1255") \ | ||
| 212 | CP_ALIAS(1255, "MS-HEBR") \ | ||
| 213 | CP_ALIAS(1255, "WINDOWS-1255") \ | ||
| 214 | \ | ||
| 215 | CP_ALIAS(1256, "CP1256") \ | ||
| 216 | CP_ALIAS(1256, "MS-ARAB") \ | ||
| 217 | CP_ALIAS(1256, "WINDOWS-1256") \ | ||
| 218 | \ | ||
| 219 | CP_ALIAS(1257, "CP1257") \ | ||
| 220 | CP_ALIAS(1257, "WINBALTRIM") \ | ||
| 221 | CP_ALIAS(1257, "WINDOWS-1257") \ | ||
| 222 | \ | ||
| 223 | CP_ALIAS(1258, "CP1258") \ | ||
| 224 | CP_ALIAS(1258, "WINDOWS-1258") \ | ||
| 225 | \ | ||
| 226 | CP_ALIAS(850, "850") \ | ||
| 227 | CP_ALIAS(850, "CP850") \ | ||
| 228 | CP_ALIAS(850, "IBM850") \ | ||
| 229 | CP_ALIAS(850, "CSPC850MULTILINGUAL") \ | ||
| 230 | \ | ||
| 231 | /* !IsValidCodePage(862) */ \ | ||
| 232 | CP_ALIAS(862, "862") \ | ||
| 233 | CP_ALIAS(862, "CP862") \ | ||
| 234 | CP_ALIAS(862, "IBM862") \ | ||
| 235 | CP_ALIAS(862, "CSPC862LATINHEBREW") \ | ||
| 236 | \ | ||
| 237 | CP_ALIAS(866, "866") \ | ||
| 238 | CP_ALIAS(866, "CP866") \ | ||
| 239 | CP_ALIAS(866, "IBM866") \ | ||
| 240 | CP_ALIAS(866, "CSIBM866") \ | ||
| 241 | \ | ||
| 242 | /* !IsValidCodePage(154) */ \ | ||
| 243 | CP_ALIAS(154, "CP154") \ | ||
| 244 | CP_ALIAS(154, "CYRILLIC-ASIAN") \ | ||
| 245 | CP_ALIAS(154, "PT154") \ | ||
| 246 | CP_ALIAS(154, "PTCP154") \ | ||
| 247 | CP_ALIAS(154, "CSPTCP154") \ | ||
| 248 | \ | ||
| 249 | /* !IsValidCodePage(1133) */ \ | ||
| 250 | CP_ALIAS(1133, "CP1133") \ | ||
| 251 | CP_ALIAS(1133, "IBM-CP1133") \ | ||
| 252 | \ | ||
| 253 | CP_ALIAS(874, "CP874") \ | ||
| 254 | CP_ALIAS(874, "WINDOWS-874") \ | ||
| 255 | \ | ||
| 256 | /* !IsValidCodePage(51932) */ \ | ||
| 257 | CP_ALIAS(51932, "CP51932") \ | ||
| 258 | CP_ALIAS(51932, "MS51932") \ | ||
| 259 | CP_ALIAS(51932, "WINDOWS-51932") \ | ||
| 260 | CP_ALIAS(51932, "EUC-JP") \ | ||
| 261 | \ | ||
| 262 | CP_ALIAS(932, "CP932") \ | ||
| 263 | CP_ALIAS(932, "MS932") \ | ||
| 264 | CP_ALIAS(932, "SHIFFT_JIS") \ | ||
| 265 | CP_ALIAS(932, "SHIFFT_JIS-MS") \ | ||
| 266 | CP_ALIAS(932, "SJIS") \ | ||
| 267 | CP_ALIAS(932, "SJIS-MS") \ | ||
| 268 | CP_ALIAS(932, "SJIS-OPEN") \ | ||
| 269 | CP_ALIAS(932, "SJIS-WIN") \ | ||
| 270 | CP_ALIAS(932, "WINDOWS-31J") \ | ||
| 271 | CP_ALIAS(932, "WINDOWS-932") \ | ||
| 272 | CP_ALIAS(932, "CSWINDOWS31J") \ | ||
| 273 | \ | ||
| 274 | CP_ALIAS(50221, "CP50221") \ | ||
| 275 | CP_ALIAS(50221, "ISO-2022-JP") \ | ||
| 276 | CP_ALIAS(50221, "ISO-2022-JP-MS") \ | ||
| 277 | CP_ALIAS(50221, "ISO2022-JP") \ | ||
| 278 | CP_ALIAS(50221, "ISO2022-JP-MS") \ | ||
| 279 | CP_ALIAS(50221, "MS50221") \ | ||
| 280 | CP_ALIAS(50221, "WINDOWS-50221") \ | ||
| 281 | \ | ||
| 282 | CP_ALIAS(936, "CP936") \ | ||
| 283 | CP_ALIAS(936, "GBK") \ | ||
| 284 | CP_ALIAS(936, "MS936") \ | ||
| 285 | CP_ALIAS(936, "WINDOWS-936") \ | ||
| 286 | \ | ||
| 287 | CP_ALIAS(950, "CP950") \ | ||
| 288 | CP_ALIAS(950, "BIG5") \ | ||
| 289 | CP_ALIAS(950, "BIG5HKSCS") \ | ||
| 290 | CP_ALIAS(950, "BIG5-HKSCS") \ | ||
| 291 | \ | ||
| 292 | CP_ALIAS(949, "CP949") \ | ||
| 293 | CP_ALIAS(949, "UHC") \ | ||
| 294 | CP_ALIAS(949, "EUC-KR") \ | ||
| 295 | \ | ||
| 296 | CP_ALIAS(1361, "CP1361") \ | ||
| 297 | CP_ALIAS(1361, "JOHAB") \ | ||
| 298 | \ | ||
| 299 | CP_ALIAS(437, "437") \ | ||
| 300 | CP_ALIAS(437, "CP437") \ | ||
| 301 | CP_ALIAS(437, "IBM437") \ | ||
| 302 | CP_ALIAS(437, "CSPC8CODEPAGE437") \ | ||
| 303 | \ | ||
| 304 | CP_ALIAS(737, "CP737") \ | ||
| 305 | \ | ||
| 306 | CP_ALIAS(775, "CP775") \ | ||
| 307 | CP_ALIAS(775, "IBM775") \ | ||
| 308 | CP_ALIAS(775, "CSPC775BALTIC") \ | ||
| 309 | \ | ||
| 310 | CP_ALIAS(852, "852") \ | ||
| 311 | CP_ALIAS(852, "CP852") \ | ||
| 312 | CP_ALIAS(852, "IBM852") \ | ||
| 313 | CP_ALIAS(852, "CSPCP852") \ | ||
| 314 | \ | ||
| 315 | /* !IsValidCodePage(853) */ \ | ||
| 316 | CP_ALIAS(853, "CP853") \ | ||
| 317 | \ | ||
| 318 | CP_ALIAS(855, "855") \ | ||
| 319 | CP_ALIAS(855, "CP855") \ | ||
| 320 | CP_ALIAS(855, "IBM855") \ | ||
| 321 | CP_ALIAS(855, "CSIBM855") \ | ||
| 322 | \ | ||
| 323 | CP_ALIAS(857, "857") \ | ||
| 324 | CP_ALIAS(857, "CP857") \ | ||
| 325 | CP_ALIAS(857, "IBM857") \ | ||
| 326 | CP_ALIAS(857, "CSIBM857") \ | ||
| 327 | \ | ||
| 328 | /* !IsValidCodePage(858) */ \ | ||
| 329 | CP_ALIAS(858, "CP858") \ | ||
| 330 | \ | ||
| 331 | CP_ALIAS(860, "860") \ | ||
| 332 | CP_ALIAS(860, "CP860") \ | ||
| 333 | CP_ALIAS(860, "IBM860") \ | ||
| 334 | CP_ALIAS(860, "CSIBM860") \ | ||
| 335 | \ | ||
| 336 | CP_ALIAS(861, "861") \ | ||
| 337 | CP_ALIAS(861, "CP-IS") \ | ||
| 338 | CP_ALIAS(861, "CP861") \ | ||
| 339 | CP_ALIAS(861, "IBM861") \ | ||
| 340 | CP_ALIAS(861, "CSIBM861") \ | ||
| 341 | \ | ||
| 342 | CP_ALIAS(863, "863") \ | ||
| 343 | CP_ALIAS(863, "CP863") \ | ||
| 344 | CP_ALIAS(863, "IBM863") \ | ||
| 345 | CP_ALIAS(863, "CSIBM863") \ | ||
| 346 | \ | ||
| 347 | CP_ALIAS(864, "CP864") \ | ||
| 348 | CP_ALIAS(864, "IBM864") \ | ||
| 349 | CP_ALIAS(864, "CSIBM864") \ | ||
| 350 | \ | ||
| 351 | CP_ALIAS(865, "865") \ | ||
| 352 | CP_ALIAS(865, "CP865") \ | ||
| 353 | CP_ALIAS(865, "IBM865") \ | ||
| 354 | CP_ALIAS(865, "CSIBM865") \ | ||
| 355 | \ | ||
| 356 | CP_ALIAS(869, "869") \ | ||
| 357 | CP_ALIAS(869, "CP-GR") \ | ||
| 358 | CP_ALIAS(869, "CP869") \ | ||
| 359 | CP_ALIAS(869, "IBM869") \ | ||
| 360 | CP_ALIAS(869, "CSIBM869") \ | ||
| 361 | \ | ||
| 362 | /* !IsValidCodePage(1152) */ \ | ||
| 363 | CP_ALIAS(1125, "CP1125") \ | ||
| 364 | \ | ||
| 365 | /* \ | ||
| 366 | * Code Page Identifiers \ | ||
| 367 | * http://msdn2.microsoft.com/en-us/library/ms776446.aspx \ | ||
| 368 | */ \ | ||
| 369 | CP_ALIAS(37, "IBM037") /* IBM EBCDIC US-Canada */ \ | ||
| 370 | CP_ALIAS(437, "IBM437") /* OEM United States */ \ | ||
| 371 | CP_ALIAS(500, "IBM500") /* IBM EBCDIC International */ \ | ||
| 372 | CP_ALIAS(708, "ASMO-708") /* Arabic (ASMO 708) */ \ | ||
| 373 | /* 709 Arabic (ASMO-449+, BCON V4) */ \ | ||
| 374 | /* 710 Arabic - Transparent Arabic */ \ | ||
| 375 | CP_ALIAS(720, "DOS-720") /* Arabic (Transparent ASMO); Arabic (DOS) */ \ | ||
| 376 | CP_ALIAS(737, "ibm737") /* OEM Greek (formerly 437G); Greek (DOS) */ \ | ||
| 377 | CP_ALIAS(775, "ibm775") /* OEM Baltic; Baltic (DOS) */ \ | ||
| 378 | CP_ALIAS(850, "ibm850") /* OEM Multilingual Latin 1; Western European (DOS) */ \ | ||
| 379 | CP_ALIAS(852, "ibm852") /* OEM Latin 2; Central European (DOS) */ \ | ||
| 380 | CP_ALIAS(855, "IBM855") /* OEM Cyrillic (primarily Russian) */ \ | ||
| 381 | CP_ALIAS(857, "ibm857") /* OEM Turkish; Turkish (DOS) */ \ | ||
| 382 | CP_ALIAS(858, "IBM00858") /* OEM Multilingual Latin 1 + Euro symbol */ \ | ||
| 383 | CP_ALIAS(860, "IBM860") /* OEM Portuguese; Portuguese (DOS) */ \ | ||
| 384 | CP_ALIAS(861, "ibm861") /* OEM Icelandic; Icelandic (DOS) */ \ | ||
| 385 | CP_ALIAS(862, "DOS-862") /* OEM Hebrew; Hebrew (DOS) */ \ | ||
| 386 | CP_ALIAS(863, "IBM863") /* OEM French Canadian; French Canadian (DOS) */ \ | ||
| 387 | CP_ALIAS(864, "IBM864") /* OEM Arabic; Arabic (864) */ \ | ||
| 388 | CP_ALIAS(865, "IBM865") /* OEM Nordic; Nordic (DOS) */ \ | ||
| 389 | CP_ALIAS(866, "cp866") /* OEM Russian; Cyrillic (DOS) */ \ | ||
| 390 | CP_ALIAS(869, "ibm869") /* OEM Modern Greek; Greek, Modern (DOS) */ \ | ||
| 391 | CP_ALIAS(870, "IBM870") /* IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2 */ \ | ||
| 392 | CP_ALIAS(874, "windows-874") /* ANSI/OEM Thai (same as 28605, ISO 8859-15); Thai (Windows) */ \ | ||
| 393 | CP_ALIAS(875, "cp875") /* IBM EBCDIC Greek Modern */ \ | ||
| 394 | CP_ALIAS(932, "shift_jis") /* ANSI/OEM Japanese; Japanese (Shift-JIS) */ \ | ||
| 395 | CP_ALIAS(932, "shift-jis") /* alternative name for it */ \ | ||
| 396 | CP_ALIAS(936, "gb2312") /* ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312) */ \ | ||
| 397 | CP_ALIAS(949, "ks_c_5601-1987") /* ANSI/OEM Korean (Unified Hangul Code) */ \ | ||
| 398 | CP_ALIAS(950, "big5") /* ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5) */ \ | ||
| 399 | CP_ALIAS(950, "big5hkscs") /* ANSI/OEM Traditional Chinese (Hong Kong SAR); Chinese Traditional (Big5-HKSCS) */ \ | ||
| 400 | CP_ALIAS(950, "big5-hkscs") /* alternative name for it */ \ | ||
| 401 | CP_ALIAS(1026, "IBM1026") /* IBM EBCDIC Turkish (Latin 5) */ \ | ||
| 402 | CP_ALIAS(1047, "IBM01047") /* IBM EBCDIC Latin 1/Open System */ \ | ||
| 403 | CP_ALIAS(1140, "IBM01140") /* IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro) */ \ | ||
| 404 | CP_ALIAS(1141, "IBM01141") /* IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro) */ \ | ||
| 405 | CP_ALIAS(1142, "IBM01142") /* IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro) */ \ | ||
| 406 | CP_ALIAS(1143, "IBM01143") /* IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro) */ \ | ||
| 407 | CP_ALIAS(1144, "IBM01144") /* IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro) */ \ | ||
| 408 | CP_ALIAS(1145, "IBM01145") /* IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro) */ \ | ||
| 409 | CP_ALIAS(1146, "IBM01146") /* IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro) */ \ | ||
| 410 | CP_ALIAS(1147, "IBM01147") /* IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro) */ \ | ||
| 411 | CP_ALIAS(1148, "IBM01148") /* IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro) */ \ | ||
| 412 | CP_ALIAS(1149, "IBM01149") /* IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro) */ \ | ||
| 413 | CP_ALIAS(1250, "windows-1250") /* ANSI Central European; Central European (Windows) */ \ | ||
| 414 | CP_ALIAS(1251, "windows-1251") /* ANSI Cyrillic; Cyrillic (Windows) */ \ | ||
| 415 | CP_ALIAS(1252, "windows-1252") /* ANSI Latin 1; Western European (Windows) */ \ | ||
| 416 | CP_ALIAS(1253, "windows-1253") /* ANSI Greek; Greek (Windows) */ \ | ||
| 417 | CP_ALIAS(1254, "windows-1254") /* ANSI Turkish; Turkish (Windows) */ \ | ||
| 418 | CP_ALIAS(1255, "windows-1255") /* ANSI Hebrew; Hebrew (Windows) */ \ | ||
| 419 | CP_ALIAS(1256, "windows-1256") /* ANSI Arabic; Arabic (Windows) */ \ | ||
| 420 | CP_ALIAS(1257, "windows-1257") /* ANSI Baltic; Baltic (Windows) */ \ | ||
| 421 | CP_ALIAS(1258, "windows-1258") /* ANSI/OEM Vietnamese; Vietnamese (Windows) */ \ | ||
| 422 | CP_ALIAS(1361, "Johab") /* Korean (Johab) */ \ | ||
| 423 | CP_ALIAS(10000, "macintosh") /* MAC Roman; Western European (Mac) */ \ | ||
| 424 | CP_ALIAS(10001, "x-mac-japanese") /* Japanese (Mac) */ \ | ||
| 425 | CP_ALIAS(10002, "x-mac-chinesetrad") /* MAC Traditional Chinese (Big5); Chinese Traditional (Mac) */ \ | ||
| 426 | CP_ALIAS(10003, "x-mac-korean") /* Korean (Mac) */ \ | ||
| 427 | CP_ALIAS(10004, "x-mac-arabic") /* Arabic (Mac) */ \ | ||
| 428 | CP_ALIAS(10005, "x-mac-hebrew") /* Hebrew (Mac) */ \ | ||
| 429 | CP_ALIAS(10006, "x-mac-greek") /* Greek (Mac) */ \ | ||
| 430 | CP_ALIAS(10007, "x-mac-cyrillic") /* Cyrillic (Mac) */ \ | ||
| 431 | CP_ALIAS(10008, "x-mac-chinesesimp") /* MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac) */ \ | ||
| 432 | CP_ALIAS(10010, "x-mac-romanian") /* Romanian (Mac) */ \ | ||
| 433 | CP_ALIAS(10017, "x-mac-ukrainian") /* Ukrainian (Mac) */ \ | ||
| 434 | CP_ALIAS(10021, "x-mac-thai") /* Thai (Mac) */ \ | ||
| 435 | CP_ALIAS(10029, "x-mac-ce") /* MAC Latin 2; Central European (Mac) */ \ | ||
| 436 | CP_ALIAS(10079, "x-mac-icelandic") /* Icelandic (Mac) */ \ | ||
| 437 | CP_ALIAS(10081, "x-mac-turkish") /* Turkish (Mac) */ \ | ||
| 438 | CP_ALIAS(10082, "x-mac-croatian") /* Croatian (Mac) */ \ | ||
| 439 | CP_ALIAS(20000, "x-Chinese_CNS") /* CNS Taiwan; Chinese Traditional (CNS) */ \ | ||
| 440 | CP_ALIAS(20001, "x-cp20001") /* TCA Taiwan */ \ | ||
| 441 | CP_ALIAS(20002, "x_Chinese-Eten") /* Eten Taiwan; Chinese Traditional (Eten) */ \ | ||
| 442 | CP_ALIAS(20003, "x-cp20003") /* IBM5550 Taiwan */ \ | ||
| 443 | CP_ALIAS(20004, "x-cp20004") /* TeleText Taiwan */ \ | ||
| 444 | CP_ALIAS(20005, "x-cp20005") /* Wang Taiwan */ \ | ||
| 445 | CP_ALIAS(20105, "x-IA5") /* IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5) */ \ | ||
| 446 | CP_ALIAS(20106, "x-IA5-German") /* IA5 German (7-bit) */ \ | ||
| 447 | CP_ALIAS(20107, "x-IA5-Swedish") /* IA5 Swedish (7-bit) */ \ | ||
| 448 | CP_ALIAS(20108, "x-IA5-Norwegian") /* IA5 Norwegian (7-bit) */ \ | ||
| 449 | CP_ALIAS(20127, "us-ascii") /* US-ASCII (7-bit) */ \ | ||
| 450 | CP_ALIAS(20261, "x-cp20261") /* T.61 */ \ | ||
| 451 | CP_ALIAS(20269, "x-cp20269") /* ISO 6937 Non-Spacing Accent */ \ | ||
| 452 | CP_ALIAS(20273, "IBM273") /* IBM EBCDIC Germany */ \ | ||
| 453 | CP_ALIAS(20277, "IBM277") /* IBM EBCDIC Denmark-Norway */ \ | ||
| 454 | CP_ALIAS(20278, "IBM278") /* IBM EBCDIC Finland-Sweden */ \ | ||
| 455 | CP_ALIAS(20280, "IBM280") /* IBM EBCDIC Italy */ \ | ||
| 456 | CP_ALIAS(20284, "IBM284") /* IBM EBCDIC Latin America-Spain */ \ | ||
| 457 | CP_ALIAS(20285, "IBM285") /* IBM EBCDIC United Kingdom */ \ | ||
| 458 | CP_ALIAS(20290, "IBM290") /* IBM EBCDIC Japanese Katakana Extended */ \ | ||
| 459 | CP_ALIAS(20297, "IBM297") /* IBM EBCDIC France */ \ | ||
| 460 | CP_ALIAS(20420, "IBM420") /* IBM EBCDIC Arabic */ \ | ||
| 461 | CP_ALIAS(20423, "IBM423") /* IBM EBCDIC Greek */ \ | ||
| 462 | CP_ALIAS(20424, "IBM424") /* IBM EBCDIC Hebrew */ \ | ||
| 463 | CP_ALIAS(20833, "x-EBCDIC-KoreanExtended") /* IBM EBCDIC Korean Extended */ \ | ||
| 464 | CP_ALIAS(20838, "IBM-Thai") /* IBM EBCDIC Thai */ \ | ||
| 465 | CP_ALIAS(20866, "koi8-r") /* Russian (KOI8-R); Cyrillic (KOI8-R) */ \ | ||
| 466 | CP_ALIAS(20871, "IBM871") /* IBM EBCDIC Icelandic */ \ | ||
| 467 | CP_ALIAS(20880, "IBM880") /* IBM EBCDIC Cyrillic Russian */ \ | ||
| 468 | CP_ALIAS(20905, "IBM905") /* IBM EBCDIC Turkish */ \ | ||
| 469 | CP_ALIAS(20924, "IBM00924") /* IBM EBCDIC Latin 1/Open System (1047 + Euro symbol) */ \ | ||
| 470 | CP_ALIAS(20932, "EUC-JP") /* Japanese (JIS 0208-1990 and 0212-1990) */ \ | ||
| 471 | CP_ALIAS(20936, "x-cp20936") /* Simplified Chinese (GB2312); Chinese Simplified (GB2312-80) */ \ | ||
| 472 | CP_ALIAS(20949, "x-cp20949") /* Korean Wansung */ \ | ||
| 473 | CP_ALIAS(21025, "cp1025") /* IBM EBCDIC Cyrillic Serbian-Bulgarian */ \ | ||
| 474 | /* 21027 (deprecated) */ \ | ||
| 475 | CP_ALIAS(21866, "koi8-u") /* Ukrainian (KOI8-U); Cyrillic (KOI8-U) */ \ | ||
| 476 | CP_ALIAS(28591, "iso-8859-1") /* ISO 8859-1 Latin 1; Western European (ISO) */ \ | ||
| 477 | CP_ALIAS(28591, "iso8859-1") /* ISO 8859-1 Latin 1; Western European (ISO) */ \ | ||
| 478 | CP_ALIAS(28591, "iso_8859-1") \ | ||
| 479 | CP_ALIAS(28591, "iso_8859_1") \ | ||
| 480 | CP_ALIAS(28592, "iso-8859-2") /* ISO 8859-2 Central European; Central European (ISO) */ \ | ||
| 481 | CP_ALIAS(28592, "iso8859-2") /* ISO 8859-2 Central European; Central European (ISO) */ \ | ||
| 482 | CP_ALIAS(28592, "iso_8859-2") \ | ||
| 483 | CP_ALIAS(28592, "iso_8859_2") \ | ||
| 484 | CP_ALIAS(28593, "iso-8859-3") /* ISO 8859-3 Latin 3 */ \ | ||
| 485 | CP_ALIAS(28593, "iso8859-3") /* ISO 8859-3 Latin 3 */ \ | ||
| 486 | CP_ALIAS(28593, "iso_8859-3") \ | ||
| 487 | CP_ALIAS(28593, "iso_8859_3") \ | ||
| 488 | CP_ALIAS(28594, "iso-8859-4") /* ISO 8859-4 Baltic */ \ | ||
| 489 | CP_ALIAS(28594, "iso8859-4") /* ISO 8859-4 Baltic */ \ | ||
| 490 | CP_ALIAS(28594, "iso_8859-4") \ | ||
| 491 | CP_ALIAS(28594, "iso_8859_4") \ | ||
| 492 | CP_ALIAS(28595, "iso-8859-5") /* ISO 8859-5 Cyrillic */ \ | ||
| 493 | CP_ALIAS(28595, "iso8859-5") /* ISO 8859-5 Cyrillic */ \ | ||
| 494 | CP_ALIAS(28595, "iso_8859-5") \ | ||
| 495 | CP_ALIAS(28595, "iso_8859_5") \ | ||
| 496 | CP_ALIAS(28596, "iso-8859-6") /* ISO 8859-6 Arabic */ \ | ||
| 497 | CP_ALIAS(28596, "iso8859-6") /* ISO 8859-6 Arabic */ \ | ||
| 498 | CP_ALIAS(28596, "iso_8859-6") \ | ||
| 499 | CP_ALIAS(28596, "iso_8859_6") \ | ||
| 500 | CP_ALIAS(28597, "iso-8859-7") /* ISO 8859-7 Greek */ \ | ||
| 501 | CP_ALIAS(28597, "iso8859-7") /* ISO 8859-7 Greek */ \ | ||
| 502 | CP_ALIAS(28597, "iso_8859-7") \ | ||
| 503 | CP_ALIAS(28597, "iso_8859_7") \ | ||
| 504 | CP_ALIAS(28598, "iso-8859-8") /* ISO 8859-8 Hebrew; Hebrew (ISO-Visual) */ \ | ||
| 505 | CP_ALIAS(28598, "iso8859-8") /* ISO 8859-8 Hebrew; Hebrew (ISO-Visual) */ \ | ||
| 506 | CP_ALIAS(28598, "iso_8859-8") \ | ||
| 507 | CP_ALIAS(28598, "iso_8859_8") \ | ||
| 508 | CP_ALIAS(28599, "iso-8859-9") /* ISO 8859-9 Turkish */ \ | ||
| 509 | CP_ALIAS(28599, "iso8859-9") /* ISO 8859-9 Turkish */ \ | ||
| 510 | CP_ALIAS(28599, "iso_8859-9") \ | ||
| 511 | CP_ALIAS(28599, "iso_8859_9") \ | ||
| 512 | CP_ALIAS(28603, "iso-8859-13") /* ISO 8859-13 Estonian */ \ | ||
| 513 | CP_ALIAS(28603, "iso8859-13") /* ISO 8859-13 Estonian */ \ | ||
| 514 | CP_ALIAS(28603, "iso_8859-13") \ | ||
| 515 | CP_ALIAS(28603, "iso_8859_13") \ | ||
| 516 | CP_ALIAS(28605, "iso-8859-15") /* ISO 8859-15 Latin 9 */ \ | ||
| 517 | CP_ALIAS(28605, "iso8859-15") /* ISO 8859-15 Latin 9 */ \ | ||
| 518 | CP_ALIAS(28605, "iso_8859-15") \ | ||
| 519 | CP_ALIAS(28605, "iso_8859_15") \ | ||
| 520 | CP_ALIAS(29001, "x-Europa") /* Europa 3 */ \ | ||
| 521 | CP_ALIAS(38598, "iso-8859-8-i") /* ISO 8859-8 Hebrew; Hebrew (ISO-Logical) */ \ | ||
| 522 | CP_ALIAS(38598, "iso8859-8-i") /* ISO 8859-8 Hebrew; Hebrew (ISO-Logical) */ \ | ||
| 523 | CP_ALIAS(38598, "iso_8859-8-i") \ | ||
| 524 | CP_ALIAS(38598, "iso_8859_8-i") \ | ||
| 525 | CP_ALIAS(50220, "iso-2022-jp") /* ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS) */ \ | ||
| 526 | CP_ALIAS(50221, "csISO2022JP") /* ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana) */ \ | ||
| 527 | CP_ALIAS(50222, "iso-2022-jp") /* ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI) */ \ | ||
| 528 | CP_ALIAS(50225, "iso-2022-kr") /* ISO 2022 Korean */ \ | ||
| 529 | CP_ALIAS(50225, "iso2022-kr") /* ISO 2022 Korean */ \ | ||
| 530 | CP_ALIAS(50227, "x-cp50227") /* ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022) */ \ | ||
| 531 | /* 50229 ISO 2022 Traditional Chinese */ \ | ||
| 532 | /* 50930 EBCDIC Japanese (Katakana) Extended */ \ | ||
| 533 | /* 50931 EBCDIC US-Canada and Japanese */ \ | ||
| 534 | /* 50933 EBCDIC Korean Extended and Korean */ \ | ||
| 535 | /* 50935 EBCDIC Simplified Chinese Extended and Simplified Chinese */ \ | ||
| 536 | /* 50936 EBCDIC Simplified Chinese */ \ | ||
| 537 | /* 50937 EBCDIC US-Canada and Traditional Chinese */ \ | ||
| 538 | /* 50939 EBCDIC Japanese (Latin) Extended and Japanese */ \ | ||
| 539 | CP_ALIAS(51932, "euc-jp") /* EUC Japanese */ \ | ||
| 540 | CP_ALIAS(51936, "EUC-CN") /* EUC Simplified Chinese; Chinese Simplified (EUC) */ \ | ||
| 541 | CP_ALIAS(51949, "euc-kr") /* EUC Korean */ \ | ||
| 542 | /* 51950 EUC Traditional Chinese */ \ | ||
| 543 | CP_ALIAS(52936, "hz-gb-2312") /* HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ) */ \ | ||
| 544 | CP_ALIAS(54936, "GB18030") /* Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030) */ \ | ||
| 545 | CP_ALIAS(57002, "x-iscii-de") /* ISCII Devanagari */ \ | ||
| 546 | CP_ALIAS(57003, "x-iscii-be") /* ISCII Bengali */ \ | ||
| 547 | CP_ALIAS(57004, "x-iscii-ta") /* ISCII Tamil */ \ | ||
| 548 | CP_ALIAS(57005, "x-iscii-te") /* ISCII Telugu */ \ | ||
| 549 | CP_ALIAS(57006, "x-iscii-as") /* ISCII Assamese */ \ | ||
| 550 | CP_ALIAS(57007, "x-iscii-or") /* ISCII Oriya */ \ | ||
| 551 | CP_ALIAS(57008, "x-iscii-ka") /* ISCII Kannada */ \ | ||
| 552 | CP_ALIAS(57009, "x-iscii-ma") /* ISCII Malayalam */ \ | ||
| 553 | CP_ALIAS(57010, "x-iscii-gu") /* ISCII Gujarati */ \ | ||
| 554 | CP_ALIAS(57011, "x-iscii-pa") /* ISCII Punjabi */ | ||
| 555 | |||
| 556 | #define CP_ALIAS(codepage, alias) codepage, | ||
| 557 | static const int cp_codepage[] = { | ||
| 558 | CP_ALIAS_LIST | ||
| 559 | }; | ||
| 560 | #undef CP_ALIAS | ||
| 561 | |||
| 562 | #define CP_ALIAS(codepage, alias) alias"\0" | ||
| 563 | static const char cp_alias[] ALIGN1 = | ||
| 564 | CP_ALIAS_LIST; | ||
| 565 | #undef CP_ALIAS | ||
| 566 | |||
| 567 | /* | ||
| 568 | * SJIS SHIFTJIS table CP932 table | ||
| 569 | * ---- --------------------------- -------------------------------- | ||
| 570 | * 5C U+00A5 YEN SIGN U+005C REVERSE SOLIDUS | ||
| 571 | * 7E U+203E OVERLINE U+007E TILDE | ||
| 572 | * 815C U+2014 EM DASH U+2015 HORIZONTAL BAR | ||
| 573 | * 815F U+005C REVERSE SOLIDUS U+FF3C FULLWIDTH REVERSE SOLIDUS | ||
| 574 | * 8160 U+301C WAVE DASH U+FF5E FULLWIDTH TILDE | ||
| 575 | * 8161 U+2016 DOUBLE VERTICAL LINE U+2225 PARALLEL TO | ||
| 576 | * 817C U+2212 MINUS SIGN U+FF0D FULLWIDTH HYPHEN-MINUS | ||
| 577 | * 8191 U+00A2 CENT SIGN U+FFE0 FULLWIDTH CENT SIGN | ||
| 578 | * 8192 U+00A3 POUND SIGN U+FFE1 FULLWIDTH POUND SIGN | ||
| 579 | * 81CA U+00AC NOT SIGN U+FFE2 FULLWIDTH NOT SIGN | ||
| 580 | * | ||
| 581 | * EUC-JP and ISO-2022-JP should be compatible with CP932. | ||
| 582 | * | ||
| 583 | * Kernel and MLang have different Unicode mapping tables. Make sure | ||
| 584 | * you know which API is used. | ||
| 585 | */ | ||
| 586 | static compat_t cp932_compat[] = { | ||
| 587 | {0x00A5, 0x005C, COMPAT_OUT}, | ||
| 588 | {0x203E, 0x007E, COMPAT_OUT}, | ||
| 589 | {0x2014, 0x2015, COMPAT_OUT}, | ||
| 590 | {0x301C, 0xFF5E, COMPAT_OUT}, | ||
| 591 | {0x2016, 0x2225, COMPAT_OUT}, | ||
| 592 | {0x2212, 0xFF0D, COMPAT_OUT}, | ||
| 593 | {0x00A2, 0xFFE0, COMPAT_OUT}, | ||
| 594 | {0x00A3, 0xFFE1, COMPAT_OUT}, | ||
| 595 | {0x00AC, 0xFFE2, COMPAT_OUT}, | ||
| 596 | {0, 0, 0} | ||
| 597 | }; | ||
| 598 | |||
| 599 | static compat_t cp20932_compat[] = { | ||
| 600 | {0x00A5, 0x005C, COMPAT_OUT}, | ||
| 601 | {0x203E, 0x007E, COMPAT_OUT}, | ||
| 602 | {0x2014, 0x2015, COMPAT_OUT}, | ||
| 603 | {0xFF5E, 0x301C, COMPAT_OUT|COMPAT_IN}, | ||
| 604 | {0x2225, 0x2016, COMPAT_OUT|COMPAT_IN}, | ||
| 605 | {0xFF0D, 0x2212, COMPAT_OUT|COMPAT_IN}, | ||
| 606 | {0xFFE0, 0x00A2, COMPAT_OUT|COMPAT_IN}, | ||
| 607 | {0xFFE1, 0x00A3, COMPAT_OUT|COMPAT_IN}, | ||
| 608 | {0xFFE2, 0x00AC, COMPAT_OUT|COMPAT_IN}, | ||
| 609 | {0, 0, 0} | ||
| 610 | }; | ||
| 611 | |||
| 612 | static compat_t *cp51932_compat = cp932_compat; | ||
| 613 | |||
| 614 | /* cp20932_compat for kernel. cp932_compat for mlang. */ | ||
| 615 | static compat_t *cp5022x_compat = cp932_compat; | ||
| 616 | |||
/*
 * Pointer type for the "ConvertINetMultiByteToUnicode" export of
 * mlang.dll, resolved at run time by load_mlang().
 * The callers in this file pass the source byte count in
 * *lpnMultiCharCount and compare it again after the call, and pass
 * the destination capacity in *lpnWideCharCount.
 */
typedef HRESULT (WINAPI *CONVERTINETMULTIBYTETOUNICODE)(
    LPDWORD lpdwMode,
    DWORD dwSrcEncoding,
    LPCSTR lpSrcStr,
    LPINT lpnMultiCharCount,
    LPWSTR lpDstStr,
    LPINT lpnWideCharCount
);

/*
 * Pointer type for the "ConvertINetUnicodeToMultiByte" export of
 * mlang.dll, resolved at run time by load_mlang().
 * The callers in this file pass the source UTF-16 unit count in
 * *lpnWideCharCount and compare it again after the call, and pass
 * the destination capacity in *lpnMultiCharCount, which they then read
 * back as the number of bytes produced.
 */
typedef HRESULT (WINAPI *CONVERTINETUNICODETOMULTIBYTE)(
    LPDWORD lpdwMode,
    DWORD dwEncoding,
    LPCWSTR lpSrcStr,
    LPINT lpnWideCharCount,
    LPSTR lpDstStr,
    LPINT lpnMultiCharCount
);
| 634 | |||
/* MLang entry points; NULL until load_mlang() has resolved them. */
static CONVERTINETMULTIBYTETOUNICODE ConvertINetMultiByteToUnicode;
static CONVERTINETUNICODETOMULTIBYTE ConvertINetUnicodeToMultiByte;
| 637 | |||
| 638 | static int | ||
| 639 | load_mlang(void) | ||
| 640 | { | ||
| 641 | HMODULE h; | ||
| 642 | if (ConvertINetMultiByteToUnicode != NULL) | ||
| 643 | return TRUE; | ||
| 644 | h = LoadLibrary(TEXT("mlang.dll")); | ||
| 645 | if (!h) | ||
| 646 | return FALSE; | ||
| 647 | ConvertINetMultiByteToUnicode = (CONVERTINETMULTIBYTETOUNICODE)GetProcAddressA(h, "ConvertINetMultiByteToUnicode"); | ||
| 648 | ConvertINetUnicodeToMultiByte = (CONVERTINETUNICODETOMULTIBYTE)GetProcAddressA(h, "ConvertINetUnicodeToMultiByte"); | ||
| 649 | return TRUE; | ||
| 650 | } | ||
| 651 | |||
| 652 | static iconv_t | ||
| 653 | iconv_open(const char *tocode, const char *fromcode) | ||
| 654 | { | ||
| 655 | rec_iconv_t *cd; | ||
| 656 | |||
| 657 | cd = (rec_iconv_t *)xzalloc(sizeof(rec_iconv_t)); | ||
| 658 | |||
| 659 | /* reset the errno to prevent reporting wrong error code. | ||
| 660 | * 0 for unsorted error. */ | ||
| 661 | errno = 0; | ||
| 662 | if (make_csconv(fromcode, &cd->from) && make_csconv(tocode, &cd->to)) { | ||
| 663 | cd->cd = (iconv_t)cd; | ||
| 664 | return (iconv_t)cd; | ||
| 665 | } | ||
| 666 | |||
| 667 | free(cd); | ||
| 668 | return (iconv_t)(-1); | ||
| 669 | } | ||
| 670 | |||
| 671 | static int | ||
| 672 | iconv_close(iconv_t _cd) | ||
| 673 | { | ||
| 674 | free(_cd); | ||
| 675 | return 0; | ||
| 676 | } | ||
| 677 | |||
/*
 * Convert the bytes at *inbuf into the target encoding at *outbuf.
 *
 * The input is processed one character at a time: it is decoded into
 * UTF-16 in a small local buffer, optionally remapped through the
 * source and target compatibility tables, then encoded to the target.
 * *inbuf, *inbytesleft, *outbuf and *outbytesleft are advanced after
 * each character that was fully handled.
 *
 * When inbuf or *inbuf is NULL the call is a flush/reset request: the
 * target's flush hook (if any) is run and both shift states are reset.
 *
 * Returns 0 on success and (size_t)(-1) on failure with errno set by
 * the failing converter.  On failure the saved shift states are
 * restored so the failing character can be retried.
 */
static size_t
iconv(iconv_t _cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft)
{
    rec_iconv_t *cd = (rec_iconv_t *)_cd;
    ushort wbuf[MB_CHAR_MAX]; /* enough room for one character */
    int insize;
    int outsize;
    int wsize;
    DWORD frommode;
    DWORD tomode;
    uint wc;
    compat_t *cp;
    int i;

    /* Flush / reset request. */
    if (inbuf == NULL || *inbuf == NULL)
    {
        if (outbuf != NULL && *outbuf != NULL && cd->to.flush != NULL)
        {
            tomode = cd->to.mode;
            outsize = cd->to.flush(&cd->to, (uchar *)*outbuf, *outbytesleft);
            if (outsize == -1)
            {
                /* With //IGNORE only a full output buffer is fatal. */
                if ((cd->to.flags & FLAG_IGNORE) && errno != E2BIG)
                {
                    outsize = 0;
                }
                else
                {
                    cd->to.mode = tomode;
                    return (size_t)(-1);
                }
            }
            *outbuf += outsize;
            *outbytesleft -= outsize;
        }
        cd->from.mode = 0;
        cd->to.mode = 0;
        return 0;
    }

    while (*inbytesleft != 0)
    {
        /* Remember both shift states so a failure can roll them back. */
        frommode = cd->from.mode;
        tomode = cd->to.mode;
        wsize = MB_CHAR_MAX;

        insize = cd->from.mbtowc(&cd->from, (const uchar *)*inbuf, *inbytesleft, wbuf, &wsize);
        if (insize == -1)
        {
            /* Note: the ignore flag is taken from the target side. */
            if (cd->to.flags & FLAG_IGNORE)
            {
                /* Skip a single input byte and produce nothing. */
                cd->from.mode = frommode;
                insize = 1;
                wsize = 0;
            }
            else
            {
                cd->from.mode = frommode;
                return (size_t)(-1);
            }
        }

        /* Input that yields no character (e.g. a byte-order mark or an
         * escape sequence) is consumed without writing any output. */
        if (wsize == 0)
        {
            *inbuf += insize;
            *inbytesleft -= insize;
            continue;
        }

        /* Source-side compatibility mapping: out -> in for COMPAT_IN rows. */
        if (cd->from.compat != NULL)
        {
            wc = utf16_to_ucs4(wbuf);
            cp = cd->from.compat;
            for (i = 0; cp[i].in != 0; ++i)
            {
                if ((cp[i].flag & COMPAT_IN) && cp[i].out == wc)
                {
                    ucs4_to_utf16(cp[i].in, wbuf, &wsize);
                    break;
                }
            }
        }

        /* Target-side compatibility mapping: in -> out for COMPAT_OUT rows. */
        if (cd->to.compat != NULL)
        {
            wc = utf16_to_ucs4(wbuf);
            cp = cd->to.compat;
            for (i = 0; cp[i].in != 0; ++i)
            {
                if ((cp[i].flag & COMPAT_OUT) && cp[i].in == wc)
                {
                    ucs4_to_utf16(cp[i].out, wbuf, &wsize);
                    break;
                }
            }
        }

        outsize = cd->to.wctomb(&cd->to, wbuf, wsize, (uchar *)*outbuf, *outbytesleft);
        if (outsize == -1)
        {
            if ((cd->to.flags & FLAG_IGNORE) && errno != E2BIG)
            {
                /* Drop the unconvertible character but still consume
                 * its input bytes below. */
                cd->to.mode = tomode;
                outsize = 0;
            }
            else
            {
                cd->from.mode = frommode;
                cd->to.mode = tomode;
                return (size_t)(-1);
            }
        }

        *inbuf += insize;
        *outbuf += outsize;
        *inbytesleft -= insize;
        *outbytesleft -= outsize;
    }

    return 0;
}
| 799 | |||
| 800 | static int | ||
| 801 | make_csconv(const char *_name, csconv_t *cv) | ||
| 802 | { | ||
| 803 | CPINFO cpinfo; | ||
| 804 | int use_compat = TRUE; | ||
| 805 | int flag = 0; | ||
| 806 | char *name; | ||
| 807 | char *p, *s; | ||
| 808 | |||
| 809 | name = xstrdup(_name); | ||
| 810 | |||
| 811 | /* check for option "enc_name//opt1//opt2" */ | ||
| 812 | while ((p = strrstr(name, "//")) != NULL) | ||
| 813 | { | ||
| 814 | for (s = p + 2; *s; ++s) | ||
| 815 | *s = tolower(*s); | ||
| 816 | switch (index_in_strings("nocompat\0translit\0ignore\0", p + 2)) { | ||
| 817 | case 0: | ||
| 818 | use_compat = FALSE; | ||
| 819 | break; | ||
| 820 | case 1: | ||
| 821 | flag |= FLAG_TRANSLIT; | ||
| 822 | break; | ||
| 823 | case 2: | ||
| 824 | flag |= FLAG_IGNORE; | ||
| 825 | break; | ||
| 826 | } | ||
| 827 | *p = 0; | ||
| 828 | } | ||
| 829 | |||
| 830 | cv->mode = 0; | ||
| 831 | cv->flags = flag; | ||
| 832 | cv->mblen = NULL; | ||
| 833 | cv->flush = NULL; | ||
| 834 | cv->compat = NULL; | ||
| 835 | cv->codepage = name_to_codepage(name); | ||
| 836 | if (cv->codepage == 1200 || cv->codepage == 1201) | ||
| 837 | { | ||
| 838 | cv->mbtowc = utf16_mbtowc; | ||
| 839 | cv->wctomb = utf16_wctomb; | ||
| 840 | if (_stricmp(name, "UTF-16") == 0 || _stricmp(name, "UTF16") == 0 || | ||
| 841 | _stricmp(name, "UCS-2") == 0 || _stricmp(name, "UCS2") == 0 || | ||
| 842 | _stricmp(name,"UCS-2-INTERNAL") == 0) | ||
| 843 | cv->flags |= FLAG_USE_BOM; | ||
| 844 | } | ||
| 845 | else if (cv->codepage == 12000 || cv->codepage == 12001) | ||
| 846 | { | ||
| 847 | cv->mbtowc = utf32_mbtowc; | ||
| 848 | cv->wctomb = utf32_wctomb; | ||
| 849 | if (_stricmp(name, "UTF-32") == 0 || _stricmp(name, "UTF32") == 0 || | ||
| 850 | _stricmp(name, "UCS-4") == 0 || _stricmp(name, "UCS4") == 0) | ||
| 851 | cv->flags |= FLAG_USE_BOM; | ||
| 852 | } | ||
| 853 | else if (cv->codepage == 65001) | ||
| 854 | { | ||
| 855 | cv->mbtowc = kernel_mbtowc; | ||
| 856 | cv->wctomb = kernel_wctomb; | ||
| 857 | cv->mblen = utf8_mblen; | ||
| 858 | } | ||
| 859 | else if ((cv->codepage == 50220 || cv->codepage == 50221 || cv->codepage == 50222) && load_mlang()) | ||
| 860 | { | ||
| 861 | cv->mbtowc = iso2022jp_mbtowc; | ||
| 862 | cv->wctomb = iso2022jp_wctomb; | ||
| 863 | cv->flush = iso2022jp_flush; | ||
| 864 | } | ||
| 865 | else if (cv->codepage == 51932 && load_mlang()) | ||
| 866 | { | ||
| 867 | cv->mbtowc = mlang_mbtowc; | ||
| 868 | cv->wctomb = mlang_wctomb; | ||
| 869 | cv->mblen = eucjp_mblen; | ||
| 870 | } | ||
| 871 | else if (IsValidCodePage(cv->codepage) | ||
| 872 | && GetCPInfo(cv->codepage, &cpinfo) != 0) | ||
| 873 | { | ||
| 874 | cv->mbtowc = kernel_mbtowc; | ||
| 875 | cv->wctomb = kernel_wctomb; | ||
| 876 | if (cpinfo.MaxCharSize == 1) | ||
| 877 | cv->mblen = sbcs_mblen; | ||
| 878 | else if (cpinfo.MaxCharSize == 2) | ||
| 879 | cv->mblen = dbcs_mblen; | ||
| 880 | else | ||
| 881 | cv->mblen = mbcs_mblen; | ||
| 882 | } | ||
| 883 | else | ||
| 884 | { | ||
| 885 | /* not supported */ | ||
| 886 | free(name); | ||
| 887 | errno = EINVAL; | ||
| 888 | return FALSE; | ||
| 889 | } | ||
| 890 | |||
| 891 | if (use_compat) | ||
| 892 | { | ||
| 893 | switch (cv->codepage) | ||
| 894 | { | ||
| 895 | case 932: cv->compat = cp932_compat; break; | ||
| 896 | case 20932: cv->compat = cp20932_compat; break; | ||
| 897 | case 51932: cv->compat = cp51932_compat; break; | ||
| 898 | case 50220: case 50221: case 50222: cv->compat = cp5022x_compat; break; | ||
| 899 | } | ||
| 900 | } | ||
| 901 | |||
| 902 | free(name); | ||
| 903 | |||
| 904 | return TRUE; | ||
| 905 | } | ||
| 906 | |||
| 907 | static int | ||
| 908 | name_to_codepage(const char *name) | ||
| 909 | { | ||
| 910 | int i; | ||
| 911 | const char *alias; | ||
| 912 | |||
| 913 | if (*name == '\0' || strcmp(name, "char") == 0) | ||
| 914 | return GetACP(); | ||
| 915 | else if (strcmp(name, "wchar_t") == 0) | ||
| 916 | return 1200; | ||
| 917 | else if (_strnicmp(name, "cp", 2) == 0) | ||
| 918 | return atoi(name + 2); /* CP123 */ | ||
| 919 | else if ('0' <= name[0] && name[0] <= '9') | ||
| 920 | return atoi(name); /* 123 */ | ||
| 921 | else if (_strnicmp(name, "xx", 2) == 0) | ||
| 922 | return atoi(name + 2); /* XX123 for debug */ | ||
| 923 | |||
| 924 | i = 0; | ||
| 925 | alias = cp_alias; | ||
| 926 | while (*alias) { | ||
| 927 | if (_stricmp(alias, name) == 0) { | ||
| 928 | return cp_codepage[i]; | ||
| 929 | } | ||
| 930 | alias += strlen(alias) + 1; | ||
| 931 | ++i; | ||
| 932 | } | ||
| 933 | return -1; | ||
| 934 | } | ||
| 935 | |||
| 936 | /* | ||
| 937 | * http://www.faqs.org/rfcs/rfc2781.html | ||
| 938 | */ | ||
| 939 | static uint | ||
| 940 | utf16_to_ucs4(const ushort *wbuf) | ||
| 941 | { | ||
| 942 | uint wc = wbuf[0]; | ||
| 943 | if (0xD800 <= wbuf[0] && wbuf[0] <= 0xDBFF) | ||
| 944 | wc = ((wbuf[0] & 0x3FF) << 10) + (wbuf[1] & 0x3FF) + 0x10000; | ||
| 945 | return wc; | ||
| 946 | } | ||
| 947 | |||
| 948 | static void | ||
| 949 | ucs4_to_utf16(uint wc, ushort *wbuf, int *wbufsize) | ||
| 950 | { | ||
| 951 | if (wc < 0x10000) | ||
| 952 | { | ||
| 953 | wbuf[0] = wc; | ||
| 954 | *wbufsize = 1; | ||
| 955 | } | ||
| 956 | else | ||
| 957 | { | ||
| 958 | wc -= 0x10000; | ||
| 959 | wbuf[0] = 0xD800 | ((wc >> 10) & 0x3FF); | ||
| 960 | wbuf[1] = 0xDC00 | (wc & 0x3FF); | ||
| 961 | *wbufsize = 2; | ||
| 962 | } | ||
| 963 | } | ||
| 964 | |||
| 965 | /* | ||
| 966 | * Check if codepage is one of those for which the dwFlags parameter | ||
| 967 | * to MultiByteToWideChar() must be zero. Return zero or | ||
| 968 | * MB_ERR_INVALID_CHARS. The docs in Platform SDK for Windows | ||
| 969 | * Server 2003 R2 claims that also codepage 65001 is one of these, but | ||
| 970 | * that doesn't seem to be the case. The MSDN docs for MSVS2008 leave | ||
| 971 | * out 65001 (UTF-8), and that indeed seems to be the case on XP, it | ||
| 972 | * works fine to pass MB_ERR_INVALID_CHARS in dwFlags when converting | ||
| 973 | * from UTF-8. | ||
| 974 | */ | ||
| 975 | static int | ||
| 976 | mbtowc_flags(int codepage) | ||
| 977 | { | ||
| 978 | return (codepage == 50220 || codepage == 50221 || | ||
| 979 | codepage == 50222 || codepage == 50225 || | ||
| 980 | codepage == 50227 || codepage == 50229 || | ||
| 981 | codepage == 52936 || codepage == 54936 || | ||
| 982 | (codepage >= 57002 && codepage <= 57011) || | ||
| 983 | codepage == 65000 || codepage == 42) ? 0 : MB_ERR_INVALID_CHARS; | ||
| 984 | } | ||
| 985 | |||
/*
 * Tell whether `codepage` is one for which the lpUsedDefaultChar
 * argument of WideCharToMultiByte() must be NULL.  Returns 1 if so,
 * 0 otherwise.
 *
 * The Platform SDK docs for Windows Server 2003 R2 give the list used
 * below, whereas the MSDN docs for MSVS2008 name only 65000 (UTF-7) and
 * 65001 (UTF-8).  Here the earlier Platform SDK seems to be the correct
 * one, at least for XP.
 */
static int
must_use_null_useddefaultchar(int codepage)
{
    if (codepage >= 57002 && codepage <= 57011)
        return 1;

    switch (codepage) {
    case 42:
    case 50220:
    case 50221:
    case 50222:
    case 50225:
    case 50227:
    case 50229:
    case 52936:
    case 54936:
    case 65000:
    case 65001:
        return 1;
    default:
        return 0;
    }
}
| 1005 | |||
/*
 * Store `err` in errno and return -1, so that a failing converter can
 * write "return seterror(E...);" in a single statement.
 */
static int
seterror(int err)
{
    const int failure = -1;

    errno = err;
    return failure;
}
| 1012 | |||
| 1013 | static int | ||
| 1014 | sbcs_mblen(csconv_t *cv UNUSED_PARAM, const uchar *buf UNUSED_PARAM, | ||
| 1015 | int bufsize UNUSED_PARAM) | ||
| 1016 | { | ||
| 1017 | return 1; | ||
| 1018 | } | ||
| 1019 | |||
| 1020 | static int | ||
| 1021 | dbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize) | ||
| 1022 | { | ||
| 1023 | int len = IsDBCSLeadByteEx(cv->codepage, buf[0]) ? 2 : 1; | ||
| 1024 | if (bufsize < len) | ||
| 1025 | return seterror(EINVAL); | ||
| 1026 | return len; | ||
| 1027 | } | ||
| 1028 | |||
| 1029 | static int | ||
| 1030 | mbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize) | ||
| 1031 | { | ||
| 1032 | int len = 0; | ||
| 1033 | |||
| 1034 | if (cv->codepage == 54936) { | ||
| 1035 | if (buf[0] <= 0x7F) | ||
| 1036 | len = 1; | ||
| 1037 | else if (buf[0] >= 0x81 && buf[0] <= 0xFE && | ||
| 1038 | bufsize >= 2 && | ||
| 1039 | ((buf[1] >= 0x40 && buf[1] <= 0x7E) || | ||
| 1040 | (buf[1] >= 0x80 && buf[1] <= 0xFE))) | ||
| 1041 | len = 2; | ||
| 1042 | else if (buf[0] >= 0x81 && buf[0] <= 0xFE && | ||
| 1043 | bufsize >= 4 && | ||
| 1044 | buf[1] >= 0x30 && buf[1] <= 0x39) | ||
| 1045 | len = 4; | ||
| 1046 | else | ||
| 1047 | return seterror(EINVAL); | ||
| 1048 | return len; | ||
| 1049 | } | ||
| 1050 | else | ||
| 1051 | return seterror(EINVAL); | ||
| 1052 | } | ||
| 1053 | |||
| 1054 | static int | ||
| 1055 | utf8_mblen(csconv_t *cv UNUSED_PARAM, const uchar *buf, int bufsize) | ||
| 1056 | { | ||
| 1057 | int len = 0; | ||
| 1058 | |||
| 1059 | if (buf[0] < 0x80) len = 1; | ||
| 1060 | else if ((buf[0] & 0xE0) == 0xC0) len = 2; | ||
| 1061 | else if ((buf[0] & 0xF0) == 0xE0) len = 3; | ||
| 1062 | else if ((buf[0] & 0xF8) == 0xF0) len = 4; | ||
| 1063 | else if ((buf[0] & 0xFC) == 0xF8) len = 5; | ||
| 1064 | else if ((buf[0] & 0xFE) == 0xFC) len = 6; | ||
| 1065 | |||
| 1066 | if (len == 0) | ||
| 1067 | return seterror(EILSEQ); | ||
| 1068 | else if (bufsize < len) | ||
| 1069 | return seterror(EINVAL); | ||
| 1070 | return len; | ||
| 1071 | } | ||
| 1072 | |||
| 1073 | static int | ||
| 1074 | eucjp_mblen(csconv_t *cv UNUSED_PARAM, const uchar *buf, int bufsize) | ||
| 1075 | { | ||
| 1076 | if (buf[0] < 0x80) /* ASCII */ | ||
| 1077 | return 1; | ||
| 1078 | else if (buf[0] == 0x8E) /* JIS X 0201 */ | ||
| 1079 | { | ||
| 1080 | if (bufsize < 2) | ||
| 1081 | return seterror(EINVAL); | ||
| 1082 | else if (!(0xA1 <= buf[1] && buf[1] <= 0xDF)) | ||
| 1083 | return seterror(EILSEQ); | ||
| 1084 | return 2; | ||
| 1085 | } | ||
| 1086 | else if (buf[0] == 0x8F) /* JIS X 0212 */ | ||
| 1087 | { | ||
| 1088 | if (bufsize < 3) | ||
| 1089 | return seterror(EINVAL); | ||
| 1090 | else if (!(0xA1 <= buf[1] && buf[1] <= 0xFE) | ||
| 1091 | || !(0xA1 <= buf[2] && buf[2] <= 0xFE)) | ||
| 1092 | return seterror(EILSEQ); | ||
| 1093 | return 3; | ||
| 1094 | } | ||
| 1095 | else /* JIS X 0208 */ | ||
| 1096 | { | ||
| 1097 | if (bufsize < 2) | ||
| 1098 | return seterror(EINVAL); | ||
| 1099 | else if (!(0xA1 <= buf[0] && buf[0] <= 0xFE) | ||
| 1100 | || !(0xA1 <= buf[1] && buf[1] <= 0xFE)) | ||
| 1101 | return seterror(EILSEQ); | ||
| 1102 | return 2; | ||
| 1103 | } | ||
| 1104 | } | ||
| 1105 | |||
| 1106 | static int | ||
| 1107 | kernel_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize) | ||
| 1108 | { | ||
| 1109 | int len; | ||
| 1110 | |||
| 1111 | len = cv->mblen(cv, buf, bufsize); | ||
| 1112 | if (len == -1) | ||
| 1113 | return -1; | ||
| 1114 | /* If converting from ASCII, reject 8bit | ||
| 1115 | * chars. MultiByteToWideChar() doesn't. Note that for ASCII we | ||
| 1116 | * know that the mblen function is sbcs_mblen() so len is 1. | ||
| 1117 | */ | ||
| 1118 | if (cv->codepage == 20127 && buf[0] >= 0x80) | ||
| 1119 | return seterror(EILSEQ); | ||
| 1120 | *wbufsize = MultiByteToWideChar(cv->codepage, mbtowc_flags (cv->codepage), | ||
| 1121 | (const char *)buf, len, (wchar_t *)wbuf, *wbufsize); | ||
| 1122 | if (*wbufsize == 0) | ||
| 1123 | return seterror(EILSEQ); | ||
| 1124 | return len; | ||
| 1125 | } | ||
| 1126 | |||
| 1127 | static int | ||
| 1128 | kernel_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize) | ||
| 1129 | { | ||
| 1130 | BOOL usedDefaultChar = 0; | ||
| 1131 | BOOL *p = NULL; | ||
| 1132 | int flags = 0; | ||
| 1133 | int len; | ||
| 1134 | |||
| 1135 | if (bufsize == 0) | ||
| 1136 | return seterror(E2BIG); | ||
| 1137 | if (!must_use_null_useddefaultchar(cv->codepage)) | ||
| 1138 | { | ||
| 1139 | p = &usedDefaultChar; | ||
| 1140 | #ifdef WC_NO_BEST_FIT_CHARS | ||
| 1141 | if (!(cv->flags & FLAG_TRANSLIT)) | ||
| 1142 | flags |= WC_NO_BEST_FIT_CHARS; | ||
| 1143 | #endif | ||
| 1144 | } | ||
| 1145 | len = WideCharToMultiByte(cv->codepage, flags, | ||
| 1146 | (const wchar_t *)wbuf, wbufsize, (char *)buf, bufsize, NULL, p); | ||
| 1147 | if (len == 0) | ||
| 1148 | { | ||
| 1149 | if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) | ||
| 1150 | return seterror(E2BIG); | ||
| 1151 | return seterror(EILSEQ); | ||
| 1152 | } | ||
| 1153 | else if (usedDefaultChar && !(cv->flags & FLAG_TRANSLIT)) | ||
| 1154 | return seterror(EILSEQ); | ||
| 1155 | else if (cv->mblen(cv, buf, len) != len) /* validate result */ | ||
| 1156 | return seterror(EILSEQ); | ||
| 1157 | return len; | ||
| 1158 | } | ||
| 1159 | |||
| 1160 | /* | ||
| 1161 | * It seems that the mode (cv->mode) is fixnum. | ||
| 1162 | * For example, when converting iso-2022-jp(cp50221) to unicode: | ||
| 1163 | * in ascii sequence: mode=0xC42C0000 | ||
| 1164 | * in jisx0208 sequence: mode=0xC42C0001 | ||
| 1165 | * "C42C" is same for each convert session. | ||
| 1166 | * It should be: ((codepage-1)<<16)|state | ||
| 1167 | */ | ||
| 1168 | static int | ||
| 1169 | mlang_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize) | ||
| 1170 | { | ||
| 1171 | int len; | ||
| 1172 | int insize; | ||
| 1173 | HRESULT hr; | ||
| 1174 | |||
| 1175 | len = cv->mblen(cv, buf, bufsize); | ||
| 1176 | if (len == -1) | ||
| 1177 | return -1; | ||
| 1178 | insize = len; | ||
| 1179 | hr = ConvertINetMultiByteToUnicode(&cv->mode, cv->codepage, | ||
| 1180 | (const char *)buf, &insize, (wchar_t *)wbuf, wbufsize); | ||
| 1181 | if (hr != S_OK || insize != len) | ||
| 1182 | return seterror(EILSEQ); | ||
| 1183 | return len; | ||
| 1184 | } | ||
| 1185 | |||
| 1186 | static int | ||
| 1187 | mlang_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize) | ||
| 1188 | { | ||
| 1189 | char tmpbuf[MB_CHAR_MAX]; /* enough room for one character */ | ||
| 1190 | int tmpsize = MB_CHAR_MAX; | ||
| 1191 | int insize = wbufsize; | ||
| 1192 | HRESULT hr; | ||
| 1193 | |||
| 1194 | hr = ConvertINetUnicodeToMultiByte(&cv->mode, cv->codepage, | ||
| 1195 | (const wchar_t *)wbuf, &wbufsize, tmpbuf, &tmpsize); | ||
| 1196 | if (hr != S_OK || insize != wbufsize) | ||
| 1197 | return seterror(EILSEQ); | ||
| 1198 | else if (bufsize < tmpsize) | ||
| 1199 | return seterror(E2BIG); | ||
| 1200 | else if (cv->mblen(cv, (uchar *)tmpbuf, tmpsize) != tmpsize) | ||
| 1201 | return seterror(EILSEQ); | ||
| 1202 | memcpy(buf, tmpbuf, tmpsize); | ||
| 1203 | return tmpsize; | ||
| 1204 | } | ||
| 1205 | |||
| 1206 | static int | ||
| 1207 | utf16_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize) | ||
| 1208 | { | ||
| 1209 | int codepage = cv->codepage; | ||
| 1210 | |||
| 1211 | /* swap endian: 1200 <-> 1201 */ | ||
| 1212 | if (cv->mode & UNICODE_MODE_SWAPPED) | ||
| 1213 | codepage ^= 1; | ||
| 1214 | |||
| 1215 | if (bufsize < 2) | ||
| 1216 | return seterror(EINVAL); | ||
| 1217 | if (codepage == 1200) /* little endian */ | ||
| 1218 | wbuf[0] = (buf[1] << 8) | buf[0]; | ||
| 1219 | else if (codepage == 1201) /* big endian */ | ||
| 1220 | wbuf[0] = (buf[0] << 8) | buf[1]; | ||
| 1221 | |||
| 1222 | if ((cv->flags & FLAG_USE_BOM) && !(cv->mode & UNICODE_MODE_BOM_DONE)) | ||
| 1223 | { | ||
| 1224 | cv->mode |= UNICODE_MODE_BOM_DONE; | ||
| 1225 | if (wbuf[0] == 0xFFFE) | ||
| 1226 | { | ||
| 1227 | cv->mode |= UNICODE_MODE_SWAPPED; | ||
| 1228 | *wbufsize = 0; | ||
| 1229 | return 2; | ||
| 1230 | } | ||
| 1231 | else if (wbuf[0] == 0xFEFF) | ||
| 1232 | { | ||
| 1233 | *wbufsize = 0; | ||
| 1234 | return 2; | ||
| 1235 | } | ||
| 1236 | } | ||
| 1237 | |||
| 1238 | if (0xDC00 <= wbuf[0] && wbuf[0] <= 0xDFFF) | ||
| 1239 | return seterror(EILSEQ); | ||
| 1240 | if (0xD800 <= wbuf[0] && wbuf[0] <= 0xDBFF) | ||
| 1241 | { | ||
| 1242 | if (bufsize < 4) | ||
| 1243 | return seterror(EINVAL); | ||
| 1244 | if (codepage == 1200) /* little endian */ | ||
| 1245 | wbuf[1] = (buf[3] << 8) | buf[2]; | ||
| 1246 | else if (codepage == 1201) /* big endian */ | ||
| 1247 | wbuf[1] = (buf[2] << 8) | buf[3]; | ||
| 1248 | if (!(0xDC00 <= wbuf[1] && wbuf[1] <= 0xDFFF)) | ||
| 1249 | return seterror(EILSEQ); | ||
| 1250 | *wbufsize = 2; | ||
| 1251 | return 4; | ||
| 1252 | } | ||
| 1253 | *wbufsize = 1; | ||
| 1254 | return 2; | ||
| 1255 | } | ||
| 1256 | |||
| 1257 | static int | ||
| 1258 | utf16_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize) | ||
| 1259 | { | ||
| 1260 | if ((cv->flags & FLAG_USE_BOM) && !(cv->mode & UNICODE_MODE_BOM_DONE)) | ||
| 1261 | { | ||
| 1262 | int r; | ||
| 1263 | |||
| 1264 | cv->mode |= UNICODE_MODE_BOM_DONE; | ||
| 1265 | if (bufsize < 2) | ||
| 1266 | return seterror(E2BIG); | ||
| 1267 | if (cv->codepage == 1200) /* little endian */ | ||
| 1268 | memcpy(buf, "\xFF\xFE", 2); | ||
| 1269 | else if (cv->codepage == 1201) /* big endian */ | ||
| 1270 | memcpy(buf, "\xFE\xFF", 2); | ||
| 1271 | |||
| 1272 | r = utf16_wctomb(cv, wbuf, wbufsize, buf + 2, bufsize - 2); | ||
| 1273 | if (r == -1) | ||
| 1274 | return -1; | ||
| 1275 | return r + 2; | ||
| 1276 | } | ||
| 1277 | |||
| 1278 | if (bufsize < 2) | ||
| 1279 | return seterror(E2BIG); | ||
| 1280 | if (cv->codepage == 1200) /* little endian */ | ||
| 1281 | { | ||
| 1282 | buf[0] = (wbuf[0] & 0x00FF); | ||
| 1283 | buf[1] = (wbuf[0] & 0xFF00) >> 8; | ||
| 1284 | } | ||
| 1285 | else if (cv->codepage == 1201) /* big endian */ | ||
| 1286 | { | ||
| 1287 | buf[0] = (wbuf[0] & 0xFF00) >> 8; | ||
| 1288 | buf[1] = (wbuf[0] & 0x00FF); | ||
| 1289 | } | ||
| 1290 | if (0xD800 <= wbuf[0] && wbuf[0] <= 0xDBFF) | ||
| 1291 | { | ||
| 1292 | if (bufsize < 4) | ||
| 1293 | return seterror(E2BIG); | ||
| 1294 | if (cv->codepage == 1200) /* little endian */ | ||
| 1295 | { | ||
| 1296 | buf[2] = (wbuf[1] & 0x00FF); | ||
| 1297 | buf[3] = (wbuf[1] & 0xFF00) >> 8; | ||
| 1298 | } | ||
| 1299 | else if (cv->codepage == 1201) /* big endian */ | ||
| 1300 | { | ||
| 1301 | buf[2] = (wbuf[1] & 0xFF00) >> 8; | ||
| 1302 | buf[3] = (wbuf[1] & 0x00FF); | ||
| 1303 | } | ||
| 1304 | return 4; | ||
| 1305 | } | ||
| 1306 | return 2; | ||
| 1307 | } | ||
| 1308 | |||
| 1309 | static int | ||
| 1310 | utf32_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize) | ||
| 1311 | { | ||
| 1312 | int codepage = cv->codepage; | ||
| 1313 | uint wc = 0xD800; | ||
| 1314 | |||
| 1315 | /* swap endian: 12000 <-> 12001 */ | ||
| 1316 | if (cv->mode & UNICODE_MODE_SWAPPED) | ||
| 1317 | codepage ^= 1; | ||
| 1318 | |||
| 1319 | if (bufsize < 4) | ||
| 1320 | return seterror(EINVAL); | ||
| 1321 | if (codepage == 12000) /* little endian */ | ||
| 1322 | wc = (buf[3] << 24) | (buf[2] << 16) | (buf[1] << 8) | buf[0]; | ||
| 1323 | else if (codepage == 12001) /* big endian */ | ||
| 1324 | wc = (buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3]; | ||
| 1325 | |||
| 1326 | if ((cv->flags & FLAG_USE_BOM) && !(cv->mode & UNICODE_MODE_BOM_DONE)) | ||
| 1327 | { | ||
| 1328 | cv->mode |= UNICODE_MODE_BOM_DONE; | ||
| 1329 | if (wc == 0xFFFE0000) | ||
| 1330 | { | ||
| 1331 | cv->mode |= UNICODE_MODE_SWAPPED; | ||
| 1332 | *wbufsize = 0; | ||
| 1333 | return 4; | ||
| 1334 | } | ||
| 1335 | else if (wc == 0x0000FEFF) | ||
| 1336 | { | ||
| 1337 | *wbufsize = 0; | ||
| 1338 | return 4; | ||
| 1339 | } | ||
| 1340 | } | ||
| 1341 | |||
| 1342 | if ((0xD800 <= wc && wc <= 0xDFFF) || 0x10FFFF < wc) | ||
| 1343 | return seterror(EILSEQ); | ||
| 1344 | ucs4_to_utf16(wc, wbuf, wbufsize); | ||
| 1345 | return 4; | ||
| 1346 | } | ||
| 1347 | |||
| 1348 | static int | ||
| 1349 | utf32_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize) | ||
| 1350 | { | ||
| 1351 | uint wc; | ||
| 1352 | |||
| 1353 | if ((cv->flags & FLAG_USE_BOM) && !(cv->mode & UNICODE_MODE_BOM_DONE)) | ||
| 1354 | { | ||
| 1355 | int r; | ||
| 1356 | |||
| 1357 | cv->mode |= UNICODE_MODE_BOM_DONE; | ||
| 1358 | if (bufsize < 4) | ||
| 1359 | return seterror(E2BIG); | ||
| 1360 | if (cv->codepage == 12000) /* little endian */ | ||
| 1361 | memcpy(buf, "\xFF\xFE\x00\x00", 4); | ||
| 1362 | else if (cv->codepage == 12001) /* big endian */ | ||
| 1363 | memcpy(buf, "\x00\x00\xFE\xFF", 4); | ||
| 1364 | |||
| 1365 | r = utf32_wctomb(cv, wbuf, wbufsize, buf + 4, bufsize - 4); | ||
| 1366 | if (r == -1) | ||
| 1367 | return -1; | ||
| 1368 | return r + 4; | ||
| 1369 | } | ||
| 1370 | |||
| 1371 | if (bufsize < 4) | ||
| 1372 | return seterror(E2BIG); | ||
| 1373 | wc = utf16_to_ucs4(wbuf); | ||
| 1374 | if (cv->codepage == 12000) /* little endian */ | ||
| 1375 | { | ||
| 1376 | buf[0] = wc & 0x000000FF; | ||
| 1377 | buf[1] = (wc & 0x0000FF00) >> 8; | ||
| 1378 | buf[2] = (wc & 0x00FF0000) >> 16; | ||
| 1379 | buf[3] = (wc & 0xFF000000) >> 24; | ||
| 1380 | } | ||
| 1381 | else if (cv->codepage == 12001) /* big endian */ | ||
| 1382 | { | ||
| 1383 | buf[0] = (wc & 0xFF000000) >> 24; | ||
| 1384 | buf[1] = (wc & 0x00FF0000) >> 16; | ||
| 1385 | buf[2] = (wc & 0x0000FF00) >> 8; | ||
| 1386 | buf[3] = wc & 0x000000FF; | ||
| 1387 | } | ||
| 1388 | return 4; | ||
| 1389 | } | ||
| 1390 | |||
| 1391 | /* | ||
| 1392 | * 50220: ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS) | ||
| 1393 | * 50221: ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow | ||
| 1394 | * 1 byte Kana) | ||
| 1395 | * 50222: ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte | ||
| 1396 | * Kana - SO/SI) | ||
| 1397 | * | ||
| 1398 | * MultiByteToWideChar() and WideCharToMultiByte() behave differently | ||
| 1399 | * depending on Windows version. On XP, WideCharToMultiByte() doesn't | ||
| 1400 | * terminate result sequence with ascii escape. But Vista does. | ||
| 1401 | * Use MLang instead. | ||
| 1402 | */ | ||
| 1403 | |||
/*
 * The ISO-2022 conversion state is packed into one integer:
 * bits 8..15 hold the current character set, bits 0..7 the shift state.
 */
#define ISO2022_MODE(cs, shift) (((cs) << 8) | (shift))
#define ISO2022_MODE_CS(mode) (((mode) >> 8) & 0xFF)
#define ISO2022_MODE_SHIFT(mode) ((mode) & 0xFF)

/* Shift states stored in the low byte of the mode. */
#define ISO2022_SI 0
#define ISO2022_SO 1

/* shift in */
static const char iso2022_SI_seq[] = "\x0F";
/* shift out */
static const char iso2022_SO_seq[] = "\x0E";
| 1415 | |||
/* One escape sequence that designates a character set. */
typedef struct iso2022_esc_t iso2022_esc_t;
struct iso2022_esc_t {
    const char *esc;  /* bytes of the escape sequence */
    int esc_len;      /* number of bytes in esc */
    int len;          /* bytes per character once this set is selected */
    int cs;           /* character set id recorded in the conversion mode */
};
| 1423 | |||
/* Character set ids.  They double as indexes into iso2022jp_esc[]. */
#define ISO2022JP_CS_ASCII 0
#define ISO2022JP_CS_JISX0201_ROMAN 1
#define ISO2022JP_CS_JISX0201_KANA 2
#define ISO2022JP_CS_JISX0208_1978 3
#define ISO2022JP_CS_JISX0208_1983 4
#define ISO2022JP_CS_JISX0212 5

/*
 * Escape sequences understood for ISO-2022-JP, as
 * {esc, esc_len, len, cs}.  The decoder looks entries up by character
 * set id (iesc[cs]), so the row order has to follow the ISO2022JP_CS_*
 * numbering.  The row with a NULL esc ends the table.
 */
static iso2022_esc_t iso2022jp_esc[] = {
    {"\x1B\x28\x42", 3, 1, ISO2022JP_CS_ASCII},
    {"\x1B\x28\x4A", 3, 1, ISO2022JP_CS_JISX0201_ROMAN},
    {"\x1B\x28\x49", 3, 1, ISO2022JP_CS_JISX0201_KANA},
    {"\x1B\x24\x40", 3, 2, ISO2022JP_CS_JISX0208_1983}, /* unify 1978 with 1983 */
    {"\x1B\x24\x42", 3, 2, ISO2022JP_CS_JISX0208_1983},
    {"\x1B\x24\x28\x44", 4, 2, ISO2022JP_CS_JISX0212},
    {NULL, 0, 0, 0}
};
| 1440 | |||
/*
 * Decode one unit of ISO-2022-JP input.
 *
 * Escape sequences and the SO/SI control bytes only update cv->mode;
 * they are consumed with *wbufsize = 0.  An ordinary character is
 * converted by handing MLang a small self-contained string: the escape
 * sequence of the current character set followed by the character's
 * own bytes.  MLang is given a throw-away mode value, so the shift
 * state is tracked solely in cv->mode.
 *
 * Returns the number of input bytes consumed, or -1 with errno set to
 * EINVAL (truncated input) or EILSEQ (invalid or unsupported input).
 */
static int
iso2022jp_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize)
{
    iso2022_esc_t *iesc = iso2022jp_esc;
    char tmp[MB_CHAR_MAX];
    int insize;
    HRESULT hr;
    DWORD dummy = 0;
    int len;
    int esc_len;
    int cs;
    int shift;
    int i;

    if (buf[0] == 0x1B)
    {
        for (i = 0; iesc[i].esc != NULL; ++i)
        {
            esc_len = iesc[i].esc_len;
            if (bufsize < esc_len)
            {
                /* The input may be the beginning of this sequence:
                 * report it as incomplete rather than invalid. */
                if (strncmp((char *)buf, iesc[i].esc, bufsize) == 0)
                    return seterror(EINVAL);
            }
            else
            {
                if (strncmp((char *)buf, iesc[i].esc, esc_len) == 0)
                {
                    /* A new designation also resets the shift state. */
                    cv->mode = ISO2022_MODE(iesc[i].cs, ISO2022_SI);
                    *wbufsize = 0;
                    return esc_len;
                }
            }
        }
        /* not supported escape sequence */
        return seterror(EILSEQ);
    }
    else if (buf[0] == iso2022_SO_seq[0])
    {
        cv->mode = ISO2022_MODE(ISO2022_MODE_CS(cv->mode), ISO2022_SO);
        *wbufsize = 0;
        return 1;
    }
    else if (buf[0] == iso2022_SI_seq[0])
    {
        cv->mode = ISO2022_MODE(ISO2022_MODE_CS(cv->mode), ISO2022_SI);
        *wbufsize = 0;
        return 1;
    }

    cs = ISO2022_MODE_CS(cv->mode);
    shift = ISO2022_MODE_SHIFT(cv->mode);

    /* reset the mode for informal sequence */
    if (buf[0] < 0x20)
    {
        cs = ISO2022JP_CS_ASCII;
        shift = ISO2022_SI;
    }

    len = iesc[cs].len;
    if (bufsize < len)
        return seterror(EINVAL);
    /* Only 7-bit bytes are accepted as character data. */
    for (i = 0; i < len; ++i)
        if (!(buf[i] < 0x80))
            return seterror(EILSEQ);
    /* Build "<escape>[SO]<character>" in tmp for MLang. */
    esc_len = iesc[cs].esc_len;
    memcpy(tmp, iesc[cs].esc, esc_len);
    if (shift == ISO2022_SO)
    {
        memcpy(tmp + esc_len, iso2022_SO_seq, 1);
        esc_len += 1;
    }
    memcpy(tmp + esc_len, buf, len);

    if ((cv->codepage == 50220 || cv->codepage == 50221
                || cv->codepage == 50222) && shift == ISO2022_SO)
    {
        /* XXX: shift-out cannot be used for mbtowc (both kernel and
         * mlang) */
        /* Rebuild tmp with the JIS X 0201 Kana designation instead. */
        esc_len = iesc[ISO2022JP_CS_JISX0201_KANA].esc_len;
        memcpy(tmp, iesc[ISO2022JP_CS_JISX0201_KANA].esc, esc_len);
        memcpy(tmp + esc_len, buf, len);
    }

    insize = len + esc_len;
    hr = ConvertINetMultiByteToUnicode(&dummy, cv->codepage,
            (const char *)tmp, &insize, (wchar_t *)wbuf, wbufsize);
    if (hr != S_OK || insize != len + esc_len)
        return seterror(EILSEQ);

    /* Check for conversion error. Assuming defaultChar is 0x3F. */
    /* ascii should be converted from ascii */
    /* NOTE(review): this compares against cv->mode, not the locally
     * reset cs/shift, so a control byte (< 0x20) seen outside
     * ASCII/SI mode is presumably rejected here before the reset
     * below can take effect -- confirm whether that is intended. */
    if (wbuf[0] == buf[0]
            && cv->mode != ISO2022_MODE(ISO2022JP_CS_ASCII, ISO2022_SI))
        return seterror(EILSEQ);

    /* reset the mode for informal sequence */
    if (cv->mode != ISO2022_MODE(cs, shift))
        cv->mode = ISO2022_MODE(cs, shift);

    return len;
}
| 1544 | |||
| 1545 | static int | ||
| 1546 | iso2022jp_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize) | ||
| 1547 | { | ||
| 1548 | iso2022_esc_t *iesc = iso2022jp_esc; | ||
| 1549 | char tmp[MB_CHAR_MAX]; | ||
| 1550 | int tmpsize = MB_CHAR_MAX; | ||
| 1551 | int insize = wbufsize; | ||
| 1552 | HRESULT hr; | ||
| 1553 | DWORD dummy = 0; | ||
| 1554 | int len; | ||
| 1555 | int esc_len; | ||
| 1556 | int cs; | ||
| 1557 | int shift; | ||
| 1558 | int i; | ||
| 1559 | |||
| 1560 | /* | ||
| 1561 | * MultiByte = [escape sequence] + character + [escape sequence] | ||
| 1562 | * | ||
| 1563 | * Whether trailing escape sequence is added depends on which API is | ||
| 1564 | * used (kernel or MLang, and its version). | ||
| 1565 | */ | ||
| 1566 | hr = ConvertINetUnicodeToMultiByte(&dummy, cv->codepage, | ||
| 1567 | (const wchar_t *)wbuf, &wbufsize, tmp, &tmpsize); | ||
| 1568 | if (hr != S_OK || insize != wbufsize) | ||
| 1569 | return seterror(EILSEQ); | ||
| 1570 | else if (bufsize < tmpsize) | ||
| 1571 | return seterror(E2BIG); | ||
| 1572 | |||
| 1573 | if (tmpsize == 1) | ||
| 1574 | { | ||
| 1575 | cs = ISO2022JP_CS_ASCII; | ||
| 1576 | esc_len = 0; | ||
| 1577 | } | ||
| 1578 | else | ||
| 1579 | { | ||
| 1580 | for (i = 1; iesc[i].esc != NULL; ++i) | ||
| 1581 | { | ||
| 1582 | esc_len = iesc[i].esc_len; | ||
| 1583 | if (strncmp(tmp, iesc[i].esc, esc_len) == 0) | ||
| 1584 | { | ||
| 1585 | cs = iesc[i].cs; | ||
| 1586 | break; | ||
| 1587 | } | ||
| 1588 | } | ||
| 1589 | if (iesc[i].esc == NULL) | ||
| 1590 | /* not supported escape sequence */ | ||
| 1591 | return seterror(EILSEQ); | ||
| 1592 | } | ||
| 1593 | |||
| 1594 | shift = ISO2022_SI; | ||
| 1595 | if (tmp[esc_len] == iso2022_SO_seq[0]) | ||
| 1596 | { | ||
| 1597 | shift = ISO2022_SO; | ||
| 1598 | esc_len += 1; | ||
| 1599 | } | ||
| 1600 | |||
| 1601 | len = iesc[cs].len; | ||
| 1602 | |||
| 1603 | /* Check for converting error. Assuming defaultChar is 0x3F. */ | ||
| 1604 | /* ascii should be converted from ascii */ | ||
| 1605 | if (cs == ISO2022JP_CS_ASCII && !(wbuf[0] < 0x80)) | ||
| 1606 | return seterror(EILSEQ); | ||
| 1607 | else if (tmpsize < esc_len + len) | ||
| 1608 | return seterror(EILSEQ); | ||
| 1609 | |||
| 1610 | if (cv->mode == ISO2022_MODE(cs, shift)) | ||
| 1611 | { | ||
| 1612 | /* remove escape sequence */ | ||
| 1613 | if (esc_len != 0) | ||
| 1614 | memmove(tmp, tmp + esc_len, len); | ||
| 1615 | esc_len = 0; | ||
| 1616 | } | ||
| 1617 | else | ||
| 1618 | { | ||
| 1619 | if (cs == ISO2022JP_CS_ASCII) | ||
| 1620 | { | ||
| 1621 | esc_len = iesc[ISO2022JP_CS_ASCII].esc_len; | ||
| 1622 | memmove(tmp + esc_len, tmp, len); | ||
| 1623 | memcpy(tmp, iesc[ISO2022JP_CS_ASCII].esc, esc_len); | ||
| 1624 | } | ||
| 1625 | if (ISO2022_MODE_SHIFT(cv->mode) == ISO2022_SO) | ||
| 1626 | { | ||
| 1627 | /* shift-in before changing to other mode */ | ||
| 1628 | memmove(tmp + 1, tmp, len + esc_len); | ||
| 1629 | memcpy(tmp, iso2022_SI_seq, 1); | ||
| 1630 | esc_len += 1; | ||
| 1631 | } | ||
| 1632 | } | ||
| 1633 | |||
| 1634 | if (bufsize < len + esc_len) | ||
| 1635 | return seterror(E2BIG); | ||
| 1636 | memcpy(buf, tmp, len + esc_len); | ||
| 1637 | cv->mode = ISO2022_MODE(cs, shift); | ||
| 1638 | return len + esc_len; | ||
| 1639 | } | ||
| 1640 | |||
| 1641 | static int | ||
| 1642 | iso2022jp_flush(csconv_t *cv, uchar *buf, int bufsize) | ||
| 1643 | { | ||
| 1644 | iso2022_esc_t *iesc = iso2022jp_esc; | ||
| 1645 | int esc_len; | ||
| 1646 | |||
| 1647 | if (cv->mode != ISO2022_MODE(ISO2022JP_CS_ASCII, ISO2022_SI)) | ||
| 1648 | { | ||
| 1649 | esc_len = 0; | ||
| 1650 | if (ISO2022_MODE_SHIFT(cv->mode) != ISO2022_SI) | ||
| 1651 | esc_len += 1; | ||
| 1652 | if (ISO2022_MODE_CS(cv->mode) != ISO2022JP_CS_ASCII) | ||
| 1653 | esc_len += iesc[ISO2022JP_CS_ASCII].esc_len; | ||
| 1654 | if (bufsize < esc_len) | ||
| 1655 | return seterror(E2BIG); | ||
| 1656 | |||
| 1657 | esc_len = 0; | ||
| 1658 | if (ISO2022_MODE_SHIFT(cv->mode) != ISO2022_SI) | ||
| 1659 | { | ||
| 1660 | memcpy(buf, iso2022_SI_seq, 1); | ||
| 1661 | esc_len += 1; | ||
| 1662 | } | ||
| 1663 | if (ISO2022_MODE_CS(cv->mode) != ISO2022JP_CS_ASCII) | ||
| 1664 | { | ||
| 1665 | memcpy(buf + esc_len, iesc[ISO2022JP_CS_ASCII].esc, | ||
| 1666 | iesc[ISO2022JP_CS_ASCII].esc_len); | ||
| 1667 | esc_len += iesc[ISO2022JP_CS_ASCII].esc_len; | ||
| 1668 | } | ||
| 1669 | return esc_len; | ||
| 1670 | } | ||
| 1671 | return 0; | ||
| 1672 | } | ||
| 1673 | |||
| 1674 | static void process_file(iconv_t cd, FILE *in, FILE *out) | ||
| 1675 | { | ||
| 1676 | char inbuf[BUFSIZ]; | ||
| 1677 | char outbuf[BUFSIZ]; | ||
| 1678 | const char *pin; | ||
| 1679 | char *pout; | ||
| 1680 | size_t inbytesleft; | ||
| 1681 | size_t outbytesleft; | ||
| 1682 | size_t rest = 0; | ||
| 1683 | size_t r; | ||
| 1684 | |||
| 1685 | while ((inbytesleft=fread(inbuf+rest, 1, sizeof(inbuf)-rest, in)) != 0 | ||
| 1686 | || rest != 0) { | ||
| 1687 | inbytesleft += rest; | ||
| 1688 | pin = inbuf; | ||
| 1689 | pout = outbuf; | ||
| 1690 | outbytesleft = sizeof(outbuf); | ||
| 1691 | r = iconv(cd, &pin, &inbytesleft, &pout, &outbytesleft); | ||
| 1692 | fwrite(outbuf, 1, sizeof(outbuf) - outbytesleft, out); | ||
| 1693 | if (r == (size_t)(-1) && errno != E2BIG && | ||
| 1694 | (errno != EINVAL || feof(in))) | ||
| 1695 | bb_perror_msg_and_die("conversion error"); | ||
| 1696 | memmove(inbuf, pin, inbytesleft); | ||
| 1697 | rest = inbytesleft; | ||
| 1698 | if (rest == 0 && feof(in)) | ||
| 1699 | break; | ||
| 1700 | } | ||
| 1701 | pout = outbuf; | ||
| 1702 | outbytesleft = sizeof(outbuf); | ||
| 1703 | r = iconv(cd, NULL, NULL, &pout, &outbytesleft); | ||
| 1704 | fwrite(outbuf, 1, sizeof(outbuf) - outbytesleft, out); | ||
| 1705 | if (r == (size_t)(-1)) | ||
| 1706 | bb_perror_msg_and_die("conversion error"); | ||
| 1707 | } | ||
| 1708 | |||
/*
 * Bit flags returned by getopt32().  The bit positions follow the
 * order of the letters in the option string "f:t:lco:".
 */
enum {
	OPT_f = 0x01,	/* -f from-enc */
	OPT_t = 0x02,	/* -t to-enc */
	OPT_l = 0x04,	/* -l */
	OPT_c = 0x08,	/* -c */
	OPT_o = 0x10,	/* -o outfile */
};
| 1716 | |||
| 1717 | int iconv_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; | ||
| 1718 | int iconv_main(int argc, char **argv) | ||
| 1719 | { | ||
| 1720 | const char *fromcode = "", *tocode = "", *outfile; | ||
| 1721 | char *tmpname = NULL; | ||
| 1722 | int i, opt; | ||
| 1723 | iconv_t cd; | ||
| 1724 | FILE *in; | ||
| 1725 | FILE *out = stdout; | ||
| 1726 | |||
| 1727 | opt = getopt32(argv, "f:t:lco:", &fromcode, &tocode, &outfile); | ||
| 1728 | |||
| 1729 | if (opt & OPT_l) { | ||
| 1730 | const char *alias = cp_alias; | ||
| 1731 | while (*alias) { | ||
| 1732 | printf("%s\n", alias); | ||
| 1733 | alias += strlen(alias) + 1; | ||
| 1734 | } | ||
| 1735 | return 0; | ||
| 1736 | } | ||
| 1737 | |||
| 1738 | if (opt & OPT_o) { | ||
| 1739 | tmpname = xasprintf("%sXXXXXX", outfile); | ||
| 1740 | mktemp(tmpname); | ||
| 1741 | out = xfopen(tmpname, "wb"); | ||
| 1742 | } | ||
| 1743 | |||
| 1744 | if (opt & OPT_c) | ||
| 1745 | tocode = xasprintf("%s//IGNORE", tocode); | ||
| 1746 | |||
| 1747 | cd = iconv_open(tocode, fromcode); | ||
| 1748 | if (cd == (iconv_t)(-1)) | ||
| 1749 | bb_perror_msg_and_die("iconv_open error"); | ||
| 1750 | |||
| 1751 | if (optind == argc) | ||
| 1752 | argv[argc++] = (char *)"-"; | ||
| 1753 | |||
| 1754 | for (i=optind; i<argc; ++i) { | ||
| 1755 | if (argv[i][0] == '-' && argv[i][1] == '\0') | ||
| 1756 | in = stdin; | ||
| 1757 | else | ||
| 1758 | in = xfopen(argv[optind], "rb"); | ||
| 1759 | process_file(cd, in, out); | ||
| 1760 | fclose(in); | ||
| 1761 | } | ||
| 1762 | |||
| 1763 | if (tmpname) { | ||
| 1764 | fclose(out); | ||
| 1765 | xrename(tmpname, outfile); | ||
| 1766 | } | ||
| 1767 | |||
| 1768 | if (ENABLE_FEATURE_CLEAN_UP) | ||
| 1769 | iconv_close(cd); | ||
| 1770 | return 0; | ||
| 1771 | } | ||
