diff options
| author | Ron Yorston <rmy@pobox.com> | 2019-01-06 13:17:09 +0000 |
|---|---|---|
| committer | Ron Yorston <rmy@pobox.com> | 2019-01-06 13:26:19 +0000 |
| commit | 1fec4ebbdb930f6b8989be2e10c0f673803ac830 (patch) | |
| tree | b66e26698acde618b64b219c5709d59feaaea72b /miscutils | |
| parent | f192e653963344fdffa88d47bb16d40d6e63081f (diff) | |
| download | busybox-w32-1fec4ebbdb930f6b8989be2e10c0f673803ac830.tar.gz busybox-w32-1fec4ebbdb930f6b8989be2e10c0f673803ac830.tar.bz2 busybox-w32-1fec4ebbdb930f6b8989be2e10c0f673803ac830.zip | |
iconv: import from win-iconv
Source imported from https://github.com/win-iconv/win-iconv and
modified to build in busybox-w32.
Diffstat (limited to 'miscutils')
| -rw-r--r-- | miscutils/iconv.c | 1846 |
1 files changed, 1846 insertions, 0 deletions
diff --git a/miscutils/iconv.c b/miscutils/iconv.c new file mode 100644 index 000000000..828c38213 --- /dev/null +++ b/miscutils/iconv.c | |||
| @@ -0,0 +1,1846 @@ | |||
| 1 | /* | ||
| 2 | * iconv implementation using Win32 API to convert. | ||
| 3 | * | ||
| 4 | * This file is placed in the public domain. | ||
| 5 | */ | ||
| 6 | |||
| 7 | /* | ||
| 8 | * This code was obtained from: | ||
| 9 | * | ||
| 10 | * https://github.com/win-iconv/win-iconv | ||
| 11 | * | ||
| 12 | * Modified for busybox-w32 by Ronald M Yorston. These modifications | ||
| 13 | * are also dedicated to the public domain. | ||
| 14 | */ | ||
| 15 | |||
| 16 | //config:config ICONV | ||
| 17 | //config: bool "iconv" | ||
| 18 | //config: default y | ||
| 19 | //config: help | ||
| 20 | //config: 'iconv' converts text between character encodings. | ||
| 21 | |||
| 22 | //applet:IF_ICONV(APPLET(iconv, BB_DIR_USR_BIN, BB_SUID_DROP)) | ||
| 23 | |||
| 24 | //kbuild:lib-$(CONFIG_ICONV) += iconv.o | ||
| 25 | |||
| 26 | //usage:#define iconv_trivial_usage | ||
| 27 | //usage: "[-lc] [-o outfile] -f from-enc -t to-enc [FILE]..." | ||
| 28 | //usage:#define iconv_full_usage "\n\n" | ||
| 29 | //usage: "Convert text between character encodings\n" | ||
| 30 | //usage: "\n -l List all known character encodings" | ||
| 31 | //usage: "\n -c Silently discard characters that cannot be converted" | ||
| 32 | //usage: "\n -o Use outfile for output" | ||
| 33 | //usage: "\n -f Use from-enc for input characters" | ||
| 34 | //usage: "\n -t Use to-enc for output characters" | ||
| 35 | |||
| 36 | #include "libbb.h" | ||
| 37 | |||
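| 38 | /* WORKAROUND: desktop Win32 only provides GetProcAddress (there are no A/W variants), so alias the name used below. */ | ||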
| 39 | #define GetProcAddressA GetProcAddress | ||
| 40 | |||
| 41 | #define MB_CHAR_MAX 16 | ||
| 42 | |||
| 43 | #define UNICODE_MODE_BOM_DONE 1 | ||
| 44 | #define UNICODE_MODE_SWAPPED 2 | ||
| 45 | |||
| 46 | #define FLAG_USE_BOM 1 | ||
| 47 | #define FLAG_TRANSLIT 2 /* //TRANSLIT */ | ||
| 48 | #define FLAG_IGNORE 4 /* //IGNORE */ | ||
| 49 | |||
| 50 | typedef unsigned char uchar; | ||
| 51 | typedef unsigned short ushort; | ||
| 52 | typedef unsigned int uint; | ||
| 53 | |||
| 54 | typedef void* iconv_t; | ||
| 55 | |||
| 56 | iconv_t iconv_open(const char *tocode, const char *fromcode); | ||
| 57 | int iconv_close(iconv_t cd); | ||
| 58 | size_t iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft); | ||
| 59 | |||
| 60 | typedef struct compat_t compat_t; | ||
| 61 | typedef struct csconv_t csconv_t; | ||
| 62 | typedef struct rec_iconv_t rec_iconv_t; | ||
| 63 | |||
| 64 | typedef iconv_t (*f_iconv_open)(const char *tocode, const char *fromcode); | ||
| 65 | typedef int (*f_iconv_close)(iconv_t cd); | ||
| 66 | typedef size_t (*f_iconv)(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft); | ||
| 67 | typedef int* (*f_errno)(void); | ||
| 68 | typedef int (*f_mbtowc)(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize); | ||
| 69 | typedef int (*f_wctomb)(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize); | ||
| 70 | typedef int (*f_mblen)(csconv_t *cv, const uchar *buf, int bufsize); | ||
| 71 | typedef int (*f_flush)(csconv_t *cv, uchar *buf, int bufsize); | ||
| 72 | |||
| 73 | #define COMPAT_IN 1 | ||
| 74 | #define COMPAT_OUT 2 | ||
| 75 | |||
| 76 | /* One Unicode code point remapping entry, kept for compatibility with other conversion tables. */ | ||
| 77 | struct compat_t { | ||
| 78 | uint in; /* code point on the "in" side of the pair */ | ||
| 79 | uint out; /* code point on the "out" side of the pair */ | ||
| 80 | uint flag; /* COMPAT_IN and/or COMPAT_OUT; presumably selects the direction(s) in which the entry applies - confirm against win_iconv() */ | ||
| 81 | }; | ||
| 82 | |||
| 83 | struct csconv_t { /* state for one side of a conversion; rec_iconv_t holds one for "from" and one for "to" */ | ||
| 84 | int codepage; /* code page number, as listed in codepage_alias[] */ | ||
| 85 | int flags; /* presumably a mask of the FLAG_* bits - confirm in make_csconv() */ | ||
| 86 | f_mbtowc mbtowc; /* multibyte -> UTF-16 callback (see f_mbtowc) */ | ||
| 87 | f_wctomb wctomb; /* UTF-16 -> multibyte callback (see f_wctomb) */ | ||
| 88 | f_mblen mblen; /* multibyte length callback (see f_mblen) */ | ||
| 89 | f_flush flush; /* flush callback (see f_flush) */ | ||
| 90 | DWORD mode; /* converter state word; presumably UNICODE_MODE_* bits or the MLang mode - confirm */ | ||
| 91 | compat_t *compat; /* compat_t remapping table for this code page */ | ||
| 92 | }; | ||
| 93 | |||
| 94 | struct rec_iconv_t { /* object behind the opaque iconv_t returned by iconv_open() */ | ||
| 95 | iconv_t cd; /* handle passed to the iconv_close/iconv callbacks below */ | ||
| 96 | f_iconv_close iconv_close; /* callback invoked by iconv_close() */ | ||
| 97 | f_iconv iconv; /* callback invoked by iconv() */ | ||
| 98 | f_errno _errno; /* returns a pointer to the error value that the public wrappers copy into errno */ | ||
| 99 | csconv_t from; /* source-encoding converter, filled in by make_csconv(fromcode) */ | ||
| 100 | csconv_t to; /* target-encoding converter, filled in by make_csconv(tocode) */ | ||
| 101 | }; | ||
| 102 | |||
| 103 | static int win_iconv_open(rec_iconv_t *cd, const char *tocode, const char *fromcode); | ||
| 104 | static int win_iconv_close(iconv_t cd); | ||
| 105 | static size_t win_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft); | ||
| 106 | |||
| 107 | static int load_mlang(void); | ||
| 108 | static int make_csconv(const char *name, csconv_t *cv); | ||
| 109 | static int name_to_codepage(const char *name); | ||
| 110 | static uint utf16_to_ucs4(const ushort *wbuf); | ||
| 111 | static void ucs4_to_utf16(uint wc, ushort *wbuf, int *wbufsize); | ||
| 112 | static int mbtowc_flags(int codepage); | ||
| 113 | static int must_use_null_useddefaultchar(int codepage); | ||
| 114 | static int seterror(int err); | ||
| 115 | |||
| 116 | static int sbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize); | ||
| 117 | static int dbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize); | ||
| 118 | static int mbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize); | ||
| 119 | static int utf8_mblen(csconv_t *cv, const uchar *buf, int bufsize); | ||
| 120 | static int eucjp_mblen(csconv_t *cv, const uchar *buf, int bufsize); | ||
| 121 | |||
| 122 | static int kernel_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize); | ||
| 123 | static int kernel_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize); | ||
| 124 | static int mlang_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize); | ||
| 125 | static int mlang_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize); | ||
| 126 | static int utf16_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize); | ||
| 127 | static int utf16_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize); | ||
| 128 | static int utf32_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize); | ||
| 129 | static int utf32_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize); | ||
| 130 | static int iso2022jp_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize); | ||
| 131 | static int iso2022jp_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize); | ||
| 132 | static int iso2022jp_flush(csconv_t *cv, uchar *buf, int bufsize); | ||
| 133 | |||
| 134 | static struct { | ||
| 135 | int codepage; | ||
| 136 | const char *name; | ||
| 137 | } codepage_alias[] = { | ||
| 138 | {65001, "CP65001"}, | ||
| 139 | {65001, "UTF8"}, | ||
| 140 | {65001, "UTF-8"}, | ||
| 141 | |||
| 142 | {1200, "CP1200"}, | ||
| 143 | {1200, "UTF16LE"}, | ||
| 144 | {1200, "UTF-16LE"}, | ||
| 145 | {1200, "UCS2LE"}, | ||
| 146 | {1200, "UCS-2LE"}, | ||
| 147 | {1200, "UCS-2-INTERNAL"}, | ||
| 148 | |||
| 149 | {1201, "CP1201"}, | ||
| 150 | {1201, "UTF16BE"}, | ||
| 151 | {1201, "UTF-16BE"}, | ||
| 152 | {1201, "UCS2BE"}, | ||
| 153 | {1201, "UCS-2BE"}, | ||
| 154 | {1201, "unicodeFFFE"}, | ||
| 155 | |||
| 156 | {12000, "CP12000"}, | ||
| 157 | {12000, "UTF32LE"}, | ||
| 158 | {12000, "UTF-32LE"}, | ||
| 159 | {12000, "UCS4LE"}, | ||
| 160 | {12000, "UCS-4LE"}, | ||
| 161 | |||
| 162 | {12001, "CP12001"}, | ||
| 163 | {12001, "UTF32BE"}, | ||
| 164 | {12001, "UTF-32BE"}, | ||
| 165 | {12001, "UCS4BE"}, | ||
| 166 | {12001, "UCS-4BE"}, | ||
| 167 | |||
| 168 | #ifndef GLIB_COMPILATION | ||
| 169 | /* | ||
| 170 | * Default is big endian. | ||
| 171 | * See rfc2781 4.3 Interpreting text labelled as UTF-16. | ||
| 172 | */ | ||
| 173 | {1201, "UTF16"}, | ||
| 174 | {1201, "UTF-16"}, | ||
| 175 | {1201, "UCS2"}, | ||
| 176 | {1201, "UCS-2"}, | ||
| 177 | {12001, "UTF32"}, | ||
| 178 | {12001, "UTF-32"}, | ||
| 179 | {12001, "UCS-4"}, | ||
| 180 | {12001, "UCS4"}, | ||
| 181 | #else | ||
| 182 | /* Default is little endian, because the platform is */ | ||
| 183 | {1200, "UTF16"}, | ||
| 184 | {1200, "UTF-16"}, | ||
| 185 | {1200, "UCS2"}, | ||
| 186 | {1200, "UCS-2"}, | ||
| 187 | {12000, "UTF32"}, | ||
| 188 | {12000, "UTF-32"}, | ||
| 189 | {12000, "UCS4"}, | ||
| 190 | {12000, "UCS-4"}, | ||
| 191 | #endif | ||
| 192 | |||
| 193 | /* copy from libiconv `iconv -l` */ | ||
| 194 | /* !IsValidCodePage(367) */ | ||
| 195 | {20127, "ANSI_X3.4-1968"}, | ||
| 196 | {20127, "ANSI_X3.4-1986"}, | ||
| 197 | {20127, "ASCII"}, | ||
| 198 | {20127, "CP367"}, | ||
| 199 | {20127, "IBM367"}, | ||
| 200 | {20127, "ISO-IR-6"}, | ||
| 201 | {20127, "ISO646-US"}, | ||
| 202 | {20127, "ISO_646.IRV:1991"}, | ||
| 203 | {20127, "US"}, | ||
| 204 | {20127, "US-ASCII"}, | ||
| 205 | {20127, "CSASCII"}, | ||
| 206 | |||
| 207 | /* !IsValidCodePage(819) */ | ||
| 208 | {1252, "CP819"}, | ||
| 209 | {1252, "IBM819"}, | ||
| 210 | {28591, "ISO-8859-1"}, | ||
| 211 | {28591, "ISO-IR-100"}, | ||
| 212 | {28591, "ISO8859-1"}, | ||
| 213 | {28591, "ISO_8859-1"}, | ||
| 214 | {28591, "ISO_8859-1:1987"}, | ||
| 215 | {28591, "L1"}, | ||
| 216 | {28591, "LATIN1"}, | ||
| 217 | {28591, "CSISOLATIN1"}, | ||
| 218 | |||
| 219 | {1250, "CP1250"}, | ||
| 220 | {1250, "MS-EE"}, | ||
| 221 | {1250, "WINDOWS-1250"}, | ||
| 222 | |||
| 223 | {1251, "CP1251"}, | ||
| 224 | {1251, "MS-CYRL"}, | ||
| 225 | {1251, "WINDOWS-1251"}, | ||
| 226 | |||
| 227 | {1252, "CP1252"}, | ||
| 228 | {1252, "MS-ANSI"}, | ||
| 229 | {1252, "WINDOWS-1252"}, | ||
| 230 | |||
| 231 | {1253, "CP1253"}, | ||
| 232 | {1253, "MS-GREEK"}, | ||
| 233 | {1253, "WINDOWS-1253"}, | ||
| 234 | |||
| 235 | {1254, "CP1254"}, | ||
| 236 | {1254, "MS-TURK"}, | ||
| 237 | {1254, "WINDOWS-1254"}, | ||
| 238 | |||
| 239 | {1255, "CP1255"}, | ||
| 240 | {1255, "MS-HEBR"}, | ||
| 241 | {1255, "WINDOWS-1255"}, | ||
| 242 | |||
| 243 | {1256, "CP1256"}, | ||
| 244 | {1256, "MS-ARAB"}, | ||
| 245 | {1256, "WINDOWS-1256"}, | ||
| 246 | |||
| 247 | {1257, "CP1257"}, | ||
| 248 | {1257, "WINBALTRIM"}, | ||
| 249 | {1257, "WINDOWS-1257"}, | ||
| 250 | |||
| 251 | {1258, "CP1258"}, | ||
| 252 | {1258, "WINDOWS-1258"}, | ||
| 253 | |||
| 254 | {850, "850"}, | ||
| 255 | {850, "CP850"}, | ||
| 256 | {850, "IBM850"}, | ||
| 257 | {850, "CSPC850MULTILINGUAL"}, | ||
| 258 | |||
| 259 | /* !IsValidCodePage(862) */ | ||
| 260 | {862, "862"}, | ||
| 261 | {862, "CP862"}, | ||
| 262 | {862, "IBM862"}, | ||
| 263 | {862, "CSPC862LATINHEBREW"}, | ||
| 264 | |||
| 265 | {866, "866"}, | ||
| 266 | {866, "CP866"}, | ||
| 267 | {866, "IBM866"}, | ||
| 268 | {866, "CSIBM866"}, | ||
| 269 | |||
| 270 | /* !IsValidCodePage(154) */ | ||
| 271 | {154, "CP154"}, | ||
| 272 | {154, "CYRILLIC-ASIAN"}, | ||
| 273 | {154, "PT154"}, | ||
| 274 | {154, "PTCP154"}, | ||
| 275 | {154, "CSPTCP154"}, | ||
| 276 | |||
| 277 | /* !IsValidCodePage(1133) */ | ||
| 278 | {1133, "CP1133"}, | ||
| 279 | {1133, "IBM-CP1133"}, | ||
| 280 | |||
| 281 | {874, "CP874"}, | ||
| 282 | {874, "WINDOWS-874"}, | ||
| 283 | |||
| 284 | /* !IsValidCodePage(51932) */ | ||
| 285 | {51932, "CP51932"}, | ||
| 286 | {51932, "MS51932"}, | ||
| 287 | {51932, "WINDOWS-51932"}, | ||
| 288 | {51932, "EUC-JP"}, | ||
| 289 | |||
| 290 | {932, "CP932"}, | ||
| 291 | {932, "MS932"}, | ||
| 292 | {932, "SHIFFT_JIS"}, | ||
| 293 | {932, "SHIFFT_JIS-MS"}, | ||
| 294 | {932, "SJIS"}, | ||
| 295 | {932, "SJIS-MS"}, | ||
| 296 | {932, "SJIS-OPEN"}, | ||
| 297 | {932, "SJIS-WIN"}, | ||
| 298 | {932, "WINDOWS-31J"}, | ||
| 299 | {932, "WINDOWS-932"}, | ||
| 300 | {932, "CSWINDOWS31J"}, | ||
| 301 | |||
| 302 | {50221, "CP50221"}, | ||
| 303 | {50221, "ISO-2022-JP"}, | ||
| 304 | {50221, "ISO-2022-JP-MS"}, | ||
| 305 | {50221, "ISO2022-JP"}, | ||
| 306 | {50221, "ISO2022-JP-MS"}, | ||
| 307 | {50221, "MS50221"}, | ||
| 308 | {50221, "WINDOWS-50221"}, | ||
| 309 | |||
| 310 | {936, "CP936"}, | ||
| 311 | {936, "GBK"}, | ||
| 312 | {936, "MS936"}, | ||
| 313 | {936, "WINDOWS-936"}, | ||
| 314 | |||
| 315 | {950, "CP950"}, | ||
| 316 | {950, "BIG5"}, | ||
| 317 | {950, "BIG5HKSCS"}, | ||
| 318 | {950, "BIG5-HKSCS"}, | ||
| 319 | |||
| 320 | {949, "CP949"}, | ||
| 321 | {949, "UHC"}, | ||
| 322 | {949, "EUC-KR"}, | ||
| 323 | |||
| 324 | {1361, "CP1361"}, | ||
| 325 | {1361, "JOHAB"}, | ||
| 326 | |||
| 327 | {437, "437"}, | ||
| 328 | {437, "CP437"}, | ||
| 329 | {437, "IBM437"}, | ||
| 330 | {437, "CSPC8CODEPAGE437"}, | ||
| 331 | |||
| 332 | {737, "CP737"}, | ||
| 333 | |||
| 334 | {775, "CP775"}, | ||
| 335 | {775, "IBM775"}, | ||
| 336 | {775, "CSPC775BALTIC"}, | ||
| 337 | |||
| 338 | {852, "852"}, | ||
| 339 | {852, "CP852"}, | ||
| 340 | {852, "IBM852"}, | ||
| 341 | {852, "CSPCP852"}, | ||
| 342 | |||
| 343 | /* !IsValidCodePage(853) */ | ||
| 344 | {853, "CP853"}, | ||
| 345 | |||
| 346 | {855, "855"}, | ||
| 347 | {855, "CP855"}, | ||
| 348 | {855, "IBM855"}, | ||
| 349 | {855, "CSIBM855"}, | ||
| 350 | |||
| 351 | {857, "857"}, | ||
| 352 | {857, "CP857"}, | ||
| 353 | {857, "IBM857"}, | ||
| 354 | {857, "CSIBM857"}, | ||
| 355 | |||
| 356 | /* !IsValidCodePage(858) */ | ||
| 357 | {858, "CP858"}, | ||
| 358 | |||
| 359 | {860, "860"}, | ||
| 360 | {860, "CP860"}, | ||
| 361 | {860, "IBM860"}, | ||
| 362 | {860, "CSIBM860"}, | ||
| 363 | |||
| 364 | {861, "861"}, | ||
| 365 | {861, "CP-IS"}, | ||
| 366 | {861, "CP861"}, | ||
| 367 | {861, "IBM861"}, | ||
| 368 | {861, "CSIBM861"}, | ||
| 369 | |||
| 370 | {863, "863"}, | ||
| 371 | {863, "CP863"}, | ||
| 372 | {863, "IBM863"}, | ||
| 373 | {863, "CSIBM863"}, | ||
| 374 | |||
| 375 | {864, "CP864"}, | ||
| 376 | {864, "IBM864"}, | ||
| 377 | {864, "CSIBM864"}, | ||
| 378 | |||
| 379 | {865, "865"}, | ||
| 380 | {865, "CP865"}, | ||
| 381 | {865, "IBM865"}, | ||
| 382 | {865, "CSIBM865"}, | ||
| 383 | |||
| 384 | {869, "869"}, | ||
| 385 | {869, "CP-GR"}, | ||
| 386 | {869, "CP869"}, | ||
| 387 | {869, "IBM869"}, | ||
| 388 | {869, "CSIBM869"}, | ||
| 389 | |||
| 390 | /* !IsValidCodePage(1125) */ | ||
| 391 | {1125, "CP1125"}, | ||
| 392 | |||
| 393 | /* | ||
| 394 | * Code Page Identifiers | ||
| 395 | * http://msdn2.microsoft.com/en-us/library/ms776446.aspx | ||
| 396 | */ | ||
| 397 | {37, "IBM037"}, /* IBM EBCDIC US-Canada */ | ||
| 398 | {437, "IBM437"}, /* OEM United States */ | ||
| 399 | {500, "IBM500"}, /* IBM EBCDIC International */ | ||
| 400 | {708, "ASMO-708"}, /* Arabic (ASMO 708) */ | ||
| 401 | /* 709 Arabic (ASMO-449+, BCON V4) */ | ||
| 402 | /* 710 Arabic - Transparent Arabic */ | ||
| 403 | {720, "DOS-720"}, /* Arabic (Transparent ASMO); Arabic (DOS) */ | ||
| 404 | {737, "ibm737"}, /* OEM Greek (formerly 437G); Greek (DOS) */ | ||
| 405 | {775, "ibm775"}, /* OEM Baltic; Baltic (DOS) */ | ||
| 406 | {850, "ibm850"}, /* OEM Multilingual Latin 1; Western European (DOS) */ | ||
| 407 | {852, "ibm852"}, /* OEM Latin 2; Central European (DOS) */ | ||
| 408 | {855, "IBM855"}, /* OEM Cyrillic (primarily Russian) */ | ||
| 409 | {857, "ibm857"}, /* OEM Turkish; Turkish (DOS) */ | ||
| 410 | {858, "IBM00858"}, /* OEM Multilingual Latin 1 + Euro symbol */ | ||
| 411 | {860, "IBM860"}, /* OEM Portuguese; Portuguese (DOS) */ | ||
| 412 | {861, "ibm861"}, /* OEM Icelandic; Icelandic (DOS) */ | ||
| 413 | {862, "DOS-862"}, /* OEM Hebrew; Hebrew (DOS) */ | ||
| 414 | {863, "IBM863"}, /* OEM French Canadian; French Canadian (DOS) */ | ||
| 415 | {864, "IBM864"}, /* OEM Arabic; Arabic (864) */ | ||
| 416 | {865, "IBM865"}, /* OEM Nordic; Nordic (DOS) */ | ||
| 417 | {866, "cp866"}, /* OEM Russian; Cyrillic (DOS) */ | ||
| 418 | {869, "ibm869"}, /* OEM Modern Greek; Greek, Modern (DOS) */ | ||
| 419 | {870, "IBM870"}, /* IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2 */ | ||
| 420 | {874, "windows-874"}, /* ANSI/OEM Thai (same as 28605, ISO 8859-15); Thai (Windows) */ | ||
| 421 | {875, "cp875"}, /* IBM EBCDIC Greek Modern */ | ||
| 422 | {932, "shift_jis"}, /* ANSI/OEM Japanese; Japanese (Shift-JIS) */ | ||
| 423 | {932, "shift-jis"}, /* alternative name for it */ | ||
| 424 | {936, "gb2312"}, /* ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312) */ | ||
| 425 | {949, "ks_c_5601-1987"}, /* ANSI/OEM Korean (Unified Hangul Code) */ | ||
| 426 | {950, "big5"}, /* ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5) */ | ||
| 427 | {950, "big5hkscs"}, /* ANSI/OEM Traditional Chinese (Hong Kong SAR); Chinese Traditional (Big5-HKSCS) */ | ||
| 428 | {950, "big5-hkscs"}, /* alternative name for it */ | ||
| 429 | {1026, "IBM1026"}, /* IBM EBCDIC Turkish (Latin 5) */ | ||
| 430 | {1047, "IBM01047"}, /* IBM EBCDIC Latin 1/Open System */ | ||
| 431 | {1140, "IBM01140"}, /* IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro) */ | ||
| 432 | {1141, "IBM01141"}, /* IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro) */ | ||
| 433 | {1142, "IBM01142"}, /* IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro) */ | ||
| 434 | {1143, "IBM01143"}, /* IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro) */ | ||
| 435 | {1144, "IBM01144"}, /* IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro) */ | ||
| 436 | {1145, "IBM01145"}, /* IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro) */ | ||
| 437 | {1146, "IBM01146"}, /* IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro) */ | ||
| 438 | {1147, "IBM01147"}, /* IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro) */ | ||
| 439 | {1148, "IBM01148"}, /* IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro) */ | ||
| 440 | {1149, "IBM01149"}, /* IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro) */ | ||
| 441 | {1250, "windows-1250"}, /* ANSI Central European; Central European (Windows) */ | ||
| 442 | {1251, "windows-1251"}, /* ANSI Cyrillic; Cyrillic (Windows) */ | ||
| 443 | {1252, "windows-1252"}, /* ANSI Latin 1; Western European (Windows) */ | ||
| 444 | {1253, "windows-1253"}, /* ANSI Greek; Greek (Windows) */ | ||
| 445 | {1254, "windows-1254"}, /* ANSI Turkish; Turkish (Windows) */ | ||
| 446 | {1255, "windows-1255"}, /* ANSI Hebrew; Hebrew (Windows) */ | ||
| 447 | {1256, "windows-1256"}, /* ANSI Arabic; Arabic (Windows) */ | ||
| 448 | {1257, "windows-1257"}, /* ANSI Baltic; Baltic (Windows) */ | ||
| 449 | {1258, "windows-1258"}, /* ANSI/OEM Vietnamese; Vietnamese (Windows) */ | ||
| 450 | {1361, "Johab"}, /* Korean (Johab) */ | ||
| 451 | {10000, "macintosh"}, /* MAC Roman; Western European (Mac) */ | ||
| 452 | {10001, "x-mac-japanese"}, /* Japanese (Mac) */ | ||
| 453 | {10002, "x-mac-chinesetrad"}, /* MAC Traditional Chinese (Big5); Chinese Traditional (Mac) */ | ||
| 454 | {10003, "x-mac-korean"}, /* Korean (Mac) */ | ||
| 455 | {10004, "x-mac-arabic"}, /* Arabic (Mac) */ | ||
| 456 | {10005, "x-mac-hebrew"}, /* Hebrew (Mac) */ | ||
| 457 | {10006, "x-mac-greek"}, /* Greek (Mac) */ | ||
| 458 | {10007, "x-mac-cyrillic"}, /* Cyrillic (Mac) */ | ||
| 459 | {10008, "x-mac-chinesesimp"}, /* MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac) */ | ||
| 460 | {10010, "x-mac-romanian"}, /* Romanian (Mac) */ | ||
| 461 | {10017, "x-mac-ukrainian"}, /* Ukrainian (Mac) */ | ||
| 462 | {10021, "x-mac-thai"}, /* Thai (Mac) */ | ||
| 463 | {10029, "x-mac-ce"}, /* MAC Latin 2; Central European (Mac) */ | ||
| 464 | {10079, "x-mac-icelandic"}, /* Icelandic (Mac) */ | ||
| 465 | {10081, "x-mac-turkish"}, /* Turkish (Mac) */ | ||
| 466 | {10082, "x-mac-croatian"}, /* Croatian (Mac) */ | ||
| 467 | {20000, "x-Chinese_CNS"}, /* CNS Taiwan; Chinese Traditional (CNS) */ | ||
| 468 | {20001, "x-cp20001"}, /* TCA Taiwan */ | ||
| 469 | {20002, "x_Chinese-Eten"}, /* Eten Taiwan; Chinese Traditional (Eten) */ | ||
| 470 | {20003, "x-cp20003"}, /* IBM5550 Taiwan */ | ||
| 471 | {20004, "x-cp20004"}, /* TeleText Taiwan */ | ||
| 472 | {20005, "x-cp20005"}, /* Wang Taiwan */ | ||
| 473 | {20105, "x-IA5"}, /* IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5) */ | ||
| 474 | {20106, "x-IA5-German"}, /* IA5 German (7-bit) */ | ||
| 475 | {20107, "x-IA5-Swedish"}, /* IA5 Swedish (7-bit) */ | ||
| 476 | {20108, "x-IA5-Norwegian"}, /* IA5 Norwegian (7-bit) */ | ||
| 477 | {20127, "us-ascii"}, /* US-ASCII (7-bit) */ | ||
| 478 | {20261, "x-cp20261"}, /* T.61 */ | ||
| 479 | {20269, "x-cp20269"}, /* ISO 6937 Non-Spacing Accent */ | ||
| 480 | {20273, "IBM273"}, /* IBM EBCDIC Germany */ | ||
| 481 | {20277, "IBM277"}, /* IBM EBCDIC Denmark-Norway */ | ||
| 482 | {20278, "IBM278"}, /* IBM EBCDIC Finland-Sweden */ | ||
| 483 | {20280, "IBM280"}, /* IBM EBCDIC Italy */ | ||
| 484 | {20284, "IBM284"}, /* IBM EBCDIC Latin America-Spain */ | ||
| 485 | {20285, "IBM285"}, /* IBM EBCDIC United Kingdom */ | ||
| 486 | {20290, "IBM290"}, /* IBM EBCDIC Japanese Katakana Extended */ | ||
| 487 | {20297, "IBM297"}, /* IBM EBCDIC France */ | ||
| 488 | {20420, "IBM420"}, /* IBM EBCDIC Arabic */ | ||
| 489 | {20423, "IBM423"}, /* IBM EBCDIC Greek */ | ||
| 490 | {20424, "IBM424"}, /* IBM EBCDIC Hebrew */ | ||
| 491 | {20833, "x-EBCDIC-KoreanExtended"}, /* IBM EBCDIC Korean Extended */ | ||
| 492 | {20838, "IBM-Thai"}, /* IBM EBCDIC Thai */ | ||
| 493 | {20866, "koi8-r"}, /* Russian (KOI8-R); Cyrillic (KOI8-R) */ | ||
| 494 | {20871, "IBM871"}, /* IBM EBCDIC Icelandic */ | ||
| 495 | {20880, "IBM880"}, /* IBM EBCDIC Cyrillic Russian */ | ||
| 496 | {20905, "IBM905"}, /* IBM EBCDIC Turkish */ | ||
| 497 | {20924, "IBM00924"}, /* IBM EBCDIC Latin 1/Open System (1047 + Euro symbol) */ | ||
| 498 | {20932, "EUC-JP"}, /* Japanese (JIS 0208-1990 and 0121-1990) */ | ||
| 499 | {20936, "x-cp20936"}, /* Simplified Chinese (GB2312); Chinese Simplified (GB2312-80) */ | ||
| 500 | {20949, "x-cp20949"}, /* Korean Wansung */ | ||
| 501 | {21025, "cp1025"}, /* IBM EBCDIC Cyrillic Serbian-Bulgarian */ | ||
| 502 | /* 21027 (deprecated) */ | ||
| 503 | {21866, "koi8-u"}, /* Ukrainian (KOI8-U); Cyrillic (KOI8-U) */ | ||
| 504 | {28591, "iso-8859-1"}, /* ISO 8859-1 Latin 1; Western European (ISO) */ | ||
| 505 | {28591, "iso8859-1"}, /* ISO 8859-1 Latin 1; Western European (ISO) */ | ||
| 506 | {28591, "iso_8859-1"}, | ||
| 507 | {28591, "iso_8859_1"}, | ||
| 508 | {28592, "iso-8859-2"}, /* ISO 8859-2 Central European; Central European (ISO) */ | ||
| 509 | {28592, "iso8859-2"}, /* ISO 8859-2 Central European; Central European (ISO) */ | ||
| 510 | {28592, "iso_8859-2"}, | ||
| 511 | {28592, "iso_8859_2"}, | ||
| 512 | {28593, "iso-8859-3"}, /* ISO 8859-3 Latin 3 */ | ||
| 513 | {28593, "iso8859-3"}, /* ISO 8859-3 Latin 3 */ | ||
| 514 | {28593, "iso_8859-3"}, | ||
| 515 | {28593, "iso_8859_3"}, | ||
| 516 | {28594, "iso-8859-4"}, /* ISO 8859-4 Baltic */ | ||
| 517 | {28594, "iso8859-4"}, /* ISO 8859-4 Baltic */ | ||
| 518 | {28594, "iso_8859-4"}, | ||
| 519 | {28594, "iso_8859_4"}, | ||
| 520 | {28595, "iso-8859-5"}, /* ISO 8859-5 Cyrillic */ | ||
| 521 | {28595, "iso8859-5"}, /* ISO 8859-5 Cyrillic */ | ||
| 522 | {28595, "iso_8859-5"}, | ||
| 523 | {28595, "iso_8859_5"}, | ||
| 524 | {28596, "iso-8859-6"}, /* ISO 8859-6 Arabic */ | ||
| 525 | {28596, "iso8859-6"}, /* ISO 8859-6 Arabic */ | ||
| 526 | {28596, "iso_8859-6"}, | ||
| 527 | {28596, "iso_8859_6"}, | ||
| 528 | {28597, "iso-8859-7"}, /* ISO 8859-7 Greek */ | ||
| 529 | {28597, "iso8859-7"}, /* ISO 8859-7 Greek */ | ||
| 530 | {28597, "iso_8859-7"}, | ||
| 531 | {28597, "iso_8859_7"}, | ||
| 532 | {28598, "iso-8859-8"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Visual) */ | ||
| 533 | {28598, "iso8859-8"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Visual) */ | ||
| 534 | {28598, "iso_8859-8"}, | ||
| 535 | {28598, "iso_8859_8"}, | ||
| 536 | {28599, "iso-8859-9"}, /* ISO 8859-9 Turkish */ | ||
| 537 | {28599, "iso8859-9"}, /* ISO 8859-9 Turkish */ | ||
| 538 | {28599, "iso_8859-9"}, | ||
| 539 | {28599, "iso_8859_9"}, | ||
| 540 | {28603, "iso-8859-13"}, /* ISO 8859-13 Estonian */ | ||
| 541 | {28603, "iso8859-13"}, /* ISO 8859-13 Estonian */ | ||
| 542 | {28603, "iso_8859-13"}, | ||
| 543 | {28603, "iso_8859_13"}, | ||
| 544 | {28605, "iso-8859-15"}, /* ISO 8859-15 Latin 9 */ | ||
| 545 | {28605, "iso8859-15"}, /* ISO 8859-15 Latin 9 */ | ||
| 546 | {28605, "iso_8859-15"}, | ||
| 547 | {28605, "iso_8859_15"}, | ||
| 548 | {29001, "x-Europa"}, /* Europa 3 */ | ||
| 549 | {38598, "iso-8859-8-i"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Logical) */ | ||
| 550 | {38598, "iso8859-8-i"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Logical) */ | ||
| 551 | {38598, "iso_8859-8-i"}, | ||
| 552 | {38598, "iso_8859_8-i"}, | ||
| 553 | {50220, "iso-2022-jp"}, /* ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS) */ | ||
| 554 | {50221, "csISO2022JP"}, /* ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana) */ | ||
| 555 | {50222, "iso-2022-jp"}, /* ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI) */ | ||
| 556 | {50225, "iso-2022-kr"}, /* ISO 2022 Korean */ | ||
| 557 | {50225, "iso2022-kr"}, /* ISO 2022 Korean */ | ||
| 558 | {50227, "x-cp50227"}, /* ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022) */ | ||
| 559 | /* 50229 ISO 2022 Traditional Chinese */ | ||
| 560 | /* 50930 EBCDIC Japanese (Katakana) Extended */ | ||
| 561 | /* 50931 EBCDIC US-Canada and Japanese */ | ||
| 562 | /* 50933 EBCDIC Korean Extended and Korean */ | ||
| 563 | /* 50935 EBCDIC Simplified Chinese Extended and Simplified Chinese */ | ||
| 564 | /* 50936 EBCDIC Simplified Chinese */ | ||
| 565 | /* 50937 EBCDIC US-Canada and Traditional Chinese */ | ||
| 566 | /* 50939 EBCDIC Japanese (Latin) Extended and Japanese */ | ||
| 567 | {51932, "euc-jp"}, /* EUC Japanese */ | ||
| 568 | {51936, "EUC-CN"}, /* EUC Simplified Chinese; Chinese Simplified (EUC) */ | ||
| 569 | {51949, "euc-kr"}, /* EUC Korean */ | ||
| 570 | /* 51950 EUC Traditional Chinese */ | ||
| 571 | {52936, "hz-gb-2312"}, /* HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ) */ | ||
| 572 | {54936, "GB18030"}, /* Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030) */ | ||
| 573 | {57002, "x-iscii-de"}, /* ISCII Devanagari */ | ||
| 574 | {57003, "x-iscii-be"}, /* ISCII Bengali */ | ||
| 575 | {57004, "x-iscii-ta"}, /* ISCII Tamil */ | ||
| 576 | {57005, "x-iscii-te"}, /* ISCII Telugu */ | ||
| 577 | {57006, "x-iscii-as"}, /* ISCII Assamese */ | ||
| 578 | {57007, "x-iscii-or"}, /* ISCII Oriya */ | ||
| 579 | {57008, "x-iscii-ka"}, /* ISCII Kannada */ | ||
| 580 | {57009, "x-iscii-ma"}, /* ISCII Malayalam */ | ||
| 581 | {57010, "x-iscii-gu"}, /* ISCII Gujarati */ | ||
| 582 | {57011, "x-iscii-pa"}, /* ISCII Punjabi */ | ||
| 583 | |||
| 584 | {0, NULL} | ||
| 585 | }; | ||
| 586 | |||
| 587 | /* | ||
| 588 | * SJIS SHIFTJIS table CP932 table | ||
| 589 | * ---- --------------------------- -------------------------------- | ||
| 590 | * 5C U+00A5 YEN SIGN U+005C REVERSE SOLIDUS | ||
| 591 | * 7E U+203E OVERLINE U+007E TILDE | ||
| 592 | * 815C U+2014 EM DASH U+2015 HORIZONTAL BAR | ||
| 593 | * 815F U+005C REVERSE SOLIDUS U+FF3C FULLWIDTH REVERSE SOLIDUS | ||
| 594 | * 8160 U+301C WAVE DASH U+FF5E FULLWIDTH TILDE | ||
| 595 | * 8161 U+2016 DOUBLE VERTICAL LINE U+2225 PARALLEL TO | ||
| 596 | * 817C U+2212 MINUS SIGN U+FF0D FULLWIDTH HYPHEN-MINUS | ||
| 597 | * 8191 U+00A2 CENT SIGN U+FFE0 FULLWIDTH CENT SIGN | ||
| 598 | * 8192 U+00A3 POUND SIGN U+FFE1 FULLWIDTH POUND SIGN | ||
| 599 | * 81CA U+00AC NOT SIGN U+FFE2 FULLWIDTH NOT SIGN | ||
| 600 | * | ||
| 601 | * EUC-JP and ISO-2022-JP should be compatible with CP932. | ||
| 602 | * | ||
| 603 | * Kernel and MLang use different Unicode mapping tables. Check | ||
| 604 | * which API is actually in use. | ||
| 605 | */ | ||
| 606 | static compat_t cp932_compat[] = { | ||
| 607 | {0x00A5, 0x005C, COMPAT_OUT}, | ||
| 608 | {0x203E, 0x007E, COMPAT_OUT}, | ||
| 609 | {0x2014, 0x2015, COMPAT_OUT}, | ||
| 610 | {0x301C, 0xFF5E, COMPAT_OUT}, | ||
| 611 | {0x2016, 0x2225, COMPAT_OUT}, | ||
| 612 | {0x2212, 0xFF0D, COMPAT_OUT}, | ||
| 613 | {0x00A2, 0xFFE0, COMPAT_OUT}, | ||
| 614 | {0x00A3, 0xFFE1, COMPAT_OUT}, | ||
| 615 | {0x00AC, 0xFFE2, COMPAT_OUT}, | ||
| 616 | {0, 0, 0} | ||
| 617 | }; | ||
| 618 | |||
| 619 | static compat_t cp20932_compat[] = { | ||
| 620 | {0x00A5, 0x005C, COMPAT_OUT}, | ||
| 621 | {0x203E, 0x007E, COMPAT_OUT}, | ||
| 622 | {0x2014, 0x2015, COMPAT_OUT}, | ||
| 623 | {0xFF5E, 0x301C, COMPAT_OUT|COMPAT_IN}, | ||
| 624 | {0x2225, 0x2016, COMPAT_OUT|COMPAT_IN}, | ||
| 625 | {0xFF0D, 0x2212, COMPAT_OUT|COMPAT_IN}, | ||
| 626 | {0xFFE0, 0x00A2, COMPAT_OUT|COMPAT_IN}, | ||
| 627 | {0xFFE1, 0x00A3, COMPAT_OUT|COMPAT_IN}, | ||
| 628 | {0xFFE2, 0x00AC, COMPAT_OUT|COMPAT_IN}, | ||
| 629 | {0, 0, 0} | ||
| 630 | }; | ||
| 631 | |||
| 632 | static compat_t *cp51932_compat = cp932_compat; | ||
| 633 | |||
| 634 | /* cp20932_compat for kernel. cp932_compat for mlang. */ | ||
| 635 | static compat_t *cp5022x_compat = cp932_compat; | ||
| 636 | |||
| 637 | typedef HRESULT (WINAPI *CONVERTINETSTRING)( | ||
| 638 | LPDWORD lpdwMode, | ||
| 639 | DWORD dwSrcEncoding, | ||
| 640 | DWORD dwDstEncoding, | ||
| 641 | LPCSTR lpSrcStr, | ||
| 642 | LPINT lpnSrcSize, | ||
| 643 | LPBYTE lpDstStr, | ||
| 644 | LPINT lpnDstSize | ||
| 645 | ); | ||
| 646 | typedef HRESULT (WINAPI *CONVERTINETMULTIBYTETOUNICODE)( | ||
| 647 | LPDWORD lpdwMode, | ||
| 648 | DWORD dwSrcEncoding, | ||
| 649 | LPCSTR lpSrcStr, | ||
| 650 | LPINT lpnMultiCharCount, | ||
| 651 | LPWSTR lpDstStr, | ||
| 652 | LPINT lpnWideCharCount | ||
| 653 | ); | ||
| 654 | typedef HRESULT (WINAPI *CONVERTINETUNICODETOMULTIBYTE)( | ||
| 655 | LPDWORD lpdwMode, | ||
| 656 | DWORD dwEncoding, | ||
| 657 | LPCWSTR lpSrcStr, | ||
| 658 | LPINT lpnWideCharCount, | ||
| 659 | LPSTR lpDstStr, | ||
| 660 | LPINT lpnMultiCharCount | ||
| 661 | ); | ||
| 662 | typedef HRESULT (WINAPI *ISCONVERTINETSTRINGAVAILABLE)( | ||
| 663 | DWORD dwSrcEncoding, | ||
| 664 | DWORD dwDstEncoding | ||
| 665 | ); | ||
| 666 | typedef HRESULT (WINAPI *LCIDTORFC1766A)( | ||
| 667 | LCID Locale, | ||
| 668 | LPSTR pszRfc1766, | ||
| 669 | int nChar | ||
| 670 | ); | ||
| 671 | typedef HRESULT (WINAPI *LCIDTORFC1766W)( | ||
| 672 | LCID Locale, | ||
| 673 | LPWSTR pszRfc1766, | ||
| 674 | int nChar | ||
| 675 | ); | ||
| 676 | typedef HRESULT (WINAPI *RFC1766TOLCIDA)( | ||
| 677 | LCID *pLocale, | ||
| 678 | LPSTR pszRfc1766 | ||
| 679 | ); | ||
| 680 | typedef HRESULT (WINAPI *RFC1766TOLCIDW)( | ||
| 681 | LCID *pLocale, | ||
| 682 | LPWSTR pszRfc1766 | ||
| 683 | ); | ||
| 684 | static CONVERTINETSTRING ConvertINetString; | ||
| 685 | static CONVERTINETMULTIBYTETOUNICODE ConvertINetMultiByteToUnicode; | ||
| 686 | static CONVERTINETUNICODETOMULTIBYTE ConvertINetUnicodeToMultiByte; | ||
| 687 | static ISCONVERTINETSTRINGAVAILABLE IsConvertINetStringAvailable; | ||
| 688 | static LCIDTORFC1766A LcidToRfc1766A; | ||
| 689 | static RFC1766TOLCIDA Rfc1766ToLcidA; | ||
| 690 | |||
| 691 | static int | ||
| 692 | load_mlang(void) | ||
| 693 | { | ||
| 694 | HMODULE h; | ||
| 695 | if (ConvertINetString != NULL) | ||
| 696 | return TRUE; | ||
| 697 | h = LoadLibrary(TEXT("mlang.dll")); | ||
| 698 | if (!h) | ||
| 699 | return FALSE; | ||
| 700 | ConvertINetString = (CONVERTINETSTRING)GetProcAddressA(h, "ConvertINetString"); | ||
| 701 | ConvertINetMultiByteToUnicode = (CONVERTINETMULTIBYTETOUNICODE)GetProcAddressA(h, "ConvertINetMultiByteToUnicode"); | ||
| 702 | ConvertINetUnicodeToMultiByte = (CONVERTINETUNICODETOMULTIBYTE)GetProcAddressA(h, "ConvertINetUnicodeToMultiByte"); | ||
| 703 | IsConvertINetStringAvailable = (ISCONVERTINETSTRINGAVAILABLE)GetProcAddressA(h, "IsConvertINetStringAvailable"); | ||
| 704 | LcidToRfc1766A = (LCIDTORFC1766A)GetProcAddressA(h, "LcidToRfc1766A"); | ||
| 705 | Rfc1766ToLcidA = (RFC1766TOLCIDA)GetProcAddressA(h, "Rfc1766ToLcidA"); | ||
| 706 | return TRUE; | ||
| 707 | } | ||
| 708 | |||
| 709 | iconv_t | ||
| 710 | iconv_open(const char *tocode, const char *fromcode) | ||
| 711 | { | ||
| 712 | rec_iconv_t *cd; | ||
| 713 | |||
| 714 | cd = (rec_iconv_t *)xzalloc(sizeof(rec_iconv_t)); | ||
| 715 | |||
| 716 | /* reset the errno to prevent reporting wrong error code. | ||
| 717 | * 0 for unsorted error. */ | ||
| 718 | errno = 0; | ||
| 719 | if (win_iconv_open(cd, tocode, fromcode)) | ||
| 720 | return (iconv_t)cd; | ||
| 721 | |||
| 722 | free(cd); | ||
| 723 | |||
| 724 | return (iconv_t)(-1); | ||
| 725 | } | ||
| 726 | |||
| 727 | int | ||
| 728 | iconv_close(iconv_t _cd) | ||
| 729 | { | ||
| 730 | rec_iconv_t *cd = (rec_iconv_t *)_cd; | ||
| 731 | int r = cd->iconv_close(cd->cd); | ||
| 732 | int e = *(cd->_errno()); | ||
| 733 | free(cd); | ||
| 734 | errno = e; | ||
| 735 | return r; | ||
| 736 | } | ||
| 737 | |||
| 738 | size_t | ||
| 739 | iconv(iconv_t _cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) | ||
| 740 | { | ||
| 741 | rec_iconv_t *cd = (rec_iconv_t *)_cd; | ||
| 742 | size_t r = cd->iconv(cd->cd, inbuf, inbytesleft, outbuf, outbytesleft); | ||
| 743 | errno = *(cd->_errno()); | ||
| 744 | return r; | ||
| 745 | } | ||
| 746 | |||
| 747 | static int | ||
| 748 | win_iconv_open(rec_iconv_t *cd, const char *tocode, const char *fromcode) | ||
| 749 | { | ||
| 750 | if (!make_csconv(fromcode, &cd->from) || !make_csconv(tocode, &cd->to)) | ||
| 751 | return FALSE; | ||
| 752 | cd->iconv_close = win_iconv_close; | ||
| 753 | cd->iconv = win_iconv; | ||
| 754 | cd->_errno = _errno; | ||
| 755 | cd->cd = (iconv_t)cd; | ||
| 756 | return TRUE; | ||
| 757 | } | ||
| 758 | |||
| 759 | static int | ||
| 760 | win_iconv_close(iconv_t cd UNUSED_PARAM) | ||
| 761 | { | ||
| 762 | return 0; | ||
| 763 | } | ||
| 764 | |||
| 765 | static size_t | ||
| 766 | win_iconv(iconv_t _cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) | ||
| 767 | { | ||
| 768 | rec_iconv_t *cd = (rec_iconv_t *)_cd; | ||
| 769 | ushort wbuf[MB_CHAR_MAX]; /* enough room for one character */ | ||
| 770 | int insize; | ||
| 771 | int outsize; | ||
| 772 | int wsize; | ||
| 773 | DWORD frommode; | ||
| 774 | DWORD tomode; | ||
| 775 | uint wc; | ||
| 776 | compat_t *cp; | ||
| 777 | int i; | ||
| 778 | |||
| 779 | if (inbuf == NULL || *inbuf == NULL) | ||
| 780 | { | ||
| 781 | if (outbuf != NULL && *outbuf != NULL && cd->to.flush != NULL) | ||
| 782 | { | ||
| 783 | tomode = cd->to.mode; | ||
| 784 | outsize = cd->to.flush(&cd->to, (uchar *)*outbuf, *outbytesleft); | ||
| 785 | if (outsize == -1) | ||
| 786 | { | ||
| 787 | if ((cd->to.flags & FLAG_IGNORE) && errno != E2BIG) | ||
| 788 | { | ||
| 789 | outsize = 0; | ||
| 790 | } | ||
| 791 | else | ||
| 792 | { | ||
| 793 | cd->to.mode = tomode; | ||
| 794 | return (size_t)(-1); | ||
| 795 | } | ||
| 796 | } | ||
| 797 | *outbuf += outsize; | ||
| 798 | *outbytesleft -= outsize; | ||
| 799 | } | ||
| 800 | cd->from.mode = 0; | ||
| 801 | cd->to.mode = 0; | ||
| 802 | return 0; | ||
| 803 | } | ||
| 804 | |||
| 805 | while (*inbytesleft != 0) | ||
| 806 | { | ||
| 807 | frommode = cd->from.mode; | ||
| 808 | tomode = cd->to.mode; | ||
| 809 | wsize = MB_CHAR_MAX; | ||
| 810 | |||
| 811 | insize = cd->from.mbtowc(&cd->from, (const uchar *)*inbuf, *inbytesleft, wbuf, &wsize); | ||
| 812 | if (insize == -1) | ||
| 813 | { | ||
| 814 | if (cd->to.flags & FLAG_IGNORE) | ||
| 815 | { | ||
| 816 | cd->from.mode = frommode; | ||
| 817 | insize = 1; | ||
| 818 | wsize = 0; | ||
| 819 | } | ||
| 820 | else | ||
| 821 | { | ||
| 822 | cd->from.mode = frommode; | ||
| 823 | return (size_t)(-1); | ||
| 824 | } | ||
| 825 | } | ||
| 826 | |||
| 827 | if (wsize == 0) | ||
| 828 | { | ||
| 829 | *inbuf += insize; | ||
| 830 | *inbytesleft -= insize; | ||
| 831 | continue; | ||
| 832 | } | ||
| 833 | |||
| 834 | if (cd->from.compat != NULL) | ||
| 835 | { | ||
| 836 | wc = utf16_to_ucs4(wbuf); | ||
| 837 | cp = cd->from.compat; | ||
| 838 | for (i = 0; cp[i].in != 0; ++i) | ||
| 839 | { | ||
| 840 | if ((cp[i].flag & COMPAT_IN) && cp[i].out == wc) | ||
| 841 | { | ||
| 842 | ucs4_to_utf16(cp[i].in, wbuf, &wsize); | ||
| 843 | break; | ||
| 844 | } | ||
| 845 | } | ||
| 846 | } | ||
| 847 | |||
| 848 | if (cd->to.compat != NULL) | ||
| 849 | { | ||
| 850 | wc = utf16_to_ucs4(wbuf); | ||
| 851 | cp = cd->to.compat; | ||
| 852 | for (i = 0; cp[i].in != 0; ++i) | ||
| 853 | { | ||
| 854 | if ((cp[i].flag & COMPAT_OUT) && cp[i].in == wc) | ||
| 855 | { | ||
| 856 | ucs4_to_utf16(cp[i].out, wbuf, &wsize); | ||
| 857 | break; | ||
| 858 | } | ||
| 859 | } | ||
| 860 | } | ||
| 861 | |||
| 862 | outsize = cd->to.wctomb(&cd->to, wbuf, wsize, (uchar *)*outbuf, *outbytesleft); | ||
| 863 | if (outsize == -1) | ||
| 864 | { | ||
| 865 | if ((cd->to.flags & FLAG_IGNORE) && errno != E2BIG) | ||
| 866 | { | ||
| 867 | cd->to.mode = tomode; | ||
| 868 | outsize = 0; | ||
| 869 | } | ||
| 870 | else | ||
| 871 | { | ||
| 872 | cd->from.mode = frommode; | ||
| 873 | cd->to.mode = tomode; | ||
| 874 | return (size_t)(-1); | ||
| 875 | } | ||
| 876 | } | ||
| 877 | |||
| 878 | *inbuf += insize; | ||
| 879 | *outbuf += outsize; | ||
| 880 | *inbytesleft -= insize; | ||
| 881 | *outbytesleft -= outsize; | ||
| 882 | } | ||
| 883 | |||
| 884 | return 0; | ||
| 885 | } | ||
| 886 | |||
| 887 | static int | ||
| 888 | make_csconv(const char *_name, csconv_t *cv) | ||
| 889 | { | ||
| 890 | CPINFO cpinfo; | ||
| 891 | int use_compat = TRUE; | ||
| 892 | int flag = 0; | ||
| 893 | char *name; | ||
| 894 | char *p; | ||
| 895 | |||
| 896 | name = xstrndup(_name, strlen(_name)); | ||
| 897 | if (name == NULL) | ||
| 898 | return FALSE; | ||
| 899 | |||
| 900 | /* check for option "enc_name//opt1//opt2" */ | ||
| 901 | while ((p = strrstr(name, "//")) != NULL) | ||
| 902 | { | ||
| 903 | if (_stricmp(p + 2, "nocompat") == 0) | ||
| 904 | use_compat = FALSE; | ||
| 905 | else if (_stricmp(p + 2, "translit") == 0) | ||
| 906 | flag |= FLAG_TRANSLIT; | ||
| 907 | else if (_stricmp(p + 2, "ignore") == 0) | ||
| 908 | flag |= FLAG_IGNORE; | ||
| 909 | *p = 0; | ||
| 910 | } | ||
| 911 | |||
| 912 | cv->mode = 0; | ||
| 913 | cv->flags = flag; | ||
| 914 | cv->mblen = NULL; | ||
| 915 | cv->flush = NULL; | ||
| 916 | cv->compat = NULL; | ||
| 917 | cv->codepage = name_to_codepage(name); | ||
| 918 | if (cv->codepage == 1200 || cv->codepage == 1201) | ||
| 919 | { | ||
| 920 | cv->mbtowc = utf16_mbtowc; | ||
| 921 | cv->wctomb = utf16_wctomb; | ||
| 922 | if (_stricmp(name, "UTF-16") == 0 || _stricmp(name, "UTF16") == 0 || | ||
| 923 | _stricmp(name, "UCS-2") == 0 || _stricmp(name, "UCS2") == 0 || | ||
| 924 | _stricmp(name,"UCS-2-INTERNAL") == 0) | ||
| 925 | cv->flags |= FLAG_USE_BOM; | ||
| 926 | } | ||
| 927 | else if (cv->codepage == 12000 || cv->codepage == 12001) | ||
| 928 | { | ||
| 929 | cv->mbtowc = utf32_mbtowc; | ||
| 930 | cv->wctomb = utf32_wctomb; | ||
| 931 | if (_stricmp(name, "UTF-32") == 0 || _stricmp(name, "UTF32") == 0 || | ||
| 932 | _stricmp(name, "UCS-4") == 0 || _stricmp(name, "UCS4") == 0) | ||
| 933 | cv->flags |= FLAG_USE_BOM; | ||
| 934 | } | ||
| 935 | else if (cv->codepage == 65001) | ||
| 936 | { | ||
| 937 | cv->mbtowc = kernel_mbtowc; | ||
| 938 | cv->wctomb = kernel_wctomb; | ||
| 939 | cv->mblen = utf8_mblen; | ||
| 940 | } | ||
| 941 | else if ((cv->codepage == 50220 || cv->codepage == 50221 || cv->codepage == 50222) && load_mlang()) | ||
| 942 | { | ||
| 943 | cv->mbtowc = iso2022jp_mbtowc; | ||
| 944 | cv->wctomb = iso2022jp_wctomb; | ||
| 945 | cv->flush = iso2022jp_flush; | ||
| 946 | } | ||
| 947 | else if (cv->codepage == 51932 && load_mlang()) | ||
| 948 | { | ||
| 949 | cv->mbtowc = mlang_mbtowc; | ||
| 950 | cv->wctomb = mlang_wctomb; | ||
| 951 | cv->mblen = eucjp_mblen; | ||
| 952 | } | ||
| 953 | else if (IsValidCodePage(cv->codepage) | ||
| 954 | && GetCPInfo(cv->codepage, &cpinfo) != 0) | ||
| 955 | { | ||
| 956 | cv->mbtowc = kernel_mbtowc; | ||
| 957 | cv->wctomb = kernel_wctomb; | ||
| 958 | if (cpinfo.MaxCharSize == 1) | ||
| 959 | cv->mblen = sbcs_mblen; | ||
| 960 | else if (cpinfo.MaxCharSize == 2) | ||
| 961 | cv->mblen = dbcs_mblen; | ||
| 962 | else | ||
| 963 | cv->mblen = mbcs_mblen; | ||
| 964 | } | ||
| 965 | else | ||
| 966 | { | ||
| 967 | /* not supported */ | ||
| 968 | free(name); | ||
| 969 | errno = EINVAL; | ||
| 970 | return FALSE; | ||
| 971 | } | ||
| 972 | |||
| 973 | if (use_compat) | ||
| 974 | { | ||
| 975 | switch (cv->codepage) | ||
| 976 | { | ||
| 977 | case 932: cv->compat = cp932_compat; break; | ||
| 978 | case 20932: cv->compat = cp20932_compat; break; | ||
| 979 | case 51932: cv->compat = cp51932_compat; break; | ||
| 980 | case 50220: case 50221: case 50222: cv->compat = cp5022x_compat; break; | ||
| 981 | } | ||
| 982 | } | ||
| 983 | |||
| 984 | free(name); | ||
| 985 | |||
| 986 | return TRUE; | ||
| 987 | } | ||
| 988 | |||
| 989 | static int | ||
| 990 | name_to_codepage(const char *name) | ||
| 991 | { | ||
| 992 | int i; | ||
| 993 | |||
| 994 | if (*name == '\0' || | ||
| 995 | strcmp(name, "char") == 0) | ||
| 996 | return GetACP(); | ||
| 997 | else if (strcmp(name, "wchar_t") == 0) | ||
| 998 | return 1200; | ||
| 999 | else if (_strnicmp(name, "cp", 2) == 0) | ||
| 1000 | return atoi(name + 2); /* CP123 */ | ||
| 1001 | else if ('0' <= name[0] && name[0] <= '9') | ||
| 1002 | return atoi(name); /* 123 */ | ||
| 1003 | else if (_strnicmp(name, "xx", 2) == 0) | ||
| 1004 | return atoi(name + 2); /* XX123 for debug */ | ||
| 1005 | |||
| 1006 | for (i = 0; codepage_alias[i].name != NULL; ++i) | ||
| 1007 | if (_stricmp(name, codepage_alias[i].name) == 0) | ||
| 1008 | return codepage_alias[i].codepage; | ||
| 1009 | return -1; | ||
| 1010 | } | ||
| 1011 | |||
| 1012 | /* | ||
| 1013 | * http://www.faqs.org/rfcs/rfc2781.html | ||
| 1014 | */ | ||
| 1015 | static uint | ||
| 1016 | utf16_to_ucs4(const ushort *wbuf) | ||
| 1017 | { | ||
| 1018 | uint wc = wbuf[0]; | ||
| 1019 | if (0xD800 <= wbuf[0] && wbuf[0] <= 0xDBFF) | ||
| 1020 | wc = ((wbuf[0] & 0x3FF) << 10) + (wbuf[1] & 0x3FF) + 0x10000; | ||
| 1021 | return wc; | ||
| 1022 | } | ||
| 1023 | |||
| 1024 | static void | ||
| 1025 | ucs4_to_utf16(uint wc, ushort *wbuf, int *wbufsize) | ||
| 1026 | { | ||
| 1027 | if (wc < 0x10000) | ||
| 1028 | { | ||
| 1029 | wbuf[0] = wc; | ||
| 1030 | *wbufsize = 1; | ||
| 1031 | } | ||
| 1032 | else | ||
| 1033 | { | ||
| 1034 | wc -= 0x10000; | ||
| 1035 | wbuf[0] = 0xD800 | ((wc >> 10) & 0x3FF); | ||
| 1036 | wbuf[1] = 0xDC00 | (wc & 0x3FF); | ||
| 1037 | *wbufsize = 2; | ||
| 1038 | } | ||
| 1039 | } | ||
| 1040 | |||
| 1041 | /* | ||
| 1042 | * Check if codepage is one of those for which the dwFlags parameter | ||
| 1043 | * to MultiByteToWideChar() must be zero. Return zero or | ||
| 1044 | * MB_ERR_INVALID_CHARS. The docs in Platform SDK for Windows | ||
| 1045 | * Server 2003 R2 claims that also codepage 65001 is one of these, but | ||
| 1046 | * that doesn't seem to be the case. The MSDN docs for MSVS2008 leave | ||
| 1047 | * out 65001 (UTF-8), and that indeed seems to be the case on XP, it | ||
| 1048 | * works fine to pass MB_ERR_INVALID_CHARS in dwFlags when converting | ||
| 1049 | * from UTF-8. | ||
| 1050 | */ | ||
| 1051 | static int | ||
| 1052 | mbtowc_flags(int codepage) | ||
| 1053 | { | ||
| 1054 | return (codepage == 50220 || codepage == 50221 || | ||
| 1055 | codepage == 50222 || codepage == 50225 || | ||
| 1056 | codepage == 50227 || codepage == 50229 || | ||
| 1057 | codepage == 52936 || codepage == 54936 || | ||
| 1058 | (codepage >= 57002 && codepage <= 57011) || | ||
| 1059 | codepage == 65000 || codepage == 42) ? 0 : MB_ERR_INVALID_CHARS; | ||
| 1060 | } | ||
| 1061 | |||
/*
 * Tell whether codepage is one of those for which the
 * lpUsedDefaultChar parameter of WideCharToMultiByte() must be NULL.
 * Returns 1 if so, 0 otherwise.
 *
 * The Platform SDK docs for Windows Server 2003 R2 give the list
 * below, while the MSDN docs for MSVS2008 claim it is only 65000
 * (UTF-7) and 65001 (UTF-8).  This time the earlier Platform SDK seems
 * to be correct, at least for XP.
 */
static int
must_use_null_useddefaultchar(int codepage)
{
    if (codepage >= 57002 && codepage <= 57011)
        return 1;

    switch (codepage)
    {
    case 42:
    case 50220:
    case 50221:
    case 50222:
    case 50225:
    case 50227:
    case 50229:
    case 52936:
    case 54936:
    case 65000:
    case 65001:
        return 1;
    default:
        return 0;
    }
}
| 1081 | |||
/*
 * Store err in errno and return -1, so that callers can write
 * "return seterror(code);" to report a failure.
 */
static int
seterror(int err)
{
    const int failure = -1;

    errno = err;
    return failure;
}
| 1088 | |||
| 1089 | static int | ||
| 1090 | sbcs_mblen(csconv_t *cv UNUSED_PARAM, const uchar *buf UNUSED_PARAM, | ||
| 1091 | int bufsize UNUSED_PARAM) | ||
| 1092 | { | ||
| 1093 | return 1; | ||
| 1094 | } | ||
| 1095 | |||
| 1096 | static int | ||
| 1097 | dbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize) | ||
| 1098 | { | ||
| 1099 | int len = IsDBCSLeadByteEx(cv->codepage, buf[0]) ? 2 : 1; | ||
| 1100 | if (bufsize < len) | ||
| 1101 | return seterror(EINVAL); | ||
| 1102 | return len; | ||
| 1103 | } | ||
| 1104 | |||
| 1105 | static int | ||
| 1106 | mbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize) | ||
| 1107 | { | ||
| 1108 | int len = 0; | ||
| 1109 | |||
| 1110 | if (cv->codepage == 54936) { | ||
| 1111 | if (buf[0] <= 0x7F) | ||
| 1112 | len = 1; | ||
| 1113 | else if (buf[0] >= 0x81 && buf[0] <= 0xFE && | ||
| 1114 | bufsize >= 2 && | ||
| 1115 | ((buf[1] >= 0x40 && buf[1] <= 0x7E) || | ||
| 1116 | (buf[1] >= 0x80 && buf[1] <= 0xFE))) | ||
| 1117 | len = 2; | ||
| 1118 | else if (buf[0] >= 0x81 && buf[0] <= 0xFE && | ||
| 1119 | bufsize >= 4 && | ||
| 1120 | buf[1] >= 0x30 && buf[1] <= 0x39) | ||
| 1121 | len = 4; | ||
| 1122 | else | ||
| 1123 | return seterror(EINVAL); | ||
| 1124 | return len; | ||
| 1125 | } | ||
| 1126 | else | ||
| 1127 | return seterror(EINVAL); | ||
| 1128 | } | ||
| 1129 | |||
| 1130 | static int | ||
| 1131 | utf8_mblen(csconv_t *cv UNUSED_PARAM, const uchar *buf, int bufsize) | ||
| 1132 | { | ||
| 1133 | int len = 0; | ||
| 1134 | |||
| 1135 | if (buf[0] < 0x80) len = 1; | ||
| 1136 | else if ((buf[0] & 0xE0) == 0xC0) len = 2; | ||
| 1137 | else if ((buf[0] & 0xF0) == 0xE0) len = 3; | ||
| 1138 | else if ((buf[0] & 0xF8) == 0xF0) len = 4; | ||
| 1139 | else if ((buf[0] & 0xFC) == 0xF8) len = 5; | ||
| 1140 | else if ((buf[0] & 0xFE) == 0xFC) len = 6; | ||
| 1141 | |||
| 1142 | if (len == 0) | ||
| 1143 | return seterror(EILSEQ); | ||
| 1144 | else if (bufsize < len) | ||
| 1145 | return seterror(EINVAL); | ||
| 1146 | return len; | ||
| 1147 | } | ||
| 1148 | |||
| 1149 | static int | ||
| 1150 | eucjp_mblen(csconv_t *cv UNUSED_PARAM, const uchar *buf, int bufsize) | ||
| 1151 | { | ||
| 1152 | if (buf[0] < 0x80) /* ASCII */ | ||
| 1153 | return 1; | ||
| 1154 | else if (buf[0] == 0x8E) /* JIS X 0201 */ | ||
| 1155 | { | ||
| 1156 | if (bufsize < 2) | ||
| 1157 | return seterror(EINVAL); | ||
| 1158 | else if (!(0xA1 <= buf[1] && buf[1] <= 0xDF)) | ||
| 1159 | return seterror(EILSEQ); | ||
| 1160 | return 2; | ||
| 1161 | } | ||
| 1162 | else if (buf[0] == 0x8F) /* JIS X 0212 */ | ||
| 1163 | { | ||
| 1164 | if (bufsize < 3) | ||
| 1165 | return seterror(EINVAL); | ||
| 1166 | else if (!(0xA1 <= buf[1] && buf[1] <= 0xFE) | ||
| 1167 | || !(0xA1 <= buf[2] && buf[2] <= 0xFE)) | ||
| 1168 | return seterror(EILSEQ); | ||
| 1169 | return 3; | ||
| 1170 | } | ||
| 1171 | else /* JIS X 0208 */ | ||
| 1172 | { | ||
| 1173 | if (bufsize < 2) | ||
| 1174 | return seterror(EINVAL); | ||
| 1175 | else if (!(0xA1 <= buf[0] && buf[0] <= 0xFE) | ||
| 1176 | || !(0xA1 <= buf[1] && buf[1] <= 0xFE)) | ||
| 1177 | return seterror(EILSEQ); | ||
| 1178 | return 2; | ||
| 1179 | } | ||
| 1180 | } | ||
| 1181 | |||
| 1182 | static int | ||
| 1183 | kernel_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize) | ||
| 1184 | { | ||
| 1185 | int len; | ||
| 1186 | |||
| 1187 | len = cv->mblen(cv, buf, bufsize); | ||
| 1188 | if (len == -1) | ||
| 1189 | return -1; | ||
| 1190 | /* If converting from ASCII, reject 8bit | ||
| 1191 | * chars. MultiByteToWideChar() doesn't. Note that for ASCII we | ||
| 1192 | * know that the mblen function is sbcs_mblen() so len is 1. | ||
| 1193 | */ | ||
| 1194 | if (cv->codepage == 20127 && buf[0] >= 0x80) | ||
| 1195 | return seterror(EILSEQ); | ||
| 1196 | *wbufsize = MultiByteToWideChar(cv->codepage, mbtowc_flags (cv->codepage), | ||
| 1197 | (const char *)buf, len, (wchar_t *)wbuf, *wbufsize); | ||
| 1198 | if (*wbufsize == 0) | ||
| 1199 | return seterror(EILSEQ); | ||
| 1200 | return len; | ||
| 1201 | } | ||
| 1202 | |||
| 1203 | static int | ||
| 1204 | kernel_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize) | ||
| 1205 | { | ||
| 1206 | BOOL usedDefaultChar = 0; | ||
| 1207 | BOOL *p = NULL; | ||
| 1208 | int flags = 0; | ||
| 1209 | int len; | ||
| 1210 | |||
| 1211 | if (bufsize == 0) | ||
| 1212 | return seterror(E2BIG); | ||
| 1213 | if (!must_use_null_useddefaultchar(cv->codepage)) | ||
| 1214 | { | ||
| 1215 | p = &usedDefaultChar; | ||
| 1216 | #ifdef WC_NO_BEST_FIT_CHARS | ||
| 1217 | if (!(cv->flags & FLAG_TRANSLIT)) | ||
| 1218 | flags |= WC_NO_BEST_FIT_CHARS; | ||
| 1219 | #endif | ||
| 1220 | } | ||
| 1221 | len = WideCharToMultiByte(cv->codepage, flags, | ||
| 1222 | (const wchar_t *)wbuf, wbufsize, (char *)buf, bufsize, NULL, p); | ||
| 1223 | if (len == 0) | ||
| 1224 | { | ||
| 1225 | if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) | ||
| 1226 | return seterror(E2BIG); | ||
| 1227 | return seterror(EILSEQ); | ||
| 1228 | } | ||
| 1229 | else if (usedDefaultChar && !(cv->flags & FLAG_TRANSLIT)) | ||
| 1230 | return seterror(EILSEQ); | ||
| 1231 | else if (cv->mblen(cv, buf, len) != len) /* validate result */ | ||
| 1232 | return seterror(EILSEQ); | ||
| 1233 | return len; | ||
| 1234 | } | ||
| 1235 | |||
| 1236 | /* | ||
| 1237 | * It seems that the mode (cv->mode) is fixnum. | ||
| 1238 | * For example, when converting iso-2022-jp(cp50221) to unicode: | ||
| 1239 | * in ascii sequence: mode=0xC42C0000 | ||
| 1240 | * in jisx0208 sequence: mode=0xC42C0001 | ||
| 1241 | * "C42C" is same for each convert session. | ||
| 1242 | * It should be: ((codepage-1)<<16)|state | ||
| 1243 | */ | ||
| 1244 | static int | ||
| 1245 | mlang_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize) | ||
| 1246 | { | ||
| 1247 | int len; | ||
| 1248 | int insize; | ||
| 1249 | HRESULT hr; | ||
| 1250 | |||
| 1251 | len = cv->mblen(cv, buf, bufsize); | ||
| 1252 | if (len == -1) | ||
| 1253 | return -1; | ||
| 1254 | insize = len; | ||
| 1255 | hr = ConvertINetMultiByteToUnicode(&cv->mode, cv->codepage, | ||
| 1256 | (const char *)buf, &insize, (wchar_t *)wbuf, wbufsize); | ||
| 1257 | if (hr != S_OK || insize != len) | ||
| 1258 | return seterror(EILSEQ); | ||
| 1259 | return len; | ||
| 1260 | } | ||
| 1261 | |||
| 1262 | static int | ||
| 1263 | mlang_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize) | ||
| 1264 | { | ||
| 1265 | char tmpbuf[MB_CHAR_MAX]; /* enough room for one character */ | ||
| 1266 | int tmpsize = MB_CHAR_MAX; | ||
| 1267 | int insize = wbufsize; | ||
| 1268 | HRESULT hr; | ||
| 1269 | |||
| 1270 | hr = ConvertINetUnicodeToMultiByte(&cv->mode, cv->codepage, | ||
| 1271 | (const wchar_t *)wbuf, &wbufsize, tmpbuf, &tmpsize); | ||
| 1272 | if (hr != S_OK || insize != wbufsize) | ||
| 1273 | return seterror(EILSEQ); | ||
| 1274 | else if (bufsize < tmpsize) | ||
| 1275 | return seterror(E2BIG); | ||
| 1276 | else if (cv->mblen(cv, (uchar *)tmpbuf, tmpsize) != tmpsize) | ||
| 1277 | return seterror(EILSEQ); | ||
| 1278 | memcpy(buf, tmpbuf, tmpsize); | ||
| 1279 | return tmpsize; | ||
| 1280 | } | ||
| 1281 | |||
| 1282 | static int | ||
| 1283 | utf16_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize) | ||
| 1284 | { | ||
| 1285 | int codepage = cv->codepage; | ||
| 1286 | |||
| 1287 | /* swap endian: 1200 <-> 1201 */ | ||
| 1288 | if (cv->mode & UNICODE_MODE_SWAPPED) | ||
| 1289 | codepage ^= 1; | ||
| 1290 | |||
| 1291 | if (bufsize < 2) | ||
| 1292 | return seterror(EINVAL); | ||
| 1293 | if (codepage == 1200) /* little endian */ | ||
| 1294 | wbuf[0] = (buf[1] << 8) | buf[0]; | ||
| 1295 | else if (codepage == 1201) /* big endian */ | ||
| 1296 | wbuf[0] = (buf[0] << 8) | buf[1]; | ||
| 1297 | |||
| 1298 | if ((cv->flags & FLAG_USE_BOM) && !(cv->mode & UNICODE_MODE_BOM_DONE)) | ||
| 1299 | { | ||
| 1300 | cv->mode |= UNICODE_MODE_BOM_DONE; | ||
| 1301 | if (wbuf[0] == 0xFFFE) | ||
| 1302 | { | ||
| 1303 | cv->mode |= UNICODE_MODE_SWAPPED; | ||
| 1304 | *wbufsize = 0; | ||
| 1305 | return 2; | ||
| 1306 | } | ||
| 1307 | else if (wbuf[0] == 0xFEFF) | ||
| 1308 | { | ||
| 1309 | *wbufsize = 0; | ||
| 1310 | return 2; | ||
| 1311 | } | ||
| 1312 | } | ||
| 1313 | |||
| 1314 | if (0xDC00 <= wbuf[0] && wbuf[0] <= 0xDFFF) | ||
| 1315 | return seterror(EILSEQ); | ||
| 1316 | if (0xD800 <= wbuf[0] && wbuf[0] <= 0xDBFF) | ||
| 1317 | { | ||
| 1318 | if (bufsize < 4) | ||
| 1319 | return seterror(EINVAL); | ||
| 1320 | if (codepage == 1200) /* little endian */ | ||
| 1321 | wbuf[1] = (buf[3] << 8) | buf[2]; | ||
| 1322 | else if (codepage == 1201) /* big endian */ | ||
| 1323 | wbuf[1] = (buf[2] << 8) | buf[3]; | ||
| 1324 | if (!(0xDC00 <= wbuf[1] && wbuf[1] <= 0xDFFF)) | ||
| 1325 | return seterror(EILSEQ); | ||
| 1326 | *wbufsize = 2; | ||
| 1327 | return 4; | ||
| 1328 | } | ||
| 1329 | *wbufsize = 1; | ||
| 1330 | return 2; | ||
| 1331 | } | ||
| 1332 | |||
| 1333 | static int | ||
| 1334 | utf16_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize) | ||
| 1335 | { | ||
| 1336 | if ((cv->flags & FLAG_USE_BOM) && !(cv->mode & UNICODE_MODE_BOM_DONE)) | ||
| 1337 | { | ||
| 1338 | int r; | ||
| 1339 | |||
| 1340 | cv->mode |= UNICODE_MODE_BOM_DONE; | ||
| 1341 | if (bufsize < 2) | ||
| 1342 | return seterror(E2BIG); | ||
| 1343 | if (cv->codepage == 1200) /* little endian */ | ||
| 1344 | memcpy(buf, "\xFF\xFE", 2); | ||
| 1345 | else if (cv->codepage == 1201) /* big endian */ | ||
| 1346 | memcpy(buf, "\xFE\xFF", 2); | ||
| 1347 | |||
| 1348 | r = utf16_wctomb(cv, wbuf, wbufsize, buf + 2, bufsize - 2); | ||
| 1349 | if (r == -1) | ||
| 1350 | return -1; | ||
| 1351 | return r + 2; | ||
| 1352 | } | ||
| 1353 | |||
| 1354 | if (bufsize < 2) | ||
| 1355 | return seterror(E2BIG); | ||
| 1356 | if (cv->codepage == 1200) /* little endian */ | ||
| 1357 | { | ||
| 1358 | buf[0] = (wbuf[0] & 0x00FF); | ||
| 1359 | buf[1] = (wbuf[0] & 0xFF00) >> 8; | ||
| 1360 | } | ||
| 1361 | else if (cv->codepage == 1201) /* big endian */ | ||
| 1362 | { | ||
| 1363 | buf[0] = (wbuf[0] & 0xFF00) >> 8; | ||
| 1364 | buf[1] = (wbuf[0] & 0x00FF); | ||
| 1365 | } | ||
| 1366 | if (0xD800 <= wbuf[0] && wbuf[0] <= 0xDBFF) | ||
| 1367 | { | ||
| 1368 | if (bufsize < 4) | ||
| 1369 | return seterror(E2BIG); | ||
| 1370 | if (cv->codepage == 1200) /* little endian */ | ||
| 1371 | { | ||
| 1372 | buf[2] = (wbuf[1] & 0x00FF); | ||
| 1373 | buf[3] = (wbuf[1] & 0xFF00) >> 8; | ||
| 1374 | } | ||
| 1375 | else if (cv->codepage == 1201) /* big endian */ | ||
| 1376 | { | ||
| 1377 | buf[2] = (wbuf[1] & 0xFF00) >> 8; | ||
| 1378 | buf[3] = (wbuf[1] & 0x00FF); | ||
| 1379 | } | ||
| 1380 | return 4; | ||
| 1381 | } | ||
| 1382 | return 2; | ||
| 1383 | } | ||
| 1384 | |||
| 1385 | static int | ||
| 1386 | utf32_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize) | ||
| 1387 | { | ||
| 1388 | int codepage = cv->codepage; | ||
| 1389 | uint wc = 0xD800; | ||
| 1390 | |||
| 1391 | /* swap endian: 12000 <-> 12001 */ | ||
| 1392 | if (cv->mode & UNICODE_MODE_SWAPPED) | ||
| 1393 | codepage ^= 1; | ||
| 1394 | |||
| 1395 | if (bufsize < 4) | ||
| 1396 | return seterror(EINVAL); | ||
| 1397 | if (codepage == 12000) /* little endian */ | ||
| 1398 | wc = (buf[3] << 24) | (buf[2] << 16) | (buf[1] << 8) | buf[0]; | ||
| 1399 | else if (codepage == 12001) /* big endian */ | ||
| 1400 | wc = (buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3]; | ||
| 1401 | |||
| 1402 | if ((cv->flags & FLAG_USE_BOM) && !(cv->mode & UNICODE_MODE_BOM_DONE)) | ||
| 1403 | { | ||
| 1404 | cv->mode |= UNICODE_MODE_BOM_DONE; | ||
| 1405 | if (wc == 0xFFFE0000) | ||
| 1406 | { | ||
| 1407 | cv->mode |= UNICODE_MODE_SWAPPED; | ||
| 1408 | *wbufsize = 0; | ||
| 1409 | return 4; | ||
| 1410 | } | ||
| 1411 | else if (wc == 0x0000FEFF) | ||
| 1412 | { | ||
| 1413 | *wbufsize = 0; | ||
| 1414 | return 4; | ||
| 1415 | } | ||
| 1416 | } | ||
| 1417 | |||
| 1418 | if ((0xD800 <= wc && wc <= 0xDFFF) || 0x10FFFF < wc) | ||
| 1419 | return seterror(EILSEQ); | ||
| 1420 | ucs4_to_utf16(wc, wbuf, wbufsize); | ||
| 1421 | return 4; | ||
| 1422 | } | ||
| 1423 | |||
| 1424 | static int | ||
| 1425 | utf32_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize) | ||
| 1426 | { | ||
| 1427 | uint wc; | ||
| 1428 | |||
| 1429 | if ((cv->flags & FLAG_USE_BOM) && !(cv->mode & UNICODE_MODE_BOM_DONE)) | ||
| 1430 | { | ||
| 1431 | int r; | ||
| 1432 | |||
| 1433 | cv->mode |= UNICODE_MODE_BOM_DONE; | ||
| 1434 | if (bufsize < 4) | ||
| 1435 | return seterror(E2BIG); | ||
| 1436 | if (cv->codepage == 12000) /* little endian */ | ||
| 1437 | memcpy(buf, "\xFF\xFE\x00\x00", 4); | ||
| 1438 | else if (cv->codepage == 12001) /* big endian */ | ||
| 1439 | memcpy(buf, "\x00\x00\xFE\xFF", 4); | ||
| 1440 | |||
| 1441 | r = utf32_wctomb(cv, wbuf, wbufsize, buf + 4, bufsize - 4); | ||
| 1442 | if (r == -1) | ||
| 1443 | return -1; | ||
| 1444 | return r + 4; | ||
| 1445 | } | ||
| 1446 | |||
| 1447 | if (bufsize < 4) | ||
| 1448 | return seterror(E2BIG); | ||
| 1449 | wc = utf16_to_ucs4(wbuf); | ||
| 1450 | if (cv->codepage == 12000) /* little endian */ | ||
| 1451 | { | ||
| 1452 | buf[0] = wc & 0x000000FF; | ||
| 1453 | buf[1] = (wc & 0x0000FF00) >> 8; | ||
| 1454 | buf[2] = (wc & 0x00FF0000) >> 16; | ||
| 1455 | buf[3] = (wc & 0xFF000000) >> 24; | ||
| 1456 | } | ||
| 1457 | else if (cv->codepage == 12001) /* big endian */ | ||
| 1458 | { | ||
| 1459 | buf[0] = (wc & 0xFF000000) >> 24; | ||
| 1460 | buf[1] = (wc & 0x00FF0000) >> 16; | ||
| 1461 | buf[2] = (wc & 0x0000FF00) >> 8; | ||
| 1462 | buf[3] = wc & 0x000000FF; | ||
| 1463 | } | ||
| 1464 | return 4; | ||
| 1465 | } | ||
| 1466 | |||
| 1467 | /* | ||
| 1468 | * 50220: ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS) | ||
| 1469 | * 50221: ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow | ||
| 1470 | * 1 byte Kana) | ||
| 1471 | * 50222: ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte | ||
| 1472 | * Kana - SO/SI) | ||
| 1473 | * | ||
| 1474 | * MultiByteToWideChar() and WideCharToMultiByte() behave differently | ||
| 1475 | * depending on Windows version. On XP, WideCharToMultiByte() doesn't | ||
| 1476 | * terminate result sequence with ascii escape. But Vista does. | ||
| 1477 | * Use MLang instead. | ||
| 1478 | */ | ||
| 1479 | |||
/*
 * The converter mode packs the active character set and the shift
 * state into one value: bits 8..15 hold the set, bits 0..7 the shift.
 */
#define ISO2022_MODE(cs, shift) (((cs) << 8) | (shift))
#define ISO2022_MODE_CS(mode) (((mode) >> 8) & 0xFF)
#define ISO2022_MODE_SHIFT(mode) ((mode) & 0xFF)

/* shift states */
#define ISO2022_SI  0
#define ISO2022_SO  1

/* shift in */
static const char iso2022_SI_seq[] = "\x0F";
/* shift out */
static const char iso2022_SO_seq[] = "\x0E";

/* One escape sequence and the character set it selects. */
typedef struct iso2022_esc_t iso2022_esc_t;
struct iso2022_esc_t {
    const char *esc;    /* the escape sequence itself */
    int esc_len;        /* number of bytes in esc */
    int len;            /* bytes per character in the selected set */
    int cs;             /* ISO2022JP_CS_* value stored in the mode */
};

/* character set identifiers */
#define ISO2022JP_CS_ASCII            0
#define ISO2022JP_CS_JISX0201_ROMAN   1
#define ISO2022JP_CS_JISX0201_KANA    2
#define ISO2022JP_CS_JISX0208_1978    3
#define ISO2022JP_CS_JISX0208_1983    4
#define ISO2022JP_CS_JISX0212         5

/*
 * Known escape sequences, terminated by an entry with a NULL esc.
 * The table is also indexed directly with ISO2022JP_CS_* values, so
 * the order of the rows matters.
 */
static iso2022_esc_t iso2022jp_esc[] = {
    {"\x1B\x28\x42", 3, 1, ISO2022JP_CS_ASCII},
    {"\x1B\x28\x4A", 3, 1, ISO2022JP_CS_JISX0201_ROMAN},
    {"\x1B\x28\x49", 3, 1, ISO2022JP_CS_JISX0201_KANA},
    {"\x1B\x24\x40", 3, 2, ISO2022JP_CS_JISX0208_1983}, /* unify 1978 with 1983 */
    {"\x1B\x24\x42", 3, 2, ISO2022JP_CS_JISX0208_1983},
    {"\x1B\x24\x28\x44", 4, 2, ISO2022JP_CS_JISX0212},
    {NULL, 0, 0, 0}
};
| 1516 | |||
| 1517 | static int | ||
| 1518 | iso2022jp_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize) | ||
| 1519 | { | ||
| 1520 | iso2022_esc_t *iesc = iso2022jp_esc; | ||
| 1521 | char tmp[MB_CHAR_MAX]; | ||
| 1522 | int insize; | ||
| 1523 | HRESULT hr; | ||
| 1524 | DWORD dummy = 0; | ||
| 1525 | int len; | ||
| 1526 | int esc_len; | ||
| 1527 | int cs; | ||
| 1528 | int shift; | ||
| 1529 | int i; | ||
| 1530 | |||
| 1531 | if (buf[0] == 0x1B) | ||
| 1532 | { | ||
| 1533 | for (i = 0; iesc[i].esc != NULL; ++i) | ||
| 1534 | { | ||
| 1535 | esc_len = iesc[i].esc_len; | ||
| 1536 | if (bufsize < esc_len) | ||
| 1537 | { | ||
| 1538 | if (strncmp((char *)buf, iesc[i].esc, bufsize) == 0) | ||
| 1539 | return seterror(EINVAL); | ||
| 1540 | } | ||
| 1541 | else | ||
| 1542 | { | ||
| 1543 | if (strncmp((char *)buf, iesc[i].esc, esc_len) == 0) | ||
| 1544 | { | ||
| 1545 | cv->mode = ISO2022_MODE(iesc[i].cs, ISO2022_SI); | ||
| 1546 | *wbufsize = 0; | ||
| 1547 | return esc_len; | ||
| 1548 | } | ||
| 1549 | } | ||
| 1550 | } | ||
| 1551 | /* not supported escape sequence */ | ||
| 1552 | return seterror(EILSEQ); | ||
| 1553 | } | ||
| 1554 | else if (buf[0] == iso2022_SO_seq[0]) | ||
| 1555 | { | ||
| 1556 | cv->mode = ISO2022_MODE(ISO2022_MODE_CS(cv->mode), ISO2022_SO); | ||
| 1557 | *wbufsize = 0; | ||
| 1558 | return 1; | ||
| 1559 | } | ||
| 1560 | else if (buf[0] == iso2022_SI_seq[0]) | ||
| 1561 | { | ||
| 1562 | cv->mode = ISO2022_MODE(ISO2022_MODE_CS(cv->mode), ISO2022_SI); | ||
| 1563 | *wbufsize = 0; | ||
| 1564 | return 1; | ||
| 1565 | } | ||
| 1566 | |||
| 1567 | cs = ISO2022_MODE_CS(cv->mode); | ||
| 1568 | shift = ISO2022_MODE_SHIFT(cv->mode); | ||
| 1569 | |||
| 1570 | /* reset the mode for informal sequence */ | ||
| 1571 | if (buf[0] < 0x20) | ||
| 1572 | { | ||
| 1573 | cs = ISO2022JP_CS_ASCII; | ||
| 1574 | shift = ISO2022_SI; | ||
| 1575 | } | ||
| 1576 | |||
| 1577 | len = iesc[cs].len; | ||
| 1578 | if (bufsize < len) | ||
| 1579 | return seterror(EINVAL); | ||
| 1580 | for (i = 0; i < len; ++i) | ||
| 1581 | if (!(buf[i] < 0x80)) | ||
| 1582 | return seterror(EILSEQ); | ||
| 1583 | esc_len = iesc[cs].esc_len; | ||
| 1584 | memcpy(tmp, iesc[cs].esc, esc_len); | ||
| 1585 | if (shift == ISO2022_SO) | ||
| 1586 | { | ||
| 1587 | memcpy(tmp + esc_len, iso2022_SO_seq, 1); | ||
| 1588 | esc_len += 1; | ||
| 1589 | } | ||
| 1590 | memcpy(tmp + esc_len, buf, len); | ||
| 1591 | |||
| 1592 | if ((cv->codepage == 50220 || cv->codepage == 50221 | ||
| 1593 | || cv->codepage == 50222) && shift == ISO2022_SO) | ||
| 1594 | { | ||
| 1595 | /* XXX: shift-out cannot be used for mbtowc (both kernel and | ||
| 1596 | * mlang) */ | ||
| 1597 | esc_len = iesc[ISO2022JP_CS_JISX0201_KANA].esc_len; | ||
| 1598 | memcpy(tmp, iesc[ISO2022JP_CS_JISX0201_KANA].esc, esc_len); | ||
| 1599 | memcpy(tmp + esc_len, buf, len); | ||
| 1600 | } | ||
| 1601 | |||
| 1602 | insize = len + esc_len; | ||
| 1603 | hr = ConvertINetMultiByteToUnicode(&dummy, cv->codepage, | ||
| 1604 | (const char *)tmp, &insize, (wchar_t *)wbuf, wbufsize); | ||
| 1605 | if (hr != S_OK || insize != len + esc_len) | ||
| 1606 | return seterror(EILSEQ); | ||
| 1607 | |||
| 1608 | /* Check for conversion error. Assuming defaultChar is 0x3F. */ | ||
| 1609 | /* ascii should be converted from ascii */ | ||
| 1610 | if (wbuf[0] == buf[0] | ||
| 1611 | && cv->mode != ISO2022_MODE(ISO2022JP_CS_ASCII, ISO2022_SI)) | ||
| 1612 | return seterror(EILSEQ); | ||
| 1613 | |||
| 1614 | /* reset the mode for informal sequence */ | ||
| 1615 | if (cv->mode != ISO2022_MODE(cs, shift)) | ||
| 1616 | cv->mode = ISO2022_MODE(cs, shift); | ||
| 1617 | |||
| 1618 | return len; | ||
| 1619 | } | ||
| 1620 | |||
| 1621 | static int | ||
| 1622 | iso2022jp_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize) | ||
| 1623 | { | ||
| 1624 | iso2022_esc_t *iesc = iso2022jp_esc; | ||
| 1625 | char tmp[MB_CHAR_MAX]; | ||
| 1626 | int tmpsize = MB_CHAR_MAX; | ||
| 1627 | int insize = wbufsize; | ||
| 1628 | HRESULT hr; | ||
| 1629 | DWORD dummy = 0; | ||
| 1630 | int len; | ||
| 1631 | int esc_len; | ||
| 1632 | int cs; | ||
| 1633 | int shift; | ||
| 1634 | int i; | ||
| 1635 | |||
| 1636 | /* | ||
| 1637 | * MultiByte = [escape sequence] + character + [escape sequence] | ||
| 1638 | * | ||
| 1639 | * Whether trailing escape sequence is added depends on which API is | ||
| 1640 | * used (kernel or MLang, and its version). | ||
| 1641 | */ | ||
| 1642 | hr = ConvertINetUnicodeToMultiByte(&dummy, cv->codepage, | ||
| 1643 | (const wchar_t *)wbuf, &wbufsize, tmp, &tmpsize); | ||
| 1644 | if (hr != S_OK || insize != wbufsize) | ||
| 1645 | return seterror(EILSEQ); | ||
| 1646 | else if (bufsize < tmpsize) | ||
| 1647 | return seterror(E2BIG); | ||
| 1648 | |||
| 1649 | if (tmpsize == 1) | ||
| 1650 | { | ||
| 1651 | cs = ISO2022JP_CS_ASCII; | ||
| 1652 | esc_len = 0; | ||
| 1653 | } | ||
| 1654 | else | ||
| 1655 | { | ||
| 1656 | for (i = 1; iesc[i].esc != NULL; ++i) | ||
| 1657 | { | ||
| 1658 | esc_len = iesc[i].esc_len; | ||
| 1659 | if (strncmp(tmp, iesc[i].esc, esc_len) == 0) | ||
| 1660 | { | ||
| 1661 | cs = iesc[i].cs; | ||
| 1662 | break; | ||
| 1663 | } | ||
| 1664 | } | ||
| 1665 | if (iesc[i].esc == NULL) | ||
| 1666 | /* not supported escape sequence */ | ||
| 1667 | return seterror(EILSEQ); | ||
| 1668 | } | ||
| 1669 | |||
| 1670 | shift = ISO2022_SI; | ||
| 1671 | if (tmp[esc_len] == iso2022_SO_seq[0]) | ||
| 1672 | { | ||
| 1673 | shift = ISO2022_SO; | ||
| 1674 | esc_len += 1; | ||
| 1675 | } | ||
| 1676 | |||
| 1677 | len = iesc[cs].len; | ||
| 1678 | |||
| 1679 | /* Check for converting error. Assuming defaultChar is 0x3F. */ | ||
| 1680 | /* ascii should be converted from ascii */ | ||
| 1681 | if (cs == ISO2022JP_CS_ASCII && !(wbuf[0] < 0x80)) | ||
| 1682 | return seterror(EILSEQ); | ||
| 1683 | else if (tmpsize < esc_len + len) | ||
| 1684 | return seterror(EILSEQ); | ||
| 1685 | |||
| 1686 | if (cv->mode == ISO2022_MODE(cs, shift)) | ||
| 1687 | { | ||
| 1688 | /* remove escape sequence */ | ||
| 1689 | if (esc_len != 0) | ||
| 1690 | memmove(tmp, tmp + esc_len, len); | ||
| 1691 | esc_len = 0; | ||
| 1692 | } | ||
| 1693 | else | ||
| 1694 | { | ||
| 1695 | if (cs == ISO2022JP_CS_ASCII) | ||
| 1696 | { | ||
| 1697 | esc_len = iesc[ISO2022JP_CS_ASCII].esc_len; | ||
| 1698 | memmove(tmp + esc_len, tmp, len); | ||
| 1699 | memcpy(tmp, iesc[ISO2022JP_CS_ASCII].esc, esc_len); | ||
| 1700 | } | ||
| 1701 | if (ISO2022_MODE_SHIFT(cv->mode) == ISO2022_SO) | ||
| 1702 | { | ||
| 1703 | /* shift-in before changing to other mode */ | ||
| 1704 | memmove(tmp + 1, tmp, len + esc_len); | ||
| 1705 | memcpy(tmp, iso2022_SI_seq, 1); | ||
| 1706 | esc_len += 1; | ||
| 1707 | } | ||
| 1708 | } | ||
| 1709 | |||
| 1710 | if (bufsize < len + esc_len) | ||
| 1711 | return seterror(E2BIG); | ||
| 1712 | memcpy(buf, tmp, len + esc_len); | ||
| 1713 | cv->mode = ISO2022_MODE(cs, shift); | ||
| 1714 | return len + esc_len; | ||
| 1715 | } | ||
| 1716 | |||
| 1717 | static int | ||
| 1718 | iso2022jp_flush(csconv_t *cv, uchar *buf, int bufsize) | ||
| 1719 | { | ||
| 1720 | iso2022_esc_t *iesc = iso2022jp_esc; | ||
| 1721 | int esc_len; | ||
| 1722 | |||
| 1723 | if (cv->mode != ISO2022_MODE(ISO2022JP_CS_ASCII, ISO2022_SI)) | ||
| 1724 | { | ||
| 1725 | esc_len = 0; | ||
| 1726 | if (ISO2022_MODE_SHIFT(cv->mode) != ISO2022_SI) | ||
| 1727 | esc_len += 1; | ||
| 1728 | if (ISO2022_MODE_CS(cv->mode) != ISO2022JP_CS_ASCII) | ||
| 1729 | esc_len += iesc[ISO2022JP_CS_ASCII].esc_len; | ||
| 1730 | if (bufsize < esc_len) | ||
| 1731 | return seterror(E2BIG); | ||
| 1732 | |||
| 1733 | esc_len = 0; | ||
| 1734 | if (ISO2022_MODE_SHIFT(cv->mode) != ISO2022_SI) | ||
| 1735 | { | ||
| 1736 | memcpy(buf, iso2022_SI_seq, 1); | ||
| 1737 | esc_len += 1; | ||
| 1738 | } | ||
| 1739 | if (ISO2022_MODE_CS(cv->mode) != ISO2022JP_CS_ASCII) | ||
| 1740 | { | ||
| 1741 | memcpy(buf + esc_len, iesc[ISO2022JP_CS_ASCII].esc, | ||
| 1742 | iesc[ISO2022JP_CS_ASCII].esc_len); | ||
| 1743 | esc_len += iesc[ISO2022JP_CS_ASCII].esc_len; | ||
| 1744 | } | ||
| 1745 | return esc_len; | ||
| 1746 | } | ||
| 1747 | return 0; | ||
| 1748 | } | ||
| 1749 | |||
| 1750 | static void process_file(iconv_t cd, FILE *in, FILE *out) | ||
| 1751 | { | ||
| 1752 | char inbuf[BUFSIZ]; | ||
| 1753 | char outbuf[BUFSIZ]; | ||
| 1754 | const char *pin; | ||
| 1755 | char *pout; | ||
| 1756 | size_t inbytesleft; | ||
| 1757 | size_t outbytesleft; | ||
| 1758 | size_t rest = 0; | ||
| 1759 | size_t r; | ||
| 1760 | |||
| 1761 | while ((inbytesleft=fread(inbuf+rest, 1, sizeof(inbuf)-rest, in)) != 0 | ||
| 1762 | || rest != 0) { | ||
| 1763 | inbytesleft += rest; | ||
| 1764 | pin = inbuf; | ||
| 1765 | pout = outbuf; | ||
| 1766 | outbytesleft = sizeof(outbuf); | ||
| 1767 | r = iconv(cd, &pin, &inbytesleft, &pout, &outbytesleft); | ||
| 1768 | fwrite(outbuf, 1, sizeof(outbuf) - outbytesleft, out); | ||
| 1769 | if (r == (size_t)(-1) && errno != E2BIG && | ||
| 1770 | (errno != EINVAL || feof(in))) | ||
| 1771 | bb_perror_msg_and_die("conversion error"); | ||
| 1772 | memmove(inbuf, pin, inbytesleft); | ||
| 1773 | rest = inbytesleft; | ||
| 1774 | } | ||
| 1775 | pout = outbuf; | ||
| 1776 | outbytesleft = sizeof(outbuf); | ||
| 1777 | r = iconv(cd, NULL, NULL, &pout, &outbytesleft); | ||
| 1778 | fwrite(outbuf, 1, sizeof(outbuf) - outbytesleft, out); | ||
| 1779 | if (r == (size_t)(-1)) | ||
| 1780 | bb_perror_msg_and_die("conversion error"); | ||
| 1781 | } | ||
| 1782 | |||
| 1783 | int iconv_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; | ||
| 1784 | int iconv_main(int argc, char **argv) | ||
| 1785 | { | ||
| 1786 | char *fromcode = NULL; | ||
| 1787 | char *tocode = NULL; | ||
| 1788 | int i; | ||
| 1789 | iconv_t cd; | ||
| 1790 | FILE *in = stdin; | ||
| 1791 | FILE *out = stdout; | ||
| 1792 | int ignore = 0; | ||
| 1793 | |||
| 1794 | while ((i = getopt(argc, argv, "f:t:lco:")) != -1) { | ||
| 1795 | switch (i) { | ||
| 1796 | case 'l': | ||
| 1797 | for (i = 0; codepage_alias[i].name != NULL; ++i) | ||
| 1798 | printf("%s\n", codepage_alias[i].name); | ||
| 1799 | return 0; | ||
| 1800 | |||
| 1801 | case 'f': | ||
| 1802 | fromcode = optarg; | ||
| 1803 | break; | ||
| 1804 | |||
| 1805 | case 't': | ||
| 1806 | tocode = optarg; | ||
| 1807 | break; | ||
| 1808 | |||
| 1809 | case 'c': | ||
| 1810 | ignore = 1; | ||
| 1811 | break; | ||
| 1812 | |||
| 1813 | case 'o': | ||
| 1814 | out = xfopen(optarg, "wb"); | ||
| 1815 | break; | ||
| 1816 | |||
| 1817 | default: | ||
| 1818 | bb_show_usage(); | ||
| 1819 | } | ||
| 1820 | } | ||
| 1821 | |||
| 1822 | if (fromcode == NULL || tocode == NULL) | ||
| 1823 | bb_show_usage(); | ||
| 1824 | |||
| 1825 | if (ignore) | ||
| 1826 | tocode = xasprintf("%s//IGNORE", tocode); | ||
| 1827 | |||
| 1828 | cd = iconv_open(tocode, fromcode); | ||
| 1829 | if (cd == (iconv_t)(-1)) | ||
| 1830 | bb_perror_msg_and_die("iconv_open error"); | ||
| 1831 | |||
| 1832 | if (optind == argc || | ||
| 1833 | (optind == argc-1 && strcmp(argv[optind], "-") == 0)) { | ||
| 1834 | process_file(cd, in, out); | ||
| 1835 | } | ||
| 1836 | else { | ||
| 1837 | for (i=optind; i<argc; ++i) { | ||
| 1838 | in = xfopen(argv[i], "rb"); | ||
| 1839 | process_file(cd, in, out); | ||
| 1840 | fclose(in); | ||
| 1841 | } | ||
| 1842 | } | ||
| 1843 | |||
| 1844 | iconv_close(cd); | ||
| 1845 | return 0; | ||
| 1846 | } | ||
