diff options
| author | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2021-12-27 00:00:00 +0000 |
|---|---|---|
| committer | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2022-03-18 15:35:13 +0500 |
| commit | f19f813537c7aea1c20749c914e756b54a9c3cf5 (patch) | |
| tree | 816ba62ca7c0fa19f2eb46d9e9d6f7dd7c3a744d /CPP/Common/StringConvert.cpp | |
| parent | 98e06a519b63b81986abe76d28887f6984a7732b (diff) | |
| download | 7zip-f19f813537c7aea1c20749c914e756b54a9c3cf5.tar.gz 7zip-f19f813537c7aea1c20749c914e756b54a9c3cf5.tar.bz2 7zip-f19f813537c7aea1c20749c914e756b54a9c3cf5.zip | |
'21.07'21.07
Diffstat (limited to 'CPP/Common/StringConvert.cpp')
| -rw-r--r-- | CPP/Common/StringConvert.cpp | 757 |
1 files changed, 757 insertions, 0 deletions
diff --git a/CPP/Common/StringConvert.cpp b/CPP/Common/StringConvert.cpp new file mode 100644 index 0000000..c0bde0f --- /dev/null +++ b/CPP/Common/StringConvert.cpp | |||
| @@ -0,0 +1,757 @@ | |||
| 1 | // Common/StringConvert.cpp | ||
| 2 | |||
| 3 | #include "StdAfx.h" | ||
| 4 | |||
| 5 | #include "StringConvert.h" | ||
| 6 | |||
| 7 | #ifndef _WIN32 | ||
| 8 | // #include <stdio.h> | ||
| 9 | #include <stdlib.h> | ||
| 10 | #endif | ||
| 11 | |||
| 12 | #if !defined(_WIN32) || defined(ENV_HAVE_LOCALE) | ||
| 13 | #include "UTFConvert.h" | ||
| 14 | #endif | ||
| 15 | |||
| 16 | #ifdef ENV_HAVE_LOCALE | ||
| 17 | #include <locale.h> | ||
| 18 | #endif | ||
| 19 | |||
| 20 | static const char k_DefultChar = '_'; // passed as defaultChar by the wrappers below that take no explicit default character | ||
| 21 | |||
| 22 | #ifdef _WIN32 | ||
| 23 | |||
| 24 | /* | ||
| 25 | MultiByteToWideChar(CodePage, DWORD dwFlags, | ||
| 26 | LPCSTR lpMultiByteStr, int cbMultiByte, | ||
| 27 | LPWSTR lpWideCharStr, int cchWideChar) | ||
| 28 | |||
| 29 | if (cbMultiByte == 0) | ||
| 30 | return: 0. ERR: ERROR_INVALID_PARAMETER | ||
| 31 | |||
| 32 | if (cchWideChar == 0) | ||
| 33 | return: the required buffer size in characters. | ||
| 34 | |||
| 35 | if (supplied buffer size was not large enough) | ||
| 36 | return: 0. ERR: ERROR_INSUFFICIENT_BUFFER | ||
| 37 | The number of filled characters in lpWideCharStr can be smaller than cchWideChar (if last character is complex) | ||
| 38 | |||
| 39 | If there are illegal characters: | ||
| 40 | if MB_ERR_INVALID_CHARS is set in dwFlags: | ||
| 41 | - the function stops conversion on illegal character. | ||
| 42 | - Return: 0. ERR: ERROR_NO_UNICODE_TRANSLATION. | ||
| 43 | |||
| 44 | if MB_ERR_INVALID_CHARS is NOT set in dwFlags: | ||
| 45 | before Vista: illegal character is dropped (skipped). WinXP-64: GetLastError() returns 0. | ||
| 46 | in Vista+: illegal character is not dropped (MSDN). Undocumented: illegal | ||
| 47 | character is converted to U+FFFD, which is REPLACEMENT CHARACTER. | ||
| 48 | */ | ||
| 49 | |||
| 50 | |||
| 51 | // Converts the multi-byte string src (encoded in codePage) to a wide string in dest with MultiByteToWideChar(). | ||
| 52 | // dest is cleared first. Throws the int 282228 if the API reports a failure. | ||
| 53 | void MultiByteToUnicodeString2(UString &dest, const AString &src, UINT codePage) | ||
| 54 | { | ||
| 55 | dest.Empty(); | ||
| 56 | if (src.IsEmpty()) | ||
| 57 | return; | ||
| 58 | /* | ||
| 59 | wchar_t *d = dest.GetBuf(src.Len()); | ||
| 60 | const char *s = (const char *)src; | ||
| 61 | unsigned i; | ||
| 62 | |||
| 63 | for (i = 0;;) | ||
| 64 | { | ||
| 65 | Byte c = (Byte)s[i]; | ||
| 66 | if (c >= 0x80 || c == 0) | ||
| 67 | break; | ||
| 68 | d[i++] = (wchar_t)c; | ||
| 69 | } | ||
| 70 | |||
| 71 | if (i != src.Len()) | ||
| 72 | { | ||
| 73 | unsigned len = MultiByteToWideChar(codePage, 0, s + i, | ||
| 74 | src.Len() - i, d + i, | ||
| 75 | src.Len() + 1 - i); | ||
| 76 | if (len == 0) | ||
| 77 | throw 282228; | ||
| 78 | i += len; | ||
| 79 | } | ||
| 80 | |||
| 81 | d[i] = 0; | ||
| 82 | dest.ReleaseBuf_SetLen(i); | ||
| 83 | */ | ||
| 84 | const int srcLen = (int)src.Len(); | ||
| 85 | // with cchWideChar == 0 the call only returns the required buffer size in characters | ||
| 86 | const unsigned numChars = (unsigned)MultiByteToWideChar(codePage, 0, src, srcLen, NULL, 0); | ||
| 87 | if (numChars == 0) | ||
| 88 | { | ||
| 89 | if (GetLastError() != 0) | ||
| 90 | throw 282228; | ||
| 91 | return; // zero without an error code: dest stays empty | ||
| 92 | } | ||
| 93 | const unsigned numWritten = (unsigned)MultiByteToWideChar(codePage, 0, src, srcLen, dest.GetBuf(numChars), (int)numChars); | ||
| 94 | if (numWritten == 0) | ||
| 95 | throw 282228; | ||
| 96 | dest.ReleaseBuf_SetEnd(numWritten); | ||
| 97 | } | ||
| 98 | |||
| 99 | /* | ||
| 100 | int WideCharToMultiByte( | ||
| 101 | UINT CodePage, DWORD dwFlags, | ||
| 102 | LPCWSTR lpWideCharStr, int cchWideChar, | ||
| 103 | LPSTR lpMultiByteStr, int cbMultiByte, | ||
| 104 | LPCSTR lpDefaultChar, LPBOOL lpUsedDefaultChar); | ||
| 105 | |||
| 106 | if (lpDefaultChar == NULL), | ||
| 107 | - it uses system default value. | ||
| 108 | |||
| 109 | if (CodePage == CP_UTF7 || CodePage == CP_UTF8) | ||
| 110 | if (lpDefaultChar != NULL || lpUsedDefaultChar != NULL) | ||
| 111 | return: 0. ERR: ERROR_INVALID_PARAMETER. | ||
| 112 | |||
| 113 | The function operates most efficiently, if (lpDefaultChar == NULL && lpUsedDefaultChar == NULL) | ||
| 114 | |||
| 115 | */ | ||
| 116 | |||
| 117 | // Converts the wide string src to the multi-byte code page codePage with WideCharToMultiByte() and stores it in dest. | ||
| 118 | // For code pages other than UTF-7 / UTF-8, defaultChar is passed as lpDefaultChar and defaultCharWasUsed | ||
| 119 | // receives the lpUsedDefaultChar result. Throws the int 282228 if the API reports a failure. | ||
| 120 | static void UnicodeStringToMultiByte2(AString &dest, const UString &src, UINT codePage, char defaultChar, bool &defaultCharWasUsed) | ||
| 121 | { | ||
| 122 | dest.Empty(); | ||
| 123 | defaultCharWasUsed = false; | ||
| 124 | if (src.IsEmpty()) | ||
| 125 | return; | ||
| 126 | /* | ||
| 127 | unsigned numRequiredBytes = src.Len() * 2; | ||
| 128 | char *d = dest.GetBuf(numRequiredBytes); | ||
| 129 | const wchar_t *s = (const wchar_t *)src; | ||
| 130 | unsigned i; | ||
| 131 | |||
| 132 | for (i = 0;;) | ||
| 133 | { | ||
| 134 | wchar_t c = s[i]; | ||
| 135 | if (c >= 0x80 || c == 0) | ||
| 136 | break; | ||
| 137 | d[i++] = (char)c; | ||
| 138 | } | ||
| 139 | |||
| 140 | if (i != src.Len()) | ||
| 141 | { | ||
| 142 | BOOL defUsed = FALSE; | ||
| 143 | defaultChar = defaultChar; | ||
| 144 | |||
| 145 | bool isUtf = (codePage == CP_UTF8 || codePage == CP_UTF7); | ||
| 146 | unsigned len = WideCharToMultiByte(codePage, 0, s + i, src.Len() - i, | ||
| 147 | d + i, numRequiredBytes + 1 - i, | ||
| 148 | (isUtf ? NULL : &defaultChar), | ||
| 149 | (isUtf ? NULL : &defUsed)); | ||
| 150 | defaultCharWasUsed = (defUsed != FALSE); | ||
| 151 | if (len == 0) | ||
| 152 | throw 282229; | ||
| 153 | i += len; | ||
| 154 | } | ||
| 155 | |||
| 156 | d[i] = 0; | ||
| 157 | dest.ReleaseBuf_SetLen(i); | ||
| 158 | */ | ||
| 159 | |||
| 160 | /* | ||
| 161 | if (codePage != CP_UTF7) | ||
| 162 | { | ||
| 163 | const wchar_t *s = (const wchar_t *)src; | ||
| 164 | unsigned i; | ||
| 165 | for (i = 0;; i++) | ||
| 166 | { | ||
| 167 | wchar_t c = s[i]; | ||
| 168 | if (c >= 0x80 || c == 0) | ||
| 169 | break; | ||
| 170 | } | ||
| 171 | |||
| 172 | if (s[i] == 0) | ||
| 173 | { | ||
| 174 | char *d = dest.GetBuf(src.Len()); | ||
| 175 | for (i = 0;;) | ||
| 176 | { | ||
| 177 | wchar_t c = s[i]; | ||
| 178 | if (c == 0) | ||
| 179 | break; | ||
| 180 | d[i++] = (char)c; | ||
| 181 | } | ||
| 182 | d[i] = 0; | ||
| 183 | dest.ReleaseBuf_SetLen(i); | ||
| 184 | return; | ||
| 185 | } | ||
| 186 | } | ||
| 187 | */ | ||
| 188 | |||
| 189 | const int srcLen = (int)src.Len(); | ||
| 190 | const unsigned numBytes = (unsigned)WideCharToMultiByte(codePage, 0, src, srcLen, NULL, 0, NULL, NULL); | ||
| 191 | if (numBytes == 0) | ||
| 192 | { | ||
| 193 | if (GetLastError() != 0) | ||
| 194 | throw 282228; | ||
| 195 | return; // zero without an error code: dest stays empty | ||
| 196 | } | ||
| 197 | // UTF-7 / UTF-8 require lpDefaultChar and lpUsedDefaultChar to be NULL, so they are passed only for other code pages | ||
| 198 | const bool isUtf = (codePage == CP_UTF8 || codePage == CP_UTF7); | ||
| 199 | BOOL defUsed = FALSE; | ||
| 200 | const char *defCharPtr = isUtf ? NULL : &defaultChar; | ||
| 201 | BOOL *defUsedPtr = isUtf ? NULL : &defUsed; | ||
| 202 | const unsigned numWritten = (unsigned)WideCharToMultiByte(codePage, 0, src, srcLen, | ||
| 203 | dest.GetBuf(numBytes), (int)numBytes, defCharPtr, defUsedPtr); | ||
| 204 | // the out-flag is updated before the failure check | ||
| 205 | if (!isUtf) | ||
| 206 | defaultCharWasUsed = (defUsed != FALSE); | ||
| 207 | if (numWritten == 0) | ||
| 208 | throw 282228; | ||
| 209 | dest.ReleaseBuf_SetEnd(numWritten); | ||
| 210 | } | ||
| 211 | |||
| 212 | /* | ||
| 213 | #ifndef UNDER_CE | ||
| 214 | AString SystemStringToOemString(const CSysString &src) | ||
| 215 | { | ||
| 216 | AString dest; | ||
| 217 | const unsigned len = src.Len() * 2; | ||
| 218 | CharToOem(src, dest.GetBuf(len)); | ||
| 219 | dest.ReleaseBuf_CalcLen(len); | ||
| 220 | return dest; | ||
| 221 | } | ||
| 222 | #endif | ||
| 223 | */ | ||
| 224 | |||
| 225 | #else // _WIN32 | ||
| 226 | |||
| 227 | // #include <stdio.h> | ||
| 228 | /* | ||
| 229 | if (wchar_t is 32-bit (#if WCHAR_MAX > 0xffff), | ||
| 230 | and utf-8 string contains big unicode character > 0xffff), | ||
| 231 | then we still use 16-bit surrogate pair in UString. | ||
| 232 | It simplifies another code where utf-16 encoding is used. | ||
| 233 | So we use surrogate-conversion code only in this file. | ||
| 234 | */ | ||
| 235 | |||
| 236 | /* | ||
| 237 | mbstowcs() returns error if there is error in utf-8 stream, | ||
| 238 | mbstowcs() returns error if there is a single surrogate code point (d800-dfff) in utf-8 stream | ||
| 239 | */ | ||
| 240 | |||
| 241 | /* | ||
| 242 | static void MultiByteToUnicodeString2_Native(UString &dest, const AString &src) | ||
| 243 | { | ||
| 244 | dest.Empty(); | ||
| 245 | if (src.IsEmpty()) | ||
| 246 | return; | ||
| 247 | |||
| 248 | const size_t limit = ((size_t)src.Len() + 1) * 2; | ||
| 249 | wchar_t *d = dest.GetBuf((unsigned)limit); | ||
| 250 | const size_t len = mbstowcs(d, src, limit); | ||
| 251 | if (len != (size_t)-1) | ||
| 252 | { | ||
| 253 | dest.ReleaseBuf_SetEnd((unsigned)len); | ||
| 254 | return; | ||
| 255 | } | ||
| 256 | dest.ReleaseBuf_SetEnd(0); | ||
| 257 | } | ||
| 258 | */ | ||
| 259 | |||
| 260 | bool g_ForceToUTF8 = true; // false; // when set, the non-Windows converters below use UTF-8 whatever codePage is passed | ||
| 261 | |||
| 262 | // Converts the multi-byte string src to a wide string in dest (non-Windows build). | ||
| 263 | // ConvertUTF8ToUnicode() is used if codePage is CP_UTF8 or g_ForceToUTF8 is set; otherwise mbstowcs() is used. | ||
| 264 | // If mbstowcs() fails, every source byte is stored unchanged as one wchar_t. | ||
| 265 | void MultiByteToUnicodeString2(UString &dest, const AString &src, UINT codePage) | ||
| 266 | { | ||
| 267 | dest.Empty(); | ||
| 268 | if (src.IsEmpty()) | ||
| 269 | return; | ||
| 270 | |||
| 271 | if (codePage == CP_UTF8 || g_ForceToUTF8) | ||
| 272 | { | ||
| 273 | ConvertUTF8ToUnicode(src, dest); | ||
| 274 | return; | ||
| 275 | } | ||
| 276 | |||
| 277 | const size_t limit = ((size_t)src.Len() + 1) * 2; | ||
| 278 | wchar_t *d = dest.GetBuf((unsigned)limit); | ||
| 279 | const size_t len = mbstowcs(d, src, limit); | ||
| 280 | if (len != (size_t)-1) | ||
| 281 | { | ||
| 282 | dest.ReleaseBuf_SetEnd((unsigned)len); | ||
| 283 | |||
| 284 | #if WCHAR_MAX > 0xffff | ||
| 285 | // wchar_t is wider than 16 bits: characters above 0xffff are rewritten as UTF-16 surrogate pairs | ||
| 286 | d = dest.GetBuf(); | ||
| 287 | for (size_t i = 0;; i++) | ||
| 288 | { | ||
| 289 | wchar_t c = d[i]; | ||
| 290 | if (c == 0) | ||
| 291 | break; | ||
| 292 | if (c >= 0x10000 && c < 0x110000) | ||
| 293 | { | ||
| 294 | // first big character found: copy the tail and re-emit it in place, | ||
| 295 | // because every surrogate pair needs two slots and the text grows | ||
| 296 | UString temp = d + i; | ||
| 297 | |||
| 298 | for (size_t t = 0;; t++) | ||
| 299 | { | ||
| 300 | wchar_t w = temp[t]; | ||
| 301 | if (w == 0) | ||
| 302 | break; | ||
| 303 | if (i == limit) | ||
| 304 | break; // unexpected error | ||
| 305 | if (w >= 0x10000 && w < 0x110000) | ||
| 306 | { | ||
| 307 | if (i + 1 == limit) | ||
| 308 | break; // unexpected error | ||
| 309 | w -= 0x10000; | ||
| 310 | d[i++] = (unsigned)0xd800 + (((unsigned)w >> 10) & 0x3FF); | ||
| 311 | w = 0xdc00 + (w & 0x3FF); | ||
| 312 | } | ||
| 313 | d[i++] = w; | ||
| 314 | } | ||
| 315 | dest.ReleaseBuf_SetEnd((unsigned)i); | ||
| 316 | // The inner loop has already processed the whole tail and i is the new end of the string. | ||
| 317 | // Without this break the outer loop would increment i and read d[i] beyond the new terminator. | ||
| 318 | break; | ||
| 319 | } | ||
| 320 | } | ||
| 321 | |||
| 322 | #endif | ||
| 323 | |||
| 324 | /* | ||
| 325 | printf("\nMultiByteToUnicodeString2 (%d) %s\n", (int)src.Len(), src.Ptr()); | ||
| 326 | printf("char: "); | ||
| 327 | for (unsigned i = 0; i < src.Len(); i++) | ||
| 328 | printf (" %02x", (int)(Byte)src[i]); | ||
| 329 | printf("\n"); | ||
| 330 | printf("\n-> (%d) %ls\n", (int)dest.Len(), dest.Ptr()); | ||
| 331 | printf("wchar_t: "); | ||
| 332 | for (unsigned i = 0; i < dest.Len(); i++) | ||
| 333 | { | ||
| 334 | printf (" %02x", (int)dest[i]); | ||
| 335 | } | ||
| 336 | printf("\n"); | ||
| 337 | */ | ||
| 338 | |||
| 339 | return; | ||
| 340 | } | ||
| 341 | |||
| 342 | /* if there is mbstowcs() error, we have two ways: | ||
| 343 | |||
| 344 | 1) change 0x80+ characters to some character: '_' | ||
| 345 | in that case we lose data, but we have correct UString() | ||
| 346 | and that scheme can show errors to user in early stages, | ||
| 347 | when file converted back to mbs() cannot be found | ||
| 348 | |||
| 349 | 2) transfer bad characters in some UTF-16 range. | ||
| 350 | it can be non-original Unicode character. | ||
| 351 | but later we still can restore original character. | ||
| 352 | */ | ||
| 353 | |||
| 354 | |||
| 355 | // printf("\nmbstowcs ERROR !!!!!! s=%s\n", src.Ptr()); | ||
| 356 | { | ||
| 357 | unsigned i; | ||
| 358 | const char *s = (const char *)src; | ||
| 359 | for (i = 0;;) | ||
| 360 | { | ||
| 361 | Byte c = (Byte)s[i]; | ||
| 362 | if (c == 0) | ||
| 363 | break; | ||
| 364 | // we can use ascii compatibility character '_' | ||
| 365 | // if (c > 0x7F) c = '_'; // we replace "bad" character | ||
| 366 | d[i++] = (wchar_t)c; | ||
| 367 | } | ||
| 368 | d[i] = 0; | ||
| 369 | dest.ReleaseBuf_SetLen(i); | ||
| 370 | } | ||
| 371 | } | ||
| 372 | |||
| 373 | // Converts the wide string src to a multi-byte string in dest with wcstombs(). | ||
| 374 | // dest is left empty if src is empty or if wcstombs() fails. | ||
| 375 | static void UnicodeStringToMultiByte2_Native(AString &dest, const UString &src) | ||
| 376 | { | ||
| 377 | dest.Empty(); | ||
| 378 | if (src.IsEmpty()) | ||
| 379 | return; | ||
| 380 | |||
| 381 | // room for 6 bytes per wide character, plus 6 spare bytes | ||
| 382 | const size_t capacity = ((size_t)src.Len() + 1) * 6; | ||
| 383 | char * const buf = dest.GetBuf((unsigned)capacity); | ||
| 384 | const size_t numBytes = wcstombs(buf, src, capacity); | ||
| 385 | // (size_t)-1 is the failure result of wcstombs(): an empty string is reported in that case | ||
| 386 | if (numBytes == (size_t)-1) | ||
| 387 | dest.ReleaseBuf_SetEnd(0); | ||
| 388 | else | ||
| 389 | dest.ReleaseBuf_SetEnd((unsigned)numBytes); | ||
| 390 | } | ||
| 391 | |||
| 392 | |||
| 393 | // Converts the wide string src2 to a multi-byte string in dest (non-Windows build). | ||
| 394 | // ConvertUnicodeToUTF8() is used if codePage is CP_UTF8 or g_ForceToUTF8 is set; otherwise wcstombs() is used. | ||
| 395 | // If wcstombs() fails, characters >= 0x100 are replaced with defaultChar (defaultCharWasUsed is then set to true) | ||
| 396 | // and all other characters are stored as (char)c. | ||
| 397 | static void UnicodeStringToMultiByte2(AString &dest, const UString &src2, UINT codePage, char defaultChar, bool &defaultCharWasUsed) | ||
| 398 | { | ||
| 399 | // if (codePage == 1234567) // for debug purposes | ||
| 400 | if (codePage == CP_UTF8 || g_ForceToUTF8) | ||
| 401 | { | ||
| 402 | defaultCharWasUsed = false; | ||
| 403 | ConvertUnicodeToUTF8(src2, dest); | ||
| 404 | return; | ||
| 405 | } | ||
| 406 | |||
| 407 | UString src = src2; | ||
| 408 | #if WCHAR_MAX > 0xffff | ||
| 409 | { | ||
| 410 | src.Empty(); | ||
| 411 | for (unsigned i = 0; i < src2.Len();) | ||
| 412 | { | ||
| 413 | wchar_t c = src2[i]; | ||
| 414 | if (c >= 0xd800 && c < 0xdc00 && i + 1 != src2.Len()) | ||
| 415 | { | ||
| 416 | const wchar_t c2 = src2[i + 1]; | ||
| 417 | if (c2 >= 0xdc00 && c2 < 0xe000) | ||
| 418 | { | ||
| 419 | // low surrogates are dc00-dfff only: merge the pair into one code point for the wide wchar_t | ||
| 420 | c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff); | ||
| 421 | // consume the low surrogate too | ||
| 422 | i++; | ||
| 423 | } | ||
| 424 | } | ||
| 425 | src += c; | ||
| 426 | i++; | ||
| 427 | } | ||
| 428 | } | ||
| 429 | #endif | ||
| 430 | |||
| 431 | dest.Empty(); | ||
| 432 | defaultCharWasUsed = false; | ||
| 433 | if (src.IsEmpty()) | ||
| 434 | return; | ||
| 435 | |||
| 436 | const size_t len = wcstombs(NULL, src, 0); | ||
| 437 | |||
| 438 | if (len != (size_t)-1) | ||
| 439 | { | ||
| 440 | const unsigned limit = ((unsigned)len); | ||
| 441 | if (limit == len) | ||
| 442 | { | ||
| 443 | char *d = dest.GetBuf(limit); | ||
| 444 | |||
| 445 | /* | ||
| 446 | { | ||
| 447 | printf("\nwcstombs; len = %d %ls \n", (int)src.Len(), src.Ptr()); | ||
| 448 | for (unsigned i = 0; i < src.Len(); i++) | ||
| 449 | printf (" %02x", (int)src[i]); | ||
| 450 | printf("\n"); | ||
| 451 | printf("\ndest Limit = %d \n", limit); | ||
| 452 | } | ||
| 453 | */ | ||
| 454 | |||
| 455 | const size_t len2 = wcstombs(d, src, len + 1); | ||
| 456 | |||
| 457 | if (len2 != (size_t)-1 && len2 <= limit) | ||
| 458 | { | ||
| 459 | /* | ||
| 460 | printf("\nOK : destLen = %d : %s\n", (int)len, dest.Ptr()); | ||
| 461 | for (unsigned i = 0; i < len2; i++) | ||
| 462 | printf(" %02x", (int)(Byte)dest[i]); | ||
| 463 | printf("\n"); | ||
| 464 | */ | ||
| 465 | dest.ReleaseBuf_SetEnd((unsigned)len2); | ||
| 466 | return; | ||
| 467 | } | ||
| 468 | } | ||
| 469 | } | ||
| 470 | |||
| 471 | { | ||
| 472 | const wchar_t *s = (const wchar_t *)src; | ||
| 473 | char *d = dest.GetBuf(src.Len()); | ||
| 474 | |||
| 475 | unsigned i; | ||
| 476 | for (i = 0;;) | ||
| 477 | { | ||
| 478 | wchar_t c = s[i]; | ||
| 479 | if (c == 0) | ||
| 480 | break; | ||
| 481 | if (c >= | ||
| 482 | 0x100 | ||
| 483 | // 0x80 | ||
| 484 | ) | ||
| 485 | { | ||
| 486 | c = defaultChar; | ||
| 487 | defaultCharWasUsed = true; | ||
| 488 | } | ||
| 489 | |||
| 490 | d[i++] = (char)c; | ||
| 491 | } | ||
| 492 | d[i] = 0; | ||
| 493 | dest.ReleaseBuf_SetLen(i); | ||
| 494 | } | ||
| 495 | } | ||
| 496 | |||
| 497 | #endif // _WIN32 | ||
| 498 | |||
| 499 | |||
| 500 | UString MultiByteToUnicodeString(const AString &src, UINT codePage) // returns the MultiByteToUnicodeString2() result by value | ||
| 501 | { | ||
| 502 | UString converted; | ||
| 503 | MultiByteToUnicodeString2(converted, src, codePage); | ||
| 504 | return converted; | ||
| 505 | } | ||
| 506 | |||
| 507 | UString MultiByteToUnicodeString(const char *src, UINT codePage) // same conversion for a C string, wrapped in an AString first | ||
| 508 | { | ||
| 509 | return MultiByteToUnicodeString(AString(src), codePage); | ||
| 510 | } | ||
| 511 | |||
| 512 | |||
| 513 | void UnicodeStringToMultiByte2(AString &dest, const UString &src, UINT codePage) // uses k_DefultChar; the "default char was used" result is discarded | ||
| 514 | { | ||
| 515 | bool unusedFlag; | ||
| 516 | UnicodeStringToMultiByte2(dest, src, codePage, k_DefultChar, unusedFlag); | ||
| 517 | } | ||
| 518 | |||
| 519 | // Returns src converted to the multi-byte code page codePage; all work is done by the 5-argument UnicodeStringToMultiByte2(). | ||
| 520 | AString UnicodeStringToMultiByte(const UString &src, UINT codePage, char defaultChar, bool &defaultCharWasUsed) | ||
| 521 | { | ||
| 522 | AString converted; | ||
| 523 | UnicodeStringToMultiByte2(converted, src, codePage, defaultChar, defaultCharWasUsed); | ||
| 524 | return converted; | ||
| 525 | } | ||
| 526 | |||
| 527 | AString UnicodeStringToMultiByte(const UString &src, UINT codePage) // k_DefultChar variant; the "default char was used" flag is not reported | ||
| 528 | { | ||
| 529 | AString converted; | ||
| 530 | UnicodeStringToMultiByte2(converted, src, codePage); | ||
| 531 | return converted; | ||
| 532 | } | ||
| 533 | |||
| 534 | |||
| 535 | |||
| 536 | |||
| 537 | |||
| 538 | #ifdef _WIN32 | ||
| 539 | #define U_to_A(a, b, c) UnicodeStringToMultiByte2(a, b, c) | ||
| 540 | // the macro must expand to a call: a bare function name would turn U_to_A(...) into a statement with no effect | ||
| 541 | #else | ||
| 542 | // the native converter takes no code page argument, so c is dropped here | ||
| 543 | #define U_to_A(a, b, c) UnicodeStringToMultiByte2_Native(a, b) | ||
| 544 | // #define A_to_U(a, b, c) MultiByteToUnicodeString2_Native(a, b) | ||
| 545 | #endif | ||
| 546 | |||
| 547 | #if !defined(_WIN32) || defined(ENV_HAVE_LOCALE) | ||
| 548 | |||
| 549 | // Returns true if, for every probed single character in [0x80, 0xD000), the string filled through | ||
| 550 | // the U_to_A macro is equal to the ConvertUnicodeToUTF8() result for the same character. | ||
| 551 | bool IsNativeUTF8() | ||
| 552 | { | ||
| 553 | UString sample; | ||
| 554 | AString utf8, native; | ||
| 555 | |||
| 556 | // disabled denser scan: for (unsigned c = 0x80; c < (UInt32)0x10000; c += (c >> 9) + 1) | ||
| 557 | // the step grows with the value ((code >> 2) + 1), so only a sparse set of characters is probed | ||
| 558 | unsigned code = 0x80; | ||
| 559 | while (code < (UInt32)0xD000) | ||
| 560 | { | ||
| 561 | sample.Empty(); | ||
| 562 | sample += (wchar_t)code; | ||
| 563 | // disabled filters: skip the sample if Unicode_Is_There_Utf16SurrogateError(sample) or, | ||
| 564 | // in non-Windows builds, Unicode_Is_There_BmpEscape(sample) is true | ||
| 565 | ConvertUnicodeToUTF8(sample, utf8); | ||
| 566 | U_to_A(native, sample, CP_OEMCP); | ||
| 567 | if (utf8 != native) | ||
| 568 | return false; | ||
| 569 | code += (code >> 2) + 1; | ||
| 570 | } | ||
| 571 | return true; | ||
| 572 | } | ||
| 573 | |||
| 574 | #endif | ||
| 575 | |||
| 576 | |||
| 577 | #ifdef ENV_HAVE_LOCALE | ||
| 578 | |||
| 579 | // Returns the name of the current LC_CTYPE locale of the program. | ||
| 580 | // With ENV_HAVE_LOCALE it is queried with setlocale(LC_CTYPE, NULL); "C" is returned if that call returns NULL. | ||
| 581 | // The result is never NULL. | ||
| 582 | const char *GetLocale(void) | ||
| 583 | { | ||
| 584 | #ifdef ENV_HAVE_LOCALE | ||
| 585 | // printf("\n\nsetlocale(LC_CTYPE, NULL) : return : "); | ||
| 586 | // a NULL locale argument only queries the current setting | ||
| 587 | const char *name = setlocale(LC_CTYPE, NULL); | ||
| 588 | |||
| 589 | // note: ubuntu returns "C" after program start | ||
| 590 | // printf("\"%s\"\n", name); | ||
| 591 | if (name) | ||
| 592 | return name; | ||
| 593 | // printf("[NULL]\n"); | ||
| 594 | return "C"; | ||
| 595 | #elif defined(LOCALE_IS_UTF8) | ||
| 596 | return "utf8"; | ||
| 597 | #else | ||
| 598 | return "C"; | ||
| 599 | #endif | ||
| 600 | } | ||
| 601 | |||
| 602 | #ifdef _WIN32 | ||
| 603 | static void Set_ForceToUTF8(bool) {} // g_ForceToUTF8 is defined only in the non-Windows branch of this file: nothing to set | ||
| 604 | #else | ||
| 605 | static void Set_ForceToUTF8(bool val) { g_ForceToUTF8 = val; } // stores the flag that the non-Windows converters test | ||
| 606 | #endif | ||
| 607 | |||
| 608 | // true if locale is "", "C" or "POSIX" (compared with IsEqualTo_Ascii_NoCase()) | ||
| 609 | static bool Is_Default_Basic_Locale(const char *locale) | ||
| 610 | { | ||
| 611 | const AString name (locale); | ||
| 612 | return name.IsEqualTo_Ascii_NoCase("") | ||
| 613 | || name.IsEqualTo_Ascii_NoCase("C") | ||
| 614 | || name.IsEqualTo_Ascii_NoCase("POSIX"); | ||
| 615 | } | ||
| 616 | |||
| 617 | static bool Is_Default_Basic_Locale() // same test applied to the locale name returned by GetLocale() | ||
| 618 | { | ||
| 619 | const char * const current = GetLocale(); | ||
| 620 | return Is_Default_Basic_Locale(current); | ||
| 621 | } | ||
| 622 | |||
| 623 | |||
| 624 | // Selects the locale of the program and decides whether the string converters in this file are forced to UTF-8. | ||
| 625 | // Calls setlocale(LC_ALL, ...) with "", then with a UTF-8 locale ("C.UTF-8", or "en_US.UTF-8" on __APPLE__), then with "" | ||
| 626 | // again; it stops as soon as IsNativeUTF8() is true or the resulting locale is not a basic one ("", "C", "POSIX"). | ||
| 627 | // Set_ForceToUTF8(true) is called if IsNativeUTF8() is true or the locale is a basic one; otherwise Set_ForceToUTF8(false). | ||
| 628 | void MY_SetLocale() | ||
| 629 | { | ||
| 630 | #ifdef ENV_HAVE_LOCALE | ||
| 631 | // passes i == 0 and i == 2 take the locale from the environment (""); | ||
| 632 | // pass i == 1 tries an explicit UTF-8 locale | ||
| 633 | |||
| 634 | unsigned start = 0; | ||
| 635 | // unsigned lim = 0; | ||
| 636 | unsigned lim = 3; | ||
| 637 | |||
| 638 | /* | ||
| 639 | #define MY_SET_LOCALE_FLAGS__FROM_ENV 1 | ||
| 640 | #define MY_SET_LOCALE_FLAGS__TRY_UTF8 2 | ||
| 641 | |||
| 642 | unsigned flags = | ||
| 643 | MY_SET_LOCALE_FLAGS__FROM_ENV | | ||
| 644 | MY_SET_LOCALE_FLAGS__TRY_UTF8 | ||
| 645 | |||
| 646 | if (flags != 0) | ||
| 647 | { | ||
| 648 | if (flags & MY_SET_LOCALE_FLAGS__FROM_ENV) | ||
| 649 | lim = (flags & MY_SET_LOCALE_FLAGS__TRY_UTF8) ? 3 : 1; | ||
| 650 | else | ||
| 651 | { | ||
| 652 | start = 1; | ||
| 653 | lim = 2; | ||
| 654 | } | ||
| 655 | } | ||
| 656 | */ | ||
| 657 | |||
| 658 | for (unsigned i = start; i < lim; i++) | ||
| 659 | { | ||
| 660 | /* | ||
| 661 | man7: "If locale is an empty string, "", each part of the locale that | ||
| 662 | should be modified is set according to the environment variables. | ||
| 663 | for glibc: glibc, first from the user's environment variables: | ||
| 664 | 1) the environment variable LC_ALL, | ||
| 665 | 2) environment variable with the same name as the category (see the | ||
| 666 | 3) the environment variable LANG | ||
| 667 | The locale "C" or "POSIX" is a portable locale; it exists on all conforming systems. | ||
| 668 | |||
| 669 | for WIN32 : MSDN : | ||
| 670 | Sets the locale to the default, which is the user-default | ||
| 671 | ANSI code page obtained from the operating system. | ||
| 672 | The locale name is set to the value returned by GetUserDefaultLocaleName. | ||
| 673 | The code page is set to the value returned by GetACP | ||
| 674 | */ | ||
| 675 | const char *newLocale = ""; | ||
| 676 | |||
| 677 | #ifdef __APPLE__ | ||
| 678 | |||
| 679 | /* look also CFLocale | ||
| 680 | there is no C.UTF-8 in macos | ||
| 681 | macos has UTF-8 locale only with some language like en_US.UTF-8 | ||
| 682 | what is best way to set UTF-8 locale in macos? */ | ||
| 683 | if (i == 1) | ||
| 684 | newLocale = "en_US.UTF-8"; | ||
| 685 | |||
| 686 | /* file open with non-utf8 sequences return | ||
| 687 | #define EILSEQ 92 // "Illegal byte sequence" | ||
| 688 | */ | ||
| 689 | #else | ||
| 690 | // newLocale = "C"; | ||
| 691 | if (i == 1) | ||
| 692 | { | ||
| 693 | newLocale = "C.UTF-8"; // main UTF-8 locale in ubuntu | ||
| 694 | // newLocale = ".utf8"; // supported in new Windows 10 build 17134 (April 2018 Update), the Universal C Runtime | ||
| 695 | // newLocale = "en_US.utf8"; // supported by ubuntu ? | ||
| 696 | // newLocale = "en_US.UTF-8"; | ||
| 697 | /* setlocale() in ubuntu allows locales with minor character changes in strings | ||
| 698 | "en_US.UTF-8" / "en_US.utf8" */ | ||
| 699 | } | ||
| 700 | |||
| 701 | #endif | ||
| 702 | |||
| 703 | // printf("\nsetlocale(LC_ALL, \"%s\") : returned: ", newLocale); | ||
| 704 | |||
| 705 | // const char *s = | ||
| 706 | setlocale(LC_ALL, newLocale); | ||
| 707 | |||
| 708 | /* | ||
| 709 | if (!s) | ||
| 710 | printf("NULL: can't set locale"); | ||
| 711 | else | ||
| 712 | printf("\"%s\"\n", s); | ||
| 713 | */ | ||
| 714 | |||
| 715 | // request current locale of program | ||
| 716 | const char *locale = GetLocale(); | ||
| 717 | if (locale) | ||
| 718 | { | ||
| 719 | // the locale name itself is not inspected here: | ||
| 720 | // IsNativeUTF8() checks the actual conversion result instead | ||
| 721 | // disabled name check: if (a.Find("utf") >= 0), where a was the lower-cased locale name | ||
| 722 | { | ||
| 723 | if (IsNativeUTF8()) | ||
| 724 | { | ||
| 725 | Set_ForceToUTF8(true); | ||
| 726 | return; | ||
| 727 | } | ||
| 728 | } | ||
| 729 | if (!Is_Default_Basic_Locale(locale)) | ||
| 730 | { | ||
| 731 | // if there is some non-default and non-utf locale, we want to use it | ||
| 732 | break; // comment it for debug | ||
| 733 | } | ||
| 734 | } | ||
| 735 | } | ||
| 736 | |||
| 737 | if (IsNativeUTF8()) | ||
| 738 | { | ||
| 739 | Set_ForceToUTF8(true); | ||
| 740 | return; | ||
| 741 | } | ||
| 742 | |||
| 743 | if (Is_Default_Basic_Locale()) | ||
| 744 | { | ||
| 745 | Set_ForceToUTF8(true); | ||
| 746 | return; | ||
| 747 | } | ||
| 748 | |||
| 749 | Set_ForceToUTF8(false); | ||
| 750 | |||
| 751 | #elif defined(LOCALE_IS_UTF8) | ||
| 752 | // assume LC_CTYPE="utf8" | ||
| 753 | #else | ||
| 754 | // assume LC_CTYPE="C" | ||
| 755 | #endif | ||
| 756 | } | ||
| 757 | #endif | ||
