diff options
author | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2021-12-27 00:00:00 +0000 |
---|---|---|
committer | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2022-03-18 15:35:13 +0500 |
commit | f19f813537c7aea1c20749c914e756b54a9c3cf5 (patch) | |
tree | 816ba62ca7c0fa19f2eb46d9e9d6f7dd7c3a744d /CPP/Common/StringConvert.cpp | |
parent | 98e06a519b63b81986abe76d28887f6984a7732b (diff) | |
download | 7zip-21.07.tar.gz 7zip-21.07.tar.bz2 7zip-21.07.zip |
'21.07'21.07
Diffstat (limited to 'CPP/Common/StringConvert.cpp')
-rw-r--r-- | CPP/Common/StringConvert.cpp | 757 |
1 files changed, 757 insertions, 0 deletions
diff --git a/CPP/Common/StringConvert.cpp b/CPP/Common/StringConvert.cpp new file mode 100644 index 0000000..c0bde0f --- /dev/null +++ b/CPP/Common/StringConvert.cpp | |||
@@ -0,0 +1,757 @@ | |||
1 | // Common/StringConvert.cpp | ||
2 | |||
3 | #include "StdAfx.h" | ||
4 | |||
5 | #include "StringConvert.h" | ||
6 | |||
7 | #ifndef _WIN32 | ||
8 | // #include <stdio.h> | ||
9 | #include <stdlib.h> | ||
10 | #endif | ||
11 | |||
12 | #if !defined(_WIN32) || defined(ENV_HAVE_LOCALE) | ||
13 | #include "UTFConvert.h" | ||
14 | #endif | ||
15 | |||
16 | #ifdef ENV_HAVE_LOCALE | ||
17 | #include <locale.h> | ||
18 | #endif | ||
19 | |||
20 | static const char k_DefultChar = '_'; | ||
21 | |||
22 | #ifdef _WIN32 | ||
23 | |||
24 | /* | ||
25 | MultiByteToWideChar(CodePage, DWORD dwFlags, | ||
26 | LPCSTR lpMultiByteStr, int cbMultiByte, | ||
27 | LPWSTR lpWideCharStr, int cchWideChar) | ||
28 | |||
29 | if (cbMultiByte == 0) | ||
30 | return: 0. ERR: ERROR_INVALID_PARAMETER | ||
31 | |||
32 | if (cchWideChar == 0) | ||
33 | return: the required buffer size in characters. | ||
34 | |||
35 | if (supplied buffer size was not large enough) | ||
36 | return: 0. ERR: ERROR_INSUFFICIENT_BUFFER | ||
37 | The number of filled characters in lpWideCharStr can be smaller than cchWideChar (if last character is complex) | ||
38 | |||
39 | If there are illegal characters: | ||
40 | if MB_ERR_INVALID_CHARS is set in dwFlags: | ||
41 | - the function stops conversion on illegal character. | ||
42 | - Return: 0. ERR: ERROR_NO_UNICODE_TRANSLATION. | ||
43 | |||
44 | if MB_ERR_INVALID_CHARS is NOT set in dwFlags: | ||
45 | before Vista: illegal character is dropped (skipped). WinXP-64: GetLastError() returns 0. | ||
46 | in Vista+: illegal character is not dropped (MSDN). Undocumented: illegal | ||
47 | character is converted to U+FFFD, which is REPLACEMENT CHARACTER. | ||
48 | */ | ||
49 | |||
50 | |||
51 | void MultiByteToUnicodeString2(UString &dest, const AString &src, UINT codePage) | ||
52 | { | ||
53 | dest.Empty(); | ||
54 | if (src.IsEmpty()) | ||
55 | return; | ||
56 | { | ||
57 | /* | ||
58 | wchar_t *d = dest.GetBuf(src.Len()); | ||
59 | const char *s = (const char *)src; | ||
60 | unsigned i; | ||
61 | |||
62 | for (i = 0;;) | ||
63 | { | ||
64 | Byte c = (Byte)s[i]; | ||
65 | if (c >= 0x80 || c == 0) | ||
66 | break; | ||
67 | d[i++] = (wchar_t)c; | ||
68 | } | ||
69 | |||
70 | if (i != src.Len()) | ||
71 | { | ||
72 | unsigned len = MultiByteToWideChar(codePage, 0, s + i, | ||
73 | src.Len() - i, d + i, | ||
74 | src.Len() + 1 - i); | ||
75 | if (len == 0) | ||
76 | throw 282228; | ||
77 | i += len; | ||
78 | } | ||
79 | |||
80 | d[i] = 0; | ||
81 | dest.ReleaseBuf_SetLen(i); | ||
82 | */ | ||
83 | unsigned len = (unsigned)MultiByteToWideChar(codePage, 0, src, (int)src.Len(), NULL, 0); | ||
84 | if (len == 0) | ||
85 | { | ||
86 | if (GetLastError() != 0) | ||
87 | throw 282228; | ||
88 | } | ||
89 | else | ||
90 | { | ||
91 | len = (unsigned)MultiByteToWideChar(codePage, 0, src, (int)src.Len(), dest.GetBuf(len), (int)len); | ||
92 | if (len == 0) | ||
93 | throw 282228; | ||
94 | dest.ReleaseBuf_SetEnd(len); | ||
95 | } | ||
96 | } | ||
97 | } | ||
98 | |||
99 | /* | ||
100 | int WideCharToMultiByte( | ||
101 | UINT CodePage, DWORD dwFlags, | ||
102 | LPCWSTR lpWideCharStr, int cchWideChar, | ||
103 | LPSTR lpMultiByteStr, int cbMultiByte, | ||
104 | LPCSTR lpDefaultChar, LPBOOL lpUsedDefaultChar); | ||
105 | |||
106 | if (lpDefaultChar == NULL), | ||
107 | - it uses system default value. | ||
108 | |||
109 | if (CodePage == CP_UTF7 || CodePage == CP_UTF8) | ||
110 | if (lpDefaultChar != NULL || lpUsedDefaultChar != NULL) | ||
111 | return: 0. ERR: ERROR_INVALID_PARAMETER. | ||
112 | |||
113 | The function operates most efficiently, if (lpDefaultChar == NULL && lpUsedDefaultChar == NULL) | ||
114 | |||
115 | */ | ||
116 | |||
117 | static void UnicodeStringToMultiByte2(AString &dest, const UString &src, UINT codePage, char defaultChar, bool &defaultCharWasUsed) | ||
118 | { | ||
119 | dest.Empty(); | ||
120 | defaultCharWasUsed = false; | ||
121 | if (src.IsEmpty()) | ||
122 | return; | ||
123 | { | ||
124 | /* | ||
125 | unsigned numRequiredBytes = src.Len() * 2; | ||
126 | char *d = dest.GetBuf(numRequiredBytes); | ||
127 | const wchar_t *s = (const wchar_t *)src; | ||
128 | unsigned i; | ||
129 | |||
130 | for (i = 0;;) | ||
131 | { | ||
132 | wchar_t c = s[i]; | ||
133 | if (c >= 0x80 || c == 0) | ||
134 | break; | ||
135 | d[i++] = (char)c; | ||
136 | } | ||
137 | |||
138 | if (i != src.Len()) | ||
139 | { | ||
140 | BOOL defUsed = FALSE; | ||
141 | defaultChar = defaultChar; | ||
142 | |||
143 | bool isUtf = (codePage == CP_UTF8 || codePage == CP_UTF7); | ||
144 | unsigned len = WideCharToMultiByte(codePage, 0, s + i, src.Len() - i, | ||
145 | d + i, numRequiredBytes + 1 - i, | ||
146 | (isUtf ? NULL : &defaultChar), | ||
147 | (isUtf ? NULL : &defUsed)); | ||
148 | defaultCharWasUsed = (defUsed != FALSE); | ||
149 | if (len == 0) | ||
150 | throw 282229; | ||
151 | i += len; | ||
152 | } | ||
153 | |||
154 | d[i] = 0; | ||
155 | dest.ReleaseBuf_SetLen(i); | ||
156 | */ | ||
157 | |||
158 | /* | ||
159 | if (codePage != CP_UTF7) | ||
160 | { | ||
161 | const wchar_t *s = (const wchar_t *)src; | ||
162 | unsigned i; | ||
163 | for (i = 0;; i++) | ||
164 | { | ||
165 | wchar_t c = s[i]; | ||
166 | if (c >= 0x80 || c == 0) | ||
167 | break; | ||
168 | } | ||
169 | |||
170 | if (s[i] == 0) | ||
171 | { | ||
172 | char *d = dest.GetBuf(src.Len()); | ||
173 | for (i = 0;;) | ||
174 | { | ||
175 | wchar_t c = s[i]; | ||
176 | if (c == 0) | ||
177 | break; | ||
178 | d[i++] = (char)c; | ||
179 | } | ||
180 | d[i] = 0; | ||
181 | dest.ReleaseBuf_SetLen(i); | ||
182 | return; | ||
183 | } | ||
184 | } | ||
185 | */ | ||
186 | |||
187 | unsigned len = (unsigned)WideCharToMultiByte(codePage, 0, src, (int)src.Len(), NULL, 0, NULL, NULL); | ||
188 | if (len == 0) | ||
189 | { | ||
190 | if (GetLastError() != 0) | ||
191 | throw 282228; | ||
192 | } | ||
193 | else | ||
194 | { | ||
195 | BOOL defUsed = FALSE; | ||
196 | bool isUtf = (codePage == CP_UTF8 || codePage == CP_UTF7); | ||
197 | // defaultChar = defaultChar; | ||
198 | len = (unsigned)WideCharToMultiByte(codePage, 0, src, (int)src.Len(), | ||
199 | dest.GetBuf(len), (int)len, | ||
200 | (isUtf ? NULL : &defaultChar), | ||
201 | (isUtf ? NULL : &defUsed) | ||
202 | ); | ||
203 | if (!isUtf) | ||
204 | defaultCharWasUsed = (defUsed != FALSE); | ||
205 | if (len == 0) | ||
206 | throw 282228; | ||
207 | dest.ReleaseBuf_SetEnd(len); | ||
208 | } | ||
209 | } | ||
210 | } | ||
211 | |||
212 | /* | ||
213 | #ifndef UNDER_CE | ||
214 | AString SystemStringToOemString(const CSysString &src) | ||
215 | { | ||
216 | AString dest; | ||
217 | const unsigned len = src.Len() * 2; | ||
218 | CharToOem(src, dest.GetBuf(len)); | ||
219 | dest.ReleaseBuf_CalcLen(len); | ||
220 | return dest; | ||
221 | } | ||
222 | #endif | ||
223 | */ | ||
224 | |||
225 | #else // _WIN32 | ||
226 | |||
227 | // #include <stdio.h> | ||
228 | /* | ||
229 | if (wchar_t is 32-bit (#if WCHAR_MAX > 0xffff), | ||
230 | and utf-8 string contains big unicode character > 0xffff), | ||
231 | then we still use 16-bit surrogate pair in UString. | ||
232 | It simplifies another code where utf-16 encoding is used. | ||
233 | So we use surrogate-conversion code only in this file. | |
234 | */ | ||
235 | |||
236 | /* | ||
237 | mbstowcs() returns error if there is error in utf-8 stream, | ||
238 | mbstowcs() returns error if there is a single surrogate code point (d800-dfff) in utf-8 stream | |
239 | */ | ||
240 | |||
241 | /* | ||
242 | static void MultiByteToUnicodeString2_Native(UString &dest, const AString &src) | ||
243 | { | ||
244 | dest.Empty(); | ||
245 | if (src.IsEmpty()) | ||
246 | return; | ||
247 | |||
248 | const size_t limit = ((size_t)src.Len() + 1) * 2; | ||
249 | wchar_t *d = dest.GetBuf((unsigned)limit); | ||
250 | const size_t len = mbstowcs(d, src, limit); | ||
251 | if (len != (size_t)-1) | ||
252 | { | ||
253 | dest.ReleaseBuf_SetEnd((unsigned)len); | ||
254 | return; | ||
255 | } | ||
256 | dest.ReleaseBuf_SetEnd(0); | ||
257 | } | ||
258 | */ | ||
259 | |||
260 | bool g_ForceToUTF8 = true; // false; | ||
261 | |||
262 | void MultiByteToUnicodeString2(UString &dest, const AString &src, UINT codePage) | ||
263 | { | ||
264 | dest.Empty(); | ||
265 | if (src.IsEmpty()) | ||
266 | return; | ||
267 | |||
268 | if (codePage == CP_UTF8 || g_ForceToUTF8) | ||
269 | { | ||
270 | ConvertUTF8ToUnicode(src, dest); | ||
271 | return; | ||
272 | } | ||
273 | |||
274 | const size_t limit = ((size_t)src.Len() + 1) * 2; | ||
275 | wchar_t *d = dest.GetBuf((unsigned)limit); | ||
276 | const size_t len = mbstowcs(d, src, limit); | ||
277 | if (len != (size_t)-1) | ||
278 | { | ||
279 | dest.ReleaseBuf_SetEnd((unsigned)len); | ||
280 | |||
281 | #if WCHAR_MAX > 0xffff | ||
282 | d = dest.GetBuf(); | ||
283 | for (size_t i = 0;; i++) | ||
284 | { | ||
285 | // wchar_t c = dest[i]; | ||
286 | wchar_t c = d[i]; | ||
287 | if (c == 0) | ||
288 | break; | ||
289 | if (c >= 0x10000 && c < 0x110000) | ||
290 | { | ||
291 | /* | ||
292 | c -= 0x10000; | ||
293 | unsigned c0 = 0xd800 + ((c >> 10) & 0x3FF); | ||
294 | dest.ReplaceOneCharAtPos(i, c0); | ||
295 | i++; | ||
296 | c = 0xdc00 + (c & 0x3FF); | ||
297 | dest.Insert_wchar_t(i, c); | ||
298 | */ | ||
299 | UString temp = d + i; | ||
300 | |||
301 | for (size_t t = 0;; t++) | ||
302 | { | ||
303 | wchar_t w = temp[t]; | ||
304 | if (w == 0) | ||
305 | break; | ||
306 | if (i == limit) | ||
307 | break; // unexpected error | ||
308 | if (w >= 0x10000 && w < 0x110000) | ||
309 | { | ||
310 | if (i + 1 == limit) | ||
311 | break; // unexpected error | ||
312 | w -= 0x10000; | ||
313 | d[i++] = (unsigned)0xd800 + (((unsigned)w >> 10) & 0x3FF); | ||
314 | w = 0xdc00 + (w & 0x3FF); | ||
315 | } | ||
316 | d[i++] = w; | ||
317 | } | ||
318 | dest.ReleaseBuf_SetEnd((unsigned)i); | ||
319 | } | ||
320 | } | ||
321 | |||
322 | #endif | ||
323 | |||
324 | /* | ||
325 | printf("\nMultiByteToUnicodeString2 (%d) %s\n", (int)src.Len(), src.Ptr()); | ||
326 | printf("char: "); | ||
327 | for (unsigned i = 0; i < src.Len(); i++) | ||
328 | printf (" %02x", (int)(Byte)src[i]); | ||
329 | printf("\n"); | ||
330 | printf("\n-> (%d) %ls\n", (int)dest.Len(), dest.Ptr()); | ||
331 | printf("wchar_t: "); | ||
332 | for (unsigned i = 0; i < dest.Len(); i++) | ||
333 | { | ||
334 | printf (" %02x", (int)dest[i]); | ||
335 | } | ||
336 | printf("\n"); | ||
337 | */ | ||
338 | |||
339 | return; | ||
340 | } | ||
341 | |||
342 | /* if there is mbstowcs() error, we have two ways: | ||
343 | |||
344 | 1) change 0x80+ characters to some character: '_' | ||
345 | in that case we lose data, but we have correct UString() | ||
346 | and that scheme can show errors to user in early stages, | ||
347 | when file converted back to mbs() cannot be found | ||
348 | |||
349 | 2) transfer bad characters in some UTF-16 range. | ||
350 | it can be non-original Unicode character. | ||
351 | but later we still can restore original character. | ||
352 | */ | ||
353 | |||
354 | |||
355 | // printf("\nmbstowcs ERROR !!!!!! s=%s\n", src.Ptr()); | ||
356 | { | ||
357 | unsigned i; | ||
358 | const char *s = (const char *)src; | ||
359 | for (i = 0;;) | ||
360 | { | ||
361 | Byte c = (Byte)s[i]; | ||
362 | if (c == 0) | ||
363 | break; | ||
364 | // we can use ascii compatibilty character '_' | ||
365 | // if (c > 0x7F) c = '_'; // we replace "bad: character | ||
366 | d[i++] = (wchar_t)c; | ||
367 | } | ||
368 | d[i] = 0; | ||
369 | dest.ReleaseBuf_SetLen(i); | ||
370 | } | ||
371 | } | ||
372 | |||
373 | static void UnicodeStringToMultiByte2_Native(AString &dest, const UString &src) | ||
374 | { | ||
375 | dest.Empty(); | ||
376 | if (src.IsEmpty()) | ||
377 | return; | ||
378 | |||
379 | const size_t limit = ((size_t)src.Len() + 1) * 6; | ||
380 | char *d = dest.GetBuf((unsigned)limit); | ||
381 | |||
382 | const size_t len = wcstombs(d, src, limit); | ||
383 | |||
384 | if (len != (size_t)-1) | ||
385 | { | ||
386 | dest.ReleaseBuf_SetEnd((unsigned)len); | ||
387 | return; | ||
388 | } | ||
389 | dest.ReleaseBuf_SetEnd(0); | ||
390 | } | ||
391 | |||
392 | |||
393 | static void UnicodeStringToMultiByte2(AString &dest, const UString &src2, UINT codePage, char defaultChar, bool &defaultCharWasUsed) | ||
394 | { | ||
395 | // if (codePage == 1234567) // for debug purposes | ||
396 | if (codePage == CP_UTF8 || g_ForceToUTF8) | ||
397 | { | ||
398 | defaultCharWasUsed = false; | ||
399 | ConvertUnicodeToUTF8(src2, dest); | ||
400 | return; | ||
401 | } | ||
402 | |||
403 | UString src = src2; | ||
404 | #if WCHAR_MAX > 0xffff | ||
405 | { | ||
406 | src.Empty(); | ||
407 | for (unsigned i = 0; i < src2.Len();) | ||
408 | { | ||
409 | wchar_t c = src2[i]; | ||
410 | if (c >= 0xd800 && c < 0xdc00 && i + 1 != src2.Len()) | ||
411 | { | ||
412 | const wchar_t c2 = src2[i + 1]; | ||
413 | if (c2 >= 0xdc00 && c2 < 0x10000) | ||
414 | { | ||
415 | // printf("\nSurragate [%d]: %4x %4x -> ", i, (int)c, (int)c2); | ||
416 | c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff); | ||
417 | // printf("%4x\n", (int)c); | ||
418 | i++; | ||
419 | } | ||
420 | } | ||
421 | src += c; | ||
422 | i++; | ||
423 | } | ||
424 | } | ||
425 | #endif | ||
426 | |||
427 | dest.Empty(); | ||
428 | defaultCharWasUsed = false; | ||
429 | if (src.IsEmpty()) | ||
430 | return; | ||
431 | |||
432 | const size_t len = wcstombs(NULL, src, 0); | ||
433 | |||
434 | if (len != (size_t)-1) | ||
435 | { | ||
436 | const unsigned limit = ((unsigned)len); | ||
437 | if (limit == len) | ||
438 | { | ||
439 | char *d = dest.GetBuf(limit); | ||
440 | |||
441 | /* | ||
442 | { | ||
443 | printf("\nwcstombs; len = %d %ls \n", (int)src.Len(), src.Ptr()); | ||
444 | for (unsigned i = 0; i < src.Len(); i++) | ||
445 | printf (" %02x", (int)src[i]); | ||
446 | printf("\n"); | ||
447 | printf("\ndest Limit = %d \n", limit); | ||
448 | } | ||
449 | */ | ||
450 | |||
451 | const size_t len2 = wcstombs(d, src, len + 1); | ||
452 | |||
453 | if (len2 != (size_t)-1 && len2 <= limit) | ||
454 | { | ||
455 | /* | ||
456 | printf("\nOK : destLen = %d : %s\n", (int)len, dest.Ptr()); | ||
457 | for (unsigned i = 0; i < len2; i++) | ||
458 | printf(" %02x", (int)(Byte)dest[i]); | ||
459 | printf("\n"); | ||
460 | */ | ||
461 | dest.ReleaseBuf_SetEnd((unsigned)len2); | ||
462 | return; | ||
463 | } | ||
464 | } | ||
465 | } | ||
466 | |||
467 | { | ||
468 | const wchar_t *s = (const wchar_t *)src; | ||
469 | char *d = dest.GetBuf(src.Len()); | ||
470 | |||
471 | unsigned i; | ||
472 | for (i = 0;;) | ||
473 | { | ||
474 | wchar_t c = s[i]; | ||
475 | if (c == 0) | ||
476 | break; | ||
477 | if (c >= | ||
478 | 0x100 | ||
479 | // 0x80 | ||
480 | ) | ||
481 | { | ||
482 | c = defaultChar; | ||
483 | defaultCharWasUsed = true; | ||
484 | } | ||
485 | |||
486 | d[i++] = (char)c; | ||
487 | } | ||
488 | d[i] = 0; | ||
489 | dest.ReleaseBuf_SetLen(i); | ||
490 | /* | ||
491 | printf("\nUnicodeStringToMultiByte2; len = %d \n", (int)src.Len()); | ||
492 | printf("ERROR: %s\n", dest.Ptr()); | ||
493 | */ | ||
494 | } | ||
495 | } | ||
496 | |||
497 | #endif // _WIN32 | ||
498 | |||
499 | |||
500 | UString MultiByteToUnicodeString(const AString &src, UINT codePage) | ||
501 | { | ||
502 | UString dest; | ||
503 | MultiByteToUnicodeString2(dest, src, codePage); | ||
504 | return dest; | ||
505 | } | ||
506 | |||
507 | UString MultiByteToUnicodeString(const char *src, UINT codePage) | ||
508 | { | ||
509 | return MultiByteToUnicodeString(AString(src), codePage); | ||
510 | } | ||
511 | |||
512 | |||
513 | void UnicodeStringToMultiByte2(AString &dest, const UString &src, UINT codePage) | ||
514 | { | ||
515 | bool defaultCharWasUsed; | ||
516 | UnicodeStringToMultiByte2(dest, src, codePage, k_DefultChar, defaultCharWasUsed); | ||
517 | } | ||
518 | |||
519 | AString UnicodeStringToMultiByte(const UString &src, UINT codePage, char defaultChar, bool &defaultCharWasUsed) | ||
520 | { | ||
521 | AString dest; | ||
522 | UnicodeStringToMultiByte2(dest, src, codePage, defaultChar, defaultCharWasUsed); | ||
523 | return dest; | ||
524 | } | ||
525 | |||
526 | AString UnicodeStringToMultiByte(const UString &src, UINT codePage) | ||
527 | { | ||
528 | AString dest; | ||
529 | bool defaultCharWasUsed; | ||
530 | UnicodeStringToMultiByte2(dest, src, codePage, k_DefultChar, defaultCharWasUsed); | ||
531 | return dest; | ||
532 | } | ||
533 | |||
534 | |||
535 | |||
536 | |||
537 | |||
/*
  U_to_A(dest, src, codePage) : wide string -> multi-byte string, as the system does it.
    _WIN32 : UnicodeStringToMultiByte2(dest, src, codePage)
    other  : UnicodeStringToMultiByte2_Native(dest, src), (codePage) is not used.

  The macro must pass its arguments to the function: a function-like macro
  that expands to the function name only makes no call at all.
*/
#ifdef _WIN32
#define U_to_A(a, b, c)  UnicodeStringToMultiByte2(a, b, c)
// #define A_to_U(a, b, c)  MultiByteToUnicodeString2(a, b, c)
#else
// void MultiByteToUnicodeString2_Native(UString &dest, const AString &src);
#define U_to_A(a, b, c)  UnicodeStringToMultiByte2_Native(a, b)
// #define A_to_U(a, b, c)  MultiByteToUnicodeString2_Native(a, b)
#endif
546 | |||
547 | #if !defined(_WIN32) || defined(ENV_HAVE_LOCALE) | ||
548 | |||
549 | bool IsNativeUTF8() | ||
550 | { | ||
551 | UString u; | ||
552 | AString a, a2; | ||
553 | // for (unsigned c = 0x80; c < (UInt32)0x10000; c += (c >> 9) + 1) | ||
554 | for (unsigned c = 0x80; c < (UInt32)0xD000; c += (c >> 2) + 1) | ||
555 | { | ||
556 | u.Empty(); | ||
557 | u += (wchar_t)c; | ||
558 | /* | ||
559 | if (Unicode_Is_There_Utf16SurrogateError(u)) | ||
560 | continue; | ||
561 | #ifndef _WIN32 | ||
562 | if (Unicode_Is_There_BmpEscape(u)) | ||
563 | continue; | ||
564 | #endif | ||
565 | */ | ||
566 | ConvertUnicodeToUTF8(u, a); | ||
567 | U_to_A(a2, u, CP_OEMCP); | ||
568 | if (a != a2) | ||
569 | return false; | ||
570 | } | ||
571 | return true; | ||
572 | } | ||
573 | |||
574 | #endif | ||
575 | |||
576 | |||
577 | #ifdef ENV_HAVE_LOCALE | ||
578 | |||
/*
  Returns the name of the current LC_CTYPE locale.
  The locale is only requested here, it is not changed.
  If setlocale() returns NULL, "C" is returned, so the result is never NULL.
  (ubuntu returns "C" after program start)
*/
const char *GetLocale(void)
{
  #ifdef ENV_HAVE_LOCALE
  const char *current = setlocale(LC_CTYPE, NULL);
  return current ? current : "C";
  #elif defined(LOCALE_IS_UTF8)
  return "utf8";
  #else
  return "C";
  #endif
}
601 | |||
602 | #ifdef _WIN32 | ||
603 | static void Set_ForceToUTF8(bool) {} | ||
604 | #else | ||
605 | static void Set_ForceToUTF8(bool val) { g_ForceToUTF8 = val; } | ||
606 | #endif | ||
607 | |||
608 | static bool Is_Default_Basic_Locale(const char *locale) | ||
609 | { | ||
610 | const AString a (locale); | ||
611 | if (a.IsEqualTo_Ascii_NoCase("") | ||
612 | || a.IsEqualTo_Ascii_NoCase("C") | ||
613 | || a.IsEqualTo_Ascii_NoCase("POSIX")) | ||
614 | return true; | ||
615 | return false; | ||
616 | } | ||
617 | |||
618 | static bool Is_Default_Basic_Locale() | ||
619 | { | ||
620 | return Is_Default_Basic_Locale(GetLocale()); | ||
621 | } | ||
622 | |||
623 | |||
/*
  Selects the locale of the program and sets the "force UTF-8" mode.

  Up to 3 attempts of setlocale(LC_ALL, name) are made:
    attempt 0 : ""         - locale from environment
    attempt 1 : "C.UTF-8"  - ("en_US.UTF-8" for __APPLE__)
    attempt 2 : ""         - locale from environment again

  After each attempt:
    - if IsNativeUTF8() is true, the force-UTF-8 mode is set and we return.
    - if the current locale is not basic ("", "C", "POSIX"), we keep that
      locale and stop the attempts.

  After the loop the force-UTF-8 mode is set, if IsNativeUTF8() is true or
  if the current locale is basic. Otherwise the mode is cleared.

  setlocale() notes:
    man7: if locale is an empty string, "", each part of the locale that
      should be modified is set according to the environment variables.
      glibc checks: 1) LC_ALL, 2) the variable with the name of the
      category, 3) LANG.
      The locale "C" or "POSIX" is a portable locale; it exists on all
      conforming systems.
    WIN32 (MSDN): "" sets the user-default ANSI code page obtained from
      the operating system (GetACP).
*/
void MY_SetLocale()
{
  #ifdef ENV_HAVE_LOCALE

  const unsigned kFirstAttempt = 0;
  const unsigned kNumAttempts = 3;

  for (unsigned attempt = kFirstAttempt; attempt < kNumAttempts; attempt++)
  {
    const char *candidate = "";

    #ifdef __APPLE__

    /* look also CFLocale
       there is no C.UTF-8 in macos
       macos has UTF-8 locale only with some language like en_US.UTF-8
       what is best way to set UTF-8 locale in macos?
       (file open with non-utf8 sequences returns EILSEQ 92 : "Illegal byte sequence") */
    if (attempt == 1)
      candidate = "en_US.UTF-8";

    #else

    /* "C.UTF-8" is the main UTF-8 locale in ubuntu.
       other possible names: ".utf8" (new Windows 10, Universal C Runtime),
       "en_US.utf8" / "en_US.UTF-8" (setlocale() in ubuntu allows minor
       character changes in such names) */
    if (attempt == 1)
      candidate = "C.UTF-8";

    #endif

    // the result of setlocale() is not checked: the current locale is requested below
    setlocale(LC_ALL, candidate);

    // request current locale of program
    const char *active = GetLocale();
    if (!active)
      continue;

    // lower-case copy of the name: it is for the disabled check (a.Find("utf") >= 0)
    AString lowered (active);
    lowered.MakeLower_Ascii();

    if (IsNativeUTF8())
    {
      Set_ForceToUTF8(true);
      return;
    }

    // if there is some non-default and non-utf locale, we want to use it
    if (!Is_Default_Basic_Locale(active))
      break; // comment it for debug
  }

  // IsNativeUTF8() is checked first; the basic locale check is called only if it is false
  Set_ForceToUTF8(IsNativeUTF8() || Is_Default_Basic_Locale());

  #elif defined(LOCALE_IS_UTF8)
    // assume LC_CTYPE="utf8"
  #else
    // assume LC_CTYPE="C"
  #endif
}
757 | #endif | ||