'21.07'21.07

author: Igor Pavlov <87184205+ip7z@users.noreply.github.com> 2021-12-27 00:00:00 +0000
committer: Igor Pavlov <87184205+ip7z@users.noreply.github.com> 2022-03-18 15:35:13 +0500
commit: f19f813537c7aea1c20749c914e756b54a9c3cf5 (patch)
tree: 816ba62ca7c0fa19f2eb46d9e9d6f7dd7c3a744d /CPP/Common/StringConvert.cpp
parent: 98e06a519b63b81986abe76d28887f6984a7732b (diff)
download: 7zip-f19f813537c7aea1c20749c914e756b54a9c3cf5.tar.gz
7zip-f19f813537c7aea1c20749c914e756b54a9c3cf5.tar.bz2
7zip-f19f813537c7aea1c20749c914e756b54a9c3cf5.zip
1 files changed, 757 insertions, 0 deletions
diff --git a/CPP/Common/StringConvert.cpp b/CPP/Common/StringConvert.cpp
new file mode 100644
index 0000000..c0bde0f
--- /dev/null
+++ b/CPP/Common/StringConvert.cpp
@@ -0,0 +1,757 @@
+// Common/StringConvert.cpp
+#include "StdAfx.h"
+#include "StringConvert.h"
+#ifndef _WIN32
+// #include <stdio.h>
+#include <stdlib.h>
+#endif
+#if !defined(_WIN32) || defined(ENV_HAVE_LOCALE)
+#include "UTFConvert.h"
+#endif
+#ifdef ENV_HAVE_LOCALE
+#include <locale.h>
+#endif
+static const char k_DefultChar = '_';
+#ifdef _WIN32
+/*
+MultiByteToWideChar(CodePage, DWORD dwFlags,
+    LPCSTR lpMultiByteStr, int cbMultiByte,
+    LPWSTR lpWideCharStr, int cchWideChar)
+  if (cbMultiByte == 0)
+    return: 0. ERR: ERROR_INVALID_PARAMETER
+  if (cchWideChar == 0)
+    return: the required buffer size in characters.
+  if (supplied buffer size was not large enough)
+    return: 0. ERR: ERROR_INSUFFICIENT_BUFFER
+    The number of filled characters in lpWideCharStr can be smaller than cchWideChar (if last character is complex)
+  If there are illegal characters:
+    if MB_ERR_INVALID_CHARS is set in dwFlags:
+      - the function stops conversion on illegal character.
+      - Return: 0. ERR: ERROR_NO_UNICODE_TRANSLATION.
+    
+    if MB_ERR_INVALID_CHARS is NOT set in dwFlags:
+      before Vista: illegal character is dropped (skipped). WinXP-64: GetLastError() returns 0.
+      in Vista+:    illegal character is not dropped (MSDN). Undocumented: illegal
+                    character is converted to U+FFFD, which is REPLACEMENT CHARACTER.
+*/
+// Converts the multi-byte string (src), encoded in (codePage), to wide characters in (dest).
+// (dest) is emptied first; an empty (src) leaves (dest) empty.
+// Throws the integer 282228 if MultiByteToWideChar() reports a failure.
+void MultiByteToUnicodeString2(UString &dest, const AString &src, UINT codePage)
+{
+  dest.Empty();
+  if (src.IsEmpty())
+    return;
+
+  const int numSrcBytes = (int)src.Len();
+
+  // pass 1: a zero-sized destination asks the API for the required number of wide characters
+  const unsigned numWideChars = (unsigned)MultiByteToWideChar(codePage, 0, src, numSrcBytes, NULL, 0);
+  if (numWideChars == 0)
+  {
+    // a zero result with a clear last-error code is accepted as an empty string
+    if (GetLastError() != 0)
+      throw 282228;
+    return;
+  }
+
+  // pass 2: the real conversion, written directly into the buffer of (dest)
+  const unsigned numWritten = (unsigned)MultiByteToWideChar(codePage, 0, src, numSrcBytes,
+      dest.GetBuf(numWideChars), (int)numWideChars);
+  if (numWritten == 0)
+    throw 282228;
+  dest.ReleaseBuf_SetEnd(numWritten);
+}
+/*
+  int WideCharToMultiByte(
+      UINT CodePage, DWORD dwFlags,
+      LPCWSTR lpWideCharStr, int cchWideChar,
+      LPSTR lpMultiByteStr, int cbMultiByte,
+      LPCSTR lpDefaultChar, LPBOOL lpUsedDefaultChar);
+if (lpDefaultChar == NULL),
+  - it uses system default value.
+if (CodePage == CP_UTF7 || CodePage == CP_UTF8)
+  if (lpDefaultChar != NULL || lpUsedDefaultChar != NULL)
+    return: 0. ERR: ERROR_INVALID_PARAMETER.
+The function operates most efficiently, if (lpDefaultChar == NULL && lpUsedDefaultChar == NULL)
+*/
+// Converts the wide string (src) to a multi-byte string in (codePage), stored in (dest).
+// defaultChar: substitute character handed to the API; not used for CP_UTF8 / CP_UTF7,
+//   because the API requires NULL default-char arguments for these code pages.
+// defaultCharWasUsed: set to false first; for other code pages it receives the
+//   "default char was used" flag reported by the API.
+// Throws the integer 282228 if WideCharToMultiByte() reports a failure.
+static void UnicodeStringToMultiByte2(AString &dest, const UString &src, UINT codePage, char defaultChar, bool &defaultCharWasUsed)
+{
+  dest.Empty();
+  defaultCharWasUsed = false;
+  if (src.IsEmpty())
+    return;
+
+  const int numSrcChars = (int)src.Len();
+
+  // pass 1: a zero-sized destination asks the API for the required number of bytes
+  const unsigned numBytes = (unsigned)WideCharToMultiByte(codePage, 0, src, numSrcChars, NULL, 0, NULL, NULL);
+  if (numBytes == 0)
+  {
+    // a zero result with a clear last-error code is accepted as an empty string
+    if (GetLastError() != 0)
+      throw 282228;
+    return;
+  }
+
+  const bool isUtf = (codePage == CP_UTF8 || codePage == CP_UTF7);
+  BOOL substituted = FALSE;
+  LPCSTR defCharPtr = NULL;
+  LPBOOL substitutedPtr = NULL;
+  if (!isUtf)
+  {
+    defCharPtr = &defaultChar;
+    substitutedPtr = &substituted;
+  }
+
+  // pass 2: the real conversion, written directly into the buffer of (dest)
+  const unsigned numWritten = (unsigned)WideCharToMultiByte(codePage, 0, src, numSrcChars,
+      dest.GetBuf(numBytes), (int)numBytes,
+      defCharPtr, substitutedPtr);
+
+  // the flag is reported before the failure check, as callers may read it after catching
+  if (!isUtf)
+    defaultCharWasUsed = (substituted != FALSE);
+  if (numWritten == 0)
+    throw 282228;
+  dest.ReleaseBuf_SetEnd(numWritten);
+}
+/*
+#ifndef UNDER_CE
+AString SystemStringToOemString(const CSysString &src)
+{
+  AString dest;
+  const unsigned len = src.Len() * 2;
+  CharToOem(src, dest.GetBuf(len));
+  dest.ReleaseBuf_CalcLen(len);
+  return dest;
+}
+#endif
+*/
+#else // _WIN32
+// #include <stdio.h>
+/*
+  if (wchar_t is 32-bit (#if WCHAR_MAX > 0xffff),
+      and utf-8 string contains big unicode character > 0xffff),
+  then we still use 16-bit surrogate pair in UString.
+  It simplifies another code where utf-16 encoding is used.
+  So we use surrogate-conversion code only in this file.
+*/
+/*
+   mbstowcs() returns error if there is error in utf-8 stream,
+   mbstowcs() returns error if there is single surrogates point (d800-dfff) in utf-8 stream
+*/
+/*
+static void MultiByteToUnicodeString2_Native(UString &dest, const AString &src)
+{
+  dest.Empty();
+  if (src.IsEmpty())
+    return;
+  const size_t limit = ((size_t)src.Len() + 1) * 2;
+  wchar_t *d = dest.GetBuf((unsigned)limit);
+  const size_t len = mbstowcs(d, src, limit);
+  if (len != (size_t)-1)
+  {
+    dest.ReleaseBuf_SetEnd((unsigned)len);
+    return;
+  }
+  dest.ReleaseBuf_SetEnd(0);
+}
+*/
+bool g_ForceToUTF8 = true; // (alternative default: false). If true, the non-Windows converters below use ConvertUTF8ToUnicode() / ConvertUnicodeToUTF8() instead of mbstowcs() / wcstombs() for every code page.
+// Converts the multi-byte string (src) to wide characters in (dest).
+//  - codePage == CP_UTF8 or g_ForceToUTF8 is set: ConvertUTF8ToUnicode() does the work.
+//  - otherwise mbstowcs() is used; if wchar_t is wider than 16 bits, every value in
+//    0x10000..0x10FFFF is then rewritten as a UTF-16 surrogate pair.
+//  - if mbstowcs() fails, each source byte is copied as one wide character.
+// (dest) is emptied first; an empty (src) leaves (dest) empty.
+void MultiByteToUnicodeString2(UString &dest, const AString &src, UINT codePage)
+{
+  dest.Empty();
+  if (src.IsEmpty())
+    return;
+  if (codePage == CP_UTF8 || g_ForceToUTF8)
+  {
+    ConvertUTF8ToUnicode(src, dest);
+    return;
+  }
+  // capacity is twice the source size, which leaves room for the surrogate expansion below
+  const size_t limit = ((size_t)src.Len() + 1) * 2;
+  wchar_t *d = dest.GetBuf((unsigned)limit);
+  const size_t len = mbstowcs(d, src, limit);
+  if (len != (size_t)-1)
+  {
+    dest.ReleaseBuf_SetEnd((unsigned)len);
+    #if WCHAR_MAX > 0xffff
+    d = dest.GetBuf();
+    for (size_t i = 0;; i++)
+    {
+      const wchar_t c = d[i];
+      if (c == 0)
+        break;
+      if (c >= 0x10000 && c < 0x110000)
+      {
+        // first character outside the BMP: copy the tail (it starts at this character)
+        // and write it back in place with surrogate pairs.
+        UString temp = d + i;
+        for (size_t t = 0;; t++)
+        {
+          wchar_t w = temp[t];
+          if (w == 0)
+            break;
+          if (i == limit)
+            break; // unexpected error
+          if (w >= 0x10000 && w < 0x110000)
+          {
+            if (i + 1 == limit)
+              break; // unexpected error
+            w -= 0x10000;
+            d[i++] = (unsigned)0xd800 + (((unsigned)w >> 10) & 0x3FF);
+            w = 0xdc00 + (w & 0x3FF);
+          }
+          d[i++] = w;
+        }
+        dest.ReleaseBuf_SetEnd((unsigned)i);
+        // The inner loop has already processed the whole tail and (i) is now the
+        // index of the new terminator. Without this break the outer loop would
+        // increment (i) and read memory after the terminator.
+        break;
+      }
+    }
+    #endif
+    return;
+  }
+  /* if there is mbstowcs() error, we have two ways:
+
+     1) change 0x80+ characters to some character: '_'
+        in that case we lose data, but we have correct UString()
+        and that scheme can show errors to user in early stages,
+        when file converted back to mbs() cannot be found
+     2) transfer bad characters in some UTF-16 range.
+        it can be non-original Unicode character.
+        but later we still can restore original character.
+
+     The code below copies every byte unchanged as one wide character.
+  */
+  {
+    unsigned i;
+    const char *s = (const char *)src;
+    for (i = 0;;)
+    {
+      const Byte c = (Byte)s[i];
+      if (c == 0)
+        break;
+      // we could use the ASCII compatibility character '_' here instead:
+      // if (c > 0x7F) c = '_';
+      d[i++] = (wchar_t)c;
+    }
+    d[i] = 0;
+    dest.ReleaseBuf_SetLen(i);
+  }
+}
+// Converts the wide string (src) to a multi-byte string in (dest) with wcstombs().
+// If wcstombs() fails, (dest) is left empty.
+static void UnicodeStringToMultiByte2_Native(AString &dest, const UString &src)
+{
+  dest.Empty();
+  if (!src.IsEmpty())
+  {
+    // up to 6 bytes are reserved for every wide character (and for the terminator)
+    const size_t capacity = ((size_t)src.Len() + 1) * 6;
+    char *buf = dest.GetBuf((unsigned)capacity);
+    size_t numBytes = wcstombs(buf, src, capacity);
+    if (numBytes == (size_t)-1)
+      numBytes = 0; // conversion error: result is an empty string
+    dest.ReleaseBuf_SetEnd((unsigned)numBytes);
+  }
+}
+// Converts the wide string (src2) to a multi-byte string in (dest).
+//  - codePage == CP_UTF8 or g_ForceToUTF8 is set: ConvertUnicodeToUTF8() does the work.
+//  - otherwise wcstombs() is used; if wchar_t is wider than 16 bits, UTF-16 surrogate
+//    pairs are merged into single code points before the call.
+//  - if wcstombs() fails, every character >= 0x100 is replaced with (defaultChar),
+//    other characters are narrowed to char, and (defaultCharWasUsed) reports
+//    whether a replacement happened.
+static void UnicodeStringToMultiByte2(AString &dest, const UString &src2, UINT codePage, char defaultChar, bool &defaultCharWasUsed)
+{
+  // if (codePage == 1234567) // for debug purposes
+  if (codePage == CP_UTF8 || g_ForceToUTF8)
+  {
+    defaultCharWasUsed = false;
+    ConvertUnicodeToUTF8(src2, dest);
+    return;
+  }
+  UString src = src2;
+  #if WCHAR_MAX > 0xffff
+  {
+    src.Empty();
+    for (unsigned i = 0; i < src2.Len();)
+    {
+      wchar_t c = src2[i];
+      if (c >= 0xd800 && c < 0xdc00 && i + 1 != src2.Len())
+      {
+        const wchar_t c2 = src2[i + 1];
+        // Low surrogates are 0xdc00..0xdfff only. With a wider upper bound a lone
+        // high surrogate would swallow a following BMP character (0xe000..0xffff).
+        if (c2 >= 0xdc00 && c2 < 0xe000)
+        {
+          c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff);
+          i++;
+        }
+      }
+      src += c;
+      i++;
+    }
+  }
+  #endif
+  dest.Empty();
+  defaultCharWasUsed = false;
+  if (src.IsEmpty())
+    return;
+  // first call: destination is NULL, so only the required number of bytes is returned
+  const size_t len = wcstombs(NULL, src, 0);
+  if (len != (size_t)-1)
+  {
+    const unsigned limit = ((unsigned)len);
+    // continue only if the size fits into (unsigned) without truncation
+    if (limit == len)
+    {
+      char *d = dest.GetBuf(limit);
+      // (len + 1) leaves room for the terminating zero
+      const size_t len2 = wcstombs(d, src, len + 1);
+      if (len2 != (size_t)-1 && len2 <= limit)
+      {
+        dest.ReleaseBuf_SetEnd((unsigned)len2);
+        return;
+      }
+    }
+  }
+  // fallback for wcstombs() errors: one output char per wide character
+  {
+    const wchar_t *s = (const wchar_t *)src;
+    char *d = dest.GetBuf(src.Len());
+    unsigned i;
+    for (i = 0;;)
+    {
+      wchar_t c = s[i];
+      if (c == 0)
+        break;
+      if (c >=
+            0x100
+            // 0x80
+          )
+      {
+        c = defaultChar;
+        defaultCharWasUsed = true;
+      }
+      d[i++] = (char)c;
+    }
+    d[i] = 0;
+    dest.ReleaseBuf_SetLen(i);
+  }
+}
+#endif // _WIN32
+// Returns the multi-byte string (src), converted with MultiByteToUnicodeString2()
+// for the given (codePage).
+UString MultiByteToUnicodeString(const AString &src, UINT codePage)
+{
+  UString result;
+  MultiByteToUnicodeString2(result, src, codePage);
+  return result;
+}
+// Same conversion for a C string: (src) is wrapped into AString first.
+UString MultiByteToUnicodeString(const char *src, UINT codePage)
+{
+  const AString wrapped(src);
+  return MultiByteToUnicodeString(wrapped, codePage);
+}
+// Converts (src) to a multi-byte string in (dest) with k_DefultChar as the
+// substitute character; the "default char was used" flag is discarded.
+void UnicodeStringToMultiByte2(AString &dest, const UString &src, UINT codePage)
+{
+  bool ignoredFlag; // written by the callee, never read here
+  UnicodeStringToMultiByte2(dest, src, codePage, k_DefultChar, ignoredFlag);
+}
+// Returns (src) converted to a multi-byte string for (codePage).
+// (defaultChar) and (defaultCharWasUsed) are forwarded to UnicodeStringToMultiByte2().
+AString UnicodeStringToMultiByte(const UString &src, UINT codePage, char defaultChar, bool &defaultCharWasUsed)
+{
+  AString result;
+  UnicodeStringToMultiByte2(result, src, codePage, defaultChar, defaultCharWasUsed);
+  return result;
+}
+// Returns (src) converted to a multi-byte string for (codePage), with k_DefultChar
+// as the substitute character; the "default char was used" flag is discarded.
+AString UnicodeStringToMultiByte(const UString &src, UINT codePage)
+{
+  bool ignoredFlag; // written by the callee, never read here
+  AString result;
+  UnicodeStringToMultiByte2(result, src, codePage, k_DefultChar, ignoredFlag);
+  return result;
+}
+// U_to_A(a, b, c): converts UString (b) to AString (a).
+//   _WIN32: calls UnicodeStringToMultiByte2(a, b, c), where (c) is the code page.
+//           The macro must expand to a call with arguments: a bare function name
+//           would leave "U_to_A(...);" without any conversion.
+//   others: calls UnicodeStringToMultiByte2_Native(a, b); (c) is ignored.
+#ifdef _WIN32
+#define U_to_A(a, b, c)  UnicodeStringToMultiByte2(a, b, c)
+// #define A_to_U(a, b, c)  MultiByteToUnicodeString2
+#else
+// void MultiByteToUnicodeString2_Native(UString &dest, const AString &src);
+#define U_to_A(a, b, c)  UnicodeStringToMultiByte2_Native(a, b)
+// #define A_to_U(a, b, c)  MultiByteToUnicodeString2_Native(a, b)
+#endif
+#if !defined(_WIN32) || defined(ENV_HAVE_LOCALE)
+// Returns true if, for a sample of code points in the range 0x80..0xCFFF, the
+// U_to_A() conversion produces the same bytes as ConvertUnicodeToUTF8().
+// Returns false at the first code point where the two results differ.
+bool IsNativeUTF8()
+{
+  UString wide;
+  AString viaCodec, viaNative;
+  unsigned code = 0x80;
+  while (code < (UInt32)0xD000)
+  {
+    // one-character test string
+    wide.Empty();
+    wide += (wchar_t)code;
+    ConvertUnicodeToUTF8(wide, viaCodec);
+    U_to_A(viaNative, wide, CP_OEMCP);
+    if (viaCodec != viaNative)
+      return false;
+    // the step grows with the value, so only a small number of code points is probed
+    code += (code >> 2) + 1;
+  }
+  return true;
+}
+#endif
+#ifdef ENV_HAVE_LOCALE
+// Returns the name of the current LC_CTYPE locale of the program.
+// "C" is returned if setlocale() gives NULL.
+// Builds without ENV_HAVE_LOCALE return a fixed name: "utf8" if LOCALE_IS_UTF8
+// is defined, and "C" otherwise.
+const char *GetLocale(void)
+{
+  #ifdef ENV_HAVE_LOCALE
+    // a NULL locale argument only queries the current setting
+    // (ubuntu returns "C" after program start)
+    const char *current = setlocale(LC_CTYPE, NULL);
+    return current ? current : "C";
+  #elif defined(LOCALE_IS_UTF8)
+    return "utf8";
+  #else
+    return "C";
+  #endif
+}
+// Stores (val) in g_ForceToUTF8. That variable is defined only in the
+// non-Windows branch of this file, so the _WIN32 variant does nothing.
+#ifdef _WIN32
+  static void Set_ForceToUTF8(bool)
+  {
+  }
+#else
+  static void Set_ForceToUTF8(bool val)
+  {
+    g_ForceToUTF8 = val;
+  }
+#endif
+// Returns true if (locale) is one of the default locale names: "", "C" or "POSIX".
+// The names are compared with IsEqualTo_Ascii_NoCase().
+static bool Is_Default_Basic_Locale(const char *locale)
+{
+  const AString name (locale);
+  return name.IsEqualTo_Ascii_NoCase("")
+      || name.IsEqualTo_Ascii_NoCase("C")
+      || name.IsEqualTo_Ascii_NoCase("POSIX");
+}
+// Same check for the current locale of the program, as reported by GetLocale().
+static bool Is_Default_Basic_Locale()
+{
+  const char *current = GetLocale();
+  return Is_Default_Basic_Locale(current);
+}
+void MY_SetLocale()
+{
+  #ifdef ENV_HAVE_LOCALE
+  /*
+  Sets the locale of the program with setlocale(LC_ALL, ...) and then selects
+  the conversion mode with Set_ForceToUTF8().
+
+  The loop below makes up to three attempts:
+    pass 0: locale "" (taken from the environment),
+    pass 1: "en_US.UTF-8" for __APPLE__, "C.UTF-8" for other systems,
+    pass 2: locale "" again.
+  After each attempt:
+    - if IsNativeUTF8() is true, UTF-8 mode is switched on and the function returns;
+    - if the resulting locale is not a default one ("", "C", "POSIX"), the loop stops.
+  After the loop UTF-8 mode is switched on if IsNativeUTF8() is true or if the
+  locale is a default one; otherwise it is switched off.
+
+  If ENV_HAVE_LOCALE is not defined, the function does nothing.
+
+  debug code:
+  {
+    const char *s = GetLocale();
+    printf("\nGetLocale() : returned : \"%s\"\n", s);
+  }
+  */
+  
+  unsigned start = 0;
+  // unsigned lim = 0;
+  unsigned lim = 3; // passes [start, lim) of the loop below are executed
+  /*
+  disabled code: selection of (start) and (lim) from flags
+  #define MY_SET_LOCALE_FLAGS__FROM_ENV 1
+  #define MY_SET_LOCALE_FLAGS__TRY_UTF8 2
+  unsigned flags =
+      MY_SET_LOCALE_FLAGS__FROM_ENV |
+      MY_SET_LOCALE_FLAGS__TRY_UTF8
+  if (flags != 0)
+  {
+    if (flags & MY_SET_LOCALE_FLAGS__FROM_ENV)
+      lim = (flags & MY_SET_LOCALE_FLAGS__TRY_UTF8) ? 3 : 1;
+    else
+    {
+      start = 1;
+      lim = 2;
+    }
+  }
+  */
+  for (unsigned i = start; i < lim; i++)
+  {
+    /*
+    man7: "If locale is an empty string, "", each part of the locale that
+    should be modified is set according to the environment variables.
+    for glibc: glibc, first from the user's environment variables:
+      1) the environment variable LC_ALL,
+      2) environment variable with the same name as the category (see the
+      3) the environment variable LANG
+    The locale "C" or "POSIX" is a portable locale; it exists on all conforming systems.
+    
+    for WIN32 : MSDN :
+      Sets the locale to the default, which is the user-default
+      ANSI code page obtained from the operating system.
+      The locale name is set to the value returned by GetUserDefaultLocaleName.
+      The code page is set to the value returned by GetACP
+  */
+    const char *newLocale = ""; // "" : the locale is taken from the environment
+    
+    #ifdef __APPLE__
+    
+    /* look also CFLocale
+       there is no C.UTF-8 in macos
+       macos has UTF-8 locale only with some language like en_US.UTF-8
+       what is best way to set UTF-8 locale in macos? */
+    if (i == 1)
+      newLocale = "en_US.UTF-8";
+   
+    /* file open with non-utf8 sequences returns
+      #define EILSEQ    92    // "Illegal byte sequence"
+    */
+#else
+    // newLocale = "C";
+    if (i == 1)
+    {
+      newLocale = "C.UTF-8";    // main UTF-8 locale in ubuntu
+      // newLocale = ".utf8";    // supported in new Windows 10 build 17134 (April 2018 Update), the Universal C Runtime
+      // newLocale = "en_US.utf8"; // supported by ubuntu ?
+      // newLocale = "en_US.UTF-8";
+      /* setlocale() in ubuntu allows locales with minor character changes in strings
+        "en_US.UTF-8" /  "en_US.utf8" */
+    }
+    
+#endif
+    
+    // printf("\nsetlocale(LC_ALL, \"%s\") : returned: ", newLocale);
+    
+    // const char *s =
+    setlocale(LC_ALL, newLocale); // the result is not checked: GetLocale() below reads the locale that is actually active
+    
+    /*
+    if (!s)
+      printf("NULL: can't set locale");
+    else
+      printf("\"%s\"\n", s);
+    */
+    
+    // request current locale of program
+    const char *locale = GetLocale();
+    if (locale)
+    {
+      AString a (locale);
+      a.MakeLower_Ascii();
+      // if (a.Find("utf") >= 0)  // disabled check; it is the only reader of (a)
+      {
+        if (IsNativeUTF8())
+        {
+          Set_ForceToUTF8(true);
+          return;
+        }
+      }
+      if (!Is_Default_Basic_Locale(locale))
+      {
+        // if there is some non-default and non-utf locale, we want to use it
+        break; // comment this line out for debugging
+      }
+    }
+  }
+  if (IsNativeUTF8())
+  {
+    Set_ForceToUTF8(true);
+    return;
+  }
+  if (Is_Default_Basic_Locale())
+  {
+    Set_ForceToUTF8(true);
+    return;
+  }
+  Set_ForceToUTF8(false);
+  #elif defined(LOCALE_IS_UTF8)
+    // assume LC_CTYPE="utf8"
+  #else
+    // assume LC_CTYPE="C"
+  #endif
+}
+#endif
author	Igor Pavlov <87184205+ip7z@users.noreply.github.com>	2021-12-27 00:00:00 +0000
committer	Igor Pavlov <87184205+ip7z@users.noreply.github.com>	2022-03-18 15:35:13 +0500
commit	f19f813537c7aea1c20749c914e756b54a9c3cf5 (patch)
tree	816ba62ca7c0fa19f2eb46d9e9d6f7dd7c3a744d /CPP/Common/StringConvert.cpp
parent	98e06a519b63b81986abe76d28887f6984a7732b (diff)
download	7zip-f19f813537c7aea1c20749c914e756b54a9c3cf5.tar.gz 7zip-f19f813537c7aea1c20749c914e756b54a9c3cf5.tar.bz2 7zip-f19f813537c7aea1c20749c914e756b54a9c3cf5.zip

diff --git a/CPP/Common/StringConvert.cpp b/CPP/Common/StringConvert.cpp new file mode 100644 index 0000000..c0bde0f --- /dev/null +++ b/CPP/Common/StringConvert.cpp
@@ -0,0 +1,757 @@
	1	// Common/StringConvert.cpp
	2
	3	#include "StdAfx.h"
	4
	5	#include "StringConvert.h"
	6
	7	#ifndef _WIN32
	8	// #include <stdio.h>
	9	#include <stdlib.h>
	10	#endif
	11
	12	#if !defined(_WIN32) || defined(ENV_HAVE_LOCALE)
	13	#include "UTFConvert.h"
	14	#endif
	15
	16	#ifdef ENV_HAVE_LOCALE
	17	#include <locale.h>
	18	#endif
	19
	20	static const char k_DefultChar = '_';
	21
	22	#ifdef _WIN32
	23
	24	/*
	25	MultiByteToWideChar(CodePage, DWORD dwFlags,
	26	LPCSTR lpMultiByteStr, int cbMultiByte,
	27	LPWSTR lpWideCharStr, int cchWideChar)
	28
	29	if (cbMultiByte == 0)
	30	return: 0. ERR: ERROR_INVALID_PARAMETER
	31
	32	if (cchWideChar == 0)
	33	return: the required buffer size in characters.
	34
	35	if (supplied buffer size was not large enough)
	36	return: 0. ERR: ERROR_INSUFFICIENT_BUFFER
	37	The number of filled characters in lpWideCharStr can be smaller than cchWideChar (if last character is complex)
	38
	39	If there are illegal characters:
	40	if MB_ERR_INVALID_CHARS is set in dwFlags:
	41	- the function stops conversion on illegal character.
	42	- Return: 0. ERR: ERROR_NO_UNICODE_TRANSLATION.
	43
	44	if MB_ERR_INVALID_CHARS is NOT set in dwFlags:
	45	before Vista: illegal character is dropped (skipped). WinXP-64: GetLastError() returns 0.
	46	in Vista+: illegal character is not dropped (MSDN). Undocumented: illegal
	47	character is converted to U+FFFD, which is REPLACEMENT CHARACTER.
	48	*/
	49
	50
	51	void MultiByteToUnicodeString2(UString &dest, const AString &src, UINT codePage)
	52	{
	53	dest.Empty();
	54	if (src.IsEmpty())
	55	return;
	56	{
	57	/*
	58	wchar_t *d = dest.GetBuf(src.Len());
	59	const char s = (const char )src;
	60	unsigned i;
	61
	62	for (i = 0;;)
	63	{
	64	Byte c = (Byte)s[i];
	65	if (c >= 0x80 \|\| c == 0)
	66	break;
	67	d[i++] = (wchar_t)c;
	68	}
	69
	70	if (i != src.Len())
	71	{
	72	unsigned len = MultiByteToWideChar(codePage, 0, s + i,
	73	src.Len() - i, d + i,
	74	src.Len() + 1 - i);
	75	if (len == 0)
	76	throw 282228;
	77	i += len;
	78	}
	79
	80	d[i] = 0;
	81	dest.ReleaseBuf_SetLen(i);
	82	*/
	83	unsigned len = (unsigned)MultiByteToWideChar(codePage, 0, src, (int)src.Len(), NULL, 0);
	84	if (len == 0)
	85	{
	86	if (GetLastError() != 0)
	87	throw 282228;
	88	}
	89	else
	90	{
	91	len = (unsigned)MultiByteToWideChar(codePage, 0, src, (int)src.Len(), dest.GetBuf(len), (int)len);
	92	if (len == 0)
	93	throw 282228;
	94	dest.ReleaseBuf_SetEnd(len);
	95	}
	96	}
	97	}
	98
	99	/*
	100	int WideCharToMultiByte(
	101	UINT CodePage, DWORD dwFlags,
	102	LPCWSTR lpWideCharStr, int cchWideChar,
	103	LPSTR lpMultiByteStr, int cbMultiByte,
	104	LPCSTR lpDefaultChar, LPBOOL lpUsedDefaultChar);
	105
	106	if (lpDefaultChar == NULL),
	107	- it uses system default value.
	108
	109	if (CodePage == CP_UTF7 \|\| CodePage == CP_UTF8)
	110	if (lpDefaultChar != NULL \|\| lpUsedDefaultChar != NULL)
	111	return: 0. ERR: ERROR_INVALID_PARAMETER.
	112
	113	The function operates most efficiently, if (lpDefaultChar == NULL && lpUsedDefaultChar == NULL)
	114
	115	*/
	116
	117	static void UnicodeStringToMultiByte2(AString &dest, const UString &src, UINT codePage, char defaultChar, bool &defaultCharWasUsed)
	118	{
	119	dest.Empty();
	120	defaultCharWasUsed = false;
	121	if (src.IsEmpty())
	122	return;
	123	{
	124	/*
	125	unsigned numRequiredBytes = src.Len() * 2;
	126	char *d = dest.GetBuf(numRequiredBytes);
	127	const wchar_t s = (const wchar_t )src;
	128	unsigned i;
	129
	130	for (i = 0;;)
	131	{
	132	wchar_t c = s[i];
	133	if (c >= 0x80 \|\| c == 0)
	134	break;
	135	d[i++] = (char)c;
	136	}
	137
	138	if (i != src.Len())
	139	{
	140	BOOL defUsed = FALSE;
	141	defaultChar = defaultChar;
	142
	143	bool isUtf = (codePage == CP_UTF8 \|\| codePage == CP_UTF7);
	144	unsigned len = WideCharToMultiByte(codePage, 0, s + i, src.Len() - i,
	145	d + i, numRequiredBytes + 1 - i,
	146	(isUtf ? NULL : &defaultChar),
	147	(isUtf ? NULL : &defUsed));
	148	defaultCharWasUsed = (defUsed != FALSE);
	149	if (len == 0)
	150	throw 282229;
	151	i += len;
	152	}
	153
	154	d[i] = 0;
	155	dest.ReleaseBuf_SetLen(i);
	156	*/
	157
	158	/*
	159	if (codePage != CP_UTF7)
	160	{
	161	const wchar_t s = (const wchar_t )src;
	162	unsigned i;
	163	for (i = 0;; i++)
	164	{
	165	wchar_t c = s[i];
	166	if (c >= 0x80 \|\| c == 0)
	167	break;
	168	}
	169
	170	if (s[i] == 0)
	171	{
	172	char *d = dest.GetBuf(src.Len());
	173	for (i = 0;;)
	174	{
	175	wchar_t c = s[i];
	176	if (c == 0)
	177	break;
	178	d[i++] = (char)c;
	179	}
	180	d[i] = 0;
	181	dest.ReleaseBuf_SetLen(i);
	182	return;
	183	}
	184	}
	185	*/
	186
	187	unsigned len = (unsigned)WideCharToMultiByte(codePage, 0, src, (int)src.Len(), NULL, 0, NULL, NULL);
	188	if (len == 0)
	189	{
	190	if (GetLastError() != 0)
	191	throw 282228;
	192	}
	193	else
	194	{
	195	BOOL defUsed = FALSE;
	196	bool isUtf = (codePage == CP_UTF8 \|\| codePage == CP_UTF7);
	197	// defaultChar = defaultChar;
	198	len = (unsigned)WideCharToMultiByte(codePage, 0, src, (int)src.Len(),
	199	dest.GetBuf(len), (int)len,
	200	(isUtf ? NULL : &defaultChar),
	201	(isUtf ? NULL : &defUsed)
	202	);
	203	if (!isUtf)
	204	defaultCharWasUsed = (defUsed != FALSE);
	205	if (len == 0)
	206	throw 282228;
	207	dest.ReleaseBuf_SetEnd(len);
	208	}
	209	}
	210	}
	211
	212	/*
	213	#ifndef UNDER_CE
	214	AString SystemStringToOemString(const CSysString &src)
	215	{
	216	AString dest;
	217	const unsigned len = src.Len() * 2;
	218	CharToOem(src, dest.GetBuf(len));
	219	dest.ReleaseBuf_CalcLen(len);
	220	return dest;
	221	}
	222	#endif
	223	*/
	224
	225	#else // _WIN32
	226
	227	// #include <stdio.h>
	228	/*
	229	if (wchar_t is 32-bit (#if WCHAR_MAX > 0xffff),
	230	and utf-8 string contains big unicode character > 0xffff),
	231	then we still use 16-bit surrogate pair in UString.
	232	It simplifies another code where utf-16 encoding is used.
	233	So we use surrogate-conversion code only in this file.
	234	*/
	235
	236	/*
	237	mbstowcs() returns error if there is error in utf-8 stream,
	238	mbstowcs() returns error if there is single surrogates point (d800-dfff) in utf-8 stream
	239	*/
	240
	241	/*
	242	static void MultiByteToUnicodeString2_Native(UString &dest, const AString &src)
	243	{
	244	dest.Empty();
	245	if (src.IsEmpty())
	246	return;
	247
	248	const size_t limit = ((size_t)src.Len() + 1) * 2;
	249	wchar_t *d = dest.GetBuf((unsigned)limit);
	250	const size_t len = mbstowcs(d, src, limit);
	251	if (len != (size_t)-1)
	252	{
	253	dest.ReleaseBuf_SetEnd((unsigned)len);
	254	return;
	255	}
	256	dest.ReleaseBuf_SetEnd(0);
	257	}
	258	*/
	259
	260	bool g_ForceToUTF8 = true; // false;
	261
	262	void MultiByteToUnicodeString2(UString &dest, const AString &src, UINT codePage)
	263	{
	264	dest.Empty();
	265	if (src.IsEmpty())
	266	return;
	267
	268	if (codePage == CP_UTF8 || g_ForceToUTF8)
	269	{
	270	ConvertUTF8ToUnicode(src, dest);
	271	return;
	272	}
	273
	274	const size_t limit = ((size_t)src.Len() + 1) * 2;
	275	wchar_t *d = dest.GetBuf((unsigned)limit);
	276	const size_t len = mbstowcs(d, src, limit);
	277	if (len != (size_t)-1)
	278	{
	279	dest.ReleaseBuf_SetEnd((unsigned)len);
	280
	281	#if WCHAR_MAX > 0xffff
	282	d = dest.GetBuf();
	283	for (size_t i = 0;; i++)
	284	{
	285	// wchar_t c = dest[i];
	286	wchar_t c = d[i];
	287	if (c == 0)
	288	break;
	289	if (c >= 0x10000 && c < 0x110000)
	290	{
	291	/*
	292	c -= 0x10000;
	293	unsigned c0 = 0xd800 + ((c >> 10) & 0x3FF);
	294	dest.ReplaceOneCharAtPos(i, c0);
	295	i++;
	296	c = 0xdc00 + (c & 0x3FF);
	297	dest.Insert_wchar_t(i, c);
	298	*/
	299	UString temp = d + i;
	300
	301	for (size_t t = 0;; t++)
	302	{
	303	wchar_t w = temp[t];
	304	if (w == 0)
	305	break;
	306	if (i == limit)
	307	break; // unexpected error
	308	if (w >= 0x10000 && w < 0x110000)
	309	{
	310	if (i + 1 == limit)
	311	break; // unexpected error
	312	w -= 0x10000;
	313	d[i++] = (unsigned)0xd800 + (((unsigned)w >> 10) & 0x3FF);
	314	w = 0xdc00 + (w & 0x3FF);
	315	}
	316	d[i++] = w;
	317	}
	318	dest.ReleaseBuf_SetEnd((unsigned)i);
	319	}
	320	}
	321
	322	#endif
	323
	324	/*
	325	printf("\nMultiByteToUnicodeString2 (%d) %s\n", (int)src.Len(), src.Ptr());
	326	printf("char: ");
	327	for (unsigned i = 0; i < src.Len(); i++)
	328	printf (" %02x", (int)(Byte)src[i]);
	329	printf("\n");
	330	printf("\n-> (%d) %ls\n", (int)dest.Len(), dest.Ptr());
	331	printf("wchar_t: ");
	332	for (unsigned i = 0; i < dest.Len(); i++)
	333	{
	334	printf (" %02x", (int)dest[i]);
	335	}
	336	printf("\n");
	337	*/
	338
	339	return;
	340	}
	341
	342	/* if there is mbstowcs() error, we have two ways:
	343
	344	1) change 0x80+ characters to some character: '_'
	345	in that case we lose data, but we have correct UString()
	346	and that scheme can show errors to user in early stages,
	347	when file converted back to mbs() cannot be found
	348
	349	2) transfer bad characters in some UTF-16 range.
	350	it can be non-original Unicode character.
	351	but later we still can restore original character.
	352	*/
	353
	354
	355	// printf("\nmbstowcs ERROR !!!!!! s=%s\n", src.Ptr());
	356	{
	357	unsigned i;
	358	const char *s = (const char *)src;
	359	for (i = 0;;)
	360	{
	361	Byte c = (Byte)s[i];
	362	if (c == 0)
	363	break;
	364	// we can use ascii compatibilty character '_'
	365	// if (c > 0x7F) c = '_'; // we replace "bad: character
	366	d[i++] = (wchar_t)c;
	367	}
	368	d[i] = 0;
	369	dest.ReleaseBuf_SetLen(i);
	370	}
	371	}
	372
	373	static void UnicodeStringToMultiByte2_Native(AString &dest, const UString &src)
	374	{
	375	dest.Empty();
	376	if (src.IsEmpty())
	377	return;
	378
	379	const size_t limit = ((size_t)src.Len() + 1) * 6;
	380	char *d = dest.GetBuf((unsigned)limit);
	381
	382	const size_t len = wcstombs(d, src, limit);
	383
	384	if (len != (size_t)-1)
	385	{
	386	dest.ReleaseBuf_SetEnd((unsigned)len);
	387	return;
	388	}
	389	dest.ReleaseBuf_SetEnd(0);
	390	}
	391
	392
	393	static void UnicodeStringToMultiByte2(AString &dest, const UString &src2, UINT codePage, char defaultChar, bool &defaultCharWasUsed)
	394	{
	395	// if (codePage == 1234567) // for debug purposes
	396	if (codePage == CP_UTF8 || g_ForceToUTF8)
	397	{
	398	defaultCharWasUsed = false;
	399	ConvertUnicodeToUTF8(src2, dest);
	400	return;
	401	}
	402
	403	UString src = src2;
	404	#if WCHAR_MAX > 0xffff
	405	{
	406	src.Empty();
	407	for (unsigned i = 0; i < src2.Len();)
	408	{
	409	wchar_t c = src2[i];
	410	if (c >= 0xd800 && c < 0xdc00 && i + 1 != src2.Len())
	411	{
	412	const wchar_t c2 = src2[i + 1];
	413	if (c2 >= 0xdc00 && c2 < 0x10000)
	414	{
	415	// printf("\nSurragate [%d]: %4x %4x -> ", i, (int)c, (int)c2);
	416	c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff);
	417	// printf("%4x\n", (int)c);
	418	i++;
	419	}
	420	}
	421	src += c;
	422	i++;
	423	}
	424	}
	425	#endif
	426
	427	dest.Empty();
	428	defaultCharWasUsed = false;
	429	if (src.IsEmpty())
	430	return;
	431
	432	const size_t len = wcstombs(NULL, src, 0);
	433
	434	if (len != (size_t)-1)
	435	{
	436	const unsigned limit = ((unsigned)len);
	437	if (limit == len)
	438	{
	439	char *d = dest.GetBuf(limit);
	440
	441	/*
	442	{
	443	printf("\nwcstombs; len = %d %ls \n", (int)src.Len(), src.Ptr());
	444	for (unsigned i = 0; i < src.Len(); i++)
	445	printf (" %02x", (int)src[i]);
	446	printf("\n");
	447	printf("\ndest Limit = %d \n", limit);
	448	}
	449	*/
	450
	451	const size_t len2 = wcstombs(d, src, len + 1);
	452
	453	if (len2 != (size_t)-1 && len2 <= limit)
	454	{
	455	/*
	456	printf("\nOK : destLen = %d : %s\n", (int)len, dest.Ptr());
	457	for (unsigned i = 0; i < len2; i++)
	458	printf(" %02x", (int)(Byte)dest[i]);
	459	printf("\n");
	460	*/
	461	dest.ReleaseBuf_SetEnd((unsigned)len2);
	462	return;
	463	}
	464	}
	465	}
	466
	467	{
	468	const wchar_t *s = (const wchar_t *)src;
	469	char *d = dest.GetBuf(src.Len());
	470
	471	unsigned i;
	472	for (i = 0;;)
	473	{
	474	wchar_t c = s[i];
	475	if (c == 0)
	476	break;
	477	if (c >=
	478	0x100
	479	// 0x80
	480	)
	481	{
	482	c = defaultChar;
	483	defaultCharWasUsed = true;
	484	}
	485
	486	d[i++] = (char)c;
	487	}
	488	d[i] = 0;
	489	dest.ReleaseBuf_SetLen(i);
	490	/*
	491	printf("\nUnicodeStringToMultiByte2; len = %d \n", (int)src.Len());
	492	printf("ERROR: %s\n", dest.Ptr());
	493	*/
	494	}
	495	}
	496
	497	#endif // _WIN32
	498
	499
	500	UString MultiByteToUnicodeString(const AString &src, UINT codePage)
	501	{
	502	UString dest;
	503	MultiByteToUnicodeString2(dest, src, codePage);
	504	return dest;
	505	}
	506
	507	UString MultiByteToUnicodeString(const char *src, UINT codePage)
	508	{
	509	return MultiByteToUnicodeString(AString(src), codePage);
	510	}
	511
	512
	513	void UnicodeStringToMultiByte2(AString &dest, const UString &src, UINT codePage)
	514	{
	515	bool defaultCharWasUsed;
	516	UnicodeStringToMultiByte2(dest, src, codePage, k_DefultChar, defaultCharWasUsed);
	517	}
	518
	519	AString UnicodeStringToMultiByte(const UString &src, UINT codePage, char defaultChar, bool &defaultCharWasUsed)
	520	{
	521	AString dest;
	522	UnicodeStringToMultiByte2(dest, src, codePage, defaultChar, defaultCharWasUsed);
	523	return dest;
	524	}
	525
	526	AString UnicodeStringToMultiByte(const UString &src, UINT codePage)
	527	{
	528	AString dest;
	529	bool defaultCharWasUsed;
	530	UnicodeStringToMultiByte2(dest, src, codePage, k_DefultChar, defaultCharWasUsed);
	531	return dest;
	532	}
	533
	534
	535
	536
	537
	// U_to_A(a, b, c): converts Unicode string (b) to multi-byte string (a).
	//   _WIN32 : calls UnicodeStringToMultiByte2(a, b, c), where (c) is the code page.
	//   others : calls UnicodeStringToMultiByte2_Native(a, b); (c) is not used.
	// The _WIN32 variant must forward its arguments: without the argument list
	// the macro expands to the bare function name and no conversion is performed.
	#ifdef _WIN32
	#define U_to_A(a, b, c)  UnicodeStringToMultiByte2(a, b, c)
	#else
	#define U_to_A(a, b, c)  UnicodeStringToMultiByte2_Native(a, b)
	#endif
	546
	547	#if !defined(_WIN32) \|\| defined(ENV_HAVE_LOCALE)
	548
	549	bool IsNativeUTF8()
	550	{
	551	UString u;
	552	AString a, a2;
	553	// for (unsigned c = 0x80; c < (UInt32)0x10000; c += (c >> 9) + 1)
	554	for (unsigned c = 0x80; c < (UInt32)0xD000; c += (c >> 2) + 1)
	555	{
	556	u.Empty();
	557	u += (wchar_t)c;
	558	/*
	559	if (Unicode_Is_There_Utf16SurrogateError(u))
	560	continue;
	561	#ifndef _WIN32
	562	if (Unicode_Is_There_BmpEscape(u))
	563	continue;
	564	#endif
	565	*/
	566	ConvertUnicodeToUTF8(u, a);
	567	U_to_A(a2, u, CP_OEMCP);
	568	if (a != a2)
	569	return false;
	570	}
	571	return true;
	572	}
	573
	574	#endif
	575
	576
	577	#ifdef ENV_HAVE_LOCALE
	578
	// Returns the name of the current LC_CTYPE locale of the program.
	// If setlocale() can't report it, "C" is returned, so the result is never NULL.
	// Note from the original code: ubuntu returns "C" after program start.
	// This function is compiled only under ENV_HAVE_LOCALE (enclosing #ifdef),
	// so no fallback for builds without locale support is needed here.
	const char *GetLocale(void)
	{
	  // NULL as locale name: the locale is only queried, it is not changed
	  const char * const s = setlocale(LC_CTYPE, NULL);
	  return s ? s : "C";
	}
	601
	602	#ifdef _WIN32
	603	static void Set_ForceToUTF8(bool) {}
	604	#else
	605	static void Set_ForceToUTF8(bool val) { g_ForceToUTF8 = val; }
	606	#endif
	607
	608	static bool Is_Default_Basic_Locale(const char *locale)
	609	{
	610	const AString a (locale);
	611	if (a.IsEqualTo_Ascii_NoCase("")
	612	\|\| a.IsEqualTo_Ascii_NoCase("C")
	613	\|\| a.IsEqualTo_Ascii_NoCase("POSIX"))
	614	return true;
	615	return false;
	616	}
	617
	618	static bool Is_Default_Basic_Locale()
	619	{
	620	return Is_Default_Basic_Locale(GetLocale());
	621	}
	622
	623
	624	void MY_SetLocale()
	625	{
	626	#ifdef ENV_HAVE_LOCALE
	627	/*
	628	{
	629	const char *s = GetLocale();
	630	printf("\nGetLocale() : returned : \"%s\"\n", s);
	631	}
	632	*/
	633
	634	unsigned start = 0;
	635	// unsigned lim = 0;
	636	unsigned lim = 3;
	637
	638	/*
	639	#define MY_SET_LOCALE_FLAGS__FROM_ENV 1
	640	#define MY_SET_LOCALE_FLAGS__TRY_UTF8 2
	641
	642	unsigned flags =
	643	MY_SET_LOCALE_FLAGS__FROM_ENV \|
	644	MY_SET_LOCALE_FLAGS__TRY_UTF8
	645
	646	if (flags != 0)
	647	{
	648	if (flags & MY_SET_LOCALE_FLAGS__FROM_ENV)
	649	lim = (flags & MY_SET_LOCALE_FLAGS__TRY_UTF8) ? 3 : 1;
	650	else
	651	{
	652	start = 1;
	653	lim = 2;
	654	}
	655	}
	656	*/
	657
	658	for (unsigned i = start; i < lim; i++)
	659	{
	660	/*
	661	man7: "If locale is an empty string, "", each part of the locale that
	662	should be modified is set according to the environment variables.
	663	for glibc: glibc, first from the user's environment variables:
	664	1) the environment variable LC_ALL,
	665	2) environment variable with the same name as the category (see the
	666	3) the environment variable LANG
	667	The locale "C" or "POSIX" is a portable locale; it exists on all conforming systems.
	668
	669	for WIN32 : MSDN :
	670	Sets the locale to the default, which is the user-default
	671	ANSI code page obtained from the operating system.
	672	The locale name is set to the value returned by GetUserDefaultLocaleName.
	673	The code page is set to the value returned by GetACP
	674	*/
	675	const char *newLocale = "";
	676
	677	#ifdef __APPLE__
	678
	679	/* look also CFLocale
	680	there is no C.UTF-8 in macos
	681	macos has UTF-8 locale only with some language like en_US.UTF-8
	682	what is best way to set UTF-8 locale in macos? */
	683	if (i == 1)
	684	newLocale = "en_US.UTF-8";
	685
	686	/* file open with non-utf8 sequencies return
	687	#define EILSEQ 92 // "Illegal byte sequence"
	688	*/
	689	#else
	690	// newLocale = "C";
	691	if (i == 1)
	692	{
	693	newLocale = "C.UTF-8"; // main UTF-8 locale in ubuntu
	694	// newLocale = ".utf8"; // supported in new Windows 10 build 17134 (April 2018 Update), the Universal C Runtime
	695	// newLocale = "en_US.utf8"; // supported by ubuntu ?
	696	// newLocale = "en_US.UTF-8";
	697	/* setlocale() in ubuntu allows locales with minor chracter changes in strings
	698	"en_US.UTF-8" / "en_US.utf8" */
	699	}
	700
	701	#endif
	702
	703	// printf("\nsetlocale(LC_ALL, \"%s\") : returned: ", newLocale);
	704
	705	// const char *s =
	706	setlocale(LC_ALL, newLocale);
	707
	708	/*
	709	if (!s)
	710	printf("NULL: can't set locale");
	711	else
	712	printf("\"%s\"\n", s);
	713	*/
	714
	715	// request curent locale of program
	716	const char *locale = GetLocale();
	717	if (locale)
	718	{
	719	AString a (locale);
	720	a.MakeLower_Ascii();
	721	// if (a.Find("utf") >= 0)
	722	{
	723	if (IsNativeUTF8())
	724	{
	725	Set_ForceToUTF8(true);
	726	return;
	727	}
	728	}
	729	if (!Is_Default_Basic_Locale(locale))
	730	{
	731	// if there is some non-default and non-utf locale, we want to use it
	732	break; // comment it for debug
	733	}
	734	}
	735	}
	736
	737	if (IsNativeUTF8())
	738	{
	739	Set_ForceToUTF8(true);
	740	return;
	741	}
	742
	743	if (Is_Default_Basic_Locale())
	744	{
	745	Set_ForceToUTF8(true);
	746	return;
	747	}
	748
	749	Set_ForceToUTF8(false);
	750
	751	#elif defined(LOCALE_IS_UTF8)
	752	// assume LC_CTYPE="utf8"
	753	#else
	754	// assume LC_CTYPE="C"
	755	#endif
	756	}
	757	#endif