// Common/UTFConvert.h #ifndef ZIP7_INC_COMMON_UTF_CONVERT_H #define ZIP7_INC_COMMON_UTF_CONVERT_H #include "MyBuffer.h" #include "MyString.h" struct CUtf8Check { // Byte MaxByte; // in original src stream bool NonUtf; bool ZeroChar; bool SingleSurrogate; bool Escape; bool Truncated; UInt32 MaxHighPoint; // only for points >= 0x80 CUtf8Check() { Clear(); } void Clear() { // MaxByte = 0; NonUtf = false; ZeroChar = false; SingleSurrogate = false; Escape = false; Truncated = false; MaxHighPoint = 0; } void Update(const CUtf8Check &c) { if (c.NonUtf) NonUtf = true; if (c.ZeroChar) ZeroChar = true; if (c.SingleSurrogate) SingleSurrogate = true; if (c.Escape) Escape = true; if (c.Truncated) Truncated = true; if (MaxHighPoint < c.MaxHighPoint) MaxHighPoint = c.MaxHighPoint; } void PrintStatus(AString &s) const { s.Empty(); // s.Add_OptSpaced("MaxByte="); // s.Add_UInt32(MaxByte); if (NonUtf) s.Add_OptSpaced("non-UTF8"); if (ZeroChar) s.Add_OptSpaced("ZeroChar"); if (SingleSurrogate) s.Add_OptSpaced("SingleSurrogate"); if (Escape) s.Add_OptSpaced("Escape"); if (Truncated) s.Add_OptSpaced("Truncated"); if (MaxHighPoint != 0) { s.Add_OptSpaced("MaxUnicode="); s.Add_UInt32(MaxHighPoint); } } bool IsOK(bool allowReduced = false) const { if (NonUtf || SingleSurrogate || ZeroChar) return false; if (MaxHighPoint >= 0x110000) return false; if (Truncated && !allowReduced) return false; return true; } // it checks full buffer as specified in (size) and it doesn't stop on zero char void Check_Buf(const char *src, size_t size) throw(); void Check_AString(const AString &s) throw() { Check_Buf(s.Ptr(), s.Len()); } }; /* if (allowReduced == false) - all UTF-8 character sequences must be finished. if (allowReduced == true) - it allows truncated last character-Utf8-sequence */ bool Check_UTF8_Buf(const char *src, size_t size, bool allowReduced) throw(); bool CheckUTF8_AString(const AString &s) throw(); #define Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR (1 << 0) #define Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE (1 << 1) #define Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT (1 << 2) /* Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR if (flag is NOT set) { it processes SINGLE-SURROGATE-8 as valid Unicode point. it converts SINGLE-SURROGATE-8 to SINGLE-SURROGATE-16 Note: some sequencies of two SINGLE-SURROGATE-8 points will generate correct SURROGATE-16-PAIR, and that SURROGATE-16-PAIR later will be converted to correct UTF8-SURROGATE-21 point. So we don't restore original STR-8 sequence in that case. } if (flag is set) { if (Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE is defined) it generates ESCAPE for SINGLE-SURROGATE-8, if (Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE is not defined) it generates U+fffd for SINGLE-SURROGATE-8, } Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE if (flag is NOT set) it generates (U+fffd) code for non-UTF-8 (invalid) characters if (flag is set) { It generates (ESCAPE) codes for NON-UTF-8 (invalid) characters. And later we can restore original UTF-8-RAW characters from (ESCAPE-16-21) codes. } Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT if (flag is NOT set) { it process ESCAPE-8 points as another Unicode points. In Linux: ESCAPE-16 will mean two different ESCAPE-8 seqences, so we need HIGH-ESCAPE-PLANE-21 to restore UTF-8-RAW -> UTF-16 -> UTF-8-RAW } if (flag is set) { it generates ESCAPE-16-21 for ESCAPE-8 points so we can restore UTF-8-RAW -> UTF-16 -> UTF-8-RAW without HIGH-ESCAPE-PLANE-21. } Main USE CASES with UTF-8 <-> UTF-16 conversions: WIN32: UTF-16-RAW -> UTF-8 (Archive) -> UTF-16-RAW { set Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE Do NOT set Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR Do NOT set Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT So we restore original SINGLE-SURROGATE-16 from single SINGLE-SURROGATE-8. } Linux: UTF-8-RAW -> UTF-16 (Intermediate / Archive) -> UTF-8-RAW { we want restore original UTF-8-RAW sequence later from that ESCAPE-16. Set the flags: Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT } MacOS: UTF-8-RAW -> UTF-16 (Intermediate / Archive) -> UTF-8-RAW { we want to restore correct UTF-8 without any BMP processing: Set the flags: Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE } */ // zero char is not allowed in (src) buf bool Convert_UTF8_Buf_To_Unicode(const char *src, size_t srcSize, UString &dest, unsigned flags = 0); bool ConvertUTF8ToUnicode_Flags(const AString &src, UString &dest, unsigned flags = 0); bool ConvertUTF8ToUnicode(const AString &src, UString &dest); #define Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR (1 << 8) #define Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE (1 << 9) // #define Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE (1 << 10) /* Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR if (flag is NOT set) { we extract SINGLE-SURROGATE as normal UTF-8 In Windows : for UTF-16-RAW <-> UTF-8 (archive) <-> UTF-16-RAW in . In Linux : use-case-1: UTF-8 -> UTF-16 -> UTF-8 doesn't generate UTF-16 SINGLE-SURROGATE, if (Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR) is used. use-case 2: UTF-16-7z (with SINGLE-SURROGATE from Windows) -> UTF-8 (Linux) will generate SINGLE-SURROGATE-UTF-8 here. } if (flag is set) { we generate UTF_REPLACEMENT_CHAR (0xfffd) for SINGLE_SURROGATE it can be used for compatibility mode with WIN32 UTF function or if we want UTF-8 stream without any errors } Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE if (flag is NOT set) it doesn't extract raw 8-bit symbol from Escape-Plane-16 if (flag is set) it extracts raw 8-bit symbol from Escape-Plane-16 in Linux we need some way to extract NON-UTF8 RAW 8-bits from BMP (UTF-16 7z archive): if (we use High-Escape-Plane), we can transfer BMP escapes to High-Escape-Plane. if (we don't use High-Escape-Plane), we must use Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE. Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE // that flag affects the code only if (wchar_t is 32-bit) // that mode with high-escape can be disabled now in UTFConvert.cpp if (flag is NOT set) it doesn't extract raw 8-bit symbol from High-Escape-Plane if (flag is set) it extracts raw 8-bit symbol from High-Escape-Plane Main use cases: WIN32 : UTF-16-RAW -> UTF-8 (archive) -> UTF-16-RAW { Do NOT set Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE. Do NOT set Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR. So we restore original UTF-16-RAW. } Linix : UTF-8 with Escapes -> UTF-16 (7z archive) -> UTF-8 with Escapes set Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE to extract non-UTF from 7z archive set Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE for intermediate UTF-16. Note: high esacape mode can be ignored now in UTFConvert.cpp macOS: the system doesn't support incorrect UTF-8 in file names. set Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR */ extern unsigned g_Unicode_To_UTF8_Flags; void ConvertUnicodeToUTF8_Flags(const UString &src, AString &dest, unsigned flags = 0); void ConvertUnicodeToUTF8(const UString &src, AString &dest); void Convert_Unicode_To_UTF8_Buf(const UString &src, CByteBuffer &dest); /* #ifndef _WIN32 void Convert_UTF16_To_UTF32(const UString &src, UString &dest); void Convert_UTF32_To_UTF16(const UString &src, UString &dest); bool UTF32_IsThere_BigPoint(const UString &src); bool Unicode_IsThere_BmpEscape(const UString &src); #endif bool Unicode_IsThere_Utf16SurrogateError(const UString &src); */ #ifdef Z7_WCHART_IS_16BIT #define Convert_UnicodeEsc16_To_UnicodeEscHigh(s) #else void Convert_UnicodeEsc16_To_UnicodeEscHigh(UString &s); #endif /* // #include "../../C/CpuArch.h" // ---------- Utf16 Little endian functions ---------- // We store 16-bit surrogates even in 32-bit WCHARs in Linux. // So now we don't use the following code: #if WCHAR_MAX > 0xffff // void *p : pointer to src bytes stream // size_t len : num Utf16 characters : it can include or not include NULL character inline size_t Utf16LE__Get_Num_WCHARs(const void *p, size_t len) { #if WCHAR_MAX > 0xffff size_t num_wchars = 0; for (size_t i = 0; i < len; i++) { wchar_t c = GetUi16(p); p = (const void *)((const Byte *)p + 2); if (c >= 0xd800 && c < 0xdc00 && i + 1 != len) { wchar_t c2 = GetUi16(p); if (c2 >= 0xdc00 && c2 < 0xe000) { c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff); p = (const void *)((const Byte *)p + 2); i++; } } num_wchars++; } return num_wchars; #else UNUSED_VAR(p) return len; #endif } // #include inline wchar_t *Utf16LE__To_WCHARs_Sep(const void *p, size_t len, wchar_t *dest) { for (size_t i = 0; i < len; i++) { wchar_t c = GetUi16(p); p = (const void *)((const Byte *)p + 2); #if WCHAR_PATH_SEPARATOR != L'/' if (c == L'/') c = WCHAR_PATH_SEPARATOR; #endif #if WCHAR_MAX > 0xffff if (c >= 0xd800 && c < 0xdc00 && i + 1 != len) { wchar_t c2 = GetUi16(p); if (c2 >= 0xdc00 && c2 < 0xe000) { // printf("\nSurragate : %4x %4x -> ", (int)c, (int)c2); c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff); p = (const void *)((const Byte *)p + 2); i++; // printf("%4x\n", (int)c); } } #endif *dest++ = c; } return dest; } inline size_t Get_Num_Utf16_chars_from_wchar_string(const wchar_t *p) { size_t num = 0; for (;;) { wchar_t c = *p++; if (c == 0) return num; num += ((c >= 0x10000 && c < 0x110000) ? 2 : 1); } return num; } inline Byte *wchars_to_Utf16LE(const wchar_t *p, Byte *dest) { for (;;) { wchar_t c = *p++; if (c == 0) return dest; if (c >= 0x10000 && c < 0x110000) { SetUi16(dest , (UInt16)(0xd800 + ((c >> 10) & 0x3FF))); SetUi16(dest + 2, (UInt16)(0xdc00 + ( c & 0x3FF))); dest += 4; } else { SetUi16(dest, c); dest += 2; } } } #endif */ #endif