aboutsummaryrefslogtreecommitdiff
path: root/CPP/Common/UTFConvert.h
diff options
context:
space:
mode:
authorIgor Pavlov <87184205+ip7z@users.noreply.github.com>2023-06-21 00:00:00 +0000
committerIgor Pavlov <87184205+ip7z@users.noreply.github.com>2023-12-17 14:59:19 +0500
commit5b39dc76f1bc82f941d5c800ab9f34407a06b53a (patch)
treefe5e17420300b715021a76328444088d32047963 /CPP/Common/UTFConvert.h
parent93be7d4abfd4233228f58ee1fbbcd76d91be66a4 (diff)
download7zip-23.01.tar.gz
7zip-23.01.tar.bz2
7zip-23.01.zip
23.0123.01
Diffstat (limited to 'CPP/Common/UTFConvert.h')
-rw-r--r--CPP/Common/UTFConvert.h64
1 files changed, 32 insertions, 32 deletions
diff --git a/CPP/Common/UTFConvert.h b/CPP/Common/UTFConvert.h
index 37c4975..94a8024 100644
--- a/CPP/Common/UTFConvert.h
+++ b/CPP/Common/UTFConvert.h
@@ -1,7 +1,7 @@
1// Common/UTFConvert.h 1// Common/UTFConvert.h
2 2
3#ifndef __COMMON_UTF_CONVERT_H 3#ifndef ZIP7_INC_COMMON_UTF_CONVERT_H
4#define __COMMON_UTF_CONVERT_H 4#define ZIP7_INC_COMMON_UTF_CONVERT_H
5 5
6#include "MyBuffer.h" 6#include "MyBuffer.h"
7#include "MyString.h" 7#include "MyString.h"
@@ -88,12 +88,12 @@ if (allowReduced == true) - it allows truncated last character-Utf8-sequence
88bool Check_UTF8_Buf(const char *src, size_t size, bool allowReduced) throw(); 88bool Check_UTF8_Buf(const char *src, size_t size, bool allowReduced) throw();
89bool CheckUTF8_AString(const AString &s) throw(); 89bool CheckUTF8_AString(const AString &s) throw();
90 90
91#define UTF_FLAG__FROM_UTF8__SURROGATE_ERROR (1 << 0) 91#define Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR (1 << 0)
92#define UTF_FLAG__FROM_UTF8__USE_ESCAPE (1 << 1) 92#define Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE (1 << 1)
93#define UTF_FLAG__FROM_UTF8__BMP_ESCAPE_CONVERT (1 << 2) 93#define Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT (1 << 2)
94 94
95/* 95/*
96UTF_FLAG__FROM_UTF8__SURROGATE_ERROR 96Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR
97 97
98 if (flag is NOT set) 98 if (flag is NOT set)
99 { 99 {
@@ -108,14 +108,14 @@ UTF_FLAG__FROM_UTF8__SURROGATE_ERROR
108 108
109 if (flag is set) 109 if (flag is set)
110 { 110 {
111 if (UTF_FLAG__FROM_UTF8__USE_ESCAPE is defined) 111 if (Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE is defined)
112 it generates ESCAPE for SINGLE-SURROGATE-8, 112 it generates ESCAPE for SINGLE-SURROGATE-8,
113 if (UTF_FLAG__FROM_UTF8__USE_ESCAPE is not defined) 113 if (Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE is not defined)
114 it generates U+fffd for SINGLE-SURROGATE-8, 114 it generates U+fffd for SINGLE-SURROGATE-8,
115 } 115 }
116 116
117 117
118UTF_FLAG__FROM_UTF8__USE_ESCAPE 118Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE
119 119
120 if (flag is NOT set) 120 if (flag is NOT set)
121 it generates (U+fffd) code for non-UTF-8 (invalid) characters 121 it generates (U+fffd) code for non-UTF-8 (invalid) characters
@@ -126,7 +126,7 @@ UTF_FLAG__FROM_UTF8__USE_ESCAPE
126 And later we can restore original UTF-8-RAW characters from (ESCAPE-16-21) codes. 126 And later we can restore original UTF-8-RAW characters from (ESCAPE-16-21) codes.
127 } 127 }
128 128
129UTF_FLAG__FROM_UTF8__BMP_ESCAPE_CONVERT 129Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT
130 130
131 if (flag is NOT set) 131 if (flag is NOT set)
132 { 132 {
@@ -146,9 +146,9 @@ Main USE CASES with UTF-8 <-> UTF-16 conversions:
146 146
147 WIN32: UTF-16-RAW -> UTF-8 (Archive) -> UTF-16-RAW 147 WIN32: UTF-16-RAW -> UTF-8 (Archive) -> UTF-16-RAW
148 { 148 {
149 set UTF_FLAG__FROM_UTF8__USE_ESCAPE 149 set Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE
150 Do NOT set UTF_FLAG__FROM_UTF8__SURROGATE_ERROR 150 Do NOT set Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR
151 Do NOT set UTF_FLAG__FROM_UTF8__BMP_ESCAPE_CONVERT 151 Do NOT set Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT
152 152
153 So we restore original SINGLE-SURROGATE-16 from single SINGLE-SURROGATE-8. 153 So we restore original SINGLE-SURROGATE-16 from single SINGLE-SURROGATE-8.
154 } 154 }
@@ -157,17 +157,17 @@ Main USE CASES with UTF-8 <-> UTF-16 conversions:
157 { 157 {
158 we want to restore original UTF-8-RAW sequence later from that ESCAPE-16. 158 we want to restore original UTF-8-RAW sequence later from that ESCAPE-16.
159 Set the flags: 159 Set the flags:
160 UTF_FLAG__FROM_UTF8__SURROGATE_ERROR 160 Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR
161 UTF_FLAG__FROM_UTF8__USE_ESCAPE 161 Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE
162 UTF_FLAG__FROM_UTF8__BMP_ESCAPE_CONVERT 162 Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT
163 } 163 }
164 164
165 MacOS: UTF-8-RAW -> UTF-16 (Intermediate / Archive) -> UTF-8-RAW 165 MacOS: UTF-8-RAW -> UTF-16 (Intermediate / Archive) -> UTF-8-RAW
166 { 166 {
167 we want to restore correct UTF-8 without any BMP processing: 167 we want to restore correct UTF-8 without any BMP processing:
168 Set the flags: 168 Set the flags:
169 UTF_FLAG__FROM_UTF8__SURROGATE_ERROR 169 Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR
170 UTF_FLAG__FROM_UTF8__USE_ESCAPE 170 Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE
171 } 171 }
172 172
173*/ 173*/
@@ -178,12 +178,12 @@ bool Convert_UTF8_Buf_To_Unicode(const char *src, size_t srcSize, UString &dest,
178bool ConvertUTF8ToUnicode_Flags(const AString &src, UString &dest, unsigned flags = 0); 178bool ConvertUTF8ToUnicode_Flags(const AString &src, UString &dest, unsigned flags = 0);
179bool ConvertUTF8ToUnicode(const AString &src, UString &dest); 179bool ConvertUTF8ToUnicode(const AString &src, UString &dest);
180 180
181#define UTF_FLAG__TO_UTF8__SURROGATE_ERROR (1 << 8) 181#define Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR (1 << 8)
182#define UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE (1 << 9) 182#define Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE (1 << 9)
183// #define UTF_FLAG__TO_UTF8__PARSE_HIGH_ESCAPE (1 << 10) 183// #define Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE (1 << 10)
184 184
185/* 185/*
186UTF_FLAG__TO_UTF8__SURROGATE_ERROR 186Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR
187 187
188 if (flag is NOT set) 188 if (flag is NOT set)
189 { 189 {
@@ -193,7 +193,7 @@ UTF_FLAG__TO_UTF8__SURROGATE_ERROR
193 193
194 In Linux : 194 In Linux :
195 use-case-1: UTF-8 -> UTF-16 -> UTF-8 doesn't generate UTF-16 SINGLE-SURROGATE, 195 use-case-1: UTF-8 -> UTF-16 -> UTF-8 doesn't generate UTF-16 SINGLE-SURROGATE,
196 if (UTF_FLAG__FROM_UTF8__SURROGATE_ERROR) is used. 196 if (Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR) is used.
197 use-case 2: UTF-16-7z (with SINGLE-SURROGATE from Windows) -> UTF-8 (Linux) 197 use-case 2: UTF-16-7z (with SINGLE-SURROGATE from Windows) -> UTF-8 (Linux)
198 will generate SINGLE-SURROGATE-UTF-8 here. 198 will generate SINGLE-SURROGATE-UTF-8 here.
199 } 199 }
@@ -206,17 +206,17 @@ UTF_FLAG__TO_UTF8__SURROGATE_ERROR
206 } 206 }
207 207
208 208
209UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE 209Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE
210 210
211 if (flag is NOT set) it doesn't extract raw 8-bit symbol from Escape-Plane-16 211 if (flag is NOT set) it doesn't extract raw 8-bit symbol from Escape-Plane-16
212 if (flag is set) it extracts raw 8-bit symbol from Escape-Plane-16 212 if (flag is set) it extracts raw 8-bit symbol from Escape-Plane-16
213 213
214 in Linux we need some way to extract NON-UTF8 RAW 8-bits from BMP (UTF-16 7z archive): 214 in Linux we need some way to extract NON-UTF8 RAW 8-bits from BMP (UTF-16 7z archive):
215 if (we use High-Escape-Plane), we can transfer BMP escapes to High-Escape-Plane. 215 if (we use High-Escape-Plane), we can transfer BMP escapes to High-Escape-Plane.
216 if (we don't use High-Escape-Plane), we must use UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE. 216 if (we don't use High-Escape-Plane), we must use Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE.
217 217
218 218
219UTF_FLAG__TO_UTF8__PARSE_HIGH_ESCAPE 219Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE
220 // that flag affects the code only if (wchar_t is 32-bit) 220 // that flag affects the code only if (wchar_t is 32-bit)
221 // that mode with high-escape can be disabled now in UTFConvert.cpp 221 // that mode with high-escape can be disabled now in UTFConvert.cpp
222 if (flag is NOT set) 222 if (flag is NOT set)
@@ -228,19 +228,19 @@ Main use cases:
228 228
229WIN32 : UTF-16-RAW -> UTF-8 (archive) -> UTF-16-RAW 229WIN32 : UTF-16-RAW -> UTF-8 (archive) -> UTF-16-RAW
230 { 230 {
231 Do NOT set UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE. 231 Do NOT set Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE.
232 Do NOT set UTF_FLAG__TO_UTF8__SURROGATE_ERROR. 232 Do NOT set Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR.
233 So we restore original UTF-16-RAW. 233 So we restore original UTF-16-RAW.
234 } 234 }
235 235
236Linux : UTF-8 with Escapes -> UTF-16 (7z archive) -> UTF-8 with Escapes 236Linux : UTF-8 with Escapes -> UTF-16 (7z archive) -> UTF-8 with Escapes
237 set UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE to extract non-UTF from 7z archive 237 set Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE to extract non-UTF from 7z archive
238 set UTF_FLAG__TO_UTF8__PARSE_HIGH_ESCAPE for intermediate UTF-16. 238 set Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE for intermediate UTF-16.
239 Note: high escape mode can be ignored now in UTFConvert.cpp 239 Note: high escape mode can be ignored now in UTFConvert.cpp
240 240
241macOS: 241macOS:
242 the system doesn't support incorrect UTF-8 in file names. 242 the system doesn't support incorrect UTF-8 in file names.
243 set UTF_FLAG__TO_UTF8__SURROGATE_ERROR 243 set Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR
244*/ 244*/
245 245
246extern unsigned g_Unicode_To_UTF8_Flags; 246extern unsigned g_Unicode_To_UTF8_Flags;
@@ -261,7 +261,7 @@ bool Unicode_IsThere_BmpEscape(const UString &src);
261bool Unicode_IsThere_Utf16SurrogateError(const UString &src); 261bool Unicode_IsThere_Utf16SurrogateError(const UString &src);
262*/ 262*/
263 263
264#ifdef _WCHART_IS_16BIT 264#ifdef Z7_WCHART_IS_16BIT
265#define Convert_UnicodeEsc16_To_UnicodeEscHigh(s) 265#define Convert_UnicodeEsc16_To_UnicodeEscHigh(s)
266#else 266#else
267void Convert_UnicodeEsc16_To_UnicodeEscHigh(UString &s); 267void Convert_UnicodeEsc16_To_UnicodeEscHigh(UString &s);