diff options
author | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2023-06-21 00:00:00 +0000 |
---|---|---|
committer | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2023-12-17 14:59:19 +0500 |
commit | 5b39dc76f1bc82f941d5c800ab9f34407a06b53a (patch) | |
tree | fe5e17420300b715021a76328444088d32047963 /CPP/Common/UTFConvert.h | |
parent | 93be7d4abfd4233228f58ee1fbbcd76d91be66a4 (diff) | |
download | 7zip-23.01.tar.gz 7zip-23.01.tar.bz2 7zip-23.01.zip |
23.01
Diffstat (limited to 'CPP/Common/UTFConvert.h')
-rw-r--r-- | CPP/Common/UTFConvert.h | 64 |
1 files changed, 32 insertions, 32 deletions
diff --git a/CPP/Common/UTFConvert.h b/CPP/Common/UTFConvert.h index 37c4975..94a8024 100644 --- a/CPP/Common/UTFConvert.h +++ b/CPP/Common/UTFConvert.h | |||
@@ -1,7 +1,7 @@ | |||
1 | // Common/UTFConvert.h | 1 | // Common/UTFConvert.h |
2 | 2 | ||
3 | #ifndef __COMMON_UTF_CONVERT_H | 3 | #ifndef ZIP7_INC_COMMON_UTF_CONVERT_H |
4 | #define __COMMON_UTF_CONVERT_H | 4 | #define ZIP7_INC_COMMON_UTF_CONVERT_H |
5 | 5 | ||
6 | #include "MyBuffer.h" | 6 | #include "MyBuffer.h" |
7 | #include "MyString.h" | 7 | #include "MyString.h" |
@@ -88,12 +88,12 @@ if (allowReduced == true) - it allows truncated last character-Utf8-sequence | |||
88 | bool Check_UTF8_Buf(const char *src, size_t size, bool allowReduced) throw(); | 88 | bool Check_UTF8_Buf(const char *src, size_t size, bool allowReduced) throw(); |
89 | bool CheckUTF8_AString(const AString &s) throw(); | 89 | bool CheckUTF8_AString(const AString &s) throw(); |
90 | 90 | ||
91 | #define UTF_FLAG__FROM_UTF8__SURROGATE_ERROR (1 << 0) | 91 | #define Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR (1 << 0) |
92 | #define UTF_FLAG__FROM_UTF8__USE_ESCAPE (1 << 1) | 92 | #define Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE (1 << 1) |
93 | #define UTF_FLAG__FROM_UTF8__BMP_ESCAPE_CONVERT (1 << 2) | 93 | #define Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT (1 << 2) |
94 | 94 | ||
95 | /* | 95 | /* |
96 | UTF_FLAG__FROM_UTF8__SURROGATE_ERROR | 96 | Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR |
97 | 97 | ||
98 | if (flag is NOT set) | 98 | if (flag is NOT set) |
99 | { | 99 | { |
@@ -108,14 +108,14 @@ UTF_FLAG__FROM_UTF8__SURROGATE_ERROR | |||
108 | 108 | ||
109 | if (flag is set) | 109 | if (flag is set) |
110 | { | 110 | { |
111 | if (UTF_FLAG__FROM_UTF8__USE_ESCAPE is defined) | 111 | if (Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE is defined) |
112 | it generates ESCAPE for SINGLE-SURROGATE-8, | 112 | it generates ESCAPE for SINGLE-SURROGATE-8, |
113 | if (UTF_FLAG__FROM_UTF8__USE_ESCAPE is not defined) | 113 | if (Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE is not defined) |
114 | it generates U+fffd for SINGLE-SURROGATE-8, | 114 | it generates U+fffd for SINGLE-SURROGATE-8, |
115 | } | 115 | } |
116 | 116 | ||
117 | 117 | ||
118 | UTF_FLAG__FROM_UTF8__USE_ESCAPE | 118 | Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE |
119 | 119 | ||
120 | if (flag is NOT set) | 120 | if (flag is NOT set) |
121 | it generates (U+fffd) code for non-UTF-8 (invalid) characters | 121 | it generates (U+fffd) code for non-UTF-8 (invalid) characters |
@@ -126,7 +126,7 @@ UTF_FLAG__FROM_UTF8__USE_ESCAPE | |||
126 | And later we can restore original UTF-8-RAW characters from (ESCAPE-16-21) codes. | 126 | And later we can restore original UTF-8-RAW characters from (ESCAPE-16-21) codes. |
127 | } | 127 | } |
128 | 128 | ||
129 | UTF_FLAG__FROM_UTF8__BMP_ESCAPE_CONVERT | 129 | Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT |
130 | 130 | ||
131 | if (flag is NOT set) | 131 | if (flag is NOT set) |
132 | { | 132 | { |
@@ -146,9 +146,9 @@ Main USE CASES with UTF-8 <-> UTF-16 conversions: | |||
146 | 146 | ||
147 | WIN32: UTF-16-RAW -> UTF-8 (Archive) -> UTF-16-RAW | 147 | WIN32: UTF-16-RAW -> UTF-8 (Archive) -> UTF-16-RAW |
148 | { | 148 | { |
149 | set UTF_FLAG__FROM_UTF8__USE_ESCAPE | 149 | set Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE |
150 | Do NOT set UTF_FLAG__FROM_UTF8__SURROGATE_ERROR | 150 | Do NOT set Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR |
151 | Do NOT set UTF_FLAG__FROM_UTF8__BMP_ESCAPE_CONVERT | 151 | Do NOT set Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT |
152 | 152 | ||
153 | So we restore original SINGLE-SURROGATE-16 from single SINGLE-SURROGATE-8. | 153 | So we restore original SINGLE-SURROGATE-16 from single SINGLE-SURROGATE-8. |
154 | } | 154 | } |
@@ -157,17 +157,17 @@ Main USE CASES with UTF-8 <-> UTF-16 conversions: | |||
157 | { | 157 | { |
158 | we want to restore the original UTF-8-RAW sequence later from that ESCAPE-16. | 158 | we want to restore the original UTF-8-RAW sequence later from that ESCAPE-16. |
159 | Set the flags: | 159 | Set the flags: |
160 | UTF_FLAG__FROM_UTF8__SURROGATE_ERROR | 160 | Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR |
161 | UTF_FLAG__FROM_UTF8__USE_ESCAPE | 161 | Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE |
162 | UTF_FLAG__FROM_UTF8__BMP_ESCAPE_CONVERT | 162 | Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT |
163 | } | 163 | } |
164 | 164 | ||
165 | MacOS: UTF-8-RAW -> UTF-16 (Intermediate / Archive) -> UTF-8-RAW | 165 | MacOS: UTF-8-RAW -> UTF-16 (Intermediate / Archive) -> UTF-8-RAW |
166 | { | 166 | { |
167 | we want to restore correct UTF-8 without any BMP processing: | 167 | we want to restore correct UTF-8 without any BMP processing: |
168 | Set the flags: | 168 | Set the flags: |
169 | UTF_FLAG__FROM_UTF8__SURROGATE_ERROR | 169 | Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR |
170 | UTF_FLAG__FROM_UTF8__USE_ESCAPE | 170 | Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE |
171 | } | 171 | } |
172 | 172 | ||
173 | */ | 173 | */ |
@@ -178,12 +178,12 @@ bool Convert_UTF8_Buf_To_Unicode(const char *src, size_t srcSize, UString &dest, | |||
178 | bool ConvertUTF8ToUnicode_Flags(const AString &src, UString &dest, unsigned flags = 0); | 178 | bool ConvertUTF8ToUnicode_Flags(const AString &src, UString &dest, unsigned flags = 0); |
179 | bool ConvertUTF8ToUnicode(const AString &src, UString &dest); | 179 | bool ConvertUTF8ToUnicode(const AString &src, UString &dest); |
180 | 180 | ||
181 | #define UTF_FLAG__TO_UTF8__SURROGATE_ERROR (1 << 8) | 181 | #define Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR (1 << 8) |
182 | #define UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE (1 << 9) | 182 | #define Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE (1 << 9) |
183 | // #define UTF_FLAG__TO_UTF8__PARSE_HIGH_ESCAPE (1 << 10) | 183 | // #define Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE (1 << 10) |
184 | 184 | ||
185 | /* | 185 | /* |
186 | UTF_FLAG__TO_UTF8__SURROGATE_ERROR | 186 | Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR |
187 | 187 | ||
188 | if (flag is NOT set) | 188 | if (flag is NOT set) |
189 | { | 189 | { |
@@ -193,7 +193,7 @@ UTF_FLAG__TO_UTF8__SURROGATE_ERROR | |||
193 | 193 | ||
194 | In Linux : | 194 | In Linux : |
195 | use-case-1: UTF-8 -> UTF-16 -> UTF-8 doesn't generate UTF-16 SINGLE-SURROGATE, | 195 | use-case-1: UTF-8 -> UTF-16 -> UTF-8 doesn't generate UTF-16 SINGLE-SURROGATE, |
196 | if (UTF_FLAG__FROM_UTF8__SURROGATE_ERROR) is used. | 196 | if (Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR) is used. |
197 | use-case 2: UTF-16-7z (with SINGLE-SURROGATE from Windows) -> UTF-8 (Linux) | 197 | use-case 2: UTF-16-7z (with SINGLE-SURROGATE from Windows) -> UTF-8 (Linux) |
198 | will generate SINGLE-SURROGATE-UTF-8 here. | 198 | will generate SINGLE-SURROGATE-UTF-8 here. |
199 | } | 199 | } |
@@ -206,17 +206,17 @@ UTF_FLAG__TO_UTF8__SURROGATE_ERROR | |||
206 | } | 206 | } |
207 | 207 | ||
208 | 208 | ||
209 | UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE | 209 | Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE |
210 | 210 | ||
211 | if (flag is NOT set) it doesn't extract raw 8-bit symbol from Escape-Plane-16 | 211 | if (flag is NOT set) it doesn't extract raw 8-bit symbol from Escape-Plane-16 |
212 | if (flag is set) it extracts raw 8-bit symbol from Escape-Plane-16 | 212 | if (flag is set) it extracts raw 8-bit symbol from Escape-Plane-16 |
213 | 213 | ||
214 | in Linux we need some way to extract NON-UTF8 RAW 8-bits from BMP (UTF-16 7z archive): | 214 | in Linux we need some way to extract NON-UTF8 RAW 8-bits from BMP (UTF-16 7z archive): |
215 | if (we use High-Escape-Plane), we can transfer BMP escapes to High-Escape-Plane. | 215 | if (we use High-Escape-Plane), we can transfer BMP escapes to High-Escape-Plane. |
216 | if (we don't use High-Escape-Plane), we must use UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE. | 216 | if (we don't use High-Escape-Plane), we must use Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE. |
217 | 217 | ||
218 | 218 | ||
219 | UTF_FLAG__TO_UTF8__PARSE_HIGH_ESCAPE | 219 | Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE |
220 | // that flag affects the code only if (wchar_t is 32-bit) | 220 | // that flag affects the code only if (wchar_t is 32-bit) |
221 | // that mode with high-escape can be disabled now in UTFConvert.cpp | 221 | // that mode with high-escape can be disabled now in UTFConvert.cpp |
222 | if (flag is NOT set) | 222 | if (flag is NOT set) |
@@ -228,19 +228,19 @@ Main use cases: | |||
228 | 228 | ||
229 | WIN32 : UTF-16-RAW -> UTF-8 (archive) -> UTF-16-RAW | 229 | WIN32 : UTF-16-RAW -> UTF-8 (archive) -> UTF-16-RAW |
230 | { | 230 | { |
231 | Do NOT set UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE. | 231 | Do NOT set Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE. |
232 | Do NOT set UTF_FLAG__TO_UTF8__SURROGATE_ERROR. | 232 | Do NOT set Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR. |
233 | So we restore original UTF-16-RAW. | 233 | So we restore original UTF-16-RAW. |
234 | } | 234 | } |
235 | 235 | ||
236 | Linux : UTF-8 with Escapes -> UTF-16 (7z archive) -> UTF-8 with Escapes | 236 | Linux : UTF-8 with Escapes -> UTF-16 (7z archive) -> UTF-8 with Escapes |
237 | set UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE to extract non-UTF from 7z archive | 237 | set Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE to extract non-UTF from 7z archive |
238 | set UTF_FLAG__TO_UTF8__PARSE_HIGH_ESCAPE for intermediate UTF-16. | 238 | set Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE for intermediate UTF-16. |
239 | Note: high escape mode can be ignored now in UTFConvert.cpp | 239 | Note: high escape mode can be ignored now in UTFConvert.cpp |
240 | 240 | ||
241 | macOS: | 241 | macOS: |
242 | the system doesn't support incorrect UTF-8 in file names. | 242 | the system doesn't support incorrect UTF-8 in file names. |
243 | set UTF_FLAG__TO_UTF8__SURROGATE_ERROR | 243 | set Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR |
244 | */ | 244 | */ |
245 | 245 | ||
246 | extern unsigned g_Unicode_To_UTF8_Flags; | 246 | extern unsigned g_Unicode_To_UTF8_Flags; |
@@ -261,7 +261,7 @@ bool Unicode_IsThere_BmpEscape(const UString &src); | |||
261 | bool Unicode_IsThere_Utf16SurrogateError(const UString &src); | 261 | bool Unicode_IsThere_Utf16SurrogateError(const UString &src); |
262 | */ | 262 | */ |
263 | 263 | ||
264 | #ifdef _WCHART_IS_16BIT | 264 | #ifdef Z7_WCHART_IS_16BIT |
265 | #define Convert_UnicodeEsc16_To_UnicodeEscHigh(s) | 265 | #define Convert_UnicodeEsc16_To_UnicodeEscHigh(s) |
266 | #else | 266 | #else |
267 | void Convert_UnicodeEsc16_To_UnicodeEscHigh(UString &s); | 267 | void Convert_UnicodeEsc16_To_UnicodeEscHigh(UString &s); |