diff options
author | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2021-12-27 00:00:00 +0000 |
---|---|---|
committer | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2022-03-18 15:35:13 +0500 |
commit | f19f813537c7aea1c20749c914e756b54a9c3cf5 (patch) | |
tree | 816ba62ca7c0fa19f2eb46d9e9d6f7dd7c3a744d /CPP/Common/UTFConvert.h | |
parent | 98e06a519b63b81986abe76d28887f6984a7732b (diff) | |
download | 7zip-21.07.tar.gz 7zip-21.07.tar.bz2 7zip-21.07.zip |
'21.07'21.07
Diffstat (limited to 'CPP/Common/UTFConvert.h')
-rw-r--r-- | CPP/Common/UTFConvert.h | 384 |
1 files changed, 384 insertions, 0 deletions
diff --git a/CPP/Common/UTFConvert.h b/CPP/Common/UTFConvert.h new file mode 100644 index 0000000..37c4975 --- /dev/null +++ b/CPP/Common/UTFConvert.h | |||
@@ -0,0 +1,384 @@ | |||
1 | // Common/UTFConvert.h | ||
2 | |||
3 | #ifndef __COMMON_UTF_CONVERT_H | ||
4 | #define __COMMON_UTF_CONVERT_H | ||
5 | |||
6 | #include "MyBuffer.h" | ||
7 | #include "MyString.h" | ||
8 | |||
9 | struct CUtf8Check | ||
10 | { | ||
11 | // Byte MaxByte; // in original src stream | ||
12 | bool NonUtf; | ||
13 | bool ZeroChar; | ||
14 | bool SingleSurrogate; | ||
15 | bool Escape; | ||
16 | bool Truncated; | ||
17 | UInt32 MaxHighPoint; // only for points >= 0x80 | ||
18 | |||
19 | CUtf8Check() { Clear(); } | ||
20 | |||
21 | void Clear() | ||
22 | { | ||
23 | // MaxByte = 0; | ||
24 | NonUtf = false; | ||
25 | ZeroChar = false; | ||
26 | SingleSurrogate = false; | ||
27 | Escape = false; | ||
28 | Truncated = false; | ||
29 | MaxHighPoint = 0; | ||
30 | } | ||
31 | |||
32 | void Update(const CUtf8Check &c) | ||
33 | { | ||
34 | if (c.NonUtf) NonUtf = true; | ||
35 | if (c.ZeroChar) ZeroChar = true; | ||
36 | if (c.SingleSurrogate) SingleSurrogate = true; | ||
37 | if (c.Escape) Escape = true; | ||
38 | if (c.Truncated) Truncated = true; | ||
39 | if (MaxHighPoint < c.MaxHighPoint) MaxHighPoint = c.MaxHighPoint; | ||
40 | } | ||
41 | |||
42 | void PrintStatus(AString &s) const | ||
43 | { | ||
44 | s.Empty(); | ||
45 | |||
46 | // s.Add_OptSpaced("MaxByte="); | ||
47 | // s.Add_UInt32(MaxByte); | ||
48 | |||
49 | if (NonUtf) s.Add_OptSpaced("non-UTF8"); | ||
50 | if (ZeroChar) s.Add_OptSpaced("ZeroChar"); | ||
51 | if (SingleSurrogate) s.Add_OptSpaced("SingleSurrogate"); | ||
52 | if (Escape) s.Add_OptSpaced("Escape"); | ||
53 | if (Truncated) s.Add_OptSpaced("Truncated"); | ||
54 | |||
55 | if (MaxHighPoint != 0) | ||
56 | { | ||
57 | s.Add_OptSpaced("MaxUnicode="); | ||
58 | s.Add_UInt32(MaxHighPoint); | ||
59 | } | ||
60 | } | ||
61 | |||
62 | |||
63 | bool IsOK(bool allowReduced = false) const | ||
64 | { | ||
65 | if (NonUtf || SingleSurrogate || ZeroChar) | ||
66 | return false; | ||
67 | if (MaxHighPoint >= 0x110000) | ||
68 | return false; | ||
69 | if (Truncated && !allowReduced) | ||
70 | return false; | ||
71 | return true; | ||
72 | } | ||
73 | |||
74 | // it checks full buffer as specified in (size) and it doesn't stop on zero char | ||
75 | void Check_Buf(const char *src, size_t size) throw(); | ||
76 | |||
77 | void Check_AString(const AString &s) throw() | ||
78 | { | ||
79 | Check_Buf(s.Ptr(), s.Len()); | ||
80 | } | ||
81 | }; | ||
82 | |||
83 | /* | ||
84 | if (allowReduced == false) - all UTF-8 character sequences must be finished. | ||
85 | if (allowReduced == true) - it allows truncated last character-Utf8-sequence | ||
86 | */ | ||
87 | |||
// Checks the (size) bytes at (src). With allowReduced == false every UTF-8
// sequence must be finished; with allowReduced == true a truncated last
// sequence is allowed.
// NOTE(review): presumably returns true when the buffer is accepted - confirm in UTFConvert.cpp.
88 | bool Check_UTF8_Buf(const char *src, size_t size, bool allowReduced) throw(); | |
// AString counterpart of the check above.
// NOTE(review): whether it permits a truncated last sequence is not visible in this header - confirm.
89 | bool CheckUTF8_AString(const AString &s) throw(); | |
90 | |||
// Single-bit flags (bits 0, 1 and 2) for the UTF-8 -> Unicode direction.
// NOTE(review): presumably combined with | and passed as the (flags) argument of
// Convert_UTF8_Buf_To_Unicode() / ConvertUTF8ToUnicode_Flags() - confirm.
91 | #define UTF_FLAG__FROM_UTF8__SURROGATE_ERROR (1 << 0) | |
92 | #define UTF_FLAG__FROM_UTF8__USE_ESCAPE (1 << 1) | |
93 | #define UTF_FLAG__FROM_UTF8__BMP_ESCAPE_CONVERT (1 << 2) | |
94 | |||
95 | /* | ||
96 | UTF_FLAG__FROM_UTF8__SURROGATE_ERROR | ||
97 | |||
98 | if (flag is NOT set) | ||
99 | { | ||
100 | it processes SINGLE-SURROGATE-8 as valid Unicode point. | ||
101 | it converts SINGLE-SURROGATE-8 to SINGLE-SURROGATE-16 | ||
102 | Note: some sequences of two SINGLE-SURROGATE-8 points | |
103 | will generate correct SURROGATE-16-PAIR, and | ||
104 | that SURROGATE-16-PAIR later will be converted to correct | ||
105 | UTF8-SURROGATE-21 point. So we don't restore original | ||
106 | STR-8 sequence in that case. | ||
107 | } | ||
108 | |||
109 | if (flag is set) | ||
110 | { | ||
111 | if (UTF_FLAG__FROM_UTF8__USE_ESCAPE is set) | |
112 | it generates ESCAPE for SINGLE-SURROGATE-8, | |
113 | if (UTF_FLAG__FROM_UTF8__USE_ESCAPE is not set) | |
114 | it generates U+fffd for SINGLE-SURROGATE-8, | |
115 | } | ||
116 | |||
117 | |||
118 | UTF_FLAG__FROM_UTF8__USE_ESCAPE | ||
119 | |||
120 | if (flag is NOT set) | ||
121 | it generates (U+fffd) code for non-UTF-8 (invalid) characters | ||
122 | |||
123 | if (flag is set) | ||
124 | { | ||
125 | It generates (ESCAPE) codes for NON-UTF-8 (invalid) characters. | ||
126 | And later we can restore original UTF-8-RAW characters from (ESCAPE-16-21) codes. | ||
127 | } | ||
128 | |||
129 | UTF_FLAG__FROM_UTF8__BMP_ESCAPE_CONVERT | ||
130 | |||
131 | if (flag is NOT set) | ||
132 | { | ||
133 | it processes ESCAPE-8 points like any other Unicode points. | |
134 | In Linux: ESCAPE-16 will mean two different ESCAPE-8 sequences, | |
135 | so we need HIGH-ESCAPE-PLANE-21 to restore UTF-8-RAW -> UTF-16 -> UTF-8-RAW | ||
136 | } | ||
137 | |||
138 | if (flag is set) | ||
139 | { | ||
140 | it generates ESCAPE-16-21 for ESCAPE-8 points | ||
141 | so we can restore UTF-8-RAW -> UTF-16 -> UTF-8-RAW without HIGH-ESCAPE-PLANE-21. | ||
142 | } | ||
143 | |||
144 | |||
145 | Main USE CASES with UTF-8 <-> UTF-16 conversions: | ||
146 | |||
147 | WIN32: UTF-16-RAW -> UTF-8 (Archive) -> UTF-16-RAW | ||
148 | { | ||
149 | set UTF_FLAG__FROM_UTF8__USE_ESCAPE | ||
150 | Do NOT set UTF_FLAG__FROM_UTF8__SURROGATE_ERROR | ||
151 | Do NOT set UTF_FLAG__FROM_UTF8__BMP_ESCAPE_CONVERT | ||
152 | |||
153 | So we restore original SINGLE-SURROGATE-16 from single SINGLE-SURROGATE-8. | ||
154 | } | ||
155 | |||
156 | Linux: UTF-8-RAW -> UTF-16 (Intermediate / Archive) -> UTF-8-RAW | ||
157 | { | ||
158 | we want to restore original UTF-8-RAW sequence later from that ESCAPE-16. | |
159 | Set the flags: | ||
160 | UTF_FLAG__FROM_UTF8__SURROGATE_ERROR | ||
161 | UTF_FLAG__FROM_UTF8__USE_ESCAPE | ||
162 | UTF_FLAG__FROM_UTF8__BMP_ESCAPE_CONVERT | ||
163 | } | ||
164 | |||
165 | MacOS: UTF-8-RAW -> UTF-16 (Intermediate / Archive) -> UTF-8-RAW | ||
166 | { | ||
167 | we want to restore correct UTF-8 without any BMP processing: | ||
168 | Set the flags: | ||
169 | UTF_FLAG__FROM_UTF8__SURROGATE_ERROR | ||
170 | UTF_FLAG__FROM_UTF8__USE_ESCAPE | ||
171 | } | ||
172 | |||
173 | */ | ||
174 | |||
175 | // zero char is not allowed in (src) buf | ||
// Converts the (srcSize) bytes at (src) and writes the result to (dest).
// (flags) defaults to 0, i.e. no flag bits set.
// NOTE(review): (flags) presumably takes UTF_FLAG__FROM_UTF8__* bits, and the bool
// result presumably reports whether the input converted without errors - confirm in UTFConvert.cpp.
176 | bool Convert_UTF8_Buf_To_Unicode(const char *src, size_t srcSize, UString &dest, unsigned flags = 0); | |
177 | | |
// Same conversion with an AString as the source; (flags) also defaults to 0.
178 | bool ConvertUTF8ToUnicode_Flags(const AString &src, UString &dest, unsigned flags = 0); | |
// Variant without a (flags) argument.
// NOTE(review): which flags it applies internally is not visible in this header - confirm.
179 | bool ConvertUTF8ToUnicode(const AString &src, UString &dest); | |
180 | |||
// Single-bit flags (bits 8 and 9) for the Unicode -> UTF-8 direction; the bit 10
// flag (PARSE_HIGH_ESCAPE) is currently commented out.
// NOTE(review): presumably passed as the (flags) argument of ConvertUnicodeToUTF8_Flags() - confirm.
181 | #define UTF_FLAG__TO_UTF8__SURROGATE_ERROR (1 << 8) | |
182 | #define UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE (1 << 9) | |
183 | // #define UTF_FLAG__TO_UTF8__PARSE_HIGH_ESCAPE (1 << 10) | |
184 | |||
185 | /* | ||
186 | UTF_FLAG__TO_UTF8__SURROGATE_ERROR | ||
187 | |||
188 | if (flag is NOT set) | ||
189 | { | ||
190 | we extract SINGLE-SURROGATE as normal UTF-8 | ||
191 | |||
192 | In Windows : for UTF-16-RAW <-> UTF-8 (archive) <-> UTF-16-RAW. | |
193 | |||
194 | In Linux : | ||
195 | use-case-1: UTF-8 -> UTF-16 -> UTF-8 doesn't generate UTF-16 SINGLE-SURROGATE, | ||
196 | if (UTF_FLAG__FROM_UTF8__SURROGATE_ERROR) is used. | ||
197 | use-case 2: UTF-16-7z (with SINGLE-SURROGATE from Windows) -> UTF-8 (Linux) | ||
198 | will generate SINGLE-SURROGATE-UTF-8 here. | ||
199 | } | ||
200 | |||
201 | if (flag is set) | ||
202 | { | ||
203 | we generate UTF_REPLACEMENT_CHAR (0xfffd) for SINGLE_SURROGATE | ||
204 | it can be used for compatibility mode with WIN32 UTF function | ||
205 | or if we want UTF-8 stream without any errors | ||
206 | } | ||
207 | |||
208 | |||
209 | UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE | ||
210 | |||
211 | if (flag is NOT set) it doesn't extract raw 8-bit symbol from Escape-Plane-16 | ||
212 | if (flag is set) it extracts raw 8-bit symbol from Escape-Plane-16 | ||
213 | |||
214 | in Linux we need some way to extract NON-UTF8 RAW 8-bits from BMP (UTF-16 7z archive): | ||
215 | if (we use High-Escape-Plane), we can transfer BMP escapes to High-Escape-Plane. | ||
216 | if (we don't use High-Escape-Plane), we must use UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE. | ||
217 | |||
218 | |||
219 | UTF_FLAG__TO_UTF8__PARSE_HIGH_ESCAPE | ||
220 | // that flag affects the code only if (wchar_t is 32-bit) | ||
221 | // that mode with high-escape can be disabled now in UTFConvert.cpp | ||
222 | if (flag is NOT set) | ||
223 | it doesn't extract raw 8-bit symbol from High-Escape-Plane | ||
224 | if (flag is set) | ||
225 | it extracts raw 8-bit symbol from High-Escape-Plane | ||
226 | |||
227 | Main use cases: | ||
228 | |||
229 | WIN32 : UTF-16-RAW -> UTF-8 (archive) -> UTF-16-RAW | ||
230 | { | ||
231 | Do NOT set UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE. | ||
232 | Do NOT set UTF_FLAG__TO_UTF8__SURROGATE_ERROR. | ||
233 | So we restore original UTF-16-RAW. | ||
234 | } | ||
235 | |||
236 | Linux : UTF-8 with Escapes -> UTF-16 (7z archive) -> UTF-8 with Escapes | |
237 | set UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE to extract non-UTF from 7z archive | ||
238 | set UTF_FLAG__TO_UTF8__PARSE_HIGH_ESCAPE for intermediate UTF-16. | ||
239 | Note: high escape mode can be ignored now in UTFConvert.cpp | |
240 | |||
241 | macOS: | ||
242 | the system doesn't support incorrect UTF-8 in file names. | ||
243 | set UTF_FLAG__TO_UTF8__SURROGATE_ERROR | ||
244 | */ | ||
245 | |||
// Global flags variable; only declared here, the definition lives in another file.
// NOTE(review): presumably holds the UTF_FLAG__TO_UTF8__* bits used by the
// functions below that take no (flags) argument - confirm in UTFConvert.cpp.
246 | extern unsigned g_Unicode_To_UTF8_Flags; | |
247 | | |
// Converts (src) and writes the result to (dest); (flags) defaults to 0, i.e. no flag bits set.
248 | void ConvertUnicodeToUTF8_Flags(const UString &src, AString &dest, unsigned flags = 0); | |
// Variant without a (flags) argument.
249 | void ConvertUnicodeToUTF8(const UString &src, AString &dest); | |
250 | | |
// Variant that writes the result into a CByteBuffer instead of an AString.
251 | void Convert_Unicode_To_UTF8_Buf(const UString &src, CByteBuffer &dest); | |
252 | |||
253 | /* | ||
254 | #ifndef _WIN32 | ||
255 | void Convert_UTF16_To_UTF32(const UString &src, UString &dest); | ||
256 | void Convert_UTF32_To_UTF16(const UString &src, UString &dest); | ||
257 | bool UTF32_IsThere_BigPoint(const UString &src); | ||
258 | bool Unicode_IsThere_BmpEscape(const UString &src); | ||
259 | #endif | ||
260 | |||
261 | bool Unicode_IsThere_Utf16SurrogateError(const UString &src); | ||
262 | */ | ||
263 | |||
// When _WCHART_IS_16BIT is defined, Convert_UnicodeEsc16_To_UnicodeEscHigh(s) is a
// macro that expands to nothing, so a call has no effect and does not evaluate (s).
// Otherwise it is a real function taking the string by non-const reference.
// NOTE(review): presumably it rewrites (s) in place - confirm in UTFConvert.cpp.
264 | #ifdef _WCHART_IS_16BIT | |
265 | #define Convert_UnicodeEsc16_To_UnicodeEscHigh(s) | |
266 | #else | |
267 | void Convert_UnicodeEsc16_To_UnicodeEscHigh(UString &s); | |
268 | #endif | |
269 | |||
270 | /* | ||
271 | // #include "../../C/CpuArch.h" | ||
272 | |||
273 | // ---------- Utf16 Little endian functions ---------- | ||
274 | |||
275 | // We store 16-bit surrogates even in 32-bit WCHARs in Linux. | ||
276 | // So now we don't use the following code: | ||
277 | |||
278 | #if WCHAR_MAX > 0xffff | ||
279 | |||
280 | // void *p : pointer to src bytes stream | ||
281 | // size_t len : num Utf16 characters : it can include or not include NULL character | ||
282 | |||
283 | inline size_t Utf16LE__Get_Num_WCHARs(const void *p, size_t len) | ||
284 | { | ||
285 | #if WCHAR_MAX > 0xffff | ||
286 | size_t num_wchars = 0; | ||
287 | for (size_t i = 0; i < len; i++) | ||
288 | { | ||
289 | wchar_t c = GetUi16(p); | ||
290 | p = (const void *)((const Byte *)p + 2); | ||
291 | if (c >= 0xd800 && c < 0xdc00 && i + 1 != len) | ||
292 | { | ||
293 | wchar_t c2 = GetUi16(p); | ||
294 | if (c2 >= 0xdc00 && c2 < 0xe000) | ||
295 | { | ||
296 | c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff); | ||
297 | p = (const void *)((const Byte *)p + 2); | ||
298 | i++; | ||
299 | } | ||
300 | } | ||
301 | num_wchars++; | ||
302 | } | ||
303 | return num_wchars; | ||
304 | #else | ||
305 | UNUSED_VAR(p) | ||
306 | return len; | ||
307 | #endif | ||
308 | } | ||
309 | |||
310 | // #include <stdio.h> | ||
311 | |||
312 | inline wchar_t *Utf16LE__To_WCHARs_Sep(const void *p, size_t len, wchar_t *dest) | ||
313 | { | ||
314 | for (size_t i = 0; i < len; i++) | ||
315 | { | ||
316 | wchar_t c = GetUi16(p); | ||
317 | p = (const void *)((const Byte *)p + 2); | ||
318 | |||
319 | #if WCHAR_PATH_SEPARATOR != L'/' | ||
320 | if (c == L'/') | ||
321 | c = WCHAR_PATH_SEPARATOR; | ||
322 | #endif | ||
323 | |||
324 | #if WCHAR_MAX > 0xffff | ||
325 | |||
326 | if (c >= 0xd800 && c < 0xdc00 && i + 1 != len) | ||
327 | { | ||
328 | wchar_t c2 = GetUi16(p); | ||
329 | if (c2 >= 0xdc00 && c2 < 0xe000) | ||
330 | { | ||
331 | // printf("\nSurragate : %4x %4x -> ", (int)c, (int)c2); | ||
332 | c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff); | ||
333 | p = (const void *)((const Byte *)p + 2); | ||
334 | i++; | ||
335 | // printf("%4x\n", (int)c); | ||
336 | } | ||
337 | } | ||
338 | |||
339 | #endif | ||
340 | |||
341 | *dest++ = c; | ||
342 | } | ||
343 | return dest; | ||
344 | } | ||
345 | |||
346 | |||
347 | inline size_t Get_Num_Utf16_chars_from_wchar_string(const wchar_t *p) | ||
348 | { | ||
349 | size_t num = 0; | ||
350 | for (;;) | ||
351 | { | ||
352 | wchar_t c = *p++; | ||
353 | if (c == 0) | ||
354 | return num; | ||
355 | num += ((c >= 0x10000 && c < 0x110000) ? 2 : 1); | ||
356 | } | ||
357 | return num; | ||
358 | } | ||
359 | |||
360 | inline Byte *wchars_to_Utf16LE(const wchar_t *p, Byte *dest) | ||
361 | { | ||
362 | for (;;) | ||
363 | { | ||
364 | wchar_t c = *p++; | ||
365 | if (c == 0) | ||
366 | return dest; | ||
367 | if (c >= 0x10000 && c < 0x110000) | ||
368 | { | ||
369 | SetUi16(dest , (UInt16)(0xd800 + ((c >> 10) & 0x3FF))); | ||
370 | SetUi16(dest + 2, (UInt16)(0xdc00 + ( c & 0x3FF))); | ||
371 | dest += 4; | ||
372 | } | ||
373 | else | ||
374 | { | ||
375 | SetUi16(dest, c); | ||
376 | dest += 2; | ||
377 | } | ||
378 | } | ||
379 | } | ||
380 | |||
381 | #endif | ||
382 | */ | ||
383 | |||
384 | #endif | ||