aboutsummaryrefslogtreecommitdiff
path: root/CPP/Common/UTFConvert.h
diff options
context:
space:
mode:
authorIgor Pavlov <87184205+ip7z@users.noreply.github.com>2021-12-27 00:00:00 +0000
committerIgor Pavlov <87184205+ip7z@users.noreply.github.com>2022-03-18 15:35:13 +0500
commitf19f813537c7aea1c20749c914e756b54a9c3cf5 (patch)
tree816ba62ca7c0fa19f2eb46d9e9d6f7dd7c3a744d /CPP/Common/UTFConvert.h
parent98e06a519b63b81986abe76d28887f6984a7732b (diff)
download7zip-21.07.tar.gz
7zip-21.07.tar.bz2
7zip-21.07.zip
'21.07'21.07
Diffstat (limited to 'CPP/Common/UTFConvert.h')
-rw-r--r--CPP/Common/UTFConvert.h384
1 files changed, 384 insertions, 0 deletions
diff --git a/CPP/Common/UTFConvert.h b/CPP/Common/UTFConvert.h
new file mode 100644
index 0000000..37c4975
--- /dev/null
+++ b/CPP/Common/UTFConvert.h
@@ -0,0 +1,384 @@
1// Common/UTFConvert.h
2
3#ifndef __COMMON_UTF_CONVERT_H
4#define __COMMON_UTF_CONVERT_H
5
6#include "MyBuffer.h"
7#include "MyString.h"
8
9struct CUtf8Check
10{
11 // Byte MaxByte; // in original src stream
12 bool NonUtf;
13 bool ZeroChar;
14 bool SingleSurrogate;
15 bool Escape;
16 bool Truncated;
17 UInt32 MaxHighPoint; // only for points >= 0x80
18
19 CUtf8Check() { Clear(); }
20
21 void Clear()
22 {
23 // MaxByte = 0;
24 NonUtf = false;
25 ZeroChar = false;
26 SingleSurrogate = false;
27 Escape = false;
28 Truncated = false;
29 MaxHighPoint = 0;
30 }
31
32 void Update(const CUtf8Check &c)
33 {
34 if (c.NonUtf) NonUtf = true;
35 if (c.ZeroChar) ZeroChar = true;
36 if (c.SingleSurrogate) SingleSurrogate = true;
37 if (c.Escape) Escape = true;
38 if (c.Truncated) Truncated = true;
39 if (MaxHighPoint < c.MaxHighPoint) MaxHighPoint = c.MaxHighPoint;
40 }
41
42 void PrintStatus(AString &s) const
43 {
44 s.Empty();
45
46 // s.Add_OptSpaced("MaxByte=");
47 // s.Add_UInt32(MaxByte);
48
49 if (NonUtf) s.Add_OptSpaced("non-UTF8");
50 if (ZeroChar) s.Add_OptSpaced("ZeroChar");
51 if (SingleSurrogate) s.Add_OptSpaced("SingleSurrogate");
52 if (Escape) s.Add_OptSpaced("Escape");
53 if (Truncated) s.Add_OptSpaced("Truncated");
54
55 if (MaxHighPoint != 0)
56 {
57 s.Add_OptSpaced("MaxUnicode=");
58 s.Add_UInt32(MaxHighPoint);
59 }
60 }
61
62
63 bool IsOK(bool allowReduced = false) const
64 {
65 if (NonUtf || SingleSurrogate || ZeroChar)
66 return false;
67 if (MaxHighPoint >= 0x110000)
68 return false;
69 if (Truncated && !allowReduced)
70 return false;
71 return true;
72 }
73
74 // it checks full buffer as specified in (size) and it doesn't stop on zero char
75 void Check_Buf(const char *src, size_t size) throw();
76
77 void Check_AString(const AString &s) throw()
78 {
79 Check_Buf(s.Ptr(), s.Len());
80 }
81};
82
83/*
84if (allowReduced == false) - all UTF-8 character sequences must be finished.
85if (allowReduced == true) - it allows truncated last character-Utf8-sequence
86*/
87
88bool Check_UTF8_Buf(const char *src, size_t size, bool allowReduced) throw();
89bool CheckUTF8_AString(const AString &s) throw();
90
91#define UTF_FLAG__FROM_UTF8__SURROGATE_ERROR (1 << 0)
92#define UTF_FLAG__FROM_UTF8__USE_ESCAPE (1 << 1)
93#define UTF_FLAG__FROM_UTF8__BMP_ESCAPE_CONVERT (1 << 2)
94
95/*
96UTF_FLAG__FROM_UTF8__SURROGATE_ERROR
97
98 if (flag is NOT set)
99 {
100 it processes SINGLE-SURROGATE-8 as valid Unicode point.
101 it converts SINGLE-SURROGATE-8 to SINGLE-SURROGATE-16
102 Note: some sequences of two SINGLE-SURROGATE-8 points
103 will generate correct SURROGATE-16-PAIR, and
104 that SURROGATE-16-PAIR later will be converted to correct
105 UTF8-SURROGATE-21 point. So we don't restore original
106 STR-8 sequence in that case.
107 }
108
109 if (flag is set)
110 {
111 if (UTF_FLAG__FROM_UTF8__USE_ESCAPE is set)
112 it generates ESCAPE for SINGLE-SURROGATE-8,
113 if (UTF_FLAG__FROM_UTF8__USE_ESCAPE is NOT set)
114 it generates U+fffd for SINGLE-SURROGATE-8.
115 }
116
117
118UTF_FLAG__FROM_UTF8__USE_ESCAPE
119
120 if (flag is NOT set)
121 it generates (U+fffd) code for non-UTF-8 (invalid) characters
122
123 if (flag is set)
124 {
125 It generates (ESCAPE) codes for NON-UTF-8 (invalid) characters.
126 And later we can restore original UTF-8-RAW characters from (ESCAPE-16-21) codes.
127 }
128
129UTF_FLAG__FROM_UTF8__BMP_ESCAPE_CONVERT
130
131 if (flag is NOT set)
132 {
133 it processes ESCAPE-8 points as any other Unicode points.
134 In Linux: ESCAPE-16 will mean two different ESCAPE-8 sequences,
135 so we need HIGH-ESCAPE-PLANE-21 to restore UTF-8-RAW -> UTF-16 -> UTF-8-RAW
136 }
137
138 if (flag is set)
139 {
140 it generates ESCAPE-16-21 for ESCAPE-8 points
141 so we can restore UTF-8-RAW -> UTF-16 -> UTF-8-RAW without HIGH-ESCAPE-PLANE-21.
142 }
143
144
145Main USE CASES with UTF-8 <-> UTF-16 conversions:
146
147 WIN32: UTF-16-RAW -> UTF-8 (Archive) -> UTF-16-RAW
148 {
149 set UTF_FLAG__FROM_UTF8__USE_ESCAPE
150 Do NOT set UTF_FLAG__FROM_UTF8__SURROGATE_ERROR
151 Do NOT set UTF_FLAG__FROM_UTF8__BMP_ESCAPE_CONVERT
152
153 So we restore original SINGLE-SURROGATE-16 from single SINGLE-SURROGATE-8.
154 }
155
156 Linux: UTF-8-RAW -> UTF-16 (Intermediate / Archive) -> UTF-8-RAW
157 {
158 we want to restore the original UTF-8-RAW sequence later from that ESCAPE-16.
159 Set the flags:
160 UTF_FLAG__FROM_UTF8__SURROGATE_ERROR
161 UTF_FLAG__FROM_UTF8__USE_ESCAPE
162 UTF_FLAG__FROM_UTF8__BMP_ESCAPE_CONVERT
163 }
164
165 MacOS: UTF-8-RAW -> UTF-16 (Intermediate / Archive) -> UTF-8-RAW
166 {
167 we want to restore correct UTF-8 without any BMP processing:
168 Set the flags:
169 UTF_FLAG__FROM_UTF8__SURROGATE_ERROR
170 UTF_FLAG__FROM_UTF8__USE_ESCAPE
171 }
172
173*/
174
175// zero char is not allowed in (src) buf
176bool Convert_UTF8_Buf_To_Unicode(const char *src, size_t srcSize, UString &dest, unsigned flags = 0);
177
178bool ConvertUTF8ToUnicode_Flags(const AString &src, UString &dest, unsigned flags = 0);
179bool ConvertUTF8ToUnicode(const AString &src, UString &dest);
180
181#define UTF_FLAG__TO_UTF8__SURROGATE_ERROR (1 << 8)
182#define UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE (1 << 9)
183// #define UTF_FLAG__TO_UTF8__PARSE_HIGH_ESCAPE (1 << 10)
184
185/*
186UTF_FLAG__TO_UTF8__SURROGATE_ERROR
187
188 if (flag is NOT set)
189 {
190 we extract SINGLE-SURROGATE as normal UTF-8
191
192 In Windows : for UTF-16-RAW <-> UTF-8 (archive) <-> UTF-16-RAW round trip.
193
194 In Linux :
195 use-case-1: UTF-8 -> UTF-16 -> UTF-8 doesn't generate UTF-16 SINGLE-SURROGATE,
196 if (UTF_FLAG__FROM_UTF8__SURROGATE_ERROR) is used.
197 use-case 2: UTF-16-7z (with SINGLE-SURROGATE from Windows) -> UTF-8 (Linux)
198 will generate SINGLE-SURROGATE-UTF-8 here.
199 }
200
201 if (flag is set)
202 {
203 we generate UTF_REPLACEMENT_CHAR (0xfffd) for SINGLE_SURROGATE
204 it can be used for compatibility mode with WIN32 UTF function
205 or if we want UTF-8 stream without any errors
206 }
207
208
209UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE
210
211 if (flag is NOT set) it doesn't extract raw 8-bit symbol from Escape-Plane-16
212 if (flag is set) it extracts raw 8-bit symbol from Escape-Plane-16
213
214 in Linux we need some way to extract NON-UTF8 RAW 8-bits from BMP (UTF-16 7z archive):
215 if (we use High-Escape-Plane), we can transfer BMP escapes to High-Escape-Plane.
216 if (we don't use High-Escape-Plane), we must use UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE.
217
218
219UTF_FLAG__TO_UTF8__PARSE_HIGH_ESCAPE
220 // that flag affects the code only if (wchar_t is 32-bit)
221 // that mode with high-escape can be disabled now in UTFConvert.cpp
222 if (flag is NOT set)
223 it doesn't extract raw 8-bit symbol from High-Escape-Plane
224 if (flag is set)
225 it extracts raw 8-bit symbol from High-Escape-Plane
226
227Main use cases:
228
229WIN32 : UTF-16-RAW -> UTF-8 (archive) -> UTF-16-RAW
230 {
231 Do NOT set UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE.
232 Do NOT set UTF_FLAG__TO_UTF8__SURROGATE_ERROR.
233 So we restore original UTF-16-RAW.
234 }
235
236Linux : UTF-8 with Escapes -> UTF-16 (7z archive) -> UTF-8 with Escapes
237 set UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE to extract non-UTF from 7z archive
238 set UTF_FLAG__TO_UTF8__PARSE_HIGH_ESCAPE for intermediate UTF-16.
239 Note: high escape mode can be ignored now in UTFConvert.cpp
240
241macOS:
242 the system doesn't support incorrect UTF-8 in file names.
243 set UTF_FLAG__TO_UTF8__SURROGATE_ERROR
244*/
245
246extern unsigned g_Unicode_To_UTF8_Flags;
247
248void ConvertUnicodeToUTF8_Flags(const UString &src, AString &dest, unsigned flags = 0);
249void ConvertUnicodeToUTF8(const UString &src, AString &dest);
250
251void Convert_Unicode_To_UTF8_Buf(const UString &src, CByteBuffer &dest);
252
253/*
254#ifndef _WIN32
255void Convert_UTF16_To_UTF32(const UString &src, UString &dest);
256void Convert_UTF32_To_UTF16(const UString &src, UString &dest);
257bool UTF32_IsThere_BigPoint(const UString &src);
258bool Unicode_IsThere_BmpEscape(const UString &src);
259#endif
260
261bool Unicode_IsThere_Utf16SurrogateError(const UString &src);
262*/
263
264#ifdef _WCHART_IS_16BIT
265#define Convert_UnicodeEsc16_To_UnicodeEscHigh(s)
266#else
267void Convert_UnicodeEsc16_To_UnicodeEscHigh(UString &s);
268#endif
269
270/*
271// #include "../../C/CpuArch.h"
272
273// ---------- Utf16 Little endian functions ----------
274
275// We store 16-bit surrogates even in 32-bit WCHARs in Linux.
276// So now we don't use the following code:
277
278#if WCHAR_MAX > 0xffff
279
280// void *p : pointer to src bytes stream
281// size_t len : num Utf16 characters : it can include or not include NULL character
282
283inline size_t Utf16LE__Get_Num_WCHARs(const void *p, size_t len)
284{
285 #if WCHAR_MAX > 0xffff
286 size_t num_wchars = 0;
287 for (size_t i = 0; i < len; i++)
288 {
289 wchar_t c = GetUi16(p);
290 p = (const void *)((const Byte *)p + 2);
291 if (c >= 0xd800 && c < 0xdc00 && i + 1 != len)
292 {
293 wchar_t c2 = GetUi16(p);
294 if (c2 >= 0xdc00 && c2 < 0xe000)
295 {
296 c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff);
297 p = (const void *)((const Byte *)p + 2);
298 i++;
299 }
300 }
301 num_wchars++;
302 }
303 return num_wchars;
304 #else
305 UNUSED_VAR(p)
306 return len;
307 #endif
308}
309
310// #include <stdio.h>
311
312inline wchar_t *Utf16LE__To_WCHARs_Sep(const void *p, size_t len, wchar_t *dest)
313{
314 for (size_t i = 0; i < len; i++)
315 {
316 wchar_t c = GetUi16(p);
317 p = (const void *)((const Byte *)p + 2);
318
319 #if WCHAR_PATH_SEPARATOR != L'/'
320 if (c == L'/')
321 c = WCHAR_PATH_SEPARATOR;
322 #endif
323
324 #if WCHAR_MAX > 0xffff
325
326 if (c >= 0xd800 && c < 0xdc00 && i + 1 != len)
327 {
328 wchar_t c2 = GetUi16(p);
329 if (c2 >= 0xdc00 && c2 < 0xe000)
330 {
331 // printf("\nSurragate : %4x %4x -> ", (int)c, (int)c2);
332 c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff);
333 p = (const void *)((const Byte *)p + 2);
334 i++;
335 // printf("%4x\n", (int)c);
336 }
337 }
338
339 #endif
340
341 *dest++ = c;
342 }
343 return dest;
344}
345
346
347inline size_t Get_Num_Utf16_chars_from_wchar_string(const wchar_t *p)
348{
349 size_t num = 0;
350 for (;;)
351 {
352 wchar_t c = *p++;
353 if (c == 0)
354 return num;
355 num += ((c >= 0x10000 && c < 0x110000) ? 2 : 1);
356 }
357 return num;
358}
359
360inline Byte *wchars_to_Utf16LE(const wchar_t *p, Byte *dest)
361{
362 for (;;)
363 {
364 wchar_t c = *p++;
365 if (c == 0)
366 return dest;
367 if (c >= 0x10000 && c < 0x110000)
368 {
369 SetUi16(dest , (UInt16)(0xd800 + ((c >> 10) & 0x3FF)));
370 SetUi16(dest + 2, (UInt16)(0xdc00 + ( c & 0x3FF)));
371 dest += 4;
372 }
373 else
374 {
375 SetUi16(dest, c);
376 dest += 2;
377 }
378 }
379}
380
381#endif
382*/
383
384#endif