aboutsummaryrefslogtreecommitdiff
path: root/src/3rdParty/efsw/Utf.inl
diff options
context:
space:
mode:
Diffstat (limited to 'src/3rdParty/efsw/Utf.inl')
-rwxr-xr-xsrc/3rdParty/efsw/Utf.inl576
1 files changed, 576 insertions, 0 deletions
diff --git a/src/3rdParty/efsw/Utf.inl b/src/3rdParty/efsw/Utf.inl
new file mode 100755
index 0000000..7e3e9d6
--- /dev/null
+++ b/src/3rdParty/efsw/Utf.inl
@@ -0,0 +1,576 @@
1// References :
2// http://www.unicode.org/
3// http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
4// http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.h
5// http://people.w3.org/rishida/scripts/uniview/conversion
6////////////////////////////////////////////////////////////
7
8template <typename In> In Utf<8>::Decode( In begin, In end, Uint32& output, Uint32 replacement ) {
9 // Some useful precomputed data
10 static const int trailing[256] = {
11 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
12 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
13 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
14 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
15 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
16 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
17 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
18 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
19 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };
20 static const Uint32 offsets[6] = { 0x00000000, 0x00003080, 0x000E2080,
21 0x03C82080, 0xFA082080, 0x82082080 };
22
23 // Decode the character
24 int trailingBytes = trailing[static_cast<Uint8>( *begin )];
25 if ( begin + trailingBytes < end ) {
26 output = 0;
27 switch ( trailingBytes ) {
28 case 5:
29 output += static_cast<Uint8>( *begin++ );
30 output <<= 6;
31 case 4:
32 output += static_cast<Uint8>( *begin++ );
33 output <<= 6;
34 case 3:
35 output += static_cast<Uint8>( *begin++ );
36 output <<= 6;
37 case 2:
38 output += static_cast<Uint8>( *begin++ );
39 output <<= 6;
40 case 1:
41 output += static_cast<Uint8>( *begin++ );
42 output <<= 6;
43 case 0:
44 output += static_cast<Uint8>( *begin++ );
45 }
46 output -= offsets[trailingBytes];
47 } else {
48 // Incomplete character
49 begin = end;
50 output = replacement;
51 }
52
53 return begin;
54}
55
56template <typename Out> Out Utf<8>::Encode( Uint32 input, Out output, Uint8 replacement ) {
57 // Some useful precomputed data
58 static const Uint8 firstBytes[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
59
60 // Encode the character
61 if ( ( input > 0x0010FFFF ) || ( ( input >= 0xD800 ) && ( input <= 0xDBFF ) ) ) {
62 // Invalid character
63 if ( replacement )
64 *output++ = replacement;
65 } else {
66 // Valid character
67
68 // Get the number of bytes to write
69 int bytesToWrite = 1;
70 if ( input < 0x80 )
71 bytesToWrite = 1;
72 else if ( input < 0x800 )
73 bytesToWrite = 2;
74 else if ( input < 0x10000 )
75 bytesToWrite = 3;
76 else if ( input <= 0x0010FFFF )
77 bytesToWrite = 4;
78
79 // Extract the bytes to write
80 Uint8 bytes[4];
81 switch ( bytesToWrite ) {
82 case 4:
83 bytes[3] = static_cast<Uint8>( ( input | 0x80 ) & 0xBF );
84 input >>= 6;
85 case 3:
86 bytes[2] = static_cast<Uint8>( ( input | 0x80 ) & 0xBF );
87 input >>= 6;
88 case 2:
89 bytes[1] = static_cast<Uint8>( ( input | 0x80 ) & 0xBF );
90 input >>= 6;
91 case 1:
92 bytes[0] = static_cast<Uint8>( input | firstBytes[bytesToWrite] );
93 }
94
95 // Add them to the output
96 const Uint8* currentByte = bytes;
97 switch ( bytesToWrite ) {
98 case 4:
99 *output++ = *currentByte++;
100 case 3:
101 *output++ = *currentByte++;
102 case 2:
103 *output++ = *currentByte++;
104 case 1:
105 *output++ = *currentByte++;
106 }
107 }
108
109 return output;
110}
111
112template <typename In> In Utf<8>::Next( In begin, In end ) {
113 Uint32 codepoint;
114 return Decode( begin, end, codepoint );
115}
116
117template <typename In> std::size_t Utf<8>::Count( In begin, In end ) {
118 std::size_t length = 0;
119 while ( begin < end ) {
120 begin = Next( begin, end );
121 ++length;
122 }
123
124 return length;
125}
126
127template <typename In, typename Out>
128Out Utf<8>::FromAnsi( In begin, In end, Out output, const std::locale& locale ) {
129 while ( begin < end ) {
130 Uint32 codepoint = Utf<32>::DecodeAnsi( *begin++, locale );
131 output = Encode( codepoint, output );
132 }
133
134 return output;
135}
136
137template <typename In, typename Out> Out Utf<8>::FromWide( In begin, In end, Out output ) {
138 while ( begin < end ) {
139 Uint32 codepoint = Utf<32>::DecodeWide( *begin++ );
140 output = Encode( codepoint, output );
141 }
142
143 return output;
144}
145
146template <typename In, typename Out> Out Utf<8>::FromLatin1( In begin, In end, Out output ) {
147 // Latin-1 is directly compatible with Unicode encodings,
148 // and can thus be treated as (a sub-range of) UTF-32
149 while ( begin < end )
150 output = Encode( *begin++, output );
151
152 return output;
153}
154
155template <typename In, typename Out>
156Out Utf<8>::ToAnsi( In begin, In end, Out output, char replacement, const std::locale& locale ) {
157 while ( begin < end ) {
158 Uint32 codepoint;
159 begin = Decode( begin, end, codepoint );
160 output = Utf<32>::EncodeAnsi( codepoint, output, replacement, locale );
161 }
162
163 return output;
164}
165
166#ifndef EFSW_NO_WIDECHAR
167template <typename In, typename Out>
168Out Utf<8>::ToWide( In begin, In end, Out output, wchar_t replacement ) {
169 while ( begin < end ) {
170 Uint32 codepoint;
171 begin = Decode( begin, end, codepoint );
172 output = Utf<32>::EncodeWide( codepoint, output, replacement );
173 }
174
175 return output;
176}
177#endif
178
179template <typename In, typename Out>
180Out Utf<8>::ToLatin1( In begin, In end, Out output, char replacement ) {
181 // Latin-1 is directly compatible with Unicode encodings,
182 // and can thus be treated as (a sub-range of) UTF-32
183 while ( begin < end ) {
184 Uint32 codepoint;
185 begin = Decode( begin, end, codepoint );
186 *output++ = codepoint < 256 ? static_cast<char>( codepoint ) : replacement;
187 }
188
189 return output;
190}
191
192template <typename In, typename Out> Out Utf<8>::toUtf8( In begin, In end, Out output ) {
193 while ( begin < end )
194 *output++ = *begin++;
195
196 return output;
197}
198
199template <typename In, typename Out> Out Utf<8>::ToUtf16( In begin, In end, Out output ) {
200 while ( begin < end ) {
201 Uint32 codepoint;
202 begin = Decode( begin, end, codepoint );
203 output = Utf<16>::Encode( codepoint, output );
204 }
205
206 return output;
207}
208
209template <typename In, typename Out> Out Utf<8>::ToUtf32( In begin, In end, Out output ) {
210 while ( begin < end ) {
211 Uint32 codepoint;
212 begin = Decode( begin, end, codepoint );
213 *output++ = codepoint;
214 }
215
216 return output;
217}
218
219template <typename In> In Utf<16>::Decode( In begin, In end, Uint32& output, Uint32 replacement ) {
220 Uint16 first = *begin++;
221
222 // If it's a surrogate pair, first convert to a single UTF-32 character
223 if ( ( first >= 0xD800 ) && ( first <= 0xDBFF ) ) {
224 if ( begin < end ) {
225 Uint32 second = *begin++;
226 if ( ( second >= 0xDC00 ) && ( second <= 0xDFFF ) ) {
227 // The second element is valid: convert the two elements to a UTF-32 character
228 output = static_cast<Uint32>( ( ( first - 0xD800 ) << 10 ) + ( second - 0xDC00 ) +
229 0x0010000 );
230 } else {
231 // Invalid character
232 output = replacement;
233 }
234 } else {
235 // Invalid character
236 begin = end;
237 output = replacement;
238 }
239 } else {
240 // We can make a direct copy
241 output = first;
242 }
243
244 return begin;
245}
246
247template <typename Out> Out Utf<16>::Encode( Uint32 input, Out output, Uint16 replacement ) {
248 if ( input < 0xFFFF ) {
249 // The character can be copied directly, we just need to check if it's in the valid range
250 if ( ( input >= 0xD800 ) && ( input <= 0xDFFF ) ) {
251 // Invalid character (this range is reserved)
252 if ( replacement )
253 *output++ = replacement;
254 } else {
255 // Valid character directly convertible to a single UTF-16 character
256 *output++ = static_cast<Uint16>( input );
257 }
258 } else if ( input > 0x0010FFFF ) {
259 // Invalid character (greater than the maximum unicode value)
260 if ( replacement )
261 *output++ = replacement;
262 } else {
263 // The input character will be converted to two UTF-16 elements
264 input -= 0x0010000;
265 *output++ = static_cast<Uint16>( ( input >> 10 ) + 0xD800 );
266 *output++ = static_cast<Uint16>( ( input & 0x3FFUL ) + 0xDC00 );
267 }
268
269 return output;
270}
271
272template <typename In> In Utf<16>::Next( In begin, In end ) {
273 Uint32 codepoint;
274 return Decode( begin, end, codepoint );
275}
276
277template <typename In> std::size_t Utf<16>::Count( In begin, In end ) {
278 std::size_t length = 0;
279 while ( begin < end ) {
280 begin = Next( begin, end );
281 ++length;
282 }
283
284 return length;
285}
286
287template <typename In, typename Out>
288Out Utf<16>::FromAnsi( In begin, In end, Out output, const std::locale& locale ) {
289 while ( begin < end ) {
290 Uint32 codepoint = Utf<32>::DecodeAnsi( *begin++, locale );
291 output = Encode( codepoint, output );
292 }
293
294 return output;
295}
296
297template <typename In, typename Out> Out Utf<16>::FromWide( In begin, In end, Out output ) {
298 while ( begin < end ) {
299 Uint32 codepoint = Utf<32>::DecodeWide( *begin++ );
300 output = Encode( codepoint, output );
301 }
302
303 return output;
304}
305
306template <typename In, typename Out> Out Utf<16>::FromLatin1( In begin, In end, Out output ) {
307 // Latin-1 is directly compatible with Unicode encodings,
308 // and can thus be treated as (a sub-range of) UTF-32
309 while ( begin < end )
310 *output++ = *begin++;
311
312 return output;
313}
314
315template <typename In, typename Out>
316Out Utf<16>::ToAnsi( In begin, In end, Out output, char replacement, const std::locale& locale ) {
317 while ( begin < end ) {
318 Uint32 codepoint;
319 begin = Decode( begin, end, codepoint );
320 output = Utf<32>::EncodeAnsi( codepoint, output, replacement, locale );
321 }
322
323 return output;
324}
325
326#ifndef EFSW_NO_WIDECHAR
327template <typename In, typename Out>
328Out Utf<16>::ToWide( In begin, In end, Out output, wchar_t replacement ) {
329 while ( begin < end ) {
330 Uint32 codepoint;
331 begin = Decode( begin, end, codepoint );
332 output = Utf<32>::EncodeWide( codepoint, output, replacement );
333 }
334
335 return output;
336}
337#endif
338
339template <typename In, typename Out>
340Out Utf<16>::ToLatin1( In begin, In end, Out output, char replacement ) {
341 // Latin-1 is directly compatible with Unicode encodings,
342 // and can thus be treated as (a sub-range of) UTF-32
343 while ( begin < end ) {
344 *output++ = *begin < 256 ? static_cast<char>( *begin ) : replacement;
345 begin++;
346 }
347
348 return output;
349}
350
351template <typename In, typename Out> Out Utf<16>::toUtf8( In begin, In end, Out output ) {
352 while ( begin < end ) {
353 Uint32 codepoint;
354 begin = Decode( begin, end, codepoint );
355 output = Utf<8>::Encode( codepoint, output );
356 }
357
358 return output;
359}
360
361template <typename In, typename Out> Out Utf<16>::ToUtf16( In begin, In end, Out output ) {
362 while ( begin < end )
363 *output++ = *begin++;
364
365 return output;
366}
367
368template <typename In, typename Out> Out Utf<16>::ToUtf32( In begin, In end, Out output ) {
369 while ( begin < end ) {
370 Uint32 codepoint;
371 begin = Decode( begin, end, codepoint );
372 *output++ = codepoint;
373 }
374
375 return output;
376}
377
378template <typename In> In Utf<32>::Decode( In begin, In end, Uint32& output, Uint32 ) {
379 output = *begin++;
380 return begin;
381}
382
383template <typename Out> Out Utf<32>::Encode( Uint32 input, Out output, Uint32 replacement ) {
384 *output++ = input;
385 return output;
386}
387
388template <typename In> In Utf<32>::Next( In begin, In end ) {
389 return ++begin;
390}
391
392template <typename In> std::size_t Utf<32>::Count( In begin, In end ) {
393 return begin - end;
394}
395
396template <typename In, typename Out>
397Out Utf<32>::FromAnsi( In begin, In end, Out output, const std::locale& locale ) {
398 while ( begin < end )
399 *output++ = DecodeAnsi( *begin++, locale );
400
401 return output;
402}
403
404template <typename In, typename Out> Out Utf<32>::FromWide( In begin, In end, Out output ) {
405 while ( begin < end )
406 *output++ = DecodeWide( *begin++ );
407
408 return output;
409}
410
411template <typename In, typename Out> Out Utf<32>::FromLatin1( In begin, In end, Out output ) {
412 // Latin-1 is directly compatible with Unicode encodings,
413 // and can thus be treated as (a sub-range of) UTF-32
414 while ( begin < end )
415 *output++ = *begin++;
416
417 return output;
418}
419
420template <typename In, typename Out>
421Out Utf<32>::ToAnsi( In begin, In end, Out output, char replacement, const std::locale& locale ) {
422 while ( begin < end )
423 output = EncodeAnsi( *begin++, output, replacement, locale );
424
425 return output;
426}
427
428#ifndef EFSW_NO_WIDECHAR
429template <typename In, typename Out>
430Out Utf<32>::ToWide( In begin, In end, Out output, wchar_t replacement ) {
431 while ( begin < end )
432 output = EncodeWide( *begin++, output, replacement );
433
434 return output;
435}
436#endif
437
438template <typename In, typename Out>
439Out Utf<32>::ToLatin1( In begin, In end, Out output, char replacement ) {
440 // Latin-1 is directly compatible with Unicode encodings,
441 // and can thus be treated as (a sub-range of) UTF-32
442 while ( begin < end ) {
443 *output++ = *begin < 256 ? static_cast<char>( *begin ) : replacement;
444 begin++;
445 }
446
447 return output;
448}
449
450template <typename In, typename Out> Out Utf<32>::toUtf8( In begin, In end, Out output ) {
451 while ( begin < end )
452 output = Utf<8>::Encode( *begin++, output );
453
454 return output;
455}
456
457template <typename In, typename Out> Out Utf<32>::ToUtf16( In begin, In end, Out output ) {
458 while ( begin < end )
459 output = Utf<16>::Encode( *begin++, output );
460
461 return output;
462}
463
464template <typename In, typename Out> Out Utf<32>::ToUtf32( In begin, In end, Out output ) {
465 while ( begin < end )
466 *output++ = *begin++;
467
468 return output;
469}
470
471template <typename In> Uint32 Utf<32>::DecodeAnsi( In input, const std::locale& locale ) {
472 // On Windows, gcc's standard library (glibc++) has almost
473 // no support for Unicode stuff. As a consequence, in this
474 // context we can only use the default locale and ignore
475 // the one passed as parameter.
476
477#if EFSW_PLATFORM == EFSW_PLATFORM_WIN && /* if Windows ... */ \
478 ( defined( __GLIBCPP__ ) || \
479 defined( __GLIBCXX__ ) ) && /* ... and standard library is glibc++ ... */ \
480 !( defined( __SGI_STL_PORT ) || \
481 defined( _STLPORT_VERSION ) ) /* ... and STLPort is not used on top of it */
482
483 wchar_t character = 0;
484 mbtowc( &character, &input, 1 );
485 return static_cast<Uint32>( character );
486
487#else
488// Get the facet of the locale which deals with character conversion
489#ifndef EFSW_NO_WIDECHAR
490 const std::ctype<wchar_t>& facet = std::use_facet<std::ctype<wchar_t>>( locale );
491#else
492 const std::ctype<char>& facet = std::use_facet<std::ctype<char>>( locale );
493#endif
494
495 // Use the facet to convert each character of the input string
496 return static_cast<Uint32>( facet.widen( input ) );
497
498#endif
499}
500
501template <typename In> Uint32 Utf<32>::DecodeWide( In input ) {
502 // The encoding of wide characters is not well defined and is left to the system;
503 // however we can safely assume that it is UCS-2 on Windows and
504 // UCS-4 on Unix systems.
505 // In both cases, a simple copy is enough (UCS-2 is a subset of UCS-4,
506 // and UCS-4 *is* UTF-32).
507
508 return input;
509}
510
511template <typename Out>
512Out Utf<32>::EncodeAnsi( Uint32 codepoint, Out output, char replacement,
513 const std::locale& locale ) {
514 // On Windows, gcc's standard library (glibc++) has almost
515 // no support for Unicode stuff. As a consequence, in this
516 // context we can only use the default locale and ignore
517 // the one passed as parameter.
518
519#if EFSW_PLATFORM == EFSW_PLATFORM_WIN && /* if Windows ... */ \
520 ( defined( __GLIBCPP__ ) || \
521 defined( __GLIBCXX__ ) ) && /* ... and standard library is glibc++ ... */ \
522 !( defined( __SGI_STL_PORT ) || \
523 defined( _STLPORT_VERSION ) ) /* ... and STLPort is not used on top of it */
524
525 char character = 0;
526 if ( wctomb( &character, static_cast<wchar_t>( codepoint ) ) >= 0 )
527 *output++ = character;
528 else if ( replacement )
529 *output++ = replacement;
530
531 return output;
532
533#else
534// Get the facet of the locale which deals with character conversion
535#ifndef EFSW_NO_WIDECHAR
536 const std::ctype<wchar_t>& facet = std::use_facet<std::ctype<wchar_t>>( locale );
537#else
538 const std::ctype<char>& facet = std::use_facet<std::ctype<char>>( locale );
539#endif
540
541 // Use the facet to convert each character of the input string
542 *output++ = facet.narrow( static_cast<wchar_t>( codepoint ), replacement );
543
544 return output;
545
546#endif
547}
548
549#ifndef EFSW_NO_WIDECHAR
550template <typename Out>
551Out Utf<32>::EncodeWide( Uint32 codepoint, Out output, wchar_t replacement ) {
552 // The encoding of wide characters is not well defined and is left to the system;
553 // however we can safely assume that it is UCS-2 on Windows and
554 // UCS-4 on Unix systems.
555 // For UCS-2 we need to check if the source characters fits in (UCS-2 is a subset of UCS-4).
556 // For UCS-4 we can do a direct copy (UCS-4 *is* UTF-32).
557
558 switch ( sizeof( wchar_t ) ) {
559 case 4: {
560 *output++ = static_cast<wchar_t>( codepoint );
561 break;
562 }
563
564 default: {
565 if ( ( codepoint <= 0xFFFF ) && ( ( codepoint < 0xD800 ) || ( codepoint > 0xDFFF ) ) ) {
566 *output++ = static_cast<wchar_t>( codepoint );
567 } else if ( replacement ) {
568 *output++ = replacement;
569 }
570 break;
571 }
572 }
573
574 return output;
575}
576#endif