diff options
Diffstat (limited to 'src/3rdParty/utf8cpp.h')
| -rwxr-xr-x | src/3rdParty/utf8cpp.h | 1277 |
1 files changed, 1277 insertions, 0 deletions
diff --git a/src/3rdParty/utf8cpp.h b/src/3rdParty/utf8cpp.h new file mode 100755 index 0000000..76f0fa1 --- /dev/null +++ b/src/3rdParty/utf8cpp.h | |||
| @@ -0,0 +1,1277 @@ | |||
| 1 | // Copyright 2006 Nemanja Trifunovic | ||
| 2 | |||
| 3 | /* | ||
| 4 | Permission is hereby granted, free of charge, to any person or organization | ||
| 5 | obtaining a copy of the software and accompanying documentation covered by | ||
| 6 | this license (the "Software") to use, reproduce, display, distribute, | ||
| 7 | execute, and transmit the Software, and to prepare derivative works of the | ||
| 8 | Software, and to permit third-parties to whom the Software is furnished to | ||
| 9 | do so, all subject to the following: | ||
| 10 | |||
| 11 | The copyright notices in the Software and this entire statement, including | ||
| 12 | the above license grant, this restriction and the following disclaimer, | ||
| 13 | must be included in all copies of the Software, in whole or in part, and | ||
| 14 | all derivative works of the Software, unless such copies or derivative | ||
| 15 | works are solely in the form of machine-executable object code generated by | ||
| 16 | a source language processor. | ||
| 17 | |||
| 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
| 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
| 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT | ||
| 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE | ||
| 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, | ||
| 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | ||
| 24 | DEALINGS IN THE SOFTWARE. | ||
| 25 | */ | ||
| 26 | |||
| 27 | |||
| 28 | #ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 | ||
| 29 | #define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 | ||
| 30 | |||
| 31 | /* | ||
| 32 | To control the C++ language version used by the library, you can define UTF_CPP_CPLUSPLUS macro | ||
| 33 | and set it to one of the values used by the __cplusplus predefined macro. | ||
| 34 | |||
| 35 | For instance, | ||
| 36 | #define UTF_CPP_CPLUSPLUS 199711L | ||
| 37 | will cause the UTF-8 CPP library to use only types and language features available in the C++ 98 standard. | ||
| 38 | Some library features will be disabled. | ||
| 39 | |||
| 40 | If you leave UTF_CPP_CPLUSPLUS undefined, it will be internally assigned to __cplusplus. | ||
| 41 | */ | ||
| 42 | |||
| 43 | #include <iterator> | ||
| 44 | #include <cstring> | ||
| 45 | #include <string> | ||
| 46 | |||
// Select the C++ standard level the library is built against.
// A user-supplied UTF_CPP_CPLUSPLUS takes precedence; otherwise fall back to
// the (not always reliable) predefined macro __cplusplus.
#ifndef UTF_CPP_CPLUSPLUS
#define UTF_CPP_CPLUSPLUS __cplusplus
#endif

#if UTF_CPP_CPLUSPLUS < 201103L // C++ 98/03
#define UTF_CPP_OVERRIDE
#define UTF_CPP_NOEXCEPT throw()
// static_assert emulation: only the <true> specialization is defined, so
// constructing the <false> one is a compile error.
template<bool> struct UtfCppCompileTimeAssert;
template<> struct UtfCppCompileTimeAssert <true> { };
#define UTF_CPP_STATIC_ASSERT(condition) (UtfCppCompileTimeAssert <(condition) != 0>())
#else // C++ 11 or later
#define UTF_CPP_OVERRIDE override
#define UTF_CPP_NOEXCEPT noexcept
#define UTF_CPP_STATIC_ASSERT(condition) static_assert(condition, "UTFCPP static assert");
#endif // C++ 98/03
| 67 | |||
| 68 | |||
| 69 | namespace utf8 | ||
| 70 | { | ||
// Code unit types used throughout the library.

// 8-bit code unit: char8_t from C++ 20 on, unsigned char before that.
#if UTF_CPP_CPLUSPLUS >= 202002L
typedef char8_t utfchar8_t;
#else
typedef unsigned char utfchar8_t;
#endif

// 16-bit and 32-bit code units: dedicated character types from C++ 11 on,
// plain unsigned integers for C++ 98/03.
#if UTF_CPP_CPLUSPLUS >= 201103L
typedef char16_t utfchar16_t;
typedef char32_t utfchar32_t;
#else
typedef unsigned short utfchar16_t;
typedef unsigned int utfchar32_t;
#endif
| 85 | |||
| 86 | // Helper code - not intended to be directly called by the library users. May be changed at any time | ||
| 87 | namespace internal | ||
| 88 | { | ||
| 89 | // Unicode constants | ||
| 90 | // Leading (high) surrogates: 0xd800 - 0xdbff | ||
| 91 | // Trailing (low) surrogates: 0xdc00 - 0xdfff | ||
| 92 | const utfchar16_t LEAD_SURROGATE_MIN = 0xd800u; | ||
| 93 | const utfchar16_t LEAD_SURROGATE_MAX = 0xdbffu; | ||
| 94 | const utfchar16_t TRAIL_SURROGATE_MIN = 0xdc00u; | ||
| 95 | const utfchar16_t TRAIL_SURROGATE_MAX = 0xdfffu; | ||
| 96 | const utfchar16_t LEAD_OFFSET = 0xd7c0u; // LEAD_SURROGATE_MIN - (0x10000 >> 10) | ||
| 97 | const utfchar32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN | ||
| 98 | |||
| 99 | // Maximum valid value for a Unicode code point | ||
| 100 | const utfchar32_t CODE_POINT_MAX = 0x0010ffffu; | ||
| 101 | |||
| 102 | template<typename octet_type> | ||
| 103 | inline utfchar8_t mask8(octet_type oc) | ||
| 104 | { | ||
| 105 | return static_cast<utfchar8_t>(0xff & oc); | ||
| 106 | } | ||
| 107 | |||
| 108 | template<typename u16_type> | ||
| 109 | inline utfchar16_t mask16(u16_type oc) | ||
| 110 | { | ||
| 111 | return static_cast<utfchar16_t>(0xffff & oc); | ||
| 112 | } | ||
| 113 | |||
| 114 | template<typename octet_type> | ||
| 115 | inline bool is_trail(octet_type oc) | ||
| 116 | { | ||
| 117 | return ((utf8::internal::mask8(oc) >> 6) == 0x2); | ||
| 118 | } | ||
| 119 | |||
| 120 | inline bool is_lead_surrogate(utfchar32_t cp) | ||
| 121 | { | ||
| 122 | return (cp >= static_cast<utfchar32_t>(LEAD_SURROGATE_MIN) && cp <= static_cast<utfchar32_t>(LEAD_SURROGATE_MAX)); | ||
| 123 | } | ||
| 124 | |||
| 125 | inline bool is_trail_surrogate(utfchar32_t cp) | ||
| 126 | { | ||
| 127 | return (cp >= static_cast<utfchar32_t>(TRAIL_SURROGATE_MIN) && cp <= static_cast<utfchar32_t>(TRAIL_SURROGATE_MAX)); | ||
| 128 | } | ||
| 129 | |||
| 130 | inline bool is_surrogate(utfchar32_t cp) | ||
| 131 | { | ||
| 132 | return (cp >= static_cast<utfchar32_t>(LEAD_SURROGATE_MIN) && cp <= static_cast<utfchar32_t>(TRAIL_SURROGATE_MAX)); | ||
| 133 | } | ||
| 134 | |||
| 135 | inline bool is_code_point_valid(utfchar32_t cp) | ||
| 136 | { | ||
| 137 | return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); | ||
| 138 | } | ||
| 139 | |||
| 140 | inline bool is_in_bmp(utfchar32_t cp) | ||
| 141 | { | ||
| 142 | return cp < utfchar32_t(0x10000); | ||
| 143 | } | ||
| 144 | |||
| 145 | template <typename octet_iterator> | ||
| 146 | int sequence_length(octet_iterator lead_it) | ||
| 147 | { | ||
| 148 | const utfchar8_t lead = utf8::internal::mask8(*lead_it); | ||
| 149 | if (lead < 0x80) | ||
| 150 | return 1; | ||
| 151 | else if ((lead >> 5) == 0x6) | ||
| 152 | return 2; | ||
| 153 | else if ((lead >> 4) == 0xe) | ||
| 154 | return 3; | ||
| 155 | else if ((lead >> 3) == 0x1e) | ||
| 156 | return 4; | ||
| 157 | else | ||
| 158 | return 0; | ||
| 159 | } | ||
| 160 | |||
| 161 | inline bool is_overlong_sequence(utfchar32_t cp, int length) | ||
| 162 | { | ||
| 163 | if (cp < 0x80) { | ||
| 164 | if (length != 1) | ||
| 165 | return true; | ||
| 166 | } | ||
| 167 | else if (cp < 0x800) { | ||
| 168 | if (length != 2) | ||
| 169 | return true; | ||
| 170 | } | ||
| 171 | else if (cp < 0x10000) { | ||
| 172 | if (length != 3) | ||
| 173 | return true; | ||
| 174 | } | ||
| 175 | return false; | ||
| 176 | } | ||
| 177 | |||
// Result codes returned by the internal decoding/validation helpers.
enum utf_error {
    UTF8_OK,
    NOT_ENOUGH_ROOM,
    INVALID_LEAD,
    INCOMPLETE_SEQUENCE,
    OVERLONG_SEQUENCE,
    INVALID_CODE_POINT
};
| 179 | |||
| 180 | /// Helper for get_sequence_x | ||
| 181 | template <typename octet_iterator> | ||
| 182 | utf_error increase_safely(octet_iterator& it, const octet_iterator end) | ||
| 183 | { | ||
| 184 | if (++it == end) | ||
| 185 | return NOT_ENOUGH_ROOM; | ||
| 186 | |||
| 187 | if (!utf8::internal::is_trail(*it)) | ||
| 188 | return INCOMPLETE_SEQUENCE; | ||
| 189 | |||
| 190 | return UTF8_OK; | ||
| 191 | } | ||
| 192 | |||
// Advances IT via increase_safely and makes the enclosing function return the
// error code if that step fails. Used as a statement without a trailing ';'.
#define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) \
    { \
        utf_error ret = increase_safely(IT, END); \
        if (ret != UTF8_OK) \
            return ret; \
    }
| 194 | |||
| 195 | /// get_sequence_x functions decode utf-8 sequences of the length x | ||
| 196 | template <typename octet_iterator> | ||
| 197 | utf_error get_sequence_1(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) | ||
| 198 | { | ||
| 199 | if (it == end) | ||
| 200 | return NOT_ENOUGH_ROOM; | ||
| 201 | |||
| 202 | code_point = static_cast<utfchar32_t>(utf8::internal::mask8(*it)); | ||
| 203 | |||
| 204 | return UTF8_OK; | ||
| 205 | } | ||
| 206 | |||
| 207 | template <typename octet_iterator> | ||
| 208 | utf_error get_sequence_2(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) | ||
| 209 | { | ||
| 210 | if (it == end) | ||
| 211 | return NOT_ENOUGH_ROOM; | ||
| 212 | |||
| 213 | code_point = static_cast<utfchar32_t>(utf8::internal::mask8(*it)); | ||
| 214 | |||
| 215 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) | ||
| 216 | |||
| 217 | code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); | ||
| 218 | |||
| 219 | return UTF8_OK; | ||
| 220 | } | ||
| 221 | |||
| 222 | template <typename octet_iterator> | ||
| 223 | utf_error get_sequence_3(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) | ||
| 224 | { | ||
| 225 | if (it == end) | ||
| 226 | return NOT_ENOUGH_ROOM; | ||
| 227 | |||
| 228 | code_point = static_cast<utfchar32_t>(utf8::internal::mask8(*it)); | ||
| 229 | |||
| 230 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) | ||
| 231 | |||
| 232 | code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); | ||
| 233 | |||
| 234 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) | ||
| 235 | |||
| 236 | code_point = static_cast<utfchar32_t>(code_point + ((*it) & 0x3f)); | ||
| 237 | |||
| 238 | return UTF8_OK; | ||
| 239 | } | ||
| 240 | |||
| 241 | template <typename octet_iterator> | ||
| 242 | utf_error get_sequence_4(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) | ||
| 243 | { | ||
| 244 | if (it == end) | ||
| 245 | return NOT_ENOUGH_ROOM; | ||
| 246 | |||
| 247 | code_point = static_cast<utfchar32_t>(utf8::internal::mask8(*it)); | ||
| 248 | |||
| 249 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) | ||
| 250 | |||
| 251 | code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); | ||
| 252 | |||
| 253 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) | ||
| 254 | |||
| 255 | code_point = static_cast<utfchar32_t>(code_point + ((utf8::internal::mask8(*it) << 6) & 0xfff)); | ||
| 256 | |||
| 257 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) | ||
| 258 | |||
| 259 | code_point = static_cast<utfchar32_t>(code_point + ((*it) & 0x3f)); | ||
| 260 | |||
| 261 | return UTF8_OK; | ||
| 262 | } | ||
| 263 | |||
| 264 | #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR | ||
| 265 | |||
| 266 | template <typename octet_iterator> | ||
| 267 | utf_error validate_next(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) | ||
| 268 | { | ||
| 269 | if (it == end) | ||
| 270 | return NOT_ENOUGH_ROOM; | ||
| 271 | |||
| 272 | // Save the original value of it so we can go back in case of failure | ||
| 273 | // Of course, it does not make much sense with i.e. stream iterators | ||
| 274 | octet_iterator original_it = it; | ||
| 275 | |||
| 276 | utfchar32_t cp = 0; | ||
| 277 | // Determine the sequence length based on the lead octet | ||
| 278 | const int length = utf8::internal::sequence_length(it); | ||
| 279 | |||
| 280 | // Get trail octets and calculate the code point | ||
| 281 | utf_error err = UTF8_OK; | ||
| 282 | switch (length) { | ||
| 283 | case 0: | ||
| 284 | return INVALID_LEAD; | ||
| 285 | case 1: | ||
| 286 | err = utf8::internal::get_sequence_1(it, end, cp); | ||
| 287 | break; | ||
| 288 | case 2: | ||
| 289 | err = utf8::internal::get_sequence_2(it, end, cp); | ||
| 290 | break; | ||
| 291 | case 3: | ||
| 292 | err = utf8::internal::get_sequence_3(it, end, cp); | ||
| 293 | break; | ||
| 294 | case 4: | ||
| 295 | err = utf8::internal::get_sequence_4(it, end, cp); | ||
| 296 | break; | ||
| 297 | } | ||
| 298 | |||
| 299 | if (err == UTF8_OK) { | ||
| 300 | // Decoding succeeded. Now, security checks... | ||
| 301 | if (utf8::internal::is_code_point_valid(cp)) { | ||
| 302 | if (!utf8::internal::is_overlong_sequence(cp, length)){ | ||
| 303 | // Passed! Return here. | ||
| 304 | code_point = cp; | ||
| 305 | ++it; | ||
| 306 | return UTF8_OK; | ||
| 307 | } | ||
| 308 | else | ||
| 309 | err = OVERLONG_SEQUENCE; | ||
| 310 | } | ||
| 311 | else | ||
| 312 | err = INVALID_CODE_POINT; | ||
| 313 | } | ||
| 314 | |||
| 315 | // Failure branch - restore the original value of the iterator | ||
| 316 | it = original_it; | ||
| 317 | return err; | ||
| 318 | } | ||
| 319 | |||
| 320 | template <typename octet_iterator> | ||
| 321 | inline utf_error validate_next(octet_iterator& it, octet_iterator end) { | ||
| 322 | utfchar32_t ignored; | ||
| 323 | return utf8::internal::validate_next(it, end, ignored); | ||
| 324 | } | ||
| 325 | |||
| 326 | template <typename word_iterator> | ||
| 327 | utf_error validate_next16(word_iterator& it, word_iterator end, utfchar32_t& code_point) | ||
| 328 | { | ||
| 329 | // Make sure the iterator dereferences a large enough type | ||
| 330 | typedef typename std::iterator_traits<word_iterator>::value_type word_type; | ||
| 331 | UTF_CPP_STATIC_ASSERT(sizeof(word_type) >= sizeof(utfchar16_t)); | ||
| 332 | // Check the edge case: | ||
| 333 | if (it == end) | ||
| 334 | return NOT_ENOUGH_ROOM; | ||
| 335 | // Save the original value of it so we can go back in case of failure | ||
| 336 | // Of course, it does not make much sense with i.e. stream iterators | ||
| 337 | word_iterator original_it = it; | ||
| 338 | |||
| 339 | utf_error err = UTF8_OK; | ||
| 340 | |||
| 341 | const utfchar16_t first_word = *it++; | ||
| 342 | if (!is_surrogate(first_word)) { | ||
| 343 | code_point = first_word; | ||
| 344 | return UTF8_OK; | ||
| 345 | } | ||
| 346 | else { | ||
| 347 | if (it == end) | ||
| 348 | err = NOT_ENOUGH_ROOM; | ||
| 349 | else if (is_lead_surrogate(first_word)) { | ||
| 350 | const utfchar16_t second_word = *it++; | ||
| 351 | if (is_trail_surrogate(static_cast<utfchar32_t>(second_word))) { | ||
| 352 | code_point = static_cast<utfchar32_t>(first_word << 10) + static_cast<utfchar32_t>(second_word) + SURROGATE_OFFSET; | ||
| 353 | return UTF8_OK; | ||
| 354 | } else | ||
| 355 | err = INCOMPLETE_SEQUENCE; | ||
| 356 | |||
| 357 | } else { | ||
| 358 | err = INVALID_LEAD; | ||
| 359 | } | ||
| 360 | } | ||
| 361 | // error branch | ||
| 362 | it = original_it; | ||
| 363 | return err; | ||
| 364 | } | ||
| 365 | |||
| 366 | // Internal implementation of both checked and unchecked append() function | ||
| 367 | // This function will be invoked by the overloads below, as they will know | ||
| 368 | // the octet_type. | ||
| 369 | template <typename octet_iterator, typename octet_type> | ||
| 370 | octet_iterator append(utfchar32_t cp, octet_iterator result) { | ||
| 371 | if (cp < 0x80) // one octet | ||
| 372 | *(result++) = static_cast<octet_type>(cp); | ||
| 373 | else if (cp < 0x800) { // two octets | ||
| 374 | *(result++) = static_cast<octet_type>((cp >> 6) | 0xc0); | ||
| 375 | *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80); | ||
| 376 | } | ||
| 377 | else if (cp < 0x10000) { // three octets | ||
| 378 | *(result++) = static_cast<octet_type>((cp >> 12) | 0xe0); | ||
| 379 | *(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80); | ||
| 380 | *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80); | ||
| 381 | } | ||
| 382 | else { // four octets | ||
| 383 | *(result++) = static_cast<octet_type>((cp >> 18) | 0xf0); | ||
| 384 | *(result++) = static_cast<octet_type>(((cp >> 12) & 0x3f)| 0x80); | ||
| 385 | *(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80); | ||
| 386 | *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80); | ||
| 387 | } | ||
| 388 | return result; | ||
| 389 | } | ||
| 390 | |||
| 391 | // One of the following overloads will be invoked from the API calls | ||
| 392 | |||
| 393 | // A simple (but dangerous) case: the caller appends byte(s) to a char array | ||
| 394 | inline char* append(utfchar32_t cp, char* result) { | ||
| 395 | return append<char*, char>(cp, result); | ||
| 396 | } | ||
| 397 | |||
| 398 | // Hopefully, most common case: the caller uses back_inserter | ||
| 399 | // i.e. append(cp, std::back_inserter(str)); | ||
| 400 | template<typename container_type> | ||
| 401 | std::back_insert_iterator<container_type> append | ||
| 402 | (utfchar32_t cp, std::back_insert_iterator<container_type> result) { | ||
| 403 | return append<std::back_insert_iterator<container_type>, | ||
| 404 | typename container_type::value_type>(cp, result); | ||
| 405 | } | ||
| 406 | |||
| 407 | // The caller uses some other kind of output operator - not covered above | ||
| 408 | // Note that in this case we are not able to determine octet_type | ||
| 409 | // so we assume it's utfchar8_t; that can cause a conversion warning if we are wrong. | ||
| 410 | template <typename octet_iterator> | ||
| 411 | octet_iterator append(utfchar32_t cp, octet_iterator result) { | ||
| 412 | return append<octet_iterator, utfchar8_t>(cp, result); | ||
| 413 | } | ||
| 414 | |||
| 415 | // Internal implementation of both checked and unchecked append16() function | ||
| 416 | // This function will be invoked by the overloads below, as they will know | ||
| 417 | // the word_type. | ||
| 418 | template <typename word_iterator, typename word_type> | ||
| 419 | word_iterator append16(utfchar32_t cp, word_iterator result) { | ||
| 420 | UTF_CPP_STATIC_ASSERT(sizeof(word_type) >= sizeof(utfchar16_t)); | ||
| 421 | if (is_in_bmp(cp)) | ||
| 422 | *(result++) = static_cast<word_type>(cp); | ||
| 423 | else { | ||
| 424 | // Code points from the supplementary planes are encoded via surrogate pairs | ||
| 425 | *(result++) = static_cast<word_type>(LEAD_OFFSET + (cp >> 10)); | ||
| 426 | *(result++) = static_cast<word_type>(TRAIL_SURROGATE_MIN + (cp & 0x3FF)); | ||
| 427 | } | ||
| 428 | return result; | ||
| 429 | } | ||
| 430 | |||
| 431 | // Hopefully, most common case: the caller uses back_inserter | ||
| 432 | // i.e. append16(cp, std::back_inserter(str)); | ||
| 433 | template<typename container_type> | ||
| 434 | std::back_insert_iterator<container_type> append16 | ||
| 435 | (utfchar32_t cp, std::back_insert_iterator<container_type> result) { | ||
| 436 | return append16<std::back_insert_iterator<container_type>, | ||
| 437 | typename container_type::value_type>(cp, result); | ||
| 438 | } | ||
| 439 | |||
| 440 | // The caller uses some other kind of output operator - not covered above | ||
| 441 | // Note that in this case we are not able to determine word_type | ||
| 442 | // so we assume it's utfchar16_t; that can cause a conversion warning if we are wrong. | ||
| 443 | template <typename word_iterator> | ||
| 444 | word_iterator append16(utfchar32_t cp, word_iterator result) { | ||
| 445 | return append16<word_iterator, utfchar16_t>(cp, result); | ||
| 446 | } | ||
| 447 | |||
| 448 | } // namespace internal | ||
| 449 | |||
| 450 | /// The library API - functions intended to be called by the users | ||
| 451 | |||
| 452 | // Byte order mark | ||
| 453 | const utfchar8_t bom[] = {0xef, 0xbb, 0xbf}; | ||
| 454 | |||
| 455 | template <typename octet_iterator> | ||
| 456 | octet_iterator find_invalid(octet_iterator start, octet_iterator end) | ||
| 457 | { | ||
| 458 | octet_iterator result = start; | ||
| 459 | while (result != end) { | ||
| 460 | utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end); | ||
| 461 | if (err_code != internal::UTF8_OK) | ||
| 462 | return result; | ||
| 463 | } | ||
| 464 | return result; | ||
| 465 | } | ||
| 466 | |||
| 467 | inline const char* find_invalid(const char* str) | ||
| 468 | { | ||
| 469 | const char* end = str + std::strlen(str); | ||
| 470 | return find_invalid(str, end); | ||
| 471 | } | ||
| 472 | |||
| 473 | inline std::size_t find_invalid(const std::string& s) | ||
| 474 | { | ||
| 475 | std::string::const_iterator invalid = find_invalid(s.begin(), s.end()); | ||
| 476 | return (invalid == s.end()) ? std::string::npos : static_cast<std::size_t>(invalid - s.begin()); | ||
| 477 | } | ||
| 478 | |||
| 479 | template <typename octet_iterator> | ||
| 480 | inline bool is_valid(octet_iterator start, octet_iterator end) | ||
| 481 | { | ||
| 482 | return (utf8::find_invalid(start, end) == end); | ||
| 483 | } | ||
| 484 | |||
| 485 | inline bool is_valid(const char* str) | ||
| 486 | { | ||
| 487 | return (*(utf8::find_invalid(str)) == '\0'); | ||
| 488 | } | ||
| 489 | |||
| 490 | inline bool is_valid(const std::string& s) | ||
| 491 | { | ||
| 492 | return is_valid(s.begin(), s.end()); | ||
| 493 | } | ||
| 494 | |||
| 495 | |||
| 496 | |||
| 497 | template <typename octet_iterator> | ||
| 498 | inline bool starts_with_bom (octet_iterator it, octet_iterator end) | ||
| 499 | { | ||
| 500 | return ( | ||
| 501 | ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) && | ||
| 502 | ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && | ||
| 503 | ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) | ||
| 504 | ); | ||
| 505 | } | ||
| 506 | |||
| 507 | inline bool starts_with_bom(const std::string& s) | ||
| 508 | { | ||
| 509 | return starts_with_bom(s.begin(), s.end()); | ||
| 510 | } | ||
| 511 | } // namespace utf8 | ||
| 512 | |||
| 513 | #include <stdexcept> | ||
| 514 | |||
| 515 | namespace utf8 | ||
| 516 | { | ||
// Common base class of the exceptions thrown by the library.
class exception : public ::std::exception
{
};
| 520 | |||
| 521 | // Exceptions that may be thrown from the library functions. | ||
| 522 | class invalid_code_point : public exception { | ||
| 523 | utfchar32_t cp; | ||
| 524 | public: | ||
| 525 | invalid_code_point(utfchar32_t codepoint) : cp(codepoint) {} | ||
| 526 | virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid code point"; } | ||
| 527 | utfchar32_t code_point() const {return cp;} | ||
| 528 | }; | ||
| 529 | |||
| 530 | class invalid_utf8 : public exception { | ||
| 531 | utfchar8_t u8; | ||
| 532 | public: | ||
| 533 | invalid_utf8 (utfchar8_t u) : u8(u) {} | ||
| 534 | invalid_utf8 (char c) : u8(static_cast<utfchar8_t>(c)) {} | ||
| 535 | virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-8"; } | ||
| 536 | utfchar8_t utf8_octet() const {return u8;} | ||
| 537 | }; | ||
| 538 | |||
| 539 | class invalid_utf16 : public exception { | ||
| 540 | utfchar16_t u16; | ||
| 541 | public: | ||
| 542 | invalid_utf16 (utfchar16_t u) : u16(u) {} | ||
| 543 | virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-16"; } | ||
| 544 | utfchar16_t utf16_word() const {return u16;} | ||
| 545 | }; | ||
| 546 | |||
| 547 | class not_enough_room : public exception { | ||
| 548 | public: | ||
| 549 | virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Not enough space"; } | ||
| 550 | }; | ||
| 551 | |||
| 552 | /// The library API - functions intended to be called by the users | ||
| 553 | |||
| 554 | template <typename octet_iterator> | ||
| 555 | octet_iterator append(utfchar32_t cp, octet_iterator result) | ||
| 556 | { | ||
| 557 | if (!utf8::internal::is_code_point_valid(cp)) | ||
| 558 | throw invalid_code_point(cp); | ||
| 559 | |||
| 560 | return internal::append(cp, result); | ||
| 561 | } | ||
| 562 | |||
| 563 | inline void append(utfchar32_t cp, std::string& s) | ||
| 564 | { | ||
| 565 | append(cp, std::back_inserter(s)); | ||
| 566 | } | ||
| 567 | |||
| 568 | template <typename word_iterator> | ||
| 569 | word_iterator append16(utfchar32_t cp, word_iterator result) | ||
| 570 | { | ||
| 571 | if (!utf8::internal::is_code_point_valid(cp)) | ||
| 572 | throw invalid_code_point(cp); | ||
| 573 | |||
| 574 | return internal::append16(cp, result); | ||
| 575 | } | ||
| 576 | |||
| 577 | template <typename octet_iterator, typename output_iterator> | ||
| 578 | output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement) | ||
| 579 | { | ||
| 580 | while (start != end) { | ||
| 581 | octet_iterator sequence_start = start; | ||
| 582 | internal::utf_error err_code = utf8::internal::validate_next(start, end); | ||
| 583 | switch (err_code) { | ||
| 584 | case internal::UTF8_OK : | ||
| 585 | for (octet_iterator it = sequence_start; it != start; ++it) | ||
| 586 | *out++ = *it; | ||
| 587 | break; | ||
| 588 | case internal::NOT_ENOUGH_ROOM: | ||
| 589 | out = utf8::append (replacement, out); | ||
| 590 | start = end; | ||
| 591 | break; | ||
| 592 | case internal::INVALID_LEAD: | ||
| 593 | out = utf8::append (replacement, out); | ||
| 594 | ++start; | ||
| 595 | break; | ||
| 596 | case internal::INCOMPLETE_SEQUENCE: | ||
| 597 | case internal::OVERLONG_SEQUENCE: | ||
| 598 | case internal::INVALID_CODE_POINT: | ||
| 599 | out = utf8::append (replacement, out); | ||
| 600 | ++start; | ||
| 601 | // just one replacement mark for the sequence | ||
| 602 | while (start != end && utf8::internal::is_trail(*start)) | ||
| 603 | ++start; | ||
| 604 | break; | ||
| 605 | } | ||
| 606 | } | ||
| 607 | return out; | ||
| 608 | } | ||
| 609 | |||
| 610 | template <typename octet_iterator, typename output_iterator> | ||
| 611 | inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) | ||
| 612 | { | ||
| 613 | static const utfchar32_t replacement_marker = static_cast<utfchar32_t>(utf8::internal::mask16(0xfffd)); | ||
| 614 | return utf8::replace_invalid(start, end, out, replacement_marker); | ||
| 615 | } | ||
| 616 | |||
| 617 | inline std::string replace_invalid(const std::string& s, utfchar32_t replacement) | ||
| 618 | { | ||
| 619 | std::string result; | ||
| 620 | replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); | ||
| 621 | return result; | ||
| 622 | } | ||
| 623 | |||
| 624 | inline std::string replace_invalid(const std::string& s) | ||
| 625 | { | ||
| 626 | std::string result; | ||
| 627 | replace_invalid(s.begin(), s.end(), std::back_inserter(result)); | ||
| 628 | return result; | ||
| 629 | } | ||
| 630 | |||
| 631 | template <typename octet_iterator> | ||
| 632 | utfchar32_t next(octet_iterator& it, octet_iterator end) | ||
| 633 | { | ||
| 634 | utfchar32_t cp = 0; | ||
| 635 | internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); | ||
| 636 | switch (err_code) { | ||
| 637 | case internal::UTF8_OK : | ||
| 638 | break; | ||
| 639 | case internal::NOT_ENOUGH_ROOM : | ||
| 640 | throw not_enough_room(); | ||
| 641 | case internal::INVALID_LEAD : | ||
| 642 | case internal::INCOMPLETE_SEQUENCE : | ||
| 643 | case internal::OVERLONG_SEQUENCE : | ||
| 644 | throw invalid_utf8(static_cast<utfchar8_t>(*it)); | ||
| 645 | case internal::INVALID_CODE_POINT : | ||
| 646 | throw invalid_code_point(cp); | ||
| 647 | } | ||
| 648 | return cp; | ||
| 649 | } | ||
| 650 | |||
| 651 | template <typename word_iterator> | ||
| 652 | utfchar32_t next16(word_iterator& it, word_iterator end) | ||
| 653 | { | ||
| 654 | utfchar32_t cp = 0; | ||
| 655 | internal::utf_error err_code = utf8::internal::validate_next16(it, end, cp); | ||
| 656 | if (err_code == internal::NOT_ENOUGH_ROOM) | ||
| 657 | throw not_enough_room(); | ||
| 658 | return cp; | ||
| 659 | } | ||
| 660 | |||
| 661 | template <typename octet_iterator> | ||
| 662 | utfchar32_t peek_next(octet_iterator it, octet_iterator end) | ||
| 663 | { | ||
| 664 | return utf8::next(it, end); | ||
| 665 | } | ||
| 666 | |||
| 667 | template <typename octet_iterator> | ||
| 668 | utfchar32_t prior(octet_iterator& it, octet_iterator start) | ||
| 669 | { | ||
| 670 | // can't do much if it == start | ||
| 671 | if (it == start) | ||
| 672 | throw not_enough_room(); | ||
| 673 | |||
| 674 | octet_iterator end = it; | ||
| 675 | // Go back until we hit either a lead octet or start | ||
| 676 | while (utf8::internal::is_trail(*(--it))) | ||
| 677 | if (it == start) | ||
| 678 | throw invalid_utf8(*it); // error - no lead byte in the sequence | ||
| 679 | return utf8::peek_next(it, end); | ||
| 680 | } | ||
| 681 | |||
| 682 | template <typename octet_iterator, typename distance_type> | ||
| 683 | void advance (octet_iterator& it, distance_type n, octet_iterator end) | ||
| 684 | { | ||
| 685 | const distance_type zero(0); | ||
| 686 | if (n < zero) { | ||
| 687 | // backward | ||
| 688 | for (distance_type i = n; i < zero; ++i) | ||
| 689 | utf8::prior(it, end); | ||
| 690 | } else { | ||
| 691 | // forward | ||
| 692 | for (distance_type i = zero; i < n; ++i) | ||
| 693 | utf8::next(it, end); | ||
| 694 | } | ||
| 695 | } | ||
| 696 | |||
| 697 | template <typename octet_iterator> | ||
| 698 | typename std::iterator_traits<octet_iterator>::difference_type | ||
| 699 | distance (octet_iterator first, octet_iterator last) | ||
| 700 | { | ||
| 701 | typename std::iterator_traits<octet_iterator>::difference_type dist; | ||
| 702 | for (dist = 0; first < last; ++dist) | ||
| 703 | utf8::next(first, last); | ||
| 704 | return dist; | ||
| 705 | } | ||
| 706 | |||
| 707 | template <typename u16bit_iterator, typename octet_iterator> | ||
| 708 | octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) | ||
| 709 | { | ||
| 710 | while (start != end) { | ||
| 711 | utfchar32_t cp = static_cast<utfchar32_t>(utf8::internal::mask16(*start++)); | ||
| 712 | // Take care of surrogate pairs first | ||
| 713 | if (utf8::internal::is_lead_surrogate(cp)) { | ||
| 714 | if (start != end) { | ||
| 715 | const utfchar32_t trail_surrogate = static_cast<utfchar32_t>(utf8::internal::mask16(*start++)); | ||
| 716 | if (utf8::internal::is_trail_surrogate(trail_surrogate)) | ||
| 717 | cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; | ||
| 718 | else | ||
| 719 | throw invalid_utf16(static_cast<utfchar16_t>(trail_surrogate)); | ||
| 720 | } | ||
| 721 | else | ||
| 722 | throw invalid_utf16(static_cast<utfchar16_t>(cp)); | ||
| 723 | |||
| 724 | } | ||
| 725 | // Lone trail surrogate | ||
| 726 | else if (utf8::internal::is_trail_surrogate(cp)) | ||
| 727 | throw invalid_utf16(static_cast<utfchar16_t>(cp)); | ||
| 728 | |||
| 729 | result = utf8::append(cp, result); | ||
| 730 | } | ||
| 731 | return result; | ||
| 732 | } | ||
| 733 | |||
| 734 | template <typename u16bit_iterator, typename octet_iterator> | ||
| 735 | u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) | ||
| 736 | { | ||
| 737 | while (start < end) { | ||
| 738 | const utfchar32_t cp = utf8::next(start, end); | ||
| 739 | if (cp > 0xffff) { //make a surrogate pair | ||
| 740 | *result++ = static_cast<utfchar16_t>((cp >> 10) + internal::LEAD_OFFSET); | ||
| 741 | *result++ = static_cast<utfchar16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); | ||
| 742 | } | ||
| 743 | else | ||
| 744 | *result++ = static_cast<utfchar16_t>(cp); | ||
| 745 | } | ||
| 746 | return result; | ||
| 747 | } | ||
| 748 | |||
| 749 | template <typename octet_iterator, typename u32bit_iterator> | ||
| 750 | octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) | ||
| 751 | { | ||
| 752 | while (start != end) | ||
| 753 | result = utf8::append(*(start++), result); | ||
| 754 | |||
| 755 | return result; | ||
| 756 | } | ||
| 757 | |||
| 758 | template <typename octet_iterator, typename u32bit_iterator> | ||
| 759 | u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) | ||
| 760 | { | ||
| 761 | while (start < end) | ||
| 762 | (*result++) = utf8::next(start, end); | ||
| 763 | |||
| 764 | return result; | ||
| 765 | } | ||
| 766 | |||
| 767 | // The iterator class | ||
| 768 | template <typename octet_iterator> | ||
| 769 | class iterator { | ||
| 770 | octet_iterator it; | ||
| 771 | octet_iterator range_start; | ||
| 772 | octet_iterator range_end; | ||
| 773 | public: | ||
| 774 | typedef utfchar32_t value_type; | ||
| 775 | typedef utfchar32_t* pointer; | ||
| 776 | typedef utfchar32_t& reference; | ||
| 777 | typedef std::ptrdiff_t difference_type; | ||
| 778 | typedef std::bidirectional_iterator_tag iterator_category; | ||
| 779 | iterator () {} | ||
| 780 | explicit iterator (const octet_iterator& octet_it, | ||
| 781 | const octet_iterator& rangestart, | ||
| 782 | const octet_iterator& rangeend) : | ||
| 783 | it(octet_it), range_start(rangestart), range_end(rangeend) | ||
| 784 | { | ||
| 785 | if (it < range_start || it > range_end) | ||
| 786 | throw std::out_of_range("Invalid utf-8 iterator position"); | ||
| 787 | } | ||
| 788 | // the default "big three" are OK | ||
| 789 | octet_iterator base () const { return it; } | ||
| 790 | utfchar32_t operator * () const | ||
| 791 | { | ||
| 792 | octet_iterator temp = it; | ||
| 793 | return utf8::next(temp, range_end); | ||
| 794 | } | ||
| 795 | bool operator == (const iterator& rhs) const | ||
| 796 | { | ||
| 797 | if (range_start != rhs.range_start || range_end != rhs.range_end) | ||
| 798 | throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); | ||
| 799 | return (it == rhs.it); | ||
| 800 | } | ||
| 801 | bool operator != (const iterator& rhs) const | ||
| 802 | { | ||
| 803 | return !(operator == (rhs)); | ||
| 804 | } | ||
| 805 | iterator& operator ++ () | ||
| 806 | { | ||
| 807 | utf8::next(it, range_end); | ||
| 808 | return *this; | ||
| 809 | } | ||
| 810 | iterator operator ++ (int) | ||
| 811 | { | ||
| 812 | iterator temp = *this; | ||
| 813 | utf8::next(it, range_end); | ||
| 814 | return temp; | ||
| 815 | } | ||
| 816 | iterator& operator -- () | ||
| 817 | { | ||
| 818 | utf8::prior(it, range_start); | ||
| 819 | return *this; | ||
| 820 | } | ||
| 821 | iterator operator -- (int) | ||
| 822 | { | ||
| 823 | iterator temp = *this; | ||
| 824 | utf8::prior(it, range_start); | ||
| 825 | return temp; | ||
| 826 | } | ||
| 827 | }; // class iterator | ||
| 828 | |||
| 829 | } // namespace utf8 | ||
| 830 | |||
| 831 | #if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later | ||
| 832 | namespace utf8 | ||
| 833 | { | ||
| 834 | inline void append16(utfchar32_t cp, std::u16string& s) | ||
| 835 | { | ||
| 836 | append16(cp, std::back_inserter(s)); | ||
| 837 | } | ||
| 838 | |||
| 839 | inline std::string utf16to8(const std::u16string& s) | ||
| 840 | { | ||
| 841 | std::string result; | ||
| 842 | utf16to8(s.begin(), s.end(), std::back_inserter(result)); | ||
| 843 | return result; | ||
| 844 | } | ||
| 845 | |||
| 846 | inline std::u16string utf8to16(const std::string& s) | ||
| 847 | { | ||
| 848 | std::u16string result; | ||
| 849 | utf8to16(s.begin(), s.end(), std::back_inserter(result)); | ||
| 850 | return result; | ||
| 851 | } | ||
| 852 | |||
| 853 | inline std::string utf32to8(const std::u32string& s) | ||
| 854 | { | ||
| 855 | std::string result; | ||
| 856 | utf32to8(s.begin(), s.end(), std::back_inserter(result)); | ||
| 857 | return result; | ||
| 858 | } | ||
| 859 | |||
| 860 | inline std::u32string utf8to32(const std::string& s) | ||
| 861 | { | ||
| 862 | std::u32string result; | ||
| 863 | utf8to32(s.begin(), s.end(), std::back_inserter(result)); | ||
| 864 | return result; | ||
| 865 | } | ||
| 866 | } // namespace utf8 | ||
| 867 | #endif // C++ 11 or later | ||
| 868 | |||
| 869 | #if UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later | ||
| 870 | namespace utf8 | ||
| 871 | { | ||
| 872 | inline std::string utf16to8(std::u16string_view s) | ||
| 873 | { | ||
| 874 | std::string result; | ||
| 875 | utf16to8(s.begin(), s.end(), std::back_inserter(result)); | ||
| 876 | return result; | ||
| 877 | } | ||
| 878 | |||
| 879 | inline std::u16string utf8to16(std::string_view s) | ||
| 880 | { | ||
| 881 | std::u16string result; | ||
| 882 | utf8to16(s.begin(), s.end(), std::back_inserter(result)); | ||
| 883 | return result; | ||
| 884 | } | ||
| 885 | |||
| 886 | inline std::string utf32to8(std::u32string_view s) | ||
| 887 | { | ||
| 888 | std::string result; | ||
| 889 | utf32to8(s.begin(), s.end(), std::back_inserter(result)); | ||
| 890 | return result; | ||
| 891 | } | ||
| 892 | |||
| 893 | inline std::u32string utf8to32(std::string_view s) | ||
| 894 | { | ||
| 895 | std::u32string result; | ||
| 896 | utf8to32(s.begin(), s.end(), std::back_inserter(result)); | ||
| 897 | return result; | ||
| 898 | } | ||
| 899 | |||
| 900 | inline std::size_t find_invalid(std::string_view s) | ||
| 901 | { | ||
| 902 | std::string_view::const_iterator invalid = find_invalid(s.begin(), s.end()); | ||
| 903 | return (invalid == s.end()) ? std::string_view::npos : static_cast<std::size_t>(invalid - s.begin()); | ||
| 904 | } | ||
| 905 | |||
| 906 | inline bool is_valid(std::string_view s) | ||
| 907 | { | ||
| 908 | return is_valid(s.begin(), s.end()); | ||
| 909 | } | ||
| 910 | |||
| 911 | inline std::string replace_invalid(std::string_view s, char32_t replacement) | ||
| 912 | { | ||
| 913 | std::string result; | ||
| 914 | replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); | ||
| 915 | return result; | ||
| 916 | } | ||
| 917 | |||
| 918 | inline std::string replace_invalid(std::string_view s) | ||
| 919 | { | ||
| 920 | std::string result; | ||
| 921 | replace_invalid(s.begin(), s.end(), std::back_inserter(result)); | ||
| 922 | return result; | ||
| 923 | } | ||
| 924 | |||
| 925 | inline bool starts_with_bom(std::string_view s) | ||
| 926 | { | ||
| 927 | return starts_with_bom(s.begin(), s.end()); | ||
| 928 | } | ||
| 929 | |||
| 930 | } // namespace utf8 | ||
| 931 | #endif // C++ 17 or later | ||
| 932 | |||
| 933 | #if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later | ||
| 934 | namespace utf8 | ||
| 935 | { | ||
| 936 | inline std::u8string utf16tou8(const std::u16string& s) | ||
| 937 | { | ||
| 938 | std::u8string result; | ||
| 939 | utf16to8(s.begin(), s.end(), std::back_inserter(result)); | ||
| 940 | return result; | ||
| 941 | } | ||
| 942 | |||
| 943 | inline std::u8string utf16tou8(std::u16string_view s) | ||
| 944 | { | ||
| 945 | std::u8string result; | ||
| 946 | utf16to8(s.begin(), s.end(), std::back_inserter(result)); | ||
| 947 | return result; | ||
| 948 | } | ||
| 949 | |||
| 950 | inline std::u16string utf8to16(const std::u8string& s) | ||
| 951 | { | ||
| 952 | std::u16string result; | ||
| 953 | utf8to16(s.begin(), s.end(), std::back_inserter(result)); | ||
| 954 | return result; | ||
| 955 | } | ||
| 956 | |||
| 957 | inline std::u16string utf8to16(const std::u8string_view& s) | ||
| 958 | { | ||
| 959 | std::u16string result; | ||
| 960 | utf8to16(s.begin(), s.end(), std::back_inserter(result)); | ||
| 961 | return result; | ||
| 962 | } | ||
| 963 | |||
| 964 | inline std::u8string utf32tou8(const std::u32string& s) | ||
| 965 | { | ||
| 966 | std::u8string result; | ||
| 967 | utf32to8(s.begin(), s.end(), std::back_inserter(result)); | ||
| 968 | return result; | ||
| 969 | } | ||
| 970 | |||
| 971 | inline std::u8string utf32tou8(const std::u32string_view& s) | ||
| 972 | { | ||
| 973 | std::u8string result; | ||
| 974 | utf32to8(s.begin(), s.end(), std::back_inserter(result)); | ||
| 975 | return result; | ||
| 976 | } | ||
| 977 | |||
| 978 | inline std::u32string utf8to32(const std::u8string& s) | ||
| 979 | { | ||
| 980 | std::u32string result; | ||
| 981 | utf8to32(s.begin(), s.end(), std::back_inserter(result)); | ||
| 982 | return result; | ||
| 983 | } | ||
| 984 | |||
| 985 | inline std::u32string utf8to32(const std::u8string_view& s) | ||
| 986 | { | ||
| 987 | std::u32string result; | ||
| 988 | utf8to32(s.begin(), s.end(), std::back_inserter(result)); | ||
| 989 | return result; | ||
| 990 | } | ||
| 991 | |||
| 992 | inline std::size_t find_invalid(const std::u8string& s) | ||
| 993 | { | ||
| 994 | std::u8string::const_iterator invalid = find_invalid(s.begin(), s.end()); | ||
| 995 | return (invalid == s.end()) ? std::string_view::npos : static_cast<std::size_t>(invalid - s.begin()); | ||
| 996 | } | ||
| 997 | |||
| 998 | inline bool is_valid(const std::u8string& s) | ||
| 999 | { | ||
| 1000 | return is_valid(s.begin(), s.end()); | ||
| 1001 | } | ||
| 1002 | |||
| 1003 | inline std::u8string replace_invalid(const std::u8string& s, char32_t replacement) | ||
| 1004 | { | ||
| 1005 | std::u8string result; | ||
| 1006 | replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); | ||
| 1007 | return result; | ||
| 1008 | } | ||
| 1009 | |||
| 1010 | inline std::u8string replace_invalid(const std::u8string& s) | ||
| 1011 | { | ||
| 1012 | std::u8string result; | ||
| 1013 | replace_invalid(s.begin(), s.end(), std::back_inserter(result)); | ||
| 1014 | return result; | ||
| 1015 | } | ||
| 1016 | |||
| 1017 | inline bool starts_with_bom(const std::u8string& s) | ||
| 1018 | { | ||
| 1019 | return starts_with_bom(s.begin(), s.end()); | ||
| 1020 | } | ||
| 1021 | |||
| 1022 | } // namespace utf8 | ||
| 1023 | #endif // C++ 20 or later | ||
| 1024 | |||
| 1025 | namespace utf8 | ||
| 1026 | { | ||
| 1027 | namespace unchecked | ||
| 1028 | { | ||
| 1029 | template <typename octet_iterator> | ||
| 1030 | octet_iterator append(utfchar32_t cp, octet_iterator result) | ||
| 1031 | { | ||
| 1032 | return internal::append(cp, result); | ||
| 1033 | } | ||
| 1034 | |||
| 1035 | template <typename word_iterator> | ||
| 1036 | word_iterator append16(utfchar32_t cp, word_iterator result) | ||
| 1037 | { | ||
| 1038 | return internal::append16(cp, result); | ||
| 1039 | } | ||
| 1040 | |||
| 1041 | template <typename octet_iterator, typename output_iterator> | ||
| 1042 | output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement) | ||
| 1043 | { | ||
| 1044 | while (start != end) { | ||
| 1045 | octet_iterator sequence_start = start; | ||
| 1046 | internal::utf_error err_code = utf8::internal::validate_next(start, end); | ||
| 1047 | switch (err_code) { | ||
| 1048 | case internal::UTF8_OK : | ||
| 1049 | for (octet_iterator it = sequence_start; it != start; ++it) | ||
| 1050 | *out++ = *it; | ||
| 1051 | break; | ||
| 1052 | case internal::NOT_ENOUGH_ROOM: | ||
| 1053 | out = utf8::unchecked::append(replacement, out); | ||
| 1054 | start = end; | ||
| 1055 | break; | ||
| 1056 | case internal::INVALID_LEAD: | ||
| 1057 | out = utf8::unchecked::append(replacement, out); | ||
| 1058 | ++start; | ||
| 1059 | break; | ||
| 1060 | case internal::INCOMPLETE_SEQUENCE: | ||
| 1061 | case internal::OVERLONG_SEQUENCE: | ||
| 1062 | case internal::INVALID_CODE_POINT: | ||
| 1063 | out = utf8::unchecked::append(replacement, out); | ||
| 1064 | ++start; | ||
| 1065 | // just one replacement mark for the sequence | ||
| 1066 | while (start != end && utf8::internal::is_trail(*start)) | ||
| 1067 | ++start; | ||
| 1068 | break; | ||
| 1069 | } | ||
| 1070 | } | ||
| 1071 | return out; | ||
| 1072 | } | ||
| 1073 | |||
| 1074 | template <typename octet_iterator, typename output_iterator> | ||
| 1075 | inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) | ||
| 1076 | { | ||
| 1077 | static const utfchar32_t replacement_marker = static_cast<utfchar32_t>(utf8::internal::mask16(0xfffd)); | ||
| 1078 | return utf8::unchecked::replace_invalid(start, end, out, replacement_marker); | ||
| 1079 | } | ||
| 1080 | |||
| 1081 | inline std::string replace_invalid(const std::string& s, utfchar32_t replacement) | ||
| 1082 | { | ||
| 1083 | std::string result; | ||
| 1084 | replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); | ||
| 1085 | return result; | ||
| 1086 | } | ||
| 1087 | |||
| 1088 | inline std::string replace_invalid(const std::string& s) | ||
| 1089 | { | ||
| 1090 | std::string result; | ||
| 1091 | replace_invalid(s.begin(), s.end(), std::back_inserter(result)); | ||
| 1092 | return result; | ||
| 1093 | } | ||
| 1094 | |||
| 1095 | template <typename octet_iterator> | ||
| 1096 | utfchar32_t next(octet_iterator& it) | ||
| 1097 | { | ||
| 1098 | utfchar32_t cp = utf8::internal::mask8(*it); | ||
| 1099 | switch (utf8::internal::sequence_length(it)) { | ||
| 1100 | case 1: | ||
| 1101 | break; | ||
| 1102 | case 2: | ||
| 1103 | ++it; | ||
| 1104 | cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); | ||
| 1105 | break; | ||
| 1106 | case 3: | ||
| 1107 | ++it; | ||
| 1108 | cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); | ||
| 1109 | ++it; | ||
| 1110 | cp = static_cast<utfchar32_t>(cp + ((*it) & 0x3f)); | ||
| 1111 | break; | ||
| 1112 | case 4: | ||
| 1113 | ++it; | ||
| 1114 | cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); | ||
| 1115 | ++it; | ||
| 1116 | cp = static_cast<utfchar32_t>(cp + ((utf8::internal::mask8(*it) << 6) & 0xfff)); | ||
| 1117 | ++it; | ||
| 1118 | cp = static_cast<utfchar32_t>(cp + ((*it) & 0x3f)); | ||
| 1119 | break; | ||
| 1120 | } | ||
| 1121 | ++it; | ||
| 1122 | return cp; | ||
| 1123 | } | ||
| 1124 | |||
| 1125 | template <typename octet_iterator> | ||
| 1126 | utfchar32_t peek_next(octet_iterator it) | ||
| 1127 | { | ||
| 1128 | return utf8::unchecked::next(it); | ||
| 1129 | } | ||
| 1130 | |||
| 1131 | template <typename word_iterator> | ||
| 1132 | utfchar32_t next16(word_iterator& it) | ||
| 1133 | { | ||
| 1134 | utfchar32_t cp = utf8::internal::mask16(*it++); | ||
| 1135 | if (utf8::internal::is_lead_surrogate(cp)) | ||
| 1136 | return (cp << 10) + *it++ + utf8::internal::SURROGATE_OFFSET; | ||
| 1137 | return cp; | ||
| 1138 | } | ||
| 1139 | |||
| 1140 | template <typename octet_iterator> | ||
| 1141 | utfchar32_t prior(octet_iterator& it) | ||
| 1142 | { | ||
| 1143 | while (utf8::internal::is_trail(*(--it))) ; | ||
| 1144 | octet_iterator temp = it; | ||
| 1145 | return utf8::unchecked::next(temp); | ||
| 1146 | } | ||
| 1147 | |||
| 1148 | template <typename octet_iterator, typename distance_type> | ||
| 1149 | void advance(octet_iterator& it, distance_type n) | ||
| 1150 | { | ||
| 1151 | const distance_type zero(0); | ||
| 1152 | if (n < zero) { | ||
| 1153 | // backward | ||
| 1154 | for (distance_type i = n; i < zero; ++i) | ||
| 1155 | utf8::unchecked::prior(it); | ||
| 1156 | } else { | ||
| 1157 | // forward | ||
| 1158 | for (distance_type i = zero; i < n; ++i) | ||
| 1159 | utf8::unchecked::next(it); | ||
| 1160 | } | ||
| 1161 | } | ||
| 1162 | |||
| 1163 | template <typename octet_iterator> | ||
| 1164 | typename std::iterator_traits<octet_iterator>::difference_type | ||
| 1165 | distance(octet_iterator first, octet_iterator last) | ||
| 1166 | { | ||
| 1167 | typename std::iterator_traits<octet_iterator>::difference_type dist; | ||
| 1168 | for (dist = 0; first < last; ++dist) | ||
| 1169 | utf8::unchecked::next(first); | ||
| 1170 | return dist; | ||
| 1171 | } | ||
| 1172 | |||
| 1173 | template <typename u16bit_iterator, typename octet_iterator> | ||
| 1174 | octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end, octet_iterator result) | ||
| 1175 | { | ||
| 1176 | while (start != end) { | ||
| 1177 | utfchar32_t cp = utf8::internal::mask16(*start++); | ||
| 1178 | // Take care of surrogate pairs first | ||
| 1179 | if (utf8::internal::is_lead_surrogate(cp)) { | ||
| 1180 | if (start == end) | ||
| 1181 | return result; | ||
| 1182 | utfchar32_t trail_surrogate = utf8::internal::mask16(*start++); | ||
| 1183 | cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; | ||
| 1184 | } | ||
| 1185 | result = utf8::unchecked::append(cp, result); | ||
| 1186 | } | ||
| 1187 | return result; | ||
| 1188 | } | ||
| 1189 | |||
| 1190 | template <typename u16bit_iterator, typename octet_iterator> | ||
| 1191 | u16bit_iterator utf8to16(octet_iterator start, octet_iterator end, u16bit_iterator result) | ||
| 1192 | { | ||
| 1193 | while (start < end) { | ||
| 1194 | utfchar32_t cp = utf8::unchecked::next(start); | ||
| 1195 | if (cp > 0xffff) { //make a surrogate pair | ||
| 1196 | *result++ = static_cast<utfchar16_t>((cp >> 10) + internal::LEAD_OFFSET); | ||
| 1197 | *result++ = static_cast<utfchar16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); | ||
| 1198 | } | ||
| 1199 | else | ||
| 1200 | *result++ = static_cast<utfchar16_t>(cp); | ||
| 1201 | } | ||
| 1202 | return result; | ||
| 1203 | } | ||
| 1204 | |||
| 1205 | template <typename octet_iterator, typename u32bit_iterator> | ||
| 1206 | octet_iterator utf32to8(u32bit_iterator start, u32bit_iterator end, octet_iterator result) | ||
| 1207 | { | ||
| 1208 | while (start != end) | ||
| 1209 | result = utf8::unchecked::append(*(start++), result); | ||
| 1210 | |||
| 1211 | return result; | ||
| 1212 | } | ||
| 1213 | |||
| 1214 | template <typename octet_iterator, typename u32bit_iterator> | ||
| 1215 | u32bit_iterator utf8to32(octet_iterator start, octet_iterator end, u32bit_iterator result) | ||
| 1216 | { | ||
| 1217 | while (start < end) | ||
| 1218 | (*result++) = utf8::unchecked::next(start); | ||
| 1219 | |||
| 1220 | return result; | ||
| 1221 | } | ||
| 1222 | |||
| 1223 | // The iterator class | ||
| 1224 | template <typename octet_iterator> | ||
| 1225 | class iterator { | ||
| 1226 | octet_iterator it; | ||
| 1227 | public: | ||
| 1228 | typedef utfchar32_t value_type; | ||
| 1229 | typedef utfchar32_t* pointer; | ||
| 1230 | typedef utfchar32_t& reference; | ||
| 1231 | typedef std::ptrdiff_t difference_type; | ||
| 1232 | typedef std::bidirectional_iterator_tag iterator_category; | ||
| 1233 | iterator () {} | ||
| 1234 | explicit iterator (const octet_iterator& octet_it): it(octet_it) {} | ||
| 1235 | // the default "big three" are OK | ||
| 1236 | octet_iterator base () const { return it; } | ||
| 1237 | utfchar32_t operator * () const | ||
| 1238 | { | ||
| 1239 | octet_iterator temp = it; | ||
| 1240 | return utf8::unchecked::next(temp); | ||
| 1241 | } | ||
| 1242 | bool operator == (const iterator& rhs) const | ||
| 1243 | { | ||
| 1244 | return (it == rhs.it); | ||
| 1245 | } | ||
| 1246 | bool operator != (const iterator& rhs) const | ||
| 1247 | { | ||
| 1248 | return !(operator == (rhs)); | ||
| 1249 | } | ||
| 1250 | iterator& operator ++ () | ||
| 1251 | { | ||
| 1252 | ::std::advance(it, utf8::internal::sequence_length(it)); | ||
| 1253 | return *this; | ||
| 1254 | } | ||
| 1255 | iterator operator ++ (int) | ||
| 1256 | { | ||
| 1257 | iterator temp = *this; | ||
| 1258 | ::std::advance(it, utf8::internal::sequence_length(it)); | ||
| 1259 | return temp; | ||
| 1260 | } | ||
| 1261 | iterator& operator -- () | ||
| 1262 | { | ||
| 1263 | utf8::unchecked::prior(it); | ||
| 1264 | return *this; | ||
| 1265 | } | ||
| 1266 | iterator operator -- (int) | ||
| 1267 | { | ||
| 1268 | iterator temp = *this; | ||
| 1269 | utf8::unchecked::prior(it); | ||
| 1270 | return temp; | ||
| 1271 | } | ||
| 1272 | }; // class iterator | ||
| 1273 | |||
| 1274 | } // namespace utf8::unchecked | ||
| 1275 | } // namespace utf8 | ||
| 1276 | |||
| 1277 | #endif // header guard | ||
