aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author: Li Jin <dragon-fly@qq.com> 2025-11-05 11:09:25 +0800
committer: Li Jin <dragon-fly@qq.com> 2025-11-05 11:09:25 +0800
commit: 4e7cf4d863804303a931d6e16df7de6616337909 (patch)
tree: b7ca7d521c5e350ccbd160c8d4ff55269e998903 /src
parent: 3a830bd4dd003dbf90270ca22f40b517b7b576df (diff)
downloadyuescript-4e7cf4d863804303a931d6e16df7de6616337909.tar.gz
yuescript-4e7cf4d863804303a931d6e16df7de6616337909.tar.bz2
yuescript-4e7cf4d863804303a931d6e16df7de6616337909.zip
Removed the use of codecvt from C++ for it being removed in C++26.
Diffstat (limited to 'src')
-rwxr-xr-x src/3rdParty/utf8cpp.h 1277
-rw-r--r-- src/yuescript/parser.cpp 29
-rw-r--r-- src/yuescript/parser.hpp 7
-rw-r--r-- src/yuescript/yue_ast.cpp 2
-rw-r--r-- src/yuescript/yue_ast.h 1
-rw-r--r-- src/yuescript/yue_compiler.cpp 2
-rw-r--r-- src/yuescript/yue_parser.cpp 12
-rw-r--r-- src/yuescript/yue_parser.h 1
8 files changed, 1316 insertions, 15 deletions
diff --git a/src/3rdParty/utf8cpp.h b/src/3rdParty/utf8cpp.h
new file mode 100755
index 0000000..76f0fa1
--- /dev/null
+++ b/src/3rdParty/utf8cpp.h
@@ -0,0 +1,1277 @@
1// Copyright 2006 Nemanja Trifunovic
2
3/*
4Permission is hereby granted, free of charge, to any person or organization
5obtaining a copy of the software and accompanying documentation covered by
6this license (the "Software") to use, reproduce, display, distribute,
7execute, and transmit the Software, and to prepare derivative works of the
8Software, and to permit third-parties to whom the Software is furnished to
9do so, all subject to the following:
10
11The copyright notices in the Software and this entire statement, including
12the above license grant, this restriction and the following disclaimer,
13must be included in all copies of the Software, in whole or in part, and
14all derivative works of the Software, unless such copies or derivative
15works are solely in the form of machine-executable object code generated by
16a source language processor.
17
18THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
21SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
22FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
23ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24DEALINGS IN THE SOFTWARE.
25*/
26
27
28#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
29#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
30
31/*
32To control the C++ language version used by the library, you can define UTF_CPP_CPLUSPLUS macro
33and set it to one of the values used by the __cplusplus predefined macro.
34
35For instance,
36 #define UTF_CPP_CPLUSPLUS 199711L
37will cause the UTF-8 CPP library to use only types and language features available in the C++ 98 standard.
38Some library features will be disabled.
39
40If you leave UTF_CPP_CPLUSPLUS undefined, it will be internally assigned to __cplusplus.
41*/
42
43#include <iterator>
44#include <cstring>
45#include <string>
46
47// Determine the C++ standard version.
48// If the user defines UTF_CPP_CPLUSPLUS, use that.
49// Otherwise, trust the unreliable predefined macro __cplusplus
50
51#if !defined UTF_CPP_CPLUSPLUS
52 #define UTF_CPP_CPLUSPLUS __cplusplus
53#endif
54
55#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
56 #define UTF_CPP_OVERRIDE override
57 #define UTF_CPP_NOEXCEPT noexcept
58 #define UTF_CPP_STATIC_ASSERT(condition) static_assert(condition, "UTFCPP static assert");
59#else // C++ 98/03
60 #define UTF_CPP_OVERRIDE
61 #define UTF_CPP_NOEXCEPT throw()
62 // Simulate static_assert:
63 template<bool> struct UtfCppCompileTimeAssert;
64 template<> struct UtfCppCompileTimeAssert <true> { };
65 #define UTF_CPP_STATIC_ASSERT(condition) (UtfCppCompileTimeAssert <(condition) != 0>())
66#endif // C++ 11 or later
67
68
69namespace utf8
70{
71// The typedefs for 8-bit, 16-bit and 32-bit code units
72#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
73 #if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later
74 typedef char8_t utfchar8_t;
75 #else // C++ 11/14/17
76 typedef unsigned char utfchar8_t;
77 #endif
78 typedef char16_t utfchar16_t;
79 typedef char32_t utfchar32_t;
80#else // C++ 98/03
81 typedef unsigned char utfchar8_t;
82 typedef unsigned short utfchar16_t;
83 typedef unsigned int utfchar32_t;
84#endif // C++ 11 or later
85
86// Helper code - not intended to be directly called by the library users. May be changed at any time
87namespace internal
88{
89 // Unicode constants
90 // Leading (high) surrogates: 0xd800 - 0xdbff
91 // Trailing (low) surrogates: 0xdc00 - 0xdfff
92 const utfchar16_t LEAD_SURROGATE_MIN = 0xd800u;
93 const utfchar16_t LEAD_SURROGATE_MAX = 0xdbffu;
94 const utfchar16_t TRAIL_SURROGATE_MIN = 0xdc00u;
95 const utfchar16_t TRAIL_SURROGATE_MAX = 0xdfffu;
96 const utfchar16_t LEAD_OFFSET = 0xd7c0u; // LEAD_SURROGATE_MIN - (0x10000 >> 10)
97 const utfchar32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN
98
99 // Maximum valid value for a Unicode code point
100 const utfchar32_t CODE_POINT_MAX = 0x0010ffffu;
101
102 template<typename octet_type>
103 inline utfchar8_t mask8(octet_type oc)
104 {
105 return static_cast<utfchar8_t>(0xff & oc);
106 }
107
108 template<typename u16_type>
109 inline utfchar16_t mask16(u16_type oc)
110 {
111 return static_cast<utfchar16_t>(0xffff & oc);
112 }
113
114 template<typename octet_type>
115 inline bool is_trail(octet_type oc)
116 {
117 return ((utf8::internal::mask8(oc) >> 6) == 0x2);
118 }
119
120 inline bool is_lead_surrogate(utfchar32_t cp)
121 {
122 return (cp >= static_cast<utfchar32_t>(LEAD_SURROGATE_MIN) && cp <= static_cast<utfchar32_t>(LEAD_SURROGATE_MAX));
123 }
124
125 inline bool is_trail_surrogate(utfchar32_t cp)
126 {
127 return (cp >= static_cast<utfchar32_t>(TRAIL_SURROGATE_MIN) && cp <= static_cast<utfchar32_t>(TRAIL_SURROGATE_MAX));
128 }
129
130 inline bool is_surrogate(utfchar32_t cp)
131 {
132 return (cp >= static_cast<utfchar32_t>(LEAD_SURROGATE_MIN) && cp <= static_cast<utfchar32_t>(TRAIL_SURROGATE_MAX));
133 }
134
135 inline bool is_code_point_valid(utfchar32_t cp)
136 {
137 return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
138 }
139
140 inline bool is_in_bmp(utfchar32_t cp)
141 {
142 return cp < utfchar32_t(0x10000);
143 }
144
145 template <typename octet_iterator>
146 int sequence_length(octet_iterator lead_it)
147 {
148 const utfchar8_t lead = utf8::internal::mask8(*lead_it);
149 if (lead < 0x80)
150 return 1;
151 else if ((lead >> 5) == 0x6)
152 return 2;
153 else if ((lead >> 4) == 0xe)
154 return 3;
155 else if ((lead >> 3) == 0x1e)
156 return 4;
157 else
158 return 0;
159 }
160
161 inline bool is_overlong_sequence(utfchar32_t cp, int length)
162 {
163 if (cp < 0x80) {
164 if (length != 1)
165 return true;
166 }
167 else if (cp < 0x800) {
168 if (length != 2)
169 return true;
170 }
171 else if (cp < 0x10000) {
172 if (length != 3)
173 return true;
174 }
175 return false;
176 }
177
// Status codes returned by the internal decoding/validation helpers.
enum utf_error {
    UTF8_OK,              // success
    NOT_ENOUGH_ROOM,      // the end of the range was reached
    INVALID_LEAD,         // first unit cannot start a sequence
    INCOMPLETE_SEQUENCE,  // an expected trail unit was not a trail unit
    OVERLONG_SEQUENCE,    // longer encoding than the value requires
    INVALID_CODE_POINT    // decoded value is not a valid code point
};
179
180 /// Helper for get_sequence_x
181 template <typename octet_iterator>
182 utf_error increase_safely(octet_iterator& it, const octet_iterator end)
183 {
184 if (++it == end)
185 return NOT_ENOUGH_ROOM;
186
187 if (!utf8::internal::is_trail(*it))
188 return INCOMPLETE_SEQUENCE;
189
190 return UTF8_OK;
191 }
192
193 #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;}
194
195 /// get_sequence_x functions decode utf-8 sequences of the length x
196 template <typename octet_iterator>
197 utf_error get_sequence_1(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
198 {
199 if (it == end)
200 return NOT_ENOUGH_ROOM;
201
202 code_point = static_cast<utfchar32_t>(utf8::internal::mask8(*it));
203
204 return UTF8_OK;
205 }
206
207 template <typename octet_iterator>
208 utf_error get_sequence_2(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
209 {
210 if (it == end)
211 return NOT_ENOUGH_ROOM;
212
213 code_point = static_cast<utfchar32_t>(utf8::internal::mask8(*it));
214
215 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
216
217 code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
218
219 return UTF8_OK;
220 }
221
222 template <typename octet_iterator>
223 utf_error get_sequence_3(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
224 {
225 if (it == end)
226 return NOT_ENOUGH_ROOM;
227
228 code_point = static_cast<utfchar32_t>(utf8::internal::mask8(*it));
229
230 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
231
232 code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
233
234 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
235
236 code_point = static_cast<utfchar32_t>(code_point + ((*it) & 0x3f));
237
238 return UTF8_OK;
239 }
240
241 template <typename octet_iterator>
242 utf_error get_sequence_4(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
243 {
244 if (it == end)
245 return NOT_ENOUGH_ROOM;
246
247 code_point = static_cast<utfchar32_t>(utf8::internal::mask8(*it));
248
249 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
250
251 code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
252
253 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
254
255 code_point = static_cast<utfchar32_t>(code_point + ((utf8::internal::mask8(*it) << 6) & 0xfff));
256
257 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
258
259 code_point = static_cast<utfchar32_t>(code_point + ((*it) & 0x3f));
260
261 return UTF8_OK;
262 }
263
264 #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR
265
// Decodes and validates the UTF-8 sequence starting at `it`.
//
// On success returns UTF8_OK, stores the decoded value in code_point and
// moves `it` past the sequence.
// On failure returns the error and restores `it` to its initial position
// (which is only meaningful for multi-pass iterators). For
// INVALID_CODE_POINT the decoded (invalid) value is also stored in
// code_point so that callers can report it; for every other error
// code_point is left untouched.
template <typename octet_iterator>
utf_error validate_next(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
{
    if (it == end)
        return NOT_ENOUGH_ROOM;

    // Save the original value of it so we can go back in case of failure
    // Of course, it does not make much sense with i.e. stream iterators
    octet_iterator original_it = it;

    utfchar32_t cp = 0;
    // Determine the sequence length based on the lead octet
    const int length = utf8::internal::sequence_length(it);

    // Get trail octets and calculate the code point
    utf_error err = UTF8_OK;
    switch (length) {
        case 0:
            return INVALID_LEAD;
        case 1:
            err = utf8::internal::get_sequence_1(it, end, cp);
            break;
        case 2:
            err = utf8::internal::get_sequence_2(it, end, cp);
            break;
        case 3:
            err = utf8::internal::get_sequence_3(it, end, cp);
            break;
        case 4:
            err = utf8::internal::get_sequence_4(it, end, cp);
            break;
    }

    if (err == UTF8_OK) {
        // Decoding succeeded. Now, security checks...
        if (!utf8::internal::is_code_point_valid(cp)) {
            // Hand the offending value back: without this the caller's
            // variable keeps its initial value and error reports show 0.
            code_point = cp;
            err = INVALID_CODE_POINT;
        }
        else if (utf8::internal::is_overlong_sequence(cp, length)) {
            err = OVERLONG_SEQUENCE;
        }
        else {
            // Passed! Return here.
            code_point = cp;
            ++it;
            return UTF8_OK;
        }
    }

    // Failure branch - restore the original value of the iterator
    it = original_it;
    return err;
}
319
320 template <typename octet_iterator>
321 inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
322 utfchar32_t ignored;
323 return utf8::internal::validate_next(it, end, ignored);
324 }
325
326 template <typename word_iterator>
327 utf_error validate_next16(word_iterator& it, word_iterator end, utfchar32_t& code_point)
328 {
329 // Make sure the iterator dereferences a large enough type
330 typedef typename std::iterator_traits<word_iterator>::value_type word_type;
331 UTF_CPP_STATIC_ASSERT(sizeof(word_type) >= sizeof(utfchar16_t));
332 // Check the edge case:
333 if (it == end)
334 return NOT_ENOUGH_ROOM;
335 // Save the original value of it so we can go back in case of failure
336 // Of course, it does not make much sense with i.e. stream iterators
337 word_iterator original_it = it;
338
339 utf_error err = UTF8_OK;
340
341 const utfchar16_t first_word = *it++;
342 if (!is_surrogate(first_word)) {
343 code_point = first_word;
344 return UTF8_OK;
345 }
346 else {
347 if (it == end)
348 err = NOT_ENOUGH_ROOM;
349 else if (is_lead_surrogate(first_word)) {
350 const utfchar16_t second_word = *it++;
351 if (is_trail_surrogate(static_cast<utfchar32_t>(second_word))) {
352 code_point = static_cast<utfchar32_t>(first_word << 10) + static_cast<utfchar32_t>(second_word) + SURROGATE_OFFSET;
353 return UTF8_OK;
354 } else
355 err = INCOMPLETE_SEQUENCE;
356
357 } else {
358 err = INVALID_LEAD;
359 }
360 }
361 // error branch
362 it = original_it;
363 return err;
364 }
365
366 // Internal implementation of both checked and unchecked append() function
367 // This function will be invoked by the overloads below, as they will know
368 // the octet_type.
369 template <typename octet_iterator, typename octet_type>
370 octet_iterator append(utfchar32_t cp, octet_iterator result) {
371 if (cp < 0x80) // one octet
372 *(result++) = static_cast<octet_type>(cp);
373 else if (cp < 0x800) { // two octets
374 *(result++) = static_cast<octet_type>((cp >> 6) | 0xc0);
375 *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80);
376 }
377 else if (cp < 0x10000) { // three octets
378 *(result++) = static_cast<octet_type>((cp >> 12) | 0xe0);
379 *(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80);
380 *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80);
381 }
382 else { // four octets
383 *(result++) = static_cast<octet_type>((cp >> 18) | 0xf0);
384 *(result++) = static_cast<octet_type>(((cp >> 12) & 0x3f)| 0x80);
385 *(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80);
386 *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80);
387 }
388 return result;
389 }
390
391 // One of the following overloads will be invoked from the API calls
392
393 // A simple (but dangerous) case: the caller appends byte(s) to a char array
394 inline char* append(utfchar32_t cp, char* result) {
395 return append<char*, char>(cp, result);
396 }
397
398 // Hopefully, most common case: the caller uses back_inserter
399 // i.e. append(cp, std::back_inserter(str));
400 template<typename container_type>
401 std::back_insert_iterator<container_type> append
402 (utfchar32_t cp, std::back_insert_iterator<container_type> result) {
403 return append<std::back_insert_iterator<container_type>,
404 typename container_type::value_type>(cp, result);
405 }
406
407 // The caller uses some other kind of output operator - not covered above
408 // Note that in this case we are not able to determine octet_type
409 // so we assume it's utfchar8_t; that can cause a conversion warning if we are wrong.
410 template <typename octet_iterator>
411 octet_iterator append(utfchar32_t cp, octet_iterator result) {
412 return append<octet_iterator, utfchar8_t>(cp, result);
413 }
414
415 // Internal implementation of both checked and unchecked append16() function
416 // This function will be invoked by the overloads below, as they will know
417 // the word_type.
418 template <typename word_iterator, typename word_type>
419 word_iterator append16(utfchar32_t cp, word_iterator result) {
420 UTF_CPP_STATIC_ASSERT(sizeof(word_type) >= sizeof(utfchar16_t));
421 if (is_in_bmp(cp))
422 *(result++) = static_cast<word_type>(cp);
423 else {
424 // Code points from the supplementary planes are encoded via surrogate pairs
425 *(result++) = static_cast<word_type>(LEAD_OFFSET + (cp >> 10));
426 *(result++) = static_cast<word_type>(TRAIL_SURROGATE_MIN + (cp & 0x3FF));
427 }
428 return result;
429 }
430
431 // Hopefully, most common case: the caller uses back_inserter
432 // i.e. append16(cp, std::back_inserter(str));
433 template<typename container_type>
434 std::back_insert_iterator<container_type> append16
435 (utfchar32_t cp, std::back_insert_iterator<container_type> result) {
436 return append16<std::back_insert_iterator<container_type>,
437 typename container_type::value_type>(cp, result);
438 }
439
440 // The caller uses some other kind of output operator - not covered above
441 // Note that in this case we are not able to determine word_type
442 // so we assume it's utfchar16_t; that can cause a conversion warning if we are wrong.
443 template <typename word_iterator>
444 word_iterator append16(utfchar32_t cp, word_iterator result) {
445 return append16<word_iterator, utfchar16_t>(cp, result);
446 }
447
448} // namespace internal
449
450 /// The library API - functions intended to be called by the users
451
452 // Byte order mark
453 const utfchar8_t bom[] = {0xef, 0xbb, 0xbf};
454
455 template <typename octet_iterator>
456 octet_iterator find_invalid(octet_iterator start, octet_iterator end)
457 {
458 octet_iterator result = start;
459 while (result != end) {
460 utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end);
461 if (err_code != internal::UTF8_OK)
462 return result;
463 }
464 return result;
465 }
466
467 inline const char* find_invalid(const char* str)
468 {
469 const char* end = str + std::strlen(str);
470 return find_invalid(str, end);
471 }
472
473 inline std::size_t find_invalid(const std::string& s)
474 {
475 std::string::const_iterator invalid = find_invalid(s.begin(), s.end());
476 return (invalid == s.end()) ? std::string::npos : static_cast<std::size_t>(invalid - s.begin());
477 }
478
479 template <typename octet_iterator>
480 inline bool is_valid(octet_iterator start, octet_iterator end)
481 {
482 return (utf8::find_invalid(start, end) == end);
483 }
484
485 inline bool is_valid(const char* str)
486 {
487 return (*(utf8::find_invalid(str)) == '\0');
488 }
489
490 inline bool is_valid(const std::string& s)
491 {
492 return is_valid(s.begin(), s.end());
493 }
494
495
496
497 template <typename octet_iterator>
498 inline bool starts_with_bom (octet_iterator it, octet_iterator end)
499 {
500 return (
501 ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
502 ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
503 ((it != end) && (utf8::internal::mask8(*it)) == bom[2])
504 );
505 }
506
507 inline bool starts_with_bom(const std::string& s)
508 {
509 return starts_with_bom(s.begin(), s.end());
510 }
511} // namespace utf8
512
513#include <stdexcept>
514
515namespace utf8
516{
// Base for the exceptions that may be thrown from the library
// Catching utf8::exception handles every exception class derived from it.
class exception : public ::std::exception
{
};
520
521 // Exceptions that may be thrown from the library functions.
522 class invalid_code_point : public exception {
523 utfchar32_t cp;
524 public:
525 invalid_code_point(utfchar32_t codepoint) : cp(codepoint) {}
526 virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid code point"; }
527 utfchar32_t code_point() const {return cp;}
528 };
529
530 class invalid_utf8 : public exception {
531 utfchar8_t u8;
532 public:
533 invalid_utf8 (utfchar8_t u) : u8(u) {}
534 invalid_utf8 (char c) : u8(static_cast<utfchar8_t>(c)) {}
535 virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-8"; }
536 utfchar8_t utf8_octet() const {return u8;}
537 };
538
539 class invalid_utf16 : public exception {
540 utfchar16_t u16;
541 public:
542 invalid_utf16 (utfchar16_t u) : u16(u) {}
543 virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-16"; }
544 utfchar16_t utf16_word() const {return u16;}
545 };
546
547 class not_enough_room : public exception {
548 public:
549 virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Not enough space"; }
550 };
551
552 /// The library API - functions intended to be called by the users
553
554 template <typename octet_iterator>
555 octet_iterator append(utfchar32_t cp, octet_iterator result)
556 {
557 if (!utf8::internal::is_code_point_valid(cp))
558 throw invalid_code_point(cp);
559
560 return internal::append(cp, result);
561 }
562
563 inline void append(utfchar32_t cp, std::string& s)
564 {
565 append(cp, std::back_inserter(s));
566 }
567
568 template <typename word_iterator>
569 word_iterator append16(utfchar32_t cp, word_iterator result)
570 {
571 if (!utf8::internal::is_code_point_valid(cp))
572 throw invalid_code_point(cp);
573
574 return internal::append16(cp, result);
575 }
576
577 template <typename octet_iterator, typename output_iterator>
578 output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement)
579 {
580 while (start != end) {
581 octet_iterator sequence_start = start;
582 internal::utf_error err_code = utf8::internal::validate_next(start, end);
583 switch (err_code) {
584 case internal::UTF8_OK :
585 for (octet_iterator it = sequence_start; it != start; ++it)
586 *out++ = *it;
587 break;
588 case internal::NOT_ENOUGH_ROOM:
589 out = utf8::append (replacement, out);
590 start = end;
591 break;
592 case internal::INVALID_LEAD:
593 out = utf8::append (replacement, out);
594 ++start;
595 break;
596 case internal::INCOMPLETE_SEQUENCE:
597 case internal::OVERLONG_SEQUENCE:
598 case internal::INVALID_CODE_POINT:
599 out = utf8::append (replacement, out);
600 ++start;
601 // just one replacement mark for the sequence
602 while (start != end && utf8::internal::is_trail(*start))
603 ++start;
604 break;
605 }
606 }
607 return out;
608 }
609
610 template <typename octet_iterator, typename output_iterator>
611 inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
612 {
613 static const utfchar32_t replacement_marker = static_cast<utfchar32_t>(utf8::internal::mask16(0xfffd));
614 return utf8::replace_invalid(start, end, out, replacement_marker);
615 }
616
617 inline std::string replace_invalid(const std::string& s, utfchar32_t replacement)
618 {
619 std::string result;
620 replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
621 return result;
622 }
623
624 inline std::string replace_invalid(const std::string& s)
625 {
626 std::string result;
627 replace_invalid(s.begin(), s.end(), std::back_inserter(result));
628 return result;
629 }
630
631 template <typename octet_iterator>
632 utfchar32_t next(octet_iterator& it, octet_iterator end)
633 {
634 utfchar32_t cp = 0;
635 internal::utf_error err_code = utf8::internal::validate_next(it, end, cp);
636 switch (err_code) {
637 case internal::UTF8_OK :
638 break;
639 case internal::NOT_ENOUGH_ROOM :
640 throw not_enough_room();
641 case internal::INVALID_LEAD :
642 case internal::INCOMPLETE_SEQUENCE :
643 case internal::OVERLONG_SEQUENCE :
644 throw invalid_utf8(static_cast<utfchar8_t>(*it));
645 case internal::INVALID_CODE_POINT :
646 throw invalid_code_point(cp);
647 }
648 return cp;
649 }
650
651 template <typename word_iterator>
652 utfchar32_t next16(word_iterator& it, word_iterator end)
653 {
654 utfchar32_t cp = 0;
655 internal::utf_error err_code = utf8::internal::validate_next16(it, end, cp);
656 if (err_code == internal::NOT_ENOUGH_ROOM)
657 throw not_enough_room();
658 return cp;
659 }
660
661 template <typename octet_iterator>
662 utfchar32_t peek_next(octet_iterator it, octet_iterator end)
663 {
664 return utf8::next(it, end);
665 }
666
667 template <typename octet_iterator>
668 utfchar32_t prior(octet_iterator& it, octet_iterator start)
669 {
670 // can't do much if it == start
671 if (it == start)
672 throw not_enough_room();
673
674 octet_iterator end = it;
675 // Go back until we hit either a lead octet or start
676 while (utf8::internal::is_trail(*(--it)))
677 if (it == start)
678 throw invalid_utf8(*it); // error - no lead byte in the sequence
679 return utf8::peek_next(it, end);
680 }
681
682 template <typename octet_iterator, typename distance_type>
683 void advance (octet_iterator& it, distance_type n, octet_iterator end)
684 {
685 const distance_type zero(0);
686 if (n < zero) {
687 // backward
688 for (distance_type i = n; i < zero; ++i)
689 utf8::prior(it, end);
690 } else {
691 // forward
692 for (distance_type i = zero; i < n; ++i)
693 utf8::next(it, end);
694 }
695 }
696
697 template <typename octet_iterator>
698 typename std::iterator_traits<octet_iterator>::difference_type
699 distance (octet_iterator first, octet_iterator last)
700 {
701 typename std::iterator_traits<octet_iterator>::difference_type dist;
702 for (dist = 0; first < last; ++dist)
703 utf8::next(first, last);
704 return dist;
705 }
706
707 template <typename u16bit_iterator, typename octet_iterator>
708 octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
709 {
710 while (start != end) {
711 utfchar32_t cp = static_cast<utfchar32_t>(utf8::internal::mask16(*start++));
712 // Take care of surrogate pairs first
713 if (utf8::internal::is_lead_surrogate(cp)) {
714 if (start != end) {
715 const utfchar32_t trail_surrogate = static_cast<utfchar32_t>(utf8::internal::mask16(*start++));
716 if (utf8::internal::is_trail_surrogate(trail_surrogate))
717 cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
718 else
719 throw invalid_utf16(static_cast<utfchar16_t>(trail_surrogate));
720 }
721 else
722 throw invalid_utf16(static_cast<utfchar16_t>(cp));
723
724 }
725 // Lone trail surrogate
726 else if (utf8::internal::is_trail_surrogate(cp))
727 throw invalid_utf16(static_cast<utfchar16_t>(cp));
728
729 result = utf8::append(cp, result);
730 }
731 return result;
732 }
733
734 template <typename u16bit_iterator, typename octet_iterator>
735 u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
736 {
737 while (start < end) {
738 const utfchar32_t cp = utf8::next(start, end);
739 if (cp > 0xffff) { //make a surrogate pair
740 *result++ = static_cast<utfchar16_t>((cp >> 10) + internal::LEAD_OFFSET);
741 *result++ = static_cast<utfchar16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
742 }
743 else
744 *result++ = static_cast<utfchar16_t>(cp);
745 }
746 return result;
747 }
748
749 template <typename octet_iterator, typename u32bit_iterator>
750 octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
751 {
752 while (start != end)
753 result = utf8::append(*(start++), result);
754
755 return result;
756 }
757
758 template <typename octet_iterator, typename u32bit_iterator>
759 u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
760 {
761 while (start < end)
762 (*result++) = utf8::next(start, end);
763
764 return result;
765 }
766
767 // The iterator class
768 template <typename octet_iterator>
769 class iterator {
770 octet_iterator it;
771 octet_iterator range_start;
772 octet_iterator range_end;
773 public:
774 typedef utfchar32_t value_type;
775 typedef utfchar32_t* pointer;
776 typedef utfchar32_t& reference;
777 typedef std::ptrdiff_t difference_type;
778 typedef std::bidirectional_iterator_tag iterator_category;
779 iterator () {}
780 explicit iterator (const octet_iterator& octet_it,
781 const octet_iterator& rangestart,
782 const octet_iterator& rangeend) :
783 it(octet_it), range_start(rangestart), range_end(rangeend)
784 {
785 if (it < range_start || it > range_end)
786 throw std::out_of_range("Invalid utf-8 iterator position");
787 }
788 // the default "big three" are OK
789 octet_iterator base () const { return it; }
790 utfchar32_t operator * () const
791 {
792 octet_iterator temp = it;
793 return utf8::next(temp, range_end);
794 }
795 bool operator == (const iterator& rhs) const
796 {
797 if (range_start != rhs.range_start || range_end != rhs.range_end)
798 throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
799 return (it == rhs.it);
800 }
801 bool operator != (const iterator& rhs) const
802 {
803 return !(operator == (rhs));
804 }
805 iterator& operator ++ ()
806 {
807 utf8::next(it, range_end);
808 return *this;
809 }
810 iterator operator ++ (int)
811 {
812 iterator temp = *this;
813 utf8::next(it, range_end);
814 return temp;
815 }
816 iterator& operator -- ()
817 {
818 utf8::prior(it, range_start);
819 return *this;
820 }
821 iterator operator -- (int)
822 {
823 iterator temp = *this;
824 utf8::prior(it, range_start);
825 return temp;
826 }
827 }; // class iterator
828
829} // namespace utf8
830
831#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
832namespace utf8
833{
834 inline void append16(utfchar32_t cp, std::u16string& s)
835 {
836 append16(cp, std::back_inserter(s));
837 }
838
839 inline std::string utf16to8(const std::u16string& s)
840 {
841 std::string result;
842 utf16to8(s.begin(), s.end(), std::back_inserter(result));
843 return result;
844 }
845
846 inline std::u16string utf8to16(const std::string& s)
847 {
848 std::u16string result;
849 utf8to16(s.begin(), s.end(), std::back_inserter(result));
850 return result;
851 }
852
853 inline std::string utf32to8(const std::u32string& s)
854 {
855 std::string result;
856 utf32to8(s.begin(), s.end(), std::back_inserter(result));
857 return result;
858 }
859
860 inline std::u32string utf8to32(const std::string& s)
861 {
862 std::u32string result;
863 utf8to32(s.begin(), s.end(), std::back_inserter(result));
864 return result;
865 }
866} // namespace utf8
867#endif // C++ 11 or later
868
869#if UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later
870namespace utf8
871{
872 inline std::string utf16to8(std::u16string_view s)
873 {
874 std::string result;
875 utf16to8(s.begin(), s.end(), std::back_inserter(result));
876 return result;
877 }
878
879 inline std::u16string utf8to16(std::string_view s)
880 {
881 std::u16string result;
882 utf8to16(s.begin(), s.end(), std::back_inserter(result));
883 return result;
884 }
885
886 inline std::string utf32to8(std::u32string_view s)
887 {
888 std::string result;
889 utf32to8(s.begin(), s.end(), std::back_inserter(result));
890 return result;
891 }
892
893 inline std::u32string utf8to32(std::string_view s)
894 {
895 std::u32string result;
896 utf8to32(s.begin(), s.end(), std::back_inserter(result));
897 return result;
898 }
899
900 inline std::size_t find_invalid(std::string_view s)
901 {
902 std::string_view::const_iterator invalid = find_invalid(s.begin(), s.end());
903 return (invalid == s.end()) ? std::string_view::npos : static_cast<std::size_t>(invalid - s.begin());
904 }
905
906 inline bool is_valid(std::string_view s)
907 {
908 return is_valid(s.begin(), s.end());
909 }
910
911 inline std::string replace_invalid(std::string_view s, char32_t replacement)
912 {
913 std::string result;
914 replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
915 return result;
916 }
917
918 inline std::string replace_invalid(std::string_view s)
919 {
920 std::string result;
921 replace_invalid(s.begin(), s.end(), std::back_inserter(result));
922 return result;
923 }
924
925 inline bool starts_with_bom(std::string_view s)
926 {
927 return starts_with_bom(s.begin(), s.end());
928 }
929
930} // namespace utf8
931#endif // C++ 17 or later
932
933#if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later
934namespace utf8
935{
936 inline std::u8string utf16tou8(const std::u16string& s)
937 {
938 std::u8string result;
939 utf16to8(s.begin(), s.end(), std::back_inserter(result));
940 return result;
941 }
942
943 inline std::u8string utf16tou8(std::u16string_view s)
944 {
945 std::u8string result;
946 utf16to8(s.begin(), s.end(), std::back_inserter(result));
947 return result;
948 }
949
950 inline std::u16string utf8to16(const std::u8string& s)
951 {
952 std::u16string result;
953 utf8to16(s.begin(), s.end(), std::back_inserter(result));
954 return result;
955 }
956
957 inline std::u16string utf8to16(const std::u8string_view& s)
958 {
959 std::u16string result;
960 utf8to16(s.begin(), s.end(), std::back_inserter(result));
961 return result;
962 }
963
964 inline std::u8string utf32tou8(const std::u32string& s)
965 {
966 std::u8string result;
967 utf32to8(s.begin(), s.end(), std::back_inserter(result));
968 return result;
969 }
970
971 inline std::u8string utf32tou8(const std::u32string_view& s)
972 {
973 std::u8string result;
974 utf32to8(s.begin(), s.end(), std::back_inserter(result));
975 return result;
976 }
977
978 inline std::u32string utf8to32(const std::u8string& s)
979 {
980 std::u32string result;
981 utf8to32(s.begin(), s.end(), std::back_inserter(result));
982 return result;
983 }
984
985 inline std::u32string utf8to32(const std::u8string_view& s)
986 {
987 std::u32string result;
988 utf8to32(s.begin(), s.end(), std::back_inserter(result));
989 return result;
990 }
991
992 inline std::size_t find_invalid(const std::u8string& s)
993 {
994 std::u8string::const_iterator invalid = find_invalid(s.begin(), s.end());
995 return (invalid == s.end()) ? std::string_view::npos : static_cast<std::size_t>(invalid - s.begin());
996 }
997
998 inline bool is_valid(const std::u8string& s)
999 {
1000 return is_valid(s.begin(), s.end());
1001 }
1002
1003 inline std::u8string replace_invalid(const std::u8string& s, char32_t replacement)
1004 {
1005 std::u8string result;
1006 replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
1007 return result;
1008 }
1009
1010 inline std::u8string replace_invalid(const std::u8string& s)
1011 {
1012 std::u8string result;
1013 replace_invalid(s.begin(), s.end(), std::back_inserter(result));
1014 return result;
1015 }
1016
1017 inline bool starts_with_bom(const std::u8string& s)
1018 {
1019 return starts_with_bom(s.begin(), s.end());
1020 }
1021
1022} // namespace utf8
1023#endif // C++ 20 or later
1024
1025namespace utf8
1026{
1027 namespace unchecked
1028 {
1029 template <typename octet_iterator>
1030 octet_iterator append(utfchar32_t cp, octet_iterator result)
1031 {
1032 return internal::append(cp, result);
1033 }
1034
1035 template <typename word_iterator>
1036 word_iterator append16(utfchar32_t cp, word_iterator result)
1037 {
1038 return internal::append16(cp, result);
1039 }
1040
1041 template <typename octet_iterator, typename output_iterator>
1042 output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement)
1043 {
1044 while (start != end) {
1045 octet_iterator sequence_start = start;
1046 internal::utf_error err_code = utf8::internal::validate_next(start, end);
1047 switch (err_code) {
1048 case internal::UTF8_OK :
1049 for (octet_iterator it = sequence_start; it != start; ++it)
1050 *out++ = *it;
1051 break;
1052 case internal::NOT_ENOUGH_ROOM:
1053 out = utf8::unchecked::append(replacement, out);
1054 start = end;
1055 break;
1056 case internal::INVALID_LEAD:
1057 out = utf8::unchecked::append(replacement, out);
1058 ++start;
1059 break;
1060 case internal::INCOMPLETE_SEQUENCE:
1061 case internal::OVERLONG_SEQUENCE:
1062 case internal::INVALID_CODE_POINT:
1063 out = utf8::unchecked::append(replacement, out);
1064 ++start;
1065 // just one replacement mark for the sequence
1066 while (start != end && utf8::internal::is_trail(*start))
1067 ++start;
1068 break;
1069 }
1070 }
1071 return out;
1072 }
1073
1074 template <typename octet_iterator, typename output_iterator>
1075 inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
1076 {
1077 static const utfchar32_t replacement_marker = static_cast<utfchar32_t>(utf8::internal::mask16(0xfffd));
1078 return utf8::unchecked::replace_invalid(start, end, out, replacement_marker);
1079 }
1080
1081 inline std::string replace_invalid(const std::string& s, utfchar32_t replacement)
1082 {
1083 std::string result;
1084 replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
1085 return result;
1086 }
1087
1088 inline std::string replace_invalid(const std::string& s)
1089 {
1090 std::string result;
1091 replace_invalid(s.begin(), s.end(), std::back_inserter(result));
1092 return result;
1093 }
1094
1095 template <typename octet_iterator>
1096 utfchar32_t next(octet_iterator& it)
1097 {
1098 utfchar32_t cp = utf8::internal::mask8(*it);
1099 switch (utf8::internal::sequence_length(it)) {
1100 case 1:
1101 break;
1102 case 2:
1103 ++it;
1104 cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
1105 break;
1106 case 3:
1107 ++it;
1108 cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
1109 ++it;
1110 cp = static_cast<utfchar32_t>(cp + ((*it) & 0x3f));
1111 break;
1112 case 4:
1113 ++it;
1114 cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
1115 ++it;
1116 cp = static_cast<utfchar32_t>(cp + ((utf8::internal::mask8(*it) << 6) & 0xfff));
1117 ++it;
1118 cp = static_cast<utfchar32_t>(cp + ((*it) & 0x3f));
1119 break;
1120 }
1121 ++it;
1122 return cp;
1123 }
1124
1125 template <typename octet_iterator>
1126 utfchar32_t peek_next(octet_iterator it)
1127 {
1128 return utf8::unchecked::next(it);
1129 }
1130
1131 template <typename word_iterator>
1132 utfchar32_t next16(word_iterator& it)
1133 {
1134 utfchar32_t cp = utf8::internal::mask16(*it++);
1135 if (utf8::internal::is_lead_surrogate(cp))
1136 return (cp << 10) + *it++ + utf8::internal::SURROGATE_OFFSET;
1137 return cp;
1138 }
1139
1140 template <typename octet_iterator>
1141 utfchar32_t prior(octet_iterator& it)
1142 {
1143 while (utf8::internal::is_trail(*(--it))) ;
1144 octet_iterator temp = it;
1145 return utf8::unchecked::next(temp);
1146 }
1147
1148 template <typename octet_iterator, typename distance_type>
1149 void advance(octet_iterator& it, distance_type n)
1150 {
1151 const distance_type zero(0);
1152 if (n < zero) {
1153 // backward
1154 for (distance_type i = n; i < zero; ++i)
1155 utf8::unchecked::prior(it);
1156 } else {
1157 // forward
1158 for (distance_type i = zero; i < n; ++i)
1159 utf8::unchecked::next(it);
1160 }
1161 }
1162
1163 template <typename octet_iterator>
1164 typename std::iterator_traits<octet_iterator>::difference_type
1165 distance(octet_iterator first, octet_iterator last)
1166 {
1167 typename std::iterator_traits<octet_iterator>::difference_type dist;
1168 for (dist = 0; first < last; ++dist)
1169 utf8::unchecked::next(first);
1170 return dist;
1171 }
1172
1173 template <typename u16bit_iterator, typename octet_iterator>
1174 octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end, octet_iterator result)
1175 {
1176 while (start != end) {
1177 utfchar32_t cp = utf8::internal::mask16(*start++);
1178 // Take care of surrogate pairs first
1179 if (utf8::internal::is_lead_surrogate(cp)) {
1180 if (start == end)
1181 return result;
1182 utfchar32_t trail_surrogate = utf8::internal::mask16(*start++);
1183 cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
1184 }
1185 result = utf8::unchecked::append(cp, result);
1186 }
1187 return result;
1188 }
1189
1190 template <typename u16bit_iterator, typename octet_iterator>
1191 u16bit_iterator utf8to16(octet_iterator start, octet_iterator end, u16bit_iterator result)
1192 {
1193 while (start < end) {
1194 utfchar32_t cp = utf8::unchecked::next(start);
1195 if (cp > 0xffff) { //make a surrogate pair
1196 *result++ = static_cast<utfchar16_t>((cp >> 10) + internal::LEAD_OFFSET);
1197 *result++ = static_cast<utfchar16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
1198 }
1199 else
1200 *result++ = static_cast<utfchar16_t>(cp);
1201 }
1202 return result;
1203 }
1204
1205 template <typename octet_iterator, typename u32bit_iterator>
1206 octet_iterator utf32to8(u32bit_iterator start, u32bit_iterator end, octet_iterator result)
1207 {
1208 while (start != end)
1209 result = utf8::unchecked::append(*(start++), result);
1210
1211 return result;
1212 }
1213
1214 template <typename octet_iterator, typename u32bit_iterator>
1215 u32bit_iterator utf8to32(octet_iterator start, octet_iterator end, u32bit_iterator result)
1216 {
1217 while (start < end)
1218 (*result++) = utf8::unchecked::next(start);
1219
1220 return result;
1221 }
1222
1223 // The iterator class
1224 template <typename octet_iterator>
1225 class iterator {
1226 octet_iterator it;
1227 public:
1228 typedef utfchar32_t value_type;
1229 typedef utfchar32_t* pointer;
1230 typedef utfchar32_t& reference;
1231 typedef std::ptrdiff_t difference_type;
1232 typedef std::bidirectional_iterator_tag iterator_category;
1233 iterator () {}
1234 explicit iterator (const octet_iterator& octet_it): it(octet_it) {}
1235 // the default "big three" are OK
1236 octet_iterator base () const { return it; }
1237 utfchar32_t operator * () const
1238 {
1239 octet_iterator temp = it;
1240 return utf8::unchecked::next(temp);
1241 }
1242 bool operator == (const iterator& rhs) const
1243 {
1244 return (it == rhs.it);
1245 }
1246 bool operator != (const iterator& rhs) const
1247 {
1248 return !(operator == (rhs));
1249 }
1250 iterator& operator ++ ()
1251 {
1252 ::std::advance(it, utf8::internal::sequence_length(it));
1253 return *this;
1254 }
1255 iterator operator ++ (int)
1256 {
1257 iterator temp = *this;
1258 ::std::advance(it, utf8::internal::sequence_length(it));
1259 return temp;
1260 }
1261 iterator& operator -- ()
1262 {
1263 utf8::unchecked::prior(it);
1264 return *this;
1265 }
1266 iterator operator -- (int)
1267 {
1268 iterator temp = *this;
1269 utf8::unchecked::prior(it);
1270 return temp;
1271 }
1272 }; // class iterator
1273
1274 } // namespace utf8::unchecked
1275} // namespace utf8
1276
1277#endif // header guard
diff --git a/src/yuescript/parser.cpp b/src/yuescript/parser.cpp
index f0ddd06..5910348 100644
--- a/src/yuescript/parser.cpp
+++ b/src/yuescript/parser.cpp
@@ -18,8 +18,33 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 18
19#include "yuescript/parser.hpp" 19#include "yuescript/parser.hpp"
20 20
// CodeCvt holds the two UTF-8 <-> UTF-32 helpers used by this file.
// When YUE_UTF8_IMPL is defined they are implemented here on top of
// utf8cpp.h; otherwise they are only declared and a definition has to be
// supplied by another translation unit.
#ifdef YUE_UTF8_IMPL
#include "utf8cpp.h"
namespace CodeCvt {
	std::u32string utf8to32(const std::string& str) {
		return utf8::utf8to32(str);
	}
	std::string utf32to8(const std::u32string& str) {
		return utf8::utf32to8(str);
	}
} // namespace CodeCvt
#else
namespace CodeCvt {
	std::u32string utf8to32(const std::string& str);
	std::string utf32to8(const std::u32string& str);
} // namespace CodeCvt
#endif // YUE_UTF8_IMPL
37
21namespace parserlib { 38namespace parserlib {
22 39
40input utf8_decode(const std::string& str) {
41 return CodeCvt::utf8to32(str);
42}
43
44std::string utf8_encode(const input& str) {
45 return CodeCvt::utf32to8(str);
46}
47
23// internal private class that manages access to the public classes' internals. 48// internal private class that manages access to the public classes' internals.
24class _private { 49class _private {
25public: 50public:
@@ -241,7 +266,7 @@ class _string : public _expr {
241public: 266public:
242 // constructor from ansi string. 267 // constructor from ansi string.
243 _string(const char* s) 268 _string(const char* s)
244 : m_string(Converter{}.from_bytes(s)) { 269 : m_string(utf8_decode(s)) {
245 } 270 }
246 271
247 // parse with whitespace 272 // parse with whitespace
@@ -279,7 +304,7 @@ class _set : public _expr {
279public: 304public:
280 // constructor from ansi string. 305 // constructor from ansi string.
281 _set(const char* s) { 306 _set(const char* s) {
282 auto str = Converter{}.from_bytes(s); 307 auto str = utf8_decode(s);
283 for (auto ch : str) { 308 for (auto ch : str) {
284 _add(ch); 309 _add(ch);
285 } 310 }
diff --git a/src/yuescript/parser.hpp b/src/yuescript/parser.hpp
index c544785..4742539 100644
--- a/src/yuescript/parser.hpp
+++ b/src/yuescript/parser.hpp
@@ -17,7 +17,6 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17#pragma warning(disable : 4521) 17#pragma warning(disable : 4521)
18#endif 18#endif
19 19
20#include <codecvt>
21#include <functional> 20#include <functional>
22#include <list> 21#include <list>
23#include <locale> 22#include <locale>
@@ -27,9 +26,11 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
27namespace parserlib { 26namespace parserlib {
28 27
29/// type of the parser's input. 28/// type of the parser's input.
30typedef std::basic_string<wchar_t> input; 29typedef std::basic_string<char32_t> input;
31typedef input::iterator input_it; 30typedef input::iterator input_it;
32typedef std::wstring_convert<std::codecvt_utf8_utf16<input::value_type>> Converter; 31
32input utf8_decode(const std::string& str);
33std::string utf8_encode(const input& str);
33 34
34class _private; 35class _private;
35class _expr; 36class _expr;
diff --git a/src/yuescript/yue_ast.cpp b/src/yuescript/yue_ast.cpp
index 945e1d7..b225acc 100644
--- a/src/yuescript/yue_ast.cpp
+++ b/src/yuescript/yue_ast.cpp
@@ -30,7 +30,7 @@ std::string YueFormat::ind() const {
30} 30}
31 31
32std::string YueFormat::convert(const ast_node* node) { 32std::string YueFormat::convert(const ast_node* node) {
33 return converter.to_bytes(std::wstring(node->m_begin.m_it, node->m_end.m_it)); 33 return utf8_encode({node->m_begin.m_it, node->m_end.m_it});
34} 34}
35 35
36std::string YueFormat::toString(ast_node* node) { 36std::string YueFormat::toString(ast_node* node) {
diff --git a/src/yuescript/yue_ast.h b/src/yuescript/yue_ast.h
index 6e1bb88..b1a369b 100644
--- a/src/yuescript/yue_ast.h
+++ b/src/yuescript/yue_ast.h
@@ -991,7 +991,6 @@ struct YueFormat {
991 int tabSpaces = 4; 991 int tabSpaces = 4;
992 std::string toString(ast_node* node); 992 std::string toString(ast_node* node);
993 993
994 Converter converter{};
995 void pushScope(); 994 void pushScope();
996 void popScope(); 995 void popScope();
997 std::string convert(const ast_node* node); 996 std::string convert(const ast_node* node);
diff --git a/src/yuescript/yue_compiler.cpp b/src/yuescript/yue_compiler.cpp
index d7d117a..4dd3583 100644
--- a/src/yuescript/yue_compiler.cpp
+++ b/src/yuescript/yue_compiler.cpp
@@ -78,7 +78,7 @@ static std::unordered_set<std::string> Metamethods = {
78 "close"s // Lua 5.4 78 "close"s // Lua 5.4
79}; 79};
80 80
81const std::string_view version = "0.29.5"sv; 81const std::string_view version = "0.29.6"sv;
82const std::string_view extension = "yue"sv; 82const std::string_view extension = "yue"sv;
83 83
84class CompileError : public std::logic_error { 84class CompileError : public std::logic_error {
diff --git a/src/yuescript/yue_parser.cpp b/src/yuescript/yue_parser.cpp
index 01ca083..44baced 100644
--- a/src/yuescript/yue_parser.cpp
+++ b/src/yuescript/yue_parser.cpp
@@ -1061,7 +1061,7 @@ bool YueParser::startWith(std::string_view codes, rule& r) {
1061 } 1061 }
1062 try { 1062 try {
1063 if (!codes.empty()) { 1063 if (!codes.empty()) {
1064 converted = std::make_unique<input>(_converter.from_bytes(&codes.front(), &codes.back() + 1)); 1064 converted = std::make_unique<input>(utf8_decode({&codes.front(), &codes.back() + 1}));
1065 } else { 1065 } else {
1066 converted = std::make_unique<input>(); 1066 converted = std::make_unique<input>();
1067 } 1067 }
@@ -1087,11 +1087,11 @@ ParseInfo YueParser::parse(std::string_view codes, rule& r, bool lax) {
1087 } 1087 }
1088 try { 1088 try {
1089 if (!codes.empty()) { 1089 if (!codes.empty()) {
1090 res.codes = std::make_unique<input>(_converter.from_bytes(&codes.front(), &codes.back() + 1)); 1090 res.codes = std::make_unique<input>(utf8_decode({&codes.front(), &codes.back() + 1}));
1091 } else { 1091 } else {
1092 res.codes = std::make_unique<input>(); 1092 res.codes = std::make_unique<input>();
1093 } 1093 }
1094 } catch (const std::range_error&) { 1094 } catch (const std::exception&) {
1095 res.error = {"invalid text encoding"s, 1, 1}; 1095 res.error = {"invalid text encoding"s, 1, 1};
1096 return res; 1096 return res;
1097 } 1097 }
@@ -1156,11 +1156,11 @@ bool YueParser::match(std::string_view astName, std::string_view codes) {
1156} 1156}
1157 1157
1158std::string YueParser::toString(ast_node* node) { 1158std::string YueParser::toString(ast_node* node) {
1159 return _converter.to_bytes(std::wstring(node->m_begin.m_it, node->m_end.m_it)); 1159 return utf8_encode({node->m_begin.m_it, node->m_end.m_it});
1160} 1160}
1161 1161
1162std::string YueParser::toString(input::iterator begin, input::iterator end) { 1162std::string YueParser::toString(input::iterator begin, input::iterator end) {
1163 return _converter.to_bytes(std::wstring(begin, end)); 1163 return utf8_encode({begin, end});
1164} 1164}
1165 1165
1166bool YueParser::hasAST(std::string_view name) const { 1166bool YueParser::hasAST(std::string_view name) const {
@@ -1237,7 +1237,7 @@ std::string ParseInfo::errorMessage(std::string_view msg, int errLine, int errCo
1237 } 1237 }
1238 ++it; 1238 ++it;
1239 } 1239 }
1240 auto line = Converter{}.to_bytes(std::wstring(begin, end)); 1240 auto line = utf8_encode({begin, end});
1241 while (col < static_cast<int>(line.size()) 1241 while (col < static_cast<int>(line.size())
1242 && (line[col] == ' ' || line[col] == '\t')) { 1242 && (line[col] == ' ' || line[col] == '\t')) {
1243 col++; 1243 col++;
diff --git a/src/yuescript/yue_parser.h b/src/yuescript/yue_parser.h
index c91e530..4c546b1 100644
--- a/src/yuescript/yue_parser.h
+++ b/src/yuescript/yue_parser.h
@@ -134,7 +134,6 @@ protected:
134 } 134 }
135 135
136private: 136private:
137 Converter _converter;
138 std::unordered_map<std::string_view, rule*> _rules; 137 std::unordered_map<std::string_view, rule*> _rules;
139 138
140 template <class T> 139 template <class T>