aboutsummaryrefslogtreecommitdiff
path: root/src/3rdParty
diff options
context:
space:
mode:
Diffstat (limited to 'src/3rdParty')
-rwxr-xr-xsrc/3rdParty/colib/LICENSE21
-rw-r--r--src/3rdParty/colib/ljson.c925
-rwxr-xr-xsrc/3rdParty/utf8cpp.h1277
3 files changed, 2223 insertions, 0 deletions
diff --git a/src/3rdParty/colib/LICENSE b/src/3rdParty/colib/LICENSE
new file mode 100755
index 0000000..e0eddeb
--- /dev/null
+++ b/src/3rdParty/colib/LICENSE
@@ -0,0 +1,21 @@
1MIT License
2
3Copyright (c) 2020 colin
4
5Permission is hereby granted, free of charge, to any person obtaining a copy
6of this software and associated documentation files (the "Software"), to deal
7in the Software without restriction, including without limitation the rights
8to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9copies of the Software, and to permit persons to whom the Software is
10furnished to do so, subject to the following conditions:
11
12The above copyright notice and this permission notice shall be included in all
13copies or substantial portions of the Software.
14
15THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21SOFTWARE.
diff --git a/src/3rdParty/colib/ljson.c b/src/3rdParty/colib/ljson.c
new file mode 100644
index 0000000..4daba07
--- /dev/null
+++ b/src/3rdParty/colib/ljson.c
@@ -0,0 +1,925 @@
1/**
 * JSON parser/serializer: only UTF-8 text is supported, and Lua only supports 64-bit numbers
3 */
4#define LUA_LIB
5#include <stdlib.h>
6#include <string.h>
7#include <stdint.h>
8#include <stdio.h>
9#include <ctype.h>
10#include <assert.h>
11#include <errno.h>
12#include <setjmp.h>
13#include <ctype.h>
14#include <limits.h>
15#include <float.h>
16#include <math.h>
17#include "lua.h"
18#include "lauxlib.h"
19
// When building against a Lua newer than 5.1 without LUA_COMPAT_5_1, map
// lua_objlen onto lua_rawlen unless something already defined it.
#if LUA_VERSION_NUM > 501 && !defined(LUA_COMPAT_5_1) && !defined(lua_objlen)
#define lua_objlen lua_rawlen
#endif
27
// Allocation hooks: redefine these four macros to plug in another allocator.
#define co_malloc   malloc
#define co_free     free
#define co_realloc  realloc
#define co_calloc   calloc
33
34
// Branch-prediction hints. Both macros are defined together, and only when
// `likely` has not been provided already. With GCC-compatible compilers they
// wrap __builtin_expect; otherwise they reduce to the bare expression.
#ifndef likely
#  ifdef __GNUC__
#    define likely(x)   (__builtin_expect(((x) != 0), 1))
#    define unlikely(x) (__builtin_expect(((x) != 0), 0))
#  else
#    define likely(x)   (x)
#    define unlikely(x) (x)
#  endif
#endif
45
46//-----------------------------------------------------------------------------
47// membuffer
48
// Number of bytes of inline storage embedded in every membuffer_t.
#define STACK_BUFF_SIZE 512

// Growable byte buffer that starts out on its own inline array `s`.
typedef struct membuffer {
    char *b;                    // current storage; points at `s` after membuffer_init
    size_t sz;                  // number of bytes already used
    size_t cap;                 // total capacity of `b`
    char s[STACK_BUFF_SIZE];    // inline storage
} membuffer_t;
57
58// 初始化buffer
59static inline void membuffer_init(membuffer_t *buff) {
60 buff->b = buff->s;
61 buff->cap = STACK_BUFF_SIZE;
62 buff->sz = 0;
63}
64
65static inline void membuffer_add_size(membuffer_t *buff, size_t sz) {
66 buff->sz += sz;
67}
68
69static inline void membuffer_reset(membuffer_t *buff) {
70 buff->sz = 0;
71}
72
73static inline void membuffer_free(membuffer_t *buff) {
74 if (buff->b && buff->b != buff->s) {
75 co_free(buff->b);
76 buff->b = NULL;
77 }
78}
79
80static inline void _membuffer_grow(membuffer_t *buff, size_t needsz) {
81 if (buff->cap < needsz) {
82 size_t newcap = buff->cap * 2;
83 if (newcap < needsz)
84 newcap = needsz;
85 if (buff->b == buff->s) {
86 buff->b = (char*)co_malloc(newcap);
87 memcpy(buff->b, buff->s, buff->sz);
88 } else {
89 buff->b = (char*)co_realloc(buff->b, newcap);
90 }
91 buff->cap = newcap;
92 }
93}
94
95// 确保缓存中还有sz的可用空间
96static inline void membuffer_ensure_space(membuffer_t *buff, size_t sz) {
97 if (buff->sz + sz > buff->cap) {
98 _membuffer_grow(buff, buff->sz+sz);
99 }
100}
101
102// 压入一个字符
103static inline void membuffer_putc(membuffer_t *buff, char c) {
104 membuffer_ensure_space(buff, 1);
105 buff->b[buff->sz++] = c;
106}
107
108// 写入一段内存
109static inline void membuffer_putb(membuffer_t *buff, const void *b, size_t sz) {
110 membuffer_ensure_space(buff, sz);
111 memcpy(buff->b + buff->sz, b, sz);
112 buff->sz += sz;
113}
114
115// 压入一个字符:不检查空间(不安全版本)
116static inline void membuffer_putc_unsafe(membuffer_t *buff, char c) {
117 buff->b[buff->sz++] = c;
118}
119
#if LUA_VERSION_NUM > 501
// Append `sz` bytes WITHOUT checking capacity (unsafe variant): the caller
// must already have reserved the space. Only compiled for Lua newer than 5.1.
static inline void membuffer_putb_unsafe(membuffer_t *buff, const void *b, size_t sz) {
    char *dst = buff->b + buff->sz;
    memcpy(dst, b, sz);
    buff->sz += sz;
}
#endif
127
128// 取当前的指针
129static inline char* membuffer_getp(membuffer_t *buff) {
130 return buff->b + buff->sz;
131}
132
133//-----------------------------------------------------------------------------
134// parser
135
136//-------------------------------------
// Lua-specific code
138
139static inline void l_add_object(lua_State *L) {
140 luaL_checkstack(L, 6, NULL);
141 lua_createtable(L, 0, 4);
142}
143static inline void l_begin_pair(lua_State *L, const char *k, size_t sz) {
144 lua_pushlstring(L, k, sz);
145}
146static inline void l_end_pair(lua_State *L) {
147 lua_rawset(L, -3);
148}
149static inline void l_add_array(lua_State *L) {
150 luaL_checkstack(L, 6, NULL);
151 lua_createtable(L, 4, 0);
152}
153static inline void l_add_index(lua_State *L, int i) {
154 lua_rawseti(L, -2, i+1);
155}
156static inline void l_add_string(lua_State *L, const char *s, size_t sz) {
157 lua_pushlstring(L, s, sz);
158}
159static inline void l_add_float(lua_State *L, double f) {
160 lua_pushnumber(L, (lua_Number)f);
161}
162static inline void l_add_integer(lua_State *L, int64_t i) {
163 lua_pushinteger(L, (lua_Integer)i);
164}
165static inline void l_add_boolean(lua_State *L, int b) {
166 lua_pushboolean(L, b);
167}
168static inline void l_add_null(lua_State *L) {
169 lua_pushlightuserdata(L, NULL);
170}
171static inline void l_error(lua_State *L, const char *msg) {
172 luaL_error(L, msg);
173}
174
// Parse events. The parser only ever calls these macros; here each one casts
// the opaque user-data pointer back to a lua_State and forwards to the
// matching l_* callback.
#define ON_ADD_OBJECT(ud)           l_add_object((lua_State*)(ud))
#define ON_BEGIN_PAIR(ud, k, sz)    l_begin_pair((lua_State*)(ud), k, sz)
#define ON_END_PAIR(ud)             l_end_pair((lua_State*)(ud))
#define ON_ADD_ARRAY(ud)            l_add_array((lua_State*)(ud))
#define ON_ADD_INDEX(ud, i)         l_add_index((lua_State*)(ud), i)
#define ON_ADD_STRING(ud, s, sz)    l_add_string((lua_State*)(ud), s, sz)
#define ON_ADD_FLOAT(ud, f)         l_add_float((lua_State*)(ud), f)
#define ON_ADD_INTEGER(ud, i)       l_add_integer((lua_State*)(ud), i)
#define ON_ADD_BOOLEAN(ud, b)       l_add_boolean((lua_State*)(ud), b)
#define ON_ADD_NULL(ud)             l_add_null((lua_State*)(ud))
#define ON_ERROR(ud, msg)           l_error((lua_State*)(ud), msg)
187
188//-------------------------------------
// JSON parsing. This part does not depend on Lua and is a generic parser; to port it, remove the comment markers that start with //>>>
190
191// 错误消息的大小
192#define ERRMSG_SIZE 256
193
194// json解析器
195typedef struct {
196 const char *str; // json字符串
197 const char *ptr; // json字符串解析指针
198 void *ud; // 解析事件的用户数据
199 membuffer_t buff; // 临时缓存
200 int curdepth; // 当前层次
201 int maxdepth; // 最大层次
202 int allowcomment; // 是否允许注释
203 char errmsg[ERRMSG_SIZE]; // 保存错误消息
204 //>>>jmp_buf jb; // 用于实现从解析中出错直接跳出
205} json_parser_t;
206
207static inline void parser_init(json_parser_t *parser, const char *str, size_t size, void *ud,
208 int maxdepth, int allowcomment) {
209 membuffer_init(&parser->buff);
210 membuffer_ensure_space(&parser->buff, size);
211 parser->str = str;
212 parser->ptr = str;
213 parser->ud = ud;
214 parser->maxdepth = maxdepth;
215 parser->curdepth = 0;
216 parser->allowcomment = allowcomment;
217}
218
219static inline void parser_free(json_parser_t *parser) {
220 membuffer_free(&parser->buff);
221}
222
223// 抛出错误
224static void parser_throw_error(json_parser_t *parser, const char *fmt, ...) {
225 membuffer_free(&parser->buff);
226 va_list arg;
227 va_start(arg, fmt);
228 vsnprintf(parser->errmsg, ERRMSG_SIZE, fmt, arg);
229 va_end(arg);
230 ON_ERROR(parser->ud, parser->errmsg);
231 // 直接跳出解析代码,由于Lua的lua_error也是用longjmp,所以下面的代码没有机会执行到。但其他语言就不一定。
232 //>>>longjmp(parser->jb, 1);
233}
234
// Helper macros; `p` is a pointer to the parser state.
#define peekchar(p)     (*(p)->ptr)         // current character, not consumed
#define skipchar(p)     (++(p)->ptr)        // consume one character
#define get_and_next(p) (*(p)->ptr++)       // read the current character, then advance
#define next_and_get(p) (*(++(p)->ptr))     // advance, then read the new current character
#define savechar(p, c)  membuffer_putc_unsafe(&(p)->buff, (c))  // unchecked append to the scratch buffer
// Offset of the read position from the start of the text. Every error message
// prints it with "%lu", so it is produced as unsigned long to match that
// conversion on platforms where size_t and unsigned long differ.
#define currpos(p)      ((unsigned long)((p)->ptr - (p)->str))
242
243// 取解析到的错误内容
244static const char* parser_error_content(json_parser_t *p) {
245 size_t n = currpos(p);
246 if (n > 50) n = 50; // 调整这个数获得更长的内容
247 membuffer_reset(&p->buff);
248 membuffer_putb(&p->buff, p->ptr - n, n);
249 membuffer_putc(&p->buff, '\0');
250 return p->buff.b;
251}
252
253// 增加深度
254static inline void parser_add_depth(json_parser_t *p) {
255 p->curdepth++;
256 if (p->curdepth >= p->maxdepth)
257 parser_throw_error(p, "Too many nested data, max depth is %d, at: %s[:%lu]", p->maxdepth,
258 parser_error_content(p), currpos(p));
259}
260
261static inline void parser_skip_whitespaces(json_parser_t *p) {
262 // colin: 要支持注释,请将下面注释去掉
263 // if (likely(!p->allowcomment)) {
264 char ch = peekchar(p);
265 while (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r')
266 ch = next_and_get(p);
267 // } else {
268 // char ch = peekchar(p);
269 // for (;;) {
270 // while (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r')
271 // ch = next_and_get(p);
272 // if (ch == '/') {
273 // ch = next_and_get(p);
274 // if (ch == '/') {
275 // ch = next_and_get(p);
276 // while (ch != '\n' && ch != '\r' && ch != '\0')
277 // ch = next_and_get(p);
278 // continue;
279 // } else {
280 // parser_throw_error(p, "Invalid comment, at: %s[:%lu]", parser_error_content(p), currpos(p));
281 // }
282 // }
283 // break;
284 // }
285 // }
286}
287
288static inline void parser_expect_char(json_parser_t *p, char c) {
289 if (likely(peekchar(p) == c))
290 skipchar(p);
291 else
292 parser_throw_error(p, "Expect '%c' at: %s[:%lu]", c, parser_error_content(p), currpos(p));
293}
294
295static inline void parser_process_false(json_parser_t *p) {
296 if (likely(p->ptr[0] == 'a' && p->ptr[1] == 'l' && p->ptr[2] == 's' && p->ptr[3] == 'e')) {
297 p->ptr += 4;
298 ON_ADD_BOOLEAN(p->ud, 0);
299 } else {
300 parser_throw_error(p, "Invalid boolean, at: %s[:%lu]", parser_error_content(p), currpos(p));
301 }
302}
303
304static inline void parser_process_true(json_parser_t *p) {
305 if (likely(p->ptr[0] == 'r' && p->ptr[1] == 'u' && p->ptr[2] == 'e')) {
306 p->ptr += 3;
307 ON_ADD_BOOLEAN(p->ud, 1);
308 } else {
309 parser_throw_error(p, "Invalid boolean, at: %s[:%lu]", parser_error_content(p), currpos(p));
310 }
311}
312
313static inline void parser_process_null(json_parser_t *p) {
314 if (likely(p->ptr[0] == 'u' && p->ptr[1] == 'l' && p->ptr[2] == 'l')) {
315 p->ptr += 3;
316 ON_ADD_NULL(p->ud);
317 } else {
318 parser_throw_error(p, "Invalid null, at: %s[:%lu]", parser_error_content(p), currpos(p));
319 }
320}
321
322static inline uint32_t parser_read_hex(json_parser_t *p) {
323 uint32_t cp = 0;
324 unsigned char ch;
325 int i = 4;
326 while (i--) {
327 ch = (unsigned char)get_and_next(p);
328 if ('0' <= ch && ch <= '9')
329 ch -= '0';
330 else if (ch >= 'a' && ch <= 'f')
331 ch = ch - 'a' + 10;
332 else if (ch >= 'A' && ch <= 'F')
333 ch = ch - 'A' + 10;
334 else {
335 parser_throw_error(p, "Invalid utf8 escape sequence, at: %s[:%lu]", parser_error_content(p), currpos(p));
336 return cp;
337 }
338 cp = (cp << 4) + ch;
339 }
340 return cp;
341}
342
343static inline void parser_process_utf8esc(json_parser_t *p) {
344 uint32_t cp = parser_read_hex(p);
345 // UTF-16 surrogate pairs, see https://unicodebook.readthedocs.io/unicode_encodings.html#utf-16-surrogate-pairs
346 if (cp >= 0xD800 && cp <= 0xDBFF) {
347 char p0 = p->ptr[0];
348 char p1 = p->ptr[1];
349 if (p0 != '\\' || p1 != 'u')
350 parser_throw_error(p, "Invalid utf8 escape sequence, at: %s[:%lu]", parser_error_content(p), currpos(p));
351 p->ptr += 2;
352 uint32_t cp2 = parser_read_hex(p);
353 if (cp2 < 0xDC00 || cp2 > 0xDFFF)
354 parser_throw_error(p, "Invalid utf8 escape sequence, at: %s[:%lu]", parser_error_content(p), currpos(p));
355 cp = 0x10000 + (((cp & 0x03FF) << 10) | (cp2 & 0x03FF));
356 }
357 if (cp < 0x80) {
358 membuffer_putc_unsafe(&p->buff, (char)cp);
359 } else if (cp < 0x800) {
360 membuffer_putc_unsafe(&p->buff, 0xC0 | (cp >> 6));
361 membuffer_putc_unsafe(&p->buff, 0x80 | (cp & 0x3F));
362 } else if (cp < 0x10000) {
363 membuffer_putc_unsafe(&p->buff, 0xE0 | (cp >> 12));
364 membuffer_putc_unsafe(&p->buff, 0x80 | ((cp >> 6) & 0x3F));
365 membuffer_putc_unsafe(&p->buff, 0x80 | (cp & 0x3F));
366 } else {
367 membuffer_putc_unsafe(&p->buff, 0xF0 | (cp >> 18));
368 membuffer_putc_unsafe(&p->buff, 0x80 | ((cp >> 12) & 0x3F));
369 membuffer_putc_unsafe(&p->buff, 0x80 | ((cp >> 6) & 0x3F));
370 membuffer_putc_unsafe(&p->buff, 0x80 | (cp & 0x3F));
371 }
372}
373
// Maps the character following a backslash to the character it stands for;
// 0 means "not a simple escape". Laid out 16 entries per row; indices
// 0x80..0xFF are omitted and therefore zero-initialised.
static const char escape2char[256] = {
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,      // 0x00
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,      // 0x10
    0, 0, '\"', 0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    '/',    // 0x20: '"' and '/'
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,      // 0x30
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,      // 0x40
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, '\\', 0, 0,    0,      // 0x50: '\\'
    0, 0, '\b', 0, 0,    0, '\f', 0, 0, 0, 0, 0, 0,    0, '\n', 0,      // 0x60: 'b', 'f', 'n'
    0, 0, '\r', 0, '\t', 0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,      // 0x70: 'r', 't'
};
389
390static inline void parser_process_string(json_parser_t *p) {
391 membuffer_reset(&p->buff);
392 char ch = get_and_next(p);
393 for (;;) {
394 if (ch == '\\') {
395 unsigned char nch = (unsigned char)peekchar(p);
396 if (likely(escape2char[nch])) {
397 savechar(p, escape2char[nch]);
398 skipchar(p);
399 } else if (nch == 'u') {
400 skipchar(p);
401 parser_process_utf8esc(p);
402 } else {
403 parser_throw_error(p, "Invalid escape sequence, at: %s[:%lu]", parser_error_content(p), currpos(p));
404 }
405 } else if (ch == '"') {
406 break;
407 } else if ((unsigned char)ch < 0x20) {
408 parser_throw_error(p, "Invalid string, at: %s[:%lu]", parser_error_content(p), currpos(p));
409 } else {
410 savechar(p, ch);
411 }
412 ch = get_and_next(p);
413 }
414}
415
416#define invalid_number(p) parser_throw_error(p, "Invalid value, at: %s[:%lu]", parser_error_content(p), currpos(p))
417#define MAXBY10 (int64_t)(922337203685477580)
418#define MAXLASTD (int)(7)
419static double powersOf10[] = {10., 100., 1.0e4, 1.0e8, 1.0e16, 1.0e32, 1.0e64, 1.0e128, 1.0e256};
420static inline void parser_process_number(json_parser_t *p, char ch) {
421 double db; // 浮点数
422 int64_t in = 0; // 整型值
423 int isdouble = 0; // 是否是浮点数
424 int neg = 0; // 是否是负数
425 int exponent = 0; // 指数位数
426
427 if (ch == '-') { // 负值
428 neg = 1;
429 ch = get_and_next(p);
430 }
431 if (unlikely(ch == '0')) { // 0开头的后面只能是:.eE或\0
432 ch = peekchar(p);
433 } else if (likely(ch >= '1' && ch <= '9')) {
434 in = ch - '0';
435 ch = peekchar(p);
436 while (ch >= '0' && ch <= '9') {
437 if (unlikely(in >= MAXBY10 && (in > MAXBY10 || (ch - '0') > MAXLASTD + neg))) { // 更大的数字就用浮点数表示
438 isdouble = 1;
439 db = (double)in;
440 do {
441 db = db * 10.0 + (ch - '0');
442 ch = next_and_get(p);
443 } while (ch >= '0' && ch <= '9');
444 break;
445 }
446 in = in * 10 + (ch - '0');
447 ch = next_and_get(p);
448 }
449 } else {
450 invalid_number(p);
451 }
452
453 if (ch == '.') { // 小数点部分
454 if (likely(!isdouble)) {
455 isdouble = 1;
456 db = (double)in;
457 }
458 ch = next_and_get(p);
459 if (unlikely(!(ch >= '0' && ch <= '9')))
460 invalid_number(p); // .后面一定是数字
461 do {
462 db = db * 10. + (ch - '0');
463 exponent--;
464 ch = next_and_get(p);
465 } while (ch >= '0' && ch <= '9');
466 }
467
468 if (ch == 'e' || ch == 'E') { // 指数部分
469 if (!isdouble) { // 有e强制认为是浮点数
470 isdouble = 1;
471 db = (double)in;
472 }
473 ch = next_and_get(p);
474 int eneg = 0;
475 if (ch == '-') {
476 eneg = 1;
477 ch = next_and_get(p);
478 } else if (ch == '+') {
479 ch = next_and_get(p);
480 }
481 if (unlikely(!(ch >= '0' && ch <= '9')))
482 invalid_number(p); // 后面一定是数字
483 int exp = 0;
484 do {
485 exp = exp * 10. + (ch - '0');
486 ch = next_and_get(p);
487 } while (ch >= '0' && ch <= '9');
488 exponent += eneg ? (-exp) : (exp);
489 }
490
491 if (isdouble) {
492 int n = exponent < 0 ? -exponent : exponent;
493 if (unlikely(n>511))
494 n = 511; // inf
495 double p10 = 1.0;
496 double *d;
497 for (d = powersOf10; n != 0; n >>= 1, d += 1) {
498 if (n & 1) p10 *= *d;
499 }
500 if (exponent < 0)
501 db /= p10;
502 else
503 db *= p10;
504 if (neg) db = -db;
505 ON_ADD_FLOAT(p->ud, db);
506 } else {
507 if (neg) in = -in;
508 ON_ADD_INTEGER(p->ud, in);
509 }
510}
511
512static void parser_process_value(json_parser_t *p);
513
514static inline void parser_process_object(json_parser_t *p) {
515 parser_add_depth(p);
516 ON_ADD_OBJECT(p->ud);
517 parser_skip_whitespaces(p);
518 char ch = peekchar(p);
519 if (ch == '}') {
520 skipchar(p);
521 p->curdepth--;
522 return;
523 }
524 for (;;) {
525 parser_expect_char(p, '"');
526 parser_process_string(p); // key
527 ON_BEGIN_PAIR(p->ud, p->buff.b, p->buff.sz);
528
529 parser_skip_whitespaces(p);
530 parser_expect_char(p, ':');
531
532 parser_process_value(p); // value
533 ON_END_PAIR(p->ud);
534
535 parser_skip_whitespaces(p);
536 if (peekchar(p) == '}') {
537 skipchar(p);
538 p->curdepth--;
539 return;
540 }
541 else {
542 parser_expect_char(p, ',');
543 parser_skip_whitespaces(p);
544 }
545 }
546}
547
548static inline void parser_process_array(json_parser_t *p) {
549 parser_add_depth(p);
550 ON_ADD_ARRAY(p->ud);
551 parser_skip_whitespaces(p);
552 char ch = peekchar(p);
553 if (ch == ']') {
554 skipchar(p);
555 p->curdepth--;
556 return;
557 }
558 int i;
559 for (i = 0; ;++i) {
560 parser_process_value(p);
561 ON_ADD_INDEX(p->ud, i);
562
563 parser_skip_whitespaces(p);
564 if (peekchar(p) == ']') {
565 skipchar(p);
566 p->curdepth--;
567 return;
568 }
569 else {
570 parser_expect_char(p, ',');
571 }
572 }
573}
574
575static void parser_process_value(json_parser_t *p) {
576 parser_skip_whitespaces(p);
577 char ch = get_and_next(p);
578 switch (ch) {
579 case 'f':
580 parser_process_false(p);
581 break;
582 case 't':
583 parser_process_true(p);
584 break;
585 case 'n':
586 parser_process_null(p);
587 break;
588 case '"':
589 parser_process_string(p);
590 ON_ADD_STRING(p->ud, p->buff.b, p->buff.sz);
591 break;
592 case '{':
593 parser_process_object(p);
594 break;
595 case '[':
596 parser_process_array(p);
597 break;
598 default:
599 parser_process_number(p, ch);
600 break;
601 }
602}
603
604// 解析json文本
605static void parser_do_parse(const char *str, size_t size, void *ud, int maxdepth, int allowcomment) {
606 json_parser_t p;
607 parser_init(&p, str, size, ud, maxdepth, allowcomment);
608 //>>>if (setjmp(p.jb) == 0) {
609 parser_process_value(&p);
610 parser_skip_whitespaces(&p);
611 if (peekchar(&p) != '\0') {
612 parser_throw_error(&p, "Expect '<eof>' but got '%c', at: %s[:%lu]", peekchar(&p),
613 parser_error_content(&p), currpos(&p));
614 }
615 parser_free(&p);
616 //>>>}
617}
618
619//-----------------------------------------------------------------------------
620// dumpper
621
622typedef struct {
623 membuffer_t buff; // 临时缓存
624 int maxdepth; // 最大层次
625 int format; // 是否格式化
626 int empty_as_array; // 空表是否当成数组
627 int num_as_str; // 数字Key转为字符串
628 char errmsg[ERRMSG_SIZE]; // 保存错误消息
629} json_dumpper_t;
630
631// 足够转换数字的缓存大小
632#define NUMBER_BUFF_SZ 44
633#define INTEGER_BUFF_SZ 24
634
635// 抛出错误
636static void dumpper_throw_error(json_dumpper_t *d, lua_State *L, const char *fmt, ...) {
637 membuffer_free(&d->buff);
638 va_list arg;
639 va_start(arg, fmt);
640 vsnprintf(d->errmsg, ERRMSG_SIZE, fmt, arg);
641 va_end(arg);
642 luaL_error(L, d->errmsg);
643}
644
#if LUA_VERSION_NUM > 501
// Append the integer at stack index `idx` in decimal.
// Digits are produced back to front in a local array. The sign is handled on
// the unsigned magnitude, so the most negative value needs no special case.
static void dumpper_process_integer(json_dumpper_t *d, lua_State *L, int idx) {
    char digits[INTEGER_BUFF_SZ];
    char *const end = digits + INTEGER_BUFF_SZ;
    char *head = end;
    membuffer_t *out = &d->buff;

    membuffer_ensure_space(out, INTEGER_BUFF_SZ);
    int64_t value = (int64_t)lua_tointeger(L, idx);
    uint64_t mag = (uint64_t)value;
    if (value < 0) {
        membuffer_putc_unsafe(out, '-');
        mag = 0 - mag;  // two's-complement negation on the unsigned value
    }
    do {
        *--head = (char)('0' + (mag % 10));
        mag /= 10;
    } while (mag != 0);
    membuffer_putb_unsafe(out, head, (size_t)(end - head));
}
#endif
662
663static void dumpper_process_number(json_dumpper_t *d, lua_State *L, int idx) {
664 lua_Number num = lua_tonumber(L, idx);
665 if (isinf(num) || isnan(num))
666 dumpper_throw_error(d, L, "The number is NaN or Infinity");
667 membuffer_ensure_space(&d->buff, NUMBER_BUFF_SZ);
668 char *p = membuffer_getp(&d->buff);
669 int len = sprintf(p, LUA_NUMBER_FMT, num);
670 membuffer_add_size(&d->buff, len);
671}
672
// Escape table for output: 0 means "write the byte as is"; 'u' means "write a
// \u00XX escape"; any other value is the character to write after a
// backslash. Laid out 16 entries per row; indices 0x80..0xFF are omitted and
// therefore zero-initialised.
static const char char2escape[256] = {
    'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'b', 't', 'n', 'u', 'f',  'r', 'u', 'u',   // 0x00
    'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u',  'u', 'u', 'u',   // 0x10
    0,   0,   '"', 0,   0,   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,     // 0x20: '"'
    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,     // 0x30
    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,     // 0x40
    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   '\\', 0,   0,   0,     // 0x50: '\\'
    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,     // 0x60
    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   'u',   // 0x70: DEL
};
// Upper-case hexadecimal digits, indexed by nibble value.
static const char hex_digits[16] = {
    '0', '1', '2', '3', '4', '5', '6', '7',
    '8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
};
690
691static void dumpper_process_string(json_dumpper_t *d, lua_State *L, int idx) {
692 membuffer_t *buff = &d->buff;
693 size_t len, i;
694 const char *str = lua_tolstring(L, idx, &len);
695 membuffer_ensure_space(buff, len * 6 + 2);
696 membuffer_putc_unsafe(buff, '\"');
697 char esc;
698 unsigned char ch;
699 for (i = 0; i < len; ++i) {
700 ch = (unsigned char)str[i];
701 esc = char2escape[ch];
702 if (likely(!esc))
703 membuffer_putc_unsafe(buff, (char)ch);
704 else {
705 membuffer_putc_unsafe(buff, '\\');
706 membuffer_putc_unsafe(buff, esc);
707 if (esc == 'u') {
708 membuffer_putc_unsafe(buff, '0');
709 membuffer_putc_unsafe(buff, '0');
710 membuffer_putc_unsafe(buff, hex_digits[(unsigned char)esc >> 4]);
711 membuffer_putc_unsafe(buff, hex_digits[(unsigned char)esc & 0xF]);
712 }
713 }
714 }
715 membuffer_putc_unsafe(buff, '\"');
716}
717
718static void dumpper_process_value(json_dumpper_t *d, lua_State *L, int depth);
719
720static int dumpper_check_array(json_dumpper_t *d, lua_State *L, int *len) {
721 int asize = lua_objlen(L, -1);
722 if (asize > 0) {
723 lua_pushinteger(L, asize);
724 if (lua_next(L, -2) == 0) {
725 *len = asize;
726 return 1;
727 } else {
728 lua_pop(L, 2);
729 return 0;
730 }
731 } else {
732 lua_pushnil(L);
733 if (lua_next(L, -2) == 0) {
734 *len = asize;
735 return d->empty_as_array;
736 } else {
737 lua_pop(L, 2);
738 return 0;
739 }
740 }
741}
742
743static inline void dumpper_add_indent(json_dumpper_t *d, int count) {
744 membuffer_ensure_space(&d->buff, count);
745 int i;
746 for (i = 0; i < count; ++i)
747 membuffer_putc_unsafe(&d->buff, '\t');
748}
749
750static void dumpper_process_array(json_dumpper_t *d, lua_State *L, int len, int depth) {
751 membuffer_t *buff = &d->buff;
752 membuffer_putc(buff, '[');
753
754 int i;
755 for (i = 1; i <= len; ++i) {
756 if (unlikely(d->format && i == 1)) membuffer_putc(buff, '\n');
757 lua_rawgeti(L, -1, i);
758 if (unlikely(d->format)) dumpper_add_indent(d, depth);
759 dumpper_process_value(d, L, depth);
760 lua_pop(L, 1);
761 if (i < len)
762 membuffer_putc(buff, ',');
763 if (unlikely(d->format)) membuffer_putc(buff, '\n');
764 }
765
766 if (unlikely(d->format && i > 1)) dumpper_add_indent(d, depth-1);
767 membuffer_putc(buff, ']');
768}
769
770static void dumpper_process_object(json_dumpper_t *d, lua_State *L, int depth) {
771 membuffer_t *buff = &d->buff;
772 membuffer_putc(buff, '{');
773
774 int ktp;
775 int comma = 0;
776 lua_pushnil(L); // t nil
777 while (lua_next(L, -2) != 0) { // t k v
778 if (comma) {
779 membuffer_putc(buff, ',');
780 if (unlikely(d->format)) membuffer_putc(buff, '\n');
781 } else {
782 comma = 1;
783 if (unlikely(d->format)) membuffer_putc(buff, '\n');
784 }
785 // key
786 ktp = lua_type(L, -2);
787 if (ktp == LUA_TSTRING) {
788 if (unlikely(d->format)) dumpper_add_indent(d, depth);
789 dumpper_process_string(d, L, -2);
790 if (likely(!d->format))
791 membuffer_putc(buff, ':');
792 else
793 membuffer_putb(buff, " : ", 3);
794 } else if (ktp == LUA_TNUMBER && d->num_as_str) {
795 if (unlikely(d->format)) dumpper_add_indent(d, depth);
796 membuffer_putc(buff, '\"');
797#if LUA_VERSION_NUM > 501
798 if (lua_isinteger(L, -2))
799 dumpper_process_integer(d, L, -2);
800 else
801#endif
802 dumpper_process_number(d, L, -2);
803 if (likely(!d->format))
804 membuffer_putb(buff, "\":", 2);
805 else
806 membuffer_putb(buff, "\" : ", 4);
807 } else {
808 dumpper_throw_error(d, L, "Table key must be a string");
809 }
810 // value
811 dumpper_process_value(d, L, depth);
812 lua_pop(L, 1);
813 }
814 if (unlikely(d->format && comma)) {
815 membuffer_putc(buff, '\n');
816 dumpper_add_indent(d, depth-1);
817 }
818 membuffer_putc(buff, '}');
819}
820
821static inline void dumpper_process_table(json_dumpper_t *d, lua_State *L, int depth) {
822 depth++;
823 if (depth > d->maxdepth)
824 dumpper_throw_error(d, L, "Too many nested data, max depth is %d", d->maxdepth);
825 luaL_checkstack(L, 6, NULL);
826
827 int len;
828 if (dumpper_check_array(d, L, &len))
829 dumpper_process_array(d, L, len, depth);
830 else
831 dumpper_process_object(d, L, depth);
832}
833
834static void dumpper_process_value(json_dumpper_t *d, lua_State *L, int depth) {
835 int tp = lua_type(L, -1);
836 switch (tp) {
837 case LUA_TSTRING:
838 dumpper_process_string(d, L, -1);
839 break;
840 case LUA_TNUMBER:
841#if LUA_VERSION_NUM > 501
842 if (lua_isinteger(L, -1))
843 dumpper_process_integer(d, L, -1);
844 else
845#endif
846 dumpper_process_number(d, L, -1);
847 break;
848 case LUA_TBOOLEAN:
849 if (lua_toboolean(L, -1))
850 membuffer_putb(&d->buff, "true", 4);
851 else
852 membuffer_putb(&d->buff, "false", 5);
853 break;
854 case LUA_TTABLE:
855 dumpper_process_table(d, L, depth);
856 break;
857 case LUA_TNIL:
858 membuffer_putb(&d->buff, "null", 4);
859 break;
860 case LUA_TLIGHTUSERDATA:
861 if (lua_touserdata(L, -1) == NULL) {
862 membuffer_putb(&d->buff, "null", 4);
863 break;
864 }
865 goto error;
866 default:
867 error:
868 dumpper_throw_error(d, L, "Unsupport type %s", lua_typename(L, tp));
869 }
870}
871
872//-----------------------------------------------------------------------------
873// 接口
874#define DEF_MAX_DEPTH 128
875
876// 从字符串加载:json.decode(str, maxdepth) -> obj
877// 要求字符串必须以0结尾
878int colibc_json_decode(lua_State *L) {
879 size_t size;
880 const char *str = luaL_checklstring(L, 1, &size);
881 int maxdepth = (int)luaL_optinteger(L, 2, DEF_MAX_DEPTH);
882 int allowcomment = lua_toboolean(L, 3);
883 parser_do_parse(str, size, L, maxdepth, allowcomment);
884 return 1;
885}
886
887// 保存到字符串: json.encode(obj) -> str
888int colibc_json_encode(lua_State *L) {
889 luaL_checkany(L, 1);
890 json_dumpper_t dumpper;
891 membuffer_init(&dumpper.buff);
892 dumpper.format = lua_toboolean(L, 2);
893 dumpper.empty_as_array = lua_toboolean(L, 3);
894 dumpper.num_as_str = lua_toboolean(L, 4);
895 dumpper.maxdepth = (int)luaL_optinteger(L, 5, DEF_MAX_DEPTH);
896
897 lua_settop(L, 1);
898 dumpper_process_value(&dumpper, L, 0);
899 lua_pushlstring(L, dumpper.buff.b, dumpper.buff.sz);
900 membuffer_free(&dumpper.buff);
901 return 1;
902}
903
904static const luaL_Reg lib[] = {
905 {"decode", colibc_json_decode},
906 {"encode", colibc_json_encode},
907 {NULL, NULL},
908};
909
910LUALIB_API int luaopen_colibc_json(lua_State* L) {
911#if LUA_VERSION_NUM > 501
912 luaL_newlib(L, lib); // json
913#else
914 lua_getglobal(L, "package"); // package
915 lua_getfield(L, -1, "loaded"); // package loaded
916 lua_createtable(L, 0, 0); // package loaded json
917 lua_pushvalue(L, -1); // package loaded json json
918 lua_setfield(L, -3, "cojson"); // loaded["cojson"] = json, package loaded json
919 luaL_register(L, NULL, lib); // package loaded json
920#endif
921 // json.null
922 lua_pushlightuserdata(L, NULL);
923 lua_setfield(L, -2, "null");
924 return 1;
925}
diff --git a/src/3rdParty/utf8cpp.h b/src/3rdParty/utf8cpp.h
new file mode 100755
index 0000000..76f0fa1
--- /dev/null
+++ b/src/3rdParty/utf8cpp.h
@@ -0,0 +1,1277 @@
1// Copyright 2006 Nemanja Trifunovic
2
3/*
4Permission is hereby granted, free of charge, to any person or organization
5obtaining a copy of the software and accompanying documentation covered by
6this license (the "Software") to use, reproduce, display, distribute,
7execute, and transmit the Software, and to prepare derivative works of the
8Software, and to permit third-parties to whom the Software is furnished to
9do so, all subject to the following:
10
11The copyright notices in the Software and this entire statement, including
12the above license grant, this restriction and the following disclaimer,
13must be included in all copies of the Software, in whole or in part, and
14all derivative works of the Software, unless such copies or derivative
15works are solely in the form of machine-executable object code generated by
16a source language processor.
17
18THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
21SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
22FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
23ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24DEALINGS IN THE SOFTWARE.
25*/
26
27
28#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
29#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
30
31/*
32To control the C++ language version used by the library, you can define UTF_CPP_CPLUSPLUS macro
33and set it to one of the values used by the __cplusplus predefined macro.
34
35For instance,
36 #define UTF_CPP_CPLUSPLUS 199711L
37will cause the UTF-8 CPP library to use only types and language features available in the C++ 98 standard.
38Some library features will be disabled.
39
40If you leave UTF_CPP_CPLUSPLUS undefined, it will be internally assigned to __cplusplus.
41*/
42
43#include <iterator>
44#include <cstring>
45#include <string>
46
// Select the C++ standard level the library targets.
// A user-provided UTF_CPP_CPLUSPLUS wins; otherwise fall back to the
// (not always reliable) predefined __cplusplus macro.

#ifndef UTF_CPP_CPLUSPLUS
    #define UTF_CPP_CPLUSPLUS __cplusplus
#endif

#if UTF_CPP_CPLUSPLUS < 201103L // C++ 98/03
    #define UTF_CPP_OVERRIDE
    #define UTF_CPP_NOEXCEPT throw()
    // Emulation of static_assert: only the <true> specialization is complete,
    // so instantiating the <false> case fails to compile.
    template<bool> struct UtfCppCompileTimeAssert;
    template<> struct UtfCppCompileTimeAssert <true> { };
    #define UTF_CPP_STATIC_ASSERT(condition) (UtfCppCompileTimeAssert <(condition) != 0>())
#else // C++ 11 or later
    #define UTF_CPP_OVERRIDE override
    #define UTF_CPP_NOEXCEPT noexcept
    #define UTF_CPP_STATIC_ASSERT(condition) static_assert(condition, "UTFCPP static assert");
#endif // C++ 98/03
67
68
69namespace utf8
70{
// Code-unit types for UTF-8, UTF-16 and UTF-32, chosen per language level.
#if UTF_CPP_CPLUSPLUS < 201103L // C++ 98/03
    typedef unsigned char   utfchar8_t;
    typedef unsigned short  utfchar16_t;
    typedef unsigned int    utfchar32_t;
#else // C++ 11 or later
    #if UTF_CPP_CPLUSPLUS < 202002L // C++ 11/14/17
        typedef unsigned char utfchar8_t;
    #else // C++ 20 or later
        typedef char8_t utfchar8_t;
    #endif
    typedef char16_t utfchar16_t;
    typedef char32_t utfchar32_t;
#endif // C++ 98/03
85
86// Helper code - not intended to be directly called by the library users. May be changed at any time
87namespace internal
88{
89 // Unicode constants
90 // Leading (high) surrogates: 0xd800 - 0xdbff
91 // Trailing (low) surrogates: 0xdc00 - 0xdfff
92 const utfchar16_t LEAD_SURROGATE_MIN = 0xd800u;
93 const utfchar16_t LEAD_SURROGATE_MAX = 0xdbffu;
94 const utfchar16_t TRAIL_SURROGATE_MIN = 0xdc00u;
95 const utfchar16_t TRAIL_SURROGATE_MAX = 0xdfffu;
96 const utfchar16_t LEAD_OFFSET = 0xd7c0u; // LEAD_SURROGATE_MIN - (0x10000 >> 10)
97 const utfchar32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN
98
99 // Maximum valid value for a Unicode code point
100 const utfchar32_t CODE_POINT_MAX = 0x0010ffffu;
101
102 template<typename octet_type>
103 inline utfchar8_t mask8(octet_type oc)
104 {
105 return static_cast<utfchar8_t>(0xff & oc);
106 }
107
108 template<typename u16_type>
109 inline utfchar16_t mask16(u16_type oc)
110 {
111 return static_cast<utfchar16_t>(0xffff & oc);
112 }
113
114 template<typename octet_type>
115 inline bool is_trail(octet_type oc)
116 {
117 return ((utf8::internal::mask8(oc) >> 6) == 0x2);
118 }
119
120 inline bool is_lead_surrogate(utfchar32_t cp)
121 {
122 return (cp >= static_cast<utfchar32_t>(LEAD_SURROGATE_MIN) && cp <= static_cast<utfchar32_t>(LEAD_SURROGATE_MAX));
123 }
124
125 inline bool is_trail_surrogate(utfchar32_t cp)
126 {
127 return (cp >= static_cast<utfchar32_t>(TRAIL_SURROGATE_MIN) && cp <= static_cast<utfchar32_t>(TRAIL_SURROGATE_MAX));
128 }
129
130 inline bool is_surrogate(utfchar32_t cp)
131 {
132 return (cp >= static_cast<utfchar32_t>(LEAD_SURROGATE_MIN) && cp <= static_cast<utfchar32_t>(TRAIL_SURROGATE_MAX));
133 }
134
135 inline bool is_code_point_valid(utfchar32_t cp)
136 {
137 return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
138 }
139
140 inline bool is_in_bmp(utfchar32_t cp)
141 {
142 return cp < utfchar32_t(0x10000);
143 }
144
145 template <typename octet_iterator>
146 int sequence_length(octet_iterator lead_it)
147 {
148 const utfchar8_t lead = utf8::internal::mask8(*lead_it);
149 if (lead < 0x80)
150 return 1;
151 else if ((lead >> 5) == 0x6)
152 return 2;
153 else if ((lead >> 4) == 0xe)
154 return 3;
155 else if ((lead >> 3) == 0x1e)
156 return 4;
157 else
158 return 0;
159 }
160
161 inline bool is_overlong_sequence(utfchar32_t cp, int length)
162 {
163 if (cp < 0x80) {
164 if (length != 1)
165 return true;
166 }
167 else if (cp < 0x800) {
168 if (length != 2)
169 return true;
170 }
171 else if (cp < 0x10000) {
172 if (length != 3)
173 return true;
174 }
175 return false;
176 }
177
// Status codes returned by the internal decoding/validation helpers.
enum utf_error {
    UTF8_OK,
    NOT_ENOUGH_ROOM,
    INVALID_LEAD,
    INCOMPLETE_SEQUENCE,
    OVERLONG_SEQUENCE,
    INVALID_CODE_POINT
};
179
180 /// Helper for get_sequence_x
181 template <typename octet_iterator>
182 utf_error increase_safely(octet_iterator& it, const octet_iterator end)
183 {
184 if (++it == end)
185 return NOT_ENOUGH_ROOM;
186
187 if (!utf8::internal::is_trail(*it))
188 return INCOMPLETE_SEQUENCE;
189
190 return UTF8_OK;
191 }
192
193 #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;}
194
195 /// get_sequence_x functions decode utf-8 sequences of the length x
196 template <typename octet_iterator>
197 utf_error get_sequence_1(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
198 {
199 if (it == end)
200 return NOT_ENOUGH_ROOM;
201
202 code_point = static_cast<utfchar32_t>(utf8::internal::mask8(*it));
203
204 return UTF8_OK;
205 }
206
207 template <typename octet_iterator>
208 utf_error get_sequence_2(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
209 {
210 if (it == end)
211 return NOT_ENOUGH_ROOM;
212
213 code_point = static_cast<utfchar32_t>(utf8::internal::mask8(*it));
214
215 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
216
217 code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
218
219 return UTF8_OK;
220 }
221
222 template <typename octet_iterator>
223 utf_error get_sequence_3(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
224 {
225 if (it == end)
226 return NOT_ENOUGH_ROOM;
227
228 code_point = static_cast<utfchar32_t>(utf8::internal::mask8(*it));
229
230 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
231
232 code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
233
234 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
235
236 code_point = static_cast<utfchar32_t>(code_point + ((*it) & 0x3f));
237
238 return UTF8_OK;
239 }
240
241 template <typename octet_iterator>
242 utf_error get_sequence_4(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
243 {
244 if (it == end)
245 return NOT_ENOUGH_ROOM;
246
247 code_point = static_cast<utfchar32_t>(utf8::internal::mask8(*it));
248
249 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
250
251 code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
252
253 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
254
255 code_point = static_cast<utfchar32_t>(code_point + ((utf8::internal::mask8(*it) << 6) & 0xfff));
256
257 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
258
259 code_point = static_cast<utfchar32_t>(code_point + ((*it) & 0x3f));
260
261 return UTF8_OK;
262 }
263
264 #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR
265
266 template <typename octet_iterator>
267 utf_error validate_next(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
268 {
269 if (it == end)
270 return NOT_ENOUGH_ROOM;
271
272 // Save the original value of it so we can go back in case of failure
273 // Of course, it does not make much sense with i.e. stream iterators
274 octet_iterator original_it = it;
275
276 utfchar32_t cp = 0;
277 // Determine the sequence length based on the lead octet
278 const int length = utf8::internal::sequence_length(it);
279
280 // Get trail octets and calculate the code point
281 utf_error err = UTF8_OK;
282 switch (length) {
283 case 0:
284 return INVALID_LEAD;
285 case 1:
286 err = utf8::internal::get_sequence_1(it, end, cp);
287 break;
288 case 2:
289 err = utf8::internal::get_sequence_2(it, end, cp);
290 break;
291 case 3:
292 err = utf8::internal::get_sequence_3(it, end, cp);
293 break;
294 case 4:
295 err = utf8::internal::get_sequence_4(it, end, cp);
296 break;
297 }
298
299 if (err == UTF8_OK) {
300 // Decoding succeeded. Now, security checks...
301 if (utf8::internal::is_code_point_valid(cp)) {
302 if (!utf8::internal::is_overlong_sequence(cp, length)){
303 // Passed! Return here.
304 code_point = cp;
305 ++it;
306 return UTF8_OK;
307 }
308 else
309 err = OVERLONG_SEQUENCE;
310 }
311 else
312 err = INVALID_CODE_POINT;
313 }
314
315 // Failure branch - restore the original value of the iterator
316 it = original_it;
317 return err;
318 }
319
320 template <typename octet_iterator>
321 inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
322 utfchar32_t ignored;
323 return utf8::internal::validate_next(it, end, ignored);
324 }
325
326 template <typename word_iterator>
327 utf_error validate_next16(word_iterator& it, word_iterator end, utfchar32_t& code_point)
328 {
329 // Make sure the iterator dereferences a large enough type
330 typedef typename std::iterator_traits<word_iterator>::value_type word_type;
331 UTF_CPP_STATIC_ASSERT(sizeof(word_type) >= sizeof(utfchar16_t));
332 // Check the edge case:
333 if (it == end)
334 return NOT_ENOUGH_ROOM;
335 // Save the original value of it so we can go back in case of failure
336 // Of course, it does not make much sense with i.e. stream iterators
337 word_iterator original_it = it;
338
339 utf_error err = UTF8_OK;
340
341 const utfchar16_t first_word = *it++;
342 if (!is_surrogate(first_word)) {
343 code_point = first_word;
344 return UTF8_OK;
345 }
346 else {
347 if (it == end)
348 err = NOT_ENOUGH_ROOM;
349 else if (is_lead_surrogate(first_word)) {
350 const utfchar16_t second_word = *it++;
351 if (is_trail_surrogate(static_cast<utfchar32_t>(second_word))) {
352 code_point = static_cast<utfchar32_t>(first_word << 10) + static_cast<utfchar32_t>(second_word) + SURROGATE_OFFSET;
353 return UTF8_OK;
354 } else
355 err = INCOMPLETE_SEQUENCE;
356
357 } else {
358 err = INVALID_LEAD;
359 }
360 }
361 // error branch
362 it = original_it;
363 return err;
364 }
365
366 // Internal implementation of both checked and unchecked append() function
367 // This function will be invoked by the overloads below, as they will know
368 // the octet_type.
369 template <typename octet_iterator, typename octet_type>
370 octet_iterator append(utfchar32_t cp, octet_iterator result) {
371 if (cp < 0x80) // one octet
372 *(result++) = static_cast<octet_type>(cp);
373 else if (cp < 0x800) { // two octets
374 *(result++) = static_cast<octet_type>((cp >> 6) | 0xc0);
375 *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80);
376 }
377 else if (cp < 0x10000) { // three octets
378 *(result++) = static_cast<octet_type>((cp >> 12) | 0xe0);
379 *(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80);
380 *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80);
381 }
382 else { // four octets
383 *(result++) = static_cast<octet_type>((cp >> 18) | 0xf0);
384 *(result++) = static_cast<octet_type>(((cp >> 12) & 0x3f)| 0x80);
385 *(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80);
386 *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80);
387 }
388 return result;
389 }
390
391 // One of the following overloads will be invoked from the API calls
392
393 // A simple (but dangerous) case: the caller appends byte(s) to a char array
394 inline char* append(utfchar32_t cp, char* result) {
395 return append<char*, char>(cp, result);
396 }
397
398 // Hopefully, most common case: the caller uses back_inserter
399 // i.e. append(cp, std::back_inserter(str));
400 template<typename container_type>
401 std::back_insert_iterator<container_type> append
402 (utfchar32_t cp, std::back_insert_iterator<container_type> result) {
403 return append<std::back_insert_iterator<container_type>,
404 typename container_type::value_type>(cp, result);
405 }
406
407 // The caller uses some other kind of output operator - not covered above
408 // Note that in this case we are not able to determine octet_type
409 // so we assume it's utfchar8_t; that can cause a conversion warning if we are wrong.
410 template <typename octet_iterator>
411 octet_iterator append(utfchar32_t cp, octet_iterator result) {
412 return append<octet_iterator, utfchar8_t>(cp, result);
413 }
414
415 // Internal implementation of both checked and unchecked append16() function
416 // This function will be invoked by the overloads below, as they will know
417 // the word_type.
418 template <typename word_iterator, typename word_type>
419 word_iterator append16(utfchar32_t cp, word_iterator result) {
420 UTF_CPP_STATIC_ASSERT(sizeof(word_type) >= sizeof(utfchar16_t));
421 if (is_in_bmp(cp))
422 *(result++) = static_cast<word_type>(cp);
423 else {
424 // Code points from the supplementary planes are encoded via surrogate pairs
425 *(result++) = static_cast<word_type>(LEAD_OFFSET + (cp >> 10));
426 *(result++) = static_cast<word_type>(TRAIL_SURROGATE_MIN + (cp & 0x3FF));
427 }
428 return result;
429 }
430
431 // Hopefully, most common case: the caller uses back_inserter
432 // i.e. append16(cp, std::back_inserter(str));
433 template<typename container_type>
434 std::back_insert_iterator<container_type> append16
435 (utfchar32_t cp, std::back_insert_iterator<container_type> result) {
436 return append16<std::back_insert_iterator<container_type>,
437 typename container_type::value_type>(cp, result);
438 }
439
440 // The caller uses some other kind of output operator - not covered above
441 // Note that in this case we are not able to determine word_type
442 // so we assume it's utfchar16_t; that can cause a conversion warning if we are wrong.
443 template <typename word_iterator>
444 word_iterator append16(utfchar32_t cp, word_iterator result) {
445 return append16<word_iterator, utfchar16_t>(cp, result);
446 }
447
448} // namespace internal
449
450 /// The library API - functions intended to be called by the users
451
452 // Byte order mark
453 const utfchar8_t bom[] = {0xef, 0xbb, 0xbf};
454
455 template <typename octet_iterator>
456 octet_iterator find_invalid(octet_iterator start, octet_iterator end)
457 {
458 octet_iterator result = start;
459 while (result != end) {
460 utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end);
461 if (err_code != internal::UTF8_OK)
462 return result;
463 }
464 return result;
465 }
466
467 inline const char* find_invalid(const char* str)
468 {
469 const char* end = str + std::strlen(str);
470 return find_invalid(str, end);
471 }
472
473 inline std::size_t find_invalid(const std::string& s)
474 {
475 std::string::const_iterator invalid = find_invalid(s.begin(), s.end());
476 return (invalid == s.end()) ? std::string::npos : static_cast<std::size_t>(invalid - s.begin());
477 }
478
479 template <typename octet_iterator>
480 inline bool is_valid(octet_iterator start, octet_iterator end)
481 {
482 return (utf8::find_invalid(start, end) == end);
483 }
484
485 inline bool is_valid(const char* str)
486 {
487 return (*(utf8::find_invalid(str)) == '\0');
488 }
489
490 inline bool is_valid(const std::string& s)
491 {
492 return is_valid(s.begin(), s.end());
493 }
494
495
496
497 template <typename octet_iterator>
498 inline bool starts_with_bom (octet_iterator it, octet_iterator end)
499 {
500 return (
501 ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
502 ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
503 ((it != end) && (utf8::internal::mask8(*it)) == bom[2])
504 );
505 }
506
507 inline bool starts_with_bom(const std::string& s)
508 {
509 return starts_with_bom(s.begin(), s.end());
510 }
511} // namespace utf8
512
513#include <stdexcept>
514
515namespace utf8
516{
// Common base class of every exception type the library throws.
class exception : public ::std::exception
{
};
520
521 // Exceptions that may be thrown from the library functions.
522 class invalid_code_point : public exception {
523 utfchar32_t cp;
524 public:
525 invalid_code_point(utfchar32_t codepoint) : cp(codepoint) {}
526 virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid code point"; }
527 utfchar32_t code_point() const {return cp;}
528 };
529
530 class invalid_utf8 : public exception {
531 utfchar8_t u8;
532 public:
533 invalid_utf8 (utfchar8_t u) : u8(u) {}
534 invalid_utf8 (char c) : u8(static_cast<utfchar8_t>(c)) {}
535 virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-8"; }
536 utfchar8_t utf8_octet() const {return u8;}
537 };
538
539 class invalid_utf16 : public exception {
540 utfchar16_t u16;
541 public:
542 invalid_utf16 (utfchar16_t u) : u16(u) {}
543 virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-16"; }
544 utfchar16_t utf16_word() const {return u16;}
545 };
546
547 class not_enough_room : public exception {
548 public:
549 virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Not enough space"; }
550 };
551
552 /// The library API - functions intended to be called by the users
553
554 template <typename octet_iterator>
555 octet_iterator append(utfchar32_t cp, octet_iterator result)
556 {
557 if (!utf8::internal::is_code_point_valid(cp))
558 throw invalid_code_point(cp);
559
560 return internal::append(cp, result);
561 }
562
563 inline void append(utfchar32_t cp, std::string& s)
564 {
565 append(cp, std::back_inserter(s));
566 }
567
568 template <typename word_iterator>
569 word_iterator append16(utfchar32_t cp, word_iterator result)
570 {
571 if (!utf8::internal::is_code_point_valid(cp))
572 throw invalid_code_point(cp);
573
574 return internal::append16(cp, result);
575 }
576
577 template <typename octet_iterator, typename output_iterator>
578 output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement)
579 {
580 while (start != end) {
581 octet_iterator sequence_start = start;
582 internal::utf_error err_code = utf8::internal::validate_next(start, end);
583 switch (err_code) {
584 case internal::UTF8_OK :
585 for (octet_iterator it = sequence_start; it != start; ++it)
586 *out++ = *it;
587 break;
588 case internal::NOT_ENOUGH_ROOM:
589 out = utf8::append (replacement, out);
590 start = end;
591 break;
592 case internal::INVALID_LEAD:
593 out = utf8::append (replacement, out);
594 ++start;
595 break;
596 case internal::INCOMPLETE_SEQUENCE:
597 case internal::OVERLONG_SEQUENCE:
598 case internal::INVALID_CODE_POINT:
599 out = utf8::append (replacement, out);
600 ++start;
601 // just one replacement mark for the sequence
602 while (start != end && utf8::internal::is_trail(*start))
603 ++start;
604 break;
605 }
606 }
607 return out;
608 }
609
610 template <typename octet_iterator, typename output_iterator>
611 inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
612 {
613 static const utfchar32_t replacement_marker = static_cast<utfchar32_t>(utf8::internal::mask16(0xfffd));
614 return utf8::replace_invalid(start, end, out, replacement_marker);
615 }
616
617 inline std::string replace_invalid(const std::string& s, utfchar32_t replacement)
618 {
619 std::string result;
620 replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
621 return result;
622 }
623
624 inline std::string replace_invalid(const std::string& s)
625 {
626 std::string result;
627 replace_invalid(s.begin(), s.end(), std::back_inserter(result));
628 return result;
629 }
630
631 template <typename octet_iterator>
632 utfchar32_t next(octet_iterator& it, octet_iterator end)
633 {
634 utfchar32_t cp = 0;
635 internal::utf_error err_code = utf8::internal::validate_next(it, end, cp);
636 switch (err_code) {
637 case internal::UTF8_OK :
638 break;
639 case internal::NOT_ENOUGH_ROOM :
640 throw not_enough_room();
641 case internal::INVALID_LEAD :
642 case internal::INCOMPLETE_SEQUENCE :
643 case internal::OVERLONG_SEQUENCE :
644 throw invalid_utf8(static_cast<utfchar8_t>(*it));
645 case internal::INVALID_CODE_POINT :
646 throw invalid_code_point(cp);
647 }
648 return cp;
649 }
650
651 template <typename word_iterator>
652 utfchar32_t next16(word_iterator& it, word_iterator end)
653 {
654 utfchar32_t cp = 0;
655 internal::utf_error err_code = utf8::internal::validate_next16(it, end, cp);
656 if (err_code == internal::NOT_ENOUGH_ROOM)
657 throw not_enough_room();
658 return cp;
659 }
660
661 template <typename octet_iterator>
662 utfchar32_t peek_next(octet_iterator it, octet_iterator end)
663 {
664 return utf8::next(it, end);
665 }
666
667 template <typename octet_iterator>
668 utfchar32_t prior(octet_iterator& it, octet_iterator start)
669 {
670 // can't do much if it == start
671 if (it == start)
672 throw not_enough_room();
673
674 octet_iterator end = it;
675 // Go back until we hit either a lead octet or start
676 while (utf8::internal::is_trail(*(--it)))
677 if (it == start)
678 throw invalid_utf8(*it); // error - no lead byte in the sequence
679 return utf8::peek_next(it, end);
680 }
681
682 template <typename octet_iterator, typename distance_type>
683 void advance (octet_iterator& it, distance_type n, octet_iterator end)
684 {
685 const distance_type zero(0);
686 if (n < zero) {
687 // backward
688 for (distance_type i = n; i < zero; ++i)
689 utf8::prior(it, end);
690 } else {
691 // forward
692 for (distance_type i = zero; i < n; ++i)
693 utf8::next(it, end);
694 }
695 }
696
697 template <typename octet_iterator>
698 typename std::iterator_traits<octet_iterator>::difference_type
699 distance (octet_iterator first, octet_iterator last)
700 {
701 typename std::iterator_traits<octet_iterator>::difference_type dist;
702 for (dist = 0; first < last; ++dist)
703 utf8::next(first, last);
704 return dist;
705 }
706
707 template <typename u16bit_iterator, typename octet_iterator>
708 octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
709 {
710 while (start != end) {
711 utfchar32_t cp = static_cast<utfchar32_t>(utf8::internal::mask16(*start++));
712 // Take care of surrogate pairs first
713 if (utf8::internal::is_lead_surrogate(cp)) {
714 if (start != end) {
715 const utfchar32_t trail_surrogate = static_cast<utfchar32_t>(utf8::internal::mask16(*start++));
716 if (utf8::internal::is_trail_surrogate(trail_surrogate))
717 cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
718 else
719 throw invalid_utf16(static_cast<utfchar16_t>(trail_surrogate));
720 }
721 else
722 throw invalid_utf16(static_cast<utfchar16_t>(cp));
723
724 }
725 // Lone trail surrogate
726 else if (utf8::internal::is_trail_surrogate(cp))
727 throw invalid_utf16(static_cast<utfchar16_t>(cp));
728
729 result = utf8::append(cp, result);
730 }
731 return result;
732 }
733
734 template <typename u16bit_iterator, typename octet_iterator>
735 u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
736 {
737 while (start < end) {
738 const utfchar32_t cp = utf8::next(start, end);
739 if (cp > 0xffff) { //make a surrogate pair
740 *result++ = static_cast<utfchar16_t>((cp >> 10) + internal::LEAD_OFFSET);
741 *result++ = static_cast<utfchar16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
742 }
743 else
744 *result++ = static_cast<utfchar16_t>(cp);
745 }
746 return result;
747 }
748
749 template <typename octet_iterator, typename u32bit_iterator>
750 octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
751 {
752 while (start != end)
753 result = utf8::append(*(start++), result);
754
755 return result;
756 }
757
758 template <typename octet_iterator, typename u32bit_iterator>
759 u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
760 {
761 while (start < end)
762 (*result++) = utf8::next(start, end);
763
764 return result;
765 }
766
767 // The iterator class
768 template <typename octet_iterator>
769 class iterator {
770 octet_iterator it;
771 octet_iterator range_start;
772 octet_iterator range_end;
773 public:
774 typedef utfchar32_t value_type;
775 typedef utfchar32_t* pointer;
776 typedef utfchar32_t& reference;
777 typedef std::ptrdiff_t difference_type;
778 typedef std::bidirectional_iterator_tag iterator_category;
779 iterator () {}
780 explicit iterator (const octet_iterator& octet_it,
781 const octet_iterator& rangestart,
782 const octet_iterator& rangeend) :
783 it(octet_it), range_start(rangestart), range_end(rangeend)
784 {
785 if (it < range_start || it > range_end)
786 throw std::out_of_range("Invalid utf-8 iterator position");
787 }
788 // the default "big three" are OK
789 octet_iterator base () const { return it; }
790 utfchar32_t operator * () const
791 {
792 octet_iterator temp = it;
793 return utf8::next(temp, range_end);
794 }
795 bool operator == (const iterator& rhs) const
796 {
797 if (range_start != rhs.range_start || range_end != rhs.range_end)
798 throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
799 return (it == rhs.it);
800 }
801 bool operator != (const iterator& rhs) const
802 {
803 return !(operator == (rhs));
804 }
805 iterator& operator ++ ()
806 {
807 utf8::next(it, range_end);
808 return *this;
809 }
810 iterator operator ++ (int)
811 {
812 iterator temp = *this;
813 utf8::next(it, range_end);
814 return temp;
815 }
816 iterator& operator -- ()
817 {
818 utf8::prior(it, range_start);
819 return *this;
820 }
821 iterator operator -- (int)
822 {
823 iterator temp = *this;
824 utf8::prior(it, range_start);
825 return temp;
826 }
827 }; // class iterator
828
829} // namespace utf8
830
831#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
832namespace utf8
833{
834 inline void append16(utfchar32_t cp, std::u16string& s)
835 {
836 append16(cp, std::back_inserter(s));
837 }
838
839 inline std::string utf16to8(const std::u16string& s)
840 {
841 std::string result;
842 utf16to8(s.begin(), s.end(), std::back_inserter(result));
843 return result;
844 }
845
846 inline std::u16string utf8to16(const std::string& s)
847 {
848 std::u16string result;
849 utf8to16(s.begin(), s.end(), std::back_inserter(result));
850 return result;
851 }
852
853 inline std::string utf32to8(const std::u32string& s)
854 {
855 std::string result;
856 utf32to8(s.begin(), s.end(), std::back_inserter(result));
857 return result;
858 }
859
860 inline std::u32string utf8to32(const std::string& s)
861 {
862 std::u32string result;
863 utf8to32(s.begin(), s.end(), std::back_inserter(result));
864 return result;
865 }
866} // namespace utf8
867#endif // C++ 11 or later
868
869#if UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later
870namespace utf8
871{
872 inline std::string utf16to8(std::u16string_view s)
873 {
874 std::string result;
875 utf16to8(s.begin(), s.end(), std::back_inserter(result));
876 return result;
877 }
878
879 inline std::u16string utf8to16(std::string_view s)
880 {
881 std::u16string result;
882 utf8to16(s.begin(), s.end(), std::back_inserter(result));
883 return result;
884 }
885
886 inline std::string utf32to8(std::u32string_view s)
887 {
888 std::string result;
889 utf32to8(s.begin(), s.end(), std::back_inserter(result));
890 return result;
891 }
892
893 inline std::u32string utf8to32(std::string_view s)
894 {
895 std::u32string result;
896 utf8to32(s.begin(), s.end(), std::back_inserter(result));
897 return result;
898 }
899
900 inline std::size_t find_invalid(std::string_view s)
901 {
902 std::string_view::const_iterator invalid = find_invalid(s.begin(), s.end());
903 return (invalid == s.end()) ? std::string_view::npos : static_cast<std::size_t>(invalid - s.begin());
904 }
905
906 inline bool is_valid(std::string_view s)
907 {
908 return is_valid(s.begin(), s.end());
909 }
910
911 inline std::string replace_invalid(std::string_view s, char32_t replacement)
912 {
913 std::string result;
914 replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
915 return result;
916 }
917
918 inline std::string replace_invalid(std::string_view s)
919 {
920 std::string result;
921 replace_invalid(s.begin(), s.end(), std::back_inserter(result));
922 return result;
923 }
924
925 inline bool starts_with_bom(std::string_view s)
926 {
927 return starts_with_bom(s.begin(), s.end());
928 }
929
930} // namespace utf8
931#endif // C++ 17 or later
932
933#if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later
934namespace utf8
935{
936 inline std::u8string utf16tou8(const std::u16string& s)
937 {
938 std::u8string result;
939 utf16to8(s.begin(), s.end(), std::back_inserter(result));
940 return result;
941 }
942
943 inline std::u8string utf16tou8(std::u16string_view s)
944 {
945 std::u8string result;
946 utf16to8(s.begin(), s.end(), std::back_inserter(result));
947 return result;
948 }
949
950 inline std::u16string utf8to16(const std::u8string& s)
951 {
952 std::u16string result;
953 utf8to16(s.begin(), s.end(), std::back_inserter(result));
954 return result;
955 }
956
957 inline std::u16string utf8to16(const std::u8string_view& s)
958 {
959 std::u16string result;
960 utf8to16(s.begin(), s.end(), std::back_inserter(result));
961 return result;
962 }
963
964 inline std::u8string utf32tou8(const std::u32string& s)
965 {
966 std::u8string result;
967 utf32to8(s.begin(), s.end(), std::back_inserter(result));
968 return result;
969 }
970
971 inline std::u8string utf32tou8(const std::u32string_view& s)
972 {
973 std::u8string result;
974 utf32to8(s.begin(), s.end(), std::back_inserter(result));
975 return result;
976 }
977
978 inline std::u32string utf8to32(const std::u8string& s)
979 {
980 std::u32string result;
981 utf8to32(s.begin(), s.end(), std::back_inserter(result));
982 return result;
983 }
984
985 inline std::u32string utf8to32(const std::u8string_view& s)
986 {
987 std::u32string result;
988 utf8to32(s.begin(), s.end(), std::back_inserter(result));
989 return result;
990 }
991
992 inline std::size_t find_invalid(const std::u8string& s)
993 {
994 std::u8string::const_iterator invalid = find_invalid(s.begin(), s.end());
995 return (invalid == s.end()) ? std::string_view::npos : static_cast<std::size_t>(invalid - s.begin());
996 }
997
998 inline bool is_valid(const std::u8string& s)
999 {
1000 return is_valid(s.begin(), s.end());
1001 }
1002
1003 inline std::u8string replace_invalid(const std::u8string& s, char32_t replacement)
1004 {
1005 std::u8string result;
1006 replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
1007 return result;
1008 }
1009
1010 inline std::u8string replace_invalid(const std::u8string& s)
1011 {
1012 std::u8string result;
1013 replace_invalid(s.begin(), s.end(), std::back_inserter(result));
1014 return result;
1015 }
1016
1017 inline bool starts_with_bom(const std::u8string& s)
1018 {
1019 return starts_with_bom(s.begin(), s.end());
1020 }
1021
1022} // namespace utf8
1023#endif // C++ 20 or later
1024
1025namespace utf8
1026{
1027 namespace unchecked
1028 {
1029 template <typename octet_iterator>
1030 octet_iterator append(utfchar32_t cp, octet_iterator result)
1031 {
1032 return internal::append(cp, result);
1033 }
1034
1035 template <typename word_iterator>
1036 word_iterator append16(utfchar32_t cp, word_iterator result)
1037 {
1038 return internal::append16(cp, result);
1039 }
1040
1041 template <typename octet_iterator, typename output_iterator>
1042 output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement)
1043 {
1044 while (start != end) {
1045 octet_iterator sequence_start = start;
1046 internal::utf_error err_code = utf8::internal::validate_next(start, end);
1047 switch (err_code) {
1048 case internal::UTF8_OK :
1049 for (octet_iterator it = sequence_start; it != start; ++it)
1050 *out++ = *it;
1051 break;
1052 case internal::NOT_ENOUGH_ROOM:
1053 out = utf8::unchecked::append(replacement, out);
1054 start = end;
1055 break;
1056 case internal::INVALID_LEAD:
1057 out = utf8::unchecked::append(replacement, out);
1058 ++start;
1059 break;
1060 case internal::INCOMPLETE_SEQUENCE:
1061 case internal::OVERLONG_SEQUENCE:
1062 case internal::INVALID_CODE_POINT:
1063 out = utf8::unchecked::append(replacement, out);
1064 ++start;
1065 // just one replacement mark for the sequence
1066 while (start != end && utf8::internal::is_trail(*start))
1067 ++start;
1068 break;
1069 }
1070 }
1071 return out;
1072 }
1073
1074 template <typename octet_iterator, typename output_iterator>
1075 inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
1076 {
1077 static const utfchar32_t replacement_marker = static_cast<utfchar32_t>(utf8::internal::mask16(0xfffd));
1078 return utf8::unchecked::replace_invalid(start, end, out, replacement_marker);
1079 }
1080
1081 inline std::string replace_invalid(const std::string& s, utfchar32_t replacement)
1082 {
1083 std::string result;
1084 replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
1085 return result;
1086 }
1087
1088 inline std::string replace_invalid(const std::string& s)
1089 {
1090 std::string result;
1091 replace_invalid(s.begin(), s.end(), std::back_inserter(result));
1092 return result;
1093 }
1094
1095 template <typename octet_iterator>
1096 utfchar32_t next(octet_iterator& it)
1097 {
1098 utfchar32_t cp = utf8::internal::mask8(*it);
1099 switch (utf8::internal::sequence_length(it)) {
1100 case 1:
1101 break;
1102 case 2:
1103 ++it;
1104 cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
1105 break;
1106 case 3:
1107 ++it;
1108 cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
1109 ++it;
1110 cp = static_cast<utfchar32_t>(cp + ((*it) & 0x3f));
1111 break;
1112 case 4:
1113 ++it;
1114 cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
1115 ++it;
1116 cp = static_cast<utfchar32_t>(cp + ((utf8::internal::mask8(*it) << 6) & 0xfff));
1117 ++it;
1118 cp = static_cast<utfchar32_t>(cp + ((*it) & 0x3f));
1119 break;
1120 }
1121 ++it;
1122 return cp;
1123 }
1124
1125 template <typename octet_iterator>
1126 utfchar32_t peek_next(octet_iterator it)
1127 {
1128 return utf8::unchecked::next(it);
1129 }
1130
1131 template <typename word_iterator>
1132 utfchar32_t next16(word_iterator& it)
1133 {
1134 utfchar32_t cp = utf8::internal::mask16(*it++);
1135 if (utf8::internal::is_lead_surrogate(cp))
1136 return (cp << 10) + *it++ + utf8::internal::SURROGATE_OFFSET;
1137 return cp;
1138 }
1139
1140 template <typename octet_iterator>
1141 utfchar32_t prior(octet_iterator& it)
1142 {
1143 while (utf8::internal::is_trail(*(--it))) ;
1144 octet_iterator temp = it;
1145 return utf8::unchecked::next(temp);
1146 }
1147
1148 template <typename octet_iterator, typename distance_type>
1149 void advance(octet_iterator& it, distance_type n)
1150 {
1151 const distance_type zero(0);
1152 if (n < zero) {
1153 // backward
1154 for (distance_type i = n; i < zero; ++i)
1155 utf8::unchecked::prior(it);
1156 } else {
1157 // forward
1158 for (distance_type i = zero; i < n; ++i)
1159 utf8::unchecked::next(it);
1160 }
1161 }
1162
1163 template <typename octet_iterator>
1164 typename std::iterator_traits<octet_iterator>::difference_type
1165 distance(octet_iterator first, octet_iterator last)
1166 {
1167 typename std::iterator_traits<octet_iterator>::difference_type dist;
1168 for (dist = 0; first < last; ++dist)
1169 utf8::unchecked::next(first);
1170 return dist;
1171 }
1172
1173 template <typename u16bit_iterator, typename octet_iterator>
1174 octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end, octet_iterator result)
1175 {
1176 while (start != end) {
1177 utfchar32_t cp = utf8::internal::mask16(*start++);
1178 // Take care of surrogate pairs first
1179 if (utf8::internal::is_lead_surrogate(cp)) {
1180 if (start == end)
1181 return result;
1182 utfchar32_t trail_surrogate = utf8::internal::mask16(*start++);
1183 cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
1184 }
1185 result = utf8::unchecked::append(cp, result);
1186 }
1187 return result;
1188 }
1189
1190 template <typename u16bit_iterator, typename octet_iterator>
1191 u16bit_iterator utf8to16(octet_iterator start, octet_iterator end, u16bit_iterator result)
1192 {
1193 while (start < end) {
1194 utfchar32_t cp = utf8::unchecked::next(start);
1195 if (cp > 0xffff) { //make a surrogate pair
1196 *result++ = static_cast<utfchar16_t>((cp >> 10) + internal::LEAD_OFFSET);
1197 *result++ = static_cast<utfchar16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
1198 }
1199 else
1200 *result++ = static_cast<utfchar16_t>(cp);
1201 }
1202 return result;
1203 }
1204
1205 template <typename octet_iterator, typename u32bit_iterator>
1206 octet_iterator utf32to8(u32bit_iterator start, u32bit_iterator end, octet_iterator result)
1207 {
1208 while (start != end)
1209 result = utf8::unchecked::append(*(start++), result);
1210
1211 return result;
1212 }
1213
1214 template <typename octet_iterator, typename u32bit_iterator>
1215 u32bit_iterator utf8to32(octet_iterator start, octet_iterator end, u32bit_iterator result)
1216 {
1217 while (start < end)
1218 (*result++) = utf8::unchecked::next(start);
1219
1220 return result;
1221 }
1222
1223 // The iterator class
1224 template <typename octet_iterator>
1225 class iterator {
1226 octet_iterator it;
1227 public:
1228 typedef utfchar32_t value_type;
1229 typedef utfchar32_t* pointer;
1230 typedef utfchar32_t& reference;
1231 typedef std::ptrdiff_t difference_type;
1232 typedef std::bidirectional_iterator_tag iterator_category;
1233 iterator () {}
1234 explicit iterator (const octet_iterator& octet_it): it(octet_it) {}
1235 // the default "big three" are OK
1236 octet_iterator base () const { return it; }
1237 utfchar32_t operator * () const
1238 {
1239 octet_iterator temp = it;
1240 return utf8::unchecked::next(temp);
1241 }
1242 bool operator == (const iterator& rhs) const
1243 {
1244 return (it == rhs.it);
1245 }
1246 bool operator != (const iterator& rhs) const
1247 {
1248 return !(operator == (rhs));
1249 }
1250 iterator& operator ++ ()
1251 {
1252 ::std::advance(it, utf8::internal::sequence_length(it));
1253 return *this;
1254 }
1255 iterator operator ++ (int)
1256 {
1257 iterator temp = *this;
1258 ::std::advance(it, utf8::internal::sequence_length(it));
1259 return temp;
1260 }
1261 iterator& operator -- ()
1262 {
1263 utf8::unchecked::prior(it);
1264 return *this;
1265 }
1266 iterator operator -- (int)
1267 {
1268 iterator temp = *this;
1269 utf8::unchecked::prior(it);
1270 return temp;
1271 }
1272 }; // class iterator
1273
1274 } // namespace utf8::unchecked
1275} // namespace utf8
1276
1277#endif // header guard