diff options
Diffstat (limited to 'src/3rdParty')
| -rwxr-xr-x | src/3rdParty/colib/LICENSE | 21 | ||||
| -rw-r--r-- | src/3rdParty/colib/ljson.c | 925 | ||||
| -rwxr-xr-x | src/3rdParty/utf8cpp.h | 1277 |
3 files changed, 2223 insertions, 0 deletions
diff --git a/src/3rdParty/colib/LICENSE b/src/3rdParty/colib/LICENSE new file mode 100755 index 0000000..e0eddeb --- /dev/null +++ b/src/3rdParty/colib/LICENSE | |||
| @@ -0,0 +1,21 @@ | |||
| 1 | MIT License | ||
| 2 | |||
| 3 | Copyright (c) 2020 colin | ||
| 4 | |||
| 5 | Permission is hereby granted, free of charge, to any person obtaining a copy | ||
| 6 | of this software and associated documentation files (the "Software"), to deal | ||
| 7 | in the Software without restriction, including without limitation the rights | ||
| 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
| 9 | copies of the Software, and to permit persons to whom the Software is | ||
| 10 | furnished to do so, subject to the following conditions: | ||
| 11 | |||
| 12 | The above copyright notice and this permission notice shall be included in all | ||
| 13 | copies or substantial portions of the Software. | ||
| 14 | |||
| 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
| 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
| 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
| 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
| 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
| 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
| 21 | SOFTWARE. | ||
diff --git a/src/3rdParty/colib/ljson.c b/src/3rdParty/colib/ljson.c new file mode 100644 index 0000000..4daba07 --- /dev/null +++ b/src/3rdParty/colib/ljson.c | |||
| @@ -0,0 +1,925 @@ | |||
| 1 | /** | ||
| 2 | * json解析器:只支持utf-8格式,Lua只支持64位的数字 | ||
| 3 | */ | ||
| 4 | #define LUA_LIB | ||
| 5 | #include <stdlib.h> | ||
| 6 | #include <string.h> | ||
| 7 | #include <stdint.h> | ||
| 8 | #include <stdio.h> | ||
| 9 | #include <ctype.h> | ||
| 10 | #include <assert.h> | ||
| 11 | #include <errno.h> | ||
| 12 | #include <setjmp.h> | ||
| 13 | #include <ctype.h> | ||
| 14 | #include <limits.h> | ||
| 15 | #include <float.h> | ||
| 16 | #include <math.h> | ||
| 17 | #include "lua.h" | ||
| 18 | #include "lauxlib.h" | ||
| 19 | |||
| 20 | #if LUA_VERSION_NUM > 501 | ||
| 21 | #ifndef LUA_COMPAT_5_1 | ||
| 22 | #ifndef lua_objlen | ||
| 23 | #define lua_objlen lua_rawlen | ||
| 24 | #endif // lua_objlen | ||
| 25 | #endif // LUA_COMPAT_5_1 | ||
| 26 | #endif // LUA_VERSION_NUM | ||
| 27 | |||
| 28 | // 内存分配函数,方便替换 | ||
| 29 | #define co_malloc malloc | ||
| 30 | #define co_free free | ||
| 31 | #define co_realloc realloc | ||
| 32 | #define co_calloc calloc | ||
| 33 | |||
| 34 | |||
| 35 | #if !defined(likely) | ||
| 36 | #if defined(__GNUC__) | ||
| 37 | #define likely(x) (__builtin_expect(((x) != 0), 1)) | ||
| 38 | #define unlikely(x) (__builtin_expect(((x) != 0), 0)) | ||
| 39 | #else | ||
| 40 | #define likely(x) (x) | ||
| 41 | #define unlikely(x) (x) | ||
| 42 | #endif | ||
| 43 | |||
| 44 | #endif | ||
| 45 | |||
| 46 | //----------------------------------------------------------------------------- | ||
| 47 | // membuffer | ||
| 48 | |||
// Size in bytes of the inline storage embedded in every membuffer.
#define STACK_BUFF_SIZE 512

// Growable byte buffer that starts out backed by its own inline array.
typedef struct membuffer {
    char   *b;                   // current storage: the inline array `s` or a heap block
    size_t  sz;                  // number of bytes currently in use
    size_t  cap;                 // total capacity of `b`
    char    s[STACK_BUFF_SIZE];  // inline storage used until the buffer has to grow
} membuffer_t;
| 57 | |||
| 58 | // 初始化buffer | ||
| 59 | static inline void membuffer_init(membuffer_t *buff) { | ||
| 60 | buff->b = buff->s; | ||
| 61 | buff->cap = STACK_BUFF_SIZE; | ||
| 62 | buff->sz = 0; | ||
| 63 | } | ||
| 64 | |||
| 65 | static inline void membuffer_add_size(membuffer_t *buff, size_t sz) { | ||
| 66 | buff->sz += sz; | ||
| 67 | } | ||
| 68 | |||
| 69 | static inline void membuffer_reset(membuffer_t *buff) { | ||
| 70 | buff->sz = 0; | ||
| 71 | } | ||
| 72 | |||
| 73 | static inline void membuffer_free(membuffer_t *buff) { | ||
| 74 | if (buff->b && buff->b != buff->s) { | ||
| 75 | co_free(buff->b); | ||
| 76 | buff->b = NULL; | ||
| 77 | } | ||
| 78 | } | ||
| 79 | |||
| 80 | static inline void _membuffer_grow(membuffer_t *buff, size_t needsz) { | ||
| 81 | if (buff->cap < needsz) { | ||
| 82 | size_t newcap = buff->cap * 2; | ||
| 83 | if (newcap < needsz) | ||
| 84 | newcap = needsz; | ||
| 85 | if (buff->b == buff->s) { | ||
| 86 | buff->b = (char*)co_malloc(newcap); | ||
| 87 | memcpy(buff->b, buff->s, buff->sz); | ||
| 88 | } else { | ||
| 89 | buff->b = (char*)co_realloc(buff->b, newcap); | ||
| 90 | } | ||
| 91 | buff->cap = newcap; | ||
| 92 | } | ||
| 93 | } | ||
| 94 | |||
| 95 | // 确保缓存中还有sz的可用空间 | ||
| 96 | static inline void membuffer_ensure_space(membuffer_t *buff, size_t sz) { | ||
| 97 | if (buff->sz + sz > buff->cap) { | ||
| 98 | _membuffer_grow(buff, buff->sz+sz); | ||
| 99 | } | ||
| 100 | } | ||
| 101 | |||
| 102 | // 压入一个字符 | ||
| 103 | static inline void membuffer_putc(membuffer_t *buff, char c) { | ||
| 104 | membuffer_ensure_space(buff, 1); | ||
| 105 | buff->b[buff->sz++] = c; | ||
| 106 | } | ||
| 107 | |||
| 108 | // 写入一段内存 | ||
| 109 | static inline void membuffer_putb(membuffer_t *buff, const void *b, size_t sz) { | ||
| 110 | membuffer_ensure_space(buff, sz); | ||
| 111 | memcpy(buff->b + buff->sz, b, sz); | ||
| 112 | buff->sz += sz; | ||
| 113 | } | ||
| 114 | |||
| 115 | // 压入一个字符:不检查空间(不安全版本) | ||
| 116 | static inline void membuffer_putc_unsafe(membuffer_t *buff, char c) { | ||
| 117 | buff->b[buff->sz++] = c; | ||
| 118 | } | ||
| 119 | |||
#if LUA_VERSION_NUM > 501
// Append `sz` bytes WITHOUT checking capacity (unsafe variant):
// the caller must have reserved the space beforehand.
static inline void membuffer_putb_unsafe(membuffer_t *buff, const void *b, size_t sz) {
    char *dest = buff->b + buff->sz;
    memcpy(dest, b, sz);
    buff->sz = buff->sz + sz;
}
#endif
| 127 | |||
| 128 | // 取当前的指针 | ||
| 129 | static inline char* membuffer_getp(membuffer_t *buff) { | ||
| 130 | return buff->b + buff->sz; | ||
| 131 | } | ||
| 132 | |||
| 133 | //----------------------------------------------------------------------------- | ||
| 134 | // parser | ||
| 135 | |||
| 136 | //------------------------------------- | ||
| 137 | // 与Lua相关的代码 | ||
| 138 | |||
| 139 | static inline void l_add_object(lua_State *L) { | ||
| 140 | luaL_checkstack(L, 6, NULL); | ||
| 141 | lua_createtable(L, 0, 4); | ||
| 142 | } | ||
| 143 | static inline void l_begin_pair(lua_State *L, const char *k, size_t sz) { | ||
| 144 | lua_pushlstring(L, k, sz); | ||
| 145 | } | ||
| 146 | static inline void l_end_pair(lua_State *L) { | ||
| 147 | lua_rawset(L, -3); | ||
| 148 | } | ||
| 149 | static inline void l_add_array(lua_State *L) { | ||
| 150 | luaL_checkstack(L, 6, NULL); | ||
| 151 | lua_createtable(L, 4, 0); | ||
| 152 | } | ||
| 153 | static inline void l_add_index(lua_State *L, int i) { | ||
| 154 | lua_rawseti(L, -2, i+1); | ||
| 155 | } | ||
| 156 | static inline void l_add_string(lua_State *L, const char *s, size_t sz) { | ||
| 157 | lua_pushlstring(L, s, sz); | ||
| 158 | } | ||
| 159 | static inline void l_add_float(lua_State *L, double f) { | ||
| 160 | lua_pushnumber(L, (lua_Number)f); | ||
| 161 | } | ||
| 162 | static inline void l_add_integer(lua_State *L, int64_t i) { | ||
| 163 | lua_pushinteger(L, (lua_Integer)i); | ||
| 164 | } | ||
| 165 | static inline void l_add_boolean(lua_State *L, int b) { | ||
| 166 | lua_pushboolean(L, b); | ||
| 167 | } | ||
| 168 | static inline void l_add_null(lua_State *L) { | ||
| 169 | lua_pushlightuserdata(L, NULL); | ||
| 170 | } | ||
| 171 | static inline void l_error(lua_State *L, const char *msg) { | ||
| 172 | luaL_error(L, msg); | ||
| 173 | } | ||
| 174 | |||
| 175 | // 解析事件 | ||
| 176 | #define ON_ADD_OBJECT(ud) l_add_object((lua_State*)(ud)) | ||
| 177 | #define ON_BEGIN_PAIR(ud, k, sz) l_begin_pair((lua_State*)(ud), k, sz) | ||
| 178 | #define ON_END_PAIR(ud) l_end_pair((lua_State*)(ud)) | ||
| 179 | #define ON_ADD_ARRAY(ud) l_add_array((lua_State*)(ud)) | ||
| 180 | #define ON_ADD_INDEX(ud, i) l_add_index((lua_State*)(ud), i) | ||
| 181 | #define ON_ADD_STRING(ud, s, sz) l_add_string((lua_State*)(ud), s, sz) | ||
| 182 | #define ON_ADD_FLOAT(ud, f) l_add_float((lua_State*)(ud), f) | ||
| 183 | #define ON_ADD_INTEGER(ud, i) l_add_integer((lua_State*)(ud), i) | ||
| 184 | #define ON_ADD_BOOLEAN(ud, b) l_add_boolean((lua_State*)(ud), b) | ||
| 185 | #define ON_ADD_NULL(ud) l_add_null((lua_State*)(ud)) | ||
| 186 | #define ON_ERROR(ud, msg) l_error((lua_State*)(ud), msg) | ||
| 187 | |||
| 188 | //------------------------------------- | ||
| 189 | // 解析json,这部分代码与Lua无关,是通用的解析器;如果要移植这部分代码,需要把 //>>> 开头的注释去掉 | ||
| 190 | |||
| 191 | // 错误消息的大小 | ||
| 192 | #define ERRMSG_SIZE 256 | ||
| 193 | |||
| 194 | // json解析器 | ||
| 195 | typedef struct { | ||
| 196 | const char *str; // json字符串 | ||
| 197 | const char *ptr; // json字符串解析指针 | ||
| 198 | void *ud; // 解析事件的用户数据 | ||
| 199 | membuffer_t buff; // 临时缓存 | ||
| 200 | int curdepth; // 当前层次 | ||
| 201 | int maxdepth; // 最大层次 | ||
| 202 | int allowcomment; // 是否允许注释 | ||
| 203 | char errmsg[ERRMSG_SIZE]; // 保存错误消息 | ||
| 204 | //>>>jmp_buf jb; // 用于实现从解析中出错直接跳出 | ||
| 205 | } json_parser_t; | ||
| 206 | |||
| 207 | static inline void parser_init(json_parser_t *parser, const char *str, size_t size, void *ud, | ||
| 208 | int maxdepth, int allowcomment) { | ||
| 209 | membuffer_init(&parser->buff); | ||
| 210 | membuffer_ensure_space(&parser->buff, size); | ||
| 211 | parser->str = str; | ||
| 212 | parser->ptr = str; | ||
| 213 | parser->ud = ud; | ||
| 214 | parser->maxdepth = maxdepth; | ||
| 215 | parser->curdepth = 0; | ||
| 216 | parser->allowcomment = allowcomment; | ||
| 217 | } | ||
| 218 | |||
| 219 | static inline void parser_free(json_parser_t *parser) { | ||
| 220 | membuffer_free(&parser->buff); | ||
| 221 | } | ||
| 222 | |||
| 223 | // 抛出错误 | ||
| 224 | static void parser_throw_error(json_parser_t *parser, const char *fmt, ...) { | ||
| 225 | membuffer_free(&parser->buff); | ||
| 226 | va_list arg; | ||
| 227 | va_start(arg, fmt); | ||
| 228 | vsnprintf(parser->errmsg, ERRMSG_SIZE, fmt, arg); | ||
| 229 | va_end(arg); | ||
| 230 | ON_ERROR(parser->ud, parser->errmsg); | ||
| 231 | // 直接跳出解析代码,由于Lua的lua_error也是用longjmp,所以下面的代码没有机会执行到。但其他语言就不一定。 | ||
| 232 | //>>>longjmp(parser->jb, 1); | ||
| 233 | } | ||
| 234 | |||
// Helper macros; `p` is a json_parser_t*.
#define peekchar(p)      (*(p)->ptr)        // look at the current character
#define skipchar(p)      (++(p)->ptr)       // advance by one character
#define get_and_next(p)  (*(p)->ptr++)      // read the current character, then advance
#define next_and_get(p)  (*(++(p)->ptr))    // advance, then read the new current character
#define savechar(p, c)   membuffer_putc_unsafe(&(p)->buff, (c))  // append to the scratch buffer, no capacity check
// Offset of the read position from the start of the text. The value is an
// unsigned long because every error message prints it with "%lu"; the whole
// expansion is parenthesized so it is safe inside larger expressions.
#define currpos(p)       ((unsigned long)((p)->ptr - (p)->str))
| 242 | |||
| 243 | // 取解析到的错误内容 | ||
| 244 | static const char* parser_error_content(json_parser_t *p) { | ||
| 245 | size_t n = currpos(p); | ||
| 246 | if (n > 50) n = 50; // 调整这个数获得更长的内容 | ||
| 247 | membuffer_reset(&p->buff); | ||
| 248 | membuffer_putb(&p->buff, p->ptr - n, n); | ||
| 249 | membuffer_putc(&p->buff, '\0'); | ||
| 250 | return p->buff.b; | ||
| 251 | } | ||
| 252 | |||
| 253 | // 增加深度 | ||
| 254 | static inline void parser_add_depth(json_parser_t *p) { | ||
| 255 | p->curdepth++; | ||
| 256 | if (p->curdepth >= p->maxdepth) | ||
| 257 | parser_throw_error(p, "Too many nested data, max depth is %d, at: %s[:%lu]", p->maxdepth, | ||
| 258 | parser_error_content(p), currpos(p)); | ||
| 259 | } | ||
| 260 | |||
| 261 | static inline void parser_skip_whitespaces(json_parser_t *p) { | ||
| 262 | // colin: 要支持注释,请将下面注释去掉 | ||
| 263 | // if (likely(!p->allowcomment)) { | ||
| 264 | char ch = peekchar(p); | ||
| 265 | while (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r') | ||
| 266 | ch = next_and_get(p); | ||
| 267 | // } else { | ||
| 268 | // char ch = peekchar(p); | ||
| 269 | // for (;;) { | ||
| 270 | // while (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r') | ||
| 271 | // ch = next_and_get(p); | ||
| 272 | // if (ch == '/') { | ||
| 273 | // ch = next_and_get(p); | ||
| 274 | // if (ch == '/') { | ||
| 275 | // ch = next_and_get(p); | ||
| 276 | // while (ch != '\n' && ch != '\r' && ch != '\0') | ||
| 277 | // ch = next_and_get(p); | ||
| 278 | // continue; | ||
| 279 | // } else { | ||
| 280 | // parser_throw_error(p, "Invalid comment, at: %s[:%lu]", parser_error_content(p), currpos(p)); | ||
| 281 | // } | ||
| 282 | // } | ||
| 283 | // break; | ||
| 284 | // } | ||
| 285 | // } | ||
| 286 | } | ||
| 287 | |||
| 288 | static inline void parser_expect_char(json_parser_t *p, char c) { | ||
| 289 | if (likely(peekchar(p) == c)) | ||
| 290 | skipchar(p); | ||
| 291 | else | ||
| 292 | parser_throw_error(p, "Expect '%c' at: %s[:%lu]", c, parser_error_content(p), currpos(p)); | ||
| 293 | } | ||
| 294 | |||
| 295 | static inline void parser_process_false(json_parser_t *p) { | ||
| 296 | if (likely(p->ptr[0] == 'a' && p->ptr[1] == 'l' && p->ptr[2] == 's' && p->ptr[3] == 'e')) { | ||
| 297 | p->ptr += 4; | ||
| 298 | ON_ADD_BOOLEAN(p->ud, 0); | ||
| 299 | } else { | ||
| 300 | parser_throw_error(p, "Invalid boolean, at: %s[:%lu]", parser_error_content(p), currpos(p)); | ||
| 301 | } | ||
| 302 | } | ||
| 303 | |||
| 304 | static inline void parser_process_true(json_parser_t *p) { | ||
| 305 | if (likely(p->ptr[0] == 'r' && p->ptr[1] == 'u' && p->ptr[2] == 'e')) { | ||
| 306 | p->ptr += 3; | ||
| 307 | ON_ADD_BOOLEAN(p->ud, 1); | ||
| 308 | } else { | ||
| 309 | parser_throw_error(p, "Invalid boolean, at: %s[:%lu]", parser_error_content(p), currpos(p)); | ||
| 310 | } | ||
| 311 | } | ||
| 312 | |||
| 313 | static inline void parser_process_null(json_parser_t *p) { | ||
| 314 | if (likely(p->ptr[0] == 'u' && p->ptr[1] == 'l' && p->ptr[2] == 'l')) { | ||
| 315 | p->ptr += 3; | ||
| 316 | ON_ADD_NULL(p->ud); | ||
| 317 | } else { | ||
| 318 | parser_throw_error(p, "Invalid null, at: %s[:%lu]", parser_error_content(p), currpos(p)); | ||
| 319 | } | ||
| 320 | } | ||
| 321 | |||
| 322 | static inline uint32_t parser_read_hex(json_parser_t *p) { | ||
| 323 | uint32_t cp = 0; | ||
| 324 | unsigned char ch; | ||
| 325 | int i = 4; | ||
| 326 | while (i--) { | ||
| 327 | ch = (unsigned char)get_and_next(p); | ||
| 328 | if ('0' <= ch && ch <= '9') | ||
| 329 | ch -= '0'; | ||
| 330 | else if (ch >= 'a' && ch <= 'f') | ||
| 331 | ch = ch - 'a' + 10; | ||
| 332 | else if (ch >= 'A' && ch <= 'F') | ||
| 333 | ch = ch - 'A' + 10; | ||
| 334 | else { | ||
| 335 | parser_throw_error(p, "Invalid utf8 escape sequence, at: %s[:%lu]", parser_error_content(p), currpos(p)); | ||
| 336 | return cp; | ||
| 337 | } | ||
| 338 | cp = (cp << 4) + ch; | ||
| 339 | } | ||
| 340 | return cp; | ||
| 341 | } | ||
| 342 | |||
| 343 | static inline void parser_process_utf8esc(json_parser_t *p) { | ||
| 344 | uint32_t cp = parser_read_hex(p); | ||
| 345 | // UTF-16 surrogate pairs, see https://unicodebook.readthedocs.io/unicode_encodings.html#utf-16-surrogate-pairs | ||
| 346 | if (cp >= 0xD800 && cp <= 0xDBFF) { | ||
| 347 | char p0 = p->ptr[0]; | ||
| 348 | char p1 = p->ptr[1]; | ||
| 349 | if (p0 != '\\' || p1 != 'u') | ||
| 350 | parser_throw_error(p, "Invalid utf8 escape sequence, at: %s[:%lu]", parser_error_content(p), currpos(p)); | ||
| 351 | p->ptr += 2; | ||
| 352 | uint32_t cp2 = parser_read_hex(p); | ||
| 353 | if (cp2 < 0xDC00 || cp2 > 0xDFFF) | ||
| 354 | parser_throw_error(p, "Invalid utf8 escape sequence, at: %s[:%lu]", parser_error_content(p), currpos(p)); | ||
| 355 | cp = 0x10000 + (((cp & 0x03FF) << 10) | (cp2 & 0x03FF)); | ||
| 356 | } | ||
| 357 | if (cp < 0x80) { | ||
| 358 | membuffer_putc_unsafe(&p->buff, (char)cp); | ||
| 359 | } else if (cp < 0x800) { | ||
| 360 | membuffer_putc_unsafe(&p->buff, 0xC0 | (cp >> 6)); | ||
| 361 | membuffer_putc_unsafe(&p->buff, 0x80 | (cp & 0x3F)); | ||
| 362 | } else if (cp < 0x10000) { | ||
| 363 | membuffer_putc_unsafe(&p->buff, 0xE0 | (cp >> 12)); | ||
| 364 | membuffer_putc_unsafe(&p->buff, 0x80 | ((cp >> 6) & 0x3F)); | ||
| 365 | membuffer_putc_unsafe(&p->buff, 0x80 | (cp & 0x3F)); | ||
| 366 | } else { | ||
| 367 | membuffer_putc_unsafe(&p->buff, 0xF0 | (cp >> 18)); | ||
| 368 | membuffer_putc_unsafe(&p->buff, 0x80 | ((cp >> 12) & 0x3F)); | ||
| 369 | membuffer_putc_unsafe(&p->buff, 0x80 | ((cp >> 6) & 0x3F)); | ||
| 370 | membuffer_putc_unsafe(&p->buff, 0x80 | (cp & 0x3F)); | ||
| 371 | } | ||
| 372 | } | ||
| 373 | |||
// Maps the character that follows a backslash to the character it stands for.
// Entries left at 0 are not simple escapes ('u' is handled separately by the caller).
static const char escape2char[256] = {
    ['"']  = '\"',
    ['/']  = '/',
    ['\\'] = '\\',
    ['b']  = '\b',
    ['f']  = '\f',
    ['n']  = '\n',
    ['r']  = '\r',
    ['t']  = '\t',
};
| 389 | |||
| 390 | static inline void parser_process_string(json_parser_t *p) { | ||
| 391 | membuffer_reset(&p->buff); | ||
| 392 | char ch = get_and_next(p); | ||
| 393 | for (;;) { | ||
| 394 | if (ch == '\\') { | ||
| 395 | unsigned char nch = (unsigned char)peekchar(p); | ||
| 396 | if (likely(escape2char[nch])) { | ||
| 397 | savechar(p, escape2char[nch]); | ||
| 398 | skipchar(p); | ||
| 399 | } else if (nch == 'u') { | ||
| 400 | skipchar(p); | ||
| 401 | parser_process_utf8esc(p); | ||
| 402 | } else { | ||
| 403 | parser_throw_error(p, "Invalid escape sequence, at: %s[:%lu]", parser_error_content(p), currpos(p)); | ||
| 404 | } | ||
| 405 | } else if (ch == '"') { | ||
| 406 | break; | ||
| 407 | } else if ((unsigned char)ch < 0x20) { | ||
| 408 | parser_throw_error(p, "Invalid string, at: %s[:%lu]", parser_error_content(p), currpos(p)); | ||
| 409 | } else { | ||
| 410 | savechar(p, ch); | ||
| 411 | } | ||
| 412 | ch = get_and_next(p); | ||
| 413 | } | ||
| 414 | } | ||
| 415 | |||
| 416 | #define invalid_number(p) parser_throw_error(p, "Invalid value, at: %s[:%lu]", parser_error_content(p), currpos(p)) | ||
| 417 | #define MAXBY10 (int64_t)(922337203685477580) | ||
| 418 | #define MAXLASTD (int)(7) | ||
| 419 | static double powersOf10[] = {10., 100., 1.0e4, 1.0e8, 1.0e16, 1.0e32, 1.0e64, 1.0e128, 1.0e256}; | ||
| 420 | static inline void parser_process_number(json_parser_t *p, char ch) { | ||
| 421 | double db; // 浮点数 | ||
| 422 | int64_t in = 0; // 整型值 | ||
| 423 | int isdouble = 0; // 是否是浮点数 | ||
| 424 | int neg = 0; // 是否是负数 | ||
| 425 | int exponent = 0; // 指数位数 | ||
| 426 | |||
| 427 | if (ch == '-') { // 负值 | ||
| 428 | neg = 1; | ||
| 429 | ch = get_and_next(p); | ||
| 430 | } | ||
| 431 | if (unlikely(ch == '0')) { // 0开头的后面只能是:.eE或\0 | ||
| 432 | ch = peekchar(p); | ||
| 433 | } else if (likely(ch >= '1' && ch <= '9')) { | ||
| 434 | in = ch - '0'; | ||
| 435 | ch = peekchar(p); | ||
| 436 | while (ch >= '0' && ch <= '9') { | ||
| 437 | if (unlikely(in >= MAXBY10 && (in > MAXBY10 || (ch - '0') > MAXLASTD + neg))) { // 更大的数字就用浮点数表示 | ||
| 438 | isdouble = 1; | ||
| 439 | db = (double)in; | ||
| 440 | do { | ||
| 441 | db = db * 10.0 + (ch - '0'); | ||
| 442 | ch = next_and_get(p); | ||
| 443 | } while (ch >= '0' && ch <= '9'); | ||
| 444 | break; | ||
| 445 | } | ||
| 446 | in = in * 10 + (ch - '0'); | ||
| 447 | ch = next_and_get(p); | ||
| 448 | } | ||
| 449 | } else { | ||
| 450 | invalid_number(p); | ||
| 451 | } | ||
| 452 | |||
| 453 | if (ch == '.') { // 小数点部分 | ||
| 454 | if (likely(!isdouble)) { | ||
| 455 | isdouble = 1; | ||
| 456 | db = (double)in; | ||
| 457 | } | ||
| 458 | ch = next_and_get(p); | ||
| 459 | if (unlikely(!(ch >= '0' && ch <= '9'))) | ||
| 460 | invalid_number(p); // .后面一定是数字 | ||
| 461 | do { | ||
| 462 | db = db * 10. + (ch - '0'); | ||
| 463 | exponent--; | ||
| 464 | ch = next_and_get(p); | ||
| 465 | } while (ch >= '0' && ch <= '9'); | ||
| 466 | } | ||
| 467 | |||
| 468 | if (ch == 'e' || ch == 'E') { // 指数部分 | ||
| 469 | if (!isdouble) { // 有e强制认为是浮点数 | ||
| 470 | isdouble = 1; | ||
| 471 | db = (double)in; | ||
| 472 | } | ||
| 473 | ch = next_and_get(p); | ||
| 474 | int eneg = 0; | ||
| 475 | if (ch == '-') { | ||
| 476 | eneg = 1; | ||
| 477 | ch = next_and_get(p); | ||
| 478 | } else if (ch == '+') { | ||
| 479 | ch = next_and_get(p); | ||
| 480 | } | ||
| 481 | if (unlikely(!(ch >= '0' && ch <= '9'))) | ||
| 482 | invalid_number(p); // 后面一定是数字 | ||
| 483 | int exp = 0; | ||
| 484 | do { | ||
| 485 | exp = exp * 10. + (ch - '0'); | ||
| 486 | ch = next_and_get(p); | ||
| 487 | } while (ch >= '0' && ch <= '9'); | ||
| 488 | exponent += eneg ? (-exp) : (exp); | ||
| 489 | } | ||
| 490 | |||
| 491 | if (isdouble) { | ||
| 492 | int n = exponent < 0 ? -exponent : exponent; | ||
| 493 | if (unlikely(n>511)) | ||
| 494 | n = 511; // inf | ||
| 495 | double p10 = 1.0; | ||
| 496 | double *d; | ||
| 497 | for (d = powersOf10; n != 0; n >>= 1, d += 1) { | ||
| 498 | if (n & 1) p10 *= *d; | ||
| 499 | } | ||
| 500 | if (exponent < 0) | ||
| 501 | db /= p10; | ||
| 502 | else | ||
| 503 | db *= p10; | ||
| 504 | if (neg) db = -db; | ||
| 505 | ON_ADD_FLOAT(p->ud, db); | ||
| 506 | } else { | ||
| 507 | if (neg) in = -in; | ||
| 508 | ON_ADD_INTEGER(p->ud, in); | ||
| 509 | } | ||
| 510 | } | ||
| 511 | |||
| 512 | static void parser_process_value(json_parser_t *p); | ||
| 513 | |||
| 514 | static inline void parser_process_object(json_parser_t *p) { | ||
| 515 | parser_add_depth(p); | ||
| 516 | ON_ADD_OBJECT(p->ud); | ||
| 517 | parser_skip_whitespaces(p); | ||
| 518 | char ch = peekchar(p); | ||
| 519 | if (ch == '}') { | ||
| 520 | skipchar(p); | ||
| 521 | p->curdepth--; | ||
| 522 | return; | ||
| 523 | } | ||
| 524 | for (;;) { | ||
| 525 | parser_expect_char(p, '"'); | ||
| 526 | parser_process_string(p); // key | ||
| 527 | ON_BEGIN_PAIR(p->ud, p->buff.b, p->buff.sz); | ||
| 528 | |||
| 529 | parser_skip_whitespaces(p); | ||
| 530 | parser_expect_char(p, ':'); | ||
| 531 | |||
| 532 | parser_process_value(p); // value | ||
| 533 | ON_END_PAIR(p->ud); | ||
| 534 | |||
| 535 | parser_skip_whitespaces(p); | ||
| 536 | if (peekchar(p) == '}') { | ||
| 537 | skipchar(p); | ||
| 538 | p->curdepth--; | ||
| 539 | return; | ||
| 540 | } | ||
| 541 | else { | ||
| 542 | parser_expect_char(p, ','); | ||
| 543 | parser_skip_whitespaces(p); | ||
| 544 | } | ||
| 545 | } | ||
| 546 | } | ||
| 547 | |||
| 548 | static inline void parser_process_array(json_parser_t *p) { | ||
| 549 | parser_add_depth(p); | ||
| 550 | ON_ADD_ARRAY(p->ud); | ||
| 551 | parser_skip_whitespaces(p); | ||
| 552 | char ch = peekchar(p); | ||
| 553 | if (ch == ']') { | ||
| 554 | skipchar(p); | ||
| 555 | p->curdepth--; | ||
| 556 | return; | ||
| 557 | } | ||
| 558 | int i; | ||
| 559 | for (i = 0; ;++i) { | ||
| 560 | parser_process_value(p); | ||
| 561 | ON_ADD_INDEX(p->ud, i); | ||
| 562 | |||
| 563 | parser_skip_whitespaces(p); | ||
| 564 | if (peekchar(p) == ']') { | ||
| 565 | skipchar(p); | ||
| 566 | p->curdepth--; | ||
| 567 | return; | ||
| 568 | } | ||
| 569 | else { | ||
| 570 | parser_expect_char(p, ','); | ||
| 571 | } | ||
| 572 | } | ||
| 573 | } | ||
| 574 | |||
| 575 | static void parser_process_value(json_parser_t *p) { | ||
| 576 | parser_skip_whitespaces(p); | ||
| 577 | char ch = get_and_next(p); | ||
| 578 | switch (ch) { | ||
| 579 | case 'f': | ||
| 580 | parser_process_false(p); | ||
| 581 | break; | ||
| 582 | case 't': | ||
| 583 | parser_process_true(p); | ||
| 584 | break; | ||
| 585 | case 'n': | ||
| 586 | parser_process_null(p); | ||
| 587 | break; | ||
| 588 | case '"': | ||
| 589 | parser_process_string(p); | ||
| 590 | ON_ADD_STRING(p->ud, p->buff.b, p->buff.sz); | ||
| 591 | break; | ||
| 592 | case '{': | ||
| 593 | parser_process_object(p); | ||
| 594 | break; | ||
| 595 | case '[': | ||
| 596 | parser_process_array(p); | ||
| 597 | break; | ||
| 598 | default: | ||
| 599 | parser_process_number(p, ch); | ||
| 600 | break; | ||
| 601 | } | ||
| 602 | } | ||
| 603 | |||
| 604 | // 解析json文本 | ||
| 605 | static void parser_do_parse(const char *str, size_t size, void *ud, int maxdepth, int allowcomment) { | ||
| 606 | json_parser_t p; | ||
| 607 | parser_init(&p, str, size, ud, maxdepth, allowcomment); | ||
| 608 | //>>>if (setjmp(p.jb) == 0) { | ||
| 609 | parser_process_value(&p); | ||
| 610 | parser_skip_whitespaces(&p); | ||
| 611 | if (peekchar(&p) != '\0') { | ||
| 612 | parser_throw_error(&p, "Expect '<eof>' but got '%c', at: %s[:%lu]", peekchar(&p), | ||
| 613 | parser_error_content(&p), currpos(&p)); | ||
| 614 | } | ||
| 615 | parser_free(&p); | ||
| 616 | //>>>} | ||
| 617 | } | ||
| 618 | |||
| 619 | //----------------------------------------------------------------------------- | ||
| 620 | // dumpper | ||
| 621 | |||
| 622 | typedef struct { | ||
| 623 | membuffer_t buff; // 临时缓存 | ||
| 624 | int maxdepth; // 最大层次 | ||
| 625 | int format; // 是否格式化 | ||
| 626 | int empty_as_array; // 空表是否当成数组 | ||
| 627 | int num_as_str; // 数字Key转为字符串 | ||
| 628 | char errmsg[ERRMSG_SIZE]; // 保存错误消息 | ||
| 629 | } json_dumpper_t; | ||
| 630 | |||
| 631 | // 足够转换数字的缓存大小 | ||
| 632 | #define NUMBER_BUFF_SZ 44 | ||
| 633 | #define INTEGER_BUFF_SZ 24 | ||
| 634 | |||
| 635 | // 抛出错误 | ||
| 636 | static void dumpper_throw_error(json_dumpper_t *d, lua_State *L, const char *fmt, ...) { | ||
| 637 | membuffer_free(&d->buff); | ||
| 638 | va_list arg; | ||
| 639 | va_start(arg, fmt); | ||
| 640 | vsnprintf(d->errmsg, ERRMSG_SIZE, fmt, arg); | ||
| 641 | va_end(arg); | ||
| 642 | luaL_error(L, d->errmsg); | ||
| 643 | } | ||
| 644 | |||
#if LUA_VERSION_NUM > 501
// Append the Lua integer at stack index `idx` to the output in decimal.
// Digits are produced right-to-left into a local array and then copied out.
static void dumpper_process_integer(json_dumpper_t *d, lua_State *L, int idx) {
    char digits[INTEGER_BUFF_SZ];
    char *const end = digits + INTEGER_BUFF_SZ;
    char *cursor = end;
    membuffer_ensure_space(&d->buff, INTEGER_BUFF_SZ);

    const int64_t value = (int64_t)lua_tointeger(L, idx);
    uint64_t magnitude = (uint64_t)value;
    if (value < 0) {
        membuffer_putc_unsafe(&d->buff, '-');
        magnitude = 0 - magnitude;  // two's-complement negation in unsigned arithmetic
    }
    do {
        *--cursor = (char)((magnitude % 10) + '0');
        magnitude /= 10;
    } while (magnitude != 0);
    membuffer_putb_unsafe(&d->buff, cursor, (size_t)(end - cursor));
}
#endif
| 662 | |||
| 663 | static void dumpper_process_number(json_dumpper_t *d, lua_State *L, int idx) { | ||
| 664 | lua_Number num = lua_tonumber(L, idx); | ||
| 665 | if (isinf(num) || isnan(num)) | ||
| 666 | dumpper_throw_error(d, L, "The number is NaN or Infinity"); | ||
| 667 | membuffer_ensure_space(&d->buff, NUMBER_BUFF_SZ); | ||
| 668 | char *p = membuffer_getp(&d->buff); | ||
| 669 | int len = sprintf(p, LUA_NUMBER_FMT, num); | ||
| 670 | membuffer_add_size(&d->buff, len); | ||
| 671 | } | ||
| 672 | |||
// Escape table for dumping strings, indexed by byte value.
// 0 means "emit the byte unchanged"; 'u' means "emit as a \u00XX escape";
// any other value is the character to emit after a backslash.
static const char char2escape[256] = {
    [0x00] = 'u', [0x01] = 'u', [0x02] = 'u', [0x03] = 'u',
    [0x04] = 'u', [0x05] = 'u', [0x06] = 'u', [0x07] = 'u',
    [0x08] = 'b', [0x09] = 't', [0x0A] = 'n', [0x0B] = 'u',
    [0x0C] = 'f', [0x0D] = 'r', [0x0E] = 'u', [0x0F] = 'u',
    [0x10] = 'u', [0x11] = 'u', [0x12] = 'u', [0x13] = 'u',
    [0x14] = 'u', [0x15] = 'u', [0x16] = 'u', [0x17] = 'u',
    [0x18] = 'u', [0x19] = 'u', [0x1A] = 'u', [0x1B] = 'u',
    [0x1C] = 'u', [0x1D] = 'u', [0x1E] = 'u', [0x1F] = 'u',
    ['"']  = '"',
    ['\\'] = '\\',
    [0x7F] = 'u',
};
// Upper-case hexadecimal digits, indexed by nibble value.
static const char hex_digits[16] = {
    '0', '1', '2', '3', '4', '5', '6', '7',
    '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
};
| 690 | |||
| 691 | static void dumpper_process_string(json_dumpper_t *d, lua_State *L, int idx) { | ||
| 692 | membuffer_t *buff = &d->buff; | ||
| 693 | size_t len, i; | ||
| 694 | const char *str = lua_tolstring(L, idx, &len); | ||
| 695 | membuffer_ensure_space(buff, len * 6 + 2); | ||
| 696 | membuffer_putc_unsafe(buff, '\"'); | ||
| 697 | char esc; | ||
| 698 | unsigned char ch; | ||
| 699 | for (i = 0; i < len; ++i) { | ||
| 700 | ch = (unsigned char)str[i]; | ||
| 701 | esc = char2escape[ch]; | ||
| 702 | if (likely(!esc)) | ||
| 703 | membuffer_putc_unsafe(buff, (char)ch); | ||
| 704 | else { | ||
| 705 | membuffer_putc_unsafe(buff, '\\'); | ||
| 706 | membuffer_putc_unsafe(buff, esc); | ||
| 707 | if (esc == 'u') { | ||
| 708 | membuffer_putc_unsafe(buff, '0'); | ||
| 709 | membuffer_putc_unsafe(buff, '0'); | ||
| 710 | membuffer_putc_unsafe(buff, hex_digits[(unsigned char)esc >> 4]); | ||
| 711 | membuffer_putc_unsafe(buff, hex_digits[(unsigned char)esc & 0xF]); | ||
| 712 | } | ||
| 713 | } | ||
| 714 | } | ||
| 715 | membuffer_putc_unsafe(buff, '\"'); | ||
| 716 | } | ||
| 717 | |||
| 718 | static void dumpper_process_value(json_dumpper_t *d, lua_State *L, int depth); | ||
| 719 | |||
| 720 | static int dumpper_check_array(json_dumpper_t *d, lua_State *L, int *len) { | ||
| 721 | int asize = lua_objlen(L, -1); | ||
| 722 | if (asize > 0) { | ||
| 723 | lua_pushinteger(L, asize); | ||
| 724 | if (lua_next(L, -2) == 0) { | ||
| 725 | *len = asize; | ||
| 726 | return 1; | ||
| 727 | } else { | ||
| 728 | lua_pop(L, 2); | ||
| 729 | return 0; | ||
| 730 | } | ||
| 731 | } else { | ||
| 732 | lua_pushnil(L); | ||
| 733 | if (lua_next(L, -2) == 0) { | ||
| 734 | *len = asize; | ||
| 735 | return d->empty_as_array; | ||
| 736 | } else { | ||
| 737 | lua_pop(L, 2); | ||
| 738 | return 0; | ||
| 739 | } | ||
| 740 | } | ||
| 741 | } | ||
| 742 | |||
| 743 | static inline void dumpper_add_indent(json_dumpper_t *d, int count) { | ||
| 744 | membuffer_ensure_space(&d->buff, count); | ||
| 745 | int i; | ||
| 746 | for (i = 0; i < count; ++i) | ||
| 747 | membuffer_putc_unsafe(&d->buff, '\t'); | ||
| 748 | } | ||
| 749 | |||
| 750 | static void dumpper_process_array(json_dumpper_t *d, lua_State *L, int len, int depth) { | ||
| 751 | membuffer_t *buff = &d->buff; | ||
| 752 | membuffer_putc(buff, '['); | ||
| 753 | |||
| 754 | int i; | ||
| 755 | for (i = 1; i <= len; ++i) { | ||
| 756 | if (unlikely(d->format && i == 1)) membuffer_putc(buff, '\n'); | ||
| 757 | lua_rawgeti(L, -1, i); | ||
| 758 | if (unlikely(d->format)) dumpper_add_indent(d, depth); | ||
| 759 | dumpper_process_value(d, L, depth); | ||
| 760 | lua_pop(L, 1); | ||
| 761 | if (i < len) | ||
| 762 | membuffer_putc(buff, ','); | ||
| 763 | if (unlikely(d->format)) membuffer_putc(buff, '\n'); | ||
| 764 | } | ||
| 765 | |||
| 766 | if (unlikely(d->format && i > 1)) dumpper_add_indent(d, depth-1); | ||
| 767 | membuffer_putc(buff, ']'); | ||
| 768 | } | ||
| 769 | |||
| 770 | static void dumpper_process_object(json_dumpper_t *d, lua_State *L, int depth) { | ||
| 771 | membuffer_t *buff = &d->buff; | ||
| 772 | membuffer_putc(buff, '{'); | ||
| 773 | |||
| 774 | int ktp; | ||
| 775 | int comma = 0; | ||
| 776 | lua_pushnil(L); // t nil | ||
| 777 | while (lua_next(L, -2) != 0) { // t k v | ||
| 778 | if (comma) { | ||
| 779 | membuffer_putc(buff, ','); | ||
| 780 | if (unlikely(d->format)) membuffer_putc(buff, '\n'); | ||
| 781 | } else { | ||
| 782 | comma = 1; | ||
| 783 | if (unlikely(d->format)) membuffer_putc(buff, '\n'); | ||
| 784 | } | ||
| 785 | // key | ||
| 786 | ktp = lua_type(L, -2); | ||
| 787 | if (ktp == LUA_TSTRING) { | ||
| 788 | if (unlikely(d->format)) dumpper_add_indent(d, depth); | ||
| 789 | dumpper_process_string(d, L, -2); | ||
| 790 | if (likely(!d->format)) | ||
| 791 | membuffer_putc(buff, ':'); | ||
| 792 | else | ||
| 793 | membuffer_putb(buff, " : ", 3); | ||
| 794 | } else if (ktp == LUA_TNUMBER && d->num_as_str) { | ||
| 795 | if (unlikely(d->format)) dumpper_add_indent(d, depth); | ||
| 796 | membuffer_putc(buff, '\"'); | ||
| 797 | #if LUA_VERSION_NUM > 501 | ||
| 798 | if (lua_isinteger(L, -2)) | ||
| 799 | dumpper_process_integer(d, L, -2); | ||
| 800 | else | ||
| 801 | #endif | ||
| 802 | dumpper_process_number(d, L, -2); | ||
| 803 | if (likely(!d->format)) | ||
| 804 | membuffer_putb(buff, "\":", 2); | ||
| 805 | else | ||
| 806 | membuffer_putb(buff, "\" : ", 4); | ||
| 807 | } else { | ||
| 808 | dumpper_throw_error(d, L, "Table key must be a string"); | ||
| 809 | } | ||
| 810 | // value | ||
| 811 | dumpper_process_value(d, L, depth); | ||
| 812 | lua_pop(L, 1); | ||
| 813 | } | ||
| 814 | if (unlikely(d->format && comma)) { | ||
| 815 | membuffer_putc(buff, '\n'); | ||
| 816 | dumpper_add_indent(d, depth-1); | ||
| 817 | } | ||
| 818 | membuffer_putc(buff, '}'); | ||
| 819 | } | ||
| 820 | |||
| 821 | static inline void dumpper_process_table(json_dumpper_t *d, lua_State *L, int depth) { | ||
| 822 | depth++; | ||
| 823 | if (depth > d->maxdepth) | ||
| 824 | dumpper_throw_error(d, L, "Too many nested data, max depth is %d", d->maxdepth); | ||
| 825 | luaL_checkstack(L, 6, NULL); | ||
| 826 | |||
| 827 | int len; | ||
| 828 | if (dumpper_check_array(d, L, &len)) | ||
| 829 | dumpper_process_array(d, L, len, depth); | ||
| 830 | else | ||
| 831 | dumpper_process_object(d, L, depth); | ||
| 832 | } | ||
| 833 | |||
| 834 | static void dumpper_process_value(json_dumpper_t *d, lua_State *L, int depth) { | ||
| 835 | int tp = lua_type(L, -1); | ||
| 836 | switch (tp) { | ||
| 837 | case LUA_TSTRING: | ||
| 838 | dumpper_process_string(d, L, -1); | ||
| 839 | break; | ||
| 840 | case LUA_TNUMBER: | ||
| 841 | #if LUA_VERSION_NUM > 501 | ||
| 842 | if (lua_isinteger(L, -1)) | ||
| 843 | dumpper_process_integer(d, L, -1); | ||
| 844 | else | ||
| 845 | #endif | ||
| 846 | dumpper_process_number(d, L, -1); | ||
| 847 | break; | ||
| 848 | case LUA_TBOOLEAN: | ||
| 849 | if (lua_toboolean(L, -1)) | ||
| 850 | membuffer_putb(&d->buff, "true", 4); | ||
| 851 | else | ||
| 852 | membuffer_putb(&d->buff, "false", 5); | ||
| 853 | break; | ||
| 854 | case LUA_TTABLE: | ||
| 855 | dumpper_process_table(d, L, depth); | ||
| 856 | break; | ||
| 857 | case LUA_TNIL: | ||
| 858 | membuffer_putb(&d->buff, "null", 4); | ||
| 859 | break; | ||
| 860 | case LUA_TLIGHTUSERDATA: | ||
| 861 | if (lua_touserdata(L, -1) == NULL) { | ||
| 862 | membuffer_putb(&d->buff, "null", 4); | ||
| 863 | break; | ||
| 864 | } | ||
| 865 | goto error; | ||
| 866 | default: | ||
| 867 | error: | ||
| 868 | dumpper_throw_error(d, L, "Unsupport type %s", lua_typename(L, tp)); | ||
| 869 | } | ||
| 870 | } | ||
| 871 | |||
| 872 | //----------------------------------------------------------------------------- | ||
| 873 | // 接口 | ||
| 874 | #define DEF_MAX_DEPTH 128 | ||
| 875 | |||
| 876 | // 从字符串加载:json.decode(str, maxdepth) -> obj | ||
| 877 | // 要求字符串必须以0结尾 | ||
| 878 | int colibc_json_decode(lua_State *L) { | ||
| 879 | size_t size; | ||
| 880 | const char *str = luaL_checklstring(L, 1, &size); | ||
| 881 | int maxdepth = (int)luaL_optinteger(L, 2, DEF_MAX_DEPTH); | ||
| 882 | int allowcomment = lua_toboolean(L, 3); | ||
| 883 | parser_do_parse(str, size, L, maxdepth, allowcomment); | ||
| 884 | return 1; | ||
| 885 | } | ||
| 886 | |||
| 887 | // 保存到字符串: json.encode(obj) -> str | ||
| 888 | int colibc_json_encode(lua_State *L) { | ||
| 889 | luaL_checkany(L, 1); | ||
| 890 | json_dumpper_t dumpper; | ||
| 891 | membuffer_init(&dumpper.buff); | ||
| 892 | dumpper.format = lua_toboolean(L, 2); | ||
| 893 | dumpper.empty_as_array = lua_toboolean(L, 3); | ||
| 894 | dumpper.num_as_str = lua_toboolean(L, 4); | ||
| 895 | dumpper.maxdepth = (int)luaL_optinteger(L, 5, DEF_MAX_DEPTH); | ||
| 896 | |||
| 897 | lua_settop(L, 1); | ||
| 898 | dumpper_process_value(&dumpper, L, 0); | ||
| 899 | lua_pushlstring(L, dumpper.buff.b, dumpper.buff.sz); | ||
| 900 | membuffer_free(&dumpper.buff); | ||
| 901 | return 1; | ||
| 902 | } | ||
| 903 | |||
| 904 | static const luaL_Reg lib[] = { | ||
| 905 | {"decode", colibc_json_decode}, | ||
| 906 | {"encode", colibc_json_encode}, | ||
| 907 | {NULL, NULL}, | ||
| 908 | }; | ||
| 909 | |||
| 910 | LUALIB_API int luaopen_colibc_json(lua_State* L) { | ||
| 911 | #if LUA_VERSION_NUM > 501 | ||
| 912 | luaL_newlib(L, lib); // json | ||
| 913 | #else | ||
| 914 | lua_getglobal(L, "package"); // package | ||
| 915 | lua_getfield(L, -1, "loaded"); // package loaded | ||
| 916 | lua_createtable(L, 0, 0); // package loaded json | ||
| 917 | lua_pushvalue(L, -1); // package loaded json json | ||
| 918 | lua_setfield(L, -3, "cojson"); // loaded["cojson"] = json, package loaded json | ||
| 919 | luaL_register(L, NULL, lib); // package loaded json | ||
| 920 | #endif | ||
| 921 | // json.null | ||
| 922 | lua_pushlightuserdata(L, NULL); | ||
| 923 | lua_setfield(L, -2, "null"); | ||
| 924 | return 1; | ||
| 925 | } | ||
diff --git a/src/3rdParty/utf8cpp.h b/src/3rdParty/utf8cpp.h new file mode 100755 index 0000000..76f0fa1 --- /dev/null +++ b/src/3rdParty/utf8cpp.h | |||
| @@ -0,0 +1,1277 @@ | |||
| 1 | // Copyright 2006 Nemanja Trifunovic | ||
| 2 | |||
| 3 | /* | ||
| 4 | Permission is hereby granted, free of charge, to any person or organization | ||
| 5 | obtaining a copy of the software and accompanying documentation covered by | ||
| 6 | this license (the "Software") to use, reproduce, display, distribute, | ||
| 7 | execute, and transmit the Software, and to prepare derivative works of the | ||
| 8 | Software, and to permit third-parties to whom the Software is furnished to | ||
| 9 | do so, all subject to the following: | ||
| 10 | |||
| 11 | The copyright notices in the Software and this entire statement, including | ||
| 12 | the above license grant, this restriction and the following disclaimer, | ||
| 13 | must be included in all copies of the Software, in whole or in part, and | ||
| 14 | all derivative works of the Software, unless such copies or derivative | ||
| 15 | works are solely in the form of machine-executable object code generated by | ||
| 16 | a source language processor. | ||
| 17 | |||
| 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
| 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
| 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT | ||
| 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE | ||
| 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, | ||
| 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | ||
| 24 | DEALINGS IN THE SOFTWARE. | ||
| 25 | */ | ||
| 26 | |||
| 27 | |||
| 28 | #ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 | ||
| 29 | #define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 | ||
| 30 | |||
| 31 | /* | ||
| 32 | To control the C++ language version used by the library, you can define UTF_CPP_CPLUSPLUS macro | ||
| 33 | and set it to one of the values used by the __cplusplus predefined macro. | ||
| 34 | |||
| 35 | For instance, | ||
| 36 | #define UTF_CPP_CPLUSPLUS 199711L | ||
| 37 | will cause the UTF-8 CPP library to use only types and language features available in the C++ 98 standard. | ||
| 38 | Some library features will be disabled. | ||
| 39 | |||
| 40 | If you leave UTF_CPP_CPLUSPLUS undefined, it will be internally assigned to __cplusplus. | ||
| 41 | */ | ||
| 42 | |||
| 43 | #include <iterator> | ||
| 44 | #include <cstring> | ||
| 45 | #include <string> | ||
| 46 | |||
| 47 | // Determine the C++ standard version. | ||
| 48 | // If the user defines UTF_CPP_CPLUSPLUS, use that. | ||
| 49 | // Otherwise, trust the unreliable predefined macro __cplusplus | ||
| 50 | |||
| 51 | #if !defined UTF_CPP_CPLUSPLUS | ||
| 52 | #define UTF_CPP_CPLUSPLUS __cplusplus | ||
| 53 | #endif | ||
| 54 | |||
| 55 | #if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later | ||
| 56 | #define UTF_CPP_OVERRIDE override | ||
| 57 | #define UTF_CPP_NOEXCEPT noexcept | ||
| 58 | #define UTF_CPP_STATIC_ASSERT(condition) static_assert(condition, "UTFCPP static assert"); | ||
| 59 | #else // C++ 98/03 | ||
| 60 | #define UTF_CPP_OVERRIDE | ||
| 61 | #define UTF_CPP_NOEXCEPT throw() | ||
| 62 | // Simulate static_assert: | ||
| 63 | template<bool> struct UtfCppCompileTimeAssert; | ||
| 64 | template<> struct UtfCppCompileTimeAssert <true> { }; | ||
| 65 | #define UTF_CPP_STATIC_ASSERT(condition) (UtfCppCompileTimeAssert <(condition) != 0>()) | ||
| 66 | #endif // C++ 11 or later | ||
| 67 | |||
| 68 | |||
| 69 | namespace utf8 | ||
| 70 | { | ||
| 71 | // The typedefs for 8-bit, 16-bit and 32-bit code units | ||
| 72 | #if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later | ||
| 73 | #if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later | ||
| 74 | typedef char8_t utfchar8_t; | ||
| 75 | #else // C++ 11/14/17 | ||
| 76 | typedef unsigned char utfchar8_t; | ||
| 77 | #endif | ||
| 78 | typedef char16_t utfchar16_t; | ||
| 79 | typedef char32_t utfchar32_t; | ||
| 80 | #else // C++ 98/03 | ||
| 81 | typedef unsigned char utfchar8_t; | ||
| 82 | typedef unsigned short utfchar16_t; | ||
| 83 | typedef unsigned int utfchar32_t; | ||
| 84 | #endif // C++ 11 or later | ||
| 85 | |||
| 86 | // Helper code - not intended to be directly called by the library users. May be changed at any time | ||
| 87 | namespace internal | ||
| 88 | { | ||
| 89 | // Unicode constants | ||
| 90 | // Leading (high) surrogates: 0xd800 - 0xdbff | ||
| 91 | // Trailing (low) surrogates: 0xdc00 - 0xdfff | ||
| 92 | const utfchar16_t LEAD_SURROGATE_MIN = 0xd800u; | ||
| 93 | const utfchar16_t LEAD_SURROGATE_MAX = 0xdbffu; | ||
| 94 | const utfchar16_t TRAIL_SURROGATE_MIN = 0xdc00u; | ||
| 95 | const utfchar16_t TRAIL_SURROGATE_MAX = 0xdfffu; | ||
| 96 | const utfchar16_t LEAD_OFFSET = 0xd7c0u; // LEAD_SURROGATE_MIN - (0x10000 >> 10) | ||
| 97 | const utfchar32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN | ||
| 98 | |||
| 99 | // Maximum valid value for a Unicode code point | ||
| 100 | const utfchar32_t CODE_POINT_MAX = 0x0010ffffu; | ||
| 101 | |||
| 102 | template<typename octet_type> | ||
| 103 | inline utfchar8_t mask8(octet_type oc) | ||
| 104 | { | ||
| 105 | return static_cast<utfchar8_t>(0xff & oc); | ||
| 106 | } | ||
| 107 | |||
| 108 | template<typename u16_type> | ||
| 109 | inline utfchar16_t mask16(u16_type oc) | ||
| 110 | { | ||
| 111 | return static_cast<utfchar16_t>(0xffff & oc); | ||
| 112 | } | ||
| 113 | |||
| 114 | template<typename octet_type> | ||
| 115 | inline bool is_trail(octet_type oc) | ||
| 116 | { | ||
| 117 | return ((utf8::internal::mask8(oc) >> 6) == 0x2); | ||
| 118 | } | ||
| 119 | |||
| 120 | inline bool is_lead_surrogate(utfchar32_t cp) | ||
| 121 | { | ||
| 122 | return (cp >= static_cast<utfchar32_t>(LEAD_SURROGATE_MIN) && cp <= static_cast<utfchar32_t>(LEAD_SURROGATE_MAX)); | ||
| 123 | } | ||
| 124 | |||
| 125 | inline bool is_trail_surrogate(utfchar32_t cp) | ||
| 126 | { | ||
| 127 | return (cp >= static_cast<utfchar32_t>(TRAIL_SURROGATE_MIN) && cp <= static_cast<utfchar32_t>(TRAIL_SURROGATE_MAX)); | ||
| 128 | } | ||
| 129 | |||
| 130 | inline bool is_surrogate(utfchar32_t cp) | ||
| 131 | { | ||
| 132 | return (cp >= static_cast<utfchar32_t>(LEAD_SURROGATE_MIN) && cp <= static_cast<utfchar32_t>(TRAIL_SURROGATE_MAX)); | ||
| 133 | } | ||
| 134 | |||
| 135 | inline bool is_code_point_valid(utfchar32_t cp) | ||
| 136 | { | ||
| 137 | return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); | ||
| 138 | } | ||
| 139 | |||
| 140 | inline bool is_in_bmp(utfchar32_t cp) | ||
| 141 | { | ||
| 142 | return cp < utfchar32_t(0x10000); | ||
| 143 | } | ||
| 144 | |||
| 145 | template <typename octet_iterator> | ||
| 146 | int sequence_length(octet_iterator lead_it) | ||
| 147 | { | ||
| 148 | const utfchar8_t lead = utf8::internal::mask8(*lead_it); | ||
| 149 | if (lead < 0x80) | ||
| 150 | return 1; | ||
| 151 | else if ((lead >> 5) == 0x6) | ||
| 152 | return 2; | ||
| 153 | else if ((lead >> 4) == 0xe) | ||
| 154 | return 3; | ||
| 155 | else if ((lead >> 3) == 0x1e) | ||
| 156 | return 4; | ||
| 157 | else | ||
| 158 | return 0; | ||
| 159 | } | ||
| 160 | |||
| 161 | inline bool is_overlong_sequence(utfchar32_t cp, int length) | ||
| 162 | { | ||
| 163 | if (cp < 0x80) { | ||
| 164 | if (length != 1) | ||
| 165 | return true; | ||
| 166 | } | ||
| 167 | else if (cp < 0x800) { | ||
| 168 | if (length != 2) | ||
| 169 | return true; | ||
| 170 | } | ||
| 171 | else if (cp < 0x10000) { | ||
| 172 | if (length != 3) | ||
| 173 | return true; | ||
| 174 | } | ||
| 175 | return false; | ||
| 176 | } | ||
| 177 | |||
// Result codes used by the internal decoding/validation helpers.
enum utf_error {
    UTF8_OK,
    NOT_ENOUGH_ROOM,
    INVALID_LEAD,
    INCOMPLETE_SEQUENCE,
    OVERLONG_SEQUENCE,
    INVALID_CODE_POINT
};
| 179 | |||
| 180 | /// Helper for get_sequence_x | ||
| 181 | template <typename octet_iterator> | ||
| 182 | utf_error increase_safely(octet_iterator& it, const octet_iterator end) | ||
| 183 | { | ||
| 184 | if (++it == end) | ||
| 185 | return NOT_ENOUGH_ROOM; | ||
| 186 | |||
| 187 | if (!utf8::internal::is_trail(*it)) | ||
| 188 | return INCOMPLETE_SEQUENCE; | ||
| 189 | |||
| 190 | return UTF8_OK; | ||
| 191 | } | ||
| 192 | |||
| 193 | #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;} | ||
| 194 | |||
| 195 | /// get_sequence_x functions decode utf-8 sequences of the length x | ||
| 196 | template <typename octet_iterator> | ||
| 197 | utf_error get_sequence_1(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) | ||
| 198 | { | ||
| 199 | if (it == end) | ||
| 200 | return NOT_ENOUGH_ROOM; | ||
| 201 | |||
| 202 | code_point = static_cast<utfchar32_t>(utf8::internal::mask8(*it)); | ||
| 203 | |||
| 204 | return UTF8_OK; | ||
| 205 | } | ||
| 206 | |||
| 207 | template <typename octet_iterator> | ||
| 208 | utf_error get_sequence_2(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) | ||
| 209 | { | ||
| 210 | if (it == end) | ||
| 211 | return NOT_ENOUGH_ROOM; | ||
| 212 | |||
| 213 | code_point = static_cast<utfchar32_t>(utf8::internal::mask8(*it)); | ||
| 214 | |||
| 215 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) | ||
| 216 | |||
| 217 | code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); | ||
| 218 | |||
| 219 | return UTF8_OK; | ||
| 220 | } | ||
| 221 | |||
| 222 | template <typename octet_iterator> | ||
| 223 | utf_error get_sequence_3(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) | ||
| 224 | { | ||
| 225 | if (it == end) | ||
| 226 | return NOT_ENOUGH_ROOM; | ||
| 227 | |||
| 228 | code_point = static_cast<utfchar32_t>(utf8::internal::mask8(*it)); | ||
| 229 | |||
| 230 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) | ||
| 231 | |||
| 232 | code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); | ||
| 233 | |||
| 234 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) | ||
| 235 | |||
| 236 | code_point = static_cast<utfchar32_t>(code_point + ((*it) & 0x3f)); | ||
| 237 | |||
| 238 | return UTF8_OK; | ||
| 239 | } | ||
| 240 | |||
| 241 | template <typename octet_iterator> | ||
| 242 | utf_error get_sequence_4(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) | ||
| 243 | { | ||
| 244 | if (it == end) | ||
| 245 | return NOT_ENOUGH_ROOM; | ||
| 246 | |||
| 247 | code_point = static_cast<utfchar32_t>(utf8::internal::mask8(*it)); | ||
| 248 | |||
| 249 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) | ||
| 250 | |||
| 251 | code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); | ||
| 252 | |||
| 253 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) | ||
| 254 | |||
| 255 | code_point = static_cast<utfchar32_t>(code_point + ((utf8::internal::mask8(*it) << 6) & 0xfff)); | ||
| 256 | |||
| 257 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) | ||
| 258 | |||
| 259 | code_point = static_cast<utfchar32_t>(code_point + ((*it) & 0x3f)); | ||
| 260 | |||
| 261 | return UTF8_OK; | ||
| 262 | } | ||
| 263 | |||
| 264 | #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR | ||
| 265 | |||
| 266 | template <typename octet_iterator> | ||
| 267 | utf_error validate_next(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) | ||
| 268 | { | ||
| 269 | if (it == end) | ||
| 270 | return NOT_ENOUGH_ROOM; | ||
| 271 | |||
| 272 | // Save the original value of it so we can go back in case of failure | ||
| 273 | // Of course, it does not make much sense with i.e. stream iterators | ||
| 274 | octet_iterator original_it = it; | ||
| 275 | |||
| 276 | utfchar32_t cp = 0; | ||
| 277 | // Determine the sequence length based on the lead octet | ||
| 278 | const int length = utf8::internal::sequence_length(it); | ||
| 279 | |||
| 280 | // Get trail octets and calculate the code point | ||
| 281 | utf_error err = UTF8_OK; | ||
| 282 | switch (length) { | ||
| 283 | case 0: | ||
| 284 | return INVALID_LEAD; | ||
| 285 | case 1: | ||
| 286 | err = utf8::internal::get_sequence_1(it, end, cp); | ||
| 287 | break; | ||
| 288 | case 2: | ||
| 289 | err = utf8::internal::get_sequence_2(it, end, cp); | ||
| 290 | break; | ||
| 291 | case 3: | ||
| 292 | err = utf8::internal::get_sequence_3(it, end, cp); | ||
| 293 | break; | ||
| 294 | case 4: | ||
| 295 | err = utf8::internal::get_sequence_4(it, end, cp); | ||
| 296 | break; | ||
| 297 | } | ||
| 298 | |||
| 299 | if (err == UTF8_OK) { | ||
| 300 | // Decoding succeeded. Now, security checks... | ||
| 301 | if (utf8::internal::is_code_point_valid(cp)) { | ||
| 302 | if (!utf8::internal::is_overlong_sequence(cp, length)){ | ||
| 303 | // Passed! Return here. | ||
| 304 | code_point = cp; | ||
| 305 | ++it; | ||
| 306 | return UTF8_OK; | ||
| 307 | } | ||
| 308 | else | ||
| 309 | err = OVERLONG_SEQUENCE; | ||
| 310 | } | ||
| 311 | else | ||
| 312 | err = INVALID_CODE_POINT; | ||
| 313 | } | ||
| 314 | |||
| 315 | // Failure branch - restore the original value of the iterator | ||
| 316 | it = original_it; | ||
| 317 | return err; | ||
| 318 | } | ||
| 319 | |||
| 320 | template <typename octet_iterator> | ||
| 321 | inline utf_error validate_next(octet_iterator& it, octet_iterator end) { | ||
| 322 | utfchar32_t ignored; | ||
| 323 | return utf8::internal::validate_next(it, end, ignored); | ||
| 324 | } | ||
| 325 | |||
| 326 | template <typename word_iterator> | ||
| 327 | utf_error validate_next16(word_iterator& it, word_iterator end, utfchar32_t& code_point) | ||
| 328 | { | ||
| 329 | // Make sure the iterator dereferences a large enough type | ||
| 330 | typedef typename std::iterator_traits<word_iterator>::value_type word_type; | ||
| 331 | UTF_CPP_STATIC_ASSERT(sizeof(word_type) >= sizeof(utfchar16_t)); | ||
| 332 | // Check the edge case: | ||
| 333 | if (it == end) | ||
| 334 | return NOT_ENOUGH_ROOM; | ||
| 335 | // Save the original value of it so we can go back in case of failure | ||
| 336 | // Of course, it does not make much sense with i.e. stream iterators | ||
| 337 | word_iterator original_it = it; | ||
| 338 | |||
| 339 | utf_error err = UTF8_OK; | ||
| 340 | |||
| 341 | const utfchar16_t first_word = *it++; | ||
| 342 | if (!is_surrogate(first_word)) { | ||
| 343 | code_point = first_word; | ||
| 344 | return UTF8_OK; | ||
| 345 | } | ||
| 346 | else { | ||
| 347 | if (it == end) | ||
| 348 | err = NOT_ENOUGH_ROOM; | ||
| 349 | else if (is_lead_surrogate(first_word)) { | ||
| 350 | const utfchar16_t second_word = *it++; | ||
| 351 | if (is_trail_surrogate(static_cast<utfchar32_t>(second_word))) { | ||
| 352 | code_point = static_cast<utfchar32_t>(first_word << 10) + static_cast<utfchar32_t>(second_word) + SURROGATE_OFFSET; | ||
| 353 | return UTF8_OK; | ||
| 354 | } else | ||
| 355 | err = INCOMPLETE_SEQUENCE; | ||
| 356 | |||
| 357 | } else { | ||
| 358 | err = INVALID_LEAD; | ||
| 359 | } | ||
| 360 | } | ||
| 361 | // error branch | ||
| 362 | it = original_it; | ||
| 363 | return err; | ||
| 364 | } | ||
| 365 | |||
| 366 | // Internal implementation of both checked and unchecked append() function | ||
| 367 | // This function will be invoked by the overloads below, as they will know | ||
| 368 | // the octet_type. | ||
| 369 | template <typename octet_iterator, typename octet_type> | ||
| 370 | octet_iterator append(utfchar32_t cp, octet_iterator result) { | ||
| 371 | if (cp < 0x80) // one octet | ||
| 372 | *(result++) = static_cast<octet_type>(cp); | ||
| 373 | else if (cp < 0x800) { // two octets | ||
| 374 | *(result++) = static_cast<octet_type>((cp >> 6) | 0xc0); | ||
| 375 | *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80); | ||
| 376 | } | ||
| 377 | else if (cp < 0x10000) { // three octets | ||
| 378 | *(result++) = static_cast<octet_type>((cp >> 12) | 0xe0); | ||
| 379 | *(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80); | ||
| 380 | *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80); | ||
| 381 | } | ||
| 382 | else { // four octets | ||
| 383 | *(result++) = static_cast<octet_type>((cp >> 18) | 0xf0); | ||
| 384 | *(result++) = static_cast<octet_type>(((cp >> 12) & 0x3f)| 0x80); | ||
| 385 | *(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80); | ||
| 386 | *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80); | ||
| 387 | } | ||
| 388 | return result; | ||
| 389 | } | ||
| 390 | |||
| 391 | // One of the following overloads will be invoked from the API calls | ||
| 392 | |||
| 393 | // A simple (but dangerous) case: the caller appends byte(s) to a char array | ||
| 394 | inline char* append(utfchar32_t cp, char* result) { | ||
| 395 | return append<char*, char>(cp, result); | ||
| 396 | } | ||
| 397 | |||
| 398 | // Hopefully, most common case: the caller uses back_inserter | ||
| 399 | // i.e. append(cp, std::back_inserter(str)); | ||
| 400 | template<typename container_type> | ||
| 401 | std::back_insert_iterator<container_type> append | ||
| 402 | (utfchar32_t cp, std::back_insert_iterator<container_type> result) { | ||
| 403 | return append<std::back_insert_iterator<container_type>, | ||
| 404 | typename container_type::value_type>(cp, result); | ||
| 405 | } | ||
| 406 | |||
| 407 | // The caller uses some other kind of output operator - not covered above | ||
| 408 | // Note that in this case we are not able to determine octet_type | ||
| 409 | // so we assume it's utfchar8_t; that can cause a conversion warning if we are wrong. | ||
| 410 | template <typename octet_iterator> | ||
| 411 | octet_iterator append(utfchar32_t cp, octet_iterator result) { | ||
| 412 | return append<octet_iterator, utfchar8_t>(cp, result); | ||
| 413 | } | ||
| 414 | |||
| 415 | // Internal implementation of both checked and unchecked append16() function | ||
| 416 | // This function will be invoked by the overloads below, as they will know | ||
| 417 | // the word_type. | ||
| 418 | template <typename word_iterator, typename word_type> | ||
| 419 | word_iterator append16(utfchar32_t cp, word_iterator result) { | ||
| 420 | UTF_CPP_STATIC_ASSERT(sizeof(word_type) >= sizeof(utfchar16_t)); | ||
| 421 | if (is_in_bmp(cp)) | ||
| 422 | *(result++) = static_cast<word_type>(cp); | ||
| 423 | else { | ||
| 424 | // Code points from the supplementary planes are encoded via surrogate pairs | ||
| 425 | *(result++) = static_cast<word_type>(LEAD_OFFSET + (cp >> 10)); | ||
| 426 | *(result++) = static_cast<word_type>(TRAIL_SURROGATE_MIN + (cp & 0x3FF)); | ||
| 427 | } | ||
| 428 | return result; | ||
| 429 | } | ||
| 430 | |||
| 431 | // Hopefully, most common case: the caller uses back_inserter | ||
| 432 | // i.e. append16(cp, std::back_inserter(str)); | ||
| 433 | template<typename container_type> | ||
| 434 | std::back_insert_iterator<container_type> append16 | ||
| 435 | (utfchar32_t cp, std::back_insert_iterator<container_type> result) { | ||
| 436 | return append16<std::back_insert_iterator<container_type>, | ||
| 437 | typename container_type::value_type>(cp, result); | ||
| 438 | } | ||
| 439 | |||
| 440 | // The caller uses some other kind of output operator - not covered above | ||
| 441 | // Note that in this case we are not able to determine word_type | ||
| 442 | // so we assume it's utfchar16_t; that can cause a conversion warning if we are wrong. | ||
| 443 | template <typename word_iterator> | ||
| 444 | word_iterator append16(utfchar32_t cp, word_iterator result) { | ||
| 445 | return append16<word_iterator, utfchar16_t>(cp, result); | ||
| 446 | } | ||
| 447 | |||
| 448 | } // namespace internal | ||
| 449 | |||
| 450 | /// The library API - functions intended to be called by the users | ||
| 451 | |||
| 452 | // Byte order mark | ||
| 453 | const utfchar8_t bom[] = {0xef, 0xbb, 0xbf}; | ||
| 454 | |||
| 455 | template <typename octet_iterator> | ||
| 456 | octet_iterator find_invalid(octet_iterator start, octet_iterator end) | ||
| 457 | { | ||
| 458 | octet_iterator result = start; | ||
| 459 | while (result != end) { | ||
| 460 | utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end); | ||
| 461 | if (err_code != internal::UTF8_OK) | ||
| 462 | return result; | ||
| 463 | } | ||
| 464 | return result; | ||
| 465 | } | ||
| 466 | |||
| 467 | inline const char* find_invalid(const char* str) | ||
| 468 | { | ||
| 469 | const char* end = str + std::strlen(str); | ||
| 470 | return find_invalid(str, end); | ||
| 471 | } | ||
| 472 | |||
| 473 | inline std::size_t find_invalid(const std::string& s) | ||
| 474 | { | ||
| 475 | std::string::const_iterator invalid = find_invalid(s.begin(), s.end()); | ||
| 476 | return (invalid == s.end()) ? std::string::npos : static_cast<std::size_t>(invalid - s.begin()); | ||
| 477 | } | ||
| 478 | |||
| 479 | template <typename octet_iterator> | ||
| 480 | inline bool is_valid(octet_iterator start, octet_iterator end) | ||
| 481 | { | ||
| 482 | return (utf8::find_invalid(start, end) == end); | ||
| 483 | } | ||
| 484 | |||
| 485 | inline bool is_valid(const char* str) | ||
| 486 | { | ||
| 487 | return (*(utf8::find_invalid(str)) == '\0'); | ||
| 488 | } | ||
| 489 | |||
| 490 | inline bool is_valid(const std::string& s) | ||
| 491 | { | ||
| 492 | return is_valid(s.begin(), s.end()); | ||
| 493 | } | ||
| 494 | |||
| 495 | |||
| 496 | |||
| 497 | template <typename octet_iterator> | ||
| 498 | inline bool starts_with_bom (octet_iterator it, octet_iterator end) | ||
| 499 | { | ||
| 500 | return ( | ||
| 501 | ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) && | ||
| 502 | ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && | ||
| 503 | ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) | ||
| 504 | ); | ||
| 505 | } | ||
| 506 | |||
| 507 | inline bool starts_with_bom(const std::string& s) | ||
| 508 | { | ||
| 509 | return starts_with_bom(s.begin(), s.end()); | ||
| 510 | } | ||
| 511 | } // namespace utf8 | ||
| 512 | |||
| 513 | #include <stdexcept> | ||
| 514 | |||
| 515 | namespace utf8 | ||
| 516 | { | ||
// Common base of every exception type thrown by the library, so callers can
// catch utf8::exception (or std::exception) to handle all of them.
class exception : public ::std::exception
{
};
| 520 | |||
| 521 | // Exceptions that may be thrown from the library functions. | ||
| 522 | class invalid_code_point : public exception { | ||
| 523 | utfchar32_t cp; | ||
| 524 | public: | ||
| 525 | invalid_code_point(utfchar32_t codepoint) : cp(codepoint) {} | ||
| 526 | virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid code point"; } | ||
| 527 | utfchar32_t code_point() const {return cp;} | ||
| 528 | }; | ||
| 529 | |||
| 530 | class invalid_utf8 : public exception { | ||
| 531 | utfchar8_t u8; | ||
| 532 | public: | ||
| 533 | invalid_utf8 (utfchar8_t u) : u8(u) {} | ||
| 534 | invalid_utf8 (char c) : u8(static_cast<utfchar8_t>(c)) {} | ||
| 535 | virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-8"; } | ||
| 536 | utfchar8_t utf8_octet() const {return u8;} | ||
| 537 | }; | ||
| 538 | |||
| 539 | class invalid_utf16 : public exception { | ||
| 540 | utfchar16_t u16; | ||
| 541 | public: | ||
| 542 | invalid_utf16 (utfchar16_t u) : u16(u) {} | ||
| 543 | virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-16"; } | ||
| 544 | utfchar16_t utf16_word() const {return u16;} | ||
| 545 | }; | ||
| 546 | |||
| 547 | class not_enough_room : public exception { | ||
| 548 | public: | ||
| 549 | virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Not enough space"; } | ||
| 550 | }; | ||
| 551 | |||
| 552 | /// The library API - functions intended to be called by the users | ||
| 553 | |||
| 554 | template <typename octet_iterator> | ||
| 555 | octet_iterator append(utfchar32_t cp, octet_iterator result) | ||
| 556 | { | ||
| 557 | if (!utf8::internal::is_code_point_valid(cp)) | ||
| 558 | throw invalid_code_point(cp); | ||
| 559 | |||
| 560 | return internal::append(cp, result); | ||
| 561 | } | ||
| 562 | |||
| 563 | inline void append(utfchar32_t cp, std::string& s) | ||
| 564 | { | ||
| 565 | append(cp, std::back_inserter(s)); | ||
| 566 | } | ||
| 567 | |||
| 568 | template <typename word_iterator> | ||
| 569 | word_iterator append16(utfchar32_t cp, word_iterator result) | ||
| 570 | { | ||
| 571 | if (!utf8::internal::is_code_point_valid(cp)) | ||
| 572 | throw invalid_code_point(cp); | ||
| 573 | |||
| 574 | return internal::append16(cp, result); | ||
| 575 | } | ||
| 576 | |||
| 577 | template <typename octet_iterator, typename output_iterator> | ||
| 578 | output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement) | ||
| 579 | { | ||
| 580 | while (start != end) { | ||
| 581 | octet_iterator sequence_start = start; | ||
| 582 | internal::utf_error err_code = utf8::internal::validate_next(start, end); | ||
| 583 | switch (err_code) { | ||
| 584 | case internal::UTF8_OK : | ||
| 585 | for (octet_iterator it = sequence_start; it != start; ++it) | ||
| 586 | *out++ = *it; | ||
| 587 | break; | ||
| 588 | case internal::NOT_ENOUGH_ROOM: | ||
| 589 | out = utf8::append (replacement, out); | ||
| 590 | start = end; | ||
| 591 | break; | ||
| 592 | case internal::INVALID_LEAD: | ||
| 593 | out = utf8::append (replacement, out); | ||
| 594 | ++start; | ||
| 595 | break; | ||
| 596 | case internal::INCOMPLETE_SEQUENCE: | ||
| 597 | case internal::OVERLONG_SEQUENCE: | ||
| 598 | case internal::INVALID_CODE_POINT: | ||
| 599 | out = utf8::append (replacement, out); | ||
| 600 | ++start; | ||
| 601 | // just one replacement mark for the sequence | ||
| 602 | while (start != end && utf8::internal::is_trail(*start)) | ||
| 603 | ++start; | ||
| 604 | break; | ||
| 605 | } | ||
| 606 | } | ||
| 607 | return out; | ||
| 608 | } | ||
| 609 | |||
| 610 | template <typename octet_iterator, typename output_iterator> | ||
| 611 | inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) | ||
| 612 | { | ||
| 613 | static const utfchar32_t replacement_marker = static_cast<utfchar32_t>(utf8::internal::mask16(0xfffd)); | ||
| 614 | return utf8::replace_invalid(start, end, out, replacement_marker); | ||
| 615 | } | ||
| 616 | |||
| 617 | inline std::string replace_invalid(const std::string& s, utfchar32_t replacement) | ||
| 618 | { | ||
| 619 | std::string result; | ||
| 620 | replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); | ||
| 621 | return result; | ||
| 622 | } | ||
| 623 | |||
| 624 | inline std::string replace_invalid(const std::string& s) | ||
| 625 | { | ||
| 626 | std::string result; | ||
| 627 | replace_invalid(s.begin(), s.end(), std::back_inserter(result)); | ||
| 628 | return result; | ||
| 629 | } | ||
| 630 | |||
| 631 | template <typename octet_iterator> | ||
| 632 | utfchar32_t next(octet_iterator& it, octet_iterator end) | ||
| 633 | { | ||
| 634 | utfchar32_t cp = 0; | ||
| 635 | internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); | ||
| 636 | switch (err_code) { | ||
| 637 | case internal::UTF8_OK : | ||
| 638 | break; | ||
| 639 | case internal::NOT_ENOUGH_ROOM : | ||
| 640 | throw not_enough_room(); | ||
| 641 | case internal::INVALID_LEAD : | ||
| 642 | case internal::INCOMPLETE_SEQUENCE : | ||
| 643 | case internal::OVERLONG_SEQUENCE : | ||
| 644 | throw invalid_utf8(static_cast<utfchar8_t>(*it)); | ||
| 645 | case internal::INVALID_CODE_POINT : | ||
| 646 | throw invalid_code_point(cp); | ||
| 647 | } | ||
| 648 | return cp; | ||
| 649 | } | ||
| 650 | |||
| 651 | template <typename word_iterator> | ||
| 652 | utfchar32_t next16(word_iterator& it, word_iterator end) | ||
| 653 | { | ||
| 654 | utfchar32_t cp = 0; | ||
| 655 | internal::utf_error err_code = utf8::internal::validate_next16(it, end, cp); | ||
| 656 | if (err_code == internal::NOT_ENOUGH_ROOM) | ||
| 657 | throw not_enough_room(); | ||
| 658 | return cp; | ||
| 659 | } | ||
| 660 | |||
| 661 | template <typename octet_iterator> | ||
| 662 | utfchar32_t peek_next(octet_iterator it, octet_iterator end) | ||
| 663 | { | ||
| 664 | return utf8::next(it, end); | ||
| 665 | } | ||
| 666 | |||
| 667 | template <typename octet_iterator> | ||
| 668 | utfchar32_t prior(octet_iterator& it, octet_iterator start) | ||
| 669 | { | ||
| 670 | // can't do much if it == start | ||
| 671 | if (it == start) | ||
| 672 | throw not_enough_room(); | ||
| 673 | |||
| 674 | octet_iterator end = it; | ||
| 675 | // Go back until we hit either a lead octet or start | ||
| 676 | while (utf8::internal::is_trail(*(--it))) | ||
| 677 | if (it == start) | ||
| 678 | throw invalid_utf8(*it); // error - no lead byte in the sequence | ||
| 679 | return utf8::peek_next(it, end); | ||
| 680 | } | ||
| 681 | |||
| 682 | template <typename octet_iterator, typename distance_type> | ||
| 683 | void advance (octet_iterator& it, distance_type n, octet_iterator end) | ||
| 684 | { | ||
| 685 | const distance_type zero(0); | ||
| 686 | if (n < zero) { | ||
| 687 | // backward | ||
| 688 | for (distance_type i = n; i < zero; ++i) | ||
| 689 | utf8::prior(it, end); | ||
| 690 | } else { | ||
| 691 | // forward | ||
| 692 | for (distance_type i = zero; i < n; ++i) | ||
| 693 | utf8::next(it, end); | ||
| 694 | } | ||
| 695 | } | ||
| 696 | |||
| 697 | template <typename octet_iterator> | ||
| 698 | typename std::iterator_traits<octet_iterator>::difference_type | ||
| 699 | distance (octet_iterator first, octet_iterator last) | ||
| 700 | { | ||
| 701 | typename std::iterator_traits<octet_iterator>::difference_type dist; | ||
| 702 | for (dist = 0; first < last; ++dist) | ||
| 703 | utf8::next(first, last); | ||
| 704 | return dist; | ||
| 705 | } | ||
| 706 | |||
| 707 | template <typename u16bit_iterator, typename octet_iterator> | ||
| 708 | octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) | ||
| 709 | { | ||
| 710 | while (start != end) { | ||
| 711 | utfchar32_t cp = static_cast<utfchar32_t>(utf8::internal::mask16(*start++)); | ||
| 712 | // Take care of surrogate pairs first | ||
| 713 | if (utf8::internal::is_lead_surrogate(cp)) { | ||
| 714 | if (start != end) { | ||
| 715 | const utfchar32_t trail_surrogate = static_cast<utfchar32_t>(utf8::internal::mask16(*start++)); | ||
| 716 | if (utf8::internal::is_trail_surrogate(trail_surrogate)) | ||
| 717 | cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; | ||
| 718 | else | ||
| 719 | throw invalid_utf16(static_cast<utfchar16_t>(trail_surrogate)); | ||
| 720 | } | ||
| 721 | else | ||
| 722 | throw invalid_utf16(static_cast<utfchar16_t>(cp)); | ||
| 723 | |||
| 724 | } | ||
| 725 | // Lone trail surrogate | ||
| 726 | else if (utf8::internal::is_trail_surrogate(cp)) | ||
| 727 | throw invalid_utf16(static_cast<utfchar16_t>(cp)); | ||
| 728 | |||
| 729 | result = utf8::append(cp, result); | ||
| 730 | } | ||
| 731 | return result; | ||
| 732 | } | ||
| 733 | |||
| 734 | template <typename u16bit_iterator, typename octet_iterator> | ||
| 735 | u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) | ||
| 736 | { | ||
| 737 | while (start < end) { | ||
| 738 | const utfchar32_t cp = utf8::next(start, end); | ||
| 739 | if (cp > 0xffff) { //make a surrogate pair | ||
| 740 | *result++ = static_cast<utfchar16_t>((cp >> 10) + internal::LEAD_OFFSET); | ||
| 741 | *result++ = static_cast<utfchar16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); | ||
| 742 | } | ||
| 743 | else | ||
| 744 | *result++ = static_cast<utfchar16_t>(cp); | ||
| 745 | } | ||
| 746 | return result; | ||
| 747 | } | ||
| 748 | |||
| 749 | template <typename octet_iterator, typename u32bit_iterator> | ||
| 750 | octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) | ||
| 751 | { | ||
| 752 | while (start != end) | ||
| 753 | result = utf8::append(*(start++), result); | ||
| 754 | |||
| 755 | return result; | ||
| 756 | } | ||
| 757 | |||
| 758 | template <typename octet_iterator, typename u32bit_iterator> | ||
| 759 | u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) | ||
| 760 | { | ||
| 761 | while (start < end) | ||
| 762 | (*result++) = utf8::next(start, end); | ||
| 763 | |||
| 764 | return result; | ||
| 765 | } | ||
| 766 | |||
| 767 | // The iterator class | ||
| 768 | template <typename octet_iterator> | ||
| 769 | class iterator { | ||
| 770 | octet_iterator it; | ||
| 771 | octet_iterator range_start; | ||
| 772 | octet_iterator range_end; | ||
| 773 | public: | ||
| 774 | typedef utfchar32_t value_type; | ||
| 775 | typedef utfchar32_t* pointer; | ||
| 776 | typedef utfchar32_t& reference; | ||
| 777 | typedef std::ptrdiff_t difference_type; | ||
| 778 | typedef std::bidirectional_iterator_tag iterator_category; | ||
| 779 | iterator () {} | ||
| 780 | explicit iterator (const octet_iterator& octet_it, | ||
| 781 | const octet_iterator& rangestart, | ||
| 782 | const octet_iterator& rangeend) : | ||
| 783 | it(octet_it), range_start(rangestart), range_end(rangeend) | ||
| 784 | { | ||
| 785 | if (it < range_start || it > range_end) | ||
| 786 | throw std::out_of_range("Invalid utf-8 iterator position"); | ||
| 787 | } | ||
| 788 | // the default "big three" are OK | ||
| 789 | octet_iterator base () const { return it; } | ||
| 790 | utfchar32_t operator * () const | ||
| 791 | { | ||
| 792 | octet_iterator temp = it; | ||
| 793 | return utf8::next(temp, range_end); | ||
| 794 | } | ||
| 795 | bool operator == (const iterator& rhs) const | ||
| 796 | { | ||
| 797 | if (range_start != rhs.range_start || range_end != rhs.range_end) | ||
| 798 | throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); | ||
| 799 | return (it == rhs.it); | ||
| 800 | } | ||
| 801 | bool operator != (const iterator& rhs) const | ||
| 802 | { | ||
| 803 | return !(operator == (rhs)); | ||
| 804 | } | ||
| 805 | iterator& operator ++ () | ||
| 806 | { | ||
| 807 | utf8::next(it, range_end); | ||
| 808 | return *this; | ||
| 809 | } | ||
| 810 | iterator operator ++ (int) | ||
| 811 | { | ||
| 812 | iterator temp = *this; | ||
| 813 | utf8::next(it, range_end); | ||
| 814 | return temp; | ||
| 815 | } | ||
| 816 | iterator& operator -- () | ||
| 817 | { | ||
| 818 | utf8::prior(it, range_start); | ||
| 819 | return *this; | ||
| 820 | } | ||
| 821 | iterator operator -- (int) | ||
| 822 | { | ||
| 823 | iterator temp = *this; | ||
| 824 | utf8::prior(it, range_start); | ||
| 825 | return temp; | ||
| 826 | } | ||
| 827 | }; // class iterator | ||
| 828 | |||
| 829 | } // namespace utf8 | ||
| 830 | |||
| 831 | #if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later | ||
| 832 | namespace utf8 | ||
| 833 | { | ||
| 834 | inline void append16(utfchar32_t cp, std::u16string& s) | ||
| 835 | { | ||
| 836 | append16(cp, std::back_inserter(s)); | ||
| 837 | } | ||
| 838 | |||
| 839 | inline std::string utf16to8(const std::u16string& s) | ||
| 840 | { | ||
| 841 | std::string result; | ||
| 842 | utf16to8(s.begin(), s.end(), std::back_inserter(result)); | ||
| 843 | return result; | ||
| 844 | } | ||
| 845 | |||
| 846 | inline std::u16string utf8to16(const std::string& s) | ||
| 847 | { | ||
| 848 | std::u16string result; | ||
| 849 | utf8to16(s.begin(), s.end(), std::back_inserter(result)); | ||
| 850 | return result; | ||
| 851 | } | ||
| 852 | |||
| 853 | inline std::string utf32to8(const std::u32string& s) | ||
| 854 | { | ||
| 855 | std::string result; | ||
| 856 | utf32to8(s.begin(), s.end(), std::back_inserter(result)); | ||
| 857 | return result; | ||
| 858 | } | ||
| 859 | |||
| 860 | inline std::u32string utf8to32(const std::string& s) | ||
| 861 | { | ||
| 862 | std::u32string result; | ||
| 863 | utf8to32(s.begin(), s.end(), std::back_inserter(result)); | ||
| 864 | return result; | ||
| 865 | } | ||
| 866 | } // namespace utf8 | ||
| 867 | #endif // C++ 11 or later | ||
| 868 | |||
| 869 | #if UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later | ||
| 870 | namespace utf8 | ||
| 871 | { | ||
| 872 | inline std::string utf16to8(std::u16string_view s) | ||
| 873 | { | ||
| 874 | std::string result; | ||
| 875 | utf16to8(s.begin(), s.end(), std::back_inserter(result)); | ||
| 876 | return result; | ||
| 877 | } | ||
| 878 | |||
| 879 | inline std::u16string utf8to16(std::string_view s) | ||
| 880 | { | ||
| 881 | std::u16string result; | ||
| 882 | utf8to16(s.begin(), s.end(), std::back_inserter(result)); | ||
| 883 | return result; | ||
| 884 | } | ||
| 885 | |||
| 886 | inline std::string utf32to8(std::u32string_view s) | ||
| 887 | { | ||
| 888 | std::string result; | ||
| 889 | utf32to8(s.begin(), s.end(), std::back_inserter(result)); | ||
| 890 | return result; | ||
| 891 | } | ||
| 892 | |||
| 893 | inline std::u32string utf8to32(std::string_view s) | ||
| 894 | { | ||
| 895 | std::u32string result; | ||
| 896 | utf8to32(s.begin(), s.end(), std::back_inserter(result)); | ||
| 897 | return result; | ||
| 898 | } | ||
| 899 | |||
| 900 | inline std::size_t find_invalid(std::string_view s) | ||
| 901 | { | ||
| 902 | std::string_view::const_iterator invalid = find_invalid(s.begin(), s.end()); | ||
| 903 | return (invalid == s.end()) ? std::string_view::npos : static_cast<std::size_t>(invalid - s.begin()); | ||
| 904 | } | ||
| 905 | |||
| 906 | inline bool is_valid(std::string_view s) | ||
| 907 | { | ||
| 908 | return is_valid(s.begin(), s.end()); | ||
| 909 | } | ||
| 910 | |||
| 911 | inline std::string replace_invalid(std::string_view s, char32_t replacement) | ||
| 912 | { | ||
| 913 | std::string result; | ||
| 914 | replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); | ||
| 915 | return result; | ||
| 916 | } | ||
| 917 | |||
| 918 | inline std::string replace_invalid(std::string_view s) | ||
| 919 | { | ||
| 920 | std::string result; | ||
| 921 | replace_invalid(s.begin(), s.end(), std::back_inserter(result)); | ||
| 922 | return result; | ||
| 923 | } | ||
| 924 | |||
| 925 | inline bool starts_with_bom(std::string_view s) | ||
| 926 | { | ||
| 927 | return starts_with_bom(s.begin(), s.end()); | ||
| 928 | } | ||
| 929 | |||
| 930 | } // namespace utf8 | ||
| 931 | #endif // C++ 17 or later | ||
| 932 | |||
| 933 | #if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later | ||
| 934 | namespace utf8 | ||
| 935 | { | ||
| 936 | inline std::u8string utf16tou8(const std::u16string& s) | ||
| 937 | { | ||
| 938 | std::u8string result; | ||
| 939 | utf16to8(s.begin(), s.end(), std::back_inserter(result)); | ||
| 940 | return result; | ||
| 941 | } | ||
| 942 | |||
| 943 | inline std::u8string utf16tou8(std::u16string_view s) | ||
| 944 | { | ||
| 945 | std::u8string result; | ||
| 946 | utf16to8(s.begin(), s.end(), std::back_inserter(result)); | ||
| 947 | return result; | ||
| 948 | } | ||
| 949 | |||
| 950 | inline std::u16string utf8to16(const std::u8string& s) | ||
| 951 | { | ||
| 952 | std::u16string result; | ||
| 953 | utf8to16(s.begin(), s.end(), std::back_inserter(result)); | ||
| 954 | return result; | ||
| 955 | } | ||
| 956 | |||
| 957 | inline std::u16string utf8to16(const std::u8string_view& s) | ||
| 958 | { | ||
| 959 | std::u16string result; | ||
| 960 | utf8to16(s.begin(), s.end(), std::back_inserter(result)); | ||
| 961 | return result; | ||
| 962 | } | ||
| 963 | |||
| 964 | inline std::u8string utf32tou8(const std::u32string& s) | ||
| 965 | { | ||
| 966 | std::u8string result; | ||
| 967 | utf32to8(s.begin(), s.end(), std::back_inserter(result)); | ||
| 968 | return result; | ||
| 969 | } | ||
| 970 | |||
| 971 | inline std::u8string utf32tou8(const std::u32string_view& s) | ||
| 972 | { | ||
| 973 | std::u8string result; | ||
| 974 | utf32to8(s.begin(), s.end(), std::back_inserter(result)); | ||
| 975 | return result; | ||
| 976 | } | ||
| 977 | |||
| 978 | inline std::u32string utf8to32(const std::u8string& s) | ||
| 979 | { | ||
| 980 | std::u32string result; | ||
| 981 | utf8to32(s.begin(), s.end(), std::back_inserter(result)); | ||
| 982 | return result; | ||
| 983 | } | ||
| 984 | |||
| 985 | inline std::u32string utf8to32(const std::u8string_view& s) | ||
| 986 | { | ||
| 987 | std::u32string result; | ||
| 988 | utf8to32(s.begin(), s.end(), std::back_inserter(result)); | ||
| 989 | return result; | ||
| 990 | } | ||
| 991 | |||
| 992 | inline std::size_t find_invalid(const std::u8string& s) | ||
| 993 | { | ||
| 994 | std::u8string::const_iterator invalid = find_invalid(s.begin(), s.end()); | ||
| 995 | return (invalid == s.end()) ? std::string_view::npos : static_cast<std::size_t>(invalid - s.begin()); | ||
| 996 | } | ||
| 997 | |||
| 998 | inline bool is_valid(const std::u8string& s) | ||
| 999 | { | ||
| 1000 | return is_valid(s.begin(), s.end()); | ||
| 1001 | } | ||
| 1002 | |||
| 1003 | inline std::u8string replace_invalid(const std::u8string& s, char32_t replacement) | ||
| 1004 | { | ||
| 1005 | std::u8string result; | ||
| 1006 | replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); | ||
| 1007 | return result; | ||
| 1008 | } | ||
| 1009 | |||
| 1010 | inline std::u8string replace_invalid(const std::u8string& s) | ||
| 1011 | { | ||
| 1012 | std::u8string result; | ||
| 1013 | replace_invalid(s.begin(), s.end(), std::back_inserter(result)); | ||
| 1014 | return result; | ||
| 1015 | } | ||
| 1016 | |||
| 1017 | inline bool starts_with_bom(const std::u8string& s) | ||
| 1018 | { | ||
| 1019 | return starts_with_bom(s.begin(), s.end()); | ||
| 1020 | } | ||
| 1021 | |||
| 1022 | } // namespace utf8 | ||
| 1023 | #endif // C++ 20 or later | ||
| 1024 | |||
| 1025 | namespace utf8 | ||
| 1026 | { | ||
| 1027 | namespace unchecked | ||
| 1028 | { | ||
| 1029 | template <typename octet_iterator> | ||
| 1030 | octet_iterator append(utfchar32_t cp, octet_iterator result) | ||
| 1031 | { | ||
| 1032 | return internal::append(cp, result); | ||
| 1033 | } | ||
| 1034 | |||
| 1035 | template <typename word_iterator> | ||
| 1036 | word_iterator append16(utfchar32_t cp, word_iterator result) | ||
| 1037 | { | ||
| 1038 | return internal::append16(cp, result); | ||
| 1039 | } | ||
| 1040 | |||
| 1041 | template <typename octet_iterator, typename output_iterator> | ||
| 1042 | output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement) | ||
| 1043 | { | ||
| 1044 | while (start != end) { | ||
| 1045 | octet_iterator sequence_start = start; | ||
| 1046 | internal::utf_error err_code = utf8::internal::validate_next(start, end); | ||
| 1047 | switch (err_code) { | ||
| 1048 | case internal::UTF8_OK : | ||
| 1049 | for (octet_iterator it = sequence_start; it != start; ++it) | ||
| 1050 | *out++ = *it; | ||
| 1051 | break; | ||
| 1052 | case internal::NOT_ENOUGH_ROOM: | ||
| 1053 | out = utf8::unchecked::append(replacement, out); | ||
| 1054 | start = end; | ||
| 1055 | break; | ||
| 1056 | case internal::INVALID_LEAD: | ||
| 1057 | out = utf8::unchecked::append(replacement, out); | ||
| 1058 | ++start; | ||
| 1059 | break; | ||
| 1060 | case internal::INCOMPLETE_SEQUENCE: | ||
| 1061 | case internal::OVERLONG_SEQUENCE: | ||
| 1062 | case internal::INVALID_CODE_POINT: | ||
| 1063 | out = utf8::unchecked::append(replacement, out); | ||
| 1064 | ++start; | ||
| 1065 | // just one replacement mark for the sequence | ||
| 1066 | while (start != end && utf8::internal::is_trail(*start)) | ||
| 1067 | ++start; | ||
| 1068 | break; | ||
| 1069 | } | ||
| 1070 | } | ||
| 1071 | return out; | ||
| 1072 | } | ||
| 1073 | |||
| 1074 | template <typename octet_iterator, typename output_iterator> | ||
| 1075 | inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) | ||
| 1076 | { | ||
| 1077 | static const utfchar32_t replacement_marker = static_cast<utfchar32_t>(utf8::internal::mask16(0xfffd)); | ||
| 1078 | return utf8::unchecked::replace_invalid(start, end, out, replacement_marker); | ||
| 1079 | } | ||
| 1080 | |||
| 1081 | inline std::string replace_invalid(const std::string& s, utfchar32_t replacement) | ||
| 1082 | { | ||
| 1083 | std::string result; | ||
| 1084 | replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); | ||
| 1085 | return result; | ||
| 1086 | } | ||
| 1087 | |||
| 1088 | inline std::string replace_invalid(const std::string& s) | ||
| 1089 | { | ||
| 1090 | std::string result; | ||
| 1091 | replace_invalid(s.begin(), s.end(), std::back_inserter(result)); | ||
| 1092 | return result; | ||
| 1093 | } | ||
| 1094 | |||
| 1095 | template <typename octet_iterator> | ||
| 1096 | utfchar32_t next(octet_iterator& it) | ||
| 1097 | { | ||
| 1098 | utfchar32_t cp = utf8::internal::mask8(*it); | ||
| 1099 | switch (utf8::internal::sequence_length(it)) { | ||
| 1100 | case 1: | ||
| 1101 | break; | ||
| 1102 | case 2: | ||
| 1103 | ++it; | ||
| 1104 | cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); | ||
| 1105 | break; | ||
| 1106 | case 3: | ||
| 1107 | ++it; | ||
| 1108 | cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); | ||
| 1109 | ++it; | ||
| 1110 | cp = static_cast<utfchar32_t>(cp + ((*it) & 0x3f)); | ||
| 1111 | break; | ||
| 1112 | case 4: | ||
| 1113 | ++it; | ||
| 1114 | cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); | ||
| 1115 | ++it; | ||
| 1116 | cp = static_cast<utfchar32_t>(cp + ((utf8::internal::mask8(*it) << 6) & 0xfff)); | ||
| 1117 | ++it; | ||
| 1118 | cp = static_cast<utfchar32_t>(cp + ((*it) & 0x3f)); | ||
| 1119 | break; | ||
| 1120 | } | ||
| 1121 | ++it; | ||
| 1122 | return cp; | ||
| 1123 | } | ||
| 1124 | |||
| 1125 | template <typename octet_iterator> | ||
| 1126 | utfchar32_t peek_next(octet_iterator it) | ||
| 1127 | { | ||
| 1128 | return utf8::unchecked::next(it); | ||
| 1129 | } | ||
| 1130 | |||
| 1131 | template <typename word_iterator> | ||
| 1132 | utfchar32_t next16(word_iterator& it) | ||
| 1133 | { | ||
| 1134 | utfchar32_t cp = utf8::internal::mask16(*it++); | ||
| 1135 | if (utf8::internal::is_lead_surrogate(cp)) | ||
| 1136 | return (cp << 10) + *it++ + utf8::internal::SURROGATE_OFFSET; | ||
| 1137 | return cp; | ||
| 1138 | } | ||
| 1139 | |||
| 1140 | template <typename octet_iterator> | ||
| 1141 | utfchar32_t prior(octet_iterator& it) | ||
| 1142 | { | ||
| 1143 | while (utf8::internal::is_trail(*(--it))) ; | ||
| 1144 | octet_iterator temp = it; | ||
| 1145 | return utf8::unchecked::next(temp); | ||
| 1146 | } | ||
| 1147 | |||
| 1148 | template <typename octet_iterator, typename distance_type> | ||
| 1149 | void advance(octet_iterator& it, distance_type n) | ||
| 1150 | { | ||
| 1151 | const distance_type zero(0); | ||
| 1152 | if (n < zero) { | ||
| 1153 | // backward | ||
| 1154 | for (distance_type i = n; i < zero; ++i) | ||
| 1155 | utf8::unchecked::prior(it); | ||
| 1156 | } else { | ||
| 1157 | // forward | ||
| 1158 | for (distance_type i = zero; i < n; ++i) | ||
| 1159 | utf8::unchecked::next(it); | ||
| 1160 | } | ||
| 1161 | } | ||
| 1162 | |||
| 1163 | template <typename octet_iterator> | ||
| 1164 | typename std::iterator_traits<octet_iterator>::difference_type | ||
| 1165 | distance(octet_iterator first, octet_iterator last) | ||
| 1166 | { | ||
| 1167 | typename std::iterator_traits<octet_iterator>::difference_type dist; | ||
| 1168 | for (dist = 0; first < last; ++dist) | ||
| 1169 | utf8::unchecked::next(first); | ||
| 1170 | return dist; | ||
| 1171 | } | ||
| 1172 | |||
| 1173 | template <typename u16bit_iterator, typename octet_iterator> | ||
| 1174 | octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end, octet_iterator result) | ||
| 1175 | { | ||
| 1176 | while (start != end) { | ||
| 1177 | utfchar32_t cp = utf8::internal::mask16(*start++); | ||
| 1178 | // Take care of surrogate pairs first | ||
| 1179 | if (utf8::internal::is_lead_surrogate(cp)) { | ||
| 1180 | if (start == end) | ||
| 1181 | return result; | ||
| 1182 | utfchar32_t trail_surrogate = utf8::internal::mask16(*start++); | ||
| 1183 | cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; | ||
| 1184 | } | ||
| 1185 | result = utf8::unchecked::append(cp, result); | ||
| 1186 | } | ||
| 1187 | return result; | ||
| 1188 | } | ||
| 1189 | |||
| 1190 | template <typename u16bit_iterator, typename octet_iterator> | ||
| 1191 | u16bit_iterator utf8to16(octet_iterator start, octet_iterator end, u16bit_iterator result) | ||
| 1192 | { | ||
| 1193 | while (start < end) { | ||
| 1194 | utfchar32_t cp = utf8::unchecked::next(start); | ||
| 1195 | if (cp > 0xffff) { //make a surrogate pair | ||
| 1196 | *result++ = static_cast<utfchar16_t>((cp >> 10) + internal::LEAD_OFFSET); | ||
| 1197 | *result++ = static_cast<utfchar16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); | ||
| 1198 | } | ||
| 1199 | else | ||
| 1200 | *result++ = static_cast<utfchar16_t>(cp); | ||
| 1201 | } | ||
| 1202 | return result; | ||
| 1203 | } | ||
| 1204 | |||
| 1205 | template <typename octet_iterator, typename u32bit_iterator> | ||
| 1206 | octet_iterator utf32to8(u32bit_iterator start, u32bit_iterator end, octet_iterator result) | ||
| 1207 | { | ||
| 1208 | while (start != end) | ||
| 1209 | result = utf8::unchecked::append(*(start++), result); | ||
| 1210 | |||
| 1211 | return result; | ||
| 1212 | } | ||
| 1213 | |||
| 1214 | template <typename octet_iterator, typename u32bit_iterator> | ||
| 1215 | u32bit_iterator utf8to32(octet_iterator start, octet_iterator end, u32bit_iterator result) | ||
| 1216 | { | ||
| 1217 | while (start < end) | ||
| 1218 | (*result++) = utf8::unchecked::next(start); | ||
| 1219 | |||
| 1220 | return result; | ||
| 1221 | } | ||
| 1222 | |||
| 1223 | // The iterator class | ||
| 1224 | template <typename octet_iterator> | ||
| 1225 | class iterator { | ||
| 1226 | octet_iterator it; | ||
| 1227 | public: | ||
| 1228 | typedef utfchar32_t value_type; | ||
| 1229 | typedef utfchar32_t* pointer; | ||
| 1230 | typedef utfchar32_t& reference; | ||
| 1231 | typedef std::ptrdiff_t difference_type; | ||
| 1232 | typedef std::bidirectional_iterator_tag iterator_category; | ||
| 1233 | iterator () {} | ||
| 1234 | explicit iterator (const octet_iterator& octet_it): it(octet_it) {} | ||
| 1235 | // the default "big three" are OK | ||
| 1236 | octet_iterator base () const { return it; } | ||
| 1237 | utfchar32_t operator * () const | ||
| 1238 | { | ||
| 1239 | octet_iterator temp = it; | ||
| 1240 | return utf8::unchecked::next(temp); | ||
| 1241 | } | ||
| 1242 | bool operator == (const iterator& rhs) const | ||
| 1243 | { | ||
| 1244 | return (it == rhs.it); | ||
| 1245 | } | ||
| 1246 | bool operator != (const iterator& rhs) const | ||
| 1247 | { | ||
| 1248 | return !(operator == (rhs)); | ||
| 1249 | } | ||
| 1250 | iterator& operator ++ () | ||
| 1251 | { | ||
| 1252 | ::std::advance(it, utf8::internal::sequence_length(it)); | ||
| 1253 | return *this; | ||
| 1254 | } | ||
| 1255 | iterator operator ++ (int) | ||
| 1256 | { | ||
| 1257 | iterator temp = *this; | ||
| 1258 | ::std::advance(it, utf8::internal::sequence_length(it)); | ||
| 1259 | return temp; | ||
| 1260 | } | ||
| 1261 | iterator& operator -- () | ||
| 1262 | { | ||
| 1263 | utf8::unchecked::prior(it); | ||
| 1264 | return *this; | ||
| 1265 | } | ||
| 1266 | iterator operator -- (int) | ||
| 1267 | { | ||
| 1268 | iterator temp = *this; | ||
| 1269 | utf8::unchecked::prior(it); | ||
| 1270 | return temp; | ||
| 1271 | } | ||
| 1272 | }; // class iterator | ||
| 1273 | |||
| 1274 | } // namespace utf8::unchecked | ||
| 1275 | } // namespace utf8 | ||
| 1276 | |||
| 1277 | #endif // header guard | ||
