diff options
author | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2019-04-17 14:08:22 -0300 |
---|---|---|
committer | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2019-04-17 14:08:22 -0300 |
commit | 24bf757183d8bd97f6f5b43d916814f3269c8347 (patch) | |
tree | 646cd65d6e2dab57691f98f83f15f25c70685ef8 /lptree.c | |
parent | 3f7797419e4d7493e1364290a5b127d1cb45e3bf (diff) | |
download | lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.gz lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.bz2 lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.zip |
Implementation of UTF-8 ranges
New constructor 'lpeg.utfR(from, to)' creates a pattern that matches
UTF-8 byte sequences representing code points in the range [from, to].
Diffstat (limited to 'lptree.c')
-rw-r--r-- | lptree.c | 55 |
1 file changed, 53 insertions, 2 deletions
@@ -21,7 +21,7 @@ | |||
21 | /* number of siblings for each tree */ | 21 | /* number of siblings for each tree */ |
22 | const byte numsiblings[] = { | 22 | const byte numsiblings[] = { |
23 | 0, 0, 0, /* char, set, any */ | 23 | 0, 0, 0, /* char, set, any */ |
24 | 0, 0, /* true, false */ | 24 | 0, 0, 0, /* true, false, utf-range */ |
25 | 1, /* rep */ | 25 | 1, /* rep */ |
26 | 2, 2, /* seq, choice */ | 26 | 2, 2, /* seq, choice */ |
27 | 1, 1, /* not, and */ | 27 | 1, 1, /* not, and */ |
@@ -675,6 +675,56 @@ static int lp_range (lua_State *L) { | |||
675 | 675 | ||
676 | 676 | ||
677 | /* | 677 | /* |
678 | ** Fills a tree node with basic information about the UTF-8 code point | ||
679 | ** 'cpu': its value in 'n', its length in 'cap', and its first byte in | ||
680 | ** 'key' | ||
681 | */ | ||
682 | static void codeutftree (lua_State *L, TTree *t, lua_Unsigned cpu, int arg) { | ||
683 | int len, fb, cp; | ||
684 | cp = (int)cpu; | ||
685 | if (cp <= 0x7f) { /* one byte? */ | ||
686 | len = 1; | ||
687 | fb = cp; | ||
688 | } else if (cp <= 0x7ff) { | ||
689 | len = 2; | ||
690 | fb = 0xC0 | (cp >> 6); | ||
691 | } else if (cp <= 0xffff) { | ||
692 | len = 3; | ||
693 | fb = 0xE0 | (cp >> 12); | ||
694 | } | ||
695 | else { | ||
696 | luaL_argcheck(L, cpu <= 0x10ffffu, arg, "invalid code point"); | ||
697 | len = 4; | ||
698 | fb = 0xF0 | (cp >> 18); | ||
699 | } | ||
700 | t->u.n = cp; | ||
701 | t->cap = len; | ||
702 | t->key = fb; | ||
703 | } | ||
704 | |||
705 | |||
706 | static int lp_utfr (lua_State *L) { | ||
707 | lua_Unsigned from = (lua_Unsigned)luaL_checkinteger(L, 1); | ||
708 | lua_Unsigned to = (lua_Unsigned)luaL_checkinteger(L, 2); | ||
709 | luaL_argcheck(L, from <= to, 2, "empty range"); | ||
710 | if (to <= 0x7f) { /* ascii range? */ | ||
711 | TTree *tree = newcharset(L); /* code it as a regular charset */ | ||
712 | unsigned int f; | ||
713 | for (f = (int)from; f <= to; f++) | ||
714 | setchar(treebuffer(tree), f); | ||
715 | } | ||
716 | else { /* multi-byte utf-8 range */ | ||
717 | TTree *tree = newtree(L, 2); | ||
718 | tree->tag = TUTFR; | ||
719 | codeutftree(L, tree, from, 1); | ||
720 | sib1(tree)->tag = TXInfo; | ||
721 | codeutftree(L, sib1(tree), to, 2); | ||
722 | } | ||
723 | return 1; | ||
724 | } | ||
725 | |||
726 | |||
727 | /* | ||
678 | ** Look-behind predicate | 728 | ** Look-behind predicate |
679 | */ | 729 | */ |
680 | static int lp_behind (lua_State *L) { | 730 | static int lp_behind (lua_State *L) { |
@@ -1008,7 +1058,7 @@ static int verifyrule (lua_State *L, TTree *tree, unsigned short *passed, | |||
1008 | tailcall: | 1058 | tailcall: |
1009 | switch (tree->tag) { | 1059 | switch (tree->tag) { |
1010 | case TChar: case TSet: case TAny: | 1060 | case TChar: case TSet: case TAny: |
1011 | case TFalse: | 1061 | case TFalse: case TUTFR: |
1012 | return nb; /* cannot pass from here */ | 1062 | return nb; /* cannot pass from here */ |
1013 | case TTrue: | 1063 | case TTrue: |
1014 | case TBehind: /* look-behind cannot have calls */ | 1064 | case TBehind: /* look-behind cannot have calls */ |
@@ -1271,6 +1321,7 @@ static struct luaL_Reg pattreg[] = { | |||
1271 | {"P", lp_P}, | 1321 | {"P", lp_P}, |
1272 | {"S", lp_set}, | 1322 | {"S", lp_set}, |
1273 | {"R", lp_range}, | 1323 | {"R", lp_range}, |
1324 | {"utfR", lp_utfr}, | ||
1274 | {"locale", lp_locale}, | 1325 | {"locale", lp_locale}, |
1275 | {"version", lp_version}, | 1326 | {"version", lp_version}, |
1276 | {"setmaxstack", lp_setmax}, | 1327 | {"setmaxstack", lp_setmax}, |