aboutsummaryrefslogtreecommitdiff
path: root/lptree.c
diff options
context:
space:
mode:
author: Roberto Ierusalimschy <roberto@inf.puc-rio.br> 2019-04-17 14:08:22 -0300
committer: Roberto Ierusalimschy <roberto@inf.puc-rio.br> 2019-04-17 14:08:22 -0300
commit: 24bf757183d8bd97f6f5b43d916814f3269c8347 (patch)
tree: 646cd65d6e2dab57691f98f83f15f25c70685ef8 /lptree.c
parent: 3f7797419e4d7493e1364290a5b127d1cb45e3bf (diff)
downloadlpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.gz
lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.bz2
lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.zip
Implementation of UTF-8 ranges
New constructor 'lpeg.utfR(from, to)' creates a pattern that matches UTF-8 byte sequences representing code points in the range [from, to].
Diffstat (limited to 'lptree.c')
-rw-r--r--lptree.c55
1 files changed, 53 insertions, 2 deletions
diff --git a/lptree.c b/lptree.c
index 557090b..62acc5c 100644
--- a/lptree.c
+++ b/lptree.c
@@ -21,7 +21,7 @@
21/* number of siblings for each tree */ 21/* number of siblings for each tree */
22const byte numsiblings[] = { 22const byte numsiblings[] = {
23 0, 0, 0, /* char, set, any */ 23 0, 0, 0, /* char, set, any */
24 0, 0, /* true, false */ 24 0, 0, 0, /* true, false, utf-range */
25 1, /* rep */ 25 1, /* rep */
26 2, 2, /* seq, choice */ 26 2, 2, /* seq, choice */
27 1, 1, /* not, and */ 27 1, 1, /* not, and */
@@ -675,6 +675,56 @@ static int lp_range (lua_State *L) {
675 675
676 676
677/* 677/*
678** Fills a tree node with basic information about the UTF-8 code point
679** 'cpu': its value in 'n', its length in 'cap', and its first byte in
680** 'key'
681*/
682static void codeutftree (lua_State *L, TTree *t, lua_Unsigned cpu, int arg) {
683 int len, fb, cp;
684 cp = (int)cpu;
685 if (cp <= 0x7f) { /* one byte? */
686 len = 1;
687 fb = cp;
688 } else if (cp <= 0x7ff) {
689 len = 2;
690 fb = 0xC0 | (cp >> 6);
691 } else if (cp <= 0xffff) {
692 len = 3;
693 fb = 0xE0 | (cp >> 12);
694 }
695 else {
696 luaL_argcheck(L, cpu <= 0x10ffffu, arg, "invalid code point");
697 len = 4;
698 fb = 0xF0 | (cp >> 18);
699 }
700 t->u.n = cp;
701 t->cap = len;
702 t->key = fb;
703}
704
705
706static int lp_utfr (lua_State *L) {
707 lua_Unsigned from = (lua_Unsigned)luaL_checkinteger(L, 1);
708 lua_Unsigned to = (lua_Unsigned)luaL_checkinteger(L, 2);
709 luaL_argcheck(L, from <= to, 2, "empty range");
710 if (to <= 0x7f) { /* ascii range? */
711 TTree *tree = newcharset(L); /* code it as a regular charset */
712 unsigned int f;
713 for (f = (int)from; f <= to; f++)
714 setchar(treebuffer(tree), f);
715 }
716 else { /* multi-byte utf-8 range */
717 TTree *tree = newtree(L, 2);
718 tree->tag = TUTFR;
719 codeutftree(L, tree, from, 1);
720 sib1(tree)->tag = TXInfo;
721 codeutftree(L, sib1(tree), to, 2);
722 }
723 return 1;
724}
725
726
727/*
678** Look-behind predicate 728** Look-behind predicate
679*/ 729*/
680static int lp_behind (lua_State *L) { 730static int lp_behind (lua_State *L) {
@@ -1008,7 +1058,7 @@ static int verifyrule (lua_State *L, TTree *tree, unsigned short *passed,
1008 tailcall: 1058 tailcall:
1009 switch (tree->tag) { 1059 switch (tree->tag) {
1010 case TChar: case TSet: case TAny: 1060 case TChar: case TSet: case TAny:
1011 case TFalse: 1061 case TFalse: case TUTFR:
1012 return nb; /* cannot pass from here */ 1062 return nb; /* cannot pass from here */
1013 case TTrue: 1063 case TTrue:
1014 case TBehind: /* look-behind cannot have calls */ 1064 case TBehind: /* look-behind cannot have calls */
@@ -1271,6 +1321,7 @@ static struct luaL_Reg pattreg[] = {
1271 {"P", lp_P}, 1321 {"P", lp_P},
1272 {"S", lp_set}, 1322 {"S", lp_set},
1273 {"R", lp_range}, 1323 {"R", lp_range},
1324 {"utfR", lp_utfr},
1274 {"locale", lp_locale}, 1325 {"locale", lp_locale},
1275 {"version", lp_version}, 1326 {"version", lp_version},
1276 {"setmaxstack", lp_setmax}, 1327 {"setmaxstack", lp_setmax},