aboutsummaryrefslogtreecommitdiff
path: root/lpcode.c
diff options
context:
space:
mode:
author Roberto Ierusalimschy <roberto@inf.puc-rio.br> 2019-04-17 14:08:22 -0300
committer Roberto Ierusalimschy <roberto@inf.puc-rio.br> 2019-04-17 14:08:22 -0300
commit 24bf757183d8bd97f6f5b43d916814f3269c8347 (patch)
tree 646cd65d6e2dab57691f98f83f15f25c70685ef8 /lpcode.c
parent 3f7797419e4d7493e1364290a5b127d1cb45e3bf (diff)
download lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.gz
lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.bz2
lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.zip
Implementation of UTF-8 ranges
New constructor 'lpeg.utfR(from, to)' creates a pattern that matches UTF-8 byte sequences representing code points in the range [from, to].
Diffstat (limited to 'lpcode.c')
-rw-r--r-- lpcode.c 43
1 files changed, 32 insertions, 11 deletions
diff --git a/lpcode.c b/lpcode.c
index 5fddfab..7d47a00 100644
--- a/lpcode.c
+++ b/lpcode.c
@@ -196,7 +196,7 @@ int hascaptures (TTree *tree) {
196int checkaux (TTree *tree, int pred) { 196int checkaux (TTree *tree, int pred) {
197 tailcall: 197 tailcall:
198 switch (tree->tag) { 198 switch (tree->tag) {
199 case TChar: case TSet: case TAny: 199 case TChar: case TSet: case TAny: case TUTFR:
200 case TFalse: case TOpenCall: 200 case TFalse: case TOpenCall:
201 return 0; /* not nullable */ 201 return 0; /* not nullable */
202 case TRep: case TTrue: 202 case TRep: case TTrue:
@@ -239,6 +239,8 @@ int fixedlen (TTree *tree) {
239 switch (tree->tag) { 239 switch (tree->tag) {
240 case TChar: case TSet: case TAny: 240 case TChar: case TSet: case TAny:
241 return len + 1; 241 return len + 1;
242 case TUTFR:
243 return (tree->cap == sib1(tree)->cap) ? len + tree->cap : -1;
242 case TFalse: case TTrue: case TNot: case TAnd: case TBehind: 244 case TFalse: case TTrue: case TNot: case TAnd: case TBehind:
243 return len; 245 return len;
244 case TRep: case TRunTime: case TOpenCall: 246 case TRep: case TRunTime: case TOpenCall:
@@ -298,6 +300,13 @@ static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) {
298 tocharset(tree, firstset); 300 tocharset(tree, firstset);
299 return 0; 301 return 0;
300 } 302 }
303 case TUTFR: {
304 int c;
305 loopset(i, firstset->cs[i] = 0); /* erase all chars */
306 for (c = tree->key; c <= sib1(tree)->key; c++)
307 setchar(firstset->cs, c);
308 return 0;
309 }
301 case TTrue: { 310 case TTrue: {
302 loopset(i, firstset->cs[i] = follow->cs[i]); 311 loopset(i, firstset->cs[i] = follow->cs[i]);
303 return 1; /* accepts the empty string */ 312 return 1; /* accepts the empty string */
@@ -380,7 +389,7 @@ static int headfail (TTree *tree) {
380 case TChar: case TSet: case TAny: case TFalse: 389 case TChar: case TSet: case TAny: case TFalse:
381 return 1; 390 return 1;
382 case TTrue: case TRep: case TRunTime: case TNot: 391 case TTrue: case TRep: case TRunTime: case TNot:
383 case TBehind: 392 case TBehind: case TUTFR:
384 return 0; 393 return 0;
385 case TCapture: case TGrammar: case TRule: case TXInfo: case TAnd: 394 case TCapture: case TGrammar: case TRule: case TXInfo: case TAnd:
386 tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */ 395 tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */
@@ -407,7 +416,7 @@ static int headfail (TTree *tree) {
407static int needfollow (TTree *tree) { 416static int needfollow (TTree *tree) {
408 tailcall: 417 tailcall:
409 switch (tree->tag) { 418 switch (tree->tag) {
410 case TChar: case TSet: case TAny: 419 case TChar: case TSet: case TAny: case TUTFR:
411 case TFalse: case TTrue: case TAnd: case TNot: 420 case TFalse: case TTrue: case TAnd: case TNot:
412 case TRunTime: case TGrammar: case TCall: case TBehind: 421 case TRunTime: case TGrammar: case TCall: case TBehind:
413 return 0; 422 return 0;
@@ -418,7 +427,7 @@ static int needfollow (TTree *tree) {
418 case TSeq: 427 case TSeq:
419 tree = sib2(tree); goto tailcall; 428 tree = sib2(tree); goto tailcall;
420 default: assert(0); return 0; 429 default: assert(0); return 0;
421 } 430 }
422} 431}
423 432
424/* }====================================================== */ 433/* }====================================================== */
@@ -441,6 +450,7 @@ int sizei (const Instruction *i) {
441 case ITestSet: return CHARSETINSTSIZE + 1; 450 case ITestSet: return CHARSETINSTSIZE + 1;
442 case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall: 451 case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall:
443 case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit: 452 case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit:
453 case IUTFR:
444 return 2; 454 return 2;
445 default: return 1; 455 default: return 1;
446 } 456 }
@@ -518,6 +528,16 @@ static void setoffset (CompileState *compst, int instruction, int offset) {
518} 528}
519 529
520 530
531static void codeutfr (CompileState *compst, TTree *tree) {
532 int i = addoffsetinst(compst, IUTFR);
533 int to = sib1(tree)->u.n;
534 assert(sib1(tree)->tag == TXInfo);
535 getinstr(compst, i + 1).offset = tree->u.n;
536 getinstr(compst, i).i.aux = to & 0xff;
537 getinstr(compst, i).i.key = to >> 8;
538}
539
540
521/* 541/*
522** Add a capture instruction: 542** Add a capture instruction:
523** 'op' is the capture instruction; 'cap' the capture kind; 543** 'op' is the capture instruction; 'cap' the capture kind;
@@ -665,11 +685,11 @@ static void codebehind (CompileState *compst, TTree *tree) {
665 685
666/* 686/*
667** Choice; optimizations: 687** Choice; optimizations:
668** - when p1 is headfail or 688** - when p1 is headfail or when first(p1) and first(p2) are disjoint,
669** when first(p1) and first(p2) are disjoint, than 689** than a character not in first(p1) cannot go to p1 and a character
670** a character not in first(p1) cannot go to p1, and a character 690** in first(p1) cannot go to p2, either because p1 will accept
671** in first(p1) cannot go to p2 (at it is not in first(p2)). 691** (headfail) or because it is not in first(p2) (disjoint).
672** (The optimization is not valid if p1 accepts the empty string, 692** (The second case is not valid if p1 accepts the empty string,
673** as then there is no character at all...) 693** as then there is no character at all...)
674** - when p2 is empty and opt is true; a IPartialCommit can reuse 694** - when p2 is empty and opt is true; a IPartialCommit can reuse
675** the Choice already active in the stack. 695** the Choice already active in the stack.
@@ -686,7 +706,7 @@ static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt,
686 int jmp = NOINST; 706 int jmp = NOINST;
687 codegen(compst, p1, 0, test, fl); 707 codegen(compst, p1, 0, test, fl);
688 if (!emptyp2) 708 if (!emptyp2)
689 jmp = addoffsetinst(compst, IJmp); 709 jmp = addoffsetinst(compst, IJmp);
690 jumptohere(compst, test); 710 jumptohere(compst, test);
691 codegen(compst, p2, opt, NOINST, fl); 711 codegen(compst, p2, opt, NOINST, fl);
692 jumptohere(compst, jmp); 712 jumptohere(compst, jmp);
@@ -697,7 +717,7 @@ static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt,
697 codegen(compst, p1, 1, NOINST, fullset); 717 codegen(compst, p1, 1, NOINST, fullset);
698 } 718 }
699 else { 719 else {
700 /* <p1 / p2> == 720 /* <p1 / p2> ==
701 test(first(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: */ 721 test(first(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: */
702 int pcommit; 722 int pcommit;
703 int test = codetestset(compst, &cs1, e1); 723 int test = codetestset(compst, &cs1, e1);
@@ -927,6 +947,7 @@ static void codegen (CompileState *compst, TTree *tree, int opt, int tt,
927 case TSet: codecharset(compst, treebuffer(tree), tt); break; 947 case TSet: codecharset(compst, treebuffer(tree), tt); break;
928 case TTrue: break; 948 case TTrue: break;
929 case TFalse: addinstruction(compst, IFail, 0); break; 949 case TFalse: addinstruction(compst, IFail, 0); break;
950 case TUTFR: codeutfr(compst, tree); break;
930 case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break; 951 case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break;
931 case TRep: coderep(compst, sib1(tree), opt, fl); break; 952 case TRep: coderep(compst, sib1(tree), opt, fl); break;
932 case TBehind: codebehind(compst, tree); break; 953 case TBehind: codebehind(compst, tree); break;