aboutsummaryrefslogtreecommitdiff
path: root/lpcode.c
diff options
context:
space:
mode:
authorRoberto Ierusalimschy <roberto@inf.puc-rio.br>2023-04-27 10:32:39 -0300
committerRoberto Ierusalimschy <roberto@inf.puc-rio.br>2023-04-27 10:32:39 -0300
commit012cf9c86cf91cb8354e229bde335592d41b84b2 (patch)
tree353f17797b1952eaec231c8e4fd5c21e02daf875 /lpcode.c
parent3403b0c7256435560b63f828da92026c5d4c898b (diff)
downloadlpeg-012cf9c86cf91cb8354e229bde335592d41b84b2.tar.gz
lpeg-012cf9c86cf91cb8354e229bde335592d41b84b2.tar.bz2
lpeg-012cf9c86cf91cb8354e229bde335592d41b84b2.zip
Compact charsets used in trees, too.
Diffstat (limited to 'lpcode.c')
-rw-r--r--lpcode.c118
1 files changed, 38 insertions, 80 deletions
diff --git a/lpcode.c b/lpcode.c
index 66d2f3f..9289bd3 100644
--- a/lpcode.c
+++ b/lpcode.c
@@ -45,31 +45,6 @@ static int cs_disjoint (const Charset *cs1, const Charset *cs2) {
45 45
46 46
47/* 47/*
48** If 'tree' is a 'char' pattern (TSet, TChar, TAny), convert it into a
49** charset and return 1; else return 0.
50*/
51int tocharset (TTree *tree, Charset *cs) {
52 switch (tree->tag) {
53 case TSet: { /* copy set */
54 loopset(i, cs->cs[i] = treebuffer(tree)[i]);
55 return 1;
56 }
57 case TChar: { /* only one char */
58 assert(0 <= tree->u.n && tree->u.n <= UCHAR_MAX);
59 loopset(i, cs->cs[i] = 0); /* erase all chars */
60 setchar(cs->cs, tree->u.n); /* add that one */
61 return 1;
62 }
63 case TAny: {
64 loopset(i, cs->cs[i] = 0xFF); /* add all characters to the set */
65 return 1;
66 }
67 default: return 0;
68 }
69}
70
71
72/*
73** Visit a TCall node taking care to stop recursion. If node not yet 48** Visit a TCall node taking care to stop recursion. If node not yet
74** visited, return 'f(sib2(tree))', otherwise return 'def' (default 49** visited, return 'f(sib2(tree))', otherwise return 'def' (default
75** value) 50** value)
@@ -240,7 +215,7 @@ int fixedlen (TTree *tree) {
240static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { 215static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) {
241 tailcall: 216 tailcall:
242 switch (tree->tag) { 217 switch (tree->tag) {
243 case TChar: case TSet: case TAny: { 218 case TChar: case TSet: case TAny: case TFalse: {
244 tocharset(tree, firstset); 219 tocharset(tree, firstset);
245 return 0; 220 return 0;
246 } 221 }
@@ -255,10 +230,6 @@ static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) {
255 loopset(i, firstset->cs[i] = follow->cs[i]); 230 loopset(i, firstset->cs[i] = follow->cs[i]);
256 return 1; /* accepts the empty string */ 231 return 1; /* accepts the empty string */
257 } 232 }
258 case TFalse: {
259 loopset(i, firstset->cs[i] = 0);
260 return 0;
261 }
262 case TChoice: { 233 case TChoice: {
263 Charset csaux; 234 Charset csaux;
264 int e1 = getfirst(sib1(tree), follow, firstset); 235 int e1 = getfirst(sib1(tree), follow, firstset);
@@ -543,37 +514,36 @@ static void codechar (CompileState *compst, int c, int tt) {
543/* 514/*
544** Add a charset posfix to an instruction. 515** Add a charset posfix to an instruction.
545*/ 516*/
546static void addcharset (CompileState *compst, int inst, const byte *cs, 517static void addcharset (CompileState *compst, int inst, charsetinfo *info) {
547 const charsetinfo *info) {
548 int p; 518 int p;
549 Instruction *I = &getinstr(compst, inst); 519 Instruction *I = &getinstr(compst, inst);
550 byte *charset; 520 byte *charset;
551 int isize = instsize(info->size); /* size in instructions */ 521 int isize = instsize(info->size); /* size in instructions */
552 int i; 522 int i;
553 I->i.aux2.set.offset = info->aux1 * 8; /* offset in bits */ 523 I->i.aux2.set.offset = info->offset * 8; /* offset in bits */
554 I->i.aux2.set.size = isize; 524 I->i.aux2.set.size = isize;
555 I->i.aux1 = info->deflt; 525 I->i.aux1 = info->deflt;
556 p = nextinstruction(compst, isize); /* space for charset */ 526 p = nextinstruction(compst, isize); /* space for charset */
557 charset = getinstr(compst, p).buff; /* charset buffer */ 527 charset = getinstr(compst, p).buff; /* charset buffer */
558 for (i = 0; i < isize * (int)sizeof(Instruction); i++) 528 for (i = 0; i < isize * (int)sizeof(Instruction); i++)
559 charset[i] = getbytefromcharset(cs, info, i); /* fill the buffer */ 529 charset[i] = getbytefromcharset(info, i); /* copy the buffer */
560} 530}
561 531
562 532
563/* 533/*
564** Check whether compact charset cs is dominated by instruction 'p' 534** Check whether charset 'info' is dominated by instruction 'p'
565*/ 535*/
566static int cs_equal (Instruction *p, const byte *cs, charsetinfo *info) { 536static int cs_equal (Instruction *p, charsetinfo *info) {
567 if (p->i.code != ITestSet) 537 if (p->i.code != ITestSet)
568 return 0; 538 return 0;
569 else if (p->i.aux2.set.offset != info->aux1 * 8 || 539 else if (p->i.aux2.set.offset != info->offset * 8 ||
570 p->i.aux2.set.size != instsize(info->size) || 540 p->i.aux2.set.size != instsize(info->size) ||
571 p->i.aux1 != info->deflt) 541 p->i.aux1 != info->deflt)
572 return 0; 542 return 0;
573 else { 543 else {
574 int i; 544 int i;
575 for (i = 0; i < instsize(info->size) * (int)sizeof(Instruction); i++) { 545 for (i = 0; i < instsize(info->size) * (int)sizeof(Instruction); i++) {
576 if ((p + 2)->buff[i] != getbytefromcharset(cs, info, i)) 546 if ((p + 2)->buff[i] != getbytefromcharset(info, i))
577 return 0; 547 return 0;
578 } 548 }
579 } 549 }
@@ -582,34 +552,23 @@ static int cs_equal (Instruction *p, const byte *cs, charsetinfo *info) {
582 552
583 553
584/* 554/*
585** code a char set, optimizing unit sets for IChar, "complete" 555** Code a char set, using IAny when instruction is dominated by an
586** sets for IAny, and empty sets for IFail; also use an IAny 556** equivalent test.
587** when instruction is dominated by an equivalent test.
588*/ 557*/
589static void codecharset (CompileState *compst, const byte *cs, int tt) { 558static void codecharset (CompileState *compst, TTree *tree, int tt) {
590 charsetinfo info; 559 charsetinfo info;
591 Opcode op = charsettype(cs, &info); 560 tree2cset(tree, &info);
592 switch (op) { 561 if (tt >= 0 && cs_equal(&getinstr(compst, tt), &info))
593 case IChar: codechar(compst, info.aux1, tt); break; 562 addinstruction(compst, IAny, 0);
594 case ISet: { /* non-trivial set? */ 563 else {
595 if (tt >= 0 && cs_equal(&getinstr(compst, tt), cs, &info)) 564 int i = addinstruction(compst, ISet, 0);
596 addinstruction(compst, IAny, 0); 565 addcharset(compst, i, &info);
597 else {
598 int i = addinstruction(compst, ISet, 0);
599 addcharset(compst, i, cs, &info);
600 }
601 break;
602 }
603 default:
604 assert(op == IFail || op == IAny);
605 addinstruction(compst, op, 0);
606 break;
607 } 566 }
608} 567}
609 568
610 569
611/* 570/*
612** code a test set, optimizing unit sets for ITestChar, "complete" 571** Code a test set, optimizing unit sets for ITestChar, "complete"
613** sets for ITestAny, and empty sets for IJmp (always fails). 572** sets for ITestAny, and empty sets for IJmp (always fails).
614** 'e' is true iff test should accept the empty string. (Test 573** 'e' is true iff test should accept the empty string. (Test
615** instructions in the current VM never accept the empty string.) 574** instructions in the current VM never accept the empty string.)
@@ -624,12 +583,12 @@ static int codetestset (CompileState *compst, Charset *cs, int e) {
624 case IAny: return addoffsetinst(compst, ITestAny); 583 case IAny: return addoffsetinst(compst, ITestAny);
625 case IChar: { 584 case IChar: {
626 int i = addoffsetinst(compst, ITestChar); 585 int i = addoffsetinst(compst, ITestChar);
627 getinstr(compst, i).i.aux1 = info.aux1; 586 getinstr(compst, i).i.aux1 = info.offset;
628 return i; 587 return i;
629 } 588 }
630 default: { /* regular set */ 589 default: { /* regular set */
631 int i = addoffsetinst(compst, ITestSet); 590 int i = addoffsetinst(compst, ITestSet);
632 addcharset(compst, i, cs->cs, &info); 591 addcharset(compst, i, &info);
633 assert(op == ISet); 592 assert(op == ISet);
634 return i; 593 return i;
635 } 594 }
@@ -778,34 +737,35 @@ static void closeloop (CompileState *compst, int test) {
778 737
779 738
780/* 739/*
781** Repetition of charsets: 740** Try repetition of charsets:
782** For an empty set, repetition of fail is a no-op; 741** For an empty set, repetition of fail is a no-op;
783** For any or char, code a tight loop; 742** For any or char, code a tight loop;
784** For generic charset, use a span instruction. 743** For generic charset, use a span instruction.
785*/ 744*/
786static void coderepcharset (CompileState *compst, Charset *st) { 745static int coderepcharset (CompileState *compst, TTree *tree) {
787 charsetinfo info; 746 switch (tree->tag) {
788 Opcode op = charsettype(st->cs, &info); 747 case TFalse: return 1; /* 'fail*' is a no-op */
789 switch (op) { 748 case TAny: { /* L1: testany -> L2; any; jmp L1; L2: */
790 case IFail: break; /* fail* is a no-op */
791 case IAny: { /* L1: testany -> L2; any; jmp L1; L2: */
792 int test = addoffsetinst(compst, ITestAny); 749 int test = addoffsetinst(compst, ITestAny);
793 addinstruction(compst, IAny, 0); 750 addinstruction(compst, IAny, 0);
794 closeloop(compst, test); 751 closeloop(compst, test);
795 break; 752 return 1;
796 } 753 }
797 case IChar: { /* L1: testchar c -> L2; any; jmp L1; L2: */ 754 case TChar: { /* L1: testchar c -> L2; any; jmp L1; L2: */
798 int test = addoffsetinst(compst, ITestChar); 755 int test = addoffsetinst(compst, ITestChar);
799 getinstr(compst, test).i.aux1 = info.aux1; 756 getinstr(compst, test).i.aux1 = tree->u.n;
800 addinstruction(compst, IAny, 0); 757 addinstruction(compst, IAny, 0);
801 closeloop(compst, test); 758 closeloop(compst, test);
802 break; 759 return 1;
803 } 760 }
804 default: { /* regular set */ 761 case TSet: { /* regular set */
762 charsetinfo info;
805 int i = addinstruction(compst, ISpan, 0); 763 int i = addinstruction(compst, ISpan, 0);
806 addcharset(compst, i, st->cs, &info); 764 tree2cset(tree, &info);
807 assert(op == ISet); 765 addcharset(compst, i, &info);
766 return 1;
808 } 767 }
768 default: return 0; /* not a charset */
809 } 769 }
810} 770}
811 771
@@ -822,10 +782,8 @@ static void coderepcharset (CompileState *compst, Charset *st) {
822*/ 782*/
823static void coderep (CompileState *compst, TTree *tree, int opt, 783static void coderep (CompileState *compst, TTree *tree, int opt,
824 const Charset *fl) { 784 const Charset *fl) {
825 Charset st; 785 if (!coderepcharset(compst, tree)) {
826 if (tocharset(tree, &st)) 786 Charset st;
827 coderepcharset(compst, &st);
828 else {
829 int e1 = getfirst(tree, fullset, &st); 787 int e1 = getfirst(tree, fullset, &st);
830 if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) { 788 if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) {
831 /* L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: */ 789 /* L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: */
@@ -965,7 +923,7 @@ static void codegen (CompileState *compst, TTree *tree, int opt, int tt,
965 switch (tree->tag) { 923 switch (tree->tag) {
966 case TChar: codechar(compst, tree->u.n, tt); break; 924 case TChar: codechar(compst, tree->u.n, tt); break;
967 case TAny: addinstruction(compst, IAny, 0); break; 925 case TAny: addinstruction(compst, IAny, 0); break;
968 case TSet: codecharset(compst, treebuffer(tree), tt); break; 926 case TSet: codecharset(compst, tree, tt); break;
969 case TTrue: break; 927 case TTrue: break;
970 case TFalse: addinstruction(compst, IFail, 0); break; 928 case TFalse: addinstruction(compst, IFail, 0); break;
971 case TUTFR: codeutfr(compst, tree); break; 929 case TUTFR: codeutfr(compst, tree); break;