diff options
author | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2023-04-27 10:32:39 -0300 |
---|---|---|
committer | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2023-04-27 10:32:39 -0300 |
commit | 012cf9c86cf91cb8354e229bde335592d41b84b2 (patch) | |
tree | 353f17797b1952eaec231c8e4fd5c21e02daf875 /lpcode.c | |
parent | 3403b0c7256435560b63f828da92026c5d4c898b (diff) | |
download | lpeg-012cf9c86cf91cb8354e229bde335592d41b84b2.tar.gz lpeg-012cf9c86cf91cb8354e229bde335592d41b84b2.tar.bz2 lpeg-012cf9c86cf91cb8354e229bde335592d41b84b2.zip |
Compact charsets used in trees, too.
Diffstat (limited to 'lpcode.c')
-rw-r--r-- | lpcode.c | 118 |
1 files changed, 38 insertions, 80 deletions
@@ -45,31 +45,6 @@ static int cs_disjoint (const Charset *cs1, const Charset *cs2) { | |||
45 | 45 | ||
46 | 46 | ||
47 | /* | 47 | /* |
48 | ** If 'tree' is a 'char' pattern (TSet, TChar, TAny), convert it into a | ||
49 | ** charset and return 1; else return 0. | ||
50 | */ | ||
51 | int tocharset (TTree *tree, Charset *cs) { | ||
52 | switch (tree->tag) { | ||
53 | case TSet: { /* copy set */ | ||
54 | loopset(i, cs->cs[i] = treebuffer(tree)[i]); | ||
55 | return 1; | ||
56 | } | ||
57 | case TChar: { /* only one char */ | ||
58 | assert(0 <= tree->u.n && tree->u.n <= UCHAR_MAX); | ||
59 | loopset(i, cs->cs[i] = 0); /* erase all chars */ | ||
60 | setchar(cs->cs, tree->u.n); /* add that one */ | ||
61 | return 1; | ||
62 | } | ||
63 | case TAny: { | ||
64 | loopset(i, cs->cs[i] = 0xFF); /* add all characters to the set */ | ||
65 | return 1; | ||
66 | } | ||
67 | default: return 0; | ||
68 | } | ||
69 | } | ||
70 | |||
71 | |||
72 | /* | ||
73 | ** Visit a TCall node taking care to stop recursion. If node not yet | 48 | ** Visit a TCall node taking care to stop recursion. If node not yet |
74 | ** visited, return 'f(sib2(tree))', otherwise return 'def' (default | 49 | ** visited, return 'f(sib2(tree))', otherwise return 'def' (default |
75 | ** value) | 50 | ** value) |
@@ -240,7 +215,7 @@ int fixedlen (TTree *tree) { | |||
240 | static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { | 215 | static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { |
241 | tailcall: | 216 | tailcall: |
242 | switch (tree->tag) { | 217 | switch (tree->tag) { |
243 | case TChar: case TSet: case TAny: { | 218 | case TChar: case TSet: case TAny: case TFalse: { |
244 | tocharset(tree, firstset); | 219 | tocharset(tree, firstset); |
245 | return 0; | 220 | return 0; |
246 | } | 221 | } |
@@ -255,10 +230,6 @@ static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { | |||
255 | loopset(i, firstset->cs[i] = follow->cs[i]); | 230 | loopset(i, firstset->cs[i] = follow->cs[i]); |
256 | return 1; /* accepts the empty string */ | 231 | return 1; /* accepts the empty string */ |
257 | } | 232 | } |
258 | case TFalse: { | ||
259 | loopset(i, firstset->cs[i] = 0); | ||
260 | return 0; | ||
261 | } | ||
262 | case TChoice: { | 233 | case TChoice: { |
263 | Charset csaux; | 234 | Charset csaux; |
264 | int e1 = getfirst(sib1(tree), follow, firstset); | 235 | int e1 = getfirst(sib1(tree), follow, firstset); |
@@ -543,37 +514,36 @@ static void codechar (CompileState *compst, int c, int tt) { | |||
543 | /* | 514 | /* |
544 | ** Add a charset posfix to an instruction. | 515 | ** Add a charset posfix to an instruction. |
545 | */ | 516 | */ |
546 | static void addcharset (CompileState *compst, int inst, const byte *cs, | 517 | static void addcharset (CompileState *compst, int inst, charsetinfo *info) { |
547 | const charsetinfo *info) { | ||
548 | int p; | 518 | int p; |
549 | Instruction *I = &getinstr(compst, inst); | 519 | Instruction *I = &getinstr(compst, inst); |
550 | byte *charset; | 520 | byte *charset; |
551 | int isize = instsize(info->size); /* size in instructions */ | 521 | int isize = instsize(info->size); /* size in instructions */ |
552 | int i; | 522 | int i; |
553 | I->i.aux2.set.offset = info->aux1 * 8; /* offset in bits */ | 523 | I->i.aux2.set.offset = info->offset * 8; /* offset in bits */ |
554 | I->i.aux2.set.size = isize; | 524 | I->i.aux2.set.size = isize; |
555 | I->i.aux1 = info->deflt; | 525 | I->i.aux1 = info->deflt; |
556 | p = nextinstruction(compst, isize); /* space for charset */ | 526 | p = nextinstruction(compst, isize); /* space for charset */ |
557 | charset = getinstr(compst, p).buff; /* charset buffer */ | 527 | charset = getinstr(compst, p).buff; /* charset buffer */ |
558 | for (i = 0; i < isize * (int)sizeof(Instruction); i++) | 528 | for (i = 0; i < isize * (int)sizeof(Instruction); i++) |
559 | charset[i] = getbytefromcharset(cs, info, i); /* fill the buffer */ | 529 | charset[i] = getbytefromcharset(info, i); /* copy the buffer */ |
560 | } | 530 | } |
561 | 531 | ||
562 | 532 | ||
563 | /* | 533 | /* |
564 | ** Check whether compact charset cs is dominated by instruction 'p' | 534 | ** Check whether charset 'info' is dominated by instruction 'p' |
565 | */ | 535 | */ |
566 | static int cs_equal (Instruction *p, const byte *cs, charsetinfo *info) { | 536 | static int cs_equal (Instruction *p, charsetinfo *info) { |
567 | if (p->i.code != ITestSet) | 537 | if (p->i.code != ITestSet) |
568 | return 0; | 538 | return 0; |
569 | else if (p->i.aux2.set.offset != info->aux1 * 8 || | 539 | else if (p->i.aux2.set.offset != info->offset * 8 || |
570 | p->i.aux2.set.size != instsize(info->size) || | 540 | p->i.aux2.set.size != instsize(info->size) || |
571 | p->i.aux1 != info->deflt) | 541 | p->i.aux1 != info->deflt) |
572 | return 0; | 542 | return 0; |
573 | else { | 543 | else { |
574 | int i; | 544 | int i; |
575 | for (i = 0; i < instsize(info->size) * (int)sizeof(Instruction); i++) { | 545 | for (i = 0; i < instsize(info->size) * (int)sizeof(Instruction); i++) { |
576 | if ((p + 2)->buff[i] != getbytefromcharset(cs, info, i)) | 546 | if ((p + 2)->buff[i] != getbytefromcharset(info, i)) |
577 | return 0; | 547 | return 0; |
578 | } | 548 | } |
579 | } | 549 | } |
@@ -582,34 +552,23 @@ static int cs_equal (Instruction *p, const byte *cs, charsetinfo *info) { | |||
582 | 552 | ||
583 | 553 | ||
584 | /* | 554 | /* |
585 | ** code a char set, optimizing unit sets for IChar, "complete" | 555 | ** Code a char set, using IAny when instruction is dominated by an |
586 | ** sets for IAny, and empty sets for IFail; also use an IAny | 556 | ** equivalent test. |
587 | ** when instruction is dominated by an equivalent test. | ||
588 | */ | 557 | */ |
589 | static void codecharset (CompileState *compst, const byte *cs, int tt) { | 558 | static void codecharset (CompileState *compst, TTree *tree, int tt) { |
590 | charsetinfo info; | 559 | charsetinfo info; |
591 | Opcode op = charsettype(cs, &info); | 560 | tree2cset(tree, &info); |
592 | switch (op) { | 561 | if (tt >= 0 && cs_equal(&getinstr(compst, tt), &info)) |
593 | case IChar: codechar(compst, info.aux1, tt); break; | 562 | addinstruction(compst, IAny, 0); |
594 | case ISet: { /* non-trivial set? */ | 563 | else { |
595 | if (tt >= 0 && cs_equal(&getinstr(compst, tt), cs, &info)) | 564 | int i = addinstruction(compst, ISet, 0); |
596 | addinstruction(compst, IAny, 0); | 565 | addcharset(compst, i, &info); |
597 | else { | ||
598 | int i = addinstruction(compst, ISet, 0); | ||
599 | addcharset(compst, i, cs, &info); | ||
600 | } | ||
601 | break; | ||
602 | } | ||
603 | default: | ||
604 | assert(op == IFail || op == IAny); | ||
605 | addinstruction(compst, op, 0); | ||
606 | break; | ||
607 | } | 566 | } |
608 | } | 567 | } |
609 | 568 | ||
610 | 569 | ||
611 | /* | 570 | /* |
612 | ** code a test set, optimizing unit sets for ITestChar, "complete" | 571 | ** Code a test set, optimizing unit sets for ITestChar, "complete" |
613 | ** sets for ITestAny, and empty sets for IJmp (always fails). | 572 | ** sets for ITestAny, and empty sets for IJmp (always fails). |
614 | ** 'e' is true iff test should accept the empty string. (Test | 573 | ** 'e' is true iff test should accept the empty string. (Test |
615 | ** instructions in the current VM never accept the empty string.) | 574 | ** instructions in the current VM never accept the empty string.) |
@@ -624,12 +583,12 @@ static int codetestset (CompileState *compst, Charset *cs, int e) { | |||
624 | case IAny: return addoffsetinst(compst, ITestAny); | 583 | case IAny: return addoffsetinst(compst, ITestAny); |
625 | case IChar: { | 584 | case IChar: { |
626 | int i = addoffsetinst(compst, ITestChar); | 585 | int i = addoffsetinst(compst, ITestChar); |
627 | getinstr(compst, i).i.aux1 = info.aux1; | 586 | getinstr(compst, i).i.aux1 = info.offset; |
628 | return i; | 587 | return i; |
629 | } | 588 | } |
630 | default: { /* regular set */ | 589 | default: { /* regular set */ |
631 | int i = addoffsetinst(compst, ITestSet); | 590 | int i = addoffsetinst(compst, ITestSet); |
632 | addcharset(compst, i, cs->cs, &info); | 591 | addcharset(compst, i, &info); |
633 | assert(op == ISet); | 592 | assert(op == ISet); |
634 | return i; | 593 | return i; |
635 | } | 594 | } |
@@ -778,34 +737,35 @@ static void closeloop (CompileState *compst, int test) { | |||
778 | 737 | ||
779 | 738 | ||
780 | /* | 739 | /* |
781 | ** Repetition of charsets: | 740 | ** Try repetition of charsets: |
782 | ** For an empty set, repetition of fail is a no-op; | 741 | ** For an empty set, repetition of fail is a no-op; |
783 | ** For any or char, code a tight loop; | 742 | ** For any or char, code a tight loop; |
784 | ** For generic charset, use a span instruction. | 743 | ** For generic charset, use a span instruction. |
785 | */ | 744 | */ |
786 | static void coderepcharset (CompileState *compst, Charset *st) { | 745 | static int coderepcharset (CompileState *compst, TTree *tree) { |
787 | charsetinfo info; | 746 | switch (tree->tag) { |
788 | Opcode op = charsettype(st->cs, &info); | 747 | case TFalse: return 1; /* 'fail*' is a no-op */ |
789 | switch (op) { | 748 | case TAny: { /* L1: testany -> L2; any; jmp L1; L2: */ |
790 | case IFail: break; /* fail* is a no-op */ | ||
791 | case IAny: { /* L1: testany -> L2; any; jmp L1; L2: */ | ||
792 | int test = addoffsetinst(compst, ITestAny); | 749 | int test = addoffsetinst(compst, ITestAny); |
793 | addinstruction(compst, IAny, 0); | 750 | addinstruction(compst, IAny, 0); |
794 | closeloop(compst, test); | 751 | closeloop(compst, test); |
795 | break; | 752 | return 1; |
796 | } | 753 | } |
797 | case IChar: { /* L1: testchar c -> L2; any; jmp L1; L2: */ | 754 | case TChar: { /* L1: testchar c -> L2; any; jmp L1; L2: */ |
798 | int test = addoffsetinst(compst, ITestChar); | 755 | int test = addoffsetinst(compst, ITestChar); |
799 | getinstr(compst, test).i.aux1 = info.aux1; | 756 | getinstr(compst, test).i.aux1 = tree->u.n; |
800 | addinstruction(compst, IAny, 0); | 757 | addinstruction(compst, IAny, 0); |
801 | closeloop(compst, test); | 758 | closeloop(compst, test); |
802 | break; | 759 | return 1; |
803 | } | 760 | } |
804 | default: { /* regular set */ | 761 | case TSet: { /* regular set */ |
762 | charsetinfo info; | ||
805 | int i = addinstruction(compst, ISpan, 0); | 763 | int i = addinstruction(compst, ISpan, 0); |
806 | addcharset(compst, i, st->cs, &info); | 764 | tree2cset(tree, &info); |
807 | assert(op == ISet); | 765 | addcharset(compst, i, &info); |
766 | return 1; | ||
808 | } | 767 | } |
768 | default: return 0; /* not a charset */ | ||
809 | } | 769 | } |
810 | } | 770 | } |
811 | 771 | ||
@@ -822,10 +782,8 @@ static void coderepcharset (CompileState *compst, Charset *st) { | |||
822 | */ | 782 | */ |
823 | static void coderep (CompileState *compst, TTree *tree, int opt, | 783 | static void coderep (CompileState *compst, TTree *tree, int opt, |
824 | const Charset *fl) { | 784 | const Charset *fl) { |
825 | Charset st; | 785 | if (!coderepcharset(compst, tree)) { |
826 | if (tocharset(tree, &st)) | 786 | Charset st; |
827 | coderepcharset(compst, &st); | ||
828 | else { | ||
829 | int e1 = getfirst(tree, fullset, &st); | 787 | int e1 = getfirst(tree, fullset, &st); |
830 | if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) { | 788 | if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) { |
831 | /* L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: */ | 789 | /* L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: */ |
@@ -965,7 +923,7 @@ static void codegen (CompileState *compst, TTree *tree, int opt, int tt, | |||
965 | switch (tree->tag) { | 923 | switch (tree->tag) { |
966 | case TChar: codechar(compst, tree->u.n, tt); break; | 924 | case TChar: codechar(compst, tree->u.n, tt); break; |
967 | case TAny: addinstruction(compst, IAny, 0); break; | 925 | case TAny: addinstruction(compst, IAny, 0); break; |
968 | case TSet: codecharset(compst, treebuffer(tree), tt); break; | 926 | case TSet: codecharset(compst, tree, tt); break; |
969 | case TTrue: break; | 927 | case TTrue: break; |
970 | case TFalse: addinstruction(compst, IFail, 0); break; | 928 | case TFalse: addinstruction(compst, IFail, 0); break; |
971 | case TUTFR: codeutfr(compst, tree); break; | 929 | case TUTFR: codeutfr(compst, tree); break; |