diff options
author | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2023-04-27 10:32:39 -0300 |
---|---|---|
committer | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2023-04-27 10:32:39 -0300 |
commit | 012cf9c86cf91cb8354e229bde335592d41b84b2 (patch) | |
tree | 353f17797b1952eaec231c8e4fd5c21e02daf875 | |
parent | 3403b0c7256435560b63f828da92026c5d4c898b (diff) | |
download | lpeg-012cf9c86cf91cb8354e229bde335592d41b84b2.tar.gz lpeg-012cf9c86cf91cb8354e229bde335592d41b84b2.tar.bz2 lpeg-012cf9c86cf91cb8354e229bde335592d41b84b2.zip |
Compact charsets used in trees, too.
-rw-r--r-- | lpcode.c | 118 | ||||
-rw-r--r-- | lpcode.h | 1 | ||||
-rw-r--r-- | lpcset.c | 55 | ||||
-rw-r--r-- | lpcset.h | 15 | ||||
-rw-r--r-- | lpprint.c | 13 | ||||
-rw-r--r-- | lptree.c | 70 | ||||
-rw-r--r-- | lptree.h | 9 | ||||
-rw-r--r-- | makefile | 13 | ||||
-rwxr-xr-x | test.lua | 4 |
9 files changed, 178 insertions, 120 deletions
@@ -45,31 +45,6 @@ static int cs_disjoint (const Charset *cs1, const Charset *cs2) { | |||
45 | 45 | ||
46 | 46 | ||
47 | /* | 47 | /* |
48 | ** If 'tree' is a 'char' pattern (TSet, TChar, TAny), convert it into a | ||
49 | ** charset and return 1; else return 0. | ||
50 | */ | ||
51 | int tocharset (TTree *tree, Charset *cs) { | ||
52 | switch (tree->tag) { | ||
53 | case TSet: { /* copy set */ | ||
54 | loopset(i, cs->cs[i] = treebuffer(tree)[i]); | ||
55 | return 1; | ||
56 | } | ||
57 | case TChar: { /* only one char */ | ||
58 | assert(0 <= tree->u.n && tree->u.n <= UCHAR_MAX); | ||
59 | loopset(i, cs->cs[i] = 0); /* erase all chars */ | ||
60 | setchar(cs->cs, tree->u.n); /* add that one */ | ||
61 | return 1; | ||
62 | } | ||
63 | case TAny: { | ||
64 | loopset(i, cs->cs[i] = 0xFF); /* add all characters to the set */ | ||
65 | return 1; | ||
66 | } | ||
67 | default: return 0; | ||
68 | } | ||
69 | } | ||
70 | |||
71 | |||
72 | /* | ||
73 | ** Visit a TCall node taking care to stop recursion. If node not yet | 48 | ** Visit a TCall node taking care to stop recursion. If node not yet |
74 | ** visited, return 'f(sib2(tree))', otherwise return 'def' (default | 49 | ** visited, return 'f(sib2(tree))', otherwise return 'def' (default |
75 | ** value) | 50 | ** value) |
@@ -240,7 +215,7 @@ int fixedlen (TTree *tree) { | |||
240 | static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { | 215 | static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { |
241 | tailcall: | 216 | tailcall: |
242 | switch (tree->tag) { | 217 | switch (tree->tag) { |
243 | case TChar: case TSet: case TAny: { | 218 | case TChar: case TSet: case TAny: case TFalse: { |
244 | tocharset(tree, firstset); | 219 | tocharset(tree, firstset); |
245 | return 0; | 220 | return 0; |
246 | } | 221 | } |
@@ -255,10 +230,6 @@ static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { | |||
255 | loopset(i, firstset->cs[i] = follow->cs[i]); | 230 | loopset(i, firstset->cs[i] = follow->cs[i]); |
256 | return 1; /* accepts the empty string */ | 231 | return 1; /* accepts the empty string */ |
257 | } | 232 | } |
258 | case TFalse: { | ||
259 | loopset(i, firstset->cs[i] = 0); | ||
260 | return 0; | ||
261 | } | ||
262 | case TChoice: { | 233 | case TChoice: { |
263 | Charset csaux; | 234 | Charset csaux; |
264 | int e1 = getfirst(sib1(tree), follow, firstset); | 235 | int e1 = getfirst(sib1(tree), follow, firstset); |
@@ -543,37 +514,36 @@ static void codechar (CompileState *compst, int c, int tt) { | |||
543 | /* | 514 | /* |
544 | ** Add a charset posfix to an instruction. | 515 | ** Add a charset posfix to an instruction. |
545 | */ | 516 | */ |
546 | static void addcharset (CompileState *compst, int inst, const byte *cs, | 517 | static void addcharset (CompileState *compst, int inst, charsetinfo *info) { |
547 | const charsetinfo *info) { | ||
548 | int p; | 518 | int p; |
549 | Instruction *I = &getinstr(compst, inst); | 519 | Instruction *I = &getinstr(compst, inst); |
550 | byte *charset; | 520 | byte *charset; |
551 | int isize = instsize(info->size); /* size in instructions */ | 521 | int isize = instsize(info->size); /* size in instructions */ |
552 | int i; | 522 | int i; |
553 | I->i.aux2.set.offset = info->aux1 * 8; /* offset in bits */ | 523 | I->i.aux2.set.offset = info->offset * 8; /* offset in bits */ |
554 | I->i.aux2.set.size = isize; | 524 | I->i.aux2.set.size = isize; |
555 | I->i.aux1 = info->deflt; | 525 | I->i.aux1 = info->deflt; |
556 | p = nextinstruction(compst, isize); /* space for charset */ | 526 | p = nextinstruction(compst, isize); /* space for charset */ |
557 | charset = getinstr(compst, p).buff; /* charset buffer */ | 527 | charset = getinstr(compst, p).buff; /* charset buffer */ |
558 | for (i = 0; i < isize * (int)sizeof(Instruction); i++) | 528 | for (i = 0; i < isize * (int)sizeof(Instruction); i++) |
559 | charset[i] = getbytefromcharset(cs, info, i); /* fill the buffer */ | 529 | charset[i] = getbytefromcharset(info, i); /* copy the buffer */ |
560 | } | 530 | } |
561 | 531 | ||
562 | 532 | ||
563 | /* | 533 | /* |
564 | ** Check whether compact charset cs is dominated by instruction 'p' | 534 | ** Check whether charset 'info' is dominated by instruction 'p' |
565 | */ | 535 | */ |
566 | static int cs_equal (Instruction *p, const byte *cs, charsetinfo *info) { | 536 | static int cs_equal (Instruction *p, charsetinfo *info) { |
567 | if (p->i.code != ITestSet) | 537 | if (p->i.code != ITestSet) |
568 | return 0; | 538 | return 0; |
569 | else if (p->i.aux2.set.offset != info->aux1 * 8 || | 539 | else if (p->i.aux2.set.offset != info->offset * 8 || |
570 | p->i.aux2.set.size != instsize(info->size) || | 540 | p->i.aux2.set.size != instsize(info->size) || |
571 | p->i.aux1 != info->deflt) | 541 | p->i.aux1 != info->deflt) |
572 | return 0; | 542 | return 0; |
573 | else { | 543 | else { |
574 | int i; | 544 | int i; |
575 | for (i = 0; i < instsize(info->size) * (int)sizeof(Instruction); i++) { | 545 | for (i = 0; i < instsize(info->size) * (int)sizeof(Instruction); i++) { |
576 | if ((p + 2)->buff[i] != getbytefromcharset(cs, info, i)) | 546 | if ((p + 2)->buff[i] != getbytefromcharset(info, i)) |
577 | return 0; | 547 | return 0; |
578 | } | 548 | } |
579 | } | 549 | } |
@@ -582,34 +552,23 @@ static int cs_equal (Instruction *p, const byte *cs, charsetinfo *info) { | |||
582 | 552 | ||
583 | 553 | ||
584 | /* | 554 | /* |
585 | ** code a char set, optimizing unit sets for IChar, "complete" | 555 | ** Code a char set, using IAny when instruction is dominated by an |
586 | ** sets for IAny, and empty sets for IFail; also use an IAny | 556 | ** equivalent test. |
587 | ** when instruction is dominated by an equivalent test. | ||
588 | */ | 557 | */ |
589 | static void codecharset (CompileState *compst, const byte *cs, int tt) { | 558 | static void codecharset (CompileState *compst, TTree *tree, int tt) { |
590 | charsetinfo info; | 559 | charsetinfo info; |
591 | Opcode op = charsettype(cs, &info); | 560 | tree2cset(tree, &info); |
592 | switch (op) { | 561 | if (tt >= 0 && cs_equal(&getinstr(compst, tt), &info)) |
593 | case IChar: codechar(compst, info.aux1, tt); break; | 562 | addinstruction(compst, IAny, 0); |
594 | case ISet: { /* non-trivial set? */ | 563 | else { |
595 | if (tt >= 0 && cs_equal(&getinstr(compst, tt), cs, &info)) | 564 | int i = addinstruction(compst, ISet, 0); |
596 | addinstruction(compst, IAny, 0); | 565 | addcharset(compst, i, &info); |
597 | else { | ||
598 | int i = addinstruction(compst, ISet, 0); | ||
599 | addcharset(compst, i, cs, &info); | ||
600 | } | ||
601 | break; | ||
602 | } | ||
603 | default: | ||
604 | assert(op == IFail || op == IAny); | ||
605 | addinstruction(compst, op, 0); | ||
606 | break; | ||
607 | } | 566 | } |
608 | } | 567 | } |
609 | 568 | ||
610 | 569 | ||
611 | /* | 570 | /* |
612 | ** code a test set, optimizing unit sets for ITestChar, "complete" | 571 | ** Code a test set, optimizing unit sets for ITestChar, "complete" |
613 | ** sets for ITestAny, and empty sets for IJmp (always fails). | 572 | ** sets for ITestAny, and empty sets for IJmp (always fails). |
614 | ** 'e' is true iff test should accept the empty string. (Test | 573 | ** 'e' is true iff test should accept the empty string. (Test |
615 | ** instructions in the current VM never accept the empty string.) | 574 | ** instructions in the current VM never accept the empty string.) |
@@ -624,12 +583,12 @@ static int codetestset (CompileState *compst, Charset *cs, int e) { | |||
624 | case IAny: return addoffsetinst(compst, ITestAny); | 583 | case IAny: return addoffsetinst(compst, ITestAny); |
625 | case IChar: { | 584 | case IChar: { |
626 | int i = addoffsetinst(compst, ITestChar); | 585 | int i = addoffsetinst(compst, ITestChar); |
627 | getinstr(compst, i).i.aux1 = info.aux1; | 586 | getinstr(compst, i).i.aux1 = info.offset; |
628 | return i; | 587 | return i; |
629 | } | 588 | } |
630 | default: { /* regular set */ | 589 | default: { /* regular set */ |
631 | int i = addoffsetinst(compst, ITestSet); | 590 | int i = addoffsetinst(compst, ITestSet); |
632 | addcharset(compst, i, cs->cs, &info); | 591 | addcharset(compst, i, &info); |
633 | assert(op == ISet); | 592 | assert(op == ISet); |
634 | return i; | 593 | return i; |
635 | } | 594 | } |
@@ -778,34 +737,35 @@ static void closeloop (CompileState *compst, int test) { | |||
778 | 737 | ||
779 | 738 | ||
780 | /* | 739 | /* |
781 | ** Repetition of charsets: | 740 | ** Try repetition of charsets: |
782 | ** For an empty set, repetition of fail is a no-op; | 741 | ** For an empty set, repetition of fail is a no-op; |
783 | ** For any or char, code a tight loop; | 742 | ** For any or char, code a tight loop; |
784 | ** For generic charset, use a span instruction. | 743 | ** For generic charset, use a span instruction. |
785 | */ | 744 | */ |
786 | static void coderepcharset (CompileState *compst, Charset *st) { | 745 | static int coderepcharset (CompileState *compst, TTree *tree) { |
787 | charsetinfo info; | 746 | switch (tree->tag) { |
788 | Opcode op = charsettype(st->cs, &info); | 747 | case TFalse: return 1; /* 'fail*' is a no-op */ |
789 | switch (op) { | 748 | case TAny: { /* L1: testany -> L2; any; jmp L1; L2: */ |
790 | case IFail: break; /* fail* is a no-op */ | ||
791 | case IAny: { /* L1: testany -> L2; any; jmp L1; L2: */ | ||
792 | int test = addoffsetinst(compst, ITestAny); | 749 | int test = addoffsetinst(compst, ITestAny); |
793 | addinstruction(compst, IAny, 0); | 750 | addinstruction(compst, IAny, 0); |
794 | closeloop(compst, test); | 751 | closeloop(compst, test); |
795 | break; | 752 | return 1; |
796 | } | 753 | } |
797 | case IChar: { /* L1: testchar c -> L2; any; jmp L1; L2: */ | 754 | case TChar: { /* L1: testchar c -> L2; any; jmp L1; L2: */ |
798 | int test = addoffsetinst(compst, ITestChar); | 755 | int test = addoffsetinst(compst, ITestChar); |
799 | getinstr(compst, test).i.aux1 = info.aux1; | 756 | getinstr(compst, test).i.aux1 = tree->u.n; |
800 | addinstruction(compst, IAny, 0); | 757 | addinstruction(compst, IAny, 0); |
801 | closeloop(compst, test); | 758 | closeloop(compst, test); |
802 | break; | 759 | return 1; |
803 | } | 760 | } |
804 | default: { /* regular set */ | 761 | case TSet: { /* regular set */ |
762 | charsetinfo info; | ||
805 | int i = addinstruction(compst, ISpan, 0); | 763 | int i = addinstruction(compst, ISpan, 0); |
806 | addcharset(compst, i, st->cs, &info); | 764 | tree2cset(tree, &info); |
807 | assert(op == ISet); | 765 | addcharset(compst, i, &info); |
766 | return 1; | ||
808 | } | 767 | } |
768 | default: return 0; /* not a charset */ | ||
809 | } | 769 | } |
810 | } | 770 | } |
811 | 771 | ||
@@ -822,10 +782,8 @@ static void coderepcharset (CompileState *compst, Charset *st) { | |||
822 | */ | 782 | */ |
823 | static void coderep (CompileState *compst, TTree *tree, int opt, | 783 | static void coderep (CompileState *compst, TTree *tree, int opt, |
824 | const Charset *fl) { | 784 | const Charset *fl) { |
825 | Charset st; | 785 | if (!coderepcharset(compst, tree)) { |
826 | if (tocharset(tree, &st)) | 786 | Charset st; |
827 | coderepcharset(compst, &st); | ||
828 | else { | ||
829 | int e1 = getfirst(tree, fullset, &st); | 787 | int e1 = getfirst(tree, fullset, &st); |
830 | if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) { | 788 | if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) { |
831 | /* L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: */ | 789 | /* L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: */ |
@@ -965,7 +923,7 @@ static void codegen (CompileState *compst, TTree *tree, int opt, int tt, | |||
965 | switch (tree->tag) { | 923 | switch (tree->tag) { |
966 | case TChar: codechar(compst, tree->u.n, tt); break; | 924 | case TChar: codechar(compst, tree->u.n, tt); break; |
967 | case TAny: addinstruction(compst, IAny, 0); break; | 925 | case TAny: addinstruction(compst, IAny, 0); break; |
968 | case TSet: codecharset(compst, treebuffer(tree), tt); break; | 926 | case TSet: codecharset(compst, tree, tt); break; |
969 | case TTrue: break; | 927 | case TTrue: break; |
970 | case TFalse: addinstruction(compst, IFail, 0); break; | 928 | case TFalse: addinstruction(compst, IFail, 0); break; |
971 | case TUTFR: codeutfr(compst, tree); break; | 929 | case TUTFR: codeutfr(compst, tree); break; |
@@ -8,7 +8,6 @@ | |||
8 | #include "lptree.h" | 8 | #include "lptree.h" |
9 | #include "lpvm.h" | 9 | #include "lpvm.h" |
10 | 10 | ||
11 | int tocharset (TTree *tree, Charset *cs); | ||
12 | int checkaux (TTree *tree, int pred); | 11 | int checkaux (TTree *tree, int pred); |
13 | int fixedlen (TTree *tree); | 12 | int fixedlen (TTree *tree); |
14 | int hascaptures (TTree *tree); | 13 | int hascaptures (TTree *tree); |
@@ -16,7 +16,7 @@ static int onlybit (int c, int b) { | |||
16 | 16 | ||
17 | /* | 17 | /* |
18 | ** Check whether a charset is empty (returns IFail), singleton (IChar), | 18 | ** Check whether a charset is empty (returns IFail), singleton (IChar), |
19 | ** full (IAny), or none of those (ISet). When singleton, 'info.aux1' | 19 | ** full (IAny), or none of those (ISet). When singleton, 'info.offset' |
20 | ** returns which character it is. When generic set, 'info' returns | 20 | ** returns which character it is. When generic set, 'info' returns |
21 | ** information about its range. | 21 | ** information about its range. |
22 | */ | 22 | */ |
@@ -31,7 +31,7 @@ Opcode charsettype (const byte *cs, charsetinfo *info) { | |||
31 | if (low1 == high1) { /* only one byte with 1-bits? */ | 31 | if (low1 == high1) { /* only one byte with 1-bits? */ |
32 | int b = cs[low1]; | 32 | int b = cs[low1]; |
33 | if ((b & (b - 1)) == 0) { /* does byte has only one 1-bit? */ | 33 | if ((b & (b - 1)) == 0) { /* does byte has only one 1-bit? */ |
34 | info->aux1 = onlybit(low1 * BITSPERCHAR, b); /* get that bit */ | 34 | info->offset = onlybit(low1 * BITSPERCHAR, b); /* get that bit */ |
35 | return IChar; /* single character */ | 35 | return IChar; /* single character */ |
36 | } | 36 | } |
37 | } | 37 | } |
@@ -42,15 +42,16 @@ Opcode charsettype (const byte *cs, charsetinfo *info) { | |||
42 | for (high0 = CHARSETSIZE - 1; cs[high0] == 0xFF; high0--) | 42 | for (high0 = CHARSETSIZE - 1; cs[high0] == 0xFF; high0--) |
43 | /* find highest byte with a 0-bit; low0 is a sentinel */; | 43 | /* find highest byte with a 0-bit; low0 is a sentinel */; |
44 | if (high1 - low1 <= high0 - low0) { /* range of 1s smaller than of 0s? */ | 44 | if (high1 - low1 <= high0 - low0) { /* range of 1s smaller than of 0s? */ |
45 | info->aux1 = low1; | 45 | info->offset = low1; |
46 | info->size = high1 - low1 + 1; | 46 | info->size = high1 - low1 + 1; |
47 | info->deflt = 0; /* all discharged bits were 0 */ | 47 | info->deflt = 0; /* all discharged bits were 0 */ |
48 | } | 48 | } |
49 | else { | 49 | else { |
50 | info->aux1 = low0; | 50 | info->offset = low0; |
51 | info->size = high0 - low0 + 1; | 51 | info->size = high0 - low0 + 1; |
52 | info->deflt = 0xFF; /* all discharged bits were 1 */ | 52 | info->deflt = 0xFF; /* all discharged bits were 1 */ |
53 | } | 53 | } |
54 | info->cs = cs + info->offset; | ||
54 | return ISet; | 55 | return ISet; |
55 | } | 56 | } |
56 | 57 | ||
@@ -60,10 +61,50 @@ Opcode charsettype (const byte *cs, charsetinfo *info) { | |||
60 | ** range, get the byte from the supporting charset (correcting it | 61 | ** range, get the byte from the supporting charset (correcting it |
61 | ** by the offset). Otherwise, return the default for the set. | 62 | ** by the offset). Otherwise, return the default for the set. |
62 | */ | 63 | */ |
63 | byte getbytefromcharset (const byte *cs, const charsetinfo *info, | 64 | byte getbytefromcharset (const charsetinfo *info, int index) { |
64 | int index) { | ||
65 | if (index < info->size) | 65 | if (index < info->size) |
66 | return cs[info->aux1 + index]; | 66 | return info->cs[index]; |
67 | else return info->deflt; | 67 | else return info->deflt; |
68 | } | 68 | } |
69 | 69 | ||
70 | |||
71 | /* | ||
72 | ** If 'tree' is a 'char' pattern (TSet, TChar, TAny, TFalse), convert it | ||
73 | ** into a charset and return 1; else return 0. | ||
74 | */ | ||
75 | int tocharset (TTree *tree, Charset *cs) { | ||
76 | switch (tree->tag) { | ||
77 | case TChar: { /* only one char */ | ||
78 | assert(0 <= tree->u.n && tree->u.n <= UCHAR_MAX); | ||
79 | loopset(i, cs->cs[i] = 0); /* erase all chars */ | ||
80 | setchar(cs->cs, tree->u.n); /* add that one */ | ||
81 | return 1; | ||
82 | } | ||
83 | case TAny: { | ||
84 | loopset(i, cs->cs[i] = 0xFF); /* add all characters to the set */ | ||
85 | return 1; | ||
86 | } | ||
87 | case TFalse: { | ||
88 | loopset(i, cs->cs[i] = 0); /* empty set */ | ||
89 | return 1; | ||
90 | } | ||
91 | case TSet: { /* fill set */ | ||
92 | int i; | ||
93 | loopset(j, cs->cs[j] = tree->u.set.deflt); | ||
94 | for (i = 0; i < tree->u.set.size; i++) | ||
95 | cs->cs[tree->u.set.offset + i] = treebuffer(tree)[i]; | ||
96 | return 1; | ||
97 | } | ||
98 | default: return 0; | ||
99 | } | ||
100 | } | ||
101 | |||
102 | |||
103 | void tree2cset (TTree *tree, charsetinfo *info) { | ||
104 | assert(tree->tag == TSet); | ||
105 | info->offset = tree->u.set.offset; | ||
106 | info->size = tree->u.set.size; | ||
107 | info->deflt = tree->u.set.deflt; | ||
108 | info->cs = treebuffer(tree); | ||
109 | } | ||
110 | |||
@@ -4,22 +4,27 @@ | |||
4 | 4 | ||
5 | #include "lpcset.h" | 5 | #include "lpcset.h" |
6 | #include "lpcode.h" | 6 | #include "lpcode.h" |
7 | #include "lptree.h" | ||
7 | 8 | ||
8 | 9 | ||
9 | /* | 10 | /* |
10 | ** Extra information for the result of 'charsettype'. When result is | 11 | ** Extra information for the result of 'charsettype'. When result is |
11 | ** IChar, 'aux1' is the character. When result is ISet, 'aux1' is the | 12 | ** IChar, 'offset' is the character. When result is ISet, 'cs' is the |
12 | ** offset (in bytes), 'size' is the size (in bytes), and | 13 | ** supporting bit array (with offset included), 'offset' is the offset |
13 | ** 'delt' is the default value for bytes outside the set. | 14 | ** (in bytes), 'size' is the size (in bytes), and 'delt' is the default |
15 | ** value for bytes outside the set. | ||
14 | */ | 16 | */ |
15 | typedef struct { | 17 | typedef struct { |
16 | int aux1; | 18 | const byte *cs; |
19 | int offset; | ||
17 | int size; | 20 | int size; |
18 | int deflt; | 21 | int deflt; |
19 | } charsetinfo; | 22 | } charsetinfo; |
20 | 23 | ||
21 | 24 | ||
25 | int tocharset (TTree *tree, Charset *cs); | ||
22 | Opcode charsettype (const byte *cs, charsetinfo *info); | 26 | Opcode charsettype (const byte *cs, charsetinfo *info); |
23 | byte getbytefromcharset (const byte *cs, const charsetinfo *info, int index); | 27 | byte getbytefromcharset (const charsetinfo *info, int index); |
28 | void tree2cset (TTree *tree, charsetinfo *info); | ||
24 | 29 | ||
25 | #endif | 30 | #endif |
@@ -46,6 +46,17 @@ static void printIcharset (const Instruction *inst, const byte *buff) { | |||
46 | } | 46 | } |
47 | 47 | ||
48 | 48 | ||
49 | static void printTcharset (TTree *tree) { | ||
50 | byte cs[CHARSETSIZE]; | ||
51 | int i; | ||
52 | printf("(%02x-%d) ", tree->u.set.offset, tree->u.set.size); | ||
53 | loopset(j, cs[j] = tree->u.set.deflt); | ||
54 | for (i = 0; i < tree->u.set.size; i++) | ||
55 | cs[tree->u.set.offset + i] = treebuffer(tree)[i]; | ||
56 | printcharset(cs); | ||
57 | } | ||
58 | |||
59 | |||
49 | static const char *capkind (int kind) { | 60 | static const char *capkind (int kind) { |
50 | const char *const modes[] = { | 61 | const char *const modes[] = { |
51 | "close", "position", "constant", "backref", | 62 | "close", "position", "constant", "backref", |
@@ -186,7 +197,7 @@ void printtree (TTree *tree, int ident) { | |||
186 | break; | 197 | break; |
187 | } | 198 | } |
188 | case TSet: { | 199 | case TSet: { |
189 | printcharset(treebuffer(tree)); | 200 | printTcharset(tree); |
190 | printf("\n"); | 201 | printf("\n"); |
191 | break; | 202 | break; |
192 | } | 203 | } |
@@ -12,6 +12,7 @@ | |||
12 | #include "lpcode.h" | 12 | #include "lpcode.h" |
13 | #include "lpprint.h" | 13 | #include "lpprint.h" |
14 | #include "lptree.h" | 14 | #include "lptree.h" |
15 | #include "lpcset.h" | ||
15 | 16 | ||
16 | 17 | ||
17 | /* number of siblings for each tree */ | 18 | /* number of siblings for each tree */ |
@@ -370,14 +371,37 @@ static TTree *newleaf (lua_State *L, int tag) { | |||
370 | } | 371 | } |
371 | 372 | ||
372 | 373 | ||
373 | static TTree *newcharset (lua_State *L) { | 374 | static TTree *newccharset (lua_State *L, byte *cs, charsetinfo *info) { |
374 | TTree *tree = newtree(L, bytes2slots(CHARSETSIZE) + 1); | 375 | int i; |
376 | TTree *tree = newtree(L, bytes2slots(info->size) + 1); | ||
375 | tree->tag = TSet; | 377 | tree->tag = TSet; |
376 | loopset(i, treebuffer(tree)[i] = 0); | 378 | tree->u.set.offset = info->offset; |
379 | tree->u.set.size = info->size; | ||
380 | tree->u.set.deflt = info->deflt; | ||
381 | for (i = 0; i < info->size; i++) | ||
382 | treebuffer(tree)[i] = cs[info->offset + i]; | ||
377 | return tree; | 383 | return tree; |
378 | } | 384 | } |
379 | 385 | ||
380 | 386 | ||
387 | static TTree *newcharset (lua_State *L, byte *cs) { | ||
388 | charsetinfo info; | ||
389 | Opcode op = charsettype(cs, &info); | ||
390 | switch (op) { | ||
391 | case IFail: return newleaf(L, TFalse); | ||
392 | case IAny: return newleaf(L, TAny); | ||
393 | case IChar: { | ||
394 | TTree *tree =newleaf(L, TChar); | ||
395 | tree->u.n = info.offset; | ||
396 | return tree; | ||
397 | } | ||
398 | default: | ||
399 | assert(op == ISet); | ||
400 | return newccharset(L, cs, &info); | ||
401 | } | ||
402 | } | ||
403 | |||
404 | |||
381 | /* | 405 | /* |
382 | ** add to tree a sequence where first sibling is 'sib' (with size | 406 | ** add to tree a sequence where first sibling is 'sib' (with size |
383 | ** 'sibsize'); returns position for second sibling | 407 | ** 'sibsize'); returns position for second sibling |
@@ -549,8 +573,8 @@ static int lp_choice (lua_State *L) { | |||
549 | TTree *t1 = getpatt(L, 1, NULL); | 573 | TTree *t1 = getpatt(L, 1, NULL); |
550 | TTree *t2 = getpatt(L, 2, NULL); | 574 | TTree *t2 = getpatt(L, 2, NULL); |
551 | if (tocharset(t1, &st1) && tocharset(t2, &st2)) { | 575 | if (tocharset(t1, &st1) && tocharset(t2, &st2)) { |
552 | TTree *t = newcharset(L); | 576 | loopset(i, st1.cs[i] |= st2.cs[i]); |
553 | loopset(i, treebuffer(t)[i] = st1.cs[i] | st2.cs[i]); | 577 | newcharset(L, st1.cs); |
554 | } | 578 | } |
555 | else if (nofail(t1) || t2->tag == TFalse) | 579 | else if (nofail(t1) || t2->tag == TFalse) |
556 | lua_pushvalue(L, 1); /* true / x => true, x / false => x */ | 580 | lua_pushvalue(L, 1); /* true / x => true, x / false => x */ |
@@ -626,8 +650,8 @@ static int lp_sub (lua_State *L) { | |||
626 | TTree *t1 = getpatt(L, 1, &s1); | 650 | TTree *t1 = getpatt(L, 1, &s1); |
627 | TTree *t2 = getpatt(L, 2, &s2); | 651 | TTree *t2 = getpatt(L, 2, &s2); |
628 | if (tocharset(t1, &st1) && tocharset(t2, &st2)) { | 652 | if (tocharset(t1, &st1) && tocharset(t2, &st2)) { |
629 | TTree *t = newcharset(L); | 653 | loopset(i, st1.cs[i] &= ~st2.cs[i]); |
630 | loopset(i, treebuffer(t)[i] = st1.cs[i] & ~st2.cs[i]); | 654 | newcharset(L, st1.cs); |
631 | } | 655 | } |
632 | else { | 656 | else { |
633 | TTree *tree = newtree(L, 2 + s1 + s2); | 657 | TTree *tree = newtree(L, 2 + s1 + s2); |
@@ -645,11 +669,13 @@ static int lp_sub (lua_State *L) { | |||
645 | static int lp_set (lua_State *L) { | 669 | static int lp_set (lua_State *L) { |
646 | size_t l; | 670 | size_t l; |
647 | const char *s = luaL_checklstring(L, 1, &l); | 671 | const char *s = luaL_checklstring(L, 1, &l); |
648 | TTree *tree = newcharset(L); | 672 | byte buff[CHARSETSIZE]; |
673 | loopset(i, buff[i] = 0); | ||
649 | while (l--) { | 674 | while (l--) { |
650 | setchar(treebuffer(tree), (byte)(*s)); | 675 | setchar(buff, (byte)(*s)); |
651 | s++; | 676 | s++; |
652 | } | 677 | } |
678 | newcharset(L, buff); | ||
653 | return 1; | 679 | return 1; |
654 | } | 680 | } |
655 | 681 | ||
@@ -657,15 +683,17 @@ static int lp_set (lua_State *L) { | |||
657 | static int lp_range (lua_State *L) { | 683 | static int lp_range (lua_State *L) { |
658 | int arg; | 684 | int arg; |
659 | int top = lua_gettop(L); | 685 | int top = lua_gettop(L); |
660 | TTree *tree = newcharset(L); | 686 | byte buff[CHARSETSIZE]; |
687 | loopset(i, buff[i] = 0); | ||
661 | for (arg = 1; arg <= top; arg++) { | 688 | for (arg = 1; arg <= top; arg++) { |
662 | int c; | 689 | int c; |
663 | size_t l; | 690 | size_t l; |
664 | const char *r = luaL_checklstring(L, arg, &l); | 691 | const char *r = luaL_checklstring(L, arg, &l); |
665 | luaL_argcheck(L, l == 2, arg, "range must have two characters"); | 692 | luaL_argcheck(L, l == 2, arg, "range must have two characters"); |
666 | for (c = (byte)r[0]; c <= (byte)r[1]; c++) | 693 | for (c = (byte)r[0]; c <= (byte)r[1]; c++) |
667 | setchar(treebuffer(tree), c); | 694 | setchar(buff, c); |
668 | } | 695 | } |
696 | newcharset(L, buff); | ||
669 | return 1; | 697 | return 1; |
670 | } | 698 | } |
671 | 699 | ||
@@ -704,10 +732,12 @@ static int lp_utfr (lua_State *L) { | |||
704 | lua_Unsigned to = (lua_Unsigned)luaL_checkinteger(L, 2); | 732 | lua_Unsigned to = (lua_Unsigned)luaL_checkinteger(L, 2); |
705 | luaL_argcheck(L, from <= to, 2, "empty range"); | 733 | luaL_argcheck(L, from <= to, 2, "empty range"); |
706 | if (to <= 0x7f) { /* ascii range? */ | 734 | if (to <= 0x7f) { /* ascii range? */ |
707 | TTree *tree = newcharset(L); /* code it as a regular charset */ | ||
708 | unsigned int f; | 735 | unsigned int f; |
736 | byte buff[CHARSETSIZE]; /* code it as a regular charset */ | ||
737 | loopset(i, buff[i] = 0); | ||
709 | for (f = (int)from; f <= to; f++) | 738 | for (f = (int)from; f <= to; f++) |
710 | setchar(treebuffer(tree), f); | 739 | setchar(buff, f); |
740 | newcharset(L, buff); | ||
711 | } | 741 | } |
712 | else { /* multi-byte utf-8 range */ | 742 | else { /* multi-byte utf-8 range */ |
713 | TTree *tree = newtree(L, 2); | 743 | TTree *tree = newtree(L, 2); |
@@ -1261,11 +1291,17 @@ int lp_gc (lua_State *L) { | |||
1261 | } | 1291 | } |
1262 | 1292 | ||
1263 | 1293 | ||
1294 | /* | ||
1295 | ** Create a charset representing a category of characters, given by | ||
1296 | ** the predicate 'catf'. | ||
1297 | */ | ||
1264 | static void createcat (lua_State *L, const char *catname, int (catf) (int)) { | 1298 | static void createcat (lua_State *L, const char *catname, int (catf) (int)) { |
1265 | TTree *t = newcharset(L); | 1299 | int c; |
1266 | int i; | 1300 | byte buff[CHARSETSIZE]; |
1267 | for (i = 0; i <= UCHAR_MAX; i++) | 1301 | loopset(i, buff[i] = 0); |
1268 | if (catf(i)) setchar(treebuffer(t), i); | 1302 | for (c = 0; c <= UCHAR_MAX; c++) |
1303 | if (catf(c)) setchar(buff, c); | ||
1304 | newcharset(L, buff); | ||
1269 | lua_setfield(L, -2, catname); | 1305 | lua_setfield(L, -2, catname); |
1270 | } | 1306 | } |
1271 | 1307 | ||
@@ -3,7 +3,7 @@ | |||
3 | #define lptree_h | 3 | #define lptree_h |
4 | 4 | ||
5 | 5 | ||
6 | #include "lptypes.h" | 6 | #include "lptypes.h" |
7 | 7 | ||
8 | 8 | ||
9 | /* | 9 | /* |
@@ -11,7 +11,7 @@ | |||
11 | */ | 11 | */ |
12 | typedef enum TTag { | 12 | typedef enum TTag { |
13 | TChar = 0, /* 'n' = char */ | 13 | TChar = 0, /* 'n' = char */ |
14 | TSet, /* the set is stored in next CHARSETSIZE bytes */ | 14 | TSet, /* the set is encoded in 'u.set' and the next 'u.set.size' bytes */ |
15 | TAny, | 15 | TAny, |
16 | TTrue, | 16 | TTrue, |
17 | TFalse, | 17 | TFalse, |
@@ -52,6 +52,11 @@ typedef struct TTree { | |||
52 | union { | 52 | union { |
53 | int ps; /* occasional second child */ | 53 | int ps; /* occasional second child */ |
54 | int n; /* occasional counter */ | 54 | int n; /* occasional counter */ |
55 | struct { | ||
56 | byte offset; /* compact set offset (in bytes) */ | ||
57 | byte size; /* compact set size (in bytes) */ | ||
58 | byte deflt; /* default value */ | ||
59 | } set; /* for compact sets */ | ||
55 | } u; | 60 | } u; |
56 | } TTree; | 61 | } TTree; |
57 | 62 | ||
@@ -1,9 +1,9 @@ | |||
1 | LIBNAME = lpeg | 1 | LIBNAME = lpeg |
2 | LUADIR = ./lua/ | 2 | LUADIR = ./lua/ |
3 | 3 | ||
4 | COPT = -O2 -DNDEBUG | 4 | # COPT = -O2 -DNDEBUG |
5 | # COPT = -g | 5 | # COPT = -g |
6 | # COPT = -DLPEG_DEBUG | 6 | COPT = -DLPEG_DEBUG -g |
7 | 7 | ||
8 | CWARNS = -Wall -Wextra -pedantic \ | 8 | CWARNS = -Wall -Wextra -pedantic \ |
9 | -Waggregate-return \ | 9 | -Waggregate-return \ |
@@ -49,8 +49,9 @@ clean: | |||
49 | 49 | ||
50 | 50 | ||
51 | lpcap.o: lpcap.c lpcap.h lptypes.h | 51 | lpcap.o: lpcap.c lpcap.h lptypes.h |
52 | lpcode.o: lpcode.c lptypes.h lpcode.h lptree.h lpvm.h lpcap.h | 52 | lpcode.o: lpcode.c lptypes.h lpcode.h lptree.h lpvm.h lpcap.h lpcset.h |
53 | lpprint.o: lpprint.c lptypes.h lpprint.h lptree.h lpvm.h lpcap.h | 53 | lpcset.o: lpcset.c lptypes.h lpcset.h lpcode.h lptree.h lpvm.h lpcap.h |
54 | lptree.o: lptree.c lptypes.h lpcap.h lpcode.h lptree.h lpvm.h lpprint.h | 54 | lpprint.o: lpprint.c lptypes.h lpprint.h lptree.h lpvm.h lpcap.h lpcode.h |
55 | lptree.o: lptree.c lptypes.h lpcap.h lpcode.h lptree.h lpvm.h lpprint.h \ | ||
56 | lpcset.h | ||
55 | lpvm.o: lpvm.c lpcap.h lptypes.h lpvm.h lpprint.h lptree.h | 57 | lpvm.o: lpvm.c lpcap.h lptypes.h lpvm.h lpprint.h lptree.h |
56 | lpcset.o: lpcset.c lpcset.h lptypes.h | ||
@@ -68,6 +68,8 @@ assert(m.match(#m.P(true) * "a", "a") == 2) | |||
68 | assert(m.match("a" * #m.P(false), "a") == nil) | 68 | assert(m.match("a" * #m.P(false), "a") == nil) |
69 | assert(m.match("a" * #m.P(true), "a") == 2) | 69 | assert(m.match("a" * #m.P(true), "a") == 2) |
70 | 70 | ||
71 | assert(m.match(m.P(1)^0, "abcd") == 5) | ||
72 | assert(m.match(m.S("")^0, "abcd") == 1) | ||
71 | 73 | ||
72 | -- tests for locale | 74 | -- tests for locale |
73 | do | 75 | do |
@@ -1167,7 +1169,7 @@ end | |||
1167 | 1169 | ||
1168 | 1170 | ||
1169 | -- bug in 1.0: problems with math-times returning too many captures | 1171 | -- bug in 1.0: problems with math-times returning too many captures |
1170 | do | 1172 | if _VERSION >= "Lua 5.2" then |
1171 | local lim = 2^11 - 10 | 1173 | local lim = 2^11 - 10 |
1172 | local res = {m.match(manyCmt(lim), "a")} | 1174 | local res = {m.match(manyCmt(lim), "a")} |
1173 | assert(#res == lim and res[1] == lim - 1 and res[lim] == 0) | 1175 | assert(#res == lim and res[1] == lim - 1 and res[lim] == 0) |