aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRoberto Ierusalimschy <roberto@inf.puc-rio.br>2023-04-27 10:32:39 -0300
committerRoberto Ierusalimschy <roberto@inf.puc-rio.br>2023-04-27 10:32:39 -0300
commit012cf9c86cf91cb8354e229bde335592d41b84b2 (patch)
tree353f17797b1952eaec231c8e4fd5c21e02daf875
parent3403b0c7256435560b63f828da92026c5d4c898b (diff)
downloadlpeg-012cf9c86cf91cb8354e229bde335592d41b84b2.tar.gz
lpeg-012cf9c86cf91cb8354e229bde335592d41b84b2.tar.bz2
lpeg-012cf9c86cf91cb8354e229bde335592d41b84b2.zip
Compact charsets used in trees, too.
-rw-r--r--lpcode.c118
-rw-r--r--lpcode.h1
-rw-r--r--lpcset.c55
-rw-r--r--lpcset.h15
-rw-r--r--lpprint.c13
-rw-r--r--lptree.c70
-rw-r--r--lptree.h9
-rw-r--r--makefile13
-rwxr-xr-xtest.lua4
9 files changed, 178 insertions, 120 deletions
diff --git a/lpcode.c b/lpcode.c
index 66d2f3f..9289bd3 100644
--- a/lpcode.c
+++ b/lpcode.c
@@ -45,31 +45,6 @@ static int cs_disjoint (const Charset *cs1, const Charset *cs2) {
45 45
46 46
47/* 47/*
48** If 'tree' is a 'char' pattern (TSet, TChar, TAny), convert it into a
49** charset and return 1; else return 0.
50*/
51int tocharset (TTree *tree, Charset *cs) {
52 switch (tree->tag) {
53 case TSet: { /* copy set */
54 loopset(i, cs->cs[i] = treebuffer(tree)[i]);
55 return 1;
56 }
57 case TChar: { /* only one char */
58 assert(0 <= tree->u.n && tree->u.n <= UCHAR_MAX);
59 loopset(i, cs->cs[i] = 0); /* erase all chars */
60 setchar(cs->cs, tree->u.n); /* add that one */
61 return 1;
62 }
63 case TAny: {
64 loopset(i, cs->cs[i] = 0xFF); /* add all characters to the set */
65 return 1;
66 }
67 default: return 0;
68 }
69}
70
71
72/*
73** Visit a TCall node taking care to stop recursion. If node not yet 48** Visit a TCall node taking care to stop recursion. If node not yet
74** visited, return 'f(sib2(tree))', otherwise return 'def' (default 49** visited, return 'f(sib2(tree))', otherwise return 'def' (default
75** value) 50** value)
@@ -240,7 +215,7 @@ int fixedlen (TTree *tree) {
240static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { 215static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) {
241 tailcall: 216 tailcall:
242 switch (tree->tag) { 217 switch (tree->tag) {
243 case TChar: case TSet: case TAny: { 218 case TChar: case TSet: case TAny: case TFalse: {
244 tocharset(tree, firstset); 219 tocharset(tree, firstset);
245 return 0; 220 return 0;
246 } 221 }
@@ -255,10 +230,6 @@ static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) {
255 loopset(i, firstset->cs[i] = follow->cs[i]); 230 loopset(i, firstset->cs[i] = follow->cs[i]);
256 return 1; /* accepts the empty string */ 231 return 1; /* accepts the empty string */
257 } 232 }
258 case TFalse: {
259 loopset(i, firstset->cs[i] = 0);
260 return 0;
261 }
262 case TChoice: { 233 case TChoice: {
263 Charset csaux; 234 Charset csaux;
264 int e1 = getfirst(sib1(tree), follow, firstset); 235 int e1 = getfirst(sib1(tree), follow, firstset);
@@ -543,37 +514,36 @@ static void codechar (CompileState *compst, int c, int tt) {
543/* 514/*
544** Add a charset posfix to an instruction. 515** Add a charset posfix to an instruction.
545*/ 516*/
546static void addcharset (CompileState *compst, int inst, const byte *cs, 517static void addcharset (CompileState *compst, int inst, charsetinfo *info) {
547 const charsetinfo *info) {
548 int p; 518 int p;
549 Instruction *I = &getinstr(compst, inst); 519 Instruction *I = &getinstr(compst, inst);
550 byte *charset; 520 byte *charset;
551 int isize = instsize(info->size); /* size in instructions */ 521 int isize = instsize(info->size); /* size in instructions */
552 int i; 522 int i;
553 I->i.aux2.set.offset = info->aux1 * 8; /* offset in bits */ 523 I->i.aux2.set.offset = info->offset * 8; /* offset in bits */
554 I->i.aux2.set.size = isize; 524 I->i.aux2.set.size = isize;
555 I->i.aux1 = info->deflt; 525 I->i.aux1 = info->deflt;
556 p = nextinstruction(compst, isize); /* space for charset */ 526 p = nextinstruction(compst, isize); /* space for charset */
557 charset = getinstr(compst, p).buff; /* charset buffer */ 527 charset = getinstr(compst, p).buff; /* charset buffer */
558 for (i = 0; i < isize * (int)sizeof(Instruction); i++) 528 for (i = 0; i < isize * (int)sizeof(Instruction); i++)
559 charset[i] = getbytefromcharset(cs, info, i); /* fill the buffer */ 529 charset[i] = getbytefromcharset(info, i); /* copy the buffer */
560} 530}
561 531
562 532
563/* 533/*
564** Check whether compact charset cs is dominated by instruction 'p' 534** Check whether charset 'info' is dominated by instruction 'p'
565*/ 535*/
566static int cs_equal (Instruction *p, const byte *cs, charsetinfo *info) { 536static int cs_equal (Instruction *p, charsetinfo *info) {
567 if (p->i.code != ITestSet) 537 if (p->i.code != ITestSet)
568 return 0; 538 return 0;
569 else if (p->i.aux2.set.offset != info->aux1 * 8 || 539 else if (p->i.aux2.set.offset != info->offset * 8 ||
570 p->i.aux2.set.size != instsize(info->size) || 540 p->i.aux2.set.size != instsize(info->size) ||
571 p->i.aux1 != info->deflt) 541 p->i.aux1 != info->deflt)
572 return 0; 542 return 0;
573 else { 543 else {
574 int i; 544 int i;
575 for (i = 0; i < instsize(info->size) * (int)sizeof(Instruction); i++) { 545 for (i = 0; i < instsize(info->size) * (int)sizeof(Instruction); i++) {
576 if ((p + 2)->buff[i] != getbytefromcharset(cs, info, i)) 546 if ((p + 2)->buff[i] != getbytefromcharset(info, i))
577 return 0; 547 return 0;
578 } 548 }
579 } 549 }
@@ -582,34 +552,23 @@ static int cs_equal (Instruction *p, const byte *cs, charsetinfo *info) {
582 552
583 553
584/* 554/*
585** code a char set, optimizing unit sets for IChar, "complete" 555** Code a char set, using IAny when instruction is dominated by an
586** sets for IAny, and empty sets for IFail; also use an IAny 556** equivalent test.
587** when instruction is dominated by an equivalent test.
588*/ 557*/
589static void codecharset (CompileState *compst, const byte *cs, int tt) { 558static void codecharset (CompileState *compst, TTree *tree, int tt) {
590 charsetinfo info; 559 charsetinfo info;
591 Opcode op = charsettype(cs, &info); 560 tree2cset(tree, &info);
592 switch (op) { 561 if (tt >= 0 && cs_equal(&getinstr(compst, tt), &info))
593 case IChar: codechar(compst, info.aux1, tt); break; 562 addinstruction(compst, IAny, 0);
594 case ISet: { /* non-trivial set? */ 563 else {
595 if (tt >= 0 && cs_equal(&getinstr(compst, tt), cs, &info)) 564 int i = addinstruction(compst, ISet, 0);
596 addinstruction(compst, IAny, 0); 565 addcharset(compst, i, &info);
597 else {
598 int i = addinstruction(compst, ISet, 0);
599 addcharset(compst, i, cs, &info);
600 }
601 break;
602 }
603 default:
604 assert(op == IFail || op == IAny);
605 addinstruction(compst, op, 0);
606 break;
607 } 566 }
608} 567}
609 568
610 569
611/* 570/*
612** code a test set, optimizing unit sets for ITestChar, "complete" 571** Code a test set, optimizing unit sets for ITestChar, "complete"
613** sets for ITestAny, and empty sets for IJmp (always fails). 572** sets for ITestAny, and empty sets for IJmp (always fails).
614** 'e' is true iff test should accept the empty string. (Test 573** 'e' is true iff test should accept the empty string. (Test
615** instructions in the current VM never accept the empty string.) 574** instructions in the current VM never accept the empty string.)
@@ -624,12 +583,12 @@ static int codetestset (CompileState *compst, Charset *cs, int e) {
624 case IAny: return addoffsetinst(compst, ITestAny); 583 case IAny: return addoffsetinst(compst, ITestAny);
625 case IChar: { 584 case IChar: {
626 int i = addoffsetinst(compst, ITestChar); 585 int i = addoffsetinst(compst, ITestChar);
627 getinstr(compst, i).i.aux1 = info.aux1; 586 getinstr(compst, i).i.aux1 = info.offset;
628 return i; 587 return i;
629 } 588 }
630 default: { /* regular set */ 589 default: { /* regular set */
631 int i = addoffsetinst(compst, ITestSet); 590 int i = addoffsetinst(compst, ITestSet);
632 addcharset(compst, i, cs->cs, &info); 591 addcharset(compst, i, &info);
633 assert(op == ISet); 592 assert(op == ISet);
634 return i; 593 return i;
635 } 594 }
@@ -778,34 +737,35 @@ static void closeloop (CompileState *compst, int test) {
778 737
779 738
780/* 739/*
781** Repetition of charsets: 740** Try repetition of charsets:
782** For an empty set, repetition of fail is a no-op; 741** For an empty set, repetition of fail is a no-op;
783** For any or char, code a tight loop; 742** For any or char, code a tight loop;
784** For generic charset, use a span instruction. 743** For generic charset, use a span instruction.
785*/ 744*/
786static void coderepcharset (CompileState *compst, Charset *st) { 745static int coderepcharset (CompileState *compst, TTree *tree) {
787 charsetinfo info; 746 switch (tree->tag) {
788 Opcode op = charsettype(st->cs, &info); 747 case TFalse: return 1; /* 'fail*' is a no-op */
789 switch (op) { 748 case TAny: { /* L1: testany -> L2; any; jmp L1; L2: */
790 case IFail: break; /* fail* is a no-op */
791 case IAny: { /* L1: testany -> L2; any; jmp L1; L2: */
792 int test = addoffsetinst(compst, ITestAny); 749 int test = addoffsetinst(compst, ITestAny);
793 addinstruction(compst, IAny, 0); 750 addinstruction(compst, IAny, 0);
794 closeloop(compst, test); 751 closeloop(compst, test);
795 break; 752 return 1;
796 } 753 }
797 case IChar: { /* L1: testchar c -> L2; any; jmp L1; L2: */ 754 case TChar: { /* L1: testchar c -> L2; any; jmp L1; L2: */
798 int test = addoffsetinst(compst, ITestChar); 755 int test = addoffsetinst(compst, ITestChar);
799 getinstr(compst, test).i.aux1 = info.aux1; 756 getinstr(compst, test).i.aux1 = tree->u.n;
800 addinstruction(compst, IAny, 0); 757 addinstruction(compst, IAny, 0);
801 closeloop(compst, test); 758 closeloop(compst, test);
802 break; 759 return 1;
803 } 760 }
804 default: { /* regular set */ 761 case TSet: { /* regular set */
762 charsetinfo info;
805 int i = addinstruction(compst, ISpan, 0); 763 int i = addinstruction(compst, ISpan, 0);
806 addcharset(compst, i, st->cs, &info); 764 tree2cset(tree, &info);
807 assert(op == ISet); 765 addcharset(compst, i, &info);
766 return 1;
808 } 767 }
768 default: return 0; /* not a charset */
809 } 769 }
810} 770}
811 771
@@ -822,10 +782,8 @@ static void coderepcharset (CompileState *compst, Charset *st) {
822*/ 782*/
823static void coderep (CompileState *compst, TTree *tree, int opt, 783static void coderep (CompileState *compst, TTree *tree, int opt,
824 const Charset *fl) { 784 const Charset *fl) {
825 Charset st; 785 if (!coderepcharset(compst, tree)) {
826 if (tocharset(tree, &st)) 786 Charset st;
827 coderepcharset(compst, &st);
828 else {
829 int e1 = getfirst(tree, fullset, &st); 787 int e1 = getfirst(tree, fullset, &st);
830 if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) { 788 if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) {
831 /* L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: */ 789 /* L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: */
@@ -965,7 +923,7 @@ static void codegen (CompileState *compst, TTree *tree, int opt, int tt,
965 switch (tree->tag) { 923 switch (tree->tag) {
966 case TChar: codechar(compst, tree->u.n, tt); break; 924 case TChar: codechar(compst, tree->u.n, tt); break;
967 case TAny: addinstruction(compst, IAny, 0); break; 925 case TAny: addinstruction(compst, IAny, 0); break;
968 case TSet: codecharset(compst, treebuffer(tree), tt); break; 926 case TSet: codecharset(compst, tree, tt); break;
969 case TTrue: break; 927 case TTrue: break;
970 case TFalse: addinstruction(compst, IFail, 0); break; 928 case TFalse: addinstruction(compst, IFail, 0); break;
971 case TUTFR: codeutfr(compst, tree); break; 929 case TUTFR: codeutfr(compst, tree); break;
diff --git a/lpcode.h b/lpcode.h
index ec5f43f..3c71451 100644
--- a/lpcode.h
+++ b/lpcode.h
@@ -8,7 +8,6 @@
8#include "lptree.h" 8#include "lptree.h"
9#include "lpvm.h" 9#include "lpvm.h"
10 10
11int tocharset (TTree *tree, Charset *cs);
12int checkaux (TTree *tree, int pred); 11int checkaux (TTree *tree, int pred);
13int fixedlen (TTree *tree); 12int fixedlen (TTree *tree);
14int hascaptures (TTree *tree); 13int hascaptures (TTree *tree);
diff --git a/lpcset.c b/lpcset.c
index 9ecf475..2e62d94 100644
--- a/lpcset.c
+++ b/lpcset.c
@@ -16,7 +16,7 @@ static int onlybit (int c, int b) {
16 16
17/* 17/*
18** Check whether a charset is empty (returns IFail), singleton (IChar), 18** Check whether a charset is empty (returns IFail), singleton (IChar),
19** full (IAny), or none of those (ISet). When singleton, 'info.aux1' 19** full (IAny), or none of those (ISet). When singleton, 'info.offset'
20** returns which character it is. When generic set, 'info' returns 20** returns which character it is. When generic set, 'info' returns
21** information about its range. 21** information about its range.
22*/ 22*/
@@ -31,7 +31,7 @@ Opcode charsettype (const byte *cs, charsetinfo *info) {
31 if (low1 == high1) { /* only one byte with 1-bits? */ 31 if (low1 == high1) { /* only one byte with 1-bits? */
32 int b = cs[low1]; 32 int b = cs[low1];
33 if ((b & (b - 1)) == 0) { /* does byte has only one 1-bit? */ 33 if ((b & (b - 1)) == 0) { /* does byte has only one 1-bit? */
34 info->aux1 = onlybit(low1 * BITSPERCHAR, b); /* get that bit */ 34 info->offset = onlybit(low1 * BITSPERCHAR, b); /* get that bit */
35 return IChar; /* single character */ 35 return IChar; /* single character */
36 } 36 }
37 } 37 }
@@ -42,15 +42,16 @@ Opcode charsettype (const byte *cs, charsetinfo *info) {
42 for (high0 = CHARSETSIZE - 1; cs[high0] == 0xFF; high0--) 42 for (high0 = CHARSETSIZE - 1; cs[high0] == 0xFF; high0--)
43 /* find highest byte with a 0-bit; low0 is a sentinel */; 43 /* find highest byte with a 0-bit; low0 is a sentinel */;
44 if (high1 - low1 <= high0 - low0) { /* range of 1s smaller than of 0s? */ 44 if (high1 - low1 <= high0 - low0) { /* range of 1s smaller than of 0s? */
45 info->aux1 = low1; 45 info->offset = low1;
46 info->size = high1 - low1 + 1; 46 info->size = high1 - low1 + 1;
47 info->deflt = 0; /* all discharged bits were 0 */ 47 info->deflt = 0; /* all discharged bits were 0 */
48 } 48 }
49 else { 49 else {
50 info->aux1 = low0; 50 info->offset = low0;
51 info->size = high0 - low0 + 1; 51 info->size = high0 - low0 + 1;
52 info->deflt = 0xFF; /* all discharged bits were 1 */ 52 info->deflt = 0xFF; /* all discharged bits were 1 */
53 } 53 }
54 info->cs = cs + info->offset;
54 return ISet; 55 return ISet;
55} 56}
56 57
@@ -60,10 +61,50 @@ Opcode charsettype (const byte *cs, charsetinfo *info) {
60** range, get the byte from the supporting charset (correcting it 61** range, get the byte from the supporting charset (correcting it
61** by the offset). Otherwise, return the default for the set. 62** by the offset). Otherwise, return the default for the set.
62*/ 63*/
63byte getbytefromcharset (const byte *cs, const charsetinfo *info, 64byte getbytefromcharset (const charsetinfo *info, int index) {
64 int index) {
65 if (index < info->size) 65 if (index < info->size)
66 return cs[info->aux1 + index]; 66 return info->cs[index];
67 else return info->deflt; 67 else return info->deflt;
68} 68}
69 69
70
71/*
72** If 'tree' is a 'char' pattern (TSet, TChar, TAny, TFalse), convert it
73** into a charset and return 1; else return 0.
74*/
75int tocharset (TTree *tree, Charset *cs) {
76 switch (tree->tag) {
77 case TChar: { /* only one char */
78 assert(0 <= tree->u.n && tree->u.n <= UCHAR_MAX);
79 loopset(i, cs->cs[i] = 0); /* erase all chars */
80 setchar(cs->cs, tree->u.n); /* add that one */
81 return 1;
82 }
83 case TAny: {
84 loopset(i, cs->cs[i] = 0xFF); /* add all characters to the set */
85 return 1;
86 }
87 case TFalse: {
88 loopset(i, cs->cs[i] = 0); /* empty set */
89 return 1;
90 }
91 case TSet: { /* fill set */
92 int i;
93 loopset(j, cs->cs[j] = tree->u.set.deflt);
94 for (i = 0; i < tree->u.set.size; i++)
95 cs->cs[tree->u.set.offset + i] = treebuffer(tree)[i];
96 return 1;
97 }
98 default: return 0;
99 }
100}
101
102
103void tree2cset (TTree *tree, charsetinfo *info) {
104 assert(tree->tag == TSet);
105 info->offset = tree->u.set.offset;
106 info->size = tree->u.set.size;
107 info->deflt = tree->u.set.deflt;
108 info->cs = treebuffer(tree);
109}
110
diff --git a/lpcset.h b/lpcset.h
index e5152c4..b69fef9 100644
--- a/lpcset.h
+++ b/lpcset.h
@@ -4,22 +4,27 @@
4 4
5#include "lpcset.h" 5#include "lpcset.h"
6#include "lpcode.h" 6#include "lpcode.h"
7#include "lptree.h"
7 8
8 9
9/* 10/*
10** Extra information for the result of 'charsettype'. When result is 11** Extra information for the result of 'charsettype'. When result is
11** IChar, 'aux1' is the character. When result is ISet, 'aux1' is the 12** IChar, 'offset' is the character. When result is ISet, 'cs' is the
12** offset (in bytes), 'size' is the size (in bytes), and 13** supporting bit array (with offset included), 'offset' is the offset
13** 'delt' is the default value for bytes outside the set. 14** (in bytes), 'size' is the size (in bytes), and 'delt' is the default
15** value for bytes outside the set.
14*/ 16*/
15typedef struct { 17typedef struct {
16 int aux1; 18 const byte *cs;
19 int offset;
17 int size; 20 int size;
18 int deflt; 21 int deflt;
19} charsetinfo; 22} charsetinfo;
20 23
21 24
25int tocharset (TTree *tree, Charset *cs);
22Opcode charsettype (const byte *cs, charsetinfo *info); 26Opcode charsettype (const byte *cs, charsetinfo *info);
23byte getbytefromcharset (const byte *cs, const charsetinfo *info, int index); 27byte getbytefromcharset (const charsetinfo *info, int index);
28void tree2cset (TTree *tree, charsetinfo *info);
24 29
25#endif 30#endif
diff --git a/lpprint.c b/lpprint.c
index 518d822..a432263 100644
--- a/lpprint.c
+++ b/lpprint.c
@@ -46,6 +46,17 @@ static void printIcharset (const Instruction *inst, const byte *buff) {
46} 46}
47 47
48 48
49static void printTcharset (TTree *tree) {
50 byte cs[CHARSETSIZE];
51 int i;
52 printf("(%02x-%d) ", tree->u.set.offset, tree->u.set.size);
53 loopset(j, cs[j] = tree->u.set.deflt);
54 for (i = 0; i < tree->u.set.size; i++)
55 cs[tree->u.set.offset + i] = treebuffer(tree)[i];
56 printcharset(cs);
57}
58
59
49static const char *capkind (int kind) { 60static const char *capkind (int kind) {
50 const char *const modes[] = { 61 const char *const modes[] = {
51 "close", "position", "constant", "backref", 62 "close", "position", "constant", "backref",
@@ -186,7 +197,7 @@ void printtree (TTree *tree, int ident) {
186 break; 197 break;
187 } 198 }
188 case TSet: { 199 case TSet: {
189 printcharset(treebuffer(tree)); 200 printTcharset(tree);
190 printf("\n"); 201 printf("\n");
191 break; 202 break;
192 } 203 }
diff --git a/lptree.c b/lptree.c
index 4affac9..f9e170b 100644
--- a/lptree.c
+++ b/lptree.c
@@ -12,6 +12,7 @@
12#include "lpcode.h" 12#include "lpcode.h"
13#include "lpprint.h" 13#include "lpprint.h"
14#include "lptree.h" 14#include "lptree.h"
15#include "lpcset.h"
15 16
16 17
17/* number of siblings for each tree */ 18/* number of siblings for each tree */
@@ -370,14 +371,37 @@ static TTree *newleaf (lua_State *L, int tag) {
370} 371}
371 372
372 373
373static TTree *newcharset (lua_State *L) { 374static TTree *newccharset (lua_State *L, byte *cs, charsetinfo *info) {
374 TTree *tree = newtree(L, bytes2slots(CHARSETSIZE) + 1); 375 int i;
376 TTree *tree = newtree(L, bytes2slots(info->size) + 1);
375 tree->tag = TSet; 377 tree->tag = TSet;
376 loopset(i, treebuffer(tree)[i] = 0); 378 tree->u.set.offset = info->offset;
379 tree->u.set.size = info->size;
380 tree->u.set.deflt = info->deflt;
381 for (i = 0; i < info->size; i++)
382 treebuffer(tree)[i] = cs[info->offset + i];
377 return tree; 383 return tree;
378} 384}
379 385
380 386
387static TTree *newcharset (lua_State *L, byte *cs) {
388 charsetinfo info;
389 Opcode op = charsettype(cs, &info);
390 switch (op) {
391 case IFail: return newleaf(L, TFalse);
392 case IAny: return newleaf(L, TAny);
393 case IChar: {
394 TTree *tree =newleaf(L, TChar);
395 tree->u.n = info.offset;
396 return tree;
397 }
398 default:
399 assert(op == ISet);
400 return newccharset(L, cs, &info);
401 }
402}
403
404
381/* 405/*
382** add to tree a sequence where first sibling is 'sib' (with size 406** add to tree a sequence where first sibling is 'sib' (with size
383** 'sibsize'); returns position for second sibling 407** 'sibsize'); returns position for second sibling
@@ -549,8 +573,8 @@ static int lp_choice (lua_State *L) {
549 TTree *t1 = getpatt(L, 1, NULL); 573 TTree *t1 = getpatt(L, 1, NULL);
550 TTree *t2 = getpatt(L, 2, NULL); 574 TTree *t2 = getpatt(L, 2, NULL);
551 if (tocharset(t1, &st1) && tocharset(t2, &st2)) { 575 if (tocharset(t1, &st1) && tocharset(t2, &st2)) {
552 TTree *t = newcharset(L); 576 loopset(i, st1.cs[i] |= st2.cs[i]);
553 loopset(i, treebuffer(t)[i] = st1.cs[i] | st2.cs[i]); 577 newcharset(L, st1.cs);
554 } 578 }
555 else if (nofail(t1) || t2->tag == TFalse) 579 else if (nofail(t1) || t2->tag == TFalse)
556 lua_pushvalue(L, 1); /* true / x => true, x / false => x */ 580 lua_pushvalue(L, 1); /* true / x => true, x / false => x */
@@ -626,8 +650,8 @@ static int lp_sub (lua_State *L) {
626 TTree *t1 = getpatt(L, 1, &s1); 650 TTree *t1 = getpatt(L, 1, &s1);
627 TTree *t2 = getpatt(L, 2, &s2); 651 TTree *t2 = getpatt(L, 2, &s2);
628 if (tocharset(t1, &st1) && tocharset(t2, &st2)) { 652 if (tocharset(t1, &st1) && tocharset(t2, &st2)) {
629 TTree *t = newcharset(L); 653 loopset(i, st1.cs[i] &= ~st2.cs[i]);
630 loopset(i, treebuffer(t)[i] = st1.cs[i] & ~st2.cs[i]); 654 newcharset(L, st1.cs);
631 } 655 }
632 else { 656 else {
633 TTree *tree = newtree(L, 2 + s1 + s2); 657 TTree *tree = newtree(L, 2 + s1 + s2);
@@ -645,11 +669,13 @@ static int lp_sub (lua_State *L) {
645static int lp_set (lua_State *L) { 669static int lp_set (lua_State *L) {
646 size_t l; 670 size_t l;
647 const char *s = luaL_checklstring(L, 1, &l); 671 const char *s = luaL_checklstring(L, 1, &l);
648 TTree *tree = newcharset(L); 672 byte buff[CHARSETSIZE];
673 loopset(i, buff[i] = 0);
649 while (l--) { 674 while (l--) {
650 setchar(treebuffer(tree), (byte)(*s)); 675 setchar(buff, (byte)(*s));
651 s++; 676 s++;
652 } 677 }
678 newcharset(L, buff);
653 return 1; 679 return 1;
654} 680}
655 681
@@ -657,15 +683,17 @@ static int lp_set (lua_State *L) {
657static int lp_range (lua_State *L) { 683static int lp_range (lua_State *L) {
658 int arg; 684 int arg;
659 int top = lua_gettop(L); 685 int top = lua_gettop(L);
660 TTree *tree = newcharset(L); 686 byte buff[CHARSETSIZE];
687 loopset(i, buff[i] = 0);
661 for (arg = 1; arg <= top; arg++) { 688 for (arg = 1; arg <= top; arg++) {
662 int c; 689 int c;
663 size_t l; 690 size_t l;
664 const char *r = luaL_checklstring(L, arg, &l); 691 const char *r = luaL_checklstring(L, arg, &l);
665 luaL_argcheck(L, l == 2, arg, "range must have two characters"); 692 luaL_argcheck(L, l == 2, arg, "range must have two characters");
666 for (c = (byte)r[0]; c <= (byte)r[1]; c++) 693 for (c = (byte)r[0]; c <= (byte)r[1]; c++)
667 setchar(treebuffer(tree), c); 694 setchar(buff, c);
668 } 695 }
696 newcharset(L, buff);
669 return 1; 697 return 1;
670} 698}
671 699
@@ -704,10 +732,12 @@ static int lp_utfr (lua_State *L) {
704 lua_Unsigned to = (lua_Unsigned)luaL_checkinteger(L, 2); 732 lua_Unsigned to = (lua_Unsigned)luaL_checkinteger(L, 2);
705 luaL_argcheck(L, from <= to, 2, "empty range"); 733 luaL_argcheck(L, from <= to, 2, "empty range");
706 if (to <= 0x7f) { /* ascii range? */ 734 if (to <= 0x7f) { /* ascii range? */
707 TTree *tree = newcharset(L); /* code it as a regular charset */
708 unsigned int f; 735 unsigned int f;
736 byte buff[CHARSETSIZE]; /* code it as a regular charset */
737 loopset(i, buff[i] = 0);
709 for (f = (int)from; f <= to; f++) 738 for (f = (int)from; f <= to; f++)
710 setchar(treebuffer(tree), f); 739 setchar(buff, f);
740 newcharset(L, buff);
711 } 741 }
712 else { /* multi-byte utf-8 range */ 742 else { /* multi-byte utf-8 range */
713 TTree *tree = newtree(L, 2); 743 TTree *tree = newtree(L, 2);
@@ -1261,11 +1291,17 @@ int lp_gc (lua_State *L) {
1261} 1291}
1262 1292
1263 1293
1294/*
1295** Create a charset representing a category of characters, given by
1296** the predicate 'catf'.
1297*/
1264static void createcat (lua_State *L, const char *catname, int (catf) (int)) { 1298static void createcat (lua_State *L, const char *catname, int (catf) (int)) {
1265 TTree *t = newcharset(L); 1299 int c;
1266 int i; 1300 byte buff[CHARSETSIZE];
1267 for (i = 0; i <= UCHAR_MAX; i++) 1301 loopset(i, buff[i] = 0);
1268 if (catf(i)) setchar(treebuffer(t), i); 1302 for (c = 0; c <= UCHAR_MAX; c++)
1303 if (catf(c)) setchar(buff, c);
1304 newcharset(L, buff);
1269 lua_setfield(L, -2, catname); 1305 lua_setfield(L, -2, catname);
1270} 1306}
1271 1307
diff --git a/lptree.h b/lptree.h
index aa331d2..b76c235 100644
--- a/lptree.h
+++ b/lptree.h
@@ -3,7 +3,7 @@
3#define lptree_h 3#define lptree_h
4 4
5 5
6#include "lptypes.h" 6#include "lptypes.h"
7 7
8 8
9/* 9/*
@@ -11,7 +11,7 @@
11*/ 11*/
12typedef enum TTag { 12typedef enum TTag {
13 TChar = 0, /* 'n' = char */ 13 TChar = 0, /* 'n' = char */
14 TSet, /* the set is stored in next CHARSETSIZE bytes */ 14 TSet, /* the set is encoded in 'u.set' and the next 'u.set.size' bytes */
15 TAny, 15 TAny,
16 TTrue, 16 TTrue,
17 TFalse, 17 TFalse,
@@ -52,6 +52,11 @@ typedef struct TTree {
52 union { 52 union {
53 int ps; /* occasional second child */ 53 int ps; /* occasional second child */
54 int n; /* occasional counter */ 54 int n; /* occasional counter */
55 struct {
56 byte offset; /* compact set offset (in bytes) */
57 byte size; /* compact set size (in bytes) */
58 byte deflt; /* default value */
59 } set; /* for compact sets */
55 } u; 60 } u;
56} TTree; 61} TTree;
57 62
diff --git a/makefile b/makefile
index 9493c6e..2b3c12e 100644
--- a/makefile
+++ b/makefile
@@ -1,9 +1,9 @@
1LIBNAME = lpeg 1LIBNAME = lpeg
2LUADIR = ./lua/ 2LUADIR = ./lua/
3 3
4COPT = -O2 -DNDEBUG 4# COPT = -O2 -DNDEBUG
5# COPT = -g 5# COPT = -g
6# COPT = -DLPEG_DEBUG 6COPT = -DLPEG_DEBUG -g
7 7
8CWARNS = -Wall -Wextra -pedantic \ 8CWARNS = -Wall -Wextra -pedantic \
9 -Waggregate-return \ 9 -Waggregate-return \
@@ -49,8 +49,9 @@ clean:
49 49
50 50
51lpcap.o: lpcap.c lpcap.h lptypes.h 51lpcap.o: lpcap.c lpcap.h lptypes.h
52lpcode.o: lpcode.c lptypes.h lpcode.h lptree.h lpvm.h lpcap.h 52lpcode.o: lpcode.c lptypes.h lpcode.h lptree.h lpvm.h lpcap.h lpcset.h
53lpprint.o: lpprint.c lptypes.h lpprint.h lptree.h lpvm.h lpcap.h 53lpcset.o: lpcset.c lptypes.h lpcset.h lpcode.h lptree.h lpvm.h lpcap.h
54lptree.o: lptree.c lptypes.h lpcap.h lpcode.h lptree.h lpvm.h lpprint.h 54lpprint.o: lpprint.c lptypes.h lpprint.h lptree.h lpvm.h lpcap.h lpcode.h
55lptree.o: lptree.c lptypes.h lpcap.h lpcode.h lptree.h lpvm.h lpprint.h \
56 lpcset.h
55lpvm.o: lpvm.c lpcap.h lptypes.h lpvm.h lpprint.h lptree.h 57lpvm.o: lpvm.c lpcap.h lptypes.h lpvm.h lpprint.h lptree.h
56lpcset.o: lpcset.c lpcset.h lptypes.h
diff --git a/test.lua b/test.lua
index d76e3f1..9f8d226 100755
--- a/test.lua
+++ b/test.lua
@@ -68,6 +68,8 @@ assert(m.match(#m.P(true) * "a", "a") == 2)
68assert(m.match("a" * #m.P(false), "a") == nil) 68assert(m.match("a" * #m.P(false), "a") == nil)
69assert(m.match("a" * #m.P(true), "a") == 2) 69assert(m.match("a" * #m.P(true), "a") == 2)
70 70
71assert(m.match(m.P(1)^0, "abcd") == 5)
72assert(m.match(m.S("")^0, "abcd") == 1)
71 73
72-- tests for locale 74-- tests for locale
73do 75do
@@ -1167,7 +1169,7 @@ end
1167 1169
1168 1170
1169-- bug in 1.0: problems with math-times returning too many captures 1171-- bug in 1.0: problems with math-times returning too many captures
1170do 1172if _VERSION >= "Lua 5.2" then
1171 local lim = 2^11 - 10 1173 local lim = 2^11 - 10
1172 local res = {m.match(manyCmt(lim), "a")} 1174 local res = {m.match(manyCmt(lim), "a")}
1173 assert(#res == lim and res[1] == lim - 1 and res[lim] == 0) 1175 assert(#res == lim and res[1] == lim - 1 and res[lim] == 0)