diff options
| author | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2023-04-27 10:32:39 -0300 |
|---|---|---|
| committer | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2023-04-27 10:32:39 -0300 |
| commit | 012cf9c86cf91cb8354e229bde335592d41b84b2 (patch) | |
| tree | 353f17797b1952eaec231c8e4fd5c21e02daf875 | |
| parent | 3403b0c7256435560b63f828da92026c5d4c898b (diff) | |
| download | lpeg-012cf9c86cf91cb8354e229bde335592d41b84b2.tar.gz lpeg-012cf9c86cf91cb8354e229bde335592d41b84b2.tar.bz2 lpeg-012cf9c86cf91cb8354e229bde335592d41b84b2.zip | |
Compact charsets used in trees, too.
| -rw-r--r-- | lpcode.c | 118 | ||||
| -rw-r--r-- | lpcode.h | 1 | ||||
| -rw-r--r-- | lpcset.c | 55 | ||||
| -rw-r--r-- | lpcset.h | 15 | ||||
| -rw-r--r-- | lpprint.c | 13 | ||||
| -rw-r--r-- | lptree.c | 70 | ||||
| -rw-r--r-- | lptree.h | 9 | ||||
| -rw-r--r-- | makefile | 13 | ||||
| -rwxr-xr-x | test.lua | 4 |
9 files changed, 178 insertions, 120 deletions
| @@ -45,31 +45,6 @@ static int cs_disjoint (const Charset *cs1, const Charset *cs2) { | |||
| 45 | 45 | ||
| 46 | 46 | ||
| 47 | /* | 47 | /* |
| 48 | ** If 'tree' is a 'char' pattern (TSet, TChar, TAny), convert it into a | ||
| 49 | ** charset and return 1; else return 0. | ||
| 50 | */ | ||
| 51 | int tocharset (TTree *tree, Charset *cs) { | ||
| 52 | switch (tree->tag) { | ||
| 53 | case TSet: { /* copy set */ | ||
| 54 | loopset(i, cs->cs[i] = treebuffer(tree)[i]); | ||
| 55 | return 1; | ||
| 56 | } | ||
| 57 | case TChar: { /* only one char */ | ||
| 58 | assert(0 <= tree->u.n && tree->u.n <= UCHAR_MAX); | ||
| 59 | loopset(i, cs->cs[i] = 0); /* erase all chars */ | ||
| 60 | setchar(cs->cs, tree->u.n); /* add that one */ | ||
| 61 | return 1; | ||
| 62 | } | ||
| 63 | case TAny: { | ||
| 64 | loopset(i, cs->cs[i] = 0xFF); /* add all characters to the set */ | ||
| 65 | return 1; | ||
| 66 | } | ||
| 67 | default: return 0; | ||
| 68 | } | ||
| 69 | } | ||
| 70 | |||
| 71 | |||
| 72 | /* | ||
| 73 | ** Visit a TCall node taking care to stop recursion. If node not yet | 48 | ** Visit a TCall node taking care to stop recursion. If node not yet |
| 74 | ** visited, return 'f(sib2(tree))', otherwise return 'def' (default | 49 | ** visited, return 'f(sib2(tree))', otherwise return 'def' (default |
| 75 | ** value) | 50 | ** value) |
| @@ -240,7 +215,7 @@ int fixedlen (TTree *tree) { | |||
| 240 | static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { | 215 | static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { |
| 241 | tailcall: | 216 | tailcall: |
| 242 | switch (tree->tag) { | 217 | switch (tree->tag) { |
| 243 | case TChar: case TSet: case TAny: { | 218 | case TChar: case TSet: case TAny: case TFalse: { |
| 244 | tocharset(tree, firstset); | 219 | tocharset(tree, firstset); |
| 245 | return 0; | 220 | return 0; |
| 246 | } | 221 | } |
| @@ -255,10 +230,6 @@ static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { | |||
| 255 | loopset(i, firstset->cs[i] = follow->cs[i]); | 230 | loopset(i, firstset->cs[i] = follow->cs[i]); |
| 256 | return 1; /* accepts the empty string */ | 231 | return 1; /* accepts the empty string */ |
| 257 | } | 232 | } |
| 258 | case TFalse: { | ||
| 259 | loopset(i, firstset->cs[i] = 0); | ||
| 260 | return 0; | ||
| 261 | } | ||
| 262 | case TChoice: { | 233 | case TChoice: { |
| 263 | Charset csaux; | 234 | Charset csaux; |
| 264 | int e1 = getfirst(sib1(tree), follow, firstset); | 235 | int e1 = getfirst(sib1(tree), follow, firstset); |
| @@ -543,37 +514,36 @@ static void codechar (CompileState *compst, int c, int tt) { | |||
| 543 | /* | 514 | /* |
| 544 | ** Add a charset posfix to an instruction. | 515 | ** Add a charset posfix to an instruction. |
| 545 | */ | 516 | */ |
| 546 | static void addcharset (CompileState *compst, int inst, const byte *cs, | 517 | static void addcharset (CompileState *compst, int inst, charsetinfo *info) { |
| 547 | const charsetinfo *info) { | ||
| 548 | int p; | 518 | int p; |
| 549 | Instruction *I = &getinstr(compst, inst); | 519 | Instruction *I = &getinstr(compst, inst); |
| 550 | byte *charset; | 520 | byte *charset; |
| 551 | int isize = instsize(info->size); /* size in instructions */ | 521 | int isize = instsize(info->size); /* size in instructions */ |
| 552 | int i; | 522 | int i; |
| 553 | I->i.aux2.set.offset = info->aux1 * 8; /* offset in bits */ | 523 | I->i.aux2.set.offset = info->offset * 8; /* offset in bits */ |
| 554 | I->i.aux2.set.size = isize; | 524 | I->i.aux2.set.size = isize; |
| 555 | I->i.aux1 = info->deflt; | 525 | I->i.aux1 = info->deflt; |
| 556 | p = nextinstruction(compst, isize); /* space for charset */ | 526 | p = nextinstruction(compst, isize); /* space for charset */ |
| 557 | charset = getinstr(compst, p).buff; /* charset buffer */ | 527 | charset = getinstr(compst, p).buff; /* charset buffer */ |
| 558 | for (i = 0; i < isize * (int)sizeof(Instruction); i++) | 528 | for (i = 0; i < isize * (int)sizeof(Instruction); i++) |
| 559 | charset[i] = getbytefromcharset(cs, info, i); /* fill the buffer */ | 529 | charset[i] = getbytefromcharset(info, i); /* copy the buffer */ |
| 560 | } | 530 | } |
| 561 | 531 | ||
| 562 | 532 | ||
| 563 | /* | 533 | /* |
| 564 | ** Check whether compact charset cs is dominated by instruction 'p' | 534 | ** Check whether charset 'info' is dominated by instruction 'p' |
| 565 | */ | 535 | */ |
| 566 | static int cs_equal (Instruction *p, const byte *cs, charsetinfo *info) { | 536 | static int cs_equal (Instruction *p, charsetinfo *info) { |
| 567 | if (p->i.code != ITestSet) | 537 | if (p->i.code != ITestSet) |
| 568 | return 0; | 538 | return 0; |
| 569 | else if (p->i.aux2.set.offset != info->aux1 * 8 || | 539 | else if (p->i.aux2.set.offset != info->offset * 8 || |
| 570 | p->i.aux2.set.size != instsize(info->size) || | 540 | p->i.aux2.set.size != instsize(info->size) || |
| 571 | p->i.aux1 != info->deflt) | 541 | p->i.aux1 != info->deflt) |
| 572 | return 0; | 542 | return 0; |
| 573 | else { | 543 | else { |
| 574 | int i; | 544 | int i; |
| 575 | for (i = 0; i < instsize(info->size) * (int)sizeof(Instruction); i++) { | 545 | for (i = 0; i < instsize(info->size) * (int)sizeof(Instruction); i++) { |
| 576 | if ((p + 2)->buff[i] != getbytefromcharset(cs, info, i)) | 546 | if ((p + 2)->buff[i] != getbytefromcharset(info, i)) |
| 577 | return 0; | 547 | return 0; |
| 578 | } | 548 | } |
| 579 | } | 549 | } |
| @@ -582,34 +552,23 @@ static int cs_equal (Instruction *p, const byte *cs, charsetinfo *info) { | |||
| 582 | 552 | ||
| 583 | 553 | ||
| 584 | /* | 554 | /* |
| 585 | ** code a char set, optimizing unit sets for IChar, "complete" | 555 | ** Code a char set, using IAny when instruction is dominated by an |
| 586 | ** sets for IAny, and empty sets for IFail; also use an IAny | 556 | ** equivalent test. |
| 587 | ** when instruction is dominated by an equivalent test. | ||
| 588 | */ | 557 | */ |
| 589 | static void codecharset (CompileState *compst, const byte *cs, int tt) { | 558 | static void codecharset (CompileState *compst, TTree *tree, int tt) { |
| 590 | charsetinfo info; | 559 | charsetinfo info; |
| 591 | Opcode op = charsettype(cs, &info); | 560 | tree2cset(tree, &info); |
| 592 | switch (op) { | 561 | if (tt >= 0 && cs_equal(&getinstr(compst, tt), &info)) |
| 593 | case IChar: codechar(compst, info.aux1, tt); break; | 562 | addinstruction(compst, IAny, 0); |
| 594 | case ISet: { /* non-trivial set? */ | 563 | else { |
| 595 | if (tt >= 0 && cs_equal(&getinstr(compst, tt), cs, &info)) | 564 | int i = addinstruction(compst, ISet, 0); |
| 596 | addinstruction(compst, IAny, 0); | 565 | addcharset(compst, i, &info); |
| 597 | else { | ||
| 598 | int i = addinstruction(compst, ISet, 0); | ||
| 599 | addcharset(compst, i, cs, &info); | ||
| 600 | } | ||
| 601 | break; | ||
| 602 | } | ||
| 603 | default: | ||
| 604 | assert(op == IFail || op == IAny); | ||
| 605 | addinstruction(compst, op, 0); | ||
| 606 | break; | ||
| 607 | } | 566 | } |
| 608 | } | 567 | } |
| 609 | 568 | ||
| 610 | 569 | ||
| 611 | /* | 570 | /* |
| 612 | ** code a test set, optimizing unit sets for ITestChar, "complete" | 571 | ** Code a test set, optimizing unit sets for ITestChar, "complete" |
| 613 | ** sets for ITestAny, and empty sets for IJmp (always fails). | 572 | ** sets for ITestAny, and empty sets for IJmp (always fails). |
| 614 | ** 'e' is true iff test should accept the empty string. (Test | 573 | ** 'e' is true iff test should accept the empty string. (Test |
| 615 | ** instructions in the current VM never accept the empty string.) | 574 | ** instructions in the current VM never accept the empty string.) |
| @@ -624,12 +583,12 @@ static int codetestset (CompileState *compst, Charset *cs, int e) { | |||
| 624 | case IAny: return addoffsetinst(compst, ITestAny); | 583 | case IAny: return addoffsetinst(compst, ITestAny); |
| 625 | case IChar: { | 584 | case IChar: { |
| 626 | int i = addoffsetinst(compst, ITestChar); | 585 | int i = addoffsetinst(compst, ITestChar); |
| 627 | getinstr(compst, i).i.aux1 = info.aux1; | 586 | getinstr(compst, i).i.aux1 = info.offset; |
| 628 | return i; | 587 | return i; |
| 629 | } | 588 | } |
| 630 | default: { /* regular set */ | 589 | default: { /* regular set */ |
| 631 | int i = addoffsetinst(compst, ITestSet); | 590 | int i = addoffsetinst(compst, ITestSet); |
| 632 | addcharset(compst, i, cs->cs, &info); | 591 | addcharset(compst, i, &info); |
| 633 | assert(op == ISet); | 592 | assert(op == ISet); |
| 634 | return i; | 593 | return i; |
| 635 | } | 594 | } |
| @@ -778,34 +737,35 @@ static void closeloop (CompileState *compst, int test) { | |||
| 778 | 737 | ||
| 779 | 738 | ||
| 780 | /* | 739 | /* |
| 781 | ** Repetition of charsets: | 740 | ** Try repetition of charsets: |
| 782 | ** For an empty set, repetition of fail is a no-op; | 741 | ** For an empty set, repetition of fail is a no-op; |
| 783 | ** For any or char, code a tight loop; | 742 | ** For any or char, code a tight loop; |
| 784 | ** For generic charset, use a span instruction. | 743 | ** For generic charset, use a span instruction. |
| 785 | */ | 744 | */ |
| 786 | static void coderepcharset (CompileState *compst, Charset *st) { | 745 | static int coderepcharset (CompileState *compst, TTree *tree) { |
| 787 | charsetinfo info; | 746 | switch (tree->tag) { |
| 788 | Opcode op = charsettype(st->cs, &info); | 747 | case TFalse: return 1; /* 'fail*' is a no-op */ |
| 789 | switch (op) { | 748 | case TAny: { /* L1: testany -> L2; any; jmp L1; L2: */ |
| 790 | case IFail: break; /* fail* is a no-op */ | ||
| 791 | case IAny: { /* L1: testany -> L2; any; jmp L1; L2: */ | ||
| 792 | int test = addoffsetinst(compst, ITestAny); | 749 | int test = addoffsetinst(compst, ITestAny); |
| 793 | addinstruction(compst, IAny, 0); | 750 | addinstruction(compst, IAny, 0); |
| 794 | closeloop(compst, test); | 751 | closeloop(compst, test); |
| 795 | break; | 752 | return 1; |
| 796 | } | 753 | } |
| 797 | case IChar: { /* L1: testchar c -> L2; any; jmp L1; L2: */ | 754 | case TChar: { /* L1: testchar c -> L2; any; jmp L1; L2: */ |
| 798 | int test = addoffsetinst(compst, ITestChar); | 755 | int test = addoffsetinst(compst, ITestChar); |
| 799 | getinstr(compst, test).i.aux1 = info.aux1; | 756 | getinstr(compst, test).i.aux1 = tree->u.n; |
| 800 | addinstruction(compst, IAny, 0); | 757 | addinstruction(compst, IAny, 0); |
| 801 | closeloop(compst, test); | 758 | closeloop(compst, test); |
| 802 | break; | 759 | return 1; |
| 803 | } | 760 | } |
| 804 | default: { /* regular set */ | 761 | case TSet: { /* regular set */ |
| 762 | charsetinfo info; | ||
| 805 | int i = addinstruction(compst, ISpan, 0); | 763 | int i = addinstruction(compst, ISpan, 0); |
| 806 | addcharset(compst, i, st->cs, &info); | 764 | tree2cset(tree, &info); |
| 807 | assert(op == ISet); | 765 | addcharset(compst, i, &info); |
| 766 | return 1; | ||
| 808 | } | 767 | } |
| 768 | default: return 0; /* not a charset */ | ||
| 809 | } | 769 | } |
| 810 | } | 770 | } |
| 811 | 771 | ||
| @@ -822,10 +782,8 @@ static void coderepcharset (CompileState *compst, Charset *st) { | |||
| 822 | */ | 782 | */ |
| 823 | static void coderep (CompileState *compst, TTree *tree, int opt, | 783 | static void coderep (CompileState *compst, TTree *tree, int opt, |
| 824 | const Charset *fl) { | 784 | const Charset *fl) { |
| 825 | Charset st; | 785 | if (!coderepcharset(compst, tree)) { |
| 826 | if (tocharset(tree, &st)) | 786 | Charset st; |
| 827 | coderepcharset(compst, &st); | ||
| 828 | else { | ||
| 829 | int e1 = getfirst(tree, fullset, &st); | 787 | int e1 = getfirst(tree, fullset, &st); |
| 830 | if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) { | 788 | if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) { |
| 831 | /* L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: */ | 789 | /* L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: */ |
| @@ -965,7 +923,7 @@ static void codegen (CompileState *compst, TTree *tree, int opt, int tt, | |||
| 965 | switch (tree->tag) { | 923 | switch (tree->tag) { |
| 966 | case TChar: codechar(compst, tree->u.n, tt); break; | 924 | case TChar: codechar(compst, tree->u.n, tt); break; |
| 967 | case TAny: addinstruction(compst, IAny, 0); break; | 925 | case TAny: addinstruction(compst, IAny, 0); break; |
| 968 | case TSet: codecharset(compst, treebuffer(tree), tt); break; | 926 | case TSet: codecharset(compst, tree, tt); break; |
| 969 | case TTrue: break; | 927 | case TTrue: break; |
| 970 | case TFalse: addinstruction(compst, IFail, 0); break; | 928 | case TFalse: addinstruction(compst, IFail, 0); break; |
| 971 | case TUTFR: codeutfr(compst, tree); break; | 929 | case TUTFR: codeutfr(compst, tree); break; |
| @@ -8,7 +8,6 @@ | |||
| 8 | #include "lptree.h" | 8 | #include "lptree.h" |
| 9 | #include "lpvm.h" | 9 | #include "lpvm.h" |
| 10 | 10 | ||
| 11 | int tocharset (TTree *tree, Charset *cs); | ||
| 12 | int checkaux (TTree *tree, int pred); | 11 | int checkaux (TTree *tree, int pred); |
| 13 | int fixedlen (TTree *tree); | 12 | int fixedlen (TTree *tree); |
| 14 | int hascaptures (TTree *tree); | 13 | int hascaptures (TTree *tree); |
| @@ -16,7 +16,7 @@ static int onlybit (int c, int b) { | |||
| 16 | 16 | ||
| 17 | /* | 17 | /* |
| 18 | ** Check whether a charset is empty (returns IFail), singleton (IChar), | 18 | ** Check whether a charset is empty (returns IFail), singleton (IChar), |
| 19 | ** full (IAny), or none of those (ISet). When singleton, 'info.aux1' | 19 | ** full (IAny), or none of those (ISet). When singleton, 'info.offset' |
| 20 | ** returns which character it is. When generic set, 'info' returns | 20 | ** returns which character it is. When generic set, 'info' returns |
| 21 | ** information about its range. | 21 | ** information about its range. |
| 22 | */ | 22 | */ |
| @@ -31,7 +31,7 @@ Opcode charsettype (const byte *cs, charsetinfo *info) { | |||
| 31 | if (low1 == high1) { /* only one byte with 1-bits? */ | 31 | if (low1 == high1) { /* only one byte with 1-bits? */ |
| 32 | int b = cs[low1]; | 32 | int b = cs[low1]; |
| 33 | if ((b & (b - 1)) == 0) { /* does byte has only one 1-bit? */ | 33 | if ((b & (b - 1)) == 0) { /* does byte has only one 1-bit? */ |
| 34 | info->aux1 = onlybit(low1 * BITSPERCHAR, b); /* get that bit */ | 34 | info->offset = onlybit(low1 * BITSPERCHAR, b); /* get that bit */ |
| 35 | return IChar; /* single character */ | 35 | return IChar; /* single character */ |
| 36 | } | 36 | } |
| 37 | } | 37 | } |
| @@ -42,15 +42,16 @@ Opcode charsettype (const byte *cs, charsetinfo *info) { | |||
| 42 | for (high0 = CHARSETSIZE - 1; cs[high0] == 0xFF; high0--) | 42 | for (high0 = CHARSETSIZE - 1; cs[high0] == 0xFF; high0--) |
| 43 | /* find highest byte with a 0-bit; low0 is a sentinel */; | 43 | /* find highest byte with a 0-bit; low0 is a sentinel */; |
| 44 | if (high1 - low1 <= high0 - low0) { /* range of 1s smaller than of 0s? */ | 44 | if (high1 - low1 <= high0 - low0) { /* range of 1s smaller than of 0s? */ |
| 45 | info->aux1 = low1; | 45 | info->offset = low1; |
| 46 | info->size = high1 - low1 + 1; | 46 | info->size = high1 - low1 + 1; |
| 47 | info->deflt = 0; /* all discharged bits were 0 */ | 47 | info->deflt = 0; /* all discharged bits were 0 */ |
| 48 | } | 48 | } |
| 49 | else { | 49 | else { |
| 50 | info->aux1 = low0; | 50 | info->offset = low0; |
| 51 | info->size = high0 - low0 + 1; | 51 | info->size = high0 - low0 + 1; |
| 52 | info->deflt = 0xFF; /* all discharged bits were 1 */ | 52 | info->deflt = 0xFF; /* all discharged bits were 1 */ |
| 53 | } | 53 | } |
| 54 | info->cs = cs + info->offset; | ||
| 54 | return ISet; | 55 | return ISet; |
| 55 | } | 56 | } |
| 56 | 57 | ||
| @@ -60,10 +61,50 @@ Opcode charsettype (const byte *cs, charsetinfo *info) { | |||
| 60 | ** range, get the byte from the supporting charset (correcting it | 61 | ** range, get the byte from the supporting charset (correcting it |
| 61 | ** by the offset). Otherwise, return the default for the set. | 62 | ** by the offset). Otherwise, return the default for the set. |
| 62 | */ | 63 | */ |
| 63 | byte getbytefromcharset (const byte *cs, const charsetinfo *info, | 64 | byte getbytefromcharset (const charsetinfo *info, int index) { |
| 64 | int index) { | ||
| 65 | if (index < info->size) | 65 | if (index < info->size) |
| 66 | return cs[info->aux1 + index]; | 66 | return info->cs[index]; |
| 67 | else return info->deflt; | 67 | else return info->deflt; |
| 68 | } | 68 | } |
| 69 | 69 | ||
| 70 | |||
| 71 | /* | ||
| 72 | ** If 'tree' is a 'char' pattern (TSet, TChar, TAny, TFalse), convert it | ||
| 73 | ** into a charset and return 1; else return 0. | ||
| 74 | */ | ||
| 75 | int tocharset (TTree *tree, Charset *cs) { | ||
| 76 | switch (tree->tag) { | ||
| 77 | case TChar: { /* only one char */ | ||
| 78 | assert(0 <= tree->u.n && tree->u.n <= UCHAR_MAX); | ||
| 79 | loopset(i, cs->cs[i] = 0); /* erase all chars */ | ||
| 80 | setchar(cs->cs, tree->u.n); /* add that one */ | ||
| 81 | return 1; | ||
| 82 | } | ||
| 83 | case TAny: { | ||
| 84 | loopset(i, cs->cs[i] = 0xFF); /* add all characters to the set */ | ||
| 85 | return 1; | ||
| 86 | } | ||
| 87 | case TFalse: { | ||
| 88 | loopset(i, cs->cs[i] = 0); /* empty set */ | ||
| 89 | return 1; | ||
| 90 | } | ||
| 91 | case TSet: { /* fill set */ | ||
| 92 | int i; | ||
| 93 | loopset(j, cs->cs[j] = tree->u.set.deflt); | ||
| 94 | for (i = 0; i < tree->u.set.size; i++) | ||
| 95 | cs->cs[tree->u.set.offset + i] = treebuffer(tree)[i]; | ||
| 96 | return 1; | ||
| 97 | } | ||
| 98 | default: return 0; | ||
| 99 | } | ||
| 100 | } | ||
| 101 | |||
| 102 | |||
| 103 | void tree2cset (TTree *tree, charsetinfo *info) { | ||
| 104 | assert(tree->tag == TSet); | ||
| 105 | info->offset = tree->u.set.offset; | ||
| 106 | info->size = tree->u.set.size; | ||
| 107 | info->deflt = tree->u.set.deflt; | ||
| 108 | info->cs = treebuffer(tree); | ||
| 109 | } | ||
| 110 | |||
| @@ -4,22 +4,27 @@ | |||
| 4 | 4 | ||
| 5 | #include "lpcset.h" | 5 | #include "lpcset.h" |
| 6 | #include "lpcode.h" | 6 | #include "lpcode.h" |
| 7 | #include "lptree.h" | ||
| 7 | 8 | ||
| 8 | 9 | ||
| 9 | /* | 10 | /* |
| 10 | ** Extra information for the result of 'charsettype'. When result is | 11 | ** Extra information for the result of 'charsettype'. When result is |
| 11 | ** IChar, 'aux1' is the character. When result is ISet, 'aux1' is the | 12 | ** IChar, 'offset' is the character. When result is ISet, 'cs' is the |
| 12 | ** offset (in bytes), 'size' is the size (in bytes), and | 13 | ** supporting bit array (with offset included), 'offset' is the offset |
| 13 | ** 'delt' is the default value for bytes outside the set. | 14 | ** (in bytes), 'size' is the size (in bytes), and 'delt' is the default |
| 15 | ** value for bytes outside the set. | ||
| 14 | */ | 16 | */ |
| 15 | typedef struct { | 17 | typedef struct { |
| 16 | int aux1; | 18 | const byte *cs; |
| 19 | int offset; | ||
| 17 | int size; | 20 | int size; |
| 18 | int deflt; | 21 | int deflt; |
| 19 | } charsetinfo; | 22 | } charsetinfo; |
| 20 | 23 | ||
| 21 | 24 | ||
| 25 | int tocharset (TTree *tree, Charset *cs); | ||
| 22 | Opcode charsettype (const byte *cs, charsetinfo *info); | 26 | Opcode charsettype (const byte *cs, charsetinfo *info); |
| 23 | byte getbytefromcharset (const byte *cs, const charsetinfo *info, int index); | 27 | byte getbytefromcharset (const charsetinfo *info, int index); |
| 28 | void tree2cset (TTree *tree, charsetinfo *info); | ||
| 24 | 29 | ||
| 25 | #endif | 30 | #endif |
| @@ -46,6 +46,17 @@ static void printIcharset (const Instruction *inst, const byte *buff) { | |||
| 46 | } | 46 | } |
| 47 | 47 | ||
| 48 | 48 | ||
| 49 | static void printTcharset (TTree *tree) { | ||
| 50 | byte cs[CHARSETSIZE]; | ||
| 51 | int i; | ||
| 52 | printf("(%02x-%d) ", tree->u.set.offset, tree->u.set.size); | ||
| 53 | loopset(j, cs[j] = tree->u.set.deflt); | ||
| 54 | for (i = 0; i < tree->u.set.size; i++) | ||
| 55 | cs[tree->u.set.offset + i] = treebuffer(tree)[i]; | ||
| 56 | printcharset(cs); | ||
| 57 | } | ||
| 58 | |||
| 59 | |||
| 49 | static const char *capkind (int kind) { | 60 | static const char *capkind (int kind) { |
| 50 | const char *const modes[] = { | 61 | const char *const modes[] = { |
| 51 | "close", "position", "constant", "backref", | 62 | "close", "position", "constant", "backref", |
| @@ -186,7 +197,7 @@ void printtree (TTree *tree, int ident) { | |||
| 186 | break; | 197 | break; |
| 187 | } | 198 | } |
| 188 | case TSet: { | 199 | case TSet: { |
| 189 | printcharset(treebuffer(tree)); | 200 | printTcharset(tree); |
| 190 | printf("\n"); | 201 | printf("\n"); |
| 191 | break; | 202 | break; |
| 192 | } | 203 | } |
| @@ -12,6 +12,7 @@ | |||
| 12 | #include "lpcode.h" | 12 | #include "lpcode.h" |
| 13 | #include "lpprint.h" | 13 | #include "lpprint.h" |
| 14 | #include "lptree.h" | 14 | #include "lptree.h" |
| 15 | #include "lpcset.h" | ||
| 15 | 16 | ||
| 16 | 17 | ||
| 17 | /* number of siblings for each tree */ | 18 | /* number of siblings for each tree */ |
| @@ -370,14 +371,37 @@ static TTree *newleaf (lua_State *L, int tag) { | |||
| 370 | } | 371 | } |
| 371 | 372 | ||
| 372 | 373 | ||
| 373 | static TTree *newcharset (lua_State *L) { | 374 | static TTree *newccharset (lua_State *L, byte *cs, charsetinfo *info) { |
| 374 | TTree *tree = newtree(L, bytes2slots(CHARSETSIZE) + 1); | 375 | int i; |
| 376 | TTree *tree = newtree(L, bytes2slots(info->size) + 1); | ||
| 375 | tree->tag = TSet; | 377 | tree->tag = TSet; |
| 376 | loopset(i, treebuffer(tree)[i] = 0); | 378 | tree->u.set.offset = info->offset; |
| 379 | tree->u.set.size = info->size; | ||
| 380 | tree->u.set.deflt = info->deflt; | ||
| 381 | for (i = 0; i < info->size; i++) | ||
| 382 | treebuffer(tree)[i] = cs[info->offset + i]; | ||
| 377 | return tree; | 383 | return tree; |
| 378 | } | 384 | } |
| 379 | 385 | ||
| 380 | 386 | ||
| 387 | static TTree *newcharset (lua_State *L, byte *cs) { | ||
| 388 | charsetinfo info; | ||
| 389 | Opcode op = charsettype(cs, &info); | ||
| 390 | switch (op) { | ||
| 391 | case IFail: return newleaf(L, TFalse); | ||
| 392 | case IAny: return newleaf(L, TAny); | ||
| 393 | case IChar: { | ||
| 394 | TTree *tree =newleaf(L, TChar); | ||
| 395 | tree->u.n = info.offset; | ||
| 396 | return tree; | ||
| 397 | } | ||
| 398 | default: | ||
| 399 | assert(op == ISet); | ||
| 400 | return newccharset(L, cs, &info); | ||
| 401 | } | ||
| 402 | } | ||
| 403 | |||
| 404 | |||
| 381 | /* | 405 | /* |
| 382 | ** add to tree a sequence where first sibling is 'sib' (with size | 406 | ** add to tree a sequence where first sibling is 'sib' (with size |
| 383 | ** 'sibsize'); returns position for second sibling | 407 | ** 'sibsize'); returns position for second sibling |
| @@ -549,8 +573,8 @@ static int lp_choice (lua_State *L) { | |||
| 549 | TTree *t1 = getpatt(L, 1, NULL); | 573 | TTree *t1 = getpatt(L, 1, NULL); |
| 550 | TTree *t2 = getpatt(L, 2, NULL); | 574 | TTree *t2 = getpatt(L, 2, NULL); |
| 551 | if (tocharset(t1, &st1) && tocharset(t2, &st2)) { | 575 | if (tocharset(t1, &st1) && tocharset(t2, &st2)) { |
| 552 | TTree *t = newcharset(L); | 576 | loopset(i, st1.cs[i] |= st2.cs[i]); |
| 553 | loopset(i, treebuffer(t)[i] = st1.cs[i] | st2.cs[i]); | 577 | newcharset(L, st1.cs); |
| 554 | } | 578 | } |
| 555 | else if (nofail(t1) || t2->tag == TFalse) | 579 | else if (nofail(t1) || t2->tag == TFalse) |
| 556 | lua_pushvalue(L, 1); /* true / x => true, x / false => x */ | 580 | lua_pushvalue(L, 1); /* true / x => true, x / false => x */ |
| @@ -626,8 +650,8 @@ static int lp_sub (lua_State *L) { | |||
| 626 | TTree *t1 = getpatt(L, 1, &s1); | 650 | TTree *t1 = getpatt(L, 1, &s1); |
| 627 | TTree *t2 = getpatt(L, 2, &s2); | 651 | TTree *t2 = getpatt(L, 2, &s2); |
| 628 | if (tocharset(t1, &st1) && tocharset(t2, &st2)) { | 652 | if (tocharset(t1, &st1) && tocharset(t2, &st2)) { |
| 629 | TTree *t = newcharset(L); | 653 | loopset(i, st1.cs[i] &= ~st2.cs[i]); |
| 630 | loopset(i, treebuffer(t)[i] = st1.cs[i] & ~st2.cs[i]); | 654 | newcharset(L, st1.cs); |
| 631 | } | 655 | } |
| 632 | else { | 656 | else { |
| 633 | TTree *tree = newtree(L, 2 + s1 + s2); | 657 | TTree *tree = newtree(L, 2 + s1 + s2); |
| @@ -645,11 +669,13 @@ static int lp_sub (lua_State *L) { | |||
| 645 | static int lp_set (lua_State *L) { | 669 | static int lp_set (lua_State *L) { |
| 646 | size_t l; | 670 | size_t l; |
| 647 | const char *s = luaL_checklstring(L, 1, &l); | 671 | const char *s = luaL_checklstring(L, 1, &l); |
| 648 | TTree *tree = newcharset(L); | 672 | byte buff[CHARSETSIZE]; |
| 673 | loopset(i, buff[i] = 0); | ||
| 649 | while (l--) { | 674 | while (l--) { |
| 650 | setchar(treebuffer(tree), (byte)(*s)); | 675 | setchar(buff, (byte)(*s)); |
| 651 | s++; | 676 | s++; |
| 652 | } | 677 | } |
| 678 | newcharset(L, buff); | ||
| 653 | return 1; | 679 | return 1; |
| 654 | } | 680 | } |
| 655 | 681 | ||
| @@ -657,15 +683,17 @@ static int lp_set (lua_State *L) { | |||
| 657 | static int lp_range (lua_State *L) { | 683 | static int lp_range (lua_State *L) { |
| 658 | int arg; | 684 | int arg; |
| 659 | int top = lua_gettop(L); | 685 | int top = lua_gettop(L); |
| 660 | TTree *tree = newcharset(L); | 686 | byte buff[CHARSETSIZE]; |
| 687 | loopset(i, buff[i] = 0); | ||
| 661 | for (arg = 1; arg <= top; arg++) { | 688 | for (arg = 1; arg <= top; arg++) { |
| 662 | int c; | 689 | int c; |
| 663 | size_t l; | 690 | size_t l; |
| 664 | const char *r = luaL_checklstring(L, arg, &l); | 691 | const char *r = luaL_checklstring(L, arg, &l); |
| 665 | luaL_argcheck(L, l == 2, arg, "range must have two characters"); | 692 | luaL_argcheck(L, l == 2, arg, "range must have two characters"); |
| 666 | for (c = (byte)r[0]; c <= (byte)r[1]; c++) | 693 | for (c = (byte)r[0]; c <= (byte)r[1]; c++) |
| 667 | setchar(treebuffer(tree), c); | 694 | setchar(buff, c); |
| 668 | } | 695 | } |
| 696 | newcharset(L, buff); | ||
| 669 | return 1; | 697 | return 1; |
| 670 | } | 698 | } |
| 671 | 699 | ||
| @@ -704,10 +732,12 @@ static int lp_utfr (lua_State *L) { | |||
| 704 | lua_Unsigned to = (lua_Unsigned)luaL_checkinteger(L, 2); | 732 | lua_Unsigned to = (lua_Unsigned)luaL_checkinteger(L, 2); |
| 705 | luaL_argcheck(L, from <= to, 2, "empty range"); | 733 | luaL_argcheck(L, from <= to, 2, "empty range"); |
| 706 | if (to <= 0x7f) { /* ascii range? */ | 734 | if (to <= 0x7f) { /* ascii range? */ |
| 707 | TTree *tree = newcharset(L); /* code it as a regular charset */ | ||
| 708 | unsigned int f; | 735 | unsigned int f; |
| 736 | byte buff[CHARSETSIZE]; /* code it as a regular charset */ | ||
| 737 | loopset(i, buff[i] = 0); | ||
| 709 | for (f = (int)from; f <= to; f++) | 738 | for (f = (int)from; f <= to; f++) |
| 710 | setchar(treebuffer(tree), f); | 739 | setchar(buff, f); |
| 740 | newcharset(L, buff); | ||
| 711 | } | 741 | } |
| 712 | else { /* multi-byte utf-8 range */ | 742 | else { /* multi-byte utf-8 range */ |
| 713 | TTree *tree = newtree(L, 2); | 743 | TTree *tree = newtree(L, 2); |
| @@ -1261,11 +1291,17 @@ int lp_gc (lua_State *L) { | |||
| 1261 | } | 1291 | } |
| 1262 | 1292 | ||
| 1263 | 1293 | ||
| 1294 | /* | ||
| 1295 | ** Create a charset representing a category of characters, given by | ||
| 1296 | ** the predicate 'catf'. | ||
| 1297 | */ | ||
| 1264 | static void createcat (lua_State *L, const char *catname, int (catf) (int)) { | 1298 | static void createcat (lua_State *L, const char *catname, int (catf) (int)) { |
| 1265 | TTree *t = newcharset(L); | 1299 | int c; |
| 1266 | int i; | 1300 | byte buff[CHARSETSIZE]; |
| 1267 | for (i = 0; i <= UCHAR_MAX; i++) | 1301 | loopset(i, buff[i] = 0); |
| 1268 | if (catf(i)) setchar(treebuffer(t), i); | 1302 | for (c = 0; c <= UCHAR_MAX; c++) |
| 1303 | if (catf(c)) setchar(buff, c); | ||
| 1304 | newcharset(L, buff); | ||
| 1269 | lua_setfield(L, -2, catname); | 1305 | lua_setfield(L, -2, catname); |
| 1270 | } | 1306 | } |
| 1271 | 1307 | ||
| @@ -3,7 +3,7 @@ | |||
| 3 | #define lptree_h | 3 | #define lptree_h |
| 4 | 4 | ||
| 5 | 5 | ||
| 6 | #include "lptypes.h" | 6 | #include "lptypes.h" |
| 7 | 7 | ||
| 8 | 8 | ||
| 9 | /* | 9 | /* |
| @@ -11,7 +11,7 @@ | |||
| 11 | */ | 11 | */ |
| 12 | typedef enum TTag { | 12 | typedef enum TTag { |
| 13 | TChar = 0, /* 'n' = char */ | 13 | TChar = 0, /* 'n' = char */ |
| 14 | TSet, /* the set is stored in next CHARSETSIZE bytes */ | 14 | TSet, /* the set is encoded in 'u.set' and the next 'u.set.size' bytes */ |
| 15 | TAny, | 15 | TAny, |
| 16 | TTrue, | 16 | TTrue, |
| 17 | TFalse, | 17 | TFalse, |
| @@ -52,6 +52,11 @@ typedef struct TTree { | |||
| 52 | union { | 52 | union { |
| 53 | int ps; /* occasional second child */ | 53 | int ps; /* occasional second child */ |
| 54 | int n; /* occasional counter */ | 54 | int n; /* occasional counter */ |
| 55 | struct { | ||
| 56 | byte offset; /* compact set offset (in bytes) */ | ||
| 57 | byte size; /* compact set size (in bytes) */ | ||
| 58 | byte deflt; /* default value */ | ||
| 59 | } set; /* for compact sets */ | ||
| 55 | } u; | 60 | } u; |
| 56 | } TTree; | 61 | } TTree; |
| 57 | 62 | ||
| @@ -1,9 +1,9 @@ | |||
| 1 | LIBNAME = lpeg | 1 | LIBNAME = lpeg |
| 2 | LUADIR = ./lua/ | 2 | LUADIR = ./lua/ |
| 3 | 3 | ||
| 4 | COPT = -O2 -DNDEBUG | 4 | # COPT = -O2 -DNDEBUG |
| 5 | # COPT = -g | 5 | # COPT = -g |
| 6 | # COPT = -DLPEG_DEBUG | 6 | COPT = -DLPEG_DEBUG -g |
| 7 | 7 | ||
| 8 | CWARNS = -Wall -Wextra -pedantic \ | 8 | CWARNS = -Wall -Wextra -pedantic \ |
| 9 | -Waggregate-return \ | 9 | -Waggregate-return \ |
| @@ -49,8 +49,9 @@ clean: | |||
| 49 | 49 | ||
| 50 | 50 | ||
| 51 | lpcap.o: lpcap.c lpcap.h lptypes.h | 51 | lpcap.o: lpcap.c lpcap.h lptypes.h |
| 52 | lpcode.o: lpcode.c lptypes.h lpcode.h lptree.h lpvm.h lpcap.h | 52 | lpcode.o: lpcode.c lptypes.h lpcode.h lptree.h lpvm.h lpcap.h lpcset.h |
| 53 | lpprint.o: lpprint.c lptypes.h lpprint.h lptree.h lpvm.h lpcap.h | 53 | lpcset.o: lpcset.c lptypes.h lpcset.h lpcode.h lptree.h lpvm.h lpcap.h |
| 54 | lptree.o: lptree.c lptypes.h lpcap.h lpcode.h lptree.h lpvm.h lpprint.h | 54 | lpprint.o: lpprint.c lptypes.h lpprint.h lptree.h lpvm.h lpcap.h lpcode.h |
| 55 | lptree.o: lptree.c lptypes.h lpcap.h lpcode.h lptree.h lpvm.h lpprint.h \ | ||
| 56 | lpcset.h | ||
| 55 | lpvm.o: lpvm.c lpcap.h lptypes.h lpvm.h lpprint.h lptree.h | 57 | lpvm.o: lpvm.c lpcap.h lptypes.h lpvm.h lpprint.h lptree.h |
| 56 | lpcset.o: lpcset.c lpcset.h lptypes.h | ||
| @@ -68,6 +68,8 @@ assert(m.match(#m.P(true) * "a", "a") == 2) | |||
| 68 | assert(m.match("a" * #m.P(false), "a") == nil) | 68 | assert(m.match("a" * #m.P(false), "a") == nil) |
| 69 | assert(m.match("a" * #m.P(true), "a") == 2) | 69 | assert(m.match("a" * #m.P(true), "a") == 2) |
| 70 | 70 | ||
| 71 | assert(m.match(m.P(1)^0, "abcd") == 5) | ||
| 72 | assert(m.match(m.S("")^0, "abcd") == 1) | ||
| 71 | 73 | ||
| 72 | -- tests for locale | 74 | -- tests for locale |
| 73 | do | 75 | do |
| @@ -1167,7 +1169,7 @@ end | |||
| 1167 | 1169 | ||
| 1168 | 1170 | ||
| 1169 | -- bug in 1.0: problems with math-times returning too many captures | 1171 | -- bug in 1.0: problems with math-times returning too many captures |
| 1170 | do | 1172 | if _VERSION >= "Lua 5.2" then |
| 1171 | local lim = 2^11 - 10 | 1173 | local lim = 2^11 - 10 |
| 1172 | local res = {m.match(manyCmt(lim), "a")} | 1174 | local res = {m.match(manyCmt(lim), "a")} |
| 1173 | assert(#res == lim and res[1] == lim - 1 and res[lim] == 0) | 1175 | assert(#res == lim and res[1] == lim - 1 and res[lim] == 0) |
