diff options
author | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2023-04-24 17:44:05 -0300 |
---|---|---|
committer | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2023-04-24 17:44:05 -0300 |
commit | 0476d60007ec6693fd9643f6c92aa3adb9fde8d7 (patch) | |
tree | 7f2e00409650eecd2f64dd76b82b3f9387be6927 | |
parent | f8e9bc1c721a0802b2260f48ced72c7e04d7b1ef (diff) | |
download | lpeg-0476d60007ec6693fd9643f6c92aa3adb9fde8d7.tar.gz lpeg-0476d60007ec6693fd9643f6c92aa3adb9fde8d7.tar.bz2 lpeg-0476d60007ec6693fd9643f6c92aa3adb9fde8d7.zip |
Smaller encoding for charsets in code
-rw-r--r-- | lpcode.c | 198 | ||||
-rw-r--r-- | lpvm.c | 6 | ||||
-rw-r--r-- | lpvm.h | 4 |
3 files changed, 133 insertions, 75 deletions
@@ -29,53 +29,66 @@ static const Charset *fullset = &fullset_; | |||
29 | ** ======================================================= | 29 | ** ======================================================= |
30 | */ | 30 | */ |
31 | 31 | ||
32 | |||
33 | /* | ||
34 | ** Add to 'c' the index of the (only) bit set in byte 'b' | ||
35 | */ | ||
36 | static int onlybit (int c, int b) { | ||
37 | if ((b & 0xF0) != 0) { c += 4; b >>= 4; } | ||
38 | if ((b & 0x0C) != 0) { c += 2; b >>= 2; } | ||
39 | if ((b & 0x02) != 0) { c += 1; } | ||
40 | return c; | ||
41 | } | ||
42 | |||
43 | |||
44 | /* | ||
45 | ** Extra information for the result of 'charsettype'. | ||
46 | */ | ||
47 | typedef struct { | ||
48 | /* unique character for result IChar, offset (in bytes) for result ISet */ | ||
49 | int aux1; | ||
50 | int size; /* size (in instructions) for result ISet */ | ||
51 | int deflt; /* default value for bits outside that set */ | ||
52 | } charsetinfo; | ||
53 | |||
32 | /* | 54 | /* |
33 | ** Check whether a charset is empty (returns IFail), singleton (IChar), | 55 | ** Check whether a charset is empty (returns IFail), singleton (IChar), |
34 | ** full (IAny), or none of those (ISet). When singleton, '*c' returns | 56 | ** full (IAny), or none of those (ISet). When singleton, 'info.aux1' |
35 | ** which character it is. (When generic set, the set was the input, | 57 | ** returns which character it is. When generic set, 'info' returns |
36 | ** so there is no need to return it.) | 58 | ** information about its range. |
37 | */ | 59 | */ |
38 | static Opcode charsettype (const byte *cs, int *c) { | 60 | static Opcode charsettype (const byte *cs, charsetinfo *info) { |
39 | int count = 0; /* number of characters in the set */ | 61 | int low0, low1, high0, high1; |
40 | int i; | 62 | for (low1 = 0; low1 < CHARSETSIZE && cs[low1] == 0; low1++) |
41 | int candidate = -1; /* candidate position for the singleton char */ | 63 | /* find lowest byte with a 1-bit */; |
42 | for (i = 0; i < CHARSETSIZE; i++) { /* for each byte */ | 64 | if (low1 == CHARSETSIZE) |
43 | int b = cs[i]; | 65 | return IFail; /* no characters in set */ |
44 | if (b == 0) { /* is byte empty? */ | 66 | for (high1 = CHARSETSIZE - 1; cs[high1] == 0; high1--) |
45 | if (count > 1) /* was set neither empty nor singleton? */ | 67 | /* find highest byte with a 1-bit; low1 is a sentinel */; |
46 | return ISet; /* neither full nor empty nor singleton */ | 68 | if (low1 == high1) { /* only one byte with 1-bits? */ |
47 | /* else set is still empty or singleton */ | 69 | int b = cs[low1]; |
70 | if ((b & (b - 1)) == 0) { /* does byte has only one 1-bit? */ | ||
71 | info->aux1 = onlybit(low1 * BITSPERCHAR, b); /* get that bit */ | ||
72 | return IChar; /* single character */ | ||
48 | } | 73 | } |
49 | else if (b == 0xFF) { /* is byte full? */ | ||
50 | if (count < (i * BITSPERCHAR)) /* was set not full? */ | ||
51 | return ISet; /* neither full nor empty nor singleton */ | ||
52 | else count += BITSPERCHAR; /* set is still full */ | ||
53 | } | ||
54 | else if ((b & (b - 1)) == 0) { /* has byte only one bit? */ | ||
55 | if (count > 0) /* was set not empty? */ | ||
56 | return ISet; /* neither full nor empty nor singleton */ | ||
57 | else { /* set has only one char till now; track it */ | ||
58 | count++; | ||
59 | candidate = i; | ||
60 | } | ||
61 | } | ||
62 | else return ISet; /* byte is neither empty, full, nor singleton */ | ||
63 | } | 74 | } |
64 | switch (count) { | 75 | for (low0 = 0; low0 < CHARSETSIZE && cs[low0] == 0xFF; low0++) |
65 | case 0: return IFail; /* empty set */ | 76 | /* find lowest byte with a 0-bit */; |
66 | case 1: { /* singleton; find character bit inside byte */ | 77 | if (low0 == CHARSETSIZE) |
67 | int b = cs[candidate]; | 78 | return IAny; /* set has all bits set */ |
68 | *c = candidate * BITSPERCHAR; | 79 | for (high0 = CHARSETSIZE - 1; cs[high0] == 0xFF; high0--) |
69 | if ((b & 0xF0) != 0) { *c += 4; b >>= 4; } | 80 | /* find highest byte with a 0-bit; low0 is a sentinel */; |
70 | if ((b & 0x0C) != 0) { *c += 2; b >>= 2; } | 81 | if (high1 - low1 <= high0 - low0) { /* range of 1s smaller than of 0s? */ |
71 | if ((b & 0x02) != 0) { *c += 1; } | 82 | info->aux1 = low1; |
72 | return IChar; | 83 | info->size = instsize(high1 - low1 + 1); |
73 | } | 84 | info->deflt = 0; /* all discharged bits were 0 */ |
74 | default: { | 85 | } |
75 | assert(count == CHARSETSIZE * BITSPERCHAR); /* full set */ | 86 | else { |
76 | return IAny; | 87 | info->aux1 = low0; |
77 | } | 88 | info->size = instsize(high0 - low0 + 1); |
89 | info->deflt = 1; /* all discharged bits were 1 */ | ||
78 | } | 90 | } |
91 | return ISet; | ||
79 | } | 92 | } |
80 | 93 | ||
81 | 94 | ||
@@ -584,19 +597,23 @@ static void codechar (CompileState *compst, int c, int tt) { | |||
584 | 597 | ||
585 | 598 | ||
586 | /* | 599 | /* |
587 | ** Add a charset posfix to an instruction | 600 | ** Add a charset posfix to an instruction. |
588 | */ | 601 | */ |
589 | static void addcharset (CompileState *compst, int inst, const byte *cs) { | 602 | static void addcharset (CompileState *compst, int inst, const byte *cs, |
603 | const charsetinfo *info) { | ||
590 | int p = gethere(compst); | 604 | int p = gethere(compst); |
605 | Instruction *I = &getinstr(compst, inst); | ||
606 | byte *charset; | ||
591 | int i; | 607 | int i; |
592 | /* for now, all sets use all bits */ | 608 | I->i.aux2.set.offset = info->aux1 * 8; /* offset in bits */ |
593 | getinstr(compst, inst).i.aux2.set.offset = 0; | 609 | I->i.aux2.set.size = info->size; /* size in instructions */ |
594 | getinstr(compst, inst).i.aux2.set.size = instsize(CHARSETSIZE); | 610 | I->i.aux1 = info->deflt; |
595 | getinstr(compst, inst).i.aux1 = 0; | 611 | for (i = 0; i < info->size; i++) |
596 | for (i = 0; i < (int)instsize(CHARSETSIZE); i++) | 612 | nextinstruction(compst); /* space for charset */ |
597 | nextinstruction(compst); /* space for buffer */ | 613 | charset = getinstr(compst, p).buff; /* previous loop may reallocate things */ |
598 | /* fill buffer with charset */ | 614 | /* fill buffer with charset */ |
599 | loopset(j, getinstr(compst, p).buff[j] = cs[j]); | 615 | for (i = 0; i < info->size * (int)sizeof(Instruction); i++) |
616 | charset[i] = cs[i + info->aux1]; | ||
600 | } | 617 | } |
601 | 618 | ||
602 | 619 | ||
@@ -606,21 +623,24 @@ static void addcharset (CompileState *compst, int inst, const byte *cs) { | |||
606 | ** when instruction is dominated by an equivalent test. | 623 | ** when instruction is dominated by an equivalent test. |
607 | */ | 624 | */ |
608 | static void codecharset (CompileState *compst, const byte *cs, int tt) { | 625 | static void codecharset (CompileState *compst, const byte *cs, int tt) { |
609 | int c = 0; /* (=) to avoid warnings */ | 626 | charsetinfo info; |
610 | Opcode op = charsettype(cs, &c); | 627 | Opcode op = charsettype(cs, &info); |
611 | switch (op) { | 628 | switch (op) { |
612 | case IChar: codechar(compst, c, tt); break; | 629 | case IChar: codechar(compst, info.aux1, tt); break; |
613 | case ISet: { /* non-trivial set? */ | 630 | case ISet: { /* non-trivial set? */ |
614 | if (tt >= 0 && getinstr(compst, tt).i.code == ITestSet && | 631 | if (tt >= 0 && getinstr(compst, tt).i.code == ITestSet && |
615 | cs_equal(cs, getinstr(compst, tt + 2).buff)) | 632 | cs_equal(cs, getinstr(compst, tt + 2).buff)) |
616 | addinstruction(compst, IAny, 0); | 633 | addinstruction(compst, IAny, 0); |
617 | else { | 634 | else { |
618 | int i = addinstruction(compst, ISet, 0); | 635 | int i = addinstruction(compst, ISet, 0); |
619 | addcharset(compst, i, cs); | 636 | addcharset(compst, i, cs, &info); |
620 | } | 637 | } |
621 | break; | 638 | break; |
622 | } | 639 | } |
623 | default: addinstruction(compst, op, c); break; | 640 | default: |
641 | assert(op == IFail || op == IAny); | ||
642 | addinstruction(compst, op, 0); | ||
643 | break; | ||
624 | } | 644 | } |
625 | } | 645 | } |
626 | 646 | ||
@@ -634,22 +654,22 @@ static void codecharset (CompileState *compst, const byte *cs, int tt) { | |||
634 | static int codetestset (CompileState *compst, Charset *cs, int e) { | 654 | static int codetestset (CompileState *compst, Charset *cs, int e) { |
635 | if (e) return NOINST; /* no test */ | 655 | if (e) return NOINST; /* no test */ |
636 | else { | 656 | else { |
637 | int c = 0; | 657 | charsetinfo info; |
638 | Opcode op = charsettype(cs->cs, &c); | 658 | Opcode op = charsettype(cs->cs, &info); |
639 | switch (op) { | 659 | switch (op) { |
640 | case IFail: return addoffsetinst(compst, IJmp); /* always jump */ | 660 | case IFail: return addoffsetinst(compst, IJmp); /* always jump */ |
641 | case IAny: return addoffsetinst(compst, ITestAny); | 661 | case IAny: return addoffsetinst(compst, ITestAny); |
642 | case IChar: { | 662 | case IChar: { |
643 | int i = addoffsetinst(compst, ITestChar); | 663 | int i = addoffsetinst(compst, ITestChar); |
644 | getinstr(compst, i).i.aux1 = c; | 664 | getinstr(compst, i).i.aux1 = info.aux1; |
645 | return i; | 665 | return i; |
646 | } | 666 | } |
647 | case ISet: { | 667 | default: { /* regular set */ |
648 | int i = addoffsetinst(compst, ITestSet); | 668 | int i = addoffsetinst(compst, ITestSet); |
649 | addcharset(compst, i, cs->cs); | 669 | addcharset(compst, i, cs->cs, &info); |
670 | assert(op == ISet); | ||
650 | return i; | 671 | return i; |
651 | } | 672 | } |
652 | default: assert(0); return 0; | ||
653 | } | 673 | } |
654 | } | 674 | } |
655 | } | 675 | } |
@@ -785,8 +805,51 @@ static void coderuntime (CompileState *compst, TTree *tree, int tt) { | |||
785 | 805 | ||
786 | 806 | ||
787 | /* | 807 | /* |
808 | ** Create a jump to 'test' and fix 'test' to jump to next instruction | ||
809 | */ | ||
810 | static void closeloop (CompileState *compst, int test) { | ||
811 | int jmp = addoffsetinst(compst, IJmp); | ||
812 | jumptohere(compst, test); | ||
813 | jumptothere(compst, jmp, test); | ||
814 | } | ||
815 | |||
816 | |||
817 | /* | ||
818 | ** Repetition of charsets: | ||
819 | ** For an empty set, repetition of fail is a no-op; | ||
820 | ** For any or char, code a tight loop; | ||
821 | ** For generic charset, use a span instruction. | ||
822 | */ | ||
823 | static void coderepcharset (CompileState *compst, Charset *st) { | ||
824 | charsetinfo info; | ||
825 | Opcode op = charsettype(st->cs, &info); | ||
826 | switch (op) { | ||
827 | case IFail: break; /* fail* is a no-op */ | ||
828 | case IAny: { /* L1: testany -> L2; any; jmp L1; L2: */ | ||
829 | int test = addoffsetinst(compst, ITestAny); | ||
830 | addinstruction(compst, IAny, 0); | ||
831 | closeloop(compst, test); | ||
832 | break; | ||
833 | } | ||
834 | case IChar: { /* L1: testchar c -> L2; any; jmp L1; L2: */ | ||
835 | int test = addoffsetinst(compst, ITestChar); | ||
836 | getinstr(compst, test).i.aux1 = info.aux1; | ||
837 | addinstruction(compst, IAny, 0); | ||
838 | closeloop(compst, test); | ||
839 | break; | ||
840 | } | ||
841 | default: { /* regular set */ | ||
842 | int i = addinstruction(compst, ISpan, 0); | ||
843 | addcharset(compst, i, st->cs, &info); | ||
844 | assert(op == ISet); | ||
845 | } | ||
846 | } | ||
847 | } | ||
848 | |||
849 | |||
850 | /* | ||
788 | ** Repetion; optimizations: | 851 | ** Repetion; optimizations: |
789 | ** When pattern is a charset, can use special instruction ISpan. | 852 | ** When pattern is a charset, use special code. |
790 | ** When pattern is head fail, or if it starts with characters that | 853 | ** When pattern is head fail, or if it starts with characters that |
791 | ** are disjoint from what follows the repetions, a simple test | 854 | ** are disjoint from what follows the repetions, a simple test |
792 | ** is enough (a fail inside the repetition would backtrack to fail | 855 | ** is enough (a fail inside the repetition would backtrack to fail |
@@ -797,20 +860,15 @@ static void coderuntime (CompileState *compst, TTree *tree, int tt) { | |||
797 | static void coderep (CompileState *compst, TTree *tree, int opt, | 860 | static void coderep (CompileState *compst, TTree *tree, int opt, |
798 | const Charset *fl) { | 861 | const Charset *fl) { |
799 | Charset st; | 862 | Charset st; |
800 | if (tocharset(tree, &st)) { | 863 | if (tocharset(tree, &st)) |
801 | int i = addinstruction(compst, ISpan, 0); | 864 | coderepcharset(compst, &st); |
802 | addcharset(compst, i, st.cs); | ||
803 | } | ||
804 | else { | 865 | else { |
805 | int e1 = getfirst(tree, fullset, &st); | 866 | int e1 = getfirst(tree, fullset, &st); |
806 | if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) { | 867 | if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) { |
807 | /* L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: */ | 868 | /* L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: */ |
808 | int jmp; | ||
809 | int test = codetestset(compst, &st, 0); | 869 | int test = codetestset(compst, &st, 0); |
810 | codegen(compst, tree, 0, test, fullset); | 870 | codegen(compst, tree, 0, test, fullset); |
811 | jmp = addoffsetinst(compst, IJmp); | 871 | closeloop(compst, test); |
812 | jumptohere(compst, test); | ||
813 | jumptothere(compst, jmp, test); | ||
814 | } | 872 | } |
815 | else { | 873 | else { |
816 | /* test(fail(p1)) -> L2; choice L2; L1: <p>; partialcommit L1; L2: */ | 874 | /* test(fail(p1)) -> L2; choice L2; L1: <p>; partialcommit L1; L2: */ |
@@ -24,12 +24,12 @@ static const Instruction giveup = {{IGiveup, 0, {0}}}; | |||
24 | 24 | ||
25 | 25 | ||
26 | int charinset (const Instruction *i, const byte *buff, unsigned int c) { | 26 | int charinset (const Instruction *i, const byte *buff, unsigned int c) { |
27 | c -= (unsigned int)i->i.aux2.set.offset; | 27 | c -= i->i.aux2.set.offset; |
28 | if (c >= ((unsigned int)i->i.aux2.set.size /* size in instructions... */ | 28 | if (c >= ((unsigned int)i->i.aux2.set.size /* size in instructions... */ |
29 | * (unsigned int)sizeof(Instruction) /* in bytes... */ | 29 | * (unsigned int)sizeof(Instruction) /* in bytes... */ |
30 | * 8u)) /* in bits */ | 30 | * 8u)) /* in bits */ |
31 | return i->i.aux1; /* out of range */ | 31 | return i->i.aux1; /* out of range; return default value */ |
32 | return (testchar(buff, c) != i->i.aux1); | 32 | return testchar(buff, c); |
33 | } | 33 | } |
34 | 34 | ||
35 | 35 | ||
@@ -7,8 +7,8 @@ | |||
7 | 7 | ||
8 | /* | 8 | /* |
9 | ** About Character sets in instructions: a set is a bit map with an | 9 | ** About Character sets in instructions: a set is a bit map with an |
10 | ** initial offset, in bits, and a size, in number of instructions. If | 10 | ** initial offset, in bits, and a size, in number of instructions. |
11 | ** aux1 is one, set is inverted (bit == 1 means char is not in set). | 11 | ** aux1 has the default value for the bits outsize that range. |
12 | */ | 12 | */ |
13 | 13 | ||
14 | 14 | ||