aboutsummaryrefslogtreecommitdiff
path: root/lpcode.c
diff options
context:
space:
mode:
authorRoberto Ierusalimschy <roberto@inf.puc-rio.br>2023-04-24 17:44:05 -0300
committerRoberto Ierusalimschy <roberto@inf.puc-rio.br>2023-04-24 17:44:05 -0300
commit0476d60007ec6693fd9643f6c92aa3adb9fde8d7 (patch)
tree7f2e00409650eecd2f64dd76b82b3f9387be6927 /lpcode.c
parentf8e9bc1c721a0802b2260f48ced72c7e04d7b1ef (diff)
downloadlpeg-0476d60007ec6693fd9643f6c92aa3adb9fde8d7.tar.gz
lpeg-0476d60007ec6693fd9643f6c92aa3adb9fde8d7.tar.bz2
lpeg-0476d60007ec6693fd9643f6c92aa3adb9fde8d7.zip
Smaller encoding for charsets in code
Diffstat (limited to 'lpcode.c')
-rw-r--r--lpcode.c198
1 files changed, 128 insertions, 70 deletions
diff --git a/lpcode.c b/lpcode.c
index 8dab896..1ef7e2d 100644
--- a/lpcode.c
+++ b/lpcode.c
@@ -29,53 +29,66 @@ static const Charset *fullset = &fullset_;
29** ======================================================= 29** =======================================================
30*/ 30*/
31 31
32
33/*
34** Add to 'c' the index of the (only) bit set in byte 'b'
35*/
36static int onlybit (int c, int b) {
37 if ((b & 0xF0) != 0) { c += 4; b >>= 4; }
38 if ((b & 0x0C) != 0) { c += 2; b >>= 2; }
39 if ((b & 0x02) != 0) { c += 1; }
40 return c;
41}
42
43
44/*
45** Extra information for the result of 'charsettype'.
46*/
47typedef struct {
48 /* unique character for result IChar, offset (in bytes) for result ISet */
49 int aux1;
50 int size; /* size (in instructions) for result ISet */
51 int deflt; /* default value for bits outside that set */
52} charsetinfo;
53
32/* 54/*
33** Check whether a charset is empty (returns IFail), singleton (IChar), 55** Check whether a charset is empty (returns IFail), singleton (IChar),
34** full (IAny), or none of those (ISet). When singleton, '*c' returns 56** full (IAny), or none of those (ISet). When singleton, 'info.aux1'
35** which character it is. (When generic set, the set was the input, 57** returns which character it is. When generic set, 'info' returns
36** so there is no need to return it.) 58** information about its range.
37*/ 59*/
38static Opcode charsettype (const byte *cs, int *c) { 60static Opcode charsettype (const byte *cs, charsetinfo *info) {
39 int count = 0; /* number of characters in the set */ 61 int low0, low1, high0, high1;
40 int i; 62 for (low1 = 0; low1 < CHARSETSIZE && cs[low1] == 0; low1++)
41 int candidate = -1; /* candidate position for the singleton char */ 63 /* find lowest byte with a 1-bit */;
42 for (i = 0; i < CHARSETSIZE; i++) { /* for each byte */ 64 if (low1 == CHARSETSIZE)
43 int b = cs[i]; 65 return IFail; /* no characters in set */
44 if (b == 0) { /* is byte empty? */ 66 for (high1 = CHARSETSIZE - 1; cs[high1] == 0; high1--)
45 if (count > 1) /* was set neither empty nor singleton? */ 67 /* find highest byte with a 1-bit; low1 is a sentinel */;
46 return ISet; /* neither full nor empty nor singleton */ 68 if (low1 == high1) { /* only one byte with 1-bits? */
47 /* else set is still empty or singleton */ 69 int b = cs[low1];
70 if ((b & (b - 1)) == 0) { /* does byte has only one 1-bit? */
71 info->aux1 = onlybit(low1 * BITSPERCHAR, b); /* get that bit */
72 return IChar; /* single character */
48 } 73 }
49 else if (b == 0xFF) { /* is byte full? */
50 if (count < (i * BITSPERCHAR)) /* was set not full? */
51 return ISet; /* neither full nor empty nor singleton */
52 else count += BITSPERCHAR; /* set is still full */
53 }
54 else if ((b & (b - 1)) == 0) { /* has byte only one bit? */
55 if (count > 0) /* was set not empty? */
56 return ISet; /* neither full nor empty nor singleton */
57 else { /* set has only one char till now; track it */
58 count++;
59 candidate = i;
60 }
61 }
62 else return ISet; /* byte is neither empty, full, nor singleton */
63 } 74 }
64 switch (count) { 75 for (low0 = 0; low0 < CHARSETSIZE && cs[low0] == 0xFF; low0++)
65 case 0: return IFail; /* empty set */ 76 /* find lowest byte with a 0-bit */;
66 case 1: { /* singleton; find character bit inside byte */ 77 if (low0 == CHARSETSIZE)
67 int b = cs[candidate]; 78 return IAny; /* set has all bits set */
68 *c = candidate * BITSPERCHAR; 79 for (high0 = CHARSETSIZE - 1; cs[high0] == 0xFF; high0--)
69 if ((b & 0xF0) != 0) { *c += 4; b >>= 4; } 80 /* find highest byte with a 0-bit; low0 is a sentinel */;
70 if ((b & 0x0C) != 0) { *c += 2; b >>= 2; } 81 if (high1 - low1 <= high0 - low0) { /* range of 1s smaller than of 0s? */
71 if ((b & 0x02) != 0) { *c += 1; } 82 info->aux1 = low1;
72 return IChar; 83 info->size = instsize(high1 - low1 + 1);
73 } 84 info->deflt = 0; /* all discharged bits were 0 */
74 default: { 85 }
75 assert(count == CHARSETSIZE * BITSPERCHAR); /* full set */ 86 else {
76 return IAny; 87 info->aux1 = low0;
77 } 88 info->size = instsize(high0 - low0 + 1);
89 info->deflt = 1; /* all discharged bits were 1 */
78 } 90 }
91 return ISet;
79} 92}
80 93
81 94
@@ -584,19 +597,23 @@ static void codechar (CompileState *compst, int c, int tt) {
584 597
585 598
586/* 599/*
587** Add a charset posfix to an instruction 600** Add a charset posfix to an instruction.
588*/ 601*/
589static void addcharset (CompileState *compst, int inst, const byte *cs) { 602static void addcharset (CompileState *compst, int inst, const byte *cs,
603 const charsetinfo *info) {
590 int p = gethere(compst); 604 int p = gethere(compst);
605 Instruction *I = &getinstr(compst, inst);
606 byte *charset;
591 int i; 607 int i;
592 /* for now, all sets use all bits */ 608 I->i.aux2.set.offset = info->aux1 * 8; /* offset in bits */
593 getinstr(compst, inst).i.aux2.set.offset = 0; 609 I->i.aux2.set.size = info->size; /* size in instructions */
594 getinstr(compst, inst).i.aux2.set.size = instsize(CHARSETSIZE); 610 I->i.aux1 = info->deflt;
595 getinstr(compst, inst).i.aux1 = 0; 611 for (i = 0; i < info->size; i++)
596 for (i = 0; i < (int)instsize(CHARSETSIZE); i++) 612 nextinstruction(compst); /* space for charset */
597 nextinstruction(compst); /* space for buffer */ 613 charset = getinstr(compst, p).buff; /* previous loop may reallocate things */
598 /* fill buffer with charset */ 614 /* fill buffer with charset */
599 loopset(j, getinstr(compst, p).buff[j] = cs[j]); 615 for (i = 0; i < info->size * (int)sizeof(Instruction); i++)
616 charset[i] = cs[i + info->aux1];
600} 617}
601 618
602 619
@@ -606,21 +623,24 @@ static void addcharset (CompileState *compst, int inst, const byte *cs) {
606** when instruction is dominated by an equivalent test. 623** when instruction is dominated by an equivalent test.
607*/ 624*/
608static void codecharset (CompileState *compst, const byte *cs, int tt) { 625static void codecharset (CompileState *compst, const byte *cs, int tt) {
609 int c = 0; /* (=) to avoid warnings */ 626 charsetinfo info;
610 Opcode op = charsettype(cs, &c); 627 Opcode op = charsettype(cs, &info);
611 switch (op) { 628 switch (op) {
612 case IChar: codechar(compst, c, tt); break; 629 case IChar: codechar(compst, info.aux1, tt); break;
613 case ISet: { /* non-trivial set? */ 630 case ISet: { /* non-trivial set? */
614 if (tt >= 0 && getinstr(compst, tt).i.code == ITestSet && 631 if (tt >= 0 && getinstr(compst, tt).i.code == ITestSet &&
615 cs_equal(cs, getinstr(compst, tt + 2).buff)) 632 cs_equal(cs, getinstr(compst, tt + 2).buff))
616 addinstruction(compst, IAny, 0); 633 addinstruction(compst, IAny, 0);
617 else { 634 else {
618 int i = addinstruction(compst, ISet, 0); 635 int i = addinstruction(compst, ISet, 0);
619 addcharset(compst, i, cs); 636 addcharset(compst, i, cs, &info);
620 } 637 }
621 break; 638 break;
622 } 639 }
623 default: addinstruction(compst, op, c); break; 640 default:
641 assert(op == IFail || op == IAny);
642 addinstruction(compst, op, 0);
643 break;
624 } 644 }
625} 645}
626 646
@@ -634,22 +654,22 @@ static void codecharset (CompileState *compst, const byte *cs, int tt) {
634static int codetestset (CompileState *compst, Charset *cs, int e) { 654static int codetestset (CompileState *compst, Charset *cs, int e) {
635 if (e) return NOINST; /* no test */ 655 if (e) return NOINST; /* no test */
636 else { 656 else {
637 int c = 0; 657 charsetinfo info;
638 Opcode op = charsettype(cs->cs, &c); 658 Opcode op = charsettype(cs->cs, &info);
639 switch (op) { 659 switch (op) {
640 case IFail: return addoffsetinst(compst, IJmp); /* always jump */ 660 case IFail: return addoffsetinst(compst, IJmp); /* always jump */
641 case IAny: return addoffsetinst(compst, ITestAny); 661 case IAny: return addoffsetinst(compst, ITestAny);
642 case IChar: { 662 case IChar: {
643 int i = addoffsetinst(compst, ITestChar); 663 int i = addoffsetinst(compst, ITestChar);
644 getinstr(compst, i).i.aux1 = c; 664 getinstr(compst, i).i.aux1 = info.aux1;
645 return i; 665 return i;
646 } 666 }
647 case ISet: { 667 default: { /* regular set */
648 int i = addoffsetinst(compst, ITestSet); 668 int i = addoffsetinst(compst, ITestSet);
649 addcharset(compst, i, cs->cs); 669 addcharset(compst, i, cs->cs, &info);
670 assert(op == ISet);
650 return i; 671 return i;
651 } 672 }
652 default: assert(0); return 0;
653 } 673 }
654 } 674 }
655} 675}
@@ -785,8 +805,51 @@ static void coderuntime (CompileState *compst, TTree *tree, int tt) {
785 805
786 806
787/* 807/*
808** Create a jump to 'test' and fix 'test' to jump to next instruction
809*/
810static void closeloop (CompileState *compst, int test) {
811 int jmp = addoffsetinst(compst, IJmp);
812 jumptohere(compst, test);
813 jumptothere(compst, jmp, test);
814}
815
816
817/*
818** Repetition of charsets:
819** For an empty set, repetition of fail is a no-op;
820** For any or char, code a tight loop;
821** For generic charset, use a span instruction.
822*/
823static void coderepcharset (CompileState *compst, Charset *st) {
824 charsetinfo info;
825 Opcode op = charsettype(st->cs, &info);
826 switch (op) {
827 case IFail: break; /* fail* is a no-op */
828 case IAny: { /* L1: testany -> L2; any; jmp L1; L2: */
829 int test = addoffsetinst(compst, ITestAny);
830 addinstruction(compst, IAny, 0);
831 closeloop(compst, test);
832 break;
833 }
834 case IChar: { /* L1: testchar c -> L2; any; jmp L1; L2: */
835 int test = addoffsetinst(compst, ITestChar);
836 getinstr(compst, test).i.aux1 = info.aux1;
837 addinstruction(compst, IAny, 0);
838 closeloop(compst, test);
839 break;
840 }
841 default: { /* regular set */
842 int i = addinstruction(compst, ISpan, 0);
843 addcharset(compst, i, st->cs, &info);
844 assert(op == ISet);
845 }
846 }
847}
848
849
850/*
788** Repetion; optimizations: 851** Repetion; optimizations:
789** When pattern is a charset, can use special instruction ISpan. 852** When pattern is a charset, use special code.
790** When pattern is head fail, or if it starts with characters that 853** When pattern is head fail, or if it starts with characters that
791** are disjoint from what follows the repetions, a simple test 854** are disjoint from what follows the repetions, a simple test
792** is enough (a fail inside the repetition would backtrack to fail 855** is enough (a fail inside the repetition would backtrack to fail
@@ -797,20 +860,15 @@ static void coderuntime (CompileState *compst, TTree *tree, int tt) {
797static void coderep (CompileState *compst, TTree *tree, int opt, 860static void coderep (CompileState *compst, TTree *tree, int opt,
798 const Charset *fl) { 861 const Charset *fl) {
799 Charset st; 862 Charset st;
800 if (tocharset(tree, &st)) { 863 if (tocharset(tree, &st))
801 int i = addinstruction(compst, ISpan, 0); 864 coderepcharset(compst, &st);
802 addcharset(compst, i, st.cs);
803 }
804 else { 865 else {
805 int e1 = getfirst(tree, fullset, &st); 866 int e1 = getfirst(tree, fullset, &st);
806 if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) { 867 if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) {
807 /* L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: */ 868 /* L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: */
808 int jmp;
809 int test = codetestset(compst, &st, 0); 869 int test = codetestset(compst, &st, 0);
810 codegen(compst, tree, 0, test, fullset); 870 codegen(compst, tree, 0, test, fullset);
811 jmp = addoffsetinst(compst, IJmp); 871 closeloop(compst, test);
812 jumptohere(compst, test);
813 jumptothere(compst, jmp, test);
814 } 872 }
815 else { 873 else {
816 /* test(fail(p1)) -> L2; choice L2; L1: <p>; partialcommit L1; L2: */ 874 /* test(fail(p1)) -> L2; choice L2; L1: <p>; partialcommit L1; L2: */