From 012cf9c86cf91cb8354e229bde335592d41b84b2 Mon Sep 17 00:00:00 2001 From: Roberto Ierusalimschy Date: Thu, 27 Apr 2023 10:32:39 -0300 Subject: Compact charsets used in trees, too. --- lpcode.c | 118 ++++++++++++++++++++------------------------------------------ lpcode.h | 1 - lpcset.c | 55 +++++++++++++++++++++++++---- lpcset.h | 15 +++++--- lpprint.c | 13 ++++++- lptree.c | 70 ++++++++++++++++++++++++++++--------- lptree.h | 9 +++-- makefile | 13 +++---- test.lua | 4 ++- 9 files changed, 178 insertions(+), 120 deletions(-) diff --git a/lpcode.c b/lpcode.c index 66d2f3f..9289bd3 100644 --- a/lpcode.c +++ b/lpcode.c @@ -44,31 +44,6 @@ static int cs_disjoint (const Charset *cs1, const Charset *cs2) { } -/* -** If 'tree' is a 'char' pattern (TSet, TChar, TAny), convert it into a -** charset and return 1; else return 0. -*/ -int tocharset (TTree *tree, Charset *cs) { - switch (tree->tag) { - case TSet: { /* copy set */ - loopset(i, cs->cs[i] = treebuffer(tree)[i]); - return 1; - } - case TChar: { /* only one char */ - assert(0 <= tree->u.n && tree->u.n <= UCHAR_MAX); - loopset(i, cs->cs[i] = 0); /* erase all chars */ - setchar(cs->cs, tree->u.n); /* add that one */ - return 1; - } - case TAny: { - loopset(i, cs->cs[i] = 0xFF); /* add all characters to the set */ - return 1; - } - default: return 0; - } -} - - /* ** Visit a TCall node taking care to stop recursion. If node not yet ** visited, return 'f(sib2(tree))', otherwise return 'def' (default @@ -240,7 +215,7 @@ int fixedlen (TTree *tree) { static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { tailcall: switch (tree->tag) { - case TChar: case TSet: case TAny: { + case TChar: case TSet: case TAny: case TFalse: { tocharset(tree, firstset); return 0; } @@ -255,10 +230,6 @@ static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { loopset(i, firstset->cs[i] = follow->cs[i]); return 1; /* accepts the empty string */ } - case TFalse: { - loopset(i, firstset->cs[i] = 0); - return 0; - } case TChoice: { Charset csaux; int e1 = getfirst(sib1(tree), follow, firstset); @@ -543,37 +514,36 @@ static void codechar (CompileState *compst, int c, int tt) { /* ** Add a charset posfix to an instruction. */ -static void addcharset (CompileState *compst, int inst, const byte *cs, - const charsetinfo *info) { +static void addcharset (CompileState *compst, int inst, charsetinfo *info) { int p; Instruction *I = &getinstr(compst, inst); byte *charset; int isize = instsize(info->size); /* size in instructions */ int i; - I->i.aux2.set.offset = info->aux1 * 8; /* offset in bits */ + I->i.aux2.set.offset = info->offset * 8; /* offset in bits */ I->i.aux2.set.size = isize; I->i.aux1 = info->deflt; p = nextinstruction(compst, isize); /* space for charset */ charset = getinstr(compst, p).buff; /* charset buffer */ for (i = 0; i < isize * (int)sizeof(Instruction); i++) - charset[i] = getbytefromcharset(cs, info, i); /* fill the buffer */ + charset[i] = getbytefromcharset(info, i); /* copy the buffer */ } /* -** Check whether compact charset cs is dominated by instruction 'p' +** Check whether charset 'info' is dominated by instruction 'p' */ -static int cs_equal (Instruction *p, const byte *cs, charsetinfo *info) { +static int cs_equal (Instruction *p, charsetinfo *info) { if (p->i.code != ITestSet) return 0; - else if (p->i.aux2.set.offset != info->aux1 * 8 || + else if (p->i.aux2.set.offset != info->offset * 8 || p->i.aux2.set.size != instsize(info->size) || p->i.aux1 != info->deflt) return 0; else { int i; for (i = 0; i < instsize(info->size) * (int)sizeof(Instruction); i++) { - if ((p + 2)->buff[i] != getbytefromcharset(cs, info, i)) + if ((p + 2)->buff[i] != getbytefromcharset(info, i)) return 0; } } @@ -582,34 +552,23 @@ static int cs_equal (Instruction *p, const byte *cs, charsetinfo *info) { /* -** code a char set, optimizing unit sets for IChar, "complete" -** sets for IAny, and empty sets for IFail; also use an IAny -** when instruction is dominated by an equivalent test. +** Code a char set, using IAny when instruction is dominated by an +** equivalent test. */ -static void codecharset (CompileState *compst, const byte *cs, int tt) { +static void codecharset (CompileState *compst, TTree *tree, int tt) { charsetinfo info; - Opcode op = charsettype(cs, &info); - switch (op) { - case IChar: codechar(compst, info.aux1, tt); break; - case ISet: { /* non-trivial set? */ - if (tt >= 0 && cs_equal(&getinstr(compst, tt), cs, &info)) - addinstruction(compst, IAny, 0); - else { - int i = addinstruction(compst, ISet, 0); - addcharset(compst, i, cs, &info); - } - break; - } - default: - assert(op == IFail || op == IAny); - addinstruction(compst, op, 0); - break; + tree2cset(tree, &info); + if (tt >= 0 && cs_equal(&getinstr(compst, tt), &info)) + addinstruction(compst, IAny, 0); + else { + int i = addinstruction(compst, ISet, 0); + addcharset(compst, i, &info); } } /* -** code a test set, optimizing unit sets for ITestChar, "complete" +** Code a test set, optimizing unit sets for ITestChar, "complete" ** sets for ITestAny, and empty sets for IJmp (always fails). ** 'e' is true iff test should accept the empty string. (Test ** instructions in the current VM never accept the empty string.) @@ -624,12 +583,12 @@ static int codetestset (CompileState *compst, Charset *cs, int e) { case IAny: return addoffsetinst(compst, ITestAny); case IChar: { int i = addoffsetinst(compst, ITestChar); - getinstr(compst, i).i.aux1 = info.aux1; + getinstr(compst, i).i.aux1 = info.offset; return i; } default: { /* regular set */ int i = addoffsetinst(compst, ITestSet); - addcharset(compst, i, cs->cs, &info); + addcharset(compst, i, &info); assert(op == ISet); return i; } @@ -778,34 +737,35 @@ static void closeloop (CompileState *compst, int test) { /* -** Repetition of charsets: +** Try repetition of charsets: ** For an empty set, repetition of fail is a no-op; ** For any or char, code a tight loop; ** For generic charset, use a span instruction. */ -static void coderepcharset (CompileState *compst, Charset *st) { - charsetinfo info; - Opcode op = charsettype(st->cs, &info); - switch (op) { - case IFail: break; /* fail* is a no-op */ - case IAny: { /* L1: testany -> L2; any; jmp L1; L2: */ +static int coderepcharset (CompileState *compst, TTree *tree) { + switch (tree->tag) { + case TFalse: return 1; /* 'fail*' is a no-op */ + case TAny: { /* L1: testany -> L2; any; jmp L1; L2: */ int test = addoffsetinst(compst, ITestAny); addinstruction(compst, IAny, 0); closeloop(compst, test); - break; + return 1; } - case IChar: { /* L1: testchar c -> L2; any; jmp L1; L2: */ + case TChar: { /* L1: testchar c -> L2; any; jmp L1; L2: */ int test = addoffsetinst(compst, ITestChar); - getinstr(compst, test).i.aux1 = info.aux1; + getinstr(compst, test).i.aux1 = tree->u.n; addinstruction(compst, IAny, 0); closeloop(compst, test); - break; + return 1; } - default: { /* regular set */ + case TSet: { /* regular set */ + charsetinfo info; int i = addinstruction(compst, ISpan, 0); - addcharset(compst, i, st->cs, &info); - assert(op == ISet); + tree2cset(tree, &info); + addcharset(compst, i, &info); + return 1; } + default: return 0; /* not a charset */ } } @@ -822,10 +782,8 @@ static void coderepcharset (CompileState *compst, Charset *st) { */ static void coderep (CompileState *compst, TTree *tree, int opt, const Charset *fl) { - Charset st; - if (tocharset(tree, &st)) - coderepcharset(compst, &st); - else { + if (!coderepcharset(compst, tree)) { + Charset st; int e1 = getfirst(tree, fullset, &st); if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) { /* L1: test (fail(p1)) -> L2;

; jmp L1; L2: */ @@ -965,7 +923,7 @@ static void codegen (CompileState *compst, TTree *tree, int opt, int tt, switch (tree->tag) { case TChar: codechar(compst, tree->u.n, tt); break; case TAny: addinstruction(compst, IAny, 0); break; - case TSet: codecharset(compst, treebuffer(tree), tt); break; + case TSet: codecharset(compst, tree, tt); break; case TTrue: break; case TFalse: addinstruction(compst, IFail, 0); break; case TUTFR: codeutfr(compst, tree); break; diff --git a/lpcode.h b/lpcode.h index ec5f43f..3c71451 100644 --- a/lpcode.h +++ b/lpcode.h @@ -8,7 +8,6 @@ #include "lptree.h" #include "lpvm.h" -int tocharset (TTree *tree, Charset *cs); int checkaux (TTree *tree, int pred); int fixedlen (TTree *tree); int hascaptures (TTree *tree); diff --git a/lpcset.c b/lpcset.c index 9ecf475..2e62d94 100644 --- a/lpcset.c +++ b/lpcset.c @@ -16,7 +16,7 @@ static int onlybit (int c, int b) { /* ** Check whether a charset is empty (returns IFail), singleton (IChar), -** full (IAny), or none of those (ISet). When singleton, 'info.aux1' +** full (IAny), or none of those (ISet). When singleton, 'info.offset' ** returns which character it is. When generic set, 'info' returns ** information about its range. */ @@ -31,7 +31,7 @@ Opcode charsettype (const byte *cs, charsetinfo *info) { if (low1 == high1) { /* only one byte with 1-bits? */ int b = cs[low1]; if ((b & (b - 1)) == 0) { /* does byte has only one 1-bit? */ - info->aux1 = onlybit(low1 * BITSPERCHAR, b); /* get that bit */ + info->offset = onlybit(low1 * BITSPERCHAR, b); /* get that bit */ return IChar; /* single character */ } } @@ -42,15 +42,16 @@ Opcode charsettype (const byte *cs, charsetinfo *info) { for (high0 = CHARSETSIZE - 1; cs[high0] == 0xFF; high0--) /* find highest byte with a 0-bit; low0 is a sentinel */; if (high1 - low1 <= high0 - low0) { /* range of 1s smaller than of 0s? */ - info->aux1 = low1; + info->offset = low1; info->size = high1 - low1 + 1; info->deflt = 0; /* all discharged bits were 0 */ } else { - info->aux1 = low0; + info->offset = low0; info->size = high0 - low0 + 1; info->deflt = 0xFF; /* all discharged bits were 1 */ } + info->cs = cs + info->offset; return ISet; } @@ -60,10 +61,50 @@ Opcode charsettype (const byte *cs, charsetinfo *info) { ** range, get the byte from the supporting charset (correcting it ** by the offset). Otherwise, return the default for the set. */ -byte getbytefromcharset (const byte *cs, const charsetinfo *info, - int index) { +byte getbytefromcharset (const charsetinfo *info, int index) { if (index < info->size) - return cs[info->aux1 + index]; + return info->cs[index]; else return info->deflt; } + +/* +** If 'tree' is a 'char' pattern (TSet, TChar, TAny, TFalse), convert it +** into a charset and return 1; else return 0. +*/ +int tocharset (TTree *tree, Charset *cs) { + switch (tree->tag) { + case TChar: { /* only one char */ + assert(0 <= tree->u.n && tree->u.n <= UCHAR_MAX); + loopset(i, cs->cs[i] = 0); /* erase all chars */ + setchar(cs->cs, tree->u.n); /* add that one */ + return 1; + } + case TAny: { + loopset(i, cs->cs[i] = 0xFF); /* add all characters to the set */ + return 1; + } + case TFalse: { + loopset(i, cs->cs[i] = 0); /* empty set */ + return 1; + } + case TSet: { /* fill set */ + int i; + loopset(j, cs->cs[j] = tree->u.set.deflt); + for (i = 0; i < tree->u.set.size; i++) + cs->cs[tree->u.set.offset + i] = treebuffer(tree)[i]; + return 1; + } + default: return 0; + } +} + + +void tree2cset (TTree *tree, charsetinfo *info) { + assert(tree->tag == TSet); + info->offset = tree->u.set.offset; + info->size = tree->u.set.size; + info->deflt = tree->u.set.deflt; + info->cs = treebuffer(tree); +} + diff --git a/lpcset.h b/lpcset.h index e5152c4..b69fef9 100644 --- a/lpcset.h +++ b/lpcset.h @@ -4,22 +4,27 @@ #include "lpcset.h" #include "lpcode.h" +#include "lptree.h" /* ** Extra information for the result of 'charsettype'. When result is -** IChar, 'aux1' is the character. When result is ISet, 'aux1' is the -** offset (in bytes), 'size' is the size (in bytes), and -** 'delt' is the default value for bytes outside the set. +** IChar, 'offset' is the character. When result is ISet, 'cs' is the +** supporting bit array (with offset included), 'offset' is the offset +** (in bytes), 'size' is the size (in bytes), and 'delt' is the default +** value for bytes outside the set. */ typedef struct { - int aux1; + const byte *cs; + int offset; int size; int deflt; } charsetinfo; +int tocharset (TTree *tree, Charset *cs); Opcode charsettype (const byte *cs, charsetinfo *info); -byte getbytefromcharset (const byte *cs, const charsetinfo *info, int index); +byte getbytefromcharset (const charsetinfo *info, int index); +void tree2cset (TTree *tree, charsetinfo *info); #endif diff --git a/lpprint.c b/lpprint.c index 518d822..a432263 100644 --- a/lpprint.c +++ b/lpprint.c @@ -46,6 +46,17 @@ static void printIcharset (const Instruction *inst, const byte *buff) { } +static void printTcharset (TTree *tree) { + byte cs[CHARSETSIZE]; + int i; + printf("(%02x-%d) ", tree->u.set.offset, tree->u.set.size); + loopset(j, cs[j] = tree->u.set.deflt); + for (i = 0; i < tree->u.set.size; i++) + cs[tree->u.set.offset + i] = treebuffer(tree)[i]; + printcharset(cs); +} + + static const char *capkind (int kind) { const char *const modes[] = { "close", "position", "constant", "backref", @@ -186,7 +197,7 @@ void printtree (TTree *tree, int ident) { break; } case TSet: { - printcharset(treebuffer(tree)); + printTcharset(tree); printf("\n"); break; } diff --git a/lptree.c b/lptree.c index 4affac9..f9e170b 100644 --- a/lptree.c +++ b/lptree.c @@ -12,6 +12,7 @@ #include "lpcode.h" #include "lpprint.h" #include "lptree.h" +#include "lpcset.h" /* number of siblings for each tree */ @@ -370,14 +371,37 @@ static TTree *newleaf (lua_State *L, int tag) { } -static TTree *newcharset (lua_State *L) { - TTree *tree = newtree(L, bytes2slots(CHARSETSIZE) + 1); +static TTree *newccharset (lua_State *L, byte *cs, charsetinfo *info) { + int i; + TTree *tree = newtree(L, bytes2slots(info->size) + 1); tree->tag = TSet; - loopset(i, treebuffer(tree)[i] = 0); + tree->u.set.offset = info->offset; + tree->u.set.size = info->size; + tree->u.set.deflt = info->deflt; + for (i = 0; i < info->size; i++) + treebuffer(tree)[i] = cs[info->offset + i]; return tree; } +static TTree *newcharset (lua_State *L, byte *cs) { + charsetinfo info; + Opcode op = charsettype(cs, &info); + switch (op) { + case IFail: return newleaf(L, TFalse); + case IAny: return newleaf(L, TAny); + case IChar: { + TTree *tree =newleaf(L, TChar); + tree->u.n = info.offset; + return tree; + } + default: + assert(op == ISet); + return newccharset(L, cs, &info); + } +} + + /* ** add to tree a sequence where first sibling is 'sib' (with size ** 'sibsize'); returns position for second sibling @@ -549,8 +573,8 @@ static int lp_choice (lua_State *L) { TTree *t1 = getpatt(L, 1, NULL); TTree *t2 = getpatt(L, 2, NULL); if (tocharset(t1, &st1) && tocharset(t2, &st2)) { - TTree *t = newcharset(L); - loopset(i, treebuffer(t)[i] = st1.cs[i] | st2.cs[i]); + loopset(i, st1.cs[i] |= st2.cs[i]); + newcharset(L, st1.cs); } else if (nofail(t1) || t2->tag == TFalse) lua_pushvalue(L, 1); /* true / x => true, x / false => x */ @@ -626,8 +650,8 @@ static int lp_sub (lua_State *L) { TTree *t1 = getpatt(L, 1, &s1); TTree *t2 = getpatt(L, 2, &s2); if (tocharset(t1, &st1) && tocharset(t2, &st2)) { - TTree *t = newcharset(L); - loopset(i, treebuffer(t)[i] = st1.cs[i] & ~st2.cs[i]); + loopset(i, st1.cs[i] &= ~st2.cs[i]); + newcharset(L, st1.cs); } else { TTree *tree = newtree(L, 2 + s1 + s2); @@ -645,11 +669,13 @@ static int lp_sub (lua_State *L) { static int lp_set (lua_State *L) { size_t l; const char *s = luaL_checklstring(L, 1, &l); - TTree *tree = newcharset(L); + byte buff[CHARSETSIZE]; + loopset(i, buff[i] = 0); while (l--) { - setchar(treebuffer(tree), (byte)(*s)); + setchar(buff, (byte)(*s)); s++; } + newcharset(L, buff); return 1; } @@ -657,15 +683,17 @@ static int lp_set (lua_State *L) { static int lp_range (lua_State *L) { int arg; int top = lua_gettop(L); - TTree *tree = newcharset(L); + byte buff[CHARSETSIZE]; + loopset(i, buff[i] = 0); for (arg = 1; arg <= top; arg++) { int c; size_t l; const char *r = luaL_checklstring(L, arg, &l); luaL_argcheck(L, l == 2, arg, "range must have two characters"); for (c = (byte)r[0]; c <= (byte)r[1]; c++) - setchar(treebuffer(tree), c); + setchar(buff, c); } + newcharset(L, buff); return 1; } @@ -704,10 +732,12 @@ static int lp_utfr (lua_State *L) { lua_Unsigned to = (lua_Unsigned)luaL_checkinteger(L, 2); luaL_argcheck(L, from <= to, 2, "empty range"); if (to <= 0x7f) { /* ascii range? */ - TTree *tree = newcharset(L); /* code it as a regular charset */ unsigned int f; + byte buff[CHARSETSIZE]; /* code it as a regular charset */ + loopset(i, buff[i] = 0); for (f = (int)from; f <= to; f++) - setchar(treebuffer(tree), f); + setchar(buff, f); + newcharset(L, buff); } else { /* multi-byte utf-8 range */ TTree *tree = newtree(L, 2); @@ -1261,11 +1291,17 @@ int lp_gc (lua_State *L) { } +/* +** Create a charset representing a category of characters, given by +** the predicate 'catf'. +*/ static void createcat (lua_State *L, const char *catname, int (catf) (int)) { - TTree *t = newcharset(L); - int i; - for (i = 0; i <= UCHAR_MAX; i++) - if (catf(i)) setchar(treebuffer(t), i); + int c; + byte buff[CHARSETSIZE]; + loopset(i, buff[i] = 0); + for (c = 0; c <= UCHAR_MAX; c++) + if (catf(c)) setchar(buff, c); + newcharset(L, buff); lua_setfield(L, -2, catname); } diff --git a/lptree.h b/lptree.h index aa331d2..b76c235 100644 --- a/lptree.h +++ b/lptree.h @@ -3,7 +3,7 @@ #define lptree_h -#include "lptypes.h" +#include "lptypes.h" /* @@ -11,7 +11,7 @@ */ typedef enum TTag { TChar = 0, /* 'n' = char */ - TSet, /* the set is stored in next CHARSETSIZE bytes */ + TSet, /* the set is encoded in 'u.set' and the next 'u.set.size' bytes */ TAny, TTrue, TFalse, @@ -52,6 +52,11 @@ typedef struct TTree { union { int ps; /* occasional second child */ int n; /* occasional counter */ + struct { + byte offset; /* compact set offset (in bytes) */ + byte size; /* compact set size (in bytes) */ + byte deflt; /* default value */ + } set; /* for compact sets */ } u; } TTree; diff --git a/makefile b/makefile index 9493c6e..2b3c12e 100644 --- a/makefile +++ b/makefile @@ -1,9 +1,9 @@ LIBNAME = lpeg LUADIR = ./lua/ -COPT = -O2 -DNDEBUG +# COPT = -O2 -DNDEBUG # COPT = -g -# COPT = -DLPEG_DEBUG +COPT = -DLPEG_DEBUG -g CWARNS = -Wall -Wextra -pedantic \ -Waggregate-return \ @@ -49,8 +49,9 @@ clean: lpcap.o: lpcap.c lpcap.h lptypes.h -lpcode.o: lpcode.c lptypes.h lpcode.h lptree.h lpvm.h lpcap.h -lpprint.o: lpprint.c lptypes.h lpprint.h lptree.h lpvm.h lpcap.h -lptree.o: lptree.c lptypes.h lpcap.h lpcode.h lptree.h lpvm.h lpprint.h +lpcode.o: lpcode.c lptypes.h lpcode.h lptree.h lpvm.h lpcap.h lpcset.h +lpcset.o: lpcset.c lptypes.h lpcset.h lpcode.h lptree.h lpvm.h lpcap.h +lpprint.o: lpprint.c lptypes.h lpprint.h lptree.h lpvm.h lpcap.h lpcode.h +lptree.o: lptree.c lptypes.h lpcap.h lpcode.h lptree.h lpvm.h lpprint.h \ + lpcset.h lpvm.o: lpvm.c lpcap.h lptypes.h lpvm.h lpprint.h lptree.h -lpcset.o: lpcset.c lpcset.h lptypes.h diff --git a/test.lua b/test.lua index d76e3f1..9f8d226 100755 --- a/test.lua +++ b/test.lua @@ -68,6 +68,8 @@ assert(m.match(#m.P(true) * "a", "a") == 2) assert(m.match("a" * #m.P(false), "a") == nil) assert(m.match("a" * #m.P(true), "a") == 2) +assert(m.match(m.P(1)^0, "abcd") == 5) +assert(m.match(m.S("")^0, "abcd") == 1) -- tests for locale do @@ -1167,7 +1169,7 @@ end -- bug in 1.0: problems with math-times returning too many captures -do +if _VERSION >= "Lua 5.2" then local lim = 2^11 - 10 local res = {m.match(manyCmt(lim), "a")} assert(#res == lim and res[1] == lim - 1 and res[lim] == 0) -- cgit v1.2.3-55-g6feb