From cf1705c1d96b549ef5887a2bc3038dbc31912e50 Mon Sep 17 00:00:00 2001 From: Roberto Ierusalimschy Date: Wed, 7 Jun 2023 15:41:01 -0300 Subject: Captures point to string positions using indices That uses 4 bytes (uint) instead of 8 (char*); the size of the structure 'Capture' reduces from 16 to 8 bytes in 64-bit machines. --- lpcap.c | 26 +++++++++++++++----------- lpcap.h | 14 +++++++++++++- lpprint.c | 27 ++++++++++++++++++++------- lpprint.h | 2 +- lptree.c | 1 + lpvm.c | 20 ++++++++++---------- 6 files changed, 60 insertions(+), 30 deletions(-) diff --git a/lpcap.c b/lpcap.c index 4750f01..d17a47a 100644 --- a/lpcap.c +++ b/lpcap.c @@ -10,7 +10,7 @@ #define isclosecap(cap) (captype(cap) == Cclose) -#define closeaddr(c) ((c)->s + (c)->siz - 1) +#define closeaddr(cs,c) ((cs)->s + (c)->index + (c)->siz - 1) #define isfullcap(cap) ((cap)->siz != 0) @@ -82,7 +82,8 @@ static void nextcap (CapState *cs) { static int pushnestedvalues (CapState *cs, int addextra) { Capture *co = cs->cap; if (isfullcap(cs->cap++)) { /* no nested captures? */ - lua_pushlstring(cs->L, co->s, co->siz - 1); /* push whole match */ + /* push whole match */ + lua_pushlstring(cs->L, cs->s + co->index, co->siz - 1); return 1; /* that is it */ } else { @@ -90,7 +91,8 @@ static int pushnestedvalues (CapState *cs, int addextra) { while (!isclosecap(cs->cap)) /* repeat for all nested patterns */ n += pushcapture(cs); if (addextra || n == 0) { /* need extra? */ - lua_pushlstring(cs->L, co->s, cs->cap->s - co->s); /* push whole match */ + /* push whole match */ + lua_pushlstring(cs->L, cs->s + co->index, cs->cap->index - co->index); n++; } cs->cap++; /* skip close entry */ @@ -295,7 +297,7 @@ int runtimecap (CapState *cs, Capture *close, const char *s, int *rem) { assert(captype(open) == Cgroup); id = finddyncap(open, close); /* get first dynamic capture argument */ close->kind = Cclose; /* closes the group */ - close->s = s; + close->index = s - cs->s; cs->cap = open; cs->valuecached = 0; /* prepare capture state */ luaL_checkstack(L, 4, "too many runtime captures"); pushluaval(cs); /* push function to be called */ @@ -342,7 +344,7 @@ typedef struct StrAux { static int getstrcaps (CapState *cs, StrAux *cps, int n) { int k = n++; cps[k].isstring = 1; /* get string value */ - cps[k].u.s.s = cs->cap->s; /* starts here */ + cps[k].u.s.s = cs->s + cs->cap->index; /* starts here */ if (!isfullcap(cs->cap++)) { /* nested captures? */ while (!isclosecap(cs->cap)) { /* traverse them */ if (n >= MAXSTRCAPS) /* too many captures? */ @@ -358,7 +360,7 @@ static int getstrcaps (CapState *cs, StrAux *cps, int n) { } cs->cap++; /* skip close */ } - cps[k].u.s.e = closeaddr(cs->cap - 1); /* ends here */ + cps[k].u.s.e = closeaddr(cs, cs->cap - 1); /* ends here */ return n; } @@ -407,20 +409,21 @@ static void stringcap (luaL_Buffer *b, CapState *cs) { ** Substitution capture: add result to buffer 'b' */ static void substcap (luaL_Buffer *b, CapState *cs) { - const char *curr = cs->cap->s; + const char *curr = cs->s + cs->cap->index; if (isfullcap(cs->cap)) /* no nested captures? */ luaL_addlstring(b, curr, cs->cap->siz - 1); /* keep original text */ else { cs->cap++; /* skip open entry */ while (!isclosecap(cs->cap)) { /* traverse nested captures */ - const char *next = cs->cap->s; + const char *next = cs->s + cs->cap->index; luaL_addlstring(b, curr, next - curr); /* add text up to capture */ if (addonestring(b, cs, "replacement")) - curr = closeaddr(cs->cap - 1); /* continue after match */ + curr = closeaddr(cs, cs->cap - 1); /* continue after match */ else /* no capture value */ curr = next; /* keep original text in final result */ } - luaL_addlstring(b, curr, cs->cap->s - curr); /* add last piece of text */ + /* add last piece of text */ + luaL_addlstring(b, curr, cs->s + cs->cap->index - curr); } cs->cap++; /* go to next capture */ } @@ -473,7 +476,7 @@ static int pushcapture (CapState *cs) { return luaL_error(L, "subcapture nesting too deep"); switch (captype(cs->cap)) { case Cposition: { - lua_pushinteger(L, cs->cap->s - cs->s + 1); + lua_pushinteger(L, cs->cap->index + 1); cs->cap++; res = 1; break; @@ -553,6 +556,7 @@ static int pushcapture (CapState *cs) { int getcaptures (lua_State *L, const char *s, const char *r, int ptop) { Capture *capture = (Capture *)lua_touserdata(L, caplistidx(ptop)); int n = 0; + /* printcaplist(capture); */ if (!isclosecap(capture)) { /* is there any capture? */ CapState cs; cs.ocap = cs.cap = capture; cs.L = L; cs.reclevel = 0; diff --git a/lpcap.h b/lpcap.h index e72d913..7cf8c24 100644 --- a/lpcap.h +++ b/lpcap.h @@ -27,8 +27,20 @@ typedef enum CapKind { } CapKind; +/* +** An unsigned integer large enough to index any subject entirely. +** It can be size_t, but that will double the size of the array +** of captures in a 64-bit machine. +*/ +#if !defined(Index_t) +typedef uint Index_t; +#endif + +#define MAXINDT (~(Index_t)0) + + typedef struct Capture { - const char *s; /* subject position */ + Index_t index; /* subject position */ unsigned short idx; /* extra info (group name, arg index, etc.) */ byte kind; /* kind of capture */ byte siz; /* size of full capture + 1 (0 = not a full capture) */ diff --git a/lpprint.c b/lpprint.c index 1142376..bdb85b8 100644 --- a/lpprint.c +++ b/lpprint.c @@ -60,7 +60,7 @@ static void printTcharset (TTree *tree) { static const char *capkind (int kind) { const char *const modes[] = { "close", "position", "constant", "backref", - "argument", "simple", "table", "function", "replace", + "argument", "simple", "table", "function", "accumulator", "query", "string", "num", "substitution", "fold", "runtime", "group"}; return modes[kind]; @@ -147,16 +147,29 @@ void printpatt (Instruction *p) { } -static void printcap (Capture *cap) { - printf("%s (idx: %d - size: %d) -> %p\n", - capkind(cap->kind), cap->idx, cap->siz, cap->s); +static void printcap (Capture *cap, int ident) { + while (ident--) printf(" "); + printf("%s (idx: %d - size: %d) -> %ld\n", + capkind(cap->kind), cap->idx, cap->siz, (long)cap->index); } -void printcaplist (Capture *cap, Capture *limit) { +static Capture *printcap2close (Capture *cap, int ident) { + while (cap->kind != Cclose) { + printcap(cap, ident); + if (cap->siz == 0) { + cap = printcap2close(cap + 1, ident + 2); + printcap(cap, ident); /* print 'close' capture */ + } + cap++; + } + return cap; +} + + +void printcaplist (Capture *cap) { printf(">======\n"); - for (; cap->s && (limit == NULL || cap < limit); cap++) - printcap(cap); + printcap2close(cap, 0); printf("=======\n"); } diff --git a/lpprint.h b/lpprint.h index e97f8c0..aafdafc 100644 --- a/lpprint.h +++ b/lpprint.h @@ -13,7 +13,7 @@ void printpatt (Instruction *p); void printtree (TTree *tree, int ident); void printktable (lua_State *L, int idx); void printcharset (const byte *st); -void printcaplist (Capture *cap, Capture *limit); +void printcaplist (Capture *cap); void printinst (const Instruction *op, const Instruction *p); #else diff --git a/lptree.c b/lptree.c index 330471a..475b0c3 100644 --- a/lptree.c +++ b/lptree.c @@ -1253,6 +1253,7 @@ static int lp_match (lua_State *L) { const char *s = luaL_checklstring(L, SUBJIDX, &l); size_t i = initposition(L, l); int ptop = lua_gettop(L); + luaL_argcheck(L, l < MAXINDT, SUBJIDX, "subject too long"); lua_pushnil(L); /* initialize subscache */ lua_pushlightuserdata(L, capture); /* initialize caplistidx */ lua_getuservalue(L, 1); /* initialize ktableidx */ diff --git a/lpvm.c b/lpvm.c index e6f7dac..a65a6f6 100644 --- a/lpvm.c +++ b/lpvm.c @@ -169,7 +169,7 @@ static int resdyncaptures (lua_State *L, int fr, int curr, int limit) { ** value, 'n' is the number of values (at least 1). The open group ** capture is already in 'capture', before the place for the new entries. */ -static void adddyncaptures (const char *s, Capture *capture, int n, int fd) { +static void adddyncaptures (Index_t index, Capture *capture, int n, int fd) { int i; assert(capture[-1].kind == Cgroup && capture[-1].siz == 0); capture[-1].idx = 0; /* make group capture an anonymous group */ @@ -177,11 +177,11 @@ static void adddyncaptures (const char *s, Capture *capture, int n, int fd) { capture[i].kind = Cruntime; capture[i].siz = 1; /* mark it as closed */ capture[i].idx = fd + i; /* stack index of capture value */ - capture[i].s = s; + capture[i].index = index; } capture[n].kind = Cclose; /* close group */ capture[n].siz = 1; - capture[n].s = s; + capture[n].index = index; } @@ -225,7 +225,7 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e, case IEnd: { assert(stack == getstackbase(L, ptop) + 1); capture[captop].kind = Cclose; - capture[captop].s = NULL; + capture[captop].index = MAXINDT; return s; } case IGiveup: { @@ -383,7 +383,7 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e, luaL_error(L, "too many results in match-time capture"); /* add new captures + close group to 'capture' list */ capture = growcap(L, capture, &capsize, captop, n + 1, ptop); - adddyncaptures(s, capture + captop, n, fr); + adddyncaptures(s - o, capture + captop, n, fr); captop += n + 1; /* new captures + close group */ } p++; @@ -394,24 +394,24 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e, assert(captop > 0); /* if possible, turn capture into a full capture */ if (capture[captop - 1].siz == 0 && - s1 - capture[captop - 1].s < UCHAR_MAX) { - capture[captop - 1].siz = s1 - capture[captop - 1].s + 1; + (s1 - o) - capture[captop - 1].index < UCHAR_MAX) { + capture[captop - 1].siz = (s1 - o) - capture[captop - 1].index + 1; p++; continue; } else { capture[captop].siz = 1; /* mark entry as closed */ - capture[captop].s = s; + capture[captop].index = s - o; goto pushcapture; } } case IOpenCapture: capture[captop].siz = 0; /* mark entry as open */ - capture[captop].s = s; + capture[captop].index = s - o; goto pushcapture; case IFullCapture: capture[captop].siz = getoff(p) + 1; /* save capture size */ - capture[captop].s = s - getoff(p); + capture[captop].index = s - o - getoff(p); /* goto pushcapture; */ pushcapture: { capture[captop].idx = p->i.aux2.key; -- cgit v1.2.3-55-g6feb