Removed 'unsigned char' limit on number of rules in grammars

Added a new tree-type node 'TXInfo', which follows 'TRule' nodes, to store extra information about a node. (In this case, the rule number, with an 'unsigned short' field.)
author: Roberto Ierusalimschy <roberto@inf.puc-rio.br> 2019-04-14 12:04:23 -0300
committer: Roberto Ierusalimschy <roberto@inf.puc-rio.br> 2019-04-14 12:04:23 -0300
commit: 3f7797419e4d7493e1364290a5b127d1cb45e3bf (patch)
tree: 8dd91b0d008d5ea9f9c96eada86510495c97d1e3
parent: d9f83dded93a35fb333c4e1bd371c401f7129fd1 (diff)
download: lpeg-3f7797419e4d7493e1364290a5b127d1cb45e3bf.tar.gz
lpeg-3f7797419e4d7493e1364290a5b127d1cb45e3bf.tar.bz2
lpeg-3f7797419e4d7493e1364290a5b127d1cb45e3bf.zip
7 files changed, 75 insertions, 48 deletions
diff --git a/lpcode.c b/lpcode.c
index 3923459..5fddfab 100644
--- a/lpcode.c
+++ b/lpcode.c
@@ -220,7 +220,7 @@ int checkaux (TTree *tree, int pred) {
      if (checkaux(sib2(tree), pred)) return 1;
      /* else return checkaux(sib1(tree), pred); */
      tree = sib1(tree); goto tailcall;
-    case TCapture: case TGrammar: case TRule:
+    case TCapture: case TGrammar: case TRule: case TXInfo:
      /* return checkaux(sib1(tree), pred); */
      tree = sib1(tree); goto tailcall;
    case TCall:  /* return checkaux(sib2(tree), pred); */
@@ -243,7 +243,7 @@ int fixedlen (TTree *tree) {
      return len;
    case TRep: case TRunTime: case TOpenCall:
      return -1;
-    case TCapture: case TRule: case TGrammar:
+    case TCapture: case TRule: case TGrammar: case TXInfo:
      /* return fixedlen(sib1(tree)); */
      tree = sib1(tree); goto tailcall;
    case TCall: {
@@ -334,7 +334,7 @@ static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) {
      loopset(i, firstset->cs[i] |= follow->cs[i]);
      return 1;  /* accept the empty string */
    }
-    case TCapture: case TGrammar: case TRule: {
+    case TCapture: case TGrammar: case TRule: case TXInfo: {
      /* return getfirst(sib1(tree), follow, firstset); */
      tree = sib1(tree); goto tailcall;
    }
@@ -382,7 +382,7 @@ static int headfail (TTree *tree) {
    case TTrue: case TRep: case TRunTime: case TNot:
    case TBehind:
      return 0;
-    case TCapture: case TGrammar: case TRule: case TAnd:
+    case TCapture: case TGrammar: case TRule: case TXInfo: case TAnd:
      tree = sib1(tree); goto tailcall;  /* return headfail(sib1(tree)); */
    case TCall:
      tree = sib2(tree); goto tailcall;  /* return headfail(sib2(tree)); */
@@ -874,8 +874,10 @@ static void codegrammar (CompileState *compst, TTree *grammar) {
  int start = gethere(compst);  /* here starts the initial rule */
  jumptohere(compst, firstcall);
  for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) {
+    TTree *r = sib1(rule);
+    assert(r->tag == TXInfo);
    positions[rulenumber++] = gethere(compst);  /* save rule position */
-    codegen(compst, sib1(rule), 0, NOINST, fullset);  /* code rule */
+    codegen(compst, sib1(r), 0, NOINST, fullset);  /* code rule */
    addinstruction(compst, IRet, 0);
  }
  assert(rule->tag == TTrue);
@@ -886,8 +888,8 @@ static void codegrammar (CompileState *compst, TTree *grammar) {
 static void codecall (CompileState *compst, TTree *call) {
  int c = addoffsetinst(compst, IOpenCall);  /* to be corrected later */
-  getinstr(compst, c).i.key = sib2(call)->cap;  /* rule number */
+  assert(sib1(sib2(call))->tag == TXInfo);
-  assert(sib2(call)->tag == TRule);
+  getinstr(compst, c).i.key = sib1(sib2(call))->u.n;  /* rule number */
 }
@@ -971,7 +973,7 @@ static void peephole (CompileState *compst) {
          case IRet: case IFail: case IFailTwice:
          case IEnd: {  /* instructions with unconditional implicit jumps */
            code[i] = code[ft];  /* jump becomes that instruction */
-            code[i + 1].i.code = IAny;  /* 'no-op' for target position */
+            code[i + 1].i.code = IEmpty;  /* 'no-op' for target position */
            break;
          }
          case ICommit: case IPartialCommit:
diff --git a/lpprint.c b/lpprint.c
index df62cbe..397785e 100644
--- a/lpprint.c
+++ b/lpprint.c
@@ -60,7 +60,8 @@ void printinst (const Instruction *op, const Instruction *p) {
    "ret", "end",
    "choice", "jmp", "call", "open_call",
    "commit", "partial_commit", "back_commit", "failtwice", "fail", "giveup",
-     "fullcapture", "opencapture", "closecapture", "closeruntime"
+     "fullcapture", "opencapture", "closecapture", "closeruntime",
+     "--"
  };
  printf("%02ld: %s ", (long)(p - op), names[p->i.code]);
  switch ((Opcode)p->i.code) {
@@ -151,7 +152,7 @@ static const char *tagnames[] = {
  "rep",
  "seq", "choice",
  "not", "and",
-  "call", "opencall", "rule", "grammar",
+  "call", "opencall", "rule", "xinfo", "grammar",
  "behind",
  "capture", "run-time"
 };
@@ -159,6 +160,7 @@ static const char *tagnames[] = {
 void printtree (TTree *tree, int ident) {
  int i;
+  int sibs = numsiblings[tree->tag];
  for (i = 0; i < ident; i++) printf(" ");
  printf("%s", tagnames[tree->tag]);
  switch (tree->tag) {
@@ -176,24 +178,26 @@ void printtree (TTree *tree, int ident) {
      break;
    }
    case TOpenCall: case TCall: {
-      assert(sib2(tree)->tag == TRule);
+      assert(sib1(sib2(tree))->tag == TXInfo);
-      printf(" key: %d  (rule: %d)\n", tree->key, sib2(tree)->cap);
+      printf(" key: %d  (rule: %d)\n", tree->key, sib1(sib2(tree))->u.n);
      break;
    }
    case TBehind: {
      printf(" %d\n", tree->u.n);
-        printtree(sib1(tree), ident + 2);
      break;
    }
    case TCapture: {
      printf(" kind: '%s'  key: %d\n", capkind(tree->cap), tree->key);
-      printtree(sib1(tree), ident + 2);
      break;
    }
    case TRule: {
-      printf(" n: %d  key: %d\n", tree->cap, tree->key);
+      printf(" key: %d\n", tree->key);
-      printtree(sib1(tree), ident + 2);
+      sibs = 1;  /* do not print 'sib2' (next rule) as a sibling */
-      break;  /* do not print next rule as a sibling */
+      break;
+    }
+    case TXInfo: {
+      printf(" n: %d\n", tree->u.n);
+      break;
    }
    case TGrammar: {
      TTree *rule = sib1(tree);
@@ -203,18 +207,17 @@ void printtree (TTree *tree, int ident) {
        rule = sib2(rule);
      }
      assert(rule->tag == TTrue);  /* sentinel */
+      sibs = 0;  /* siblings already handled */
      break;
    }
-    default: {
+    default:
-      int sibs = numsiblings[tree->tag];
      printf("\n");
-      if (sibs >= 1) {
-        printtree(sib1(tree), ident + 2);
-        if (sibs >= 2)
-          printtree(sib2(tree), ident + 2);
-      }
      break;
-    }
+  }
+  if (sibs >= 1) {
+    printtree(sib1(tree), ident + 2);
+    if (sibs >= 2)
+      printtree(sib2(tree), ident + 2);
  }
 }
diff --git a/lptree.c b/lptree.c
index 5c8de94..557090b 100644
--- a/lptree.c
+++ b/lptree.c
@@ -25,7 +25,7 @@ const byte numsiblings[] = {
  1,            /* rep */
  2, 2,         /* seq, choice */
  1, 1,         /* not, and */
-  0, 0, 2, 1,  /* call, opencall, rule, grammar */
+  0, 0, 2, 1, 1,  /* call, opencall, rule, xinfo, grammar */
  1,  /* behind */
  1, 1  /* capture, runtime capture */
 };
@@ -906,7 +906,7 @@ static int collectrules (lua_State *L, int arg, int *totalsize) {
  int size;  /* accumulator for total size */
  lua_newtable(L);  /* create position table */
  getfirstrule(L, arg, postab);
-  size = 2 + getsize(L, postab + 2);  /* TGrammar + TRule + rule */
+  size = 3 + getsize(L, postab + 2);  /* TGrammar + TRule + TXInfo + rule */
  lua_pushnil(L);  /* prepare to traverse grammar table */
  while (lua_next(L, arg) != 0) {
    if (lua_tonumber(L, -2) == 1 ||
@@ -920,11 +920,11 @@ static int collectrules (lua_State *L, int arg, int *totalsize) {
    lua_pushvalue(L, -2);  /* push key (to insert into position table) */
    lua_pushinteger(L, size);
    lua_settable(L, postab);
-    size += 1 + getsize(L, -1);  /* update size */
+    size += 2 + getsize(L, -1);  /* add 'TRule + TXInfo + rule' to size */
    lua_pushvalue(L, -2);  /* push key (for next lua_next) */
    n++;
  }
-  *totalsize = size + 1;  /* TTrue to finish list of rules */
+  *totalsize = size + 1;  /* space for 'TTrue' finishing list of rules */
  return n;
 }
@@ -936,11 +936,13 @@ static void buildgrammar (lua_State *L, TTree *grammar, int frule, int n) {
    int ridx = frule + 2*i + 1;  /* index of i-th rule */
    int rulesize;
    TTree *rn = gettree(L, ridx, &rulesize);
+    TTree *pr = sib1(nd);  /* points to rule's prerule */
    nd->tag = TRule;
    nd->key = 0;  /* will be fixed when rule is used */
-    nd->cap = i;  /* rule number */
+    pr->tag = TXInfo;
-    nd->u.ps = rulesize + 1;  /* point to next rule */
+    pr->u.n = i;  /* rule number */
-    memcpy(sib1(nd), rn, rulesize * sizeof(TTree));  /* copy rule */
+    nd->u.ps = rulesize + 2;  /* point to next rule */
+    memcpy(sib1(pr), rn, rulesize * sizeof(TTree));  /* copy rule */
    mergektable(L, ridx, sib1(nd));  /* merge its ktable into new one */
    nd = sib2(nd);  /* move to next rule */
  }
@@ -976,7 +978,7 @@ static int checkloops (TTree *tree) {
 ** twice in 'passed', there is path from it back to itself without
 ** advancing the subject.
 */
-static int verifyerror (lua_State *L, int *passed, int npassed) {
+static int verifyerror (lua_State *L, unsigned short *passed, int npassed) {
  int i, j;
  for (i = npassed - 1; i >= 0; i--) {  /* search for a repetition */
    for (j = i - 1; j >= 0; j--) {
@@ -1001,8 +1003,8 @@ static int verifyerror (lua_State *L, int *passed, int npassed) {
 ** counts the elements in 'passed'.
 ** Assume ktable at the top of the stack.
 */
-static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed,
+static int verifyrule (lua_State *L, TTree *tree, unsigned short *passed,
-                       int nb) {
+                                     int npassed, int nb) {
 tailcall:
  switch (tree->tag) {
    case TChar: case TSet: case TAny:
@@ -1014,7 +1016,7 @@ static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed,
    case TNot: case TAnd: case TRep:
      /* return verifyrule(L, sib1(tree), passed, npassed, 1); */
      tree = sib1(tree); nb = 1; goto tailcall;
-    case TCapture: case TRunTime:
+    case TCapture: case TRunTime: case TXInfo:
      /* return verifyrule(L, sib1(tree), passed, npassed, nb); */
      tree = sib1(tree); goto tailcall;
    case TCall:
@@ -1030,10 +1032,10 @@ static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed,
      /* return verifyrule(L, sib2(tree), passed, npassed, nb); */
      tree = sib2(tree); goto tailcall;
    case TRule:
-      if (npassed >= MAXRULES)
+      if (npassed >= MAXRULES)  /* too many steps? */
-        return verifyerror(L, passed, npassed);
+        return verifyerror(L, passed, npassed);  /* error */
      else {
-        passed[npassed++] = tree->key;
+        passed[npassed++] = tree->key;  /* add rule to path */
        /* return verifyrule(L, sib1(tree), passed, npassed); */
        tree = sib1(tree); goto tailcall;
      }
@@ -1045,7 +1047,7 @@ static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed,
 static void verifygrammar (lua_State *L, TTree *grammar) {
-  int passed[MAXRULES];
+  unsigned short passed[MAXRULES];
  TTree *rule;
  /* check left-recursive rules */
  for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) {
diff --git a/lptree.h b/lptree.h
index 25906d5..3e8b52b 100644
--- a/lptree.h
+++ b/lptree.h
@@ -26,8 +26,8 @@ typedef enum TTag {
  TCall,  /* ktable[key] is rule's key; 'sib2' is rule being called */
  TOpenCall,  /* ktable[key] is rule's key */
  TRule,  /* ktable[key] is rule's key (but key == 0 for unused rules);
-             'sib1' is rule's pattern;
+             'sib1' is rule's 'TXInfo' node (whose 'sib1' is rule's pattern); 'sib2' is next rule; */
-             'sib2' is next rule; 'cap' is rule's sequential number */
+  TXInfo,  /* extra info; 'n' is rule's sequential number */
  TGrammar,  /* 'sib1' is initial (and first) rule */
  TBehind,  /* 'sib1' is pattern, 'n' is how much to go back */
  TCapture,  /* captures: 'cap' is kind of capture (enum 'CapKind');
diff --git a/lptypes.h b/lptypes.h
index 1d9d59f..223d887 100644
--- a/lptypes.h
+++ b/lptypes.h
@@ -51,9 +51,9 @@
 #endif
-/* maximum number of rules in a grammar (limited by 'unsigned char') */
+/* maximum number of rules in a grammar (limited by 'unsigned short') */
 #if !defined(MAXRULES)
-#define MAXRULES        250
+#define MAXRULES        1000
 #endif
diff --git a/lpvm.h b/lpvm.h
index 69ec33d..576429f 100644
--- a/lpvm.h
+++ b/lpvm.h
@@ -33,7 +33,8 @@ typedef enum Opcode {
  IFullCapture,  /* complete capture of last 'off' chars */
  IOpenCapture,  /* start a capture */
  ICloseCapture,
-  ICloseRunTime
+  ICloseRunTime,
+  IEmpty  /* to fill empty slots left by optimizations */
 } Opcode;
diff --git a/test.lua b/test.lua
index 8f9f574..f57cdec 100755
--- a/test.lua
+++ b/test.lua
@@ -406,7 +406,7 @@ assert(p:match('abcx') == 5 and p:match('ayzx') == 5 and not p:match'abc')
 do
-  -- large dynamic Cc
+  print "testing large dynamic Cc"
  local lim = 2^16 - 1
  local c = 0
  local function seq (n) 
@@ -985,10 +985,10 @@ for i = 1, 10 do
  assert(p:match("aaaaaaaaaaa") == 11 - i + 1)
 end
-print"+"
--- tests for back references
+print "testing back references"
 checkerr("back reference 'x' not found", m.match, m.Cb('x'), '')
 checkerr("back reference 'b' not found", m.match, m.Cg(1, 'a') * m.Cb('b'), 'a')
@@ -1171,9 +1171,28 @@ t = {p:match('abacc')}
 checkeq(t, {'a', 'aa', 20, 'a', 'aaa', 'aaa'})
+do  print"testing large grammars"
+  local lim = 1000    -- number of rules
+  local t = {}
+  for i = 3, lim do
+    t[i] = m.V(i - 1)   -- each rule calls previous one
+  end
+  t[1] = m.V(lim)    -- start on last rule
+  t[2] = m.C("alo")  -- final rule
+  local P = m.P(t)   -- build grammar
+  assert(P:match("alo") == "alo")
+  t[#t + 1] = m.P("x")   -- one more rule...
+  checkerr("too many rules", m.P, t)
+end
 -------------------------------------------------------------------
 -- Tests for 're' module
 -------------------------------------------------------------------
+print"testing 're' module"
 local re = require "re"
author	Roberto Ierusalimschy <roberto@inf.puc-rio.br>	2019-04-14 12:04:23 -0300
committer	Roberto Ierusalimschy <roberto@inf.puc-rio.br>	2019-04-14 12:04:23 -0300
commit	3f7797419e4d7493e1364290a5b127d1cb45e3bf (patch)
tree	8dd91b0d008d5ea9f9c96eada86510495c97d1e3
parent	d9f83dded93a35fb333c4e1bd371c401f7129fd1 (diff)
download	lpeg-3f7797419e4d7493e1364290a5b127d1cb45e3bf.tar.gz lpeg-3f7797419e4d7493e1364290a5b127d1cb45e3bf.tar.bz2 lpeg-3f7797419e4d7493e1364290a5b127d1cb45e3bf.zip