Implementation of UTF-8 ranges

New constructor 'lpeg.utfR(from, to)' creates a pattern that matches UTF-8 byte sequences representing code points in the range [from, to].
author: Roberto Ierusalimschy <roberto@inf.puc-rio.br> 2019-04-17 14:08:22 -0300
committer: Roberto Ierusalimschy <roberto@inf.puc-rio.br> 2019-04-17 14:08:22 -0300
commit: 24bf757183d8bd97f6f5b43d916814f3269c8347 (patch)
tree: 646cd65d6e2dab57691f98f83f15f25c70685ef8
parent: 3f7797419e4d7493e1364290a5b127d1cb45e3bf (diff)
download: lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.gz
lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.bz2
lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.zip
9 files changed, 222 insertions, 20 deletions
diff --git a/lpcode.c b/lpcode.c
index 5fddfab..7d47a00 100644
--- a/lpcode.c
+++ b/lpcode.c
@@ -196,7 +196,7 @@ int hascaptures (TTree *tree) {
 int checkaux (TTree *tree, int pred) {
 tailcall:
  switch (tree->tag) {
-    case TChar: case TSet: case TAny:
+    case TChar: case TSet: case TAny: case TUTFR:
    case TFalse: case TOpenCall:
      return 0;  /* not nullable */
    case TRep: case TTrue:
@@ -239,6 +239,8 @@ int fixedlen (TTree *tree) {
  switch (tree->tag) {
    case TChar: case TSet: case TAny:
      return len + 1;
+    case TUTFR:
+      return (tree->cap == sib1(tree)->cap) ? len + tree->cap : -1;
    case TFalse: case TTrue: case TNot: case TAnd: case TBehind:
      return len;
    case TRep: case TRunTime: case TOpenCall:
@@ -298,6 +300,13 @@ static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) {
      tocharset(tree, firstset);
      return 0;
    }
+    case TUTFR: {
+      int c;
+      loopset(i, firstset->cs[i] = 0);  /* erase all chars */
+      for (c = tree->key; c <= sib1(tree)->key; c++)
+        setchar(firstset->cs, c);
+      return 0;
+    }
    case TTrue: {
      loopset(i, firstset->cs[i] = follow->cs[i]);
      return 1;  /* accepts the empty string */
@@ -380,7 +389,7 @@ static int headfail (TTree *tree) {
    case TChar: case TSet: case TAny: case TFalse:
      return 1;
    case TTrue: case TRep: case TRunTime: case TNot:
-    case TBehind:
+    case TBehind: case TUTFR:
      return 0;
    case TCapture: case TGrammar: case TRule: case TXInfo: case TAnd:
      tree = sib1(tree); goto tailcall;  /* return headfail(sib1(tree)); */
@@ -407,7 +416,7 @@ static int headfail (TTree *tree) {
 static int needfollow (TTree *tree) {
 tailcall:
  switch (tree->tag) {
-    case TChar: case TSet: case TAny:
+    case TChar: case TSet: case TAny: case TUTFR:
    case TFalse: case TTrue: case TAnd: case TNot:
    case TRunTime: case TGrammar: case TCall: case TBehind:
      return 0;
@@ -418,7 +427,7 @@ static int needfollow (TTree *tree) {
    case TSeq:
      tree = sib2(tree); goto tailcall;
    default: assert(0); return 0;
-  } 
+  }
 }
 /* }====================================================== */
@@ -441,6 +450,7 @@ int sizei (const Instruction *i) {
    case ITestSet: return CHARSETINSTSIZE + 1;
    case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall:
    case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit:
+    case IUTFR:
      return 2;
    default: return 1;
  }
@@ -518,6 +528,16 @@ static void setoffset (CompileState *compst, int instruction, int offset) {
 }
+/*
+** Emit an IUTFR instruction for a UTF-8 range tree. The first code
+** point of the range goes into the offset slot right after the
+** instruction; the last code point (read from the extra-info sibling)
+** is split between 'aux' (its low 8 bits) and 'key' (the bits above).
+*/
+static void codeutfr (CompileState *compst, TTree *tree) {
+  TTree *last = sib1(tree);  /* sibling describing the end of the range */
+  int pc = addoffsetinst(compst, IUTFR);
+  int upper;
+  assert(last->tag == TXInfo);
+  upper = last->u.n;
+  getinstr(compst, pc).i.key = upper >> 8;
+  getinstr(compst, pc).i.aux = upper & 0xff;
+  getinstr(compst, pc + 1).offset = tree->u.n;
+}
 /*
 ** Add a capture instruction:
 ** 'op' is the capture instruction; 'cap' the capture kind;
@@ -665,11 +685,11 @@ static void codebehind (CompileState *compst, TTree *tree) {
 /*
 ** Choice; optimizations:
-** - when p1 is headfail or
+** - when p1 is headfail or when first(p1) and first(p2) are disjoint,
-** when first(p1) and first(p2) are disjoint, than
+** then a character not in first(p1) cannot go to p1 and a character
-** a character not in first(p1) cannot go to p1, and a character
+** in first(p1) cannot go to p2, either because p1 will accept
-** in first(p1) cannot go to p2 (at it is not in first(p2)).
+** (headfail) or because it is not in first(p2) (disjoint).
-** (The optimization is not valid if p1 accepts the empty string,
+** (The second case is not valid if p1 accepts the empty string,
 ** as then there is no character at all...)
 ** - when p2 is empty and opt is true; a IPartialCommit can reuse
 ** the Choice already active in the stack.
@@ -686,7 +706,7 @@ static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt,
    int jmp = NOINST;
    codegen(compst, p1, 0, test, fl);
    if (!emptyp2)
-      jmp = addoffsetinst(compst, IJmp); 
+      jmp = addoffsetinst(compst, IJmp);
    jumptohere(compst, test);
    codegen(compst, p2, opt, NOINST, fl);
    jumptohere(compst, jmp);
@@ -697,7 +717,7 @@ static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt,
    codegen(compst, p1, 1, NOINST, fullset);
  }
  else {
-    /* <p1 / p2> == 
+    /* <p1 / p2> ==
        test(first(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: */
    int pcommit;
    int test = codetestset(compst, &cs1, e1);
@@ -927,6 +947,7 @@ static void codegen (CompileState *compst, TTree *tree, int opt, int tt,
    case TSet: codecharset(compst, treebuffer(tree), tt); break;
    case TTrue: break;
    case TFalse: addinstruction(compst, IFail, 0); break;
+    case TUTFR: codeutfr(compst, tree); break;
    case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break;
    case TRep: coderep(compst, sib1(tree), opt, fl); break;
    case TBehind: codebehind(compst, tree); break;
diff --git a/lpeg.html b/lpeg.html
index 8b9f59c..1295c4f 100644
--- a/lpeg.html
+++ b/lpeg.html
@@ -107,6 +107,9 @@ for creating patterns:
  <td>Matches any character in <code>string</code> (Set)</td></tr>
 <tr><td><a href="#op-r"><code>lpeg.R("<em>xy</em>")</code></a></td>
  <td>Matches any character between <em>x</em> and <em>y</em> (Range)</td></tr>
+<tr><td><a href="#op-utfR"><code>lpeg.utfR(cp1, cp2)</code></a></td>
+  <td>Matches a UTF-8 code point between <code>cp1</code> and
+  <code>cp2</code></td></tr>
 <tr><td><a href="#op-pow"><code>patt^n</code></a></td>
  <td>Matches at least <code>n</code> repetitions of <code>patt</code></td></tr>
 <tr><td><a href="#op-pow"><code>patt^-n</code></a></td>
@@ -329,6 +332,15 @@ are patterns that always fail.
 </p>
+<h3><a name="op-utfR"></a><code>lpeg.utfR (cp1, cp2)</code></h3>
+<p>
+Returns a pattern that matches a valid UTF-8 byte sequence
+representing a code point in the range <code>[cp1, cp2]</code>.
+The range is limited by the natural Unicode limit of 0x10FFFF,
+but may include surrogates.
+</p>
 <h3><a name="op-v"></a><code>lpeg.V (v)</code></h3>
 <p>
 This operation creates a non-terminal (a <em>variable</em>)
diff --git a/lpprint.c b/lpprint.c
index 397785e..6893bb8 100644
--- a/lpprint.c
+++ b/lpprint.c
@@ -56,7 +56,7 @@ void printinst (const Instruction *op, const Instruction *p) {
  const char *const names[] = {
    "any", "char", "set",
    "testany", "testchar", "testset",
-    "span", "behind",
+    "span", "utf-range", "behind",
    "ret", "end",
    "choice", "jmp", "call", "open_call",
    "commit", "partial_commit", "back_commit", "failtwice", "fail", "giveup",
@@ -66,11 +66,15 @@ void printinst (const Instruction *op, const Instruction *p) {
  printf("%02ld: %s ", (long)(p - op), names[p->i.code]);
  switch ((Opcode)p->i.code) {
    case IChar: {
-      printf("'%c'", p->i.aux);
+      printf("'%c' (%02x)", p->i.aux, p->i.aux);
      break;
    }
    case ITestChar: {
-      printf("'%c'", p->i.aux); printjmp(op, p);
+      printf("'%c' (%02x)", p->i.aux, p->i.aux); printjmp(op, p);
+      break;
+    }
+    case IUTFR: {
+      printf("%d - %d", p[1].offset, utf_to(p));
      break;
    }
    case IFullCapture: {
@@ -148,7 +152,7 @@ void printcaplist (Capture *cap, Capture *limit) {
 static const char *tagnames[] = {
  "char", "set", "any",
-  "true", "false",
+  "true", "false", "utf8.range",
  "rep",
  "seq", "choice",
  "not", "and",
@@ -177,6 +181,13 @@ void printtree (TTree *tree, int ident) {
      printf("\n");
      break;
    }
+    case TUTFR: {
+      assert(sib1(tree)->tag == TXInfo);
+      printf(" %d (%02x %d) - %d (%02x %d) \n",
+        tree->u.n, tree->key, tree->cap,
+        sib1(tree)->u.n, sib1(tree)->key, sib1(tree)->cap);
+      break;
+    }
    case TOpenCall: case TCall: {
      assert(sib1(sib2(tree))->tag == TXInfo);
      printf(" key: %d  (rule: %d)\n", tree->key, sib1(sib2(tree))->u.n);
diff --git a/lptree.c b/lptree.c
index 557090b..62acc5c 100644
--- a/lptree.c
+++ b/lptree.c
@@ -21,7 +21,7 @@
 /* number of siblings for each tree */
 const byte numsiblings[] = {
  0, 0, 0,      /* char, set, any */
-  0, 0,         /* true, false */       
+  0, 0, 0,      /* true, false, utf-range */
  1,            /* rep */
  2, 2,         /* seq, choice */
  1, 1,         /* not, and */
@@ -675,6 +675,56 @@ static int lp_range (lua_State *L) {
 /*
+** Fills a tree node with basic information about the UTF-8 code point
+** 'cpu': its value in 'n', its length in 'cap', and its first byte in
+** 'key'
+*/
+static void codeutftree (lua_State *L, TTree *t, lua_Unsigned cpu, int arg) {
+  int len, fb, cp;
+  /*
+  ** Validate the full unsigned value before narrowing it to 'int'.
+  ** Otherwise a huge argument (including a negative integer converted
+  ** to unsigned by the caller) could be truncated by the cast into
+  ** something that looks like a valid small code point and be silently
+  ** accepted. 'arg' is the argument index used in the error message.
+  */
+  luaL_argcheck(L, cpu <= 0x10ffffu, arg, "invalid code point");
+  cp = (int)cpu;  /* safe: value is known to be at most 0x10FFFF */
+  if (cp <= 0x7f) {  /* one byte? */
+    len = 1;
+    fb = cp;
+  } else if (cp <= 0x7ff) {  /* two bytes? */
+    len = 2;
+    fb = 0xC0 | (cp >> 6);
+  } else if (cp <= 0xffff) {  /* three bytes? */
+    len = 3;
+    fb = 0xE0 | (cp >> 12);
+  }
+  else {  /* four bytes (upper limit already checked above) */
+    len = 4;
+    fb = 0xF0 | (cp >> 18);
+  }
+  t->u.n = cp;  /* code point value */
+  t->cap = len;  /* length of its UTF-8 encoding, in bytes */
+  t->key = fb;  /* first byte of its UTF-8 encoding */
+}
+/*
+** lpeg.utfR(from, to): builds a pattern for the code points in
+** [from, to]. Raises an argument error ("empty range") when
+** from > to. A range that ends inside ASCII becomes a plain charset;
+** any other range becomes a TUTFR node whose extra-info sibling
+** describes the upper limit.
+*/
+static int lp_utfr (lua_State *L) {
+  lua_Unsigned lo = (lua_Unsigned)luaL_checkinteger(L, 1);
+  lua_Unsigned hi = (lua_Unsigned)luaL_checkinteger(L, 2);
+  luaL_argcheck(L, lo <= hi, 2, "empty range");
+  if (hi > 0x7f) {  /* range reaches multi-byte sequences? */
+    TTree *node = newtree(L, 2);  /* range node plus its sibling */
+    TTree *limit;
+    node->tag = TUTFR;
+    codeutftree(L, node, lo, 1);
+    limit = sib1(node);
+    limit->tag = TXInfo;
+    codeutftree(L, limit, hi, 2);
+  }
+  else {  /* pure ascii: code it as a regular charset */
+    TTree *set = newcharset(L);
+    unsigned int c = (int)lo;
+    while (c <= hi) {
+      setchar(treebuffer(set), c);
+      c++;
+    }
+  }
+  return 1;  /* one result (presumably the pattern left on the stack) */
+}
+/*
 ** Look-behind predicate
 */
 static int lp_behind (lua_State *L) {
@@ -1008,7 +1058,7 @@ static int verifyrule (lua_State *L, TTree *tree, unsigned short *passed,
 tailcall:
  switch (tree->tag) {
    case TChar: case TSet: case TAny:
-    case TFalse:
+    case TFalse: case TUTFR:
      return nb;  /* cannot pass from here */
    case TTrue:
    case TBehind:  /* look-behind cannot have calls */
@@ -1271,6 +1321,7 @@ static struct luaL_Reg pattreg[] = {
  {"P", lp_P},
  {"S", lp_set},
  {"R", lp_range},
+  {"utfR", lp_utfr},
  {"locale", lp_locale},
  {"version", lp_version},
  {"setmaxstack", lp_setmax},
diff --git a/lptree.h b/lptree.h
index 3e8b52b..892e013 100644
--- a/lptree.h
+++ b/lptree.h
@@ -18,6 +18,9 @@ typedef enum TTag {
  TAny,
  TTrue,
  TFalse,
+  TUTFR,  /* range of UTF-8 codepoints; 'n' has initial codepoint;
+             'cap' has length; 'key' has first byte;
+             extra info is similar for end codepoint */
  TRep,  /* 'sib1'* */
  TSeq,  /* 'sib1' 'sib2' */
  TChoice,  /* 'sib1' / 'sib2' */
@@ -26,8 +29,9 @@ typedef enum TTag {
  TCall,  /* ktable[key] is rule's key; 'sib2' is rule being called */
  TOpenCall,  /* ktable[key] is rule's key */
  TRule,  /* ktable[key] is rule's key (but key == 0 for unused rules);
-             'sib1' is rule's pattern pre-rule; 'sib2' is next rule; */
+             'sib1' is rule's pattern pre-rule; 'sib2' is next rule;
-  TXInfo,  /* extra info; 'n' is rule's sequential number */
+             extra info 'n' is rule's sequential number */
+  TXInfo,  /* extra info */
  TGrammar,  /* 'sib1' is initial (and first) rule */
  TBehind,  /* 'sib1' is pattern, 'n' is how much to go back */
  TCapture,  /* captures: 'cap' is kind of capture (enum 'CapKind');
diff --git a/lptypes.h b/lptypes.h
index 223d887..eea3d0c 100644
--- a/lptypes.h
+++ b/lptypes.h
@@ -37,6 +37,8 @@
 #define luaL_setfuncs(L,f,n)    luaL_register(L,NULL,f)
 #define luaL_newlib(L,f)        luaL_register(L,"lpeg",f)
+typedef size_t lua_Unsigned;
 #endif
diff --git a/lpvm.c b/lpvm.c
index 737418c..dbe1a8e 100644
--- a/lpvm.c
+++ b/lpvm.c
@@ -28,6 +28,35 @@ static const Instruction giveup = {{IGiveup, 0, 0}};
 /*
+** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid.
+*/
+static const char *utf8_decode (const char *o, int *val) {
+  /* limits[n]: largest value that must NOT be coded with 'n'
+     continuation bytes (rejects overlong forms; entry 0 rejects a
+     stray continuation byte used as first byte) */
+  static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFFu};
+  const unsigned char *s = (const unsigned char *)o;
+  unsigned int c = s[0];  /* first byte */
+  unsigned int res = 0;  /* final result */
+  if (c < 0x80)  /* ascii? */
+    res = c;
+  else {
+    int count = 0;  /* to count number of continuation bytes */
+    while (c & 0x40) {  /* still have continuation bytes? */
+      int cc;
+      /* A valid sequence has at most 3 continuation bytes. Stopping
+         here keeps 'count' a valid index for 'limits', keeps the shift
+         below smaller than the width of 'unsigned int', and avoids
+         reading further bytes of a sequence already known to be bad. */
+      if (count == 3)
+        return NULL;  /* invalid byte sequence */
+      cc = s[++count];  /* read next byte */
+      if ((cc & 0xC0) != 0x80)  /* not a continuation byte? */
+        return NULL;  /* invalid byte sequence */
+      res = (res << 6) | (cc & 0x3F);  /* add lower 6 bits from cont. byte */
+      c <<= 1;  /* to test next bit */
+    }
+    res |= (c & 0x7F) << (count * 5);  /* add first byte */
+    if (res > 0x10FFFFu || res <= limits[count])
+      return NULL;  /* out of range or overlong: invalid byte sequence */
+    s += count;  /* skip continuation bytes read */
+  }
+  *val = res;  /* written only on success */
+  return (const char *)s + 1;  /* +1 to include first byte */
+}
+/*
 ** {======================================================
 ** Virtual Machine
 ** =======================================================
@@ -198,6 +227,17 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e,
        else goto fail;
        continue;
      }
+      case IUTFR: {
+        int codepoint;
+        if (s >= e)
+          goto fail;
+        s = utf8_decode (s, &codepoint);
+        if (s && p[1].offset <= codepoint && codepoint <= utf_to(p))
+          p += 2;
+        else
+          goto fail;
+        continue;
+      }
      case ITestAny: {
        if (s < e) p += 2;
        else p += getoffset(p);
diff --git a/lpvm.h b/lpvm.h
index 576429f..9fde967 100644
--- a/lpvm.h
+++ b/lpvm.h
@@ -17,6 +17,7 @@ typedef enum Opcode {
  ITestChar,  /* if char != aux, jump to 'offset' */
  ITestSet,  /* if char not in buff, jump to 'offset' */
  ISpan,  /* read a span of chars in buff */
+  IUTFR,  /* if codepoint not in range [offset, utf_to], fail */
  IBehind,  /* walk back 'aux' characters (fail if not possible) */
  IRet,  /* return from a rule */
  IEnd,  /* end of pattern */
@@ -50,6 +51,10 @@ typedef union Instruction {
 } Instruction;
+/*
+** extract 24-bit value from an instruction: field 'aux' supplies the
+** lowest 8 bits and field 'key' supplies the bits above them
+*/
+#define utf_to(inst)    (((inst)->i.key << 8) | (inst)->i.aux)
 void printpatt (Instruction *p, int n);
 const char *match (lua_State *L, const char *o, const char *s, const char *e,
                   Instruction *op, Capture *capture, int ptop);
diff --git a/test.lua b/test.lua
index f57cdec..e86c21a 100755
--- a/test.lua
+++ b/test.lua
@@ -48,7 +48,6 @@ end
 print"General tests for LPeg library"
-assert(type(m.version()) == "string")
 print("version " .. m.version())
 assert(m.type("alo") ~= "pattern")
 assert(m.type(io.input) ~= "pattern")
@@ -1189,6 +1188,63 @@ do  print"testing large grammars"
 end
+print "testing UTF-8 ranges"
+do   -- typical UTF-8 ranges, tried in order (first alternative wins)
+  local cyr = "ждюя"
+  local cjk = "专举乸"
+  local emot = "\240\159\152\128\240\159\153\128"   --  😀🙀
+  local ascii = "alo"
+  local last = "\244\143\191\191"                -- U+10FFFF
+  local s = cyr .. "—" .. emot .. "—" .. cjk .. "—" .. ascii .. last
+  -- each alternative labels a run of code points from one range;
+  -- the final one accepts any single code point
+  local item = m.utfR(0x410, 0x44f)^1 / "cyr: %0"
+             + m.utfR(0x4e00, 0x9fff)^1 / "cjk: %0"
+             + m.utfR(0x1F600, 0x1F64F)^1 / "emot: %0"
+             + m.utfR(0, 0x7f)^1 / "ascii: %0"
+             + m.utfR(0, 0x10ffff) / "other: %0"
+  local p = m.Ct(item^0) * -m.P(1)   -- must consume the whole subject
+  t = p:match(s)
+  local expected = {
+    "cyr: " .. cyr, "other: —",
+    "emot: " .. emot, "other: —",
+    "cjk: " .. cjk, "other: —",
+    "ascii: " .. ascii, "other: " .. last,
+  }
+  for i = 1, #expected do
+    assert(t[i] == expected[i])
+  end
+  assert(t[#expected + 1] == nil)   -- nothing beyond the expected items
+end
+do   -- valid and invalid code points
+  local p = m.utfR(0, 0x10ffff)^0
+  -- stops right before the stray continuation byte
+  assert(p:match("汉字\128") == #"汉字" + 1)
+  -- subjects whose first sequence is invalid: nothing is consumed
+  local invalid = {"\244\159\191", "\244\159\191\191", "\255"}
+  for i = 1, #invalid do
+    assert(p:match(invalid[i]) == 1)
+  end
+  -- basic errors
+  checkerr("empty range", m.utfR, 1, 0)
+  checkerr("invalid code point", m.utfR, 1, 0x10ffff + 1)
+end
+do  -- look-behind over UTF-8 ranges (needs fixed width)
+  -- capture the byte that follows a CJK code point
+  local aftercjk = m.B(m.utfR(0x4e00, 0x9fff)) * m.C(1)
+  local search = m.P{ aftercjk + m.P(1) * m.V(1) }   -- search for it
+  assert(search:match("ab д 专X x") == "X")
+  -- capture the byte that follows a Hebrew code point
+  local afterhebrew = m.B(m.utfR(0x5d0, 0x5ea)) * m.C(1)
+  local skipfirst = m.P(#"ש") * afterhebrew   -- skip the letter's bytes
+  assert(skipfirst:match("שX") == "X")
+  -- a range whose limits differ in encoded length has no fixed length
+  checkerr("fixed length", m.B, m.utfR(0, 0x10ffff))
+end
 -------------------------------------------------------------------
 -- Tests for 're' module
 -------------------------------------------------------------------
author	Roberto Ierusalimschy <roberto@inf.puc-rio.br>	2019-04-17 14:08:22 -0300
committer	Roberto Ierusalimschy <roberto@inf.puc-rio.br>	2019-04-17 14:08:22 -0300
commit	24bf757183d8bd97f6f5b43d916814f3269c8347 (patch)
tree	646cd65d6e2dab57691f98f83f15f25c70685ef8
parent	3f7797419e4d7493e1364290a5b127d1cb45e3bf (diff)
download	lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.gz lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.bz2 lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.zip