Implementation of UTF-8 ranges

New constructor 'lpeg.utfR(from, to)' creates a pattern that matches UTF-8 byte sequences representing code points in the range [from, to].
author: Roberto Ierusalimschy <roberto@inf.puc-rio.br> 2019-04-17 14:08:22 -0300
committer: Roberto Ierusalimschy <roberto@inf.puc-rio.br> 2019-04-17 14:08:22 -0300
commit: 24bf757183d8bd97f6f5b43d916814f3269c8347 (patch)
tree: 646cd65d6e2dab57691f98f83f15f25c70685ef8 /lpvm.c
parent: 3f7797419e4d7493e1364290a5b127d1cb45e3bf (diff)
download: lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.gz
lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.bz2
lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.zip
1 files changed, 40 insertions, 0 deletions
diff --git a/lpvm.c b/lpvm.c
index 737418c..dbe1a8e 100644
--- a/lpvm.c
+++ b/lpvm.c
@@ -28,6 +28,35 @@ static const Instruction giveup = {{IGiveup, 0, 0}};
 /*
+** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid.
+*/
+static const char *utf8_decode (const char *o, int *val) {
+  /*
+  ** 'o' points to the lead byte of the sequence to decode. On success,
+  ** stores the decoded code point in '*val' and returns a pointer to the
+  ** byte right after the sequence. Returns NULL (leaving '*val' untouched)
+  ** for a stray continuation byte used as lead, a missing continuation
+  ** byte, an overlong encoding, a sequence announcing more than 3
+  ** continuation bytes, or a value above 0x10FFFF. Surrogate code points
+  ** are not rejected.
+  ** There is no end-of-buffer argument: continuation bytes are read from
+  ** 'o[1]' onwards. NOTE(review): this presumably relies on the subject
+  ** being NUL-terminated, so that a truncated sequence stops at the '\0'
+  ** (which is not a continuation byte); confirm against the caller.
+  */
+  /* limits[n]: largest value that does not need 'n' continuation bytes;
+     entry 0 rejects any non-ASCII lead byte that announces none */
+  static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFFu};
+  const unsigned char *s = (const unsigned char *)o;
+  unsigned int c = s[0];  /* first byte */
+  unsigned int res = 0;  /* final result */
+  if (c < 0x80)  /* ascii? */
+    res = c;
+  else {
+    int count = 0;  /* to count number of continuation bytes */
+    while (c & 0x40) {  /* still have continuation bytes? */
+      int cc;
+      /* Reject over-long sequences here, before reading further bytes:
+         this keeps the shift below within the width of 'unsigned int'
+         (a 0xFF lead byte would otherwise reach a shift of 35 bits) and
+         bounds the look-ahead to 3 bytes past the lead byte. */
+      if (++count > 3)
+        return NULL;  /* invalid byte sequence */
+      cc = s[count];  /* read next byte */
+      if ((cc & 0xC0) != 0x80)  /* not a continuation byte? */
+        return NULL;  /* invalid byte sequence */
+      res = (res << 6) | (cc & 0x3F);  /* add lower 6 bits from cont. byte */
+      c <<= 1;  /* to test next bit */
+    }
+    /* 'c' was already shifted left 'count' times, so 5 more bits per
+       continuation byte put the lead-byte payload at bit 'count * 6' */
+    res |= (c & 0x7F) << (count * 5);  /* add first byte */
+    if (res > 0x10FFFFu || res <= limits[count])
+      return NULL;  /* invalid byte sequence */
+    s += count;  /* skip continuation bytes read */
+  }
+  *val = res;
+  return (const char *)s + 1;  /* +1 to include first byte */
+}
+/*
 ** {======================================================
 ** Virtual Machine
 ** =======================================================
@@ -198,6 +227,17 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e,
        else goto fail;
        continue;
      }
+      case IUTFR: {
+        int codepoint;
+        if (s >= e)
+          goto fail;
+        s = utf8_decode (s, &codepoint);
+        if (s && p[1].offset <= codepoint && codepoint <= utf_to(p))
+          p += 2;
+        else
+          goto fail;
+        continue;
+      }
      case ITestAny: {
        if (s < e) p += 2;
        else p += getoffset(p);
author	Roberto Ierusalimschy <roberto@inf.puc-rio.br>	2019-04-17 14:08:22 -0300
committer	Roberto Ierusalimschy <roberto@inf.puc-rio.br>	2019-04-17 14:08:22 -0300
commit	24bf757183d8bd97f6f5b43d916814f3269c8347 (patch)
tree	646cd65d6e2dab57691f98f83f15f25c70685ef8 /lpvm.c
parent	3f7797419e4d7493e1364290a5b127d1cb45e3bf (diff)
download	lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.gz lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.bz2 lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.zip

diff --git a/lpvm.c b/lpvm.c index 737418c..dbe1a8e 100644 --- a/lpvm.c +++ b/lpvm.c
@@ -28,6 +28,35 @@ static const Instruction giveup = {{IGiveup, 0, 0}};
28		28
29		29
30	/*	30	/*
		31	** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid.
		32	*/
		33	static const char *utf8_decode (const char *o, int *val) {
		34	static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFFu};
		35	const unsigned char *s = (const unsigned char *)o;
		36	unsigned int c = s[0]; /* first byte */
		37	unsigned int res = 0; /* final result */
		38	if (c < 0x80) /* ascii? */
		39	res = c;
		40	else {
		41	int count = 0; /* to count number of continuation bytes */
		42	while (c & 0x40) { /* still have continuation bytes? */
		43	int cc = s[++count]; /* read next byte */
		44	if ((cc & 0xC0) != 0x80) /* not a continuation byte? */
		45	return NULL; /* invalid byte sequence */
		46	res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */
		47	c <<= 1; /* to test next bit */
		48	}
		49	res |= (c & 0x7F) << (count * 5); /* add first byte */
		50	if (count > 3 || res > 0x10FFFFu || res <= limits[count])
		51	return NULL; /* invalid byte sequence */
		52	s += count; /* skip continuation bytes read */
		53	}
		54	*val = res;
		55	return (const char *)s + 1; /* +1 to include first byte */
		56	}
		57
		58
		59	/*
31	** {======================================================	60	** {======================================================
32	** Virtual Machine	61	** Virtual Machine
33	** =======================================================	62	** =======================================================
@@ -198,6 +227,17 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e,
198	else goto fail;	227	else goto fail;
199	continue;	228	continue;
200	}	229	}
		230	case IUTFR: {
		231	int codepoint;
		232	if (s >= e)
		233	goto fail;
		234	s = utf8_decode (s, &codepoint);
		235	if (s && p[1].offset <= codepoint && codepoint <= utf_to(p))
		236	p += 2;
		237	else
		238	goto fail;
		239	continue;
		240	}
201	case ITestAny: {	241	case ITestAny: {
202	if (s < e) p += 2;	242	if (s < e) p += 2;
203	else p += getoffset(p);	243	else p += getoffset(p);