aboutsummaryrefslogtreecommitdiff
path: root/lpvm.c
diff options
context:
space:
mode:
author Roberto Ierusalimschy <roberto@inf.puc-rio.br> 2019-04-17 14:08:22 -0300
committer Roberto Ierusalimschy <roberto@inf.puc-rio.br> 2019-04-17 14:08:22 -0300
commit 24bf757183d8bd97f6f5b43d916814f3269c8347 (patch)
tree 646cd65d6e2dab57691f98f83f15f25c70685ef8 /lpvm.c
parent 3f7797419e4d7493e1364290a5b127d1cb45e3bf (diff)
downloadlpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.gz
lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.bz2
lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.zip
Implementation of UTF-8 ranges
New constructor 'lpeg.utfR(from, to)' creates a pattern that matches UTF-8 byte sequences representing code points in the range [from, to].
Diffstat (limited to 'lpvm.c')
-rw-r--r--lpvm.c40
1 files changed, 40 insertions, 0 deletions
diff --git a/lpvm.c b/lpvm.c
index 737418c..dbe1a8e 100644
--- a/lpvm.c
+++ b/lpvm.c
@@ -28,6 +28,35 @@ static const Instruction giveup = {{IGiveup, 0, 0}};
28 28
29 29
/*
** Decode one UTF-8 sequence starting at 'o'.
**
** On success, store the decoded code point in '*val' and return a
** pointer to the byte just after the sequence. Return NULL, leaving
** '*val' untouched, if the byte sequence is invalid: a stray
** continuation byte in first position, a missing or malformed
** continuation byte, an overlong encoding, a sequence that would need
** more than three continuation bytes, or a value above 0x10FFFF.
** Surrogate code points (0xD800-0xDFFF) are not rejected.
**
** The function takes no length argument. It reads the first byte and
** then one byte per continuation announced by the first byte, stopping
** at the first byte that is not a continuation byte. The caller must
** therefore guarantee that 'o' is readable and that a non-continuation
** byte (for instance a '\0' terminator) follows the data.
*/
static const char *utf8_decode (const char *o, int *val) {
  /* limits[n] is the largest value that does NOT need 'n' continuation
     bytes; anything <= limits[n] encoded with 'n' of them is overlong.
     Entry 0 rejects every non-ASCII first byte that announces no
     continuation (that is, a stray continuation byte). */
  static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFFu};
  const unsigned char *s = (const unsigned char *)o;
  unsigned int c = s[0];  /* first byte */
  unsigned int res = 0;  /* final result */
  if (c < 0x80)  /* ascii? */
    res = c;
  else {
    int count = 0;  /* to count number of continuation bytes */
    while (c & 0x40) {  /* still have continuation bytes? */
      unsigned int cc;
      /* A valid sequence has at most 3 continuation bytes. Stop here
         instead of after the loop: otherwise a first byte such as 0xFF
         would make us read up to 7 bytes ahead and then shift by
         'count * 5' == 35 bits, which is undefined for 32-bit ints. */
      if (count == 3)
        return NULL;  /* sequence too long */
      cc = s[++count];  /* read next byte */
      if ((cc & 0xC0) != 0x80)  /* not a continuation byte? */
        return NULL;  /* invalid byte sequence */
      res = (res << 6) | (cc & 0x3F);  /* add lower 6 bits from cont. byte */
      c <<= 1;  /* to test next bit */
    }
    /* each shift above moved one length-marker bit out of the low 7
       bits of 'c'; what remains are the payload bits of the first byte */
    res |= (c & 0x7F) << (count * 5);  /* add first byte */
    if (res > 0x10FFFFu || res <= limits[count])
      return NULL;  /* out of Unicode range or overlong encoding */
    s += count;  /* skip continuation bytes read */
  }
  *val = res;
  return (const char *)s + 1;  /* +1 to include first byte */
}
57
58
59/*
31** {====================================================== 60** {======================================================
32** Virtual Machine 61** Virtual Machine
33** ======================================================= 62** =======================================================
@@ -198,6 +227,17 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e,
198 else goto fail; 227 else goto fail;
199 continue; 228 continue;
200 } 229 }
230 case IUTFR: {
231 int codepoint;
232 if (s >= e)
233 goto fail;
234 s = utf8_decode (s, &codepoint);
235 if (s && p[1].offset <= codepoint && codepoint <= utf_to(p))
236 p += 2;
237 else
238 goto fail;
239 continue;
240 }
201 case ITestAny: { 241 case ITestAny: {
202 if (s < e) p += 2; 242 if (s < e) p += 2;
203 else p += getoffset(p); 243 else p += getoffset(p);