diff options
author | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2019-04-17 14:08:22 -0300 |
---|---|---|
committer | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2019-04-17 14:08:22 -0300 |
commit | 24bf757183d8bd97f6f5b43d916814f3269c8347 (patch) | |
tree | 646cd65d6e2dab57691f98f83f15f25c70685ef8 /lpvm.c | |
parent | 3f7797419e4d7493e1364290a5b127d1cb45e3bf (diff) | |
download | lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.gz lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.bz2 lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.zip |
Implementation of UTF-8 ranges
New constructor 'lpeg.utfR(from, to)' creates a pattern that matches
UTF-8 byte sequences representing code points in the range [from, to].
Diffstat (limited to 'lpvm.c')
-rw-r--r-- | lpvm.c | 40 |
1 files changed, 40 insertions, 0 deletions
@@ -28,6 +28,35 @@ static const Instruction giveup = {{IGiveup, 0, 0}}; | |||
28 | 28 | ||
29 | 29 | ||
/*
** Decode one UTF-8 sequence starting at 'o'.
**
** On success, stores the decoded code point in '*val' and returns a
** pointer to the byte just after the sequence. Returns NULL (leaving
** '*val' untouched) if the byte sequence is invalid: a stray
** continuation byte, a missing continuation byte, an overlong encoding,
** a sequence longer than four bytes, or a value above 0x10FFFF.
** Surrogate code points are not rejected.
**
** The function reads continuation bytes until it finds a byte that is
** not one, so the input must end with such a byte (e.g., a '\0'
** terminator) for a truncated sequence to be detected safely.
*/
static const char *utf8_decode (const char *o, int *val) {
  /* limits[n]: largest value that fits in a shorter encoding than one
     with 'n' continuation bytes; entry 0 rejects any non-ASCII byte
     that is not a proper lead byte */
  static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFFu};
  const unsigned char *s = (const unsigned char *)o;
  unsigned int c = s[0];  /* first byte */
  unsigned int res = 0;  /* final result */
  if (c < 0x80)  /* ascii? */
    res = c;
  else {
    int count = 0;  /* to count number of continuation bytes */
    while (c & 0x40) {  /* still have continuation bytes? */
      unsigned int cc;
      /* Reject over-long sequences right away. Besides avoiding
         useless reads, this keeps 'count * 5' below the width of
         'unsigned int' in the shift after the loop (a 0xFF lead byte
         would otherwise produce a shift by 35 bits). */
      if (++count > 3)
        return NULL;  /* invalid byte sequence */
      cc = s[count];  /* read next byte */
      if ((cc & 0xC0) != 0x80)  /* not a continuation byte? */
        return NULL;  /* invalid byte sequence */
      res = (res << 6) | (cc & 0x3F);  /* add lower 6 bits from cont. byte */
      c <<= 1;  /* to test next bit */
    }
    /* after 'count' shifts, the payload bits of the first byte sit
       'count' positions too high; 'count * 5' more completes the
       'count * 6' needed to place them above the continuation bits */
    res |= (c & 0x7F) << (count * 5);  /* add first byte */
    if (res > 0x10FFFFu || res <= limits[count])
      return NULL;  /* invalid byte sequence */
    s += count;  /* skip continuation bytes read */
  }
  *val = (int)res;
  return (const char *)s + 1;  /* +1 to include first byte */
}
57 | |||
58 | |||
59 | /* | ||
31 | ** {====================================================== | 60 | ** {====================================================== |
32 | ** Virtual Machine | 61 | ** Virtual Machine |
33 | ** ======================================================= | 62 | ** ======================================================= |
@@ -198,6 +227,17 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e, | |||
198 | else goto fail; | 227 | else goto fail; |
199 | continue; | 228 | continue; |
200 | } | 229 | } |
230 | case IUTFR: { | ||
231 | int codepoint; | ||
232 | if (s >= e) | ||
233 | goto fail; | ||
234 | s = utf8_decode (s, &codepoint); | ||
235 | if (s && p[1].offset <= codepoint && codepoint <= utf_to(p)) | ||
236 | p += 2; | ||
237 | else | ||
238 | goto fail; | ||
239 | continue; | ||
240 | } | ||
201 | case ITestAny: { | 241 | case ITestAny: { |
202 | if (s < e) p += 2; | 242 | if (s < e) p += 2; |
203 | else p += getoffset(p); | 243 | else p += getoffset(p); |