aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRoberto Ierusalimschy <roberto@inf.puc-rio.br>2019-04-17 14:08:22 -0300
committerRoberto Ierusalimschy <roberto@inf.puc-rio.br>2019-04-17 14:08:22 -0300
commit24bf757183d8bd97f6f5b43d916814f3269c8347 (patch)
tree646cd65d6e2dab57691f98f83f15f25c70685ef8
parent3f7797419e4d7493e1364290a5b127d1cb45e3bf (diff)
downloadlpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.gz
lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.bz2
lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.zip
Implementation of UTF-8 ranges
New constructor 'lpeg.utfR(from, to)' creates a pattern that matches UTF-8 byte sequences representing code points in the range [from, to].
-rw-r--r--lpcode.c43
-rw-r--r--lpeg.html12
-rw-r--r--lpprint.c19
-rw-r--r--lptree.c55
-rw-r--r--lptree.h8
-rw-r--r--lptypes.h2
-rw-r--r--lpvm.c40
-rw-r--r--lpvm.h5
-rwxr-xr-xtest.lua58
9 files changed, 222 insertions, 20 deletions
diff --git a/lpcode.c b/lpcode.c
index 5fddfab..7d47a00 100644
--- a/lpcode.c
+++ b/lpcode.c
@@ -196,7 +196,7 @@ int hascaptures (TTree *tree) {
196int checkaux (TTree *tree, int pred) { 196int checkaux (TTree *tree, int pred) {
197 tailcall: 197 tailcall:
198 switch (tree->tag) { 198 switch (tree->tag) {
199 case TChar: case TSet: case TAny: 199 case TChar: case TSet: case TAny: case TUTFR:
200 case TFalse: case TOpenCall: 200 case TFalse: case TOpenCall:
201 return 0; /* not nullable */ 201 return 0; /* not nullable */
202 case TRep: case TTrue: 202 case TRep: case TTrue:
@@ -239,6 +239,8 @@ int fixedlen (TTree *tree) {
239 switch (tree->tag) { 239 switch (tree->tag) {
240 case TChar: case TSet: case TAny: 240 case TChar: case TSet: case TAny:
241 return len + 1; 241 return len + 1;
242 case TUTFR:
243 return (tree->cap == sib1(tree)->cap) ? len + tree->cap : -1;
242 case TFalse: case TTrue: case TNot: case TAnd: case TBehind: 244 case TFalse: case TTrue: case TNot: case TAnd: case TBehind:
243 return len; 245 return len;
244 case TRep: case TRunTime: case TOpenCall: 246 case TRep: case TRunTime: case TOpenCall:
@@ -298,6 +300,13 @@ static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) {
298 tocharset(tree, firstset); 300 tocharset(tree, firstset);
299 return 0; 301 return 0;
300 } 302 }
303 case TUTFR: {
304 int c;
305 loopset(i, firstset->cs[i] = 0); /* erase all chars */
306 for (c = tree->key; c <= sib1(tree)->key; c++)
307 setchar(firstset->cs, c);
308 return 0;
309 }
301 case TTrue: { 310 case TTrue: {
302 loopset(i, firstset->cs[i] = follow->cs[i]); 311 loopset(i, firstset->cs[i] = follow->cs[i]);
303 return 1; /* accepts the empty string */ 312 return 1; /* accepts the empty string */
@@ -380,7 +389,7 @@ static int headfail (TTree *tree) {
380 case TChar: case TSet: case TAny: case TFalse: 389 case TChar: case TSet: case TAny: case TFalse:
381 return 1; 390 return 1;
382 case TTrue: case TRep: case TRunTime: case TNot: 391 case TTrue: case TRep: case TRunTime: case TNot:
383 case TBehind: 392 case TBehind: case TUTFR:
384 return 0; 393 return 0;
385 case TCapture: case TGrammar: case TRule: case TXInfo: case TAnd: 394 case TCapture: case TGrammar: case TRule: case TXInfo: case TAnd:
386 tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */ 395 tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */
@@ -407,7 +416,7 @@ static int headfail (TTree *tree) {
407static int needfollow (TTree *tree) { 416static int needfollow (TTree *tree) {
408 tailcall: 417 tailcall:
409 switch (tree->tag) { 418 switch (tree->tag) {
410 case TChar: case TSet: case TAny: 419 case TChar: case TSet: case TAny: case TUTFR:
411 case TFalse: case TTrue: case TAnd: case TNot: 420 case TFalse: case TTrue: case TAnd: case TNot:
412 case TRunTime: case TGrammar: case TCall: case TBehind: 421 case TRunTime: case TGrammar: case TCall: case TBehind:
413 return 0; 422 return 0;
@@ -418,7 +427,7 @@ static int needfollow (TTree *tree) {
418 case TSeq: 427 case TSeq:
419 tree = sib2(tree); goto tailcall; 428 tree = sib2(tree); goto tailcall;
420 default: assert(0); return 0; 429 default: assert(0); return 0;
421 } 430 }
422} 431}
423 432
424/* }====================================================== */ 433/* }====================================================== */
@@ -441,6 +450,7 @@ int sizei (const Instruction *i) {
441 case ITestSet: return CHARSETINSTSIZE + 1; 450 case ITestSet: return CHARSETINSTSIZE + 1;
442 case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall: 451 case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall:
443 case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit: 452 case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit:
453 case IUTFR:
444 return 2; 454 return 2;
445 default: return 1; 455 default: return 1;
446 } 456 }
@@ -518,6 +528,16 @@ static void setoffset (CompileState *compst, int instruction, int offset) {
518} 528}
519 529
520 530
531static void codeutfr (CompileState *compst, TTree *tree) {
532 int i = addoffsetinst(compst, IUTFR);
533 int to = sib1(tree)->u.n;
534 assert(sib1(tree)->tag == TXInfo);
535 getinstr(compst, i + 1).offset = tree->u.n;
536 getinstr(compst, i).i.aux = to & 0xff;
537 getinstr(compst, i).i.key = to >> 8;
538}
539
540
521/* 541/*
522** Add a capture instruction: 542** Add a capture instruction:
523** 'op' is the capture instruction; 'cap' the capture kind; 543** 'op' is the capture instruction; 'cap' the capture kind;
@@ -665,11 +685,11 @@ static void codebehind (CompileState *compst, TTree *tree) {
665 685
666/* 686/*
667** Choice; optimizations: 687** Choice; optimizations:
668** - when p1 is headfail or 688** - when p1 is headfail or when first(p1) and first(p2) are disjoint,
669** when first(p1) and first(p2) are disjoint, than 689** than a character not in first(p1) cannot go to p1 and a character
670** a character not in first(p1) cannot go to p1, and a character 690** in first(p1) cannot go to p2, either because p1 will accept
671** in first(p1) cannot go to p2 (at it is not in first(p2)). 691** (headfail) or because it is not in first(p2) (disjoint).
672** (The optimization is not valid if p1 accepts the empty string, 692** (The second case is not valid if p1 accepts the empty string,
673** as then there is no character at all...) 693** as then there is no character at all...)
674** - when p2 is empty and opt is true; a IPartialCommit can reuse 694** - when p2 is empty and opt is true; a IPartialCommit can reuse
675** the Choice already active in the stack. 695** the Choice already active in the stack.
@@ -686,7 +706,7 @@ static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt,
686 int jmp = NOINST; 706 int jmp = NOINST;
687 codegen(compst, p1, 0, test, fl); 707 codegen(compst, p1, 0, test, fl);
688 if (!emptyp2) 708 if (!emptyp2)
689 jmp = addoffsetinst(compst, IJmp); 709 jmp = addoffsetinst(compst, IJmp);
690 jumptohere(compst, test); 710 jumptohere(compst, test);
691 codegen(compst, p2, opt, NOINST, fl); 711 codegen(compst, p2, opt, NOINST, fl);
692 jumptohere(compst, jmp); 712 jumptohere(compst, jmp);
@@ -697,7 +717,7 @@ static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt,
697 codegen(compst, p1, 1, NOINST, fullset); 717 codegen(compst, p1, 1, NOINST, fullset);
698 } 718 }
699 else { 719 else {
700 /* <p1 / p2> == 720 /* <p1 / p2> ==
701 test(first(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: */ 721 test(first(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: */
702 int pcommit; 722 int pcommit;
703 int test = codetestset(compst, &cs1, e1); 723 int test = codetestset(compst, &cs1, e1);
@@ -927,6 +947,7 @@ static void codegen (CompileState *compst, TTree *tree, int opt, int tt,
927 case TSet: codecharset(compst, treebuffer(tree), tt); break; 947 case TSet: codecharset(compst, treebuffer(tree), tt); break;
928 case TTrue: break; 948 case TTrue: break;
929 case TFalse: addinstruction(compst, IFail, 0); break; 949 case TFalse: addinstruction(compst, IFail, 0); break;
950 case TUTFR: codeutfr(compst, tree); break;
930 case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break; 951 case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break;
931 case TRep: coderep(compst, sib1(tree), opt, fl); break; 952 case TRep: coderep(compst, sib1(tree), opt, fl); break;
932 case TBehind: codebehind(compst, tree); break; 953 case TBehind: codebehind(compst, tree); break;
diff --git a/lpeg.html b/lpeg.html
index 8b9f59c..1295c4f 100644
--- a/lpeg.html
+++ b/lpeg.html
@@ -107,6 +107,9 @@ for creating patterns:
107 <td>Matches any character in <code>string</code> (Set)</td></tr> 107 <td>Matches any character in <code>string</code> (Set)</td></tr>
108<tr><td><a href="#op-r"><code>lpeg.R("<em>xy</em>")</code></a></td> 108<tr><td><a href="#op-r"><code>lpeg.R("<em>xy</em>")</code></a></td>
109 <td>Matches any character between <em>x</em> and <em>y</em> (Range)</td></tr> 109 <td>Matches any character between <em>x</em> and <em>y</em> (Range)</td></tr>
110<tr><td><a href="#op-utfR"><code>lpeg.utfR(cp1, cp2)</code></a></td>
111 <td>Matches an UTF-8 code point between <code>cp1</code> and
112 <code>cp2</code></td></tr>
110<tr><td><a href="#op-pow"><code>patt^n</code></a></td> 113<tr><td><a href="#op-pow"><code>patt^n</code></a></td>
111 <td>Matches at least <code>n</code> repetitions of <code>patt</code></td></tr> 114 <td>Matches at least <code>n</code> repetitions of <code>patt</code></td></tr>
112<tr><td><a href="#op-pow"><code>patt^-n</code></a></td> 115<tr><td><a href="#op-pow"><code>patt^-n</code></a></td>
@@ -329,6 +332,15 @@ are patterns that always fail.
329</p> 332</p>
330 333
331 334
335<h3><a name="op-utfR"></a><code>lpeg.utfR (cp1, cp2)</code></h3>
336<p>
337Returns a pattern that matches a valid UTF-8 byte sequence
338representing a code point in the range <code>[cp1, cp2]</code>.
339The range is limited by the natural Unicode limit of 0x10FFFF,
340but may include surrogates.
341</p>
342
343
332<h3><a name="op-v"></a><code>lpeg.V (v)</code></h3> 344<h3><a name="op-v"></a><code>lpeg.V (v)</code></h3>
333<p> 345<p>
334This operation creates a non-terminal (a <em>variable</em>) 346This operation creates a non-terminal (a <em>variable</em>)
diff --git a/lpprint.c b/lpprint.c
index 397785e..6893bb8 100644
--- a/lpprint.c
+++ b/lpprint.c
@@ -56,7 +56,7 @@ void printinst (const Instruction *op, const Instruction *p) {
56 const char *const names[] = { 56 const char *const names[] = {
57 "any", "char", "set", 57 "any", "char", "set",
58 "testany", "testchar", "testset", 58 "testany", "testchar", "testset",
59 "span", "behind", 59 "span", "utf-range", "behind",
60 "ret", "end", 60 "ret", "end",
61 "choice", "jmp", "call", "open_call", 61 "choice", "jmp", "call", "open_call",
62 "commit", "partial_commit", "back_commit", "failtwice", "fail", "giveup", 62 "commit", "partial_commit", "back_commit", "failtwice", "fail", "giveup",
@@ -66,11 +66,15 @@ void printinst (const Instruction *op, const Instruction *p) {
66 printf("%02ld: %s ", (long)(p - op), names[p->i.code]); 66 printf("%02ld: %s ", (long)(p - op), names[p->i.code]);
67 switch ((Opcode)p->i.code) { 67 switch ((Opcode)p->i.code) {
68 case IChar: { 68 case IChar: {
69 printf("'%c'", p->i.aux); 69 printf("'%c' (%02x)", p->i.aux, p->i.aux);
70 break; 70 break;
71 } 71 }
72 case ITestChar: { 72 case ITestChar: {
73 printf("'%c'", p->i.aux); printjmp(op, p); 73 printf("'%c' (%02x)", p->i.aux, p->i.aux); printjmp(op, p);
74 break;
75 }
76 case IUTFR: {
77 printf("%d - %d", p[1].offset, utf_to(p));
74 break; 78 break;
75 } 79 }
76 case IFullCapture: { 80 case IFullCapture: {
@@ -148,7 +152,7 @@ void printcaplist (Capture *cap, Capture *limit) {
148 152
149static const char *tagnames[] = { 153static const char *tagnames[] = {
150 "char", "set", "any", 154 "char", "set", "any",
151 "true", "false", 155 "true", "false", "utf8.range",
152 "rep", 156 "rep",
153 "seq", "choice", 157 "seq", "choice",
154 "not", "and", 158 "not", "and",
@@ -177,6 +181,13 @@ void printtree (TTree *tree, int ident) {
177 printf("\n"); 181 printf("\n");
178 break; 182 break;
179 } 183 }
184 case TUTFR: {
185 assert(sib1(tree)->tag == TXInfo);
186 printf(" %d (%02x %d) - %d (%02x %d) \n",
187 tree->u.n, tree->key, tree->cap,
188 sib1(tree)->u.n, sib1(tree)->key, sib1(tree)->cap);
189 break;
190 }
180 case TOpenCall: case TCall: { 191 case TOpenCall: case TCall: {
181 assert(sib1(sib2(tree))->tag == TXInfo); 192 assert(sib1(sib2(tree))->tag == TXInfo);
182 printf(" key: %d (rule: %d)\n", tree->key, sib1(sib2(tree))->u.n); 193 printf(" key: %d (rule: %d)\n", tree->key, sib1(sib2(tree))->u.n);
diff --git a/lptree.c b/lptree.c
index 557090b..62acc5c 100644
--- a/lptree.c
+++ b/lptree.c
@@ -21,7 +21,7 @@
21/* number of siblings for each tree */ 21/* number of siblings for each tree */
22const byte numsiblings[] = { 22const byte numsiblings[] = {
23 0, 0, 0, /* char, set, any */ 23 0, 0, 0, /* char, set, any */
24 0, 0, /* true, false */ 24 0, 0, 0, /* true, false, utf-range */
25 1, /* rep */ 25 1, /* rep */
26 2, 2, /* seq, choice */ 26 2, 2, /* seq, choice */
27 1, 1, /* not, and */ 27 1, 1, /* not, and */
@@ -675,6 +675,56 @@ static int lp_range (lua_State *L) {
675 675
676 676
677/* 677/*
678** Fills a tree node with basic information about the UTF-8 code point
679** 'cpu': its value in 'n', its length in 'cap', and its first byte in
680** 'key'
681*/
682static void codeutftree (lua_State *L, TTree *t, lua_Unsigned cpu, int arg) {
683 int len, fb, cp;
684 cp = (int)cpu;
685 if (cp <= 0x7f) { /* one byte? */
686 len = 1;
687 fb = cp;
688 } else if (cp <= 0x7ff) {
689 len = 2;
690 fb = 0xC0 | (cp >> 6);
691 } else if (cp <= 0xffff) {
692 len = 3;
693 fb = 0xE0 | (cp >> 12);
694 }
695 else {
696 luaL_argcheck(L, cpu <= 0x10ffffu, arg, "invalid code point");
697 len = 4;
698 fb = 0xF0 | (cp >> 18);
699 }
700 t->u.n = cp;
701 t->cap = len;
702 t->key = fb;
703}
704
705
706static int lp_utfr (lua_State *L) {
707 lua_Unsigned from = (lua_Unsigned)luaL_checkinteger(L, 1);
708 lua_Unsigned to = (lua_Unsigned)luaL_checkinteger(L, 2);
709 luaL_argcheck(L, from <= to, 2, "empty range");
710 if (to <= 0x7f) { /* ascii range? */
711 TTree *tree = newcharset(L); /* code it as a regular charset */
712 unsigned int f;
713 for (f = (int)from; f <= to; f++)
714 setchar(treebuffer(tree), f);
715 }
716 else { /* multi-byte utf-8 range */
717 TTree *tree = newtree(L, 2);
718 tree->tag = TUTFR;
719 codeutftree(L, tree, from, 1);
720 sib1(tree)->tag = TXInfo;
721 codeutftree(L, sib1(tree), to, 2);
722 }
723 return 1;
724}
725
726
727/*
678** Look-behind predicate 728** Look-behind predicate
679*/ 729*/
680static int lp_behind (lua_State *L) { 730static int lp_behind (lua_State *L) {
@@ -1008,7 +1058,7 @@ static int verifyrule (lua_State *L, TTree *tree, unsigned short *passed,
1008 tailcall: 1058 tailcall:
1009 switch (tree->tag) { 1059 switch (tree->tag) {
1010 case TChar: case TSet: case TAny: 1060 case TChar: case TSet: case TAny:
1011 case TFalse: 1061 case TFalse: case TUTFR:
1012 return nb; /* cannot pass from here */ 1062 return nb; /* cannot pass from here */
1013 case TTrue: 1063 case TTrue:
1014 case TBehind: /* look-behind cannot have calls */ 1064 case TBehind: /* look-behind cannot have calls */
@@ -1271,6 +1321,7 @@ static struct luaL_Reg pattreg[] = {
1271 {"P", lp_P}, 1321 {"P", lp_P},
1272 {"S", lp_set}, 1322 {"S", lp_set},
1273 {"R", lp_range}, 1323 {"R", lp_range},
1324 {"utfR", lp_utfr},
1274 {"locale", lp_locale}, 1325 {"locale", lp_locale},
1275 {"version", lp_version}, 1326 {"version", lp_version},
1276 {"setmaxstack", lp_setmax}, 1327 {"setmaxstack", lp_setmax},
diff --git a/lptree.h b/lptree.h
index 3e8b52b..892e013 100644
--- a/lptree.h
+++ b/lptree.h
@@ -18,6 +18,9 @@ typedef enum TTag {
18 TAny, 18 TAny,
19 TTrue, 19 TTrue,
20 TFalse, 20 TFalse,
21 TUTFR, /* range of UTF-8 codepoints; 'n' has initial codepoint;
22 'cap' has length; 'key' has first byte;
23 extra info is similar for end codepoint */
21 TRep, /* 'sib1'* */ 24 TRep, /* 'sib1'* */
22 TSeq, /* 'sib1' 'sib2' */ 25 TSeq, /* 'sib1' 'sib2' */
23 TChoice, /* 'sib1' / 'sib2' */ 26 TChoice, /* 'sib1' / 'sib2' */
@@ -26,8 +29,9 @@ typedef enum TTag {
26 TCall, /* ktable[key] is rule's key; 'sib2' is rule being called */ 29 TCall, /* ktable[key] is rule's key; 'sib2' is rule being called */
27 TOpenCall, /* ktable[key] is rule's key */ 30 TOpenCall, /* ktable[key] is rule's key */
28 TRule, /* ktable[key] is rule's key (but key == 0 for unused rules); 31 TRule, /* ktable[key] is rule's key (but key == 0 for unused rules);
29 'sib1' is rule's pattern pre-rule; 'sib2' is next rule; */ 32 'sib1' is rule's pattern pre-rule; 'sib2' is next rule;
30 TXInfo, /* extra info; 'n' is rule's sequential number */ 33 extra info 'n' is rule's sequential number */
34 TXInfo, /* extra info */
31 TGrammar, /* 'sib1' is initial (and first) rule */ 35 TGrammar, /* 'sib1' is initial (and first) rule */
32 TBehind, /* 'sib1' is pattern, 'n' is how much to go back */ 36 TBehind, /* 'sib1' is pattern, 'n' is how much to go back */
33 TCapture, /* captures: 'cap' is kind of capture (enum 'CapKind'); 37 TCapture, /* captures: 'cap' is kind of capture (enum 'CapKind');
diff --git a/lptypes.h b/lptypes.h
index 223d887..eea3d0c 100644
--- a/lptypes.h
+++ b/lptypes.h
@@ -37,6 +37,8 @@
37#define luaL_setfuncs(L,f,n) luaL_register(L,NULL,f) 37#define luaL_setfuncs(L,f,n) luaL_register(L,NULL,f)
38#define luaL_newlib(L,f) luaL_register(L,"lpeg",f) 38#define luaL_newlib(L,f) luaL_register(L,"lpeg",f)
39 39
40typedef size_t lua_Unsigned;
41
40#endif 42#endif
41 43
42 44
diff --git a/lpvm.c b/lpvm.c
index 737418c..dbe1a8e 100644
--- a/lpvm.c
+++ b/lpvm.c
@@ -28,6 +28,35 @@ static const Instruction giveup = {{IGiveup, 0, 0}};
28 28
29 29
30/* 30/*
31** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid.
32*/
33static const char *utf8_decode (const char *o, int *val) {
34 static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFFu};
35 const unsigned char *s = (const unsigned char *)o;
36 unsigned int c = s[0]; /* first byte */
37 unsigned int res = 0; /* final result */
38 if (c < 0x80) /* ascii? */
39 res = c;
40 else {
41 int count = 0; /* to count number of continuation bytes */
42 while (c & 0x40) { /* still have continuation bytes? */
43 int cc = s[++count]; /* read next byte */
44 if ((cc & 0xC0) != 0x80) /* not a continuation byte? */
45 return NULL; /* invalid byte sequence */
46 res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */
47 c <<= 1; /* to test next bit */
48 }
49 res |= (c & 0x7F) << (count * 5); /* add first byte */
50 if (count > 3 || res > 0x10FFFFu || res <= limits[count])
51 return NULL; /* invalid byte sequence */
52 s += count; /* skip continuation bytes read */
53 }
54 *val = res;
55 return (const char *)s + 1; /* +1 to include first byte */
56}
57
58
59/*
31** {====================================================== 60** {======================================================
32** Virtual Machine 61** Virtual Machine
33** ======================================================= 62** =======================================================
@@ -198,6 +227,17 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e,
198 else goto fail; 227 else goto fail;
199 continue; 228 continue;
200 } 229 }
230 case IUTFR: {
231 int codepoint;
232 if (s >= e)
233 goto fail;
234 s = utf8_decode (s, &codepoint);
235 if (s && p[1].offset <= codepoint && codepoint <= utf_to(p))
236 p += 2;
237 else
238 goto fail;
239 continue;
240 }
201 case ITestAny: { 241 case ITestAny: {
202 if (s < e) p += 2; 242 if (s < e) p += 2;
203 else p += getoffset(p); 243 else p += getoffset(p);
diff --git a/lpvm.h b/lpvm.h
index 576429f..9fde967 100644
--- a/lpvm.h
+++ b/lpvm.h
@@ -17,6 +17,7 @@ typedef enum Opcode {
17 ITestChar, /* if char != aux, jump to 'offset' */ 17 ITestChar, /* if char != aux, jump to 'offset' */
18 ITestSet, /* if char not in buff, jump to 'offset' */ 18 ITestSet, /* if char not in buff, jump to 'offset' */
19 ISpan, /* read a span of chars in buff */ 19 ISpan, /* read a span of chars in buff */
20 IUTFR, /* if codepoint not in range [offset, utf_to], fail */
20 IBehind, /* walk back 'aux' characters (fail if not possible) */ 21 IBehind, /* walk back 'aux' characters (fail if not possible) */
21 IRet, /* return from a rule */ 22 IRet, /* return from a rule */
22 IEnd, /* end of pattern */ 23 IEnd, /* end of pattern */
@@ -50,6 +51,10 @@ typedef union Instruction {
50} Instruction; 51} Instruction;
51 52
52 53
54/* extract 24-bit value from an instruction */
55#define utf_to(inst) (((inst)->i.key << 8) | (inst)->i.aux)
56
57
53void printpatt (Instruction *p, int n); 58void printpatt (Instruction *p, int n);
54const char *match (lua_State *L, const char *o, const char *s, const char *e, 59const char *match (lua_State *L, const char *o, const char *s, const char *e,
55 Instruction *op, Capture *capture, int ptop); 60 Instruction *op, Capture *capture, int ptop);
diff --git a/test.lua b/test.lua
index f57cdec..e86c21a 100755
--- a/test.lua
+++ b/test.lua
@@ -48,7 +48,6 @@ end
48 48
49print"General tests for LPeg library" 49print"General tests for LPeg library"
50 50
51assert(type(m.version()) == "string")
52print("version " .. m.version()) 51print("version " .. m.version())
53assert(m.type("alo") ~= "pattern") 52assert(m.type("alo") ~= "pattern")
54assert(m.type(io.input) ~= "pattern") 53assert(m.type(io.input) ~= "pattern")
@@ -1189,6 +1188,63 @@ do print"testing large grammars"
1189end 1188end
1190 1189
1191 1190
1191print "testing UTF-8 ranges"
1192
1193do -- a few typical UTF-8 ranges
1194 local p = m.utfR(0x410, 0x44f)^1 / "cyr: %0"
1195 + m.utfR(0x4e00, 0x9fff)^1 / "cjk: %0"
1196 + m.utfR(0x1F600, 0x1F64F)^1 / "emot: %0"
1197 + m.utfR(0, 0x7f)^1 / "ascii: %0"
1198 + m.utfR(0, 0x10ffff) / "other: %0"
1199
1200 p = m.Ct(p^0) * -m.P(1)
1201
1202 local cyr = "ждюя"
1203 local emot = "\240\159\152\128\240\159\153\128" -- 😀🙀
1204 local cjk = "专举乸"
1205 local ascii = "alo"
1206 local last = "\244\143\191\191" -- U+10FFFF
1207
1208 local s = cyr .. "—" .. emot .. "—" .. cjk .. "—" .. ascii .. last
1209 t = (p:match(s))
1210
1211 assert(t[1] == "cyr: " .. cyr and t[2] == "other: —" and
1212 t[3] == "emot: " .. emot and t[4] == "other: —" and
1213 t[5] == "cjk: " .. cjk and t[6] == "other: —" and
1214 t[7] == "ascii: " .. ascii and t[8] == "other: " .. last and
1215 t[9] == nil)
1216end
1217
1218
1219do -- valid and invalid code points
1220 local p = m.utfR(0, 0x10ffff)^0
1221 assert(p:match("汉字\128") == #"汉字" + 1)
1222 assert(p:match("\244\159\191") == 1)
1223 assert(p:match("\244\159\191\191") == 1)
1224 assert(p:match("\255") == 1)
1225
1226 -- basic errors
1227 checkerr("empty range", m.utfR, 1, 0)
1228 checkerr("invalid code point", m.utfR, 1, 0x10ffff + 1)
1229end
1230
1231
1232do -- back references (fixed width)
1233 -- match a byte after a CJK point
1234 local p = m.B(m.utfR(0x4e00, 0x9fff)) * m.C(1)
1235 p = m.P{ p + m.P(1) * m.V(1) } -- search for 'p'
1236 assert(p:match("ab д 专X x") == "X")
1237
1238 -- match a byte after a hebrew point
1239 local p = m.B(m.utfR(0x5d0, 0x5ea)) * m.C(1)
1240 p = m.P(#"ש") * p
1241 assert(p:match("שX") == "X")
1242
1243 checkerr("fixed length", m.B, m.utfR(0, 0x10ffff))
1244end
1245
1246
1247
1192------------------------------------------------------------------- 1248-------------------------------------------------------------------
1193-- Tests for 're' module 1249-- Tests for 're' module
1194------------------------------------------------------------------- 1250-------------------------------------------------------------------