diff options
author | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2019-04-17 14:08:22 -0300 |
---|---|---|
committer | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2019-04-17 14:08:22 -0300 |
commit | 24bf757183d8bd97f6f5b43d916814f3269c8347 (patch) | |
tree | 646cd65d6e2dab57691f98f83f15f25c70685ef8 | |
parent | 3f7797419e4d7493e1364290a5b127d1cb45e3bf (diff) | |
download | lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.gz lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.bz2 lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.zip |
Implementation of UTF-8 ranges
New constructor 'lpeg.utfR(from, to)' creates a pattern that matches
UTF-8 byte sequences representing code points in the range [from, to].
-rw-r--r-- | lpcode.c | 43 | ||||
-rw-r--r-- | lpeg.html | 12 | ||||
-rw-r--r-- | lpprint.c | 19 | ||||
-rw-r--r-- | lptree.c | 55 | ||||
-rw-r--r-- | lptree.h | 8 | ||||
-rw-r--r-- | lptypes.h | 2 | ||||
-rw-r--r-- | lpvm.c | 40 | ||||
-rw-r--r-- | lpvm.h | 5 | ||||
-rwxr-xr-x | test.lua | 58 |
9 files changed, 222 insertions, 20 deletions
@@ -196,7 +196,7 @@ int hascaptures (TTree *tree) { | |||
196 | int checkaux (TTree *tree, int pred) { | 196 | int checkaux (TTree *tree, int pred) { |
197 | tailcall: | 197 | tailcall: |
198 | switch (tree->tag) { | 198 | switch (tree->tag) { |
199 | case TChar: case TSet: case TAny: | 199 | case TChar: case TSet: case TAny: case TUTFR: |
200 | case TFalse: case TOpenCall: | 200 | case TFalse: case TOpenCall: |
201 | return 0; /* not nullable */ | 201 | return 0; /* not nullable */ |
202 | case TRep: case TTrue: | 202 | case TRep: case TTrue: |
@@ -239,6 +239,8 @@ int fixedlen (TTree *tree) { | |||
239 | switch (tree->tag) { | 239 | switch (tree->tag) { |
240 | case TChar: case TSet: case TAny: | 240 | case TChar: case TSet: case TAny: |
241 | return len + 1; | 241 | return len + 1; |
242 | case TUTFR: | ||
243 | return (tree->cap == sib1(tree)->cap) ? len + tree->cap : -1; | ||
242 | case TFalse: case TTrue: case TNot: case TAnd: case TBehind: | 244 | case TFalse: case TTrue: case TNot: case TAnd: case TBehind: |
243 | return len; | 245 | return len; |
244 | case TRep: case TRunTime: case TOpenCall: | 246 | case TRep: case TRunTime: case TOpenCall: |
@@ -298,6 +300,13 @@ static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { | |||
298 | tocharset(tree, firstset); | 300 | tocharset(tree, firstset); |
299 | return 0; | 301 | return 0; |
300 | } | 302 | } |
303 | case TUTFR: { | ||
304 | int c; | ||
305 | loopset(i, firstset->cs[i] = 0); /* erase all chars */ | ||
306 | for (c = tree->key; c <= sib1(tree)->key; c++) | ||
307 | setchar(firstset->cs, c); | ||
308 | return 0; | ||
309 | } | ||
301 | case TTrue: { | 310 | case TTrue: { |
302 | loopset(i, firstset->cs[i] = follow->cs[i]); | 311 | loopset(i, firstset->cs[i] = follow->cs[i]); |
303 | return 1; /* accepts the empty string */ | 312 | return 1; /* accepts the empty string */ |
@@ -380,7 +389,7 @@ static int headfail (TTree *tree) { | |||
380 | case TChar: case TSet: case TAny: case TFalse: | 389 | case TChar: case TSet: case TAny: case TFalse: |
381 | return 1; | 390 | return 1; |
382 | case TTrue: case TRep: case TRunTime: case TNot: | 391 | case TTrue: case TRep: case TRunTime: case TNot: |
383 | case TBehind: | 392 | case TBehind: case TUTFR: |
384 | return 0; | 393 | return 0; |
385 | case TCapture: case TGrammar: case TRule: case TXInfo: case TAnd: | 394 | case TCapture: case TGrammar: case TRule: case TXInfo: case TAnd: |
386 | tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */ | 395 | tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */ |
@@ -407,7 +416,7 @@ static int headfail (TTree *tree) { | |||
407 | static int needfollow (TTree *tree) { | 416 | static int needfollow (TTree *tree) { |
408 | tailcall: | 417 | tailcall: |
409 | switch (tree->tag) { | 418 | switch (tree->tag) { |
410 | case TChar: case TSet: case TAny: | 419 | case TChar: case TSet: case TAny: case TUTFR: |
411 | case TFalse: case TTrue: case TAnd: case TNot: | 420 | case TFalse: case TTrue: case TAnd: case TNot: |
412 | case TRunTime: case TGrammar: case TCall: case TBehind: | 421 | case TRunTime: case TGrammar: case TCall: case TBehind: |
413 | return 0; | 422 | return 0; |
@@ -418,7 +427,7 @@ static int needfollow (TTree *tree) { | |||
418 | case TSeq: | 427 | case TSeq: |
419 | tree = sib2(tree); goto tailcall; | 428 | tree = sib2(tree); goto tailcall; |
420 | default: assert(0); return 0; | 429 | default: assert(0); return 0; |
421 | } | 430 | } |
422 | } | 431 | } |
423 | 432 | ||
424 | /* }====================================================== */ | 433 | /* }====================================================== */ |
@@ -441,6 +450,7 @@ int sizei (const Instruction *i) { | |||
441 | case ITestSet: return CHARSETINSTSIZE + 1; | 450 | case ITestSet: return CHARSETINSTSIZE + 1; |
442 | case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall: | 451 | case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall: |
443 | case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit: | 452 | case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit: |
453 | case IUTFR: | ||
444 | return 2; | 454 | return 2; |
445 | default: return 1; | 455 | default: return 1; |
446 | } | 456 | } |
@@ -518,6 +528,16 @@ static void setoffset (CompileState *compst, int instruction, int offset) { | |||
518 | } | 528 | } |
519 | 529 | ||
520 | 530 | ||
531 | static void codeutfr (CompileState *compst, TTree *tree) { | ||
532 | int i = addoffsetinst(compst, IUTFR); | ||
533 | int to = sib1(tree)->u.n; | ||
534 | assert(sib1(tree)->tag == TXInfo); | ||
535 | getinstr(compst, i + 1).offset = tree->u.n; | ||
536 | getinstr(compst, i).i.aux = to & 0xff; | ||
537 | getinstr(compst, i).i.key = to >> 8; | ||
538 | } | ||
539 | |||
540 | |||
521 | /* | 541 | /* |
522 | ** Add a capture instruction: | 542 | ** Add a capture instruction: |
523 | ** 'op' is the capture instruction; 'cap' the capture kind; | 543 | ** 'op' is the capture instruction; 'cap' the capture kind; |
@@ -665,11 +685,11 @@ static void codebehind (CompileState *compst, TTree *tree) { | |||
665 | 685 | ||
666 | /* | 686 | /* |
667 | ** Choice; optimizations: | 687 | ** Choice; optimizations: |
668 | ** - when p1 is headfail or | 688 | ** - when p1 is headfail or when first(p1) and first(p2) are disjoint, |
669 | ** when first(p1) and first(p2) are disjoint, than | 689 | ** than a character not in first(p1) cannot go to p1 and a character |
670 | ** a character not in first(p1) cannot go to p1, and a character | 690 | ** in first(p1) cannot go to p2, either because p1 will accept |
671 | ** in first(p1) cannot go to p2 (at it is not in first(p2)). | 691 | ** (headfail) or because it is not in first(p2) (disjoint). |
672 | ** (The optimization is not valid if p1 accepts the empty string, | 692 | ** (The second case is not valid if p1 accepts the empty string, |
673 | ** as then there is no character at all...) | 693 | ** as then there is no character at all...) |
674 | ** - when p2 is empty and opt is true; a IPartialCommit can reuse | 694 | ** - when p2 is empty and opt is true; a IPartialCommit can reuse |
675 | ** the Choice already active in the stack. | 695 | ** the Choice already active in the stack. |
@@ -686,7 +706,7 @@ static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt, | |||
686 | int jmp = NOINST; | 706 | int jmp = NOINST; |
687 | codegen(compst, p1, 0, test, fl); | 707 | codegen(compst, p1, 0, test, fl); |
688 | if (!emptyp2) | 708 | if (!emptyp2) |
689 | jmp = addoffsetinst(compst, IJmp); | 709 | jmp = addoffsetinst(compst, IJmp); |
690 | jumptohere(compst, test); | 710 | jumptohere(compst, test); |
691 | codegen(compst, p2, opt, NOINST, fl); | 711 | codegen(compst, p2, opt, NOINST, fl); |
692 | jumptohere(compst, jmp); | 712 | jumptohere(compst, jmp); |
@@ -697,7 +717,7 @@ static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt, | |||
697 | codegen(compst, p1, 1, NOINST, fullset); | 717 | codegen(compst, p1, 1, NOINST, fullset); |
698 | } | 718 | } |
699 | else { | 719 | else { |
700 | /* <p1 / p2> == | 720 | /* <p1 / p2> == |
701 | test(first(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: */ | 721 | test(first(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: */ |
702 | int pcommit; | 722 | int pcommit; |
703 | int test = codetestset(compst, &cs1, e1); | 723 | int test = codetestset(compst, &cs1, e1); |
@@ -927,6 +947,7 @@ static void codegen (CompileState *compst, TTree *tree, int opt, int tt, | |||
927 | case TSet: codecharset(compst, treebuffer(tree), tt); break; | 947 | case TSet: codecharset(compst, treebuffer(tree), tt); break; |
928 | case TTrue: break; | 948 | case TTrue: break; |
929 | case TFalse: addinstruction(compst, IFail, 0); break; | 949 | case TFalse: addinstruction(compst, IFail, 0); break; |
950 | case TUTFR: codeutfr(compst, tree); break; | ||
930 | case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break; | 951 | case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break; |
931 | case TRep: coderep(compst, sib1(tree), opt, fl); break; | 952 | case TRep: coderep(compst, sib1(tree), opt, fl); break; |
932 | case TBehind: codebehind(compst, tree); break; | 953 | case TBehind: codebehind(compst, tree); break; |
@@ -107,6 +107,9 @@ for creating patterns: | |||
107 | <td>Matches any character in <code>string</code> (Set)</td></tr> | 107 | <td>Matches any character in <code>string</code> (Set)</td></tr> |
108 | <tr><td><a href="#op-r"><code>lpeg.R("<em>xy</em>")</code></a></td> | 108 | <tr><td><a href="#op-r"><code>lpeg.R("<em>xy</em>")</code></a></td> |
109 | <td>Matches any character between <em>x</em> and <em>y</em> (Range)</td></tr> | 109 | <td>Matches any character between <em>x</em> and <em>y</em> (Range)</td></tr> |
110 | <tr><td><a href="#op-utfR"><code>lpeg.utfR(cp1, cp2)</code></a></td> | ||
111 | <td>Matches an UTF-8 code point between <code>cp1</code> and | ||
112 | <code>cp2</code></td></tr> | ||
110 | <tr><td><a href="#op-pow"><code>patt^n</code></a></td> | 113 | <tr><td><a href="#op-pow"><code>patt^n</code></a></td> |
111 | <td>Matches at least <code>n</code> repetitions of <code>patt</code></td></tr> | 114 | <td>Matches at least <code>n</code> repetitions of <code>patt</code></td></tr> |
112 | <tr><td><a href="#op-pow"><code>patt^-n</code></a></td> | 115 | <tr><td><a href="#op-pow"><code>patt^-n</code></a></td> |
@@ -329,6 +332,15 @@ are patterns that always fail. | |||
329 | </p> | 332 | </p> |
330 | 333 | ||
331 | 334 | ||
335 | <h3><a name="op-utfR"></a><code>lpeg.utfR (cp1, cp2)</code></h3> | ||
336 | <p> | ||
337 | Returns a pattern that matches a valid UTF-8 byte sequence | ||
338 | representing a code point in the range <code>[cp1, cp2]</code>. | ||
339 | The range is limited by the natural Unicode limit of 0x10FFFF, | ||
340 | but may include surrogates. | ||
341 | </p> | ||
342 | |||
343 | |||
332 | <h3><a name="op-v"></a><code>lpeg.V (v)</code></h3> | 344 | <h3><a name="op-v"></a><code>lpeg.V (v)</code></h3> |
333 | <p> | 345 | <p> |
334 | This operation creates a non-terminal (a <em>variable</em>) | 346 | This operation creates a non-terminal (a <em>variable</em>) |
@@ -56,7 +56,7 @@ void printinst (const Instruction *op, const Instruction *p) { | |||
56 | const char *const names[] = { | 56 | const char *const names[] = { |
57 | "any", "char", "set", | 57 | "any", "char", "set", |
58 | "testany", "testchar", "testset", | 58 | "testany", "testchar", "testset", |
59 | "span", "behind", | 59 | "span", "utf-range", "behind", |
60 | "ret", "end", | 60 | "ret", "end", |
61 | "choice", "jmp", "call", "open_call", | 61 | "choice", "jmp", "call", "open_call", |
62 | "commit", "partial_commit", "back_commit", "failtwice", "fail", "giveup", | 62 | "commit", "partial_commit", "back_commit", "failtwice", "fail", "giveup", |
@@ -66,11 +66,15 @@ void printinst (const Instruction *op, const Instruction *p) { | |||
66 | printf("%02ld: %s ", (long)(p - op), names[p->i.code]); | 66 | printf("%02ld: %s ", (long)(p - op), names[p->i.code]); |
67 | switch ((Opcode)p->i.code) { | 67 | switch ((Opcode)p->i.code) { |
68 | case IChar: { | 68 | case IChar: { |
69 | printf("'%c'", p->i.aux); | 69 | printf("'%c' (%02x)", p->i.aux, p->i.aux); |
70 | break; | 70 | break; |
71 | } | 71 | } |
72 | case ITestChar: { | 72 | case ITestChar: { |
73 | printf("'%c'", p->i.aux); printjmp(op, p); | 73 | printf("'%c' (%02x)", p->i.aux, p->i.aux); printjmp(op, p); |
74 | break; | ||
75 | } | ||
76 | case IUTFR: { | ||
77 | printf("%d - %d", p[1].offset, utf_to(p)); | ||
74 | break; | 78 | break; |
75 | } | 79 | } |
76 | case IFullCapture: { | 80 | case IFullCapture: { |
@@ -148,7 +152,7 @@ void printcaplist (Capture *cap, Capture *limit) { | |||
148 | 152 | ||
149 | static const char *tagnames[] = { | 153 | static const char *tagnames[] = { |
150 | "char", "set", "any", | 154 | "char", "set", "any", |
151 | "true", "false", | 155 | "true", "false", "utf8.range", |
152 | "rep", | 156 | "rep", |
153 | "seq", "choice", | 157 | "seq", "choice", |
154 | "not", "and", | 158 | "not", "and", |
@@ -177,6 +181,13 @@ void printtree (TTree *tree, int ident) { | |||
177 | printf("\n"); | 181 | printf("\n"); |
178 | break; | 182 | break; |
179 | } | 183 | } |
184 | case TUTFR: { | ||
185 | assert(sib1(tree)->tag == TXInfo); | ||
186 | printf(" %d (%02x %d) - %d (%02x %d) \n", | ||
187 | tree->u.n, tree->key, tree->cap, | ||
188 | sib1(tree)->u.n, sib1(tree)->key, sib1(tree)->cap); | ||
189 | break; | ||
190 | } | ||
180 | case TOpenCall: case TCall: { | 191 | case TOpenCall: case TCall: { |
181 | assert(sib1(sib2(tree))->tag == TXInfo); | 192 | assert(sib1(sib2(tree))->tag == TXInfo); |
182 | printf(" key: %d (rule: %d)\n", tree->key, sib1(sib2(tree))->u.n); | 193 | printf(" key: %d (rule: %d)\n", tree->key, sib1(sib2(tree))->u.n); |
@@ -21,7 +21,7 @@ | |||
21 | /* number of siblings for each tree */ | 21 | /* number of siblings for each tree */ |
22 | const byte numsiblings[] = { | 22 | const byte numsiblings[] = { |
23 | 0, 0, 0, /* char, set, any */ | 23 | 0, 0, 0, /* char, set, any */ |
24 | 0, 0, /* true, false */ | 24 | 0, 0, 0, /* true, false, utf-range */ |
25 | 1, /* rep */ | 25 | 1, /* rep */ |
26 | 2, 2, /* seq, choice */ | 26 | 2, 2, /* seq, choice */ |
27 | 1, 1, /* not, and */ | 27 | 1, 1, /* not, and */ |
@@ -675,6 +675,56 @@ static int lp_range (lua_State *L) { | |||
675 | 675 | ||
676 | 676 | ||
677 | /* | 677 | /* |
678 | ** Fills a tree node with basic information about the UTF-8 code point | ||
679 | ** 'cpu': its value in 'n', its length in 'cap', and its first byte in | ||
680 | ** 'key' | ||
681 | */ | ||
682 | static void codeutftree (lua_State *L, TTree *t, lua_Unsigned cpu, int arg) { | ||
683 | int len, fb, cp; | ||
684 | cp = (int)cpu; | ||
685 | if (cp <= 0x7f) { /* one byte? */ | ||
686 | len = 1; | ||
687 | fb = cp; | ||
688 | } else if (cp <= 0x7ff) { | ||
689 | len = 2; | ||
690 | fb = 0xC0 | (cp >> 6); | ||
691 | } else if (cp <= 0xffff) { | ||
692 | len = 3; | ||
693 | fb = 0xE0 | (cp >> 12); | ||
694 | } | ||
695 | else { | ||
696 | luaL_argcheck(L, cpu <= 0x10ffffu, arg, "invalid code point"); | ||
697 | len = 4; | ||
698 | fb = 0xF0 | (cp >> 18); | ||
699 | } | ||
700 | t->u.n = cp; | ||
701 | t->cap = len; | ||
702 | t->key = fb; | ||
703 | } | ||
704 | |||
705 | |||
706 | static int lp_utfr (lua_State *L) { | ||
707 | lua_Unsigned from = (lua_Unsigned)luaL_checkinteger(L, 1); | ||
708 | lua_Unsigned to = (lua_Unsigned)luaL_checkinteger(L, 2); | ||
709 | luaL_argcheck(L, from <= to, 2, "empty range"); | ||
710 | if (to <= 0x7f) { /* ascii range? */ | ||
711 | TTree *tree = newcharset(L); /* code it as a regular charset */ | ||
712 | unsigned int f; | ||
713 | for (f = (int)from; f <= to; f++) | ||
714 | setchar(treebuffer(tree), f); | ||
715 | } | ||
716 | else { /* multi-byte utf-8 range */ | ||
717 | TTree *tree = newtree(L, 2); | ||
718 | tree->tag = TUTFR; | ||
719 | codeutftree(L, tree, from, 1); | ||
720 | sib1(tree)->tag = TXInfo; | ||
721 | codeutftree(L, sib1(tree), to, 2); | ||
722 | } | ||
723 | return 1; | ||
724 | } | ||
725 | |||
726 | |||
727 | /* | ||
678 | ** Look-behind predicate | 728 | ** Look-behind predicate |
679 | */ | 729 | */ |
680 | static int lp_behind (lua_State *L) { | 730 | static int lp_behind (lua_State *L) { |
@@ -1008,7 +1058,7 @@ static int verifyrule (lua_State *L, TTree *tree, unsigned short *passed, | |||
1008 | tailcall: | 1058 | tailcall: |
1009 | switch (tree->tag) { | 1059 | switch (tree->tag) { |
1010 | case TChar: case TSet: case TAny: | 1060 | case TChar: case TSet: case TAny: |
1011 | case TFalse: | 1061 | case TFalse: case TUTFR: |
1012 | return nb; /* cannot pass from here */ | 1062 | return nb; /* cannot pass from here */ |
1013 | case TTrue: | 1063 | case TTrue: |
1014 | case TBehind: /* look-behind cannot have calls */ | 1064 | case TBehind: /* look-behind cannot have calls */ |
@@ -1271,6 +1321,7 @@ static struct luaL_Reg pattreg[] = { | |||
1271 | {"P", lp_P}, | 1321 | {"P", lp_P}, |
1272 | {"S", lp_set}, | 1322 | {"S", lp_set}, |
1273 | {"R", lp_range}, | 1323 | {"R", lp_range}, |
1324 | {"utfR", lp_utfr}, | ||
1274 | {"locale", lp_locale}, | 1325 | {"locale", lp_locale}, |
1275 | {"version", lp_version}, | 1326 | {"version", lp_version}, |
1276 | {"setmaxstack", lp_setmax}, | 1327 | {"setmaxstack", lp_setmax}, |
@@ -18,6 +18,9 @@ typedef enum TTag { | |||
18 | TAny, | 18 | TAny, |
19 | TTrue, | 19 | TTrue, |
20 | TFalse, | 20 | TFalse, |
21 | TUTFR, /* range of UTF-8 codepoints; 'n' has initial codepoint; | ||
22 | 'cap' has length; 'key' has first byte; | ||
23 | extra info is similar for end codepoint */ | ||
21 | TRep, /* 'sib1'* */ | 24 | TRep, /* 'sib1'* */ |
22 | TSeq, /* 'sib1' 'sib2' */ | 25 | TSeq, /* 'sib1' 'sib2' */ |
23 | TChoice, /* 'sib1' / 'sib2' */ | 26 | TChoice, /* 'sib1' / 'sib2' */ |
@@ -26,8 +29,9 @@ typedef enum TTag { | |||
26 | TCall, /* ktable[key] is rule's key; 'sib2' is rule being called */ | 29 | TCall, /* ktable[key] is rule's key; 'sib2' is rule being called */ |
27 | TOpenCall, /* ktable[key] is rule's key */ | 30 | TOpenCall, /* ktable[key] is rule's key */ |
28 | TRule, /* ktable[key] is rule's key (but key == 0 for unused rules); | 31 | TRule, /* ktable[key] is rule's key (but key == 0 for unused rules); |
29 | 'sib1' is rule's pattern pre-rule; 'sib2' is next rule; */ | 32 | 'sib1' is rule's pattern pre-rule; 'sib2' is next rule; |
30 | TXInfo, /* extra info; 'n' is rule's sequential number */ | 33 | extra info 'n' is rule's sequential number */ |
34 | TXInfo, /* extra info */ | ||
31 | TGrammar, /* 'sib1' is initial (and first) rule */ | 35 | TGrammar, /* 'sib1' is initial (and first) rule */ |
32 | TBehind, /* 'sib1' is pattern, 'n' is how much to go back */ | 36 | TBehind, /* 'sib1' is pattern, 'n' is how much to go back */ |
33 | TCapture, /* captures: 'cap' is kind of capture (enum 'CapKind'); | 37 | TCapture, /* captures: 'cap' is kind of capture (enum 'CapKind'); |
@@ -37,6 +37,8 @@ | |||
37 | #define luaL_setfuncs(L,f,n) luaL_register(L,NULL,f) | 37 | #define luaL_setfuncs(L,f,n) luaL_register(L,NULL,f) |
38 | #define luaL_newlib(L,f) luaL_register(L,"lpeg",f) | 38 | #define luaL_newlib(L,f) luaL_register(L,"lpeg",f) |
39 | 39 | ||
40 | typedef size_t lua_Unsigned; | ||
41 | |||
40 | #endif | 42 | #endif |
41 | 43 | ||
42 | 44 | ||
@@ -28,6 +28,35 @@ static const Instruction giveup = {{IGiveup, 0, 0}}; | |||
28 | 28 | ||
29 | 29 | ||
30 | /* | 30 | /* |
31 | ** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid. | ||
32 | */ | ||
33 | static const char *utf8_decode (const char *o, int *val) { | ||
34 | static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFFu}; | ||
35 | const unsigned char *s = (const unsigned char *)o; | ||
36 | unsigned int c = s[0]; /* first byte */ | ||
37 | unsigned int res = 0; /* final result */ | ||
38 | if (c < 0x80) /* ascii? */ | ||
39 | res = c; | ||
40 | else { | ||
41 | int count = 0; /* to count number of continuation bytes */ | ||
42 | while (c & 0x40) { /* still have continuation bytes? */ | ||
43 | int cc = s[++count]; /* read next byte */ | ||
44 | if ((cc & 0xC0) != 0x80) /* not a continuation byte? */ | ||
45 | return NULL; /* invalid byte sequence */ | ||
46 | res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */ | ||
47 | c <<= 1; /* to test next bit */ | ||
48 | } | ||
49 | res |= (c & 0x7F) << (count * 5); /* add first byte */ | ||
50 | if (count > 3 || res > 0x10FFFFu || res <= limits[count]) | ||
51 | return NULL; /* invalid byte sequence */ | ||
52 | s += count; /* skip continuation bytes read */ | ||
53 | } | ||
54 | *val = res; | ||
55 | return (const char *)s + 1; /* +1 to include first byte */ | ||
56 | } | ||
57 | |||
58 | |||
59 | /* | ||
31 | ** {====================================================== | 60 | ** {====================================================== |
32 | ** Virtual Machine | 61 | ** Virtual Machine |
33 | ** ======================================================= | 62 | ** ======================================================= |
@@ -198,6 +227,17 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e, | |||
198 | else goto fail; | 227 | else goto fail; |
199 | continue; | 228 | continue; |
200 | } | 229 | } |
230 | case IUTFR: { | ||
231 | int codepoint; | ||
232 | if (s >= e) | ||
233 | goto fail; | ||
234 | s = utf8_decode (s, &codepoint); | ||
235 | if (s && p[1].offset <= codepoint && codepoint <= utf_to(p)) | ||
236 | p += 2; | ||
237 | else | ||
238 | goto fail; | ||
239 | continue; | ||
240 | } | ||
201 | case ITestAny: { | 241 | case ITestAny: { |
202 | if (s < e) p += 2; | 242 | if (s < e) p += 2; |
203 | else p += getoffset(p); | 243 | else p += getoffset(p); |
@@ -17,6 +17,7 @@ typedef enum Opcode { | |||
17 | ITestChar, /* if char != aux, jump to 'offset' */ | 17 | ITestChar, /* if char != aux, jump to 'offset' */ |
18 | ITestSet, /* if char not in buff, jump to 'offset' */ | 18 | ITestSet, /* if char not in buff, jump to 'offset' */ |
19 | ISpan, /* read a span of chars in buff */ | 19 | ISpan, /* read a span of chars in buff */ |
20 | IUTFR, /* if codepoint not in range [offset, utf_to], fail */ | ||
20 | IBehind, /* walk back 'aux' characters (fail if not possible) */ | 21 | IBehind, /* walk back 'aux' characters (fail if not possible) */ |
21 | IRet, /* return from a rule */ | 22 | IRet, /* return from a rule */ |
22 | IEnd, /* end of pattern */ | 23 | IEnd, /* end of pattern */ |
@@ -50,6 +51,10 @@ typedef union Instruction { | |||
50 | } Instruction; | 51 | } Instruction; |
51 | 52 | ||
52 | 53 | ||
54 | /* extract 24-bit value from an instruction */ | ||
55 | #define utf_to(inst) (((inst)->i.key << 8) | (inst)->i.aux) | ||
56 | |||
57 | |||
53 | void printpatt (Instruction *p, int n); | 58 | void printpatt (Instruction *p, int n); |
54 | const char *match (lua_State *L, const char *o, const char *s, const char *e, | 59 | const char *match (lua_State *L, const char *o, const char *s, const char *e, |
55 | Instruction *op, Capture *capture, int ptop); | 60 | Instruction *op, Capture *capture, int ptop); |
@@ -48,7 +48,6 @@ end | |||
48 | 48 | ||
49 | print"General tests for LPeg library" | 49 | print"General tests for LPeg library" |
50 | 50 | ||
51 | assert(type(m.version()) == "string") | ||
52 | print("version " .. m.version()) | 51 | print("version " .. m.version()) |
53 | assert(m.type("alo") ~= "pattern") | 52 | assert(m.type("alo") ~= "pattern") |
54 | assert(m.type(io.input) ~= "pattern") | 53 | assert(m.type(io.input) ~= "pattern") |
@@ -1189,6 +1188,63 @@ do print"testing large grammars" | |||
1189 | end | 1188 | end |
1190 | 1189 | ||
1191 | 1190 | ||
1191 | print "testing UTF-8 ranges" | ||
1192 | |||
1193 | do -- a few typical UTF-8 ranges | ||
1194 | local p = m.utfR(0x410, 0x44f)^1 / "cyr: %0" | ||
1195 | + m.utfR(0x4e00, 0x9fff)^1 / "cjk: %0" | ||
1196 | + m.utfR(0x1F600, 0x1F64F)^1 / "emot: %0" | ||
1197 | + m.utfR(0, 0x7f)^1 / "ascii: %0" | ||
1198 | + m.utfR(0, 0x10ffff) / "other: %0" | ||
1199 | |||
1200 | p = m.Ct(p^0) * -m.P(1) | ||
1201 | |||
1202 | local cyr = "ждюя" | ||
1203 | local emot = "\240\159\152\128\240\159\153\128" -- 😀🙀 | ||
1204 | local cjk = "专举乸" | ||
1205 | local ascii = "alo" | ||
1206 | local last = "\244\143\191\191" -- U+10FFFF | ||
1207 | |||
1208 | local s = cyr .. "—" .. emot .. "—" .. cjk .. "—" .. ascii .. last | ||
1209 | t = (p:match(s)) | ||
1210 | |||
1211 | assert(t[1] == "cyr: " .. cyr and t[2] == "other: —" and | ||
1212 | t[3] == "emot: " .. emot and t[4] == "other: —" and | ||
1213 | t[5] == "cjk: " .. cjk and t[6] == "other: —" and | ||
1214 | t[7] == "ascii: " .. ascii and t[8] == "other: " .. last and | ||
1215 | t[9] == nil) | ||
1216 | end | ||
1217 | |||
1218 | |||
1219 | do -- valid and invalid code points | ||
1220 | local p = m.utfR(0, 0x10ffff)^0 | ||
1221 | assert(p:match("汉字\128") == #"汉字" + 1) | ||
1222 | assert(p:match("\244\159\191") == 1) | ||
1223 | assert(p:match("\244\159\191\191") == 1) | ||
1224 | assert(p:match("\255") == 1) | ||
1225 | |||
1226 | -- basic errors | ||
1227 | checkerr("empty range", m.utfR, 1, 0) | ||
1228 | checkerr("invalid code point", m.utfR, 1, 0x10ffff + 1) | ||
1229 | end | ||
1230 | |||
1231 | |||
1232 | do -- back references (fixed width) | ||
1233 | -- match a byte after a CJK point | ||
1234 | local p = m.B(m.utfR(0x4e00, 0x9fff)) * m.C(1) | ||
1235 | p = m.P{ p + m.P(1) * m.V(1) } -- search for 'p' | ||
1236 | assert(p:match("ab д 专X x") == "X") | ||
1237 | |||
1238 | -- match a byte after a hebrew point | ||
1239 | local p = m.B(m.utfR(0x5d0, 0x5ea)) * m.C(1) | ||
1240 | p = m.P(#"ש") * p | ||
1241 | assert(p:match("שX") == "X") | ||
1242 | |||
1243 | checkerr("fixed length", m.B, m.utfR(0, 0x10ffff)) | ||
1244 | end | ||
1245 | |||
1246 | |||
1247 | |||
1192 | ------------------------------------------------------------------- | 1248 | ------------------------------------------------------------------- |
1193 | -- Tests for 're' module | 1249 | -- Tests for 're' module |
1194 | ------------------------------------------------------------------- | 1250 | ------------------------------------------------------------------- |