diff options
| author | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2019-04-17 14:08:22 -0300 |
|---|---|---|
| committer | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2019-04-17 14:08:22 -0300 |
| commit | 24bf757183d8bd97f6f5b43d916814f3269c8347 (patch) | |
| tree | 646cd65d6e2dab57691f98f83f15f25c70685ef8 | |
| parent | 3f7797419e4d7493e1364290a5b127d1cb45e3bf (diff) | |
| download | lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.gz lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.bz2 lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.zip | |
Implementation of UTF-8 ranges
New constructor 'lpeg.utfR(from, to)' creates a pattern that matches
UTF-8 byte sequences representing code points in the range [from, to].
| -rw-r--r-- | lpcode.c | 43 | ||||
| -rw-r--r-- | lpeg.html | 12 | ||||
| -rw-r--r-- | lpprint.c | 19 | ||||
| -rw-r--r-- | lptree.c | 55 | ||||
| -rw-r--r-- | lptree.h | 8 | ||||
| -rw-r--r-- | lptypes.h | 2 | ||||
| -rw-r--r-- | lpvm.c | 40 | ||||
| -rw-r--r-- | lpvm.h | 5 | ||||
| -rwxr-xr-x | test.lua | 58 |
9 files changed, 222 insertions, 20 deletions
| @@ -196,7 +196,7 @@ int hascaptures (TTree *tree) { | |||
| 196 | int checkaux (TTree *tree, int pred) { | 196 | int checkaux (TTree *tree, int pred) { |
| 197 | tailcall: | 197 | tailcall: |
| 198 | switch (tree->tag) { | 198 | switch (tree->tag) { |
| 199 | case TChar: case TSet: case TAny: | 199 | case TChar: case TSet: case TAny: case TUTFR: |
| 200 | case TFalse: case TOpenCall: | 200 | case TFalse: case TOpenCall: |
| 201 | return 0; /* not nullable */ | 201 | return 0; /* not nullable */ |
| 202 | case TRep: case TTrue: | 202 | case TRep: case TTrue: |
| @@ -239,6 +239,8 @@ int fixedlen (TTree *tree) { | |||
| 239 | switch (tree->tag) { | 239 | switch (tree->tag) { |
| 240 | case TChar: case TSet: case TAny: | 240 | case TChar: case TSet: case TAny: |
| 241 | return len + 1; | 241 | return len + 1; |
| 242 | case TUTFR: | ||
| 243 | return (tree->cap == sib1(tree)->cap) ? len + tree->cap : -1; | ||
| 242 | case TFalse: case TTrue: case TNot: case TAnd: case TBehind: | 244 | case TFalse: case TTrue: case TNot: case TAnd: case TBehind: |
| 243 | return len; | 245 | return len; |
| 244 | case TRep: case TRunTime: case TOpenCall: | 246 | case TRep: case TRunTime: case TOpenCall: |
| @@ -298,6 +300,13 @@ static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { | |||
| 298 | tocharset(tree, firstset); | 300 | tocharset(tree, firstset); |
| 299 | return 0; | 301 | return 0; |
| 300 | } | 302 | } |
| 303 | case TUTFR: { | ||
| 304 | int c; | ||
| 305 | loopset(i, firstset->cs[i] = 0); /* erase all chars */ | ||
| 306 | for (c = tree->key; c <= sib1(tree)->key; c++) | ||
| 307 | setchar(firstset->cs, c); | ||
| 308 | return 0; | ||
| 309 | } | ||
| 301 | case TTrue: { | 310 | case TTrue: { |
| 302 | loopset(i, firstset->cs[i] = follow->cs[i]); | 311 | loopset(i, firstset->cs[i] = follow->cs[i]); |
| 303 | return 1; /* accepts the empty string */ | 312 | return 1; /* accepts the empty string */ |
| @@ -380,7 +389,7 @@ static int headfail (TTree *tree) { | |||
| 380 | case TChar: case TSet: case TAny: case TFalse: | 389 | case TChar: case TSet: case TAny: case TFalse: |
| 381 | return 1; | 390 | return 1; |
| 382 | case TTrue: case TRep: case TRunTime: case TNot: | 391 | case TTrue: case TRep: case TRunTime: case TNot: |
| 383 | case TBehind: | 392 | case TBehind: case TUTFR: |
| 384 | return 0; | 393 | return 0; |
| 385 | case TCapture: case TGrammar: case TRule: case TXInfo: case TAnd: | 394 | case TCapture: case TGrammar: case TRule: case TXInfo: case TAnd: |
| 386 | tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */ | 395 | tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */ |
| @@ -407,7 +416,7 @@ static int headfail (TTree *tree) { | |||
| 407 | static int needfollow (TTree *tree) { | 416 | static int needfollow (TTree *tree) { |
| 408 | tailcall: | 417 | tailcall: |
| 409 | switch (tree->tag) { | 418 | switch (tree->tag) { |
| 410 | case TChar: case TSet: case TAny: | 419 | case TChar: case TSet: case TAny: case TUTFR: |
| 411 | case TFalse: case TTrue: case TAnd: case TNot: | 420 | case TFalse: case TTrue: case TAnd: case TNot: |
| 412 | case TRunTime: case TGrammar: case TCall: case TBehind: | 421 | case TRunTime: case TGrammar: case TCall: case TBehind: |
| 413 | return 0; | 422 | return 0; |
| @@ -418,7 +427,7 @@ static int needfollow (TTree *tree) { | |||
| 418 | case TSeq: | 427 | case TSeq: |
| 419 | tree = sib2(tree); goto tailcall; | 428 | tree = sib2(tree); goto tailcall; |
| 420 | default: assert(0); return 0; | 429 | default: assert(0); return 0; |
| 421 | } | 430 | } |
| 422 | } | 431 | } |
| 423 | 432 | ||
| 424 | /* }====================================================== */ | 433 | /* }====================================================== */ |
| @@ -441,6 +450,7 @@ int sizei (const Instruction *i) { | |||
| 441 | case ITestSet: return CHARSETINSTSIZE + 1; | 450 | case ITestSet: return CHARSETINSTSIZE + 1; |
| 442 | case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall: | 451 | case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall: |
| 443 | case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit: | 452 | case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit: |
| 453 | case IUTFR: | ||
| 444 | return 2; | 454 | return 2; |
| 445 | default: return 1; | 455 | default: return 1; |
| 446 | } | 456 | } |
| @@ -518,6 +528,16 @@ static void setoffset (CompileState *compst, int instruction, int offset) { | |||
| 518 | } | 528 | } |
| 519 | 529 | ||
| 520 | 530 | ||
| 531 | static void codeutfr (CompileState *compst, TTree *tree) { | ||
| 532 | int i = addoffsetinst(compst, IUTFR); | ||
| 533 | int to = sib1(tree)->u.n; | ||
| 534 | assert(sib1(tree)->tag == TXInfo); | ||
| 535 | getinstr(compst, i + 1).offset = tree->u.n; | ||
| 536 | getinstr(compst, i).i.aux = to & 0xff; | ||
| 537 | getinstr(compst, i).i.key = to >> 8; | ||
| 538 | } | ||
| 539 | |||
| 540 | |||
| 521 | /* | 541 | /* |
| 522 | ** Add a capture instruction: | 542 | ** Add a capture instruction: |
| 523 | ** 'op' is the capture instruction; 'cap' the capture kind; | 543 | ** 'op' is the capture instruction; 'cap' the capture kind; |
| @@ -665,11 +685,11 @@ static void codebehind (CompileState *compst, TTree *tree) { | |||
| 665 | 685 | ||
| 666 | /* | 686 | /* |
| 667 | ** Choice; optimizations: | 687 | ** Choice; optimizations: |
| 668 | ** - when p1 is headfail or | 688 | ** - when p1 is headfail or when first(p1) and first(p2) are disjoint, |
| 669 | ** when first(p1) and first(p2) are disjoint, than | 689 | ** than a character not in first(p1) cannot go to p1 and a character |
| 670 | ** a character not in first(p1) cannot go to p1, and a character | 690 | ** in first(p1) cannot go to p2, either because p1 will accept |
| 671 | ** in first(p1) cannot go to p2 (at it is not in first(p2)). | 691 | ** (headfail) or because it is not in first(p2) (disjoint). |
| 672 | ** (The optimization is not valid if p1 accepts the empty string, | 692 | ** (The second case is not valid if p1 accepts the empty string, |
| 673 | ** as then there is no character at all...) | 693 | ** as then there is no character at all...) |
| 674 | ** - when p2 is empty and opt is true; a IPartialCommit can reuse | 694 | ** - when p2 is empty and opt is true; a IPartialCommit can reuse |
| 675 | ** the Choice already active in the stack. | 695 | ** the Choice already active in the stack. |
| @@ -686,7 +706,7 @@ static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt, | |||
| 686 | int jmp = NOINST; | 706 | int jmp = NOINST; |
| 687 | codegen(compst, p1, 0, test, fl); | 707 | codegen(compst, p1, 0, test, fl); |
| 688 | if (!emptyp2) | 708 | if (!emptyp2) |
| 689 | jmp = addoffsetinst(compst, IJmp); | 709 | jmp = addoffsetinst(compst, IJmp); |
| 690 | jumptohere(compst, test); | 710 | jumptohere(compst, test); |
| 691 | codegen(compst, p2, opt, NOINST, fl); | 711 | codegen(compst, p2, opt, NOINST, fl); |
| 692 | jumptohere(compst, jmp); | 712 | jumptohere(compst, jmp); |
| @@ -697,7 +717,7 @@ static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt, | |||
| 697 | codegen(compst, p1, 1, NOINST, fullset); | 717 | codegen(compst, p1, 1, NOINST, fullset); |
| 698 | } | 718 | } |
| 699 | else { | 719 | else { |
| 700 | /* <p1 / p2> == | 720 | /* <p1 / p2> == |
| 701 | test(first(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: */ | 721 | test(first(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: */ |
| 702 | int pcommit; | 722 | int pcommit; |
| 703 | int test = codetestset(compst, &cs1, e1); | 723 | int test = codetestset(compst, &cs1, e1); |
| @@ -927,6 +947,7 @@ static void codegen (CompileState *compst, TTree *tree, int opt, int tt, | |||
| 927 | case TSet: codecharset(compst, treebuffer(tree), tt); break; | 947 | case TSet: codecharset(compst, treebuffer(tree), tt); break; |
| 928 | case TTrue: break; | 948 | case TTrue: break; |
| 929 | case TFalse: addinstruction(compst, IFail, 0); break; | 949 | case TFalse: addinstruction(compst, IFail, 0); break; |
| 950 | case TUTFR: codeutfr(compst, tree); break; | ||
| 930 | case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break; | 951 | case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break; |
| 931 | case TRep: coderep(compst, sib1(tree), opt, fl); break; | 952 | case TRep: coderep(compst, sib1(tree), opt, fl); break; |
| 932 | case TBehind: codebehind(compst, tree); break; | 953 | case TBehind: codebehind(compst, tree); break; |
| @@ -107,6 +107,9 @@ for creating patterns: | |||
| 107 | <td>Matches any character in <code>string</code> (Set)</td></tr> | 107 | <td>Matches any character in <code>string</code> (Set)</td></tr> |
| 108 | <tr><td><a href="#op-r"><code>lpeg.R("<em>xy</em>")</code></a></td> | 108 | <tr><td><a href="#op-r"><code>lpeg.R("<em>xy</em>")</code></a></td> |
| 109 | <td>Matches any character between <em>x</em> and <em>y</em> (Range)</td></tr> | 109 | <td>Matches any character between <em>x</em> and <em>y</em> (Range)</td></tr> |
| 110 | <tr><td><a href="#op-utfR"><code>lpeg.utfR(cp1, cp2)</code></a></td> | ||
| 111 | <td>Matches an UTF-8 code point between <code>cp1</code> and | ||
| 112 | <code>cp2</code></td></tr> | ||
| 110 | <tr><td><a href="#op-pow"><code>patt^n</code></a></td> | 113 | <tr><td><a href="#op-pow"><code>patt^n</code></a></td> |
| 111 | <td>Matches at least <code>n</code> repetitions of <code>patt</code></td></tr> | 114 | <td>Matches at least <code>n</code> repetitions of <code>patt</code></td></tr> |
| 112 | <tr><td><a href="#op-pow"><code>patt^-n</code></a></td> | 115 | <tr><td><a href="#op-pow"><code>patt^-n</code></a></td> |
| @@ -329,6 +332,15 @@ are patterns that always fail. | |||
| 329 | </p> | 332 | </p> |
| 330 | 333 | ||
| 331 | 334 | ||
| 335 | <h3><a name="op-utfR"></a><code>lpeg.utfR (cp1, cp2)</code></h3> | ||
| 336 | <p> | ||
| 337 | Returns a pattern that matches a valid UTF-8 byte sequence | ||
| 338 | representing a code point in the range <code>[cp1, cp2]</code>. | ||
| 339 | The range is limited by the natural Unicode limit of 0x10FFFF, | ||
| 340 | but may include surrogates. | ||
| 341 | </p> | ||
| 342 | |||
| 343 | |||
| 332 | <h3><a name="op-v"></a><code>lpeg.V (v)</code></h3> | 344 | <h3><a name="op-v"></a><code>lpeg.V (v)</code></h3> |
| 333 | <p> | 345 | <p> |
| 334 | This operation creates a non-terminal (a <em>variable</em>) | 346 | This operation creates a non-terminal (a <em>variable</em>) |
| @@ -56,7 +56,7 @@ void printinst (const Instruction *op, const Instruction *p) { | |||
| 56 | const char *const names[] = { | 56 | const char *const names[] = { |
| 57 | "any", "char", "set", | 57 | "any", "char", "set", |
| 58 | "testany", "testchar", "testset", | 58 | "testany", "testchar", "testset", |
| 59 | "span", "behind", | 59 | "span", "utf-range", "behind", |
| 60 | "ret", "end", | 60 | "ret", "end", |
| 61 | "choice", "jmp", "call", "open_call", | 61 | "choice", "jmp", "call", "open_call", |
| 62 | "commit", "partial_commit", "back_commit", "failtwice", "fail", "giveup", | 62 | "commit", "partial_commit", "back_commit", "failtwice", "fail", "giveup", |
| @@ -66,11 +66,15 @@ void printinst (const Instruction *op, const Instruction *p) { | |||
| 66 | printf("%02ld: %s ", (long)(p - op), names[p->i.code]); | 66 | printf("%02ld: %s ", (long)(p - op), names[p->i.code]); |
| 67 | switch ((Opcode)p->i.code) { | 67 | switch ((Opcode)p->i.code) { |
| 68 | case IChar: { | 68 | case IChar: { |
| 69 | printf("'%c'", p->i.aux); | 69 | printf("'%c' (%02x)", p->i.aux, p->i.aux); |
| 70 | break; | 70 | break; |
| 71 | } | 71 | } |
| 72 | case ITestChar: { | 72 | case ITestChar: { |
| 73 | printf("'%c'", p->i.aux); printjmp(op, p); | 73 | printf("'%c' (%02x)", p->i.aux, p->i.aux); printjmp(op, p); |
| 74 | break; | ||
| 75 | } | ||
| 76 | case IUTFR: { | ||
| 77 | printf("%d - %d", p[1].offset, utf_to(p)); | ||
| 74 | break; | 78 | break; |
| 75 | } | 79 | } |
| 76 | case IFullCapture: { | 80 | case IFullCapture: { |
| @@ -148,7 +152,7 @@ void printcaplist (Capture *cap, Capture *limit) { | |||
| 148 | 152 | ||
| 149 | static const char *tagnames[] = { | 153 | static const char *tagnames[] = { |
| 150 | "char", "set", "any", | 154 | "char", "set", "any", |
| 151 | "true", "false", | 155 | "true", "false", "utf8.range", |
| 152 | "rep", | 156 | "rep", |
| 153 | "seq", "choice", | 157 | "seq", "choice", |
| 154 | "not", "and", | 158 | "not", "and", |
| @@ -177,6 +181,13 @@ void printtree (TTree *tree, int ident) { | |||
| 177 | printf("\n"); | 181 | printf("\n"); |
| 178 | break; | 182 | break; |
| 179 | } | 183 | } |
| 184 | case TUTFR: { | ||
| 185 | assert(sib1(tree)->tag == TXInfo); | ||
| 186 | printf(" %d (%02x %d) - %d (%02x %d) \n", | ||
| 187 | tree->u.n, tree->key, tree->cap, | ||
| 188 | sib1(tree)->u.n, sib1(tree)->key, sib1(tree)->cap); | ||
| 189 | break; | ||
| 190 | } | ||
| 180 | case TOpenCall: case TCall: { | 191 | case TOpenCall: case TCall: { |
| 181 | assert(sib1(sib2(tree))->tag == TXInfo); | 192 | assert(sib1(sib2(tree))->tag == TXInfo); |
| 182 | printf(" key: %d (rule: %d)\n", tree->key, sib1(sib2(tree))->u.n); | 193 | printf(" key: %d (rule: %d)\n", tree->key, sib1(sib2(tree))->u.n); |
| @@ -21,7 +21,7 @@ | |||
| 21 | /* number of siblings for each tree */ | 21 | /* number of siblings for each tree */ |
| 22 | const byte numsiblings[] = { | 22 | const byte numsiblings[] = { |
| 23 | 0, 0, 0, /* char, set, any */ | 23 | 0, 0, 0, /* char, set, any */ |
| 24 | 0, 0, /* true, false */ | 24 | 0, 0, 0, /* true, false, utf-range */ |
| 25 | 1, /* rep */ | 25 | 1, /* rep */ |
| 26 | 2, 2, /* seq, choice */ | 26 | 2, 2, /* seq, choice */ |
| 27 | 1, 1, /* not, and */ | 27 | 1, 1, /* not, and */ |
| @@ -675,6 +675,56 @@ static int lp_range (lua_State *L) { | |||
| 675 | 675 | ||
| 676 | 676 | ||
| 677 | /* | 677 | /* |
| 678 | ** Fills a tree node with basic information about the UTF-8 code point | ||
| 679 | ** 'cpu': its value in 'n', its length in 'cap', and its first byte in | ||
| 680 | ** 'key' | ||
| 681 | */ | ||
| 682 | static void codeutftree (lua_State *L, TTree *t, lua_Unsigned cpu, int arg) { | ||
| 683 | int len, fb, cp; | ||
| 684 | cp = (int)cpu; | ||
| 685 | if (cp <= 0x7f) { /* one byte? */ | ||
| 686 | len = 1; | ||
| 687 | fb = cp; | ||
| 688 | } else if (cp <= 0x7ff) { | ||
| 689 | len = 2; | ||
| 690 | fb = 0xC0 | (cp >> 6); | ||
| 691 | } else if (cp <= 0xffff) { | ||
| 692 | len = 3; | ||
| 693 | fb = 0xE0 | (cp >> 12); | ||
| 694 | } | ||
| 695 | else { | ||
| 696 | luaL_argcheck(L, cpu <= 0x10ffffu, arg, "invalid code point"); | ||
| 697 | len = 4; | ||
| 698 | fb = 0xF0 | (cp >> 18); | ||
| 699 | } | ||
| 700 | t->u.n = cp; | ||
| 701 | t->cap = len; | ||
| 702 | t->key = fb; | ||
| 703 | } | ||
| 704 | |||
| 705 | |||
| 706 | static int lp_utfr (lua_State *L) { | ||
| 707 | lua_Unsigned from = (lua_Unsigned)luaL_checkinteger(L, 1); | ||
| 708 | lua_Unsigned to = (lua_Unsigned)luaL_checkinteger(L, 2); | ||
| 709 | luaL_argcheck(L, from <= to, 2, "empty range"); | ||
| 710 | if (to <= 0x7f) { /* ascii range? */ | ||
| 711 | TTree *tree = newcharset(L); /* code it as a regular charset */ | ||
| 712 | unsigned int f; | ||
| 713 | for (f = (int)from; f <= to; f++) | ||
| 714 | setchar(treebuffer(tree), f); | ||
| 715 | } | ||
| 716 | else { /* multi-byte utf-8 range */ | ||
| 717 | TTree *tree = newtree(L, 2); | ||
| 718 | tree->tag = TUTFR; | ||
| 719 | codeutftree(L, tree, from, 1); | ||
| 720 | sib1(tree)->tag = TXInfo; | ||
| 721 | codeutftree(L, sib1(tree), to, 2); | ||
| 722 | } | ||
| 723 | return 1; | ||
| 724 | } | ||
| 725 | |||
| 726 | |||
| 727 | /* | ||
| 678 | ** Look-behind predicate | 728 | ** Look-behind predicate |
| 679 | */ | 729 | */ |
| 680 | static int lp_behind (lua_State *L) { | 730 | static int lp_behind (lua_State *L) { |
| @@ -1008,7 +1058,7 @@ static int verifyrule (lua_State *L, TTree *tree, unsigned short *passed, | |||
| 1008 | tailcall: | 1058 | tailcall: |
| 1009 | switch (tree->tag) { | 1059 | switch (tree->tag) { |
| 1010 | case TChar: case TSet: case TAny: | 1060 | case TChar: case TSet: case TAny: |
| 1011 | case TFalse: | 1061 | case TFalse: case TUTFR: |
| 1012 | return nb; /* cannot pass from here */ | 1062 | return nb; /* cannot pass from here */ |
| 1013 | case TTrue: | 1063 | case TTrue: |
| 1014 | case TBehind: /* look-behind cannot have calls */ | 1064 | case TBehind: /* look-behind cannot have calls */ |
| @@ -1271,6 +1321,7 @@ static struct luaL_Reg pattreg[] = { | |||
| 1271 | {"P", lp_P}, | 1321 | {"P", lp_P}, |
| 1272 | {"S", lp_set}, | 1322 | {"S", lp_set}, |
| 1273 | {"R", lp_range}, | 1323 | {"R", lp_range}, |
| 1324 | {"utfR", lp_utfr}, | ||
| 1274 | {"locale", lp_locale}, | 1325 | {"locale", lp_locale}, |
| 1275 | {"version", lp_version}, | 1326 | {"version", lp_version}, |
| 1276 | {"setmaxstack", lp_setmax}, | 1327 | {"setmaxstack", lp_setmax}, |
| @@ -18,6 +18,9 @@ typedef enum TTag { | |||
| 18 | TAny, | 18 | TAny, |
| 19 | TTrue, | 19 | TTrue, |
| 20 | TFalse, | 20 | TFalse, |
| 21 | TUTFR, /* range of UTF-8 codepoints; 'n' has initial codepoint; | ||
| 22 | 'cap' has length; 'key' has first byte; | ||
| 23 | extra info is similar for end codepoint */ | ||
| 21 | TRep, /* 'sib1'* */ | 24 | TRep, /* 'sib1'* */ |
| 22 | TSeq, /* 'sib1' 'sib2' */ | 25 | TSeq, /* 'sib1' 'sib2' */ |
| 23 | TChoice, /* 'sib1' / 'sib2' */ | 26 | TChoice, /* 'sib1' / 'sib2' */ |
| @@ -26,8 +29,9 @@ typedef enum TTag { | |||
| 26 | TCall, /* ktable[key] is rule's key; 'sib2' is rule being called */ | 29 | TCall, /* ktable[key] is rule's key; 'sib2' is rule being called */ |
| 27 | TOpenCall, /* ktable[key] is rule's key */ | 30 | TOpenCall, /* ktable[key] is rule's key */ |
| 28 | TRule, /* ktable[key] is rule's key (but key == 0 for unused rules); | 31 | TRule, /* ktable[key] is rule's key (but key == 0 for unused rules); |
| 29 | 'sib1' is rule's pattern pre-rule; 'sib2' is next rule; */ | 32 | 'sib1' is rule's pattern pre-rule; 'sib2' is next rule; |
| 30 | TXInfo, /* extra info; 'n' is rule's sequential number */ | 33 | extra info 'n' is rule's sequential number */ |
| 34 | TXInfo, /* extra info */ | ||
| 31 | TGrammar, /* 'sib1' is initial (and first) rule */ | 35 | TGrammar, /* 'sib1' is initial (and first) rule */ |
| 32 | TBehind, /* 'sib1' is pattern, 'n' is how much to go back */ | 36 | TBehind, /* 'sib1' is pattern, 'n' is how much to go back */ |
| 33 | TCapture, /* captures: 'cap' is kind of capture (enum 'CapKind'); | 37 | TCapture, /* captures: 'cap' is kind of capture (enum 'CapKind'); |
| @@ -37,6 +37,8 @@ | |||
| 37 | #define luaL_setfuncs(L,f,n) luaL_register(L,NULL,f) | 37 | #define luaL_setfuncs(L,f,n) luaL_register(L,NULL,f) |
| 38 | #define luaL_newlib(L,f) luaL_register(L,"lpeg",f) | 38 | #define luaL_newlib(L,f) luaL_register(L,"lpeg",f) |
| 39 | 39 | ||
| 40 | typedef size_t lua_Unsigned; | ||
| 41 | |||
| 40 | #endif | 42 | #endif |
| 41 | 43 | ||
| 42 | 44 | ||
| @@ -28,6 +28,35 @@ static const Instruction giveup = {{IGiveup, 0, 0}}; | |||
| 28 | 28 | ||
| 29 | 29 | ||
| 30 | /* | 30 | /* |
| 31 | ** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid. | ||
| 32 | */ | ||
| 33 | static const char *utf8_decode (const char *o, int *val) { | ||
| 34 | static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFFu}; | ||
| 35 | const unsigned char *s = (const unsigned char *)o; | ||
| 36 | unsigned int c = s[0]; /* first byte */ | ||
| 37 | unsigned int res = 0; /* final result */ | ||
| 38 | if (c < 0x80) /* ascii? */ | ||
| 39 | res = c; | ||
| 40 | else { | ||
| 41 | int count = 0; /* to count number of continuation bytes */ | ||
| 42 | while (c & 0x40) { /* still have continuation bytes? */ | ||
| 43 | int cc = s[++count]; /* read next byte */ | ||
| 44 | if ((cc & 0xC0) != 0x80) /* not a continuation byte? */ | ||
| 45 | return NULL; /* invalid byte sequence */ | ||
| 46 | res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */ | ||
| 47 | c <<= 1; /* to test next bit */ | ||
| 48 | } | ||
| 49 | res |= (c & 0x7F) << (count * 5); /* add first byte */ | ||
| 50 | if (count > 3 || res > 0x10FFFFu || res <= limits[count]) | ||
| 51 | return NULL; /* invalid byte sequence */ | ||
| 52 | s += count; /* skip continuation bytes read */ | ||
| 53 | } | ||
| 54 | *val = res; | ||
| 55 | return (const char *)s + 1; /* +1 to include first byte */ | ||
| 56 | } | ||
| 57 | |||
| 58 | |||
| 59 | /* | ||
| 31 | ** {====================================================== | 60 | ** {====================================================== |
| 32 | ** Virtual Machine | 61 | ** Virtual Machine |
| 33 | ** ======================================================= | 62 | ** ======================================================= |
| @@ -198,6 +227,17 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e, | |||
| 198 | else goto fail; | 227 | else goto fail; |
| 199 | continue; | 228 | continue; |
| 200 | } | 229 | } |
| 230 | case IUTFR: { | ||
| 231 | int codepoint; | ||
| 232 | if (s >= e) | ||
| 233 | goto fail; | ||
| 234 | s = utf8_decode (s, &codepoint); | ||
| 235 | if (s && p[1].offset <= codepoint && codepoint <= utf_to(p)) | ||
| 236 | p += 2; | ||
| 237 | else | ||
| 238 | goto fail; | ||
| 239 | continue; | ||
| 240 | } | ||
| 201 | case ITestAny: { | 241 | case ITestAny: { |
| 202 | if (s < e) p += 2; | 242 | if (s < e) p += 2; |
| 203 | else p += getoffset(p); | 243 | else p += getoffset(p); |
| @@ -17,6 +17,7 @@ typedef enum Opcode { | |||
| 17 | ITestChar, /* if char != aux, jump to 'offset' */ | 17 | ITestChar, /* if char != aux, jump to 'offset' */ |
| 18 | ITestSet, /* if char not in buff, jump to 'offset' */ | 18 | ITestSet, /* if char not in buff, jump to 'offset' */ |
| 19 | ISpan, /* read a span of chars in buff */ | 19 | ISpan, /* read a span of chars in buff */ |
| 20 | IUTFR, /* if codepoint not in range [offset, utf_to], fail */ | ||
| 20 | IBehind, /* walk back 'aux' characters (fail if not possible) */ | 21 | IBehind, /* walk back 'aux' characters (fail if not possible) */ |
| 21 | IRet, /* return from a rule */ | 22 | IRet, /* return from a rule */ |
| 22 | IEnd, /* end of pattern */ | 23 | IEnd, /* end of pattern */ |
| @@ -50,6 +51,10 @@ typedef union Instruction { | |||
| 50 | } Instruction; | 51 | } Instruction; |
| 51 | 52 | ||
| 52 | 53 | ||
| 54 | /* extract 24-bit value from an instruction */ | ||
| 55 | #define utf_to(inst) (((inst)->i.key << 8) | (inst)->i.aux) | ||
| 56 | |||
| 57 | |||
| 53 | void printpatt (Instruction *p, int n); | 58 | void printpatt (Instruction *p, int n); |
| 54 | const char *match (lua_State *L, const char *o, const char *s, const char *e, | 59 | const char *match (lua_State *L, const char *o, const char *s, const char *e, |
| 55 | Instruction *op, Capture *capture, int ptop); | 60 | Instruction *op, Capture *capture, int ptop); |
| @@ -48,7 +48,6 @@ end | |||
| 48 | 48 | ||
| 49 | print"General tests for LPeg library" | 49 | print"General tests for LPeg library" |
| 50 | 50 | ||
| 51 | assert(type(m.version()) == "string") | ||
| 52 | print("version " .. m.version()) | 51 | print("version " .. m.version()) |
| 53 | assert(m.type("alo") ~= "pattern") | 52 | assert(m.type("alo") ~= "pattern") |
| 54 | assert(m.type(io.input) ~= "pattern") | 53 | assert(m.type(io.input) ~= "pattern") |
| @@ -1189,6 +1188,63 @@ do print"testing large grammars" | |||
| 1189 | end | 1188 | end |
| 1190 | 1189 | ||
| 1191 | 1190 | ||
| 1191 | print "testing UTF-8 ranges" | ||
| 1192 | |||
| 1193 | do -- a few typical UTF-8 ranges | ||
| 1194 | local p = m.utfR(0x410, 0x44f)^1 / "cyr: %0" | ||
| 1195 | + m.utfR(0x4e00, 0x9fff)^1 / "cjk: %0" | ||
| 1196 | + m.utfR(0x1F600, 0x1F64F)^1 / "emot: %0" | ||
| 1197 | + m.utfR(0, 0x7f)^1 / "ascii: %0" | ||
| 1198 | + m.utfR(0, 0x10ffff) / "other: %0" | ||
| 1199 | |||
| 1200 | p = m.Ct(p^0) * -m.P(1) | ||
| 1201 | |||
| 1202 | local cyr = "ждюя" | ||
| 1203 | local emot = "\240\159\152\128\240\159\153\128" -- 😀🙀 | ||
| 1204 | local cjk = "专举乸" | ||
| 1205 | local ascii = "alo" | ||
| 1206 | local last = "\244\143\191\191" -- U+10FFFF | ||
| 1207 | |||
| 1208 | local s = cyr .. "—" .. emot .. "—" .. cjk .. "—" .. ascii .. last | ||
| 1209 | t = (p:match(s)) | ||
| 1210 | |||
| 1211 | assert(t[1] == "cyr: " .. cyr and t[2] == "other: —" and | ||
| 1212 | t[3] == "emot: " .. emot and t[4] == "other: —" and | ||
| 1213 | t[5] == "cjk: " .. cjk and t[6] == "other: —" and | ||
| 1214 | t[7] == "ascii: " .. ascii and t[8] == "other: " .. last and | ||
| 1215 | t[9] == nil) | ||
| 1216 | end | ||
| 1217 | |||
| 1218 | |||
| 1219 | do -- valid and invalid code points | ||
| 1220 | local p = m.utfR(0, 0x10ffff)^0 | ||
| 1221 | assert(p:match("汉字\128") == #"汉字" + 1) | ||
| 1222 | assert(p:match("\244\159\191") == 1) | ||
| 1223 | assert(p:match("\244\159\191\191") == 1) | ||
| 1224 | assert(p:match("\255") == 1) | ||
| 1225 | |||
| 1226 | -- basic errors | ||
| 1227 | checkerr("empty range", m.utfR, 1, 0) | ||
| 1228 | checkerr("invalid code point", m.utfR, 1, 0x10ffff + 1) | ||
| 1229 | end | ||
| 1230 | |||
| 1231 | |||
| 1232 | do -- back references (fixed width) | ||
| 1233 | -- match a byte after a CJK point | ||
| 1234 | local p = m.B(m.utfR(0x4e00, 0x9fff)) * m.C(1) | ||
| 1235 | p = m.P{ p + m.P(1) * m.V(1) } -- search for 'p' | ||
| 1236 | assert(p:match("ab д 专X x") == "X") | ||
| 1237 | |||
| 1238 | -- match a byte after a hebrew point | ||
| 1239 | local p = m.B(m.utfR(0x5d0, 0x5ea)) * m.C(1) | ||
| 1240 | p = m.P(#"ש") * p | ||
| 1241 | assert(p:match("שX") == "X") | ||
| 1242 | |||
| 1243 | checkerr("fixed length", m.B, m.utfR(0, 0x10ffff)) | ||
| 1244 | end | ||
| 1245 | |||
| 1246 | |||
| 1247 | |||
| 1192 | ------------------------------------------------------------------- | 1248 | ------------------------------------------------------------------- |
| 1193 | -- Tests for 're' module | 1249 | -- Tests for 're' module |
| 1194 | ------------------------------------------------------------------- | 1250 | ------------------------------------------------------------------- |
