diff options
author | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2019-04-17 14:08:22 -0300 |
---|---|---|
committer | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2019-04-17 14:08:22 -0300 |
commit | 24bf757183d8bd97f6f5b43d916814f3269c8347 (patch) | |
tree | 646cd65d6e2dab57691f98f83f15f25c70685ef8 /lpcode.c | |
parent | 3f7797419e4d7493e1364290a5b127d1cb45e3bf (diff) | |
download | lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.gz lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.tar.bz2 lpeg-24bf757183d8bd97f6f5b43d916814f3269c8347.zip |
Implementation of UTF-8 ranges
New constructor 'lpeg.utfR(from, to)' creates a pattern that matches
UTF-8 byte sequences representing code points in the range [from, to].
Diffstat (limited to 'lpcode.c')
-rw-r--r-- | lpcode.c | 43 |
1 file changed, 32 insertions, 11 deletions
@@ -196,7 +196,7 @@ int hascaptures (TTree *tree) { | |||
196 | int checkaux (TTree *tree, int pred) { | 196 | int checkaux (TTree *tree, int pred) { |
197 | tailcall: | 197 | tailcall: |
198 | switch (tree->tag) { | 198 | switch (tree->tag) { |
199 | case TChar: case TSet: case TAny: | 199 | case TChar: case TSet: case TAny: case TUTFR: |
200 | case TFalse: case TOpenCall: | 200 | case TFalse: case TOpenCall: |
201 | return 0; /* not nullable */ | 201 | return 0; /* not nullable */ |
202 | case TRep: case TTrue: | 202 | case TRep: case TTrue: |
@@ -239,6 +239,8 @@ int fixedlen (TTree *tree) { | |||
239 | switch (tree->tag) { | 239 | switch (tree->tag) { |
240 | case TChar: case TSet: case TAny: | 240 | case TChar: case TSet: case TAny: |
241 | return len + 1; | 241 | return len + 1; |
242 | case TUTFR: | ||
243 | return (tree->cap == sib1(tree)->cap) ? len + tree->cap : -1; | ||
242 | case TFalse: case TTrue: case TNot: case TAnd: case TBehind: | 244 | case TFalse: case TTrue: case TNot: case TAnd: case TBehind: |
243 | return len; | 245 | return len; |
244 | case TRep: case TRunTime: case TOpenCall: | 246 | case TRep: case TRunTime: case TOpenCall: |
@@ -298,6 +300,13 @@ static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { | |||
298 | tocharset(tree, firstset); | 300 | tocharset(tree, firstset); |
299 | return 0; | 301 | return 0; |
300 | } | 302 | } |
303 | case TUTFR: { | ||
304 | int c; | ||
305 | loopset(i, firstset->cs[i] = 0); /* erase all chars */ | ||
306 | for (c = tree->key; c <= sib1(tree)->key; c++) | ||
307 | setchar(firstset->cs, c); | ||
308 | return 0; | ||
309 | } | ||
301 | case TTrue: { | 310 | case TTrue: { |
302 | loopset(i, firstset->cs[i] = follow->cs[i]); | 311 | loopset(i, firstset->cs[i] = follow->cs[i]); |
303 | return 1; /* accepts the empty string */ | 312 | return 1; /* accepts the empty string */ |
@@ -380,7 +389,7 @@ static int headfail (TTree *tree) { | |||
380 | case TChar: case TSet: case TAny: case TFalse: | 389 | case TChar: case TSet: case TAny: case TFalse: |
381 | return 1; | 390 | return 1; |
382 | case TTrue: case TRep: case TRunTime: case TNot: | 391 | case TTrue: case TRep: case TRunTime: case TNot: |
383 | case TBehind: | 392 | case TBehind: case TUTFR: |
384 | return 0; | 393 | return 0; |
385 | case TCapture: case TGrammar: case TRule: case TXInfo: case TAnd: | 394 | case TCapture: case TGrammar: case TRule: case TXInfo: case TAnd: |
386 | tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */ | 395 | tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */ |
@@ -407,7 +416,7 @@ static int headfail (TTree *tree) { | |||
407 | static int needfollow (TTree *tree) { | 416 | static int needfollow (TTree *tree) { |
408 | tailcall: | 417 | tailcall: |
409 | switch (tree->tag) { | 418 | switch (tree->tag) { |
410 | case TChar: case TSet: case TAny: | 419 | case TChar: case TSet: case TAny: case TUTFR: |
411 | case TFalse: case TTrue: case TAnd: case TNot: | 420 | case TFalse: case TTrue: case TAnd: case TNot: |
412 | case TRunTime: case TGrammar: case TCall: case TBehind: | 421 | case TRunTime: case TGrammar: case TCall: case TBehind: |
413 | return 0; | 422 | return 0; |
@@ -418,7 +427,7 @@ static int needfollow (TTree *tree) { | |||
418 | case TSeq: | 427 | case TSeq: |
419 | tree = sib2(tree); goto tailcall; | 428 | tree = sib2(tree); goto tailcall; |
420 | default: assert(0); return 0; | 429 | default: assert(0); return 0; |
421 | } | 430 | } |
422 | } | 431 | } |
423 | 432 | ||
424 | /* }====================================================== */ | 433 | /* }====================================================== */ |
@@ -441,6 +450,7 @@ int sizei (const Instruction *i) { | |||
441 | case ITestSet: return CHARSETINSTSIZE + 1; | 450 | case ITestSet: return CHARSETINSTSIZE + 1; |
442 | case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall: | 451 | case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall: |
443 | case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit: | 452 | case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit: |
453 | case IUTFR: | ||
444 | return 2; | 454 | return 2; |
445 | default: return 1; | 455 | default: return 1; |
446 | } | 456 | } |
@@ -518,6 +528,16 @@ static void setoffset (CompileState *compst, int instruction, int offset) { | |||
518 | } | 528 | } |
519 | 529 | ||
520 | 530 | ||
531 | static void codeutfr (CompileState *compst, TTree *tree) { | ||
532 | int i = addoffsetinst(compst, IUTFR); | ||
533 | int to = sib1(tree)->u.n; | ||
534 | assert(sib1(tree)->tag == TXInfo); | ||
535 | getinstr(compst, i + 1).offset = tree->u.n; | ||
536 | getinstr(compst, i).i.aux = to & 0xff; | ||
537 | getinstr(compst, i).i.key = to >> 8; | ||
538 | } | ||
539 | |||
540 | |||
521 | /* | 541 | /* |
522 | ** Add a capture instruction: | 542 | ** Add a capture instruction: |
523 | ** 'op' is the capture instruction; 'cap' the capture kind; | 543 | ** 'op' is the capture instruction; 'cap' the capture kind; |
@@ -665,11 +685,11 @@ static void codebehind (CompileState *compst, TTree *tree) { | |||
665 | 685 | ||
666 | /* | 686 | /* |
667 | ** Choice; optimizations: | 687 | ** Choice; optimizations: |
668 | ** - when p1 is headfail or | 688 | ** - when p1 is headfail or when first(p1) and first(p2) are disjoint, |
669 | ** when first(p1) and first(p2) are disjoint, then | 689 | ** then a character not in first(p1) cannot go to p1 and a character |
670 | ** a character not in first(p1) cannot go to p1, and a character | 690 | ** in first(p1) cannot go to p2, either because p1 will accept |
671 | ** in first(p1) cannot go to p2 (as it is not in first(p2)). | 691 | ** (headfail) or because it is not in first(p2) (disjoint). |
672 | ** (The optimization is not valid if p1 accepts the empty string, | 692 | ** (The second case is not valid if p1 accepts the empty string, |
673 | ** as then there is no character at all...) | 693 | ** as then there is no character at all...) |
674 | ** - when p2 is empty and opt is true; a IPartialCommit can reuse | 694 | ** - when p2 is empty and opt is true; a IPartialCommit can reuse |
675 | ** the Choice already active in the stack. | 695 | ** the Choice already active in the stack. |
@@ -686,7 +706,7 @@ static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt, | |||
686 | int jmp = NOINST; | 706 | int jmp = NOINST; |
687 | codegen(compst, p1, 0, test, fl); | 707 | codegen(compst, p1, 0, test, fl); |
688 | if (!emptyp2) | 708 | if (!emptyp2) |
689 | jmp = addoffsetinst(compst, IJmp); | 709 | jmp = addoffsetinst(compst, IJmp); |
690 | jumptohere(compst, test); | 710 | jumptohere(compst, test); |
691 | codegen(compst, p2, opt, NOINST, fl); | 711 | codegen(compst, p2, opt, NOINST, fl); |
692 | jumptohere(compst, jmp); | 712 | jumptohere(compst, jmp); |
@@ -697,7 +717,7 @@ static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt, | |||
697 | codegen(compst, p1, 1, NOINST, fullset); | 717 | codegen(compst, p1, 1, NOINST, fullset); |
698 | } | 718 | } |
699 | else { | 719 | else { |
700 | /* <p1 / p2> == | 720 | /* <p1 / p2> == |
701 | test(first(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: */ | 721 | test(first(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: */ |
702 | int pcommit; | 722 | int pcommit; |
703 | int test = codetestset(compst, &cs1, e1); | 723 | int test = codetestset(compst, &cs1, e1); |
@@ -927,6 +947,7 @@ static void codegen (CompileState *compst, TTree *tree, int opt, int tt, | |||
927 | case TSet: codecharset(compst, treebuffer(tree), tt); break; | 947 | case TSet: codecharset(compst, treebuffer(tree), tt); break; |
928 | case TTrue: break; | 948 | case TTrue: break; |
929 | case TFalse: addinstruction(compst, IFail, 0); break; | 949 | case TFalse: addinstruction(compst, IFail, 0); break; |
950 | case TUTFR: codeutfr(compst, tree); break; | ||
930 | case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break; | 951 | case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break; |
931 | case TRep: coderep(compst, sib1(tree), opt, fl); break; | 952 | case TRep: coderep(compst, sib1(tree), opt, fl); break; |
932 | case TBehind: codebehind(compst, tree); break; | 953 | case TBehind: codebehind(compst, tree); break; |