diff options
-rw-r--r-- | lpcode.c | 79 | ||||
-rw-r--r-- | lpprint.c | 54 | ||||
-rw-r--r-- | lptree.c | 104 | ||||
-rw-r--r-- | lptree.h | 11 | ||||
-rw-r--r-- | lptypes.h | 8 | ||||
-rw-r--r-- | lpvm.c | 54 | ||||
-rw-r--r-- | lpvm.h | 15 | ||||
-rwxr-xr-x | test.lua | 98 | ||||
-rw-r--r-- | testlabel.lua | 4 |
9 files changed, 328 insertions, 99 deletions
@@ -196,7 +196,7 @@ int hascaptures (TTree *tree) { | |||
196 | int checkaux (TTree *tree, int pred) { | 196 | int checkaux (TTree *tree, int pred) { |
197 | tailcall: | 197 | tailcall: |
198 | switch (tree->tag) { | 198 | switch (tree->tag) { |
199 | case TChar: case TSet: case TAny: | 199 | case TChar: case TSet: case TAny: case TUTFR: |
200 | case TFalse: case TOpenCall: case TThrow: /* labeled failure */ | 200 | case TFalse: case TOpenCall: case TThrow: /* labeled failure */ |
201 | return 0; /* not nullable */ | 201 | return 0; /* not nullable */ |
202 | case TRep: case TTrue: | 202 | case TRep: case TTrue: |
@@ -220,7 +220,7 @@ int checkaux (TTree *tree, int pred) { | |||
220 | if (checkaux(sib2(tree), pred)) return 1; | 220 | if (checkaux(sib2(tree), pred)) return 1; |
221 | /* else return checkaux(sib1(tree), pred); */ | 221 | /* else return checkaux(sib1(tree), pred); */ |
222 | tree = sib1(tree); goto tailcall; | 222 | tree = sib1(tree); goto tailcall; |
223 | case TCapture: case TGrammar: case TRule: | 223 | case TCapture: case TGrammar: case TRule: case TXInfo: |
224 | /* return checkaux(sib1(tree), pred); */ | 224 | /* return checkaux(sib1(tree), pred); */ |
225 | tree = sib1(tree); goto tailcall; | 225 | tree = sib1(tree); goto tailcall; |
226 | case TCall: /* return checkaux(sib2(tree), pred); */ | 226 | case TCall: /* return checkaux(sib2(tree), pred); */ |
@@ -239,12 +239,14 @@ int fixedlen (TTree *tree) { | |||
239 | switch (tree->tag) { | 239 | switch (tree->tag) { |
240 | case TChar: case TSet: case TAny: | 240 | case TChar: case TSet: case TAny: |
241 | return len + 1; | 241 | return len + 1; |
242 | case TUTFR: | ||
243 | return (tree->cap == sib1(tree)->cap) ? len + tree->cap : -1; | ||
242 | case TFalse: case TTrue: case TNot: case TAnd: case TBehind: | 244 | case TFalse: case TTrue: case TNot: case TAnd: case TBehind: |
243 | return len; | 245 | return len; |
244 | case TRep: case TRunTime: case TOpenCall: | 246 | case TRep: case TRunTime: case TOpenCall: |
245 | case TThrow: /* labeled failure */ | 247 | case TThrow: /* labeled failure */ |
246 | return -1; | 248 | return -1; |
247 | case TCapture: case TRule: case TGrammar: | 249 | case TCapture: case TRule: case TGrammar: case TXInfo: |
248 | /* return fixedlen(sib1(tree)); */ | 250 | /* return fixedlen(sib1(tree)); */ |
249 | tree = sib1(tree); goto tailcall; | 251 | tree = sib1(tree); goto tailcall; |
250 | case TCall: { | 252 | case TCall: { |
@@ -254,14 +256,14 @@ int fixedlen (TTree *tree) { | |||
254 | else | 256 | else |
255 | return len + n1; | 257 | return len + n1; |
256 | } | 258 | } |
257 | case TSeq: { | 259 | case TSeq: { |
258 | int n1 = fixedlen(sib1(tree)); | 260 | int n1 = fixedlen(sib1(tree)); |
259 | if (n1 < 0) | 261 | if (n1 < 0) |
260 | return -1; | 262 | return -1; |
261 | /* else return fixedlen(sib2(tree)) + len; */ | 263 | /* else return fixedlen(sib2(tree)) + len; */ |
262 | len += n1; tree = sib2(tree); goto tailcall; | 264 | len += n1; tree = sib2(tree); goto tailcall; |
263 | } | 265 | } |
264 | case TChoice: { | 266 | case TChoice: { |
265 | int n1 = fixedlen(sib1(tree)); | 267 | int n1 = fixedlen(sib1(tree)); |
266 | int n2 = fixedlen(sib2(tree)); | 268 | int n2 = fixedlen(sib2(tree)); |
267 | if (n1 != n2 || n1 < 0) | 269 | if (n1 != n2 || n1 < 0) |
@@ -299,6 +301,13 @@ static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { | |||
299 | tocharset(tree, firstset); | 301 | tocharset(tree, firstset); |
300 | return 0; | 302 | return 0; |
301 | } | 303 | } |
304 | case TUTFR: { | ||
305 | int c; | ||
306 | loopset(i, firstset->cs[i] = 0); /* erase all chars */ | ||
307 | for (c = tree->key; c <= sib1(tree)->key; c++) | ||
308 | setchar(firstset->cs, c); | ||
309 | return 0; | ||
310 | } | ||
302 | case TTrue: { | 311 | case TTrue: { |
303 | loopset(i, firstset->cs[i] = follow->cs[i]); | 312 | loopset(i, firstset->cs[i] = follow->cs[i]); |
304 | return 1; /* accepts the empty string */ | 313 | return 1; /* accepts the empty string */ |
@@ -311,7 +320,7 @@ static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { | |||
311 | loopset(i, firstset->cs[i] = fullset->cs[i]); | 320 | loopset(i, firstset->cs[i] = fullset->cs[i]); |
312 | return 1; | 321 | return 1; |
313 | } | 322 | } |
314 | case TChoice: { | 323 | case TChoice: { |
315 | Charset csaux; | 324 | Charset csaux; |
316 | int e1 = getfirst(sib1(tree), follow, firstset); | 325 | int e1 = getfirst(sib1(tree), follow, firstset); |
317 | int e2 = getfirst(sib2(tree), follow, &csaux); | 326 | int e2 = getfirst(sib2(tree), follow, &csaux); |
@@ -339,7 +348,7 @@ static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { | |||
339 | loopset(i, firstset->cs[i] |= follow->cs[i]); | 348 | loopset(i, firstset->cs[i] |= follow->cs[i]); |
340 | return 1; /* accept the empty string */ | 349 | return 1; /* accept the empty string */ |
341 | } | 350 | } |
342 | case TCapture: case TGrammar: case TRule: { | 351 | case TCapture: case TGrammar: case TRule: case TXInfo: { |
343 | /* return getfirst(sib1(tree), follow, firstset); */ | 352 | /* return getfirst(sib1(tree), follow, firstset); */ |
344 | tree = sib1(tree); goto tailcall; | 353 | tree = sib1(tree); goto tailcall; |
345 | } | 354 | } |
@@ -385,10 +394,10 @@ static int headfail (TTree *tree) { | |||
385 | case TChar: case TSet: case TAny: case TFalse: | 394 | case TChar: case TSet: case TAny: case TFalse: |
386 | return 1; | 395 | return 1; |
387 | case TTrue: case TRep: case TRunTime: case TNot: | 396 | case TTrue: case TRep: case TRunTime: case TNot: |
388 | case TBehind: | 397 | case TBehind: case TUTFR: |
389 | case TThrow: /* labeled failure: must always throw the label */ | 398 | case TThrow: /* labeled failure: must always throw the label */ |
390 | return 0; | 399 | return 0; |
391 | case TCapture: case TGrammar: case TRule: case TAnd: | 400 | case TCapture: case TGrammar: case TRule: case TXInfo: case TAnd: |
392 | tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */ | 401 | tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */ |
393 | case TCall: | 402 | case TCall: |
394 | tree = sib2(tree); goto tailcall; /* return headfail(sib2(tree)); */ | 403 | tree = sib2(tree); goto tailcall; /* return headfail(sib2(tree)); */ |
@@ -396,7 +405,7 @@ static int headfail (TTree *tree) { | |||
396 | if (!nofail(sib2(tree))) return 0; | 405 | if (!nofail(sib2(tree))) return 0; |
397 | /* else return headfail(sib1(tree)); */ | 406 | /* else return headfail(sib1(tree)); */ |
398 | tree = sib1(tree); goto tailcall; | 407 | tree = sib1(tree); goto tailcall; |
399 | case TChoice: | 408 | case TChoice: |
400 | if (!headfail(sib1(tree))) return 0; | 409 | if (!headfail(sib1(tree))) return 0; |
401 | /* else return headfail(sib2(tree)); */ | 410 | /* else return headfail(sib2(tree)); */ |
402 | tree = sib2(tree); goto tailcall; | 411 | tree = sib2(tree); goto tailcall; |
@@ -413,7 +422,7 @@ static int headfail (TTree *tree) { | |||
413 | static int needfollow (TTree *tree) { | 422 | static int needfollow (TTree *tree) { |
414 | tailcall: | 423 | tailcall: |
415 | switch (tree->tag) { | 424 | switch (tree->tag) { |
416 | case TChar: case TSet: case TAny: | 425 | case TChar: case TSet: case TAny: case TUTFR: |
417 | case TFalse: case TTrue: case TAnd: case TNot: | 426 | case TFalse: case TTrue: case TAnd: case TNot: |
418 | case TRunTime: case TGrammar: case TCall: case TBehind: | 427 | case TRunTime: case TGrammar: case TCall: case TBehind: |
419 | case TThrow: /* (?)labeled failure */ | 428 | case TThrow: /* (?)labeled failure */ |
@@ -425,7 +434,7 @@ static int needfollow (TTree *tree) { | |||
425 | case TSeq: | 434 | case TSeq: |
426 | tree = sib2(tree); goto tailcall; | 435 | tree = sib2(tree); goto tailcall; |
427 | default: assert(0); return 0; | 436 | default: assert(0); return 0; |
428 | } | 437 | } |
429 | } | 438 | } |
430 | 439 | ||
431 | /* }====================================================== */ | 440 | /* }====================================================== */ |
@@ -448,12 +457,12 @@ int sizei (const Instruction *i) { | |||
448 | case ITestSet: return CHARSETINSTSIZE + 1; | 457 | case ITestSet: return CHARSETINSTSIZE + 1; |
449 | case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall: | 458 | case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall: |
450 | case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit: | 459 | case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit: |
460 | case IUTFR: | ||
451 | return 2; | 461 | return 2; |
452 | case IThrow: case IPredChoice: /* labeled failure */ | 462 | case IThrow: case IPredChoice: /* labeled failure */ |
453 | return 2; | 463 | return 2; |
454 | case IThrowRec: /* labeled failure */ | 464 | case IThrowRec: /* labeled failure */ |
455 | return 3; | 465 | return 3; |
456 | |||
457 | default: return 1; | 466 | default: return 1; |
458 | } | 467 | } |
459 | } | 468 | } |
@@ -517,8 +526,9 @@ static int addinstruction (CompileState *compst, Opcode op, int aux) { | |||
517 | static int addoffsetinst (CompileState *compst, Opcode op) { | 526 | static int addoffsetinst (CompileState *compst, Opcode op) { |
518 | int i = addinstruction(compst, op, 0); /* instruction */ | 527 | int i = addinstruction(compst, op, 0); /* instruction */ |
519 | addinstruction(compst, (Opcode)0, 0); /* open space for offset */ | 528 | addinstruction(compst, (Opcode)0, 0); /* open space for offset */ |
520 | assert(op == ITestSet || sizei(&getinstr(compst, i)) == 2 || | 529 | assert(op == ITestSet || sizei(&getinstr(compst, i)) == 2 || |
521 | op == IThrowRec); /* labeled failure */ | 530 | op == IThrowRec); /* labeled failure */ |
531 | |||
522 | return i; | 532 | return i; |
523 | } | 533 | } |
524 | 534 | ||
@@ -528,13 +538,13 @@ static void codethrow (CompileState *compst, TTree *throw) { | |||
528 | int recov, aux; | 538 | int recov, aux; |
529 | if (throw->u.ps != 0) { | 539 | if (throw->u.ps != 0) { |
530 | recov = addoffsetinst(compst, IThrowRec); | 540 | recov = addoffsetinst(compst, IThrowRec); |
531 | assert(sib2(throw)->tag == TRule); | 541 | assert(sib1(sib2(throw))->tag == TXInfo); |
532 | } else { | 542 | } else { |
533 | recov = addinstruction(compst, IThrow, 0); | 543 | recov = addinstruction(compst, IThrow, 0); |
534 | } | 544 | } |
535 | aux = nextinstruction(compst); | 545 | aux = nextinstruction(compst); |
536 | getinstr(compst, aux).i.key = throw->key; /* next instruction keeps only rule name */ | 546 | getinstr(compst, aux).i.key = throw->key; /* next instruction keeps only rule name */ |
537 | getinstr(compst, recov).i.key = sib2(throw)->cap; /* rule number */ | 547 | getinstr(compst, recov).i.key = sib1(sib2(throw))->u.n; /* rule number */ |
538 | } | 548 | } |
539 | /* labeled failure */ | 549 | /* labeled failure */ |
540 | 550 | ||
@@ -547,6 +557,16 @@ static void setoffset (CompileState *compst, int instruction, int offset) { | |||
547 | } | 557 | } |
548 | 558 | ||
549 | 559 | ||
560 | static void codeutfr (CompileState *compst, TTree *tree) { | ||
561 | int i = addoffsetinst(compst, IUTFR); | ||
562 | int to = sib1(tree)->u.n; | ||
563 | assert(sib1(tree)->tag == TXInfo); | ||
564 | getinstr(compst, i + 1).offset = tree->u.n; | ||
565 | getinstr(compst, i).i.aux = to & 0xff; | ||
566 | getinstr(compst, i).i.key = to >> 8; | ||
567 | } | ||
568 | |||
569 | |||
550 | /* | 570 | /* |
551 | ** Add a capture instruction: | 571 | ** Add a capture instruction: |
552 | ** 'op' is the capture instruction; 'cap' the capture kind; | 572 | ** 'op' is the capture instruction; 'cap' the capture kind; |
@@ -694,11 +714,11 @@ static void codebehind (CompileState *compst, TTree *tree) { | |||
694 | 714 | ||
695 | /* | 715 | /* |
696 | ** Choice; optimizations: | 716 | ** Choice; optimizations: |
697 | ** - when p1 is headfail or | 717 | ** - when p1 is headfail or when first(p1) and first(p2) are disjoint, |
698 | ** when first(p1) and first(p2) are disjoint, than | 718 | ** than a character not in first(p1) cannot go to p1 and a character |
699 | ** a character not in first(p1) cannot go to p1, and a character | 719 | ** in first(p1) cannot go to p2, either because p1 will accept |
700 | ** in first(p1) cannot go to p2 (at it is not in first(p2)). | 720 | ** (headfail) or because it is not in first(p2) (disjoint). |
701 | ** (The optimization is not valid if p1 accepts the empty string, | 721 | ** (The second case is not valid if p1 accepts the empty string, |
702 | ** as then there is no character at all...) | 722 | ** as then there is no character at all...) |
703 | ** - when p2 is empty and opt is true; a IPartialCommit can reuse | 723 | ** - when p2 is empty and opt is true; a IPartialCommit can reuse |
704 | ** the Choice already active in the stack. | 724 | ** the Choice already active in the stack. |
@@ -715,7 +735,7 @@ static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt, | |||
715 | int jmp = NOINST; | 735 | int jmp = NOINST; |
716 | codegen(compst, p1, 0, test, fl); | 736 | codegen(compst, p1, 0, test, fl); |
717 | if (!emptyp2) | 737 | if (!emptyp2) |
718 | jmp = addoffsetinst(compst, IJmp); | 738 | jmp = addoffsetinst(compst, IJmp); |
719 | jumptohere(compst, test); | 739 | jumptohere(compst, test); |
720 | codegen(compst, p2, opt, NOINST, fl); | 740 | codegen(compst, p2, opt, NOINST, fl); |
721 | jumptohere(compst, jmp); | 741 | jumptohere(compst, jmp); |
@@ -726,7 +746,7 @@ static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt, | |||
726 | codegen(compst, p1, 1, NOINST, fullset); | 746 | codegen(compst, p1, 1, NOINST, fullset); |
727 | } | 747 | } |
728 | else { | 748 | else { |
729 | /* <p1 / p2> == | 749 | /* <p1 / p2> == |
730 | test(first(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: */ | 750 | test(first(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: */ |
731 | int pcommit; | 751 | int pcommit; |
732 | int test = codetestset(compst, &cs1, e1); | 752 | int test = codetestset(compst, &cs1, e1); |
@@ -887,7 +907,7 @@ static void correctcalls (CompileState *compst, int *positions, | |||
887 | else | 907 | else |
888 | code[i].i.code = ICall; | 908 | code[i].i.code = ICall; |
889 | jumptothere(compst, i, rule); /* call jumps to respective rule */ | 909 | jumptothere(compst, i, rule); /* call jumps to respective rule */ |
890 | } else if (code[i].i.code == IThrowRec) { | 910 | } else if (code[i].i.code == IThrowRec) { /* labeled failure */ |
891 | int n = code[i].i.key; /* rule number */ | 911 | int n = code[i].i.key; /* rule number */ |
892 | int rule = positions[n]; /* rule position */ | 912 | int rule = positions[n]; /* rule position */ |
893 | assert(rule == from || code[rule - 1].i.code == IRet); | 913 | assert(rule == from || code[rule - 1].i.code == IRet); |
@@ -912,8 +932,10 @@ static void codegrammar (CompileState *compst, TTree *grammar) { | |||
912 | int start = gethere(compst); /* here starts the initial rule */ | 932 | int start = gethere(compst); /* here starts the initial rule */ |
913 | jumptohere(compst, firstcall); | 933 | jumptohere(compst, firstcall); |
914 | for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) { | 934 | for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) { |
935 | TTree *r = sib1(rule); | ||
936 | assert(r->tag == TXInfo); | ||
915 | positions[rulenumber++] = gethere(compst); /* save rule position */ | 937 | positions[rulenumber++] = gethere(compst); /* save rule position */ |
916 | codegen(compst, sib1(rule), 0, NOINST, fullset); /* code rule */ | 938 | codegen(compst, sib1(r), 0, NOINST, fullset); /* code rule */ |
917 | addinstruction(compst, IRet, 0); | 939 | addinstruction(compst, IRet, 0); |
918 | } | 940 | } |
919 | assert(rule->tag == TTrue); | 941 | assert(rule->tag == TTrue); |
@@ -924,8 +946,8 @@ static void codegrammar (CompileState *compst, TTree *grammar) { | |||
924 | 946 | ||
925 | static void codecall (CompileState *compst, TTree *call) { | 947 | static void codecall (CompileState *compst, TTree *call) { |
926 | int c = addoffsetinst(compst, IOpenCall); /* to be corrected later */ | 948 | int c = addoffsetinst(compst, IOpenCall); /* to be corrected later */ |
927 | getinstr(compst, c).i.key = sib2(call)->cap; /* rule number */ | 949 | assert(sib1(sib2(call))->tag == TXInfo); |
928 | assert(sib2(call)->tag == TRule); | 950 | getinstr(compst, c).i.key = sib1(sib2(call))->u.n; /* rule number */ |
929 | } | 951 | } |
930 | 952 | ||
931 | 953 | ||
@@ -963,6 +985,7 @@ static void codegen (CompileState *compst, TTree *tree, int opt, int tt, | |||
963 | case TSet: codecharset(compst, treebuffer(tree), tt); break; | 985 | case TSet: codecharset(compst, treebuffer(tree), tt); break; |
964 | case TTrue: break; | 986 | case TTrue: break; |
965 | case TFalse: addinstruction(compst, IFail, 0); break; | 987 | case TFalse: addinstruction(compst, IFail, 0); break; |
988 | case TUTFR: codeutfr(compst, tree); break; | ||
966 | case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break; | 989 | case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break; |
967 | case TRep: coderep(compst, sib1(tree), opt, fl); break; | 990 | case TRep: coderep(compst, sib1(tree), opt, fl); break; |
968 | case TBehind: codebehind(compst, tree); break; | 991 | case TBehind: codebehind(compst, tree); break; |
@@ -1013,7 +1036,7 @@ static void peephole (CompileState *compst) { | |||
1013 | case IRet: case IFail: case IFailTwice: | 1036 | case IRet: case IFail: case IFailTwice: |
1014 | case IEnd: { /* instructions with unconditional implicit jumps */ | 1037 | case IEnd: { /* instructions with unconditional implicit jumps */ |
1015 | code[i] = code[ft]; /* jump becomes that instruction */ | 1038 | code[i] = code[ft]; /* jump becomes that instruction */ |
1016 | code[i + 1].i.code = IAny; /* 'no-op' for target position */ | 1039 | code[i + 1].i.code = IEmpty; /* 'no-op' for target position */ |
1017 | break; | 1040 | break; |
1018 | } | 1041 | } |
1019 | case ICommit: case IPartialCommit: | 1042 | case ICommit: case IPartialCommit: |
@@ -56,21 +56,26 @@ void printinst (const Instruction *op, const Instruction *p) { | |||
56 | const char *const names[] = { | 56 | const char *const names[] = { |
57 | "any", "char", "set", | 57 | "any", "char", "set", |
58 | "testany", "testchar", "testset", | 58 | "testany", "testchar", "testset", |
59 | "span", "behind", | 59 | "span", "utf-range", "behind", |
60 | "ret", "end", | 60 | "ret", "end", |
61 | "choice", "pred_choice", "jmp", "call", "open_call", /* labeled failure */ | 61 | "choice", "pred_choice", "jmp", "call", "open_call", /* labeled failure */ |
62 | "commit", "partial_commit", "back_commit", "failtwice", "fail", "giveup", | 62 | "commit", "partial_commit", "back_commit", "failtwice", "fail", "giveup", |
63 | "fullcapture", "opencapture", "closecapture", "closeruntime", | 63 | "fullcapture", "opencapture", "closecapture", "closeruntime", |
64 | "throw", "throw_rec", /* labeled failure */ | 64 | "throw", "throw_rec", /* labeled failure */ |
65 | "--" | ||
65 | }; | 66 | }; |
66 | printf("%02ld: %s ", (long)(p - op), names[p->i.code]); | 67 | printf("%02ld: %s ", (long)(p - op), names[p->i.code]); |
67 | switch ((Opcode)p->i.code) { | 68 | switch ((Opcode)p->i.code) { |
68 | case IChar: { | 69 | case IChar: { |
69 | printf("'%c'", p->i.aux); | 70 | printf("'%c' (%02x)", p->i.aux, p->i.aux); |
70 | break; | 71 | break; |
71 | } | 72 | } |
72 | case ITestChar: { | 73 | case ITestChar: { |
73 | printf("'%c'", p->i.aux); printjmp(op, p); | 74 | printf("'%c' (%02x)", p->i.aux, p->i.aux); printjmp(op, p); |
75 | break; | ||
76 | } | ||
77 | case IUTFR: { | ||
78 | printf("%d - %d", p[1].offset, utf_to(p)); | ||
74 | break; | 79 | break; |
75 | } | 80 | } |
76 | case IFullCapture: { | 81 | case IFullCapture: { |
@@ -157,11 +162,11 @@ void printcaplist (Capture *cap, Capture *limit) { | |||
157 | 162 | ||
158 | static const char *tagnames[] = { | 163 | static const char *tagnames[] = { |
159 | "char", "set", "any", | 164 | "char", "set", "any", |
160 | "true", "false", | 165 | "true", "false", "utf8.range", |
161 | "rep", | 166 | "rep", |
162 | "seq", "choice", | 167 | "seq", "choice", |
163 | "not", "and", | 168 | "not", "and", |
164 | "call", "opencall", "rule", "grammar", | 169 | "call", "opencall", "rule", "xinfo", "grammar", |
165 | "behind", | 170 | "behind", |
166 | "capture", "run-time", | 171 | "capture", "run-time", |
167 | "throw" /* labeled failure */ | 172 | "throw" /* labeled failure */ |
@@ -170,6 +175,7 @@ static const char *tagnames[] = { | |||
170 | 175 | ||
171 | void printtree (TTree *tree, int ident) { | 176 | void printtree (TTree *tree, int ident) { |
172 | int i; | 177 | int i; |
178 | int sibs = numsiblings[tree->tag]; | ||
173 | for (i = 0; i < ident; i++) printf(" "); | 179 | for (i = 0; i < ident; i++) printf(" "); |
174 | printf("%s", tagnames[tree->tag]); | 180 | printf("%s", tagnames[tree->tag]); |
175 | switch (tree->tag) { | 181 | switch (tree->tag) { |
@@ -186,25 +192,34 @@ void printtree (TTree *tree, int ident) { | |||
186 | printf("\n"); | 192 | printf("\n"); |
187 | break; | 193 | break; |
188 | } | 194 | } |
195 | case TUTFR: { | ||
196 | assert(sib1(tree)->tag == TXInfo); | ||
197 | printf(" %d (%02x %d) - %d (%02x %d) \n", | ||
198 | tree->u.n, tree->key, tree->cap, | ||
199 | sib1(tree)->u.n, sib1(tree)->key, sib1(tree)->cap); | ||
200 | break; | ||
201 | } | ||
189 | case TOpenCall: case TCall: { | 202 | case TOpenCall: case TCall: { |
190 | assert(sib2(tree)->tag == TRule); | 203 | assert(sib1(sib2(tree))->tag == TXInfo); |
191 | printf(" key: %d (rule: %d)\n", tree->key, sib2(tree)->cap); | 204 | printf(" key: %d (rule: %d)\n", tree->key, sib1(sib2(tree))->u.n); |
192 | break; | 205 | break; |
193 | } | 206 | } |
194 | case TBehind: { | 207 | case TBehind: { |
195 | printf(" %d\n", tree->u.n); | 208 | printf(" %d\n", tree->u.n); |
196 | printtree(sib1(tree), ident + 2); | ||
197 | break; | 209 | break; |
198 | } | 210 | } |
199 | case TCapture: { | 211 | case TCapture: { |
200 | printf(" kind: '%s' key: %d\n", capkind(tree->cap), tree->key); | 212 | printf(" kind: '%s' key: %d\n", capkind(tree->cap), tree->key); |
201 | printtree(sib1(tree), ident + 2); | ||
202 | break; | 213 | break; |
203 | } | 214 | } |
204 | case TRule: { | 215 | case TRule: { |
205 | printf(" n: %d key: %d\n", tree->cap, tree->key); | 216 | printf(" key: %d\n", tree->key); |
206 | printtree(sib1(tree), ident + 2); | 217 | sibs = 1; /* do not print 'sib2' (next rule) as a sibling */ |
207 | break; /* do not print next rule as a sibling */ | 218 | break; |
219 | } | ||
220 | case TXInfo: { | ||
221 | printf(" n: %d\n", tree->u.n); | ||
222 | break; | ||
208 | } | 223 | } |
209 | case TGrammar: { | 224 | case TGrammar: { |
210 | TTree *rule = sib1(tree); | 225 | TTree *rule = sib1(tree); |
@@ -214,6 +229,7 @@ void printtree (TTree *tree, int ident) { | |||
214 | rule = sib2(rule); | 229 | rule = sib2(rule); |
215 | } | 230 | } |
216 | assert(rule->tag == TTrue); /* sentinel */ | 231 | assert(rule->tag == TTrue); /* sentinel */ |
232 | sibs = 0; /* siblings already handled */ | ||
217 | break; | 233 | break; |
218 | } | 234 | } |
219 | case TThrow: { /* labeled failure */ | 235 | case TThrow: { /* labeled failure */ |
@@ -222,16 +238,14 @@ void printtree (TTree *tree, int ident) { | |||
222 | printf(" key: %d (rule: %d)\n", tree->key, sib2(tree)->cap); | 238 | printf(" key: %d (rule: %d)\n", tree->key, sib2(tree)->cap); |
223 | break; | 239 | break; |
224 | } | 240 | } |
225 | default: { | 241 | default: |
226 | int sibs = numsiblings[tree->tag]; | ||
227 | printf("\n"); | 242 | printf("\n"); |
228 | if (sibs >= 1) { | ||
229 | printtree(sib1(tree), ident + 2); | ||
230 | if (sibs >= 2) | ||
231 | printtree(sib2(tree), ident + 2); | ||
232 | } | ||
233 | break; | 243 | break; |
234 | } | 244 | } |
245 | if (sibs >= 1) { | ||
246 | printtree(sib1(tree), ident + 2); | ||
247 | if (sibs >= 2) | ||
248 | printtree(sib2(tree), ident + 2); | ||
235 | } | 249 | } |
236 | } | 250 | } |
237 | 251 | ||
@@ -21,11 +21,11 @@ | |||
21 | /* number of siblings for each tree */ | 21 | /* number of siblings for each tree */ |
22 | const byte numsiblings[] = { | 22 | const byte numsiblings[] = { |
23 | 0, 0, 0, /* char, set, any */ | 23 | 0, 0, 0, /* char, set, any */ |
24 | 0, 0, /* true, false */ | 24 | 0, 0, 0, /* true, false, utf-range */ |
25 | 1, /* rep */ | 25 | 1, /* rep */ |
26 | 2, 2, /* seq, choice */ | 26 | 2, 2, /* seq, choice */ |
27 | 1, 1, /* not, and */ | 27 | 1, 1, /* not, and */ |
28 | 0, 0, 2, 1, /* call, opencall, rule, grammar */ | 28 | 0, 0, 2, 1, 1, /* call, opencall, rule, prerule, grammar */ |
29 | 1, /* behind */ | 29 | 1, /* behind */ |
30 | 1, 1, /* capture, runtime capture */ | 30 | 1, 1, /* capture, runtime capture */ |
31 | 0 /* labeled failure throw */ | 31 | 0 /* labeled failure throw */ |
@@ -58,7 +58,7 @@ static void fixonecall (lua_State *L, int postable, TTree *g, TTree *t, byte tag | |||
58 | lua_gettable(L, postable); /* query name in position table */ | 58 | lua_gettable(L, postable); /* query name in position table */ |
59 | n = lua_tonumber(L, -1); /* get (absolute) position */ | 59 | n = lua_tonumber(L, -1); /* get (absolute) position */ |
60 | lua_pop(L, 1); /* remove position */ | 60 | lua_pop(L, 1); /* remove position */ |
61 | if (tag == TOpenCall) { | 61 | if (tag == TOpenCall) { /* labeled failure */ |
62 | if (n == 0) { /* no position? */ | 62 | if (n == 0) { /* no position? */ |
63 | lua_rawgeti(L, -1, t->key); /* get rule's name again */ | 63 | lua_rawgeti(L, -1, t->key); /* get rule's name again */ |
64 | luaL_error(L, "rule '%s' undefined in given grammar", val2str(L, -1)); | 64 | luaL_error(L, "rule '%s' undefined in given grammar", val2str(L, -1)); |
@@ -109,7 +109,7 @@ static void finalfix (lua_State *L, int postable, TTree *g, TTree *t) { | |||
109 | return; | 109 | return; |
110 | case TOpenCall: { | 110 | case TOpenCall: { |
111 | if (g != NULL) /* inside a grammar? */ | 111 | if (g != NULL) /* inside a grammar? */ |
112 | fixonecall(L, postable, g, t, TOpenCall); | 112 | fixonecall(L, postable, g, t, TOpenCall); /* labeled failure */ |
113 | else { /* open call outside grammar */ | 113 | else { /* open call outside grammar */ |
114 | lua_rawgeti(L, -1, t->key); | 114 | lua_rawgeti(L, -1, t->key); |
115 | luaL_error(L, "rule '%s' used outside a grammar", val2str(L, -1)); | 115 | luaL_error(L, "rule '%s' used outside a grammar", val2str(L, -1)); |
@@ -695,6 +695,56 @@ static int lp_range (lua_State *L) { | |||
695 | 695 | ||
696 | 696 | ||
697 | /* | 697 | /* |
698 | ** Fills a tree node with basic information about the UTF-8 code point | ||
699 | ** 'cpu': its value in 'n', its length in 'cap', and its first byte in | ||
700 | ** 'key' | ||
701 | */ | ||
702 | static void codeutftree (lua_State *L, TTree *t, lua_Unsigned cpu, int arg) { | ||
703 | int len, fb, cp; | ||
704 | cp = (int)cpu; | ||
705 | if (cp <= 0x7f) { /* one byte? */ | ||
706 | len = 1; | ||
707 | fb = cp; | ||
708 | } else if (cp <= 0x7ff) { | ||
709 | len = 2; | ||
710 | fb = 0xC0 | (cp >> 6); | ||
711 | } else if (cp <= 0xffff) { | ||
712 | len = 3; | ||
713 | fb = 0xE0 | (cp >> 12); | ||
714 | } | ||
715 | else { | ||
716 | luaL_argcheck(L, cpu <= 0x10ffffu, arg, "invalid code point"); | ||
717 | len = 4; | ||
718 | fb = 0xF0 | (cp >> 18); | ||
719 | } | ||
720 | t->u.n = cp; | ||
721 | t->cap = len; | ||
722 | t->key = fb; | ||
723 | } | ||
724 | |||
725 | |||
726 | static int lp_utfr (lua_State *L) { | ||
727 | lua_Unsigned from = (lua_Unsigned)luaL_checkinteger(L, 1); | ||
728 | lua_Unsigned to = (lua_Unsigned)luaL_checkinteger(L, 2); | ||
729 | luaL_argcheck(L, from <= to, 2, "empty range"); | ||
730 | if (to <= 0x7f) { /* ascii range? */ | ||
731 | TTree *tree = newcharset(L); /* code it as a regular charset */ | ||
732 | unsigned int f; | ||
733 | for (f = (int)from; f <= to; f++) | ||
734 | setchar(treebuffer(tree), f); | ||
735 | } | ||
736 | else { /* multi-byte utf-8 range */ | ||
737 | TTree *tree = newtree(L, 2); | ||
738 | tree->tag = TUTFR; | ||
739 | codeutftree(L, tree, from, 1); | ||
740 | sib1(tree)->tag = TXInfo; | ||
741 | codeutftree(L, sib1(tree), to, 2); | ||
742 | } | ||
743 | return 1; | ||
744 | } | ||
745 | |||
746 | |||
747 | /* | ||
698 | ** Look-behind predicate | 748 | ** Look-behind predicate |
699 | */ | 749 | */ |
700 | static int lp_behind (lua_State *L) { | 750 | static int lp_behind (lua_State *L) { |
@@ -940,7 +990,7 @@ static int collectrules (lua_State *L, int arg, int *totalsize) { | |||
940 | int size; /* accumulator for total size */ | 990 | int size; /* accumulator for total size */ |
941 | lua_newtable(L); /* create position table */ | 991 | lua_newtable(L); /* create position table */ |
942 | getfirstrule(L, arg, postab); | 992 | getfirstrule(L, arg, postab); |
943 | size = 2 + getsize(L, postab + 2); /* TGrammar + TRule + rule */ | 993 | size = 3 + getsize(L, postab + 2); /* TGrammar + TRule + TXInfo + rule */ |
944 | lua_pushnil(L); /* prepare to traverse grammar table */ | 994 | lua_pushnil(L); /* prepare to traverse grammar table */ |
945 | while (lua_next(L, arg) != 0) { | 995 | while (lua_next(L, arg) != 0) { |
946 | if (lua_tonumber(L, -2) == 1 || | 996 | if (lua_tonumber(L, -2) == 1 || |
@@ -954,11 +1004,11 @@ static int collectrules (lua_State *L, int arg, int *totalsize) { | |||
954 | lua_pushvalue(L, -2); /* push key (to insert into position table) */ | 1004 | lua_pushvalue(L, -2); /* push key (to insert into position table) */ |
955 | lua_pushinteger(L, size); | 1005 | lua_pushinteger(L, size); |
956 | lua_settable(L, postab); | 1006 | lua_settable(L, postab); |
957 | size += 1 + getsize(L, -1); /* update size */ | 1007 | size += 2 + getsize(L, -1); /* add 'TRule + TXInfo + rule' to size */ |
958 | lua_pushvalue(L, -2); /* push key (for next lua_next) */ | 1008 | lua_pushvalue(L, -2); /* push key (for next lua_next) */ |
959 | n++; | 1009 | n++; |
960 | } | 1010 | } |
961 | *totalsize = size + 1; /* TTrue to finish list of rules */ | 1011 | *totalsize = size + 1; /* space for 'TTrue' finishing list of rules */ |
962 | return n; | 1012 | return n; |
963 | } | 1013 | } |
964 | 1014 | ||
@@ -970,11 +1020,13 @@ static void buildgrammar (lua_State *L, TTree *grammar, int frule, int n) { | |||
970 | int ridx = frule + 2*i + 1; /* index of i-th rule */ | 1020 | int ridx = frule + 2*i + 1; /* index of i-th rule */ |
971 | int rulesize; | 1021 | int rulesize; |
972 | TTree *rn = gettree(L, ridx, &rulesize); | 1022 | TTree *rn = gettree(L, ridx, &rulesize); |
1023 | TTree *pr = sib1(nd); /* points to rule's prerule */ | ||
973 | nd->tag = TRule; | 1024 | nd->tag = TRule; |
974 | nd->key = 0; /* will be fixed when rule is used */ | 1025 | nd->key = 0; /* will be fixed when rule is used */ |
975 | nd->cap = i; /* rule number */ | 1026 | pr->tag = TXInfo; |
976 | nd->u.ps = rulesize + 1; /* point to next rule */ | 1027 | pr->u.n = i; /* rule number */ |
977 | memcpy(sib1(nd), rn, rulesize * sizeof(TTree)); /* copy rule */ | 1028 | nd->u.ps = rulesize + 2; /* point to next rule */ |
1029 | memcpy(sib1(pr), rn, rulesize * sizeof(TTree)); /* copy rule */ | ||
978 | mergektable(L, ridx, sib1(nd)); /* merge its ktable into new one */ | 1030 | mergektable(L, ridx, sib1(nd)); /* merge its ktable into new one */ |
979 | nd = sib2(nd); /* move to next rule */ | 1031 | nd = sib2(nd); /* move to next rule */ |
980 | } | 1032 | } |
@@ -1010,7 +1062,7 @@ static int checkloops (TTree *tree) { | |||
1010 | ** twice in 'passed', there is path from it back to itself without | 1062 | ** twice in 'passed', there is path from it back to itself without |
1011 | ** advancing the subject. | 1063 | ** advancing the subject. |
1012 | */ | 1064 | */ |
1013 | static int verifyerror (lua_State *L, int *passed, int npassed) { | 1065 | static int verifyerror (lua_State *L, unsigned short *passed, int npassed) { |
1014 | int i, j; | 1066 | int i, j; |
1015 | for (i = npassed - 1; i >= 0; i--) { /* search for a repetition */ | 1067 | for (i = npassed - 1; i >= 0; i--) { /* search for a repetition */ |
1016 | for (j = i - 1; j >= 0; j--) { | 1068 | for (j = i - 1; j >= 0; j--) { |
@@ -1035,12 +1087,13 @@ static int verifyerror (lua_State *L, int *passed, int npassed) { | |||
1035 | ** counts the elements in 'passed'. | 1087 | ** counts the elements in 'passed'. |
1036 | ** Assume ktable at the top of the stack. | 1088 | ** Assume ktable at the top of the stack. |
1037 | */ | 1089 | */ |
1038 | static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed, | 1090 | static int verifyrule (lua_State *L, TTree *tree, unsigned short *passed, |
1039 | int nb) { | 1091 | int npassed, int nb) { |
1040 | tailcall: | 1092 | tailcall: |
1041 | switch (tree->tag) { | 1093 | switch (tree->tag) { |
1042 | case TChar: case TSet: case TAny: | 1094 | case TChar: case TSet: case TAny: |
1043 | case TFalse: case TThrow: /* labeled failure */ | 1095 | case TFalse: case TUTFR: |
1096 | case TThrow: /* labeled failure */ | ||
1044 | return nb; /* cannot pass from here */ | 1097 | return nb; /* cannot pass from here */ |
1045 | case TTrue: | 1098 | case TTrue: |
1046 | case TBehind: /* look-behind cannot have calls */ | 1099 | case TBehind: /* look-behind cannot have calls */ |
@@ -1048,7 +1101,7 @@ static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed, | |||
1048 | case TNot: case TAnd: case TRep: | 1101 | case TNot: case TAnd: case TRep: |
1049 | /* return verifyrule(L, sib1(tree), passed, npassed, 1); */ | 1102 | /* return verifyrule(L, sib1(tree), passed, npassed, 1); */ |
1050 | tree = sib1(tree); nb = 1; goto tailcall; | 1103 | tree = sib1(tree); nb = 1; goto tailcall; |
1051 | case TCapture: case TRunTime: | 1104 | case TCapture: case TRunTime: case TXInfo: |
1052 | /* return verifyrule(L, sib1(tree), passed, npassed, nb); */ | 1105 | /* return verifyrule(L, sib1(tree), passed, npassed, nb); */ |
1053 | tree = sib1(tree); goto tailcall; | 1106 | tree = sib1(tree); goto tailcall; |
1054 | case TCall: | 1107 | case TCall: |
@@ -1059,15 +1112,15 @@ static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed, | |||
1059 | return nb; | 1112 | return nb; |
1060 | /* else return verifyrule(L, sib2(tree), passed, npassed, nb); */ | 1113 | /* else return verifyrule(L, sib2(tree), passed, npassed, nb); */ |
1061 | tree = sib2(tree); goto tailcall; | 1114 | tree = sib2(tree); goto tailcall; |
1062 | case TChoice: /* must check both children */ | 1115 | case TChoice: /* must check both children */ |
1063 | nb = verifyrule(L, sib1(tree), passed, npassed, nb); | 1116 | nb = verifyrule(L, sib1(tree), passed, npassed, nb); |
1064 | /* return verifyrule(L, sib2(tree), passed, npassed, nb); */ | 1117 | /* return verifyrule(L, sib2(tree), passed, npassed, nb); */ |
1065 | tree = sib2(tree); goto tailcall; | 1118 | tree = sib2(tree); goto tailcall; |
1066 | case TRule: | 1119 | case TRule: |
1067 | if (npassed >= MAXRULES) | 1120 | if (npassed >= MAXRULES) /* too many steps? */ |
1068 | return verifyerror(L, passed, npassed); | 1121 | return verifyerror(L, passed, npassed); /* error */ |
1069 | else { | 1122 | else { |
1070 | passed[npassed++] = tree->key; | 1123 | passed[npassed++] = tree->key; /* add rule to path */ |
1071 | /* return verifyrule(L, sib1(tree), passed, npassed); */ | 1124 | /* return verifyrule(L, sib1(tree), passed, npassed); */ |
1072 | tree = sib1(tree); goto tailcall; | 1125 | tree = sib1(tree); goto tailcall; |
1073 | } | 1126 | } |
@@ -1079,7 +1132,7 @@ static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed, | |||
1079 | 1132 | ||
1080 | 1133 | ||
1081 | static void verifygrammar (lua_State *L, TTree *grammar) { | 1134 | static void verifygrammar (lua_State *L, TTree *grammar) { |
1082 | int passed[MAXRULES]; | 1135 | unsigned short passed[MAXRULES]; |
1083 | TTree *rule; | 1136 | TTree *rule; |
1084 | /* check left-recursive rules */ | 1137 | /* check left-recursive rules */ |
1085 | for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) { | 1138 | for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) { |
@@ -1243,12 +1296,6 @@ static int lp_setmax (lua_State *L) { | |||
1243 | } | 1296 | } |
1244 | 1297 | ||
1245 | 1298 | ||
1246 | static int lp_version (lua_State *L) { | ||
1247 | lua_pushstring(L, VERSION); | ||
1248 | return 1; | ||
1249 | } | ||
1250 | |||
1251 | |||
1252 | static int lp_type (lua_State *L) { | 1299 | static int lp_type (lua_State *L) { |
1253 | if (testpattern(L, 1)) | 1300 | if (testpattern(L, 1)) |
1254 | lua_pushliteral(L, "pattern"); | 1301 | lua_pushliteral(L, "pattern"); |
@@ -1317,8 +1364,9 @@ static struct luaL_Reg pattreg[] = { | |||
1317 | {"P", lp_P}, | 1364 | {"P", lp_P}, |
1318 | {"S", lp_set}, | 1365 | {"S", lp_set}, |
1319 | {"R", lp_range}, | 1366 | {"R", lp_range}, |
1367 | {"utfR", lp_utfr}, | ||
1320 | {"locale", lp_locale}, | 1368 | {"locale", lp_locale}, |
1321 | {"version", lp_version}, | 1369 | {"version", NULL}, |
1322 | {"setmaxstack", lp_setmax}, | 1370 | {"setmaxstack", lp_setmax}, |
1323 | {"type", lp_type}, | 1371 | {"type", lp_type}, |
1324 | {"T", lp_throw}, /* labeled failure throw */ | 1372 | {"T", lp_throw}, /* labeled failure throw */ |
@@ -1347,6 +1395,8 @@ int luaopen_lpeglabel (lua_State *L) { /* labeled failure */ | |||
1347 | luaL_newlib(L, pattreg); | 1395 | luaL_newlib(L, pattreg); |
1348 | lua_pushvalue(L, -1); | 1396 | lua_pushvalue(L, -1); |
1349 | lua_setfield(L, -3, "__index"); | 1397 | lua_setfield(L, -3, "__index"); |
1398 | lua_pushliteral(L, "LPegLabel " VERSION); /* labeled failure */ | ||
1399 | lua_setfield(L, -2, "version"); | ||
1350 | return 1; | 1400 | return 1; |
1351 | } | 1401 | } |
1352 | 1402 | ||
@@ -18,6 +18,9 @@ typedef enum TTag { | |||
18 | TAny, | 18 | TAny, |
19 | TTrue, | 19 | TTrue, |
20 | TFalse, | 20 | TFalse, |
21 | TUTFR, /* range of UTF-8 codepoints; 'n' has initial codepoint; | ||
22 | 'cap' has length; 'key' has first byte; | ||
23 | extra info is similar for end codepoint */ | ||
21 | TRep, /* 'sib1'* */ | 24 | TRep, /* 'sib1'* */ |
22 | TSeq, /* 'sib1' 'sib2' */ | 25 | TSeq, /* 'sib1' 'sib2' */ |
23 | TChoice, /* 'sib1' / 'sib2' */ | 26 | TChoice, /* 'sib1' / 'sib2' */ |
@@ -26,8 +29,9 @@ typedef enum TTag { | |||
26 | TCall, /* ktable[key] is rule's key; 'sib2' is rule being called */ | 29 | TCall, /* ktable[key] is rule's key; 'sib2' is rule being called */ |
27 | TOpenCall, /* ktable[key] is rule's key */ | 30 | TOpenCall, /* ktable[key] is rule's key */ |
28 | TRule, /* ktable[key] is rule's key (but key == 0 for unused rules); | 31 | TRule, /* ktable[key] is rule's key (but key == 0 for unused rules); |
29 | 'sib1' is rule's pattern; | 32 | 'sib1' is rule's pattern pre-rule; 'sib2' is next rule; |
30 | 'sib2' is next rule; 'cap' is rule's sequential number */ | 33 | extra info 'n' is rule's sequential number */ |
34 | TXInfo, /* extra info */ | ||
31 | TGrammar, /* 'sib1' is initial (and first) rule */ | 35 | TGrammar, /* 'sib1' is initial (and first) rule */ |
32 | TBehind, /* 'sib1' is pattern, 'n' is how much to go back */ | 36 | TBehind, /* 'sib1' is pattern, 'n' is how much to go back */ |
33 | TCapture, /* captures: 'cap' is kind of capture (enum 'CapKind'); | 37 | TCapture, /* captures: 'cap' is kind of capture (enum 'CapKind'); |
@@ -36,6 +40,7 @@ typedef enum TTag { | |||
36 | TRunTime, /* run-time capture: 'key' is Lua function; | 40 | TRunTime, /* run-time capture: 'key' is Lua function; |
37 | 'sib1' is capture body */ | 41 | 'sib1' is capture body */ |
38 | TThrow, /* labeled failure: ktable[key] is label's name */ | 42 | TThrow, /* labeled failure: ktable[key] is label's name */ |
43 | |||
39 | } TTag; | 44 | } TTag; |
40 | 45 | ||
41 | 46 | ||
@@ -50,8 +55,8 @@ typedef struct TTree { | |||
50 | byte cap; /* kind of capture (if it is a capture) */ | 55 | byte cap; /* kind of capture (if it is a capture) */ |
51 | unsigned short key; /* key in ktable for Lua data (0 if no key) */ | 56 | unsigned short key; /* key in ktable for Lua data (0 if no key) */ |
52 | union { | 57 | union { |
53 | int n; /* occasional counter */ | ||
54 | int ps; /* occasional second child */ | 58 | int ps; /* occasional second child */ |
59 | int n; /* occasional counter */ | ||
55 | } u; | 60 | } u; |
56 | } TTree; | 61 | } TTree; |
57 | 62 | ||
@@ -15,7 +15,7 @@ | |||
15 | #include "lua.h" | 15 | #include "lua.h" |
16 | 16 | ||
17 | 17 | ||
18 | #define VERSION "1.5.2" | 18 | #define VERSION "1.6.0" |
19 | 19 | ||
20 | 20 | ||
21 | #define PATTERN_T "lpeg-pattern" | 21 | #define PATTERN_T "lpeg-pattern" |
@@ -37,6 +37,8 @@ | |||
37 | #define luaL_setfuncs(L,f,n) luaL_register(L,NULL,f) | 37 | #define luaL_setfuncs(L,f,n) luaL_register(L,NULL,f) |
38 | #define luaL_newlib(L,f) luaL_register(L,"lpeg",f) | 38 | #define luaL_newlib(L,f) luaL_register(L,"lpeg",f) |
39 | 39 | ||
40 | typedef size_t lua_Unsigned; | ||
41 | |||
40 | #endif | 42 | #endif |
41 | 43 | ||
42 | 44 | ||
@@ -51,9 +53,9 @@ | |||
51 | #endif | 53 | #endif |
52 | 54 | ||
53 | 55 | ||
54 | /* maximum number of rules in a grammar (limited by 'unsigned char') */ | 56 | /* maximum number of rules in a grammar (limited by 'unsigned short') */ |
55 | #if !defined(MAXRULES) | 57 | #if !defined(MAXRULES) |
56 | #define MAXRULES UCHAR_MAX | 58 | #define MAXRULES 1000 |
57 | #endif | 59 | #endif |
58 | 60 | ||
59 | 61 | ||
@@ -18,16 +18,45 @@ | |||
18 | 18 | ||
19 | /* initial size for call/backtrack stack */ | 19 | /* initial size for call/backtrack stack */ |
20 | #if !defined(INITBACK) | 20 | #if !defined(INITBACK) |
21 | #define INITBACK MAXBACK | 21 | #define INITBACK MAXBACK |
22 | #endif | 22 | #endif |
23 | 23 | ||
24 | 24 | ||
25 | #define getoffset(p) (((p) + 1)->offset) | 25 | #define getoffset(p) (((p) + 1)->offset) |
26 | 26 | ||
27 | static const Instruction giveup = {{IGiveup, 0, 0}}; | 27 | static const Instruction giveup = {{IGiveup, 0, 0}}; |
28 | 28 | ||
29 | 29 | ||
30 | /* | 30 | /* |
31 | ** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid. | ||
32 | */ | ||
33 | static const char *utf8_decode (const char *o, int *val) { | ||
34 | static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFFu}; | ||
35 | const unsigned char *s = (const unsigned char *)o; | ||
36 | unsigned int c = s[0]; /* first byte */ | ||
37 | unsigned int res = 0; /* final result */ | ||
38 | if (c < 0x80) /* ascii? */ | ||
39 | res = c; | ||
40 | else { | ||
41 | int count = 0; /* to count number of continuation bytes */ | ||
42 | while (c & 0x40) { /* still have continuation bytes? */ | ||
43 | int cc = s[++count]; /* read next byte */ | ||
44 | if ((cc & 0xC0) != 0x80) /* not a continuation byte? */ | ||
45 | return NULL; /* invalid byte sequence */ | ||
46 | res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */ | ||
47 | c <<= 1; /* to test next bit */ | ||
48 | } | ||
49 | res |= (c & 0x7F) << (count * 5); /* add first byte */ | ||
50 | if (count > 3 || res > 0x10FFFFu || res <= limits[count]) | ||
51 | return NULL; /* invalid byte sequence */ | ||
52 | s += count; /* skip continuation bytes read */ | ||
53 | } | ||
54 | *val = res; | ||
55 | return (const char *)s + 1; /* +1 to include first byte */ | ||
56 | } | ||
57 | |||
58 | |||
59 | /* | ||
31 | ** {====================================================== | 60 | ** {====================================================== |
32 | ** Virtual Machine | 61 | ** Virtual Machine |
33 | ** ======================================================= | 62 | ** ======================================================= |
@@ -43,7 +72,7 @@ typedef struct Stack { | |||
43 | } Stack; | 72 | } Stack; |
44 | 73 | ||
45 | 74 | ||
46 | #define getstackbase(L, ptop) ((Stack *)lua_touserdata(L, stackidx(ptop))) | 75 | #define getstackbase(L, ptop) ((Stack *)lua_touserdata(L, stackidx(ptop))) |
47 | 76 | ||
48 | 77 | ||
49 | /* | 78 | /* |
@@ -207,6 +236,20 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e, | |||
207 | } | 236 | } |
208 | continue; | 237 | continue; |
209 | } | 238 | } |
239 | case IUTFR: { | ||
240 | int codepoint; | ||
241 | if (s >= e) | ||
242 | goto fail; | ||
243 | s = utf8_decode (s, &codepoint); | ||
244 | if (s && p[1].offset <= codepoint && codepoint <= utf_to(p)) | ||
245 | p += 2; | ||
246 | else { | ||
247 | *labelf = LFAIL; /* labeled failure */ | ||
248 | updatefarthest(*sfail, s); /*labeled failure */ | ||
249 | goto fail; | ||
250 | } | ||
251 | continue; | ||
252 | } | ||
210 | case ITestAny: { | 253 | case ITestAny: { |
211 | if (s < e) p += 2; | 254 | if (s < e) p += 2; |
212 | else p += getoffset(p); | 255 | else p += getoffset(p); |
@@ -301,8 +344,7 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e, | |||
301 | continue; | 344 | continue; |
302 | } | 345 | } |
303 | case ICommit: { | 346 | case ICommit: { |
304 | assert(stack > getstackbase(L, ptop)); | 347 | assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL); |
305 | assert((stack - 1)->s != NULL); | ||
306 | stack--; | 348 | stack--; |
307 | p += getoffset(p); | 349 | p += getoffset(p); |
308 | continue; | 350 | continue; |
@@ -318,6 +360,8 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e, | |||
318 | assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL); | 360 | assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL); |
319 | s = (--stack)->s; | 361 | s = (--stack)->s; |
320 | insidepred = stack->labenv; /* labeled failure */ | 362 | insidepred = stack->labenv; /* labeled failure */ |
363 | if (ndyncap > 0) /* are there matchtime captures? */ | ||
364 | ndyncap -= removedyncap(L, capture, stack->caplevel, captop); | ||
321 | captop = stack->caplevel; | 365 | captop = stack->caplevel; |
322 | p += getoffset(p); | 366 | p += getoffset(p); |
323 | continue; | 367 | continue; |
@@ -17,17 +17,18 @@ typedef enum Opcode { | |||
17 | ITestChar, /* if char != aux, jump to 'offset' */ | 17 | ITestChar, /* if char != aux, jump to 'offset' */ |
18 | ITestSet, /* if char not in buff, jump to 'offset' */ | 18 | ITestSet, /* if char not in buff, jump to 'offset' */ |
19 | ISpan, /* read a span of chars in buff */ | 19 | ISpan, /* read a span of chars in buff */ |
20 | IUTFR, /* if codepoint not in range [offset, utf_to], fail */ | ||
20 | IBehind, /* walk back 'aux' characters (fail if not possible) */ | 21 | IBehind, /* walk back 'aux' characters (fail if not possible) */ |
21 | IRet, /* return from a rule */ | 22 | IRet, /* return from a rule */ |
22 | IEnd, /* end of pattern */ | 23 | IEnd, /* end of pattern */ |
23 | IChoice, /* stack a choice; next fail will jump to 'offset' */ | 24 | IChoice, /* stack a choice; next fail will jump to 'offset' */ |
24 | IPredChoice, /* labeld failure: stack a choice; changes label env next fail will jump to 'offset' */ | 25 | IPredChoice, /* labeld failure: stack a choice; changes label env next fail will jump to 'offset' */ /*labeled failure */ |
25 | IJmp, /* jump to 'offset' */ | 26 | IJmp, /* jump to 'offset' */ |
26 | ICall, /* call rule at 'offset' */ | 27 | ICall, /* call rule at 'offset' */ |
27 | IOpenCall, /* call rule number 'key' (must be closed to a ICall) */ | 28 | IOpenCall, /* call rule number 'key' (must be closed to a ICall) */ |
28 | ICommit, /* pop choice and jump to 'offset' */ | 29 | ICommit, /* pop choice and jump to 'offset' */ |
29 | IPartialCommit, /* update top choice to current position and jump */ | 30 | IPartialCommit, /* update top choice to current position and jump */ |
30 | IBackCommit, /* "fails" but jump to its own 'offset' */ | 31 | IBackCommit, /* backtrack like "fail" but jump to its own 'offset' */ |
31 | IFailTwice, /* pop one choice and then fail */ | 32 | IFailTwice, /* pop one choice and then fail */ |
32 | IFail, /* go back to saved state on choice and jump to saved offset */ | 33 | IFail, /* go back to saved state on choice and jump to saved offset */ |
33 | IGiveup, /* internal use */ | 34 | IGiveup, /* internal use */ |
@@ -35,8 +36,9 @@ typedef enum Opcode { | |||
35 | IOpenCapture, /* start a capture */ | 36 | IOpenCapture, /* start a capture */ |
36 | ICloseCapture, | 37 | ICloseCapture, |
37 | ICloseRunTime, | 38 | ICloseRunTime, |
38 | IThrow, /* fails with a given label */ | 39 | IThrow, /* fails with a given label */ /*labeled failure */ |
39 | IThrowRec, /* fails with a given label and call rule at 'offset' */ | 40 | IThrowRec, /* fails with a given label and call rule at 'offset' */ /*labeled failure */ |
41 | IEmpty /* to fill empty slots left by optimizations */ | ||
40 | } Opcode; | 42 | } Opcode; |
41 | 43 | ||
42 | 44 | ||
@@ -52,10 +54,13 @@ typedef union Instruction { | |||
52 | } Instruction; | 54 | } Instruction; |
53 | 55 | ||
54 | 56 | ||
57 | /* extract 24-bit value from an instruction */ | ||
58 | #define utf_to(inst) (((inst)->i.key << 8) | (inst)->i.aux) | ||
59 | |||
60 | |||
55 | void printpatt (Instruction *p, int n); | 61 | void printpatt (Instruction *p, int n); |
56 | const char *match (lua_State *L, const char *o, const char *s, const char *e, | 62 | const char *match (lua_State *L, const char *o, const char *s, const char *e, |
57 | Instruction *op, Capture *capture, int ptop, short *labelf, const char **sfail); /* labeled failure */ | 63 | Instruction *op, Capture *capture, int ptop, short *labelf, const char **sfail); /* labeled failure */ |
58 | 64 | ||
59 | |||
60 | #endif | 65 | #endif |
61 | 66 | ||
@@ -48,8 +48,8 @@ end | |||
48 | 48 | ||
49 | print"General tests for LPeg library" | 49 | print"General tests for LPeg library" |
50 | 50 | ||
51 | assert(type(m.version()) == "string") | 51 | assert(type(m.version) == "string") |
52 | print("version " .. m.version()) | 52 | print(m.version) |
53 | assert(m.type("alo") ~= "pattern") | 53 | assert(m.type("alo") ~= "pattern") |
54 | assert(m.type(io.input) ~= "pattern") | 54 | assert(m.type(io.input) ~= "pattern") |
55 | assert(m.type(m.P"alo") == "pattern") | 55 | assert(m.type(m.P"alo") == "pattern") |
@@ -70,7 +70,6 @@ assert(m.match(#m.P(true) * "a", "a") == 2) | |||
70 | assert(m.match("a" * #m.P(false), "a") == nil) | 70 | assert(m.match("a" * #m.P(false), "a") == nil) |
71 | assert(m.match("a" * #m.P(true), "a") == 2) | 71 | assert(m.match("a" * #m.P(true), "a") == 2) |
72 | 72 | ||
73 | |||
74 | -- tests for locale | 73 | -- tests for locale |
75 | do | 74 | do |
76 | assert(m.locale(m) == m) | 75 | assert(m.locale(m) == m) |
@@ -406,7 +405,7 @@ assert(p:match('abcx') == 5 and p:match('ayzx') == 5 and not p:match'abc') | |||
406 | 405 | ||
407 | 406 | ||
408 | do | 407 | do |
409 | -- large dynamic Cc | 408 | print "testing large dynamic Cc" |
410 | local lim = 2^16 - 1 | 409 | local lim = 2^16 - 1 |
411 | local c = 0 | 410 | local c = 0 |
412 | local function seq (n) | 411 | local function seq (n) |
@@ -985,10 +984,10 @@ for i = 1, 10 do | |||
985 | assert(p:match("aaaaaaaaaaa") == 11 - i + 1) | 984 | assert(p:match("aaaaaaaaaaa") == 11 - i + 1) |
986 | end | 985 | end |
987 | 986 | ||
988 | print"+" | ||
989 | 987 | ||
990 | 988 | ||
991 | -- tests for back references | 989 | print "testing back references" |
990 | |||
992 | checkerr("back reference 'x' not found", m.match, m.Cb('x'), '') | 991 | checkerr("back reference 'x' not found", m.match, m.Cb('x'), '') |
993 | checkerr("back reference 'b' not found", m.match, m.Cg(1, 'a') * m.Cb('b'), 'a') | 992 | checkerr("back reference 'b' not found", m.match, m.Cg(1, 'a') * m.Cb('b'), 'a') |
994 | 993 | ||
@@ -1032,6 +1031,17 @@ local function id (s, i, ...) | |||
1032 | return true, ... | 1031 | return true, ... |
1033 | end | 1032 | end |
1034 | 1033 | ||
1034 | do -- run-time capture in an end predicate (should discard its value) | ||
1035 | local x = 0 | ||
1036 | function foo (s, i) | ||
1037 | x = x + 1 | ||
1038 | return true, x | ||
1039 | end | ||
1040 | |||
1041 | local p = #(m.Cmt("", foo) * "xx") * m.Cmt("", foo) | ||
1042 | assert(p:match("xx") == 2) | ||
1043 | end | ||
1044 | |||
1035 | assert(m.Cmt(m.Cs((m.Cmt(m.S'abc' / { a = 'x', c = 'y' }, id) + | 1045 | assert(m.Cmt(m.Cs((m.Cmt(m.S'abc' / { a = 'x', c = 'y' }, id) + |
1036 | m.R'09'^1 / string.char + | 1046 | m.R'09'^1 / string.char + |
1037 | m.P(1))^0), id):match"acb98+68c" == "xyb\98+\68y") | 1047 | m.P(1))^0), id):match"acb98+68c" == "xyb\98+\68y") |
@@ -1171,9 +1181,85 @@ t = {p:match('abacc')} | |||
1171 | checkeq(t, {'a', 'aa', 20, 'a', 'aaa', 'aaa'}) | 1181 | checkeq(t, {'a', 'aa', 20, 'a', 'aaa', 'aaa'}) |
1172 | 1182 | ||
1173 | 1183 | ||
1184 | do print"testing large grammars" | ||
1185 | local lim = 1000 -- number of rules | ||
1186 | local t = {} | ||
1187 | |||
1188 | for i = 3, lim do | ||
1189 | t[i] = m.V(i - 1) -- each rule calls previous one | ||
1190 | end | ||
1191 | t[1] = m.V(lim) -- start on last rule | ||
1192 | t[2] = m.C("alo") -- final rule | ||
1193 | |||
1194 | local P = m.P(t) -- build grammar | ||
1195 | assert(P:match("alo") == "alo") | ||
1196 | |||
1197 | t[#t + 1] = m.P("x") -- one more rule... | ||
1198 | checkerr("too many rules", m.P, t) | ||
1199 | end | ||
1200 | |||
1201 | |||
1202 | print "testing UTF-8 ranges" | ||
1203 | |||
1204 | do -- a few typical UTF-8 ranges | ||
1205 | local p = m.utfR(0x410, 0x44f)^1 / "cyr: %0" | ||
1206 | + m.utfR(0x4e00, 0x9fff)^1 / "cjk: %0" | ||
1207 | + m.utfR(0x1F600, 0x1F64F)^1 / "emot: %0" | ||
1208 | + m.utfR(0, 0x7f)^1 / "ascii: %0" | ||
1209 | + m.utfR(0, 0x10ffff) / "other: %0" | ||
1210 | |||
1211 | p = m.Ct(p^0) * -m.P(1) | ||
1212 | |||
1213 | local cyr = "ждюя" | ||
1214 | local emot = "\240\159\152\128\240\159\153\128" -- 😀🙀 | ||
1215 | local cjk = "专举乸" | ||
1216 | local ascii = "alo" | ||
1217 | local last = "\244\143\191\191" -- U+10FFFF | ||
1218 | |||
1219 | local s = cyr .. "—" .. emot .. "—" .. cjk .. "—" .. ascii .. last | ||
1220 | t = (p:match(s)) | ||
1221 | |||
1222 | assert(t[1] == "cyr: " .. cyr and t[2] == "other: —" and | ||
1223 | t[3] == "emot: " .. emot and t[4] == "other: —" and | ||
1224 | t[5] == "cjk: " .. cjk and t[6] == "other: —" and | ||
1225 | t[7] == "ascii: " .. ascii and t[8] == "other: " .. last and | ||
1226 | t[9] == nil) | ||
1227 | end | ||
1228 | |||
1229 | |||
1230 | do -- valid and invalid code points | ||
1231 | local p = m.utfR(0, 0x10ffff)^0 | ||
1232 | assert(p:match("汉字\128") == #"汉字" + 1) | ||
1233 | assert(p:match("\244\159\191") == 1) | ||
1234 | assert(p:match("\244\159\191\191") == 1) | ||
1235 | assert(p:match("\255") == 1) | ||
1236 | |||
1237 | -- basic errors | ||
1238 | checkerr("empty range", m.utfR, 1, 0) | ||
1239 | checkerr("invalid code point", m.utfR, 1, 0x10ffff + 1) | ||
1240 | end | ||
1241 | |||
1242 | |||
1243 | do -- back references (fixed width) | ||
1244 | -- match a byte after a CJK point | ||
1245 | local p = m.B(m.utfR(0x4e00, 0x9fff)) * m.C(1) | ||
1246 | p = m.P{ p + m.P(1) * m.V(1) } -- search for 'p' | ||
1247 | assert(p:match("ab д 专X x") == "X") | ||
1248 | |||
1249 | -- match a byte after a hebrew point | ||
1250 | local p = m.B(m.utfR(0x5d0, 0x5ea)) * m.C(1) | ||
1251 | p = m.P(#"ש") * p | ||
1252 | assert(p:match("שX") == "X") | ||
1253 | |||
1254 | checkerr("fixed length", m.B, m.utfR(0, 0x10ffff)) | ||
1255 | end | ||
1256 | |||
1257 | |||
1258 | |||
1174 | ------------------------------------------------------------------- | 1259 | ------------------------------------------------------------------- |
1175 | -- Tests for 're' module | 1260 | -- Tests for 're' module |
1176 | ------------------------------------------------------------------- | 1261 | ------------------------------------------------------------------- |
1262 | print"testing 're' module" | ||
1177 | 1263 | ||
1178 | local re = require "relabel" | 1264 | local re = require "relabel" |
1179 | 1265 | ||
diff --git a/testlabel.lua b/testlabel.lua index d60bb54..c2f760d 100644 --- a/testlabel.lua +++ b/testlabel.lua | |||
@@ -115,7 +115,7 @@ p = m.P{ | |||
115 | "S", | 115 | "S", |
116 | S = m.T("bola"), | 116 | S = m.T("bola"), |
117 | bolada = m.P"a" | 117 | bolada = m.P"a" |
118 | } | 118 | } |
119 | r, l, poserr = p:match("abc") | 119 | r, l, poserr = p:match("abc") |
120 | assert(r == nil and l == 'bola' and poserr == 1) | 120 | assert(r == nil and l == 'bola' and poserr == 1) |
121 | 121 | ||
@@ -134,7 +134,7 @@ p = m.P{ | |||
134 | "S", | 134 | "S", |
135 | S = m.T("bola"), | 135 | S = m.T("bola"), |
136 | bola = m.P"a" | 136 | bola = m.P"a" |
137 | } | 137 | } |
138 | r, l, poserr = p:match("abc") | 138 | r, l, poserr = p:match("abc") |
139 | assert(r == 2) | 139 | assert(r == 2) |
140 | 140 | ||