aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lpcode.c79
-rw-r--r--lpprint.c54
-rw-r--r--lptree.c104
-rw-r--r--lptree.h11
-rw-r--r--lptypes.h8
-rw-r--r--lpvm.c54
-rw-r--r--lpvm.h15
-rwxr-xr-xtest.lua98
-rw-r--r--testlabel.lua4
9 files changed, 328 insertions, 99 deletions
diff --git a/lpcode.c b/lpcode.c
index ecc6fa3..a2d08f7 100644
--- a/lpcode.c
+++ b/lpcode.c
@@ -196,7 +196,7 @@ int hascaptures (TTree *tree) {
196int checkaux (TTree *tree, int pred) { 196int checkaux (TTree *tree, int pred) {
197 tailcall: 197 tailcall:
198 switch (tree->tag) { 198 switch (tree->tag) {
199 case TChar: case TSet: case TAny: 199 case TChar: case TSet: case TAny: case TUTFR:
200 case TFalse: case TOpenCall: case TThrow: /* labeled failure */ 200 case TFalse: case TOpenCall: case TThrow: /* labeled failure */
201 return 0; /* not nullable */ 201 return 0; /* not nullable */
202 case TRep: case TTrue: 202 case TRep: case TTrue:
@@ -220,7 +220,7 @@ int checkaux (TTree *tree, int pred) {
220 if (checkaux(sib2(tree), pred)) return 1; 220 if (checkaux(sib2(tree), pred)) return 1;
221 /* else return checkaux(sib1(tree), pred); */ 221 /* else return checkaux(sib1(tree), pred); */
222 tree = sib1(tree); goto tailcall; 222 tree = sib1(tree); goto tailcall;
223 case TCapture: case TGrammar: case TRule: 223 case TCapture: case TGrammar: case TRule: case TXInfo:
224 /* return checkaux(sib1(tree), pred); */ 224 /* return checkaux(sib1(tree), pred); */
225 tree = sib1(tree); goto tailcall; 225 tree = sib1(tree); goto tailcall;
226 case TCall: /* return checkaux(sib2(tree), pred); */ 226 case TCall: /* return checkaux(sib2(tree), pred); */
@@ -239,12 +239,14 @@ int fixedlen (TTree *tree) {
239 switch (tree->tag) { 239 switch (tree->tag) {
240 case TChar: case TSet: case TAny: 240 case TChar: case TSet: case TAny:
241 return len + 1; 241 return len + 1;
242 case TUTFR:
243 return (tree->cap == sib1(tree)->cap) ? len + tree->cap : -1;
242 case TFalse: case TTrue: case TNot: case TAnd: case TBehind: 244 case TFalse: case TTrue: case TNot: case TAnd: case TBehind:
243 return len; 245 return len;
244 case TRep: case TRunTime: case TOpenCall: 246 case TRep: case TRunTime: case TOpenCall:
245 case TThrow: /* labeled failure */ 247 case TThrow: /* labeled failure */
246 return -1; 248 return -1;
247 case TCapture: case TRule: case TGrammar: 249 case TCapture: case TRule: case TGrammar: case TXInfo:
248 /* return fixedlen(sib1(tree)); */ 250 /* return fixedlen(sib1(tree)); */
249 tree = sib1(tree); goto tailcall; 251 tree = sib1(tree); goto tailcall;
250 case TCall: { 252 case TCall: {
@@ -254,14 +256,14 @@ int fixedlen (TTree *tree) {
254 else 256 else
255 return len + n1; 257 return len + n1;
256 } 258 }
257 case TSeq: { 259 case TSeq: {
258 int n1 = fixedlen(sib1(tree)); 260 int n1 = fixedlen(sib1(tree));
259 if (n1 < 0) 261 if (n1 < 0)
260 return -1; 262 return -1;
261 /* else return fixedlen(sib2(tree)) + len; */ 263 /* else return fixedlen(sib2(tree)) + len; */
262 len += n1; tree = sib2(tree); goto tailcall; 264 len += n1; tree = sib2(tree); goto tailcall;
263 } 265 }
264 case TChoice: { 266 case TChoice: {
265 int n1 = fixedlen(sib1(tree)); 267 int n1 = fixedlen(sib1(tree));
266 int n2 = fixedlen(sib2(tree)); 268 int n2 = fixedlen(sib2(tree));
267 if (n1 != n2 || n1 < 0) 269 if (n1 != n2 || n1 < 0)
@@ -299,6 +301,13 @@ static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) {
299 tocharset(tree, firstset); 301 tocharset(tree, firstset);
300 return 0; 302 return 0;
301 } 303 }
304 case TUTFR: {
305 int c;
306 loopset(i, firstset->cs[i] = 0); /* erase all chars */
307 for (c = tree->key; c <= sib1(tree)->key; c++)
308 setchar(firstset->cs, c);
309 return 0;
310 }
302 case TTrue: { 311 case TTrue: {
303 loopset(i, firstset->cs[i] = follow->cs[i]); 312 loopset(i, firstset->cs[i] = follow->cs[i]);
304 return 1; /* accepts the empty string */ 313 return 1; /* accepts the empty string */
@@ -311,7 +320,7 @@ static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) {
311 loopset(i, firstset->cs[i] = fullset->cs[i]); 320 loopset(i, firstset->cs[i] = fullset->cs[i]);
312 return 1; 321 return 1;
313 } 322 }
314 case TChoice: { 323 case TChoice: {
315 Charset csaux; 324 Charset csaux;
316 int e1 = getfirst(sib1(tree), follow, firstset); 325 int e1 = getfirst(sib1(tree), follow, firstset);
317 int e2 = getfirst(sib2(tree), follow, &csaux); 326 int e2 = getfirst(sib2(tree), follow, &csaux);
@@ -339,7 +348,7 @@ static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) {
339 loopset(i, firstset->cs[i] |= follow->cs[i]); 348 loopset(i, firstset->cs[i] |= follow->cs[i]);
340 return 1; /* accept the empty string */ 349 return 1; /* accept the empty string */
341 } 350 }
342 case TCapture: case TGrammar: case TRule: { 351 case TCapture: case TGrammar: case TRule: case TXInfo: {
343 /* return getfirst(sib1(tree), follow, firstset); */ 352 /* return getfirst(sib1(tree), follow, firstset); */
344 tree = sib1(tree); goto tailcall; 353 tree = sib1(tree); goto tailcall;
345 } 354 }
@@ -385,10 +394,10 @@ static int headfail (TTree *tree) {
385 case TChar: case TSet: case TAny: case TFalse: 394 case TChar: case TSet: case TAny: case TFalse:
386 return 1; 395 return 1;
387 case TTrue: case TRep: case TRunTime: case TNot: 396 case TTrue: case TRep: case TRunTime: case TNot:
388 case TBehind: 397 case TBehind: case TUTFR:
389 case TThrow: /* labeled failure: must always throw the label */ 398 case TThrow: /* labeled failure: must always throw the label */
390 return 0; 399 return 0;
391 case TCapture: case TGrammar: case TRule: case TAnd: 400 case TCapture: case TGrammar: case TRule: case TXInfo: case TAnd:
392 tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */ 401 tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */
393 case TCall: 402 case TCall:
394 tree = sib2(tree); goto tailcall; /* return headfail(sib2(tree)); */ 403 tree = sib2(tree); goto tailcall; /* return headfail(sib2(tree)); */
@@ -396,7 +405,7 @@ static int headfail (TTree *tree) {
396 if (!nofail(sib2(tree))) return 0; 405 if (!nofail(sib2(tree))) return 0;
397 /* else return headfail(sib1(tree)); */ 406 /* else return headfail(sib1(tree)); */
398 tree = sib1(tree); goto tailcall; 407 tree = sib1(tree); goto tailcall;
399 case TChoice: 408 case TChoice:
400 if (!headfail(sib1(tree))) return 0; 409 if (!headfail(sib1(tree))) return 0;
401 /* else return headfail(sib2(tree)); */ 410 /* else return headfail(sib2(tree)); */
402 tree = sib2(tree); goto tailcall; 411 tree = sib2(tree); goto tailcall;
@@ -413,7 +422,7 @@ static int headfail (TTree *tree) {
413static int needfollow (TTree *tree) { 422static int needfollow (TTree *tree) {
414 tailcall: 423 tailcall:
415 switch (tree->tag) { 424 switch (tree->tag) {
416 case TChar: case TSet: case TAny: 425 case TChar: case TSet: case TAny: case TUTFR:
417 case TFalse: case TTrue: case TAnd: case TNot: 426 case TFalse: case TTrue: case TAnd: case TNot:
418 case TRunTime: case TGrammar: case TCall: case TBehind: 427 case TRunTime: case TGrammar: case TCall: case TBehind:
419 case TThrow: /* (?)labeled failure */ 428 case TThrow: /* (?)labeled failure */
@@ -425,7 +434,7 @@ static int needfollow (TTree *tree) {
425 case TSeq: 434 case TSeq:
426 tree = sib2(tree); goto tailcall; 435 tree = sib2(tree); goto tailcall;
427 default: assert(0); return 0; 436 default: assert(0); return 0;
428 } 437 }
429} 438}
430 439
431/* }====================================================== */ 440/* }====================================================== */
@@ -448,12 +457,12 @@ int sizei (const Instruction *i) {
448 case ITestSet: return CHARSETINSTSIZE + 1; 457 case ITestSet: return CHARSETINSTSIZE + 1;
449 case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall: 458 case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall:
450 case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit: 459 case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit:
460 case IUTFR:
451 return 2; 461 return 2;
452 case IThrow: case IPredChoice: /* labeled failure */ 462 case IThrow: case IPredChoice: /* labeled failure */
453 return 2; 463 return 2;
454 case IThrowRec: /* labeled failure */ 464 case IThrowRec: /* labeled failure */
455 return 3; 465 return 3;
456
457 default: return 1; 466 default: return 1;
458 } 467 }
459} 468}
@@ -517,8 +526,9 @@ static int addinstruction (CompileState *compst, Opcode op, int aux) {
517static int addoffsetinst (CompileState *compst, Opcode op) { 526static int addoffsetinst (CompileState *compst, Opcode op) {
518 int i = addinstruction(compst, op, 0); /* instruction */ 527 int i = addinstruction(compst, op, 0); /* instruction */
519 addinstruction(compst, (Opcode)0, 0); /* open space for offset */ 528 addinstruction(compst, (Opcode)0, 0); /* open space for offset */
520 assert(op == ITestSet || sizei(&getinstr(compst, i)) == 2 || 529 assert(op == ITestSet || sizei(&getinstr(compst, i)) == 2 ||
521 op == IThrowRec); /* labeled failure */ 530 op == IThrowRec); /* labeled failure */
531
522 return i; 532 return i;
523} 533}
524 534
@@ -528,13 +538,13 @@ static void codethrow (CompileState *compst, TTree *throw) {
528 int recov, aux; 538 int recov, aux;
529 if (throw->u.ps != 0) { 539 if (throw->u.ps != 0) {
530 recov = addoffsetinst(compst, IThrowRec); 540 recov = addoffsetinst(compst, IThrowRec);
531 assert(sib2(throw)->tag == TRule); 541 assert(sib1(sib2(throw))->tag == TXInfo);
532 } else { 542 } else {
533 recov = addinstruction(compst, IThrow, 0); 543 recov = addinstruction(compst, IThrow, 0);
534 } 544 }
535 aux = nextinstruction(compst); 545 aux = nextinstruction(compst);
536 getinstr(compst, aux).i.key = throw->key; /* next instruction keeps only rule name */ 546 getinstr(compst, aux).i.key = throw->key; /* next instruction keeps only rule name */
537 getinstr(compst, recov).i.key = sib2(throw)->cap; /* rule number */ 547 getinstr(compst, recov).i.key = sib1(sib2(throw))->u.n; /* rule number */
538} 548}
539/* labeled failure */ 549/* labeled failure */
540 550
@@ -547,6 +557,16 @@ static void setoffset (CompileState *compst, int instruction, int offset) {
547} 557}
548 558
549 559
560static void codeutfr (CompileState *compst, TTree *tree) {
561 int i = addoffsetinst(compst, IUTFR);
562 int to = sib1(tree)->u.n;
563 assert(sib1(tree)->tag == TXInfo);
564 getinstr(compst, i + 1).offset = tree->u.n;
565 getinstr(compst, i).i.aux = to & 0xff;
566 getinstr(compst, i).i.key = to >> 8;
567}
568
569
550/* 570/*
551** Add a capture instruction: 571** Add a capture instruction:
552** 'op' is the capture instruction; 'cap' the capture kind; 572** 'op' is the capture instruction; 'cap' the capture kind;
@@ -694,11 +714,11 @@ static void codebehind (CompileState *compst, TTree *tree) {
694 714
695/* 715/*
696** Choice; optimizations: 716** Choice; optimizations:
697** - when p1 is headfail or 717** - when p1 is headfail or when first(p1) and first(p2) are disjoint,
698** when first(p1) and first(p2) are disjoint, than 718** than a character not in first(p1) cannot go to p1 and a character
699** a character not in first(p1) cannot go to p1, and a character 719** in first(p1) cannot go to p2, either because p1 will accept
700** in first(p1) cannot go to p2 (at it is not in first(p2)). 720** (headfail) or because it is not in first(p2) (disjoint).
701** (The optimization is not valid if p1 accepts the empty string, 721** (The second case is not valid if p1 accepts the empty string,
702** as then there is no character at all...) 722** as then there is no character at all...)
703** - when p2 is empty and opt is true; a IPartialCommit can reuse 723** - when p2 is empty and opt is true; a IPartialCommit can reuse
704** the Choice already active in the stack. 724** the Choice already active in the stack.
@@ -715,7 +735,7 @@ static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt,
715 int jmp = NOINST; 735 int jmp = NOINST;
716 codegen(compst, p1, 0, test, fl); 736 codegen(compst, p1, 0, test, fl);
717 if (!emptyp2) 737 if (!emptyp2)
718 jmp = addoffsetinst(compst, IJmp); 738 jmp = addoffsetinst(compst, IJmp);
719 jumptohere(compst, test); 739 jumptohere(compst, test);
720 codegen(compst, p2, opt, NOINST, fl); 740 codegen(compst, p2, opt, NOINST, fl);
721 jumptohere(compst, jmp); 741 jumptohere(compst, jmp);
@@ -726,7 +746,7 @@ static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt,
726 codegen(compst, p1, 1, NOINST, fullset); 746 codegen(compst, p1, 1, NOINST, fullset);
727 } 747 }
728 else { 748 else {
729 /* <p1 / p2> == 749 /* <p1 / p2> ==
730 test(first(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: */ 750 test(first(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: */
731 int pcommit; 751 int pcommit;
732 int test = codetestset(compst, &cs1, e1); 752 int test = codetestset(compst, &cs1, e1);
@@ -887,7 +907,7 @@ static void correctcalls (CompileState *compst, int *positions,
887 else 907 else
888 code[i].i.code = ICall; 908 code[i].i.code = ICall;
889 jumptothere(compst, i, rule); /* call jumps to respective rule */ 909 jumptothere(compst, i, rule); /* call jumps to respective rule */
890 } else if (code[i].i.code == IThrowRec) { 910 } else if (code[i].i.code == IThrowRec) { /* labeled failure */
891 int n = code[i].i.key; /* rule number */ 911 int n = code[i].i.key; /* rule number */
892 int rule = positions[n]; /* rule position */ 912 int rule = positions[n]; /* rule position */
893 assert(rule == from || code[rule - 1].i.code == IRet); 913 assert(rule == from || code[rule - 1].i.code == IRet);
@@ -912,8 +932,10 @@ static void codegrammar (CompileState *compst, TTree *grammar) {
912 int start = gethere(compst); /* here starts the initial rule */ 932 int start = gethere(compst); /* here starts the initial rule */
913 jumptohere(compst, firstcall); 933 jumptohere(compst, firstcall);
914 for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) { 934 for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) {
935 TTree *r = sib1(rule);
936 assert(r->tag == TXInfo);
915 positions[rulenumber++] = gethere(compst); /* save rule position */ 937 positions[rulenumber++] = gethere(compst); /* save rule position */
916 codegen(compst, sib1(rule), 0, NOINST, fullset); /* code rule */ 938 codegen(compst, sib1(r), 0, NOINST, fullset); /* code rule */
917 addinstruction(compst, IRet, 0); 939 addinstruction(compst, IRet, 0);
918 } 940 }
919 assert(rule->tag == TTrue); 941 assert(rule->tag == TTrue);
@@ -924,8 +946,8 @@ static void codegrammar (CompileState *compst, TTree *grammar) {
924 946
925static void codecall (CompileState *compst, TTree *call) { 947static void codecall (CompileState *compst, TTree *call) {
926 int c = addoffsetinst(compst, IOpenCall); /* to be corrected later */ 948 int c = addoffsetinst(compst, IOpenCall); /* to be corrected later */
927 getinstr(compst, c).i.key = sib2(call)->cap; /* rule number */ 949 assert(sib1(sib2(call))->tag == TXInfo);
928 assert(sib2(call)->tag == TRule); 950 getinstr(compst, c).i.key = sib1(sib2(call))->u.n; /* rule number */
929} 951}
930 952
931 953
@@ -963,6 +985,7 @@ static void codegen (CompileState *compst, TTree *tree, int opt, int tt,
963 case TSet: codecharset(compst, treebuffer(tree), tt); break; 985 case TSet: codecharset(compst, treebuffer(tree), tt); break;
964 case TTrue: break; 986 case TTrue: break;
965 case TFalse: addinstruction(compst, IFail, 0); break; 987 case TFalse: addinstruction(compst, IFail, 0); break;
988 case TUTFR: codeutfr(compst, tree); break;
966 case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break; 989 case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break;
967 case TRep: coderep(compst, sib1(tree), opt, fl); break; 990 case TRep: coderep(compst, sib1(tree), opt, fl); break;
968 case TBehind: codebehind(compst, tree); break; 991 case TBehind: codebehind(compst, tree); break;
@@ -1013,7 +1036,7 @@ static void peephole (CompileState *compst) {
1013 case IRet: case IFail: case IFailTwice: 1036 case IRet: case IFail: case IFailTwice:
1014 case IEnd: { /* instructions with unconditional implicit jumps */ 1037 case IEnd: { /* instructions with unconditional implicit jumps */
1015 code[i] = code[ft]; /* jump becomes that instruction */ 1038 code[i] = code[ft]; /* jump becomes that instruction */
1016 code[i + 1].i.code = IAny; /* 'no-op' for target position */ 1039 code[i + 1].i.code = IEmpty; /* 'no-op' for target position */
1017 break; 1040 break;
1018 } 1041 }
1019 case ICommit: case IPartialCommit: 1042 case ICommit: case IPartialCommit:
diff --git a/lpprint.c b/lpprint.c
index 76a7007..af03edc 100644
--- a/lpprint.c
+++ b/lpprint.c
@@ -56,21 +56,26 @@ void printinst (const Instruction *op, const Instruction *p) {
56 const char *const names[] = { 56 const char *const names[] = {
57 "any", "char", "set", 57 "any", "char", "set",
58 "testany", "testchar", "testset", 58 "testany", "testchar", "testset",
59 "span", "behind", 59 "span", "utf-range", "behind",
60 "ret", "end", 60 "ret", "end",
61 "choice", "pred_choice", "jmp", "call", "open_call", /* labeled failure */ 61 "choice", "pred_choice", "jmp", "call", "open_call", /* labeled failure */
62 "commit", "partial_commit", "back_commit", "failtwice", "fail", "giveup", 62 "commit", "partial_commit", "back_commit", "failtwice", "fail", "giveup",
63 "fullcapture", "opencapture", "closecapture", "closeruntime", 63 "fullcapture", "opencapture", "closecapture", "closeruntime",
64 "throw", "throw_rec", /* labeled failure */ 64 "throw", "throw_rec", /* labeled failure */
65 "--"
65 }; 66 };
66 printf("%02ld: %s ", (long)(p - op), names[p->i.code]); 67 printf("%02ld: %s ", (long)(p - op), names[p->i.code]);
67 switch ((Opcode)p->i.code) { 68 switch ((Opcode)p->i.code) {
68 case IChar: { 69 case IChar: {
69 printf("'%c'", p->i.aux); 70 printf("'%c' (%02x)", p->i.aux, p->i.aux);
70 break; 71 break;
71 } 72 }
72 case ITestChar: { 73 case ITestChar: {
73 printf("'%c'", p->i.aux); printjmp(op, p); 74 printf("'%c' (%02x)", p->i.aux, p->i.aux); printjmp(op, p);
75 break;
76 }
77 case IUTFR: {
78 printf("%d - %d", p[1].offset, utf_to(p));
74 break; 79 break;
75 } 80 }
76 case IFullCapture: { 81 case IFullCapture: {
@@ -157,11 +162,11 @@ void printcaplist (Capture *cap, Capture *limit) {
157 162
158static const char *tagnames[] = { 163static const char *tagnames[] = {
159 "char", "set", "any", 164 "char", "set", "any",
160 "true", "false", 165 "true", "false", "utf8.range",
161 "rep", 166 "rep",
162 "seq", "choice", 167 "seq", "choice",
163 "not", "and", 168 "not", "and",
164 "call", "opencall", "rule", "grammar", 169 "call", "opencall", "rule", "xinfo", "grammar",
165 "behind", 170 "behind",
166 "capture", "run-time", 171 "capture", "run-time",
167 "throw" /* labeled failure */ 172 "throw" /* labeled failure */
@@ -170,6 +175,7 @@ static const char *tagnames[] = {
170 175
171void printtree (TTree *tree, int ident) { 176void printtree (TTree *tree, int ident) {
172 int i; 177 int i;
178 int sibs = numsiblings[tree->tag];
173 for (i = 0; i < ident; i++) printf(" "); 179 for (i = 0; i < ident; i++) printf(" ");
174 printf("%s", tagnames[tree->tag]); 180 printf("%s", tagnames[tree->tag]);
175 switch (tree->tag) { 181 switch (tree->tag) {
@@ -186,25 +192,34 @@ void printtree (TTree *tree, int ident) {
186 printf("\n"); 192 printf("\n");
187 break; 193 break;
188 } 194 }
195 case TUTFR: {
196 assert(sib1(tree)->tag == TXInfo);
197 printf(" %d (%02x %d) - %d (%02x %d) \n",
198 tree->u.n, tree->key, tree->cap,
199 sib1(tree)->u.n, sib1(tree)->key, sib1(tree)->cap);
200 break;
201 }
189 case TOpenCall: case TCall: { 202 case TOpenCall: case TCall: {
190 assert(sib2(tree)->tag == TRule); 203 assert(sib1(sib2(tree))->tag == TXInfo);
191 printf(" key: %d (rule: %d)\n", tree->key, sib2(tree)->cap); 204 printf(" key: %d (rule: %d)\n", tree->key, sib1(sib2(tree))->u.n);
192 break; 205 break;
193 } 206 }
194 case TBehind: { 207 case TBehind: {
195 printf(" %d\n", tree->u.n); 208 printf(" %d\n", tree->u.n);
196 printtree(sib1(tree), ident + 2);
197 break; 209 break;
198 } 210 }
199 case TCapture: { 211 case TCapture: {
200 printf(" kind: '%s' key: %d\n", capkind(tree->cap), tree->key); 212 printf(" kind: '%s' key: %d\n", capkind(tree->cap), tree->key);
201 printtree(sib1(tree), ident + 2);
202 break; 213 break;
203 } 214 }
204 case TRule: { 215 case TRule: {
205 printf(" n: %d key: %d\n", tree->cap, tree->key); 216 printf(" key: %d\n", tree->key);
206 printtree(sib1(tree), ident + 2); 217 sibs = 1; /* do not print 'sib2' (next rule) as a sibling */
207 break; /* do not print next rule as a sibling */ 218 break;
219 }
220 case TXInfo: {
221 printf(" n: %d\n", tree->u.n);
222 break;
208 } 223 }
209 case TGrammar: { 224 case TGrammar: {
210 TTree *rule = sib1(tree); 225 TTree *rule = sib1(tree);
@@ -214,6 +229,7 @@ void printtree (TTree *tree, int ident) {
214 rule = sib2(rule); 229 rule = sib2(rule);
215 } 230 }
216 assert(rule->tag == TTrue); /* sentinel */ 231 assert(rule->tag == TTrue); /* sentinel */
232 sibs = 0; /* siblings already handled */
217 break; 233 break;
218 } 234 }
219 case TThrow: { /* labeled failure */ 235 case TThrow: { /* labeled failure */
@@ -222,16 +238,14 @@ void printtree (TTree *tree, int ident) {
222 printf(" key: %d (rule: %d)\n", tree->key, sib2(tree)->cap); 238 printf(" key: %d (rule: %d)\n", tree->key, sib2(tree)->cap);
223 break; 239 break;
224 } 240 }
225 default: { 241 default:
226 int sibs = numsiblings[tree->tag];
227 printf("\n"); 242 printf("\n");
228 if (sibs >= 1) {
229 printtree(sib1(tree), ident + 2);
230 if (sibs >= 2)
231 printtree(sib2(tree), ident + 2);
232 }
233 break; 243 break;
234 } 244 }
245 if (sibs >= 1) {
246 printtree(sib1(tree), ident + 2);
247 if (sibs >= 2)
248 printtree(sib2(tree), ident + 2);
235 } 249 }
236} 250}
237 251
diff --git a/lptree.c b/lptree.c
index b1a32c4..4afbae7 100644
--- a/lptree.c
+++ b/lptree.c
@@ -21,11 +21,11 @@
21/* number of siblings for each tree */ 21/* number of siblings for each tree */
22const byte numsiblings[] = { 22const byte numsiblings[] = {
23 0, 0, 0, /* char, set, any */ 23 0, 0, 0, /* char, set, any */
24 0, 0, /* true, false */ 24 0, 0, 0, /* true, false, utf-range */
25 1, /* rep */ 25 1, /* rep */
26 2, 2, /* seq, choice */ 26 2, 2, /* seq, choice */
27 1, 1, /* not, and */ 27 1, 1, /* not, and */
28 0, 0, 2, 1, /* call, opencall, rule, grammar */ 28 0, 0, 2, 1, 1, /* call, opencall, rule, prerule, grammar */
29 1, /* behind */ 29 1, /* behind */
30 1, 1, /* capture, runtime capture */ 30 1, 1, /* capture, runtime capture */
31 0 /* labeled failure throw */ 31 0 /* labeled failure throw */
@@ -58,7 +58,7 @@ static void fixonecall (lua_State *L, int postable, TTree *g, TTree *t, byte tag
58 lua_gettable(L, postable); /* query name in position table */ 58 lua_gettable(L, postable); /* query name in position table */
59 n = lua_tonumber(L, -1); /* get (absolute) position */ 59 n = lua_tonumber(L, -1); /* get (absolute) position */
60 lua_pop(L, 1); /* remove position */ 60 lua_pop(L, 1); /* remove position */
61 if (tag == TOpenCall) { 61 if (tag == TOpenCall) { /* labeled failure */
62 if (n == 0) { /* no position? */ 62 if (n == 0) { /* no position? */
63 lua_rawgeti(L, -1, t->key); /* get rule's name again */ 63 lua_rawgeti(L, -1, t->key); /* get rule's name again */
64 luaL_error(L, "rule '%s' undefined in given grammar", val2str(L, -1)); 64 luaL_error(L, "rule '%s' undefined in given grammar", val2str(L, -1));
@@ -109,7 +109,7 @@ static void finalfix (lua_State *L, int postable, TTree *g, TTree *t) {
109 return; 109 return;
110 case TOpenCall: { 110 case TOpenCall: {
111 if (g != NULL) /* inside a grammar? */ 111 if (g != NULL) /* inside a grammar? */
112 fixonecall(L, postable, g, t, TOpenCall); 112 fixonecall(L, postable, g, t, TOpenCall); /* labeled failure */
113 else { /* open call outside grammar */ 113 else { /* open call outside grammar */
114 lua_rawgeti(L, -1, t->key); 114 lua_rawgeti(L, -1, t->key);
115 luaL_error(L, "rule '%s' used outside a grammar", val2str(L, -1)); 115 luaL_error(L, "rule '%s' used outside a grammar", val2str(L, -1));
@@ -695,6 +695,56 @@ static int lp_range (lua_State *L) {
695 695
696 696
697/* 697/*
698** Fills a tree node with basic information about the UTF-8 code point
699** 'cpu': its value in 'n', its length in 'cap', and its first byte in
700** 'key'
701*/
702static void codeutftree (lua_State *L, TTree *t, lua_Unsigned cpu, int arg) {
703 int len, fb, cp;
704 cp = (int)cpu;
705 if (cp <= 0x7f) { /* one byte? */
706 len = 1;
707 fb = cp;
708 } else if (cp <= 0x7ff) {
709 len = 2;
710 fb = 0xC0 | (cp >> 6);
711 } else if (cp <= 0xffff) {
712 len = 3;
713 fb = 0xE0 | (cp >> 12);
714 }
715 else {
716 luaL_argcheck(L, cpu <= 0x10ffffu, arg, "invalid code point");
717 len = 4;
718 fb = 0xF0 | (cp >> 18);
719 }
720 t->u.n = cp;
721 t->cap = len;
722 t->key = fb;
723}
724
725
726static int lp_utfr (lua_State *L) {
727 lua_Unsigned from = (lua_Unsigned)luaL_checkinteger(L, 1);
728 lua_Unsigned to = (lua_Unsigned)luaL_checkinteger(L, 2);
729 luaL_argcheck(L, from <= to, 2, "empty range");
730 if (to <= 0x7f) { /* ascii range? */
731 TTree *tree = newcharset(L); /* code it as a regular charset */
732 unsigned int f;
733 for (f = (int)from; f <= to; f++)
734 setchar(treebuffer(tree), f);
735 }
736 else { /* multi-byte utf-8 range */
737 TTree *tree = newtree(L, 2);
738 tree->tag = TUTFR;
739 codeutftree(L, tree, from, 1);
740 sib1(tree)->tag = TXInfo;
741 codeutftree(L, sib1(tree), to, 2);
742 }
743 return 1;
744}
745
746
747/*
698** Look-behind predicate 748** Look-behind predicate
699*/ 749*/
700static int lp_behind (lua_State *L) { 750static int lp_behind (lua_State *L) {
@@ -940,7 +990,7 @@ static int collectrules (lua_State *L, int arg, int *totalsize) {
940 int size; /* accumulator for total size */ 990 int size; /* accumulator for total size */
941 lua_newtable(L); /* create position table */ 991 lua_newtable(L); /* create position table */
942 getfirstrule(L, arg, postab); 992 getfirstrule(L, arg, postab);
943 size = 2 + getsize(L, postab + 2); /* TGrammar + TRule + rule */ 993 size = 3 + getsize(L, postab + 2); /* TGrammar + TRule + TXInfo + rule */
944 lua_pushnil(L); /* prepare to traverse grammar table */ 994 lua_pushnil(L); /* prepare to traverse grammar table */
945 while (lua_next(L, arg) != 0) { 995 while (lua_next(L, arg) != 0) {
946 if (lua_tonumber(L, -2) == 1 || 996 if (lua_tonumber(L, -2) == 1 ||
@@ -954,11 +1004,11 @@ static int collectrules (lua_State *L, int arg, int *totalsize) {
954 lua_pushvalue(L, -2); /* push key (to insert into position table) */ 1004 lua_pushvalue(L, -2); /* push key (to insert into position table) */
955 lua_pushinteger(L, size); 1005 lua_pushinteger(L, size);
956 lua_settable(L, postab); 1006 lua_settable(L, postab);
957 size += 1 + getsize(L, -1); /* update size */ 1007 size += 2 + getsize(L, -1); /* add 'TRule + TXInfo + rule' to size */
958 lua_pushvalue(L, -2); /* push key (for next lua_next) */ 1008 lua_pushvalue(L, -2); /* push key (for next lua_next) */
959 n++; 1009 n++;
960 } 1010 }
961 *totalsize = size + 1; /* TTrue to finish list of rules */ 1011 *totalsize = size + 1; /* space for 'TTrue' finishing list of rules */
962 return n; 1012 return n;
963} 1013}
964 1014
@@ -970,11 +1020,13 @@ static void buildgrammar (lua_State *L, TTree *grammar, int frule, int n) {
970 int ridx = frule + 2*i + 1; /* index of i-th rule */ 1020 int ridx = frule + 2*i + 1; /* index of i-th rule */
971 int rulesize; 1021 int rulesize;
972 TTree *rn = gettree(L, ridx, &rulesize); 1022 TTree *rn = gettree(L, ridx, &rulesize);
1023 TTree *pr = sib1(nd); /* points to rule's prerule */
973 nd->tag = TRule; 1024 nd->tag = TRule;
974 nd->key = 0; /* will be fixed when rule is used */ 1025 nd->key = 0; /* will be fixed when rule is used */
975 nd->cap = i; /* rule number */ 1026 pr->tag = TXInfo;
976 nd->u.ps = rulesize + 1; /* point to next rule */ 1027 pr->u.n = i; /* rule number */
977 memcpy(sib1(nd), rn, rulesize * sizeof(TTree)); /* copy rule */ 1028 nd->u.ps = rulesize + 2; /* point to next rule */
1029 memcpy(sib1(pr), rn, rulesize * sizeof(TTree)); /* copy rule */
978 mergektable(L, ridx, sib1(nd)); /* merge its ktable into new one */ 1030 mergektable(L, ridx, sib1(nd)); /* merge its ktable into new one */
979 nd = sib2(nd); /* move to next rule */ 1031 nd = sib2(nd); /* move to next rule */
980 } 1032 }
@@ -1010,7 +1062,7 @@ static int checkloops (TTree *tree) {
1010** twice in 'passed', there is path from it back to itself without 1062** twice in 'passed', there is path from it back to itself without
1011** advancing the subject. 1063** advancing the subject.
1012*/ 1064*/
1013static int verifyerror (lua_State *L, int *passed, int npassed) { 1065static int verifyerror (lua_State *L, unsigned short *passed, int npassed) {
1014 int i, j; 1066 int i, j;
1015 for (i = npassed - 1; i >= 0; i--) { /* search for a repetition */ 1067 for (i = npassed - 1; i >= 0; i--) { /* search for a repetition */
1016 for (j = i - 1; j >= 0; j--) { 1068 for (j = i - 1; j >= 0; j--) {
@@ -1035,12 +1087,13 @@ static int verifyerror (lua_State *L, int *passed, int npassed) {
1035** counts the elements in 'passed'. 1087** counts the elements in 'passed'.
1036** Assume ktable at the top of the stack. 1088** Assume ktable at the top of the stack.
1037*/ 1089*/
1038static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed, 1090static int verifyrule (lua_State *L, TTree *tree, unsigned short *passed,
1039 int nb) { 1091 int npassed, int nb) {
1040 tailcall: 1092 tailcall:
1041 switch (tree->tag) { 1093 switch (tree->tag) {
1042 case TChar: case TSet: case TAny: 1094 case TChar: case TSet: case TAny:
1043 case TFalse: case TThrow: /* labeled failure */ 1095 case TFalse: case TUTFR:
1096 case TThrow: /* labeled failure */
1044 return nb; /* cannot pass from here */ 1097 return nb; /* cannot pass from here */
1045 case TTrue: 1098 case TTrue:
1046 case TBehind: /* look-behind cannot have calls */ 1099 case TBehind: /* look-behind cannot have calls */
@@ -1048,7 +1101,7 @@ static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed,
1048 case TNot: case TAnd: case TRep: 1101 case TNot: case TAnd: case TRep:
1049 /* return verifyrule(L, sib1(tree), passed, npassed, 1); */ 1102 /* return verifyrule(L, sib1(tree), passed, npassed, 1); */
1050 tree = sib1(tree); nb = 1; goto tailcall; 1103 tree = sib1(tree); nb = 1; goto tailcall;
1051 case TCapture: case TRunTime: 1104 case TCapture: case TRunTime: case TXInfo:
1052 /* return verifyrule(L, sib1(tree), passed, npassed, nb); */ 1105 /* return verifyrule(L, sib1(tree), passed, npassed, nb); */
1053 tree = sib1(tree); goto tailcall; 1106 tree = sib1(tree); goto tailcall;
1054 case TCall: 1107 case TCall:
@@ -1059,15 +1112,15 @@ static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed,
1059 return nb; 1112 return nb;
1060 /* else return verifyrule(L, sib2(tree), passed, npassed, nb); */ 1113 /* else return verifyrule(L, sib2(tree), passed, npassed, nb); */
1061 tree = sib2(tree); goto tailcall; 1114 tree = sib2(tree); goto tailcall;
1062 case TChoice: /* must check both children */ 1115 case TChoice: /* must check both children */
1063 nb = verifyrule(L, sib1(tree), passed, npassed, nb); 1116 nb = verifyrule(L, sib1(tree), passed, npassed, nb);
1064 /* return verifyrule(L, sib2(tree), passed, npassed, nb); */ 1117 /* return verifyrule(L, sib2(tree), passed, npassed, nb); */
1065 tree = sib2(tree); goto tailcall; 1118 tree = sib2(tree); goto tailcall;
1066 case TRule: 1119 case TRule:
1067 if (npassed >= MAXRULES) 1120 if (npassed >= MAXRULES) /* too many steps? */
1068 return verifyerror(L, passed, npassed); 1121 return verifyerror(L, passed, npassed); /* error */
1069 else { 1122 else {
1070 passed[npassed++] = tree->key; 1123 passed[npassed++] = tree->key; /* add rule to path */
1071 /* return verifyrule(L, sib1(tree), passed, npassed); */ 1124 /* return verifyrule(L, sib1(tree), passed, npassed); */
1072 tree = sib1(tree); goto tailcall; 1125 tree = sib1(tree); goto tailcall;
1073 } 1126 }
@@ -1079,7 +1132,7 @@ static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed,
1079 1132
1080 1133
1081static void verifygrammar (lua_State *L, TTree *grammar) { 1134static void verifygrammar (lua_State *L, TTree *grammar) {
1082 int passed[MAXRULES]; 1135 unsigned short passed[MAXRULES];
1083 TTree *rule; 1136 TTree *rule;
1084 /* check left-recursive rules */ 1137 /* check left-recursive rules */
1085 for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) { 1138 for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) {
@@ -1243,12 +1296,6 @@ static int lp_setmax (lua_State *L) {
1243} 1296}
1244 1297
1245 1298
1246static int lp_version (lua_State *L) {
1247 lua_pushstring(L, VERSION);
1248 return 1;
1249}
1250
1251
1252static int lp_type (lua_State *L) { 1299static int lp_type (lua_State *L) {
1253 if (testpattern(L, 1)) 1300 if (testpattern(L, 1))
1254 lua_pushliteral(L, "pattern"); 1301 lua_pushliteral(L, "pattern");
@@ -1317,8 +1364,9 @@ static struct luaL_Reg pattreg[] = {
1317 {"P", lp_P}, 1364 {"P", lp_P},
1318 {"S", lp_set}, 1365 {"S", lp_set},
1319 {"R", lp_range}, 1366 {"R", lp_range},
1367 {"utfR", lp_utfr},
1320 {"locale", lp_locale}, 1368 {"locale", lp_locale},
1321 {"version", lp_version}, 1369 {"version", NULL},
1322 {"setmaxstack", lp_setmax}, 1370 {"setmaxstack", lp_setmax},
1323 {"type", lp_type}, 1371 {"type", lp_type},
1324 {"T", lp_throw}, /* labeled failure throw */ 1372 {"T", lp_throw}, /* labeled failure throw */
@@ -1347,6 +1395,8 @@ int luaopen_lpeglabel (lua_State *L) { /* labeled failure */
1347 luaL_newlib(L, pattreg); 1395 luaL_newlib(L, pattreg);
1348 lua_pushvalue(L, -1); 1396 lua_pushvalue(L, -1);
1349 lua_setfield(L, -3, "__index"); 1397 lua_setfield(L, -3, "__index");
1398 lua_pushliteral(L, "LPegLabel " VERSION); /* labeled failure */
1399 lua_setfield(L, -2, "version");
1350 return 1; 1400 return 1;
1351} 1401}
1352 1402
diff --git a/lptree.h b/lptree.h
index 0cf160a..05e0680 100644
--- a/lptree.h
+++ b/lptree.h
@@ -18,6 +18,9 @@ typedef enum TTag {
18 TAny, 18 TAny,
19 TTrue, 19 TTrue,
20 TFalse, 20 TFalse,
21 TUTFR, /* range of UTF-8 codepoints; 'n' has initial codepoint;
22 'cap' has length; 'key' has first byte;
23 extra info is similar for end codepoint */
21 TRep, /* 'sib1'* */ 24 TRep, /* 'sib1'* */
22 TSeq, /* 'sib1' 'sib2' */ 25 TSeq, /* 'sib1' 'sib2' */
23 TChoice, /* 'sib1' / 'sib2' */ 26 TChoice, /* 'sib1' / 'sib2' */
@@ -26,8 +29,9 @@ typedef enum TTag {
26 TCall, /* ktable[key] is rule's key; 'sib2' is rule being called */ 29 TCall, /* ktable[key] is rule's key; 'sib2' is rule being called */
27 TOpenCall, /* ktable[key] is rule's key */ 30 TOpenCall, /* ktable[key] is rule's key */
28 TRule, /* ktable[key] is rule's key (but key == 0 for unused rules); 31 TRule, /* ktable[key] is rule's key (but key == 0 for unused rules);
29 'sib1' is rule's pattern; 32 'sib1' is rule's pattern pre-rule; 'sib2' is next rule;
30 'sib2' is next rule; 'cap' is rule's sequential number */ 33 extra info 'n' is rule's sequential number */
34 TXInfo, /* extra info */
31 TGrammar, /* 'sib1' is initial (and first) rule */ 35 TGrammar, /* 'sib1' is initial (and first) rule */
32 TBehind, /* 'sib1' is pattern, 'n' is how much to go back */ 36 TBehind, /* 'sib1' is pattern, 'n' is how much to go back */
33 TCapture, /* captures: 'cap' is kind of capture (enum 'CapKind'); 37 TCapture, /* captures: 'cap' is kind of capture (enum 'CapKind');
@@ -36,6 +40,7 @@ typedef enum TTag {
36 TRunTime, /* run-time capture: 'key' is Lua function; 40 TRunTime, /* run-time capture: 'key' is Lua function;
37 'sib1' is capture body */ 41 'sib1' is capture body */
38 TThrow, /* labeled failure: ktable[key] is label's name */ 42 TThrow, /* labeled failure: ktable[key] is label's name */
43
39} TTag; 44} TTag;
40 45
41 46
@@ -50,8 +55,8 @@ typedef struct TTree {
50 byte cap; /* kind of capture (if it is a capture) */ 55 byte cap; /* kind of capture (if it is a capture) */
51 unsigned short key; /* key in ktable for Lua data (0 if no key) */ 56 unsigned short key; /* key in ktable for Lua data (0 if no key) */
52 union { 57 union {
53 int n; /* occasional counter */
54 int ps; /* occasional second child */ 58 int ps; /* occasional second child */
59 int n; /* occasional counter */
55 } u; 60 } u;
56} TTree; 61} TTree;
57 62
diff --git a/lptypes.h b/lptypes.h
index 3261428..bf9aed1 100644
--- a/lptypes.h
+++ b/lptypes.h
@@ -15,7 +15,7 @@
15#include "lua.h" 15#include "lua.h"
16 16
17 17
18#define VERSION "1.5.2" 18#define VERSION "1.6.0"
19 19
20 20
21#define PATTERN_T "lpeg-pattern" 21#define PATTERN_T "lpeg-pattern"
@@ -37,6 +37,8 @@
37#define luaL_setfuncs(L,f,n) luaL_register(L,NULL,f) 37#define luaL_setfuncs(L,f,n) luaL_register(L,NULL,f)
38#define luaL_newlib(L,f) luaL_register(L,"lpeg",f) 38#define luaL_newlib(L,f) luaL_register(L,"lpeg",f)
39 39
40typedef size_t lua_Unsigned;
41
40#endif 42#endif
41 43
42 44
@@ -51,9 +53,9 @@
51#endif 53#endif
52 54
53 55
54/* maximum number of rules in a grammar (limited by 'unsigned char') */ 56/* maximum number of rules in a grammar (limited by 'unsigned short') */
55#if !defined(MAXRULES) 57#if !defined(MAXRULES)
56#define MAXRULES UCHAR_MAX 58#define MAXRULES 1000
57#endif 59#endif
58 60
59 61
diff --git a/lpvm.c b/lpvm.c
index a791c44..b7ae631 100644
--- a/lpvm.c
+++ b/lpvm.c
@@ -18,16 +18,45 @@
18 18
19/* initial size for call/backtrack stack */ 19/* initial size for call/backtrack stack */
20#if !defined(INITBACK) 20#if !defined(INITBACK)
21#define INITBACK MAXBACK 21#define INITBACK MAXBACK
22#endif 22#endif
23 23
24 24
25#define getoffset(p) (((p) + 1)->offset) 25#define getoffset(p) (((p) + 1)->offset)
26 26
27static const Instruction giveup = {{IGiveup, 0, 0}}; 27static const Instruction giveup = {{IGiveup, 0, 0}};
28 28
29 29
30/* 30/*
31** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid.
32*/
33static const char *utf8_decode (const char *o, int *val) {
34 static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFFu};
35 const unsigned char *s = (const unsigned char *)o;
36 unsigned int c = s[0]; /* first byte */
37 unsigned int res = 0; /* final result */
38 if (c < 0x80) /* ascii? */
39 res = c;
40 else {
41 int count = 0; /* to count number of continuation bytes */
42 while (c & 0x40) { /* still have continuation bytes? */
43 int cc = s[++count]; /* read next byte */
44 if ((cc & 0xC0) != 0x80) /* not a continuation byte? */
45 return NULL; /* invalid byte sequence */
46 res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */
47 c <<= 1; /* to test next bit */
48 }
49 res |= (c & 0x7F) << (count * 5); /* add first byte */
50 if (count > 3 || res > 0x10FFFFu || res <= limits[count])
51 return NULL; /* invalid byte sequence */
52 s += count; /* skip continuation bytes read */
53 }
54 *val = res;
55 return (const char *)s + 1; /* +1 to include first byte */
56}
57
58
59/*
31** {====================================================== 60** {======================================================
32** Virtual Machine 61** Virtual Machine
33** ======================================================= 62** =======================================================
@@ -43,7 +72,7 @@ typedef struct Stack {
43} Stack; 72} Stack;
44 73
45 74
46#define getstackbase(L, ptop) ((Stack *)lua_touserdata(L, stackidx(ptop))) 75#define getstackbase(L, ptop) ((Stack *)lua_touserdata(L, stackidx(ptop)))
47 76
48 77
49/* 78/*
@@ -207,6 +236,20 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e,
207 } 236 }
208 continue; 237 continue;
209 } 238 }
239 case IUTFR: {
240 int codepoint;
241 if (s >= e)
242 goto fail;
243 s = utf8_decode (s, &codepoint);
244 if (s && p[1].offset <= codepoint && codepoint <= utf_to(p))
245 p += 2;
246 else {
247 *labelf = LFAIL; /* labeled failure */
248 updatefarthest(*sfail, s); /*labeled failure */
249 goto fail;
250 }
251 continue;
252 }
210 case ITestAny: { 253 case ITestAny: {
211 if (s < e) p += 2; 254 if (s < e) p += 2;
212 else p += getoffset(p); 255 else p += getoffset(p);
@@ -301,8 +344,7 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e,
301 continue; 344 continue;
302 } 345 }
303 case ICommit: { 346 case ICommit: {
304 assert(stack > getstackbase(L, ptop)); 347 assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL);
305 assert((stack - 1)->s != NULL);
306 stack--; 348 stack--;
307 p += getoffset(p); 349 p += getoffset(p);
308 continue; 350 continue;
@@ -318,6 +360,8 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e,
318 assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL); 360 assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL);
319 s = (--stack)->s; 361 s = (--stack)->s;
320 insidepred = stack->labenv; /* labeled failure */ 362 insidepred = stack->labenv; /* labeled failure */
363 if (ndyncap > 0) /* are there matchtime captures? */
364 ndyncap -= removedyncap(L, capture, stack->caplevel, captop);
321 captop = stack->caplevel; 365 captop = stack->caplevel;
322 p += getoffset(p); 366 p += getoffset(p);
323 continue; 367 continue;
diff --git a/lpvm.h b/lpvm.h
index 6633c4b..19f4108 100644
--- a/lpvm.h
+++ b/lpvm.h
@@ -17,17 +17,18 @@ typedef enum Opcode {
17 ITestChar, /* if char != aux, jump to 'offset' */ 17 ITestChar, /* if char != aux, jump to 'offset' */
18 ITestSet, /* if char not in buff, jump to 'offset' */ 18 ITestSet, /* if char not in buff, jump to 'offset' */
19 ISpan, /* read a span of chars in buff */ 19 ISpan, /* read a span of chars in buff */
20 IUTFR, /* if codepoint not in range [offset, utf_to], fail */
20 IBehind, /* walk back 'aux' characters (fail if not possible) */ 21 IBehind, /* walk back 'aux' characters (fail if not possible) */
21 IRet, /* return from a rule */ 22 IRet, /* return from a rule */
22 IEnd, /* end of pattern */ 23 IEnd, /* end of pattern */
23 IChoice, /* stack a choice; next fail will jump to 'offset' */ 24 IChoice, /* stack a choice; next fail will jump to 'offset' */
24 IPredChoice, /* labeld failure: stack a choice; changes label env next fail will jump to 'offset' */ 25 IPredChoice, /* labeld failure: stack a choice; changes label env next fail will jump to 'offset' */ /*labeled failure */
25 IJmp, /* jump to 'offset' */ 26 IJmp, /* jump to 'offset' */
26 ICall, /* call rule at 'offset' */ 27 ICall, /* call rule at 'offset' */
27 IOpenCall, /* call rule number 'key' (must be closed to a ICall) */ 28 IOpenCall, /* call rule number 'key' (must be closed to a ICall) */
28 ICommit, /* pop choice and jump to 'offset' */ 29 ICommit, /* pop choice and jump to 'offset' */
29 IPartialCommit, /* update top choice to current position and jump */ 30 IPartialCommit, /* update top choice to current position and jump */
30 IBackCommit, /* "fails" but jump to its own 'offset' */ 31 IBackCommit, /* backtrack like "fail" but jump to its own 'offset' */
31 IFailTwice, /* pop one choice and then fail */ 32 IFailTwice, /* pop one choice and then fail */
32 IFail, /* go back to saved state on choice and jump to saved offset */ 33 IFail, /* go back to saved state on choice and jump to saved offset */
33 IGiveup, /* internal use */ 34 IGiveup, /* internal use */
@@ -35,8 +36,9 @@ typedef enum Opcode {
35 IOpenCapture, /* start a capture */ 36 IOpenCapture, /* start a capture */
36 ICloseCapture, 37 ICloseCapture,
37 ICloseRunTime, 38 ICloseRunTime,
38 IThrow, /* fails with a given label */ 39 IThrow, /* fails with a given label */ /*labeled failure */
39 IThrowRec, /* fails with a given label and call rule at 'offset' */ 40 IThrowRec, /* fails with a given label and call rule at 'offset' */ /*labeled failure */
41 IEmpty /* to fill empty slots left by optimizations */
40} Opcode; 42} Opcode;
41 43
42 44
@@ -52,10 +54,13 @@ typedef union Instruction {
52} Instruction; 54} Instruction;
53 55
54 56
57/* extract 24-bit value from an instruction */
58#define utf_to(inst) (((inst)->i.key << 8) | (inst)->i.aux)
59
60
55void printpatt (Instruction *p, int n); 61void printpatt (Instruction *p, int n);
56const char *match (lua_State *L, const char *o, const char *s, const char *e, 62const char *match (lua_State *L, const char *o, const char *s, const char *e,
57 Instruction *op, Capture *capture, int ptop, short *labelf, const char **sfail); /* labeled failure */ 63 Instruction *op, Capture *capture, int ptop, short *labelf, const char **sfail); /* labeled failure */
58 64
59
60#endif 65#endif
61 66
diff --git a/test.lua b/test.lua
index 2c05dd0..3989f30 100755
--- a/test.lua
+++ b/test.lua
@@ -48,8 +48,8 @@ end
48 48
49print"General tests for LPeg library" 49print"General tests for LPeg library"
50 50
51assert(type(m.version()) == "string") 51assert(type(m.version) == "string")
52print("version " .. m.version()) 52print(m.version)
53assert(m.type("alo") ~= "pattern") 53assert(m.type("alo") ~= "pattern")
54assert(m.type(io.input) ~= "pattern") 54assert(m.type(io.input) ~= "pattern")
55assert(m.type(m.P"alo") == "pattern") 55assert(m.type(m.P"alo") == "pattern")
@@ -70,7 +70,6 @@ assert(m.match(#m.P(true) * "a", "a") == 2)
70assert(m.match("a" * #m.P(false), "a") == nil) 70assert(m.match("a" * #m.P(false), "a") == nil)
71assert(m.match("a" * #m.P(true), "a") == 2) 71assert(m.match("a" * #m.P(true), "a") == 2)
72 72
73
74-- tests for locale 73-- tests for locale
75do 74do
76 assert(m.locale(m) == m) 75 assert(m.locale(m) == m)
@@ -406,7 +405,7 @@ assert(p:match('abcx') == 5 and p:match('ayzx') == 5 and not p:match'abc')
406 405
407 406
408do 407do
409 -- large dynamic Cc 408 print "testing large dynamic Cc"
410 local lim = 2^16 - 1 409 local lim = 2^16 - 1
411 local c = 0 410 local c = 0
412 local function seq (n) 411 local function seq (n)
@@ -985,10 +984,10 @@ for i = 1, 10 do
985 assert(p:match("aaaaaaaaaaa") == 11 - i + 1) 984 assert(p:match("aaaaaaaaaaa") == 11 - i + 1)
986end 985end
987 986
988print"+"
989 987
990 988
991-- tests for back references 989print "testing back references"
990
992checkerr("back reference 'x' not found", m.match, m.Cb('x'), '') 991checkerr("back reference 'x' not found", m.match, m.Cb('x'), '')
993checkerr("back reference 'b' not found", m.match, m.Cg(1, 'a') * m.Cb('b'), 'a') 992checkerr("back reference 'b' not found", m.match, m.Cg(1, 'a') * m.Cb('b'), 'a')
994 993
@@ -1032,6 +1031,17 @@ local function id (s, i, ...)
1032 return true, ... 1031 return true, ...
1033end 1032end
1034 1033
1034do -- run-time capture in an end predicate (should discard its value)
1035 local x = 0
1036 function foo (s, i)
1037 x = x + 1
1038 return true, x
1039 end
1040
1041 local p = #(m.Cmt("", foo) * "xx") * m.Cmt("", foo)
1042 assert(p:match("xx") == 2)
1043end
1044
1035assert(m.Cmt(m.Cs((m.Cmt(m.S'abc' / { a = 'x', c = 'y' }, id) + 1045assert(m.Cmt(m.Cs((m.Cmt(m.S'abc' / { a = 'x', c = 'y' }, id) +
1036 m.R'09'^1 / string.char + 1046 m.R'09'^1 / string.char +
1037 m.P(1))^0), id):match"acb98+68c" == "xyb\98+\68y") 1047 m.P(1))^0), id):match"acb98+68c" == "xyb\98+\68y")
@@ -1171,9 +1181,85 @@ t = {p:match('abacc')}
1171checkeq(t, {'a', 'aa', 20, 'a', 'aaa', 'aaa'}) 1181checkeq(t, {'a', 'aa', 20, 'a', 'aaa', 'aaa'})
1172 1182
1173 1183
1184do print"testing large grammars"
1185 local lim = 1000 -- number of rules
1186 local t = {}
1187
1188 for i = 3, lim do
1189 t[i] = m.V(i - 1) -- each rule calls previous one
1190 end
1191 t[1] = m.V(lim) -- start on last rule
1192 t[2] = m.C("alo") -- final rule
1193
1194 local P = m.P(t) -- build grammar
1195 assert(P:match("alo") == "alo")
1196
1197 t[#t + 1] = m.P("x") -- one more rule...
1198 checkerr("too many rules", m.P, t)
1199end
1200
1201
1202print "testing UTF-8 ranges"
1203
1204do -- a few typical UTF-8 ranges
1205 local p = m.utfR(0x410, 0x44f)^1 / "cyr: %0"
1206 + m.utfR(0x4e00, 0x9fff)^1 / "cjk: %0"
1207 + m.utfR(0x1F600, 0x1F64F)^1 / "emot: %0"
1208 + m.utfR(0, 0x7f)^1 / "ascii: %0"
1209 + m.utfR(0, 0x10ffff) / "other: %0"
1210
1211 p = m.Ct(p^0) * -m.P(1)
1212
1213 local cyr = "ждюя"
1214 local emot = "\240\159\152\128\240\159\153\128" -- 😀🙀
1215 local cjk = "专举乸"
1216 local ascii = "alo"
1217 local last = "\244\143\191\191" -- U+10FFFF
1218
1219 local s = cyr .. "—" .. emot .. "—" .. cjk .. "—" .. ascii .. last
1220 t = (p:match(s))
1221
1222 assert(t[1] == "cyr: " .. cyr and t[2] == "other: —" and
1223 t[3] == "emot: " .. emot and t[4] == "other: —" and
1224 t[5] == "cjk: " .. cjk and t[6] == "other: —" and
1225 t[7] == "ascii: " .. ascii and t[8] == "other: " .. last and
1226 t[9] == nil)
1227end
1228
1229
1230do -- valid and invalid code points
1231 local p = m.utfR(0, 0x10ffff)^0
1232 assert(p:match("汉字\128") == #"汉字" + 1)
1233 assert(p:match("\244\159\191") == 1)
1234 assert(p:match("\244\159\191\191") == 1)
1235 assert(p:match("\255") == 1)
1236
1237 -- basic errors
1238 checkerr("empty range", m.utfR, 1, 0)
1239 checkerr("invalid code point", m.utfR, 1, 0x10ffff + 1)
1240end
1241
1242
1243do -- back references (fixed width)
1244 -- match a byte after a CJK point
1245 local p = m.B(m.utfR(0x4e00, 0x9fff)) * m.C(1)
1246 p = m.P{ p + m.P(1) * m.V(1) } -- search for 'p'
1247 assert(p:match("ab д 专X x") == "X")
1248
1249 -- match a byte after a hebrew point
1250 local p = m.B(m.utfR(0x5d0, 0x5ea)) * m.C(1)
1251 p = m.P(#"ש") * p
1252 assert(p:match("שX") == "X")
1253
1254 checkerr("fixed length", m.B, m.utfR(0, 0x10ffff))
1255end
1256
1257
1258
1174------------------------------------------------------------------- 1259-------------------------------------------------------------------
1175-- Tests for 're' module 1260-- Tests for 're' module
1176------------------------------------------------------------------- 1261-------------------------------------------------------------------
1262print"testing 're' module"
1177 1263
1178local re = require "relabel" 1264local re = require "relabel"
1179 1265
diff --git a/testlabel.lua b/testlabel.lua
index d60bb54..c2f760d 100644
--- a/testlabel.lua
+++ b/testlabel.lua
@@ -115,7 +115,7 @@ p = m.P{
115 "S", 115 "S",
116 S = m.T("bola"), 116 S = m.T("bola"),
117 bolada = m.P"a" 117 bolada = m.P"a"
118} 118}
119r, l, poserr = p:match("abc") 119r, l, poserr = p:match("abc")
120assert(r == nil and l == 'bola' and poserr == 1) 120assert(r == nil and l == 'bola' and poserr == 1)
121 121
@@ -134,7 +134,7 @@ p = m.P{
134 "S", 134 "S",
135 S = m.T("bola"), 135 S = m.T("bola"),
136 bola = m.P"a" 136 bola = m.P"a"
137} 137}
138r, l, poserr = p:match("abc") 138r, l, poserr = p:match("abc")
139assert(r == 2) 139assert(r == 2)
140 140