diff options
| author | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2012-07-31 14:48:42 -0300 |
|---|---|---|
| committer | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2012-07-31 14:48:42 -0300 |
| commit | 6625cbecd101bfc0bc1d1034a4b527bd9a17a5de (patch) | |
| tree | a69a04b8ba006ede9eada251502c8381b72173be /lstrlib.c | |
| parent | 4ac55997ecc70410c065a9494a694c2ad25c0104 (diff) | |
| download | lua-6625cbecd101bfc0bc1d1034a4b527bd9a17a5de.tar.gz lua-6625cbecd101bfc0bc1d1034a4b527bd9a17a5de.tar.bz2 lua-6625cbecd101bfc0bc1d1034a4b527bd9a17a5de.zip | |
Bug: Some patterns can overflow the C stack, due to recursion
(Took the opportunity to refactor function 'match')
Diffstat (limited to 'lstrlib.c')
| -rw-r--r-- | lstrlib.c | 201 |
1 files changed, 124 insertions, 77 deletions
| @@ -1,5 +1,5 @@ | |||
| 1 | /* | 1 | /* |
| 2 | ** $Id: lstrlib.c,v 1.175 2012/04/20 13:16:48 roberto Exp roberto $ | 2 | ** $Id: lstrlib.c,v 1.176 2012/05/23 15:37:09 roberto Exp roberto $ |
| 3 | ** Standard library for string operations and pattern-matching | 3 | ** Standard library for string operations and pattern-matching |
| 4 | ** See Copyright Notice in lua.h | 4 | ** See Copyright Notice in lua.h |
| 5 | */ | 5 | */ |
| @@ -194,7 +194,9 @@ static int str_dump (lua_State *L) { | |||
| 194 | #define CAP_UNFINISHED (-1) | 194 | #define CAP_UNFINISHED (-1) |
| 195 | #define CAP_POSITION (-2) | 195 | #define CAP_POSITION (-2) |
| 196 | 196 | ||
| 197 | |||
| 197 | typedef struct MatchState { | 198 | typedef struct MatchState { |
| 199 | int matchdepth; /* control for recursive depth (to avoid C stack overflow) */ | ||
| 198 | const char *src_init; /* init of source string */ | 200 | const char *src_init; /* init of source string */ |
| 199 | const char *src_end; /* end ('\0') of source string */ | 201 | const char *src_end; /* end ('\0') of source string */ |
| 200 | const char *p_end; /* end ('\0') of pattern */ | 202 | const char *p_end; /* end ('\0') of pattern */ |
| @@ -207,6 +209,16 @@ typedef struct MatchState { | |||
| 207 | } MatchState; | 209 | } MatchState; |
| 208 | 210 | ||
| 209 | 211 | ||
| 212 | /* recursive function */ | ||
| 213 | static const char *match (MatchState *ms, const char *s, const char *p); | ||
| 214 | |||
| 215 | |||
| 216 | /* maximum recursion depth for 'match' */ | ||
| 217 | #if !defined(MAXCCALLS) | ||
| 218 | #define MAXCCALLS 200 | ||
| 219 | #endif | ||
| 220 | |||
| 221 | |||
| 210 | #define L_ESC '%' | 222 | #define L_ESC '%' |
| 211 | #define SPECIALS "^$*+?.([%-" | 223 | #define SPECIALS "^$*+?.([%-" |
| 212 | 224 | ||
| @@ -294,19 +306,22 @@ static int matchbracketclass (int c, const char *p, const char *ec) { | |||
| 294 | } | 306 | } |
| 295 | 307 | ||
| 296 | 308 | ||
| 297 | static int singlematch (int c, const char *p, const char *ep) { | 309 | static int singlematch (MatchState *ms, const char *s, const char *p, |
| 298 | switch (*p) { | 310 | const char *ep) { |
| 299 | case '.': return 1; /* matches any char */ | 311 | if (s >= ms->src_end) |
| 300 | case L_ESC: return match_class(c, uchar(*(p+1))); | 312 | return 0; |
| 301 | case '[': return matchbracketclass(c, p, ep-1); | 313 | else { |
| 302 | default: return (uchar(*p) == c); | 314 | int c = uchar(*s); |
| 315 | switch (*p) { | ||
| 316 | case '.': return 1; /* matches any char */ | ||
| 317 | case L_ESC: return match_class(c, uchar(*(p+1))); | ||
| 318 | case '[': return matchbracketclass(c, p, ep-1); | ||
| 319 | default: return (uchar(*p) == c); | ||
| 320 | } | ||
| 303 | } | 321 | } |
| 304 | } | 322 | } |
| 305 | 323 | ||
| 306 | 324 | ||
| 307 | static const char *match (MatchState *ms, const char *s, const char *p); | ||
| 308 | |||
| 309 | |||
| 310 | static const char *matchbalance (MatchState *ms, const char *s, | 325 | static const char *matchbalance (MatchState *ms, const char *s, |
| 311 | const char *p) { | 326 | const char *p) { |
| 312 | if (p >= ms->p_end - 1) | 327 | if (p >= ms->p_end - 1) |
| @@ -331,7 +346,7 @@ static const char *matchbalance (MatchState *ms, const char *s, | |||
| 331 | static const char *max_expand (MatchState *ms, const char *s, | 346 | static const char *max_expand (MatchState *ms, const char *s, |
| 332 | const char *p, const char *ep) { | 347 | const char *p, const char *ep) { |
| 333 | ptrdiff_t i = 0; /* counts maximum expand for item */ | 348 | ptrdiff_t i = 0; /* counts maximum expand for item */ |
| 334 | while ((s+i)<ms->src_end && singlematch(uchar(*(s+i)), p, ep)) | 349 | while (singlematch(ms, s + i, p, ep)) |
| 335 | i++; | 350 | i++; |
| 336 | /* keeps trying to match with the maximum repetitions */ | 351 | /* keeps trying to match with the maximum repetitions */ |
| 337 | while (i>=0) { | 352 | while (i>=0) { |
| @@ -349,7 +364,7 @@ static const char *min_expand (MatchState *ms, const char *s, | |||
| 349 | const char *res = match(ms, s, ep+1); | 364 | const char *res = match(ms, s, ep+1); |
| 350 | if (res != NULL) | 365 | if (res != NULL) |
| 351 | return res; | 366 | return res; |
| 352 | else if (s<ms->src_end && singlematch(uchar(*s), p, ep)) | 367 | else if (singlematch(ms, s, p, ep)) |
| 353 | s++; /* try with one more repetition */ | 368 | s++; /* try with one more repetition */ |
| 354 | else return NULL; | 369 | else return NULL; |
| 355 | } | 370 | } |
| @@ -393,79 +408,105 @@ static const char *match_capture (MatchState *ms, const char *s, int l) { | |||
| 393 | 408 | ||
| 394 | 409 | ||
| 395 | static const char *match (MatchState *ms, const char *s, const char *p) { | 410 | static const char *match (MatchState *ms, const char *s, const char *p) { |
| 411 | if (ms->matchdepth-- == 0) | ||
| 412 | luaL_error(ms->L, "pattern too complex"); | ||
| 396 | init: /* using goto's to optimize tail recursion */ | 413 | init: /* using goto's to optimize tail recursion */ |
| 397 | if (p == ms->p_end) /* end of pattern? */ | 414 | if (p != ms->p_end) { /* end of pattern? */ |
| 398 | return s; /* match succeeded */ | 415 | switch (*p) { |
| 399 | switch (*p) { | 416 | case '(': { /* start capture */ |
| 400 | case '(': { /* start capture */ | 417 | if (*(p + 1) == ')') /* position capture? */ |
| 401 | if (*(p+1) == ')') /* position capture? */ | 418 | s = start_capture(ms, s, p + 2, CAP_POSITION); |
| 402 | return start_capture(ms, s, p+2, CAP_POSITION); | 419 | else |
| 403 | else | 420 | s = start_capture(ms, s, p + 1, CAP_UNFINISHED); |
| 404 | return start_capture(ms, s, p+1, CAP_UNFINISHED); | 421 | break; |
| 405 | } | ||
| 406 | case ')': { /* end capture */ | ||
| 407 | return end_capture(ms, s, p+1); | ||
| 408 | } | ||
| 409 | case '$': { | ||
| 410 | if ((p+1) == ms->p_end) /* is the `$' the last char in pattern? */ | ||
| 411 | return (s == ms->src_end) ? s : NULL; /* check end of string */ | ||
| 412 | else goto dflt; | ||
| 413 | } | ||
| 414 | case L_ESC: { /* escaped sequences not in the format class[*+?-]? */ | ||
| 415 | switch (*(p+1)) { | ||
| 416 | case 'b': { /* balanced string? */ | ||
| 417 | s = matchbalance(ms, s, p+2); | ||
| 418 | if (s == NULL) return NULL; | ||
| 419 | p+=4; goto init; /* else return match(ms, s, p+4); */ | ||
| 420 | } | ||
| 421 | case 'f': { /* frontier? */ | ||
| 422 | const char *ep; char previous; | ||
| 423 | p += 2; | ||
| 424 | if (*p != '[') | ||
| 425 | luaL_error(ms->L, "missing " LUA_QL("[") " after " | ||
| 426 | LUA_QL("%%f") " in pattern"); | ||
| 427 | ep = classend(ms, p); /* points to what is next */ | ||
| 428 | previous = (s == ms->src_init) ? '\0' : *(s-1); | ||
| 429 | if (matchbracketclass(uchar(previous), p, ep-1) || | ||
| 430 | !matchbracketclass(uchar(*s), p, ep-1)) return NULL; | ||
| 431 | p=ep; goto init; /* else return match(ms, s, ep); */ | ||
| 432 | } | ||
| 433 | case '0': case '1': case '2': case '3': | ||
| 434 | case '4': case '5': case '6': case '7': | ||
| 435 | case '8': case '9': { /* capture results (%0-%9)? */ | ||
| 436 | s = match_capture(ms, s, uchar(*(p+1))); | ||
| 437 | if (s == NULL) return NULL; | ||
| 438 | p+=2; goto init; /* else return match(ms, s, p+2) */ | ||
| 439 | } | ||
| 440 | default: goto dflt; | ||
| 441 | } | 422 | } |
| 442 | } | 423 | case ')': { /* end capture */ |
| 443 | default: dflt: { /* pattern class plus optional suffix */ | 424 | s = end_capture(ms, s, p + 1); |
| 444 | const char *ep = classend(ms, p); /* points to what is next */ | 425 | break; |
| 445 | int m = s < ms->src_end && singlematch(uchar(*s), p, ep); | 426 | } |
| 446 | switch (*ep) { | 427 | case '$': { |
| 447 | case '?': { /* optional */ | 428 | if ((p + 1) != ms->p_end) /* is the `$' the last char in pattern? */ |
| 448 | const char *res; | 429 | goto dflt; /* no; go to default */ |
| 449 | if (m && ((res=match(ms, s+1, ep+1)) != NULL)) | 430 | s = (s == ms->src_end) ? s : NULL; /* check end of string */ |
| 450 | return res; | 431 | break; |
| 451 | p=ep+1; goto init; /* else return match(ms, s, ep+1); */ | 432 | } |
| 452 | } | 433 | case L_ESC: { /* escaped sequences not in the format class[*+?-]? */ |
| 453 | case '*': { /* 0 or more repetitions */ | 434 | switch (*(p + 1)) { |
| 454 | return max_expand(ms, s, p, ep); | 435 | case 'b': { /* balanced string? */ |
| 455 | } | 436 | s = matchbalance(ms, s, p + 2); |
| 456 | case '+': { /* 1 or more repetitions */ | 437 | if (s != NULL) { |
| 457 | return (m ? max_expand(ms, s+1, p, ep) : NULL); | 438 | p += 4; goto init; /* return match(ms, s, p + 4); */ |
| 439 | } /* else fail (s == NULL) */ | ||
| 440 | break; | ||
| 441 | } | ||
| 442 | case 'f': { /* frontier? */ | ||
| 443 | const char *ep; char previous; | ||
| 444 | p += 2; | ||
| 445 | if (*p != '[') | ||
| 446 | luaL_error(ms->L, "missing " LUA_QL("[") " after " | ||
| 447 | LUA_QL("%%f") " in pattern"); | ||
| 448 | ep = classend(ms, p); /* points to what is next */ | ||
| 449 | previous = (s == ms->src_init) ? '\0' : *(s - 1); | ||
| 450 | if (!matchbracketclass(uchar(previous), p, ep - 1) && | ||
| 451 | matchbracketclass(uchar(*s), p, ep - 1)) { | ||
| 452 | p = ep; goto init; /* return match(ms, s, ep); */ | ||
| 453 | } | ||
| 454 | s = NULL; /* match failed */ | ||
| 455 | break; | ||
| 456 | } | ||
| 457 | case '0': case '1': case '2': case '3': | ||
| 458 | case '4': case '5': case '6': case '7': | ||
| 459 | case '8': case '9': { /* capture results (%0-%9)? */ | ||
| 460 | s = match_capture(ms, s, uchar(*(p + 1))); | ||
| 461 | if (s != NULL) { | ||
| 462 | p += 2; goto init; /* return match(ms, s, p + 2) */ | ||
| 463 | } | ||
| 464 | break; | ||
| 465 | } | ||
| 466 | default: goto dflt; | ||
| 458 | } | 467 | } |
| 459 | case '-': { /* 0 or more repetitions (minimum) */ | 468 | break; |
| 460 | return min_expand(ms, s, p, ep); | 469 | } |
| 470 | default: dflt: { /* pattern class plus optional suffix */ | ||
| 471 | const char *ep = classend(ms, p); /* points to optional suffix */ | ||
| 472 | /* does not match at least once? */ | ||
| 473 | if (!singlematch(ms, s, p, ep)) { | ||
| 474 | if (*ep == '*' || *ep == '?' || *ep == '-') { /* accept empty? */ | ||
| 475 | p = ep + 1; goto init; /* return match(ms, s, ep + 1); */ | ||
| 476 | } | ||
| 477 | else /* '+' or no suffix */ | ||
| 478 | s = NULL; /* fail */ | ||
| 461 | } | 479 | } |
| 462 | default: { | 480 | else { /* matched once */ |
| 463 | if (!m) return NULL; | 481 | switch (*ep) { /* handle optional suffix */ |
| 464 | s++; p=ep; goto init; /* else return match(ms, s+1, ep); */ | 482 | case '?': { /* optional */ |
| 483 | const char *res; | ||
| 484 | if ((res = match(ms, s + 1, ep + 1)) != NULL) | ||
| 485 | s = res; | ||
| 486 | else { | ||
| 487 | p = ep + 1; goto init; /* else return match(ms, s, ep + 1); */ | ||
| 488 | } | ||
| 489 | break; | ||
| 490 | } | ||
| 491 | case '+': /* 1 or more repetitions */ | ||
| 492 | s++; /* 1 match already done */ | ||
| 493 | /* go through */ | ||
| 494 | case '*': /* 0 or more repetitions */ | ||
| 495 | s = max_expand(ms, s, p, ep); | ||
| 496 | break; | ||
| 497 | case '-': /* 0 or more repetitions (minimum) */ | ||
| 498 | s = min_expand(ms, s, p, ep); | ||
| 499 | break; | ||
| 500 | default: /* no suffix */ | ||
| 501 | s++; p = ep; goto init; /* return match(ms, s + 1, ep); */ | ||
| 502 | } | ||
| 465 | } | 503 | } |
| 504 | break; | ||
| 466 | } | 505 | } |
| 467 | } | 506 | } |
| 468 | } | 507 | } |
| 508 | ms->matchdepth++; | ||
| 509 | return s; | ||
| 469 | } | 510 | } |
| 470 | 511 | ||
| 471 | 512 | ||
| @@ -561,12 +602,14 @@ static int str_find_aux (lua_State *L, int find) { | |||
| 561 | p++; lp--; /* skip anchor character */ | 602 | p++; lp--; /* skip anchor character */ |
| 562 | } | 603 | } |
| 563 | ms.L = L; | 604 | ms.L = L; |
| 605 | ms.matchdepth = MAXCCALLS; | ||
| 564 | ms.src_init = s; | 606 | ms.src_init = s; |
| 565 | ms.src_end = s + ls; | 607 | ms.src_end = s + ls; |
| 566 | ms.p_end = p + lp; | 608 | ms.p_end = p + lp; |
| 567 | do { | 609 | do { |
| 568 | const char *res; | 610 | const char *res; |
| 569 | ms.level = 0; | 611 | ms.level = 0; |
| 612 | lua_assert(ms.matchdepth == MAXCCALLS); | ||
| 570 | if ((res=match(&ms, s1, p)) != NULL) { | 613 | if ((res=match(&ms, s1, p)) != NULL) { |
| 571 | if (find) { | 614 | if (find) { |
| 572 | lua_pushinteger(L, s1 - s + 1); /* start */ | 615 | lua_pushinteger(L, s1 - s + 1); /* start */ |
| @@ -600,6 +643,7 @@ static int gmatch_aux (lua_State *L) { | |||
| 600 | const char *p = lua_tolstring(L, lua_upvalueindex(2), &lp); | 643 | const char *p = lua_tolstring(L, lua_upvalueindex(2), &lp); |
| 601 | const char *src; | 644 | const char *src; |
| 602 | ms.L = L; | 645 | ms.L = L; |
| 646 | ms.matchdepth = MAXCCALLS; | ||
| 603 | ms.src_init = s; | 647 | ms.src_init = s; |
| 604 | ms.src_end = s+ls; | 648 | ms.src_end = s+ls; |
| 605 | ms.p_end = p + lp; | 649 | ms.p_end = p + lp; |
| @@ -608,6 +652,7 @@ static int gmatch_aux (lua_State *L) { | |||
| 608 | src++) { | 652 | src++) { |
| 609 | const char *e; | 653 | const char *e; |
| 610 | ms.level = 0; | 654 | ms.level = 0; |
| 655 | lua_assert(ms.matchdepth == MAXCCALLS); | ||
| 611 | if ((e = match(&ms, src, p)) != NULL) { | 656 | if ((e = match(&ms, src, p)) != NULL) { |
| 612 | lua_Integer newstart = e-s; | 657 | lua_Integer newstart = e-s; |
| 613 | if (e == src) newstart++; /* empty match? go at least one position */ | 658 | if (e == src) newstart++; /* empty match? go at least one position */ |
| @@ -705,12 +750,14 @@ static int str_gsub (lua_State *L) { | |||
| 705 | p++; lp--; /* skip anchor character */ | 750 | p++; lp--; /* skip anchor character */ |
| 706 | } | 751 | } |
| 707 | ms.L = L; | 752 | ms.L = L; |
| 753 | ms.matchdepth = MAXCCALLS; | ||
| 708 | ms.src_init = src; | 754 | ms.src_init = src; |
| 709 | ms.src_end = src+srcl; | 755 | ms.src_end = src+srcl; |
| 710 | ms.p_end = p + lp; | 756 | ms.p_end = p + lp; |
| 711 | while (n < max_s) { | 757 | while (n < max_s) { |
| 712 | const char *e; | 758 | const char *e; |
| 713 | ms.level = 0; | 759 | ms.level = 0; |
| 760 | lua_assert(ms.matchdepth == MAXCCALLS); | ||
| 714 | e = match(&ms, src, p); | 761 | e = match(&ms, src, p); |
| 715 | if (e) { | 762 | if (e) { |
| 716 | n++; | 763 | n++; |
