diff options
author | Denys Vlasenko <vda.linux@googlemail.com> | 2023-06-08 10:42:39 +0200 |
---|---|---|
committer | Denys Vlasenko <vda.linux@googlemail.com> | 2023-06-08 10:42:39 +0200 |
commit | 2ca39ffd447ca874fcea933194829717d5573247 (patch) | |
tree | 6d9eb4ba80ad9feec70c3f4f25dd3f7629c5fe5a | |
parent | 113685fbcd4c3432ec9b640583d50ba8da2102e8 (diff) | |
download | busybox-w32-2ca39ffd447ca874fcea933194829717d5573247.tar.gz busybox-w32-2ca39ffd447ca874fcea933194829717d5573247.tar.bz2 busybox-w32-2ca39ffd447ca874fcea933194829717d5573247.zip |
awk: fix subst code to handle "start of word" pattern correctly (needs REG_STARTEND)
function old new delta
awk_sub 637 714 +77
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r-- | editors/awk.c | 49 | ||||
-rwxr-xr-x | testsuite/awk.tests | 28 |
2 files changed, 51 insertions, 26 deletions
diff --git a/editors/awk.c b/editors/awk.c index df9b7fdc9..171f0a7ea 100644 --- a/editors/awk.c +++ b/editors/awk.c | |||
@@ -2504,17 +2504,46 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest /*,in | |||
2504 | regex_t sreg, *regex; | 2504 | regex_t sreg, *regex; |
2505 | /* True only if called to implement gensub(): */ | 2505 | /* True only if called to implement gensub(): */ |
2506 | int subexp = (src != dest); | 2506 | int subexp = (src != dest); |
2507 | 2507 | #if defined(REG_STARTEND) | |
2508 | const char *src_string; | ||
2509 | size_t src_strlen; | ||
2510 | regexec_flags = REG_STARTEND; | ||
2511 | #else | ||
2512 | regexec_flags = 0; | ||
2513 | #endif | ||
2508 | resbuf = NULL; | 2514 | resbuf = NULL; |
2509 | residx = 0; | 2515 | residx = 0; |
2510 | match_no = 0; | 2516 | match_no = 0; |
2511 | regexec_flags = 0; | ||
2512 | regex = as_regex(rn, &sreg); | 2517 | regex = as_regex(rn, &sreg); |
2513 | sp = getvar_s(src ? src : intvar[F0]); | 2518 | sp = getvar_s(src ? src : intvar[F0]); |
2519 | #if defined(REG_STARTEND) | ||
2520 | src_string = sp; | ||
2521 | src_strlen = strlen(src_string); | ||
2522 | #endif | ||
2514 | replen = strlen(repl); | 2523 | replen = strlen(repl); |
2515 | while (regexec(regex, sp, 10, pmatch, regexec_flags) == 0) { | 2524 | for (;;) { |
2516 | int so = pmatch[0].rm_so; | 2525 | int so, eo; |
2517 | int eo = pmatch[0].rm_eo; | 2526 | |
2527 | #if defined(REG_STARTEND) | ||
2528 | // REG_STARTEND: "This flag is a BSD extension, not present in POSIX" | ||
2529 | size_t start_ofs = sp - src_string; | ||
2530 | pmatch[0].rm_so = start_ofs; | ||
2531 | pmatch[0].rm_eo = src_strlen; | ||
2532 | if (regexec(regex, src_string, 10, pmatch, regexec_flags) != 0) | ||
2533 | break; | ||
2534 | eo = pmatch[0].rm_eo - start_ofs; | ||
2535 | so = pmatch[0].rm_so - start_ofs; | ||
2536 | #else | ||
2537 | // BUG: | ||
2538 | // gsub(/\<b*/,"") on "abc" matches empty string at "a...", | ||
2539 | // advances sp one char (see "Empty match" comment later) to "bc" | ||
2540 | // ... and erroneously matches "b" even though it is NOT at the word start. | ||
2541 | enum { start_ofs = 0 }; | ||
2542 | if (regexec(regex, sp, 10, pmatch, regexec_flags) != 0) | ||
2543 | break; | ||
2544 | so = pmatch[0].rm_so; | ||
2545 | eo = pmatch[0].rm_eo; | ||
2546 | #endif | ||
2518 | 2547 | ||
2519 | //bb_error_msg("match %u: [%u,%u] '%s'%p", match_no+1, so, eo, sp,sp); | 2548 | //bb_error_msg("match %u: [%u,%u] '%s'%p", match_no+1, so, eo, sp,sp); |
2520 | resbuf = qrealloc(resbuf, residx + eo + replen, &resbufsize); | 2549 | resbuf = qrealloc(resbuf, residx + eo + replen, &resbufsize); |
@@ -2543,7 +2572,7 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest /*,in | |||
2543 | } | 2572 | } |
2544 | n = pmatch[j].rm_eo - pmatch[j].rm_so; | 2573 | n = pmatch[j].rm_eo - pmatch[j].rm_so; |
2545 | resbuf = qrealloc(resbuf, residx + replen + n, &resbufsize); | 2574 | resbuf = qrealloc(resbuf, residx + replen + n, &resbufsize); |
2546 | memcpy(resbuf + residx, sp + pmatch[j].rm_so, n); | 2575 | memcpy(resbuf + residx, sp + pmatch[j].rm_so - start_ofs, n); |
2547 | residx += n; | 2576 | residx += n; |
2548 | } else | 2577 | } else |
2549 | resbuf[residx++] = c; | 2578 | resbuf[residx++] = c; |
@@ -2557,12 +2586,6 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest /*,in | |||
2557 | if (eo == so) { | 2586 | if (eo == so) { |
2558 | /* Empty match (e.g. "b*" will match anywhere). | 2587 | /* Empty match (e.g. "b*" will match anywhere). |
2559 | * Advance by one char. */ | 2588 | * Advance by one char. */ |
2560 | //BUG (bug 1333): | ||
2561 | //gsub(/\<b*/,"") on "abc" will reach this point, advance to "bc" | ||
2562 | //... and will erroneously match "b" even though it is NOT at the word start. | ||
2563 | //we need REG_NOTBOW but it does not exist... | ||
2564 | //TODO: if EXTRA_COMPAT=y, use GNU matching and re_search, | ||
2565 | //it should be able to do it correctly. | ||
2566 | /* Subtle: this is safe only because | 2589 | /* Subtle: this is safe only because |
2567 | * qrealloc allocated at least one extra byte */ | 2590 | * qrealloc allocated at least one extra byte */ |
2568 | resbuf[residx] = *sp; | 2591 | resbuf[residx] = *sp; |
@@ -2571,7 +2594,7 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest /*,in | |||
2571 | sp++; | 2594 | sp++; |
2572 | residx++; | 2595 | residx++; |
2573 | } | 2596 | } |
2574 | regexec_flags = REG_NOTBOL; | 2597 | regexec_flags |= REG_NOTBOL; |
2575 | } | 2598 | } |
2576 | 2599 | ||
2577 | resbuf = qrealloc(resbuf, residx + strlen(sp), &resbufsize); | 2600 | resbuf = qrealloc(resbuf, residx + strlen(sp), &resbufsize); |
diff --git a/testsuite/awk.tests b/testsuite/awk.tests index c61d32947..5a792c241 100755 --- a/testsuite/awk.tests +++ b/testsuite/awk.tests | |||
@@ -557,14 +557,12 @@ testing 'awk gensub backslashes \' \ | |||
557 | 'awk '$sq'BEGIN { s="\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ | 557 | 'awk '$sq'BEGIN { s="\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ |
558 | 's=\\ | 558 | 's=\\ |
559 | \\|\\ | 559 | \\|\\ |
560 | ' \ | 560 | ' '' '' |
561 | '' '' | ||
562 | testing 'awk gensub backslashes \\' \ | 561 | testing 'awk gensub backslashes \\' \ |
563 | 'awk '$sq'BEGIN { s="\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ | 562 | 'awk '$sq'BEGIN { s="\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ |
564 | 's=\\\\ | 563 | 's=\\\\ |
565 | \\|\\ | 564 | \\|\\ |
566 | ' \ | 565 | ' '' '' |
567 | '' '' | ||
568 | # gawk 5.1.1 handles trailing unpaired \ inconsistently. | 566 | # gawk 5.1.1 handles trailing unpaired \ inconsistently. |
569 | # If replace string is single \, it is used verbatim, | 567 | # If replace string is single \, it is used verbatim, |
570 | # but if it is \\\ (three slashes), gawk uses "\<NUL>" (!!!), not "\\" as you would expect. | 568 | # but if it is \\\ (three slashes), gawk uses "\<NUL>" (!!!), not "\\" as you would expect. |
@@ -572,31 +570,35 @@ testing 'awk gensub backslashes \\\' \ | |||
572 | 'awk '$sq'BEGIN { s="\\\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ | 570 | 'awk '$sq'BEGIN { s="\\\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ |
573 | 's=\\\\\\ | 571 | 's=\\\\\\ |
574 | \\\\|\\\\ | 572 | \\\\|\\\\ |
575 | ' \ | 573 | ' '' '' |
576 | '' '' | ||
577 | testing 'awk gensub backslashes \\\\' \ | 574 | testing 'awk gensub backslashes \\\\' \ |
578 | 'awk '$sq'BEGIN { s="\\\\\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ | 575 | 'awk '$sq'BEGIN { s="\\\\\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ |
579 | 's=\\\\\\\\ | 576 | 's=\\\\\\\\ |
580 | \\\\|\\\\ | 577 | \\\\|\\\\ |
581 | ' \ | 578 | ' '' '' |
582 | '' '' | ||
583 | testing 'awk gensub backslashes \&' \ | 579 | testing 'awk gensub backslashes \&' \ |
584 | 'awk '$sq'BEGIN { s="\\&"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ | 580 | 'awk '$sq'BEGIN { s="\\&"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ |
585 | 's=\\& | 581 | 's=\\& |
586 | &|& | 582 | &|& |
587 | ' \ | 583 | ' '' '' |
588 | '' '' | ||
589 | testing 'awk gensub backslashes \0' \ | 584 | testing 'awk gensub backslashes \0' \ |
590 | 'awk '$sq'BEGIN { s="\\0"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ | 585 | 'awk '$sq'BEGIN { s="\\0"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ |
591 | 's=\\0 | 586 | 's=\\0 |
592 | a|a | 587 | a|a |
593 | ' \ | 588 | ' '' '' |
594 | '' '' | ||
595 | testing 'awk gensub backslashes \\0' \ | 589 | testing 'awk gensub backslashes \\0' \ |
596 | 'awk '$sq'BEGIN { s="\\\\0"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ | 590 | 'awk '$sq'BEGIN { s="\\\\0"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ |
597 | 's=\\\\0 | 591 | 's=\\\\0 |
598 | \\0|\\0 | 592 | \\0|\\0 |
599 | ' \ | 593 | ' '' '' |
594 | |||
595 | # The "b" in "abc" should not match \<b* pattern. | ||
596 | # Currently we use REG_STARTEND ("This flag is a BSD extension, not present in POSIX") | ||
597 | # to implement the code to handle this correctly, but if your libc has no REG_STARTEND, | ||
598 | # the alternative code mishandles this case. | ||
599 | testing 'awk gsub erroneous word start match' \ | ||
600 | "awk 'BEGIN { a=\"abc\"; gsub(/\<b*/,\"\",a); print a }'" \ | ||
601 | 'abc\n' \ | ||
600 | '' '' | 602 | '' '' |
601 | 603 | ||
602 | exit $FAILCOUNT | 604 | exit $FAILCOUNT |