aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Denys Vlasenko <vda.linux@googlemail.com> 2023-06-08 10:42:39 +0200
committer Denys Vlasenko <vda.linux@googlemail.com> 2023-06-08 10:42:39 +0200
commit 2ca39ffd447ca874fcea933194829717d5573247 (patch)
tree 6d9eb4ba80ad9feec70c3f4f25dd3f7629c5fe5a
parent 113685fbcd4c3432ec9b640583d50ba8da2102e8 (diff)
download busybox-w32-2ca39ffd447ca874fcea933194829717d5573247.tar.gz
busybox-w32-2ca39ffd447ca874fcea933194829717d5573247.tar.bz2
busybox-w32-2ca39ffd447ca874fcea933194829717d5573247.zip
awk: fix subst code to handle "start of word" pattern correctly (needs REG_STARTEND)

function                                             old     new   delta
awk_sub                                              637     714     +77

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r-- editors/awk.c 49
-rwxr-xr-x testsuite/awk.tests 28
2 files changed, 51 insertions, 26 deletions
diff --git a/editors/awk.c b/editors/awk.c
index df9b7fdc9..171f0a7ea 100644
--- a/editors/awk.c
+++ b/editors/awk.c
@@ -2504,17 +2504,46 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest /*,in
2504 regex_t sreg, *regex; 2504 regex_t sreg, *regex;
2505 /* True only if called to implement gensub(): */ 2505 /* True only if called to implement gensub(): */
2506 int subexp = (src != dest); 2506 int subexp = (src != dest);
2507 2507#if defined(REG_STARTEND)
2508 const char *src_string;
2509 size_t src_strlen;
2510 regexec_flags = REG_STARTEND;
2511#else
2512 regexec_flags = 0;
2513#endif
2508 resbuf = NULL; 2514 resbuf = NULL;
2509 residx = 0; 2515 residx = 0;
2510 match_no = 0; 2516 match_no = 0;
2511 regexec_flags = 0;
2512 regex = as_regex(rn, &sreg); 2517 regex = as_regex(rn, &sreg);
2513 sp = getvar_s(src ? src : intvar[F0]); 2518 sp = getvar_s(src ? src : intvar[F0]);
2519#if defined(REG_STARTEND)
2520 src_string = sp;
2521 src_strlen = strlen(src_string);
2522#endif
2514 replen = strlen(repl); 2523 replen = strlen(repl);
2515 while (regexec(regex, sp, 10, pmatch, regexec_flags) == 0) { 2524 for (;;) {
2516 int so = pmatch[0].rm_so; 2525 int so, eo;
2517 int eo = pmatch[0].rm_eo; 2526
2527#if defined(REG_STARTEND)
2528// REG_STARTEND: "This flag is a BSD extension, not present in POSIX"
2529 size_t start_ofs = sp - src_string;
2530 pmatch[0].rm_so = start_ofs;
2531 pmatch[0].rm_eo = src_strlen;
2532 if (regexec(regex, src_string, 10, pmatch, regexec_flags) != 0)
2533 break;
2534 eo = pmatch[0].rm_eo - start_ofs;
2535 so = pmatch[0].rm_so - start_ofs;
2536#else
2537// BUG:
2538// gsub(/\<b*/,"") on "abc" matches empty string at "a...",
2539// advances sp one char (see "Empty match" comment later) to "bc"
2540// ... and erroneously matches "b" even though it is NOT at the word start.
2541 enum { start_ofs = 0 };
2542 if (regexec(regex, sp, 10, pmatch, regexec_flags) != 0)
2543 break;
2544 so = pmatch[0].rm_so;
2545 eo = pmatch[0].rm_eo;
2546#endif
2518 2547
2519 //bb_error_msg("match %u: [%u,%u] '%s'%p", match_no+1, so, eo, sp,sp); 2548 //bb_error_msg("match %u: [%u,%u] '%s'%p", match_no+1, so, eo, sp,sp);
2520 resbuf = qrealloc(resbuf, residx + eo + replen, &resbufsize); 2549 resbuf = qrealloc(resbuf, residx + eo + replen, &resbufsize);
@@ -2543,7 +2572,7 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest /*,in
2543 } 2572 }
2544 n = pmatch[j].rm_eo - pmatch[j].rm_so; 2573 n = pmatch[j].rm_eo - pmatch[j].rm_so;
2545 resbuf = qrealloc(resbuf, residx + replen + n, &resbufsize); 2574 resbuf = qrealloc(resbuf, residx + replen + n, &resbufsize);
2546 memcpy(resbuf + residx, sp + pmatch[j].rm_so, n); 2575 memcpy(resbuf + residx, sp + pmatch[j].rm_so - start_ofs, n);
2547 residx += n; 2576 residx += n;
2548 } else 2577 } else
2549 resbuf[residx++] = c; 2578 resbuf[residx++] = c;
@@ -2557,12 +2586,6 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest /*,in
2557 if (eo == so) { 2586 if (eo == so) {
2558 /* Empty match (e.g. "b*" will match anywhere). 2587 /* Empty match (e.g. "b*" will match anywhere).
2559 * Advance by one char. */ 2588 * Advance by one char. */
2560//BUG (bug 1333):
2561//gsub(/\<b*/,"") on "abc" will reach this point, advance to "bc"
2562//... and will erroneously match "b" even though it is NOT at the word start.
2563//we need REG_NOTBOW but it does not exist...
2564//TODO: if EXTRA_COMPAT=y, use GNU matching and re_search,
2565//it should be able to do it correctly.
2566 /* Subtle: this is safe only because 2589 /* Subtle: this is safe only because
2567 * qrealloc allocated at least one extra byte */ 2590 * qrealloc allocated at least one extra byte */
2568 resbuf[residx] = *sp; 2591 resbuf[residx] = *sp;
@@ -2571,7 +2594,7 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest /*,in
2571 sp++; 2594 sp++;
2572 residx++; 2595 residx++;
2573 } 2596 }
2574 regexec_flags = REG_NOTBOL; 2597 regexec_flags |= REG_NOTBOL;
2575 } 2598 }
2576 2599
2577 resbuf = qrealloc(resbuf, residx + strlen(sp), &resbufsize); 2600 resbuf = qrealloc(resbuf, residx + strlen(sp), &resbufsize);
diff --git a/testsuite/awk.tests b/testsuite/awk.tests
index c61d32947..5a792c241 100755
--- a/testsuite/awk.tests
+++ b/testsuite/awk.tests
@@ -557,14 +557,12 @@ testing 'awk gensub backslashes \' \
557 'awk '$sq'BEGIN { s="\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ 557 'awk '$sq'BEGIN { s="\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
558 's=\\ 558 's=\\
559\\|\\ 559\\|\\
560' \ 560' '' ''
561 '' ''
562testing 'awk gensub backslashes \\' \ 561testing 'awk gensub backslashes \\' \
563 'awk '$sq'BEGIN { s="\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ 562 'awk '$sq'BEGIN { s="\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
564 's=\\\\ 563 's=\\\\
565\\|\\ 564\\|\\
566' \ 565' '' ''
567 '' ''
568# gawk 5.1.1 handles trailing unpaired \ inconsistently. 566# gawk 5.1.1 handles trailing unpaired \ inconsistently.
569# If replace string is single \, it is used verbatim, 567# If replace string is single \, it is used verbatim,
570# but if it is \\\ (three slashes), gawk uses "\<NUL>" (!!!), not "\\" as you would expect. 568# but if it is \\\ (three slashes), gawk uses "\<NUL>" (!!!), not "\\" as you would expect.
@@ -572,31 +570,35 @@ testing 'awk gensub backslashes \\\' \
572 'awk '$sq'BEGIN { s="\\\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ 570 'awk '$sq'BEGIN { s="\\\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
573 's=\\\\\\ 571 's=\\\\\\
574\\\\|\\\\ 572\\\\|\\\\
575' \ 573' '' ''
576 '' ''
577testing 'awk gensub backslashes \\\\' \ 574testing 'awk gensub backslashes \\\\' \
578 'awk '$sq'BEGIN { s="\\\\\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ 575 'awk '$sq'BEGIN { s="\\\\\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
579 's=\\\\\\\\ 576 's=\\\\\\\\
580\\\\|\\\\ 577\\\\|\\\\
581' \ 578' '' ''
582 '' ''
583testing 'awk gensub backslashes \&' \ 579testing 'awk gensub backslashes \&' \
584 'awk '$sq'BEGIN { s="\\&"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ 580 'awk '$sq'BEGIN { s="\\&"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
585 's=\\& 581 's=\\&
586&|& 582&|&
587' \ 583' '' ''
588 '' ''
589testing 'awk gensub backslashes \0' \ 584testing 'awk gensub backslashes \0' \
590 'awk '$sq'BEGIN { s="\\0"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ 585 'awk '$sq'BEGIN { s="\\0"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
591 's=\\0 586 's=\\0
592a|a 587a|a
593' \ 588' '' ''
594 '' ''
595testing 'awk gensub backslashes \\0' \ 589testing 'awk gensub backslashes \\0' \
596 'awk '$sq'BEGIN { s="\\\\0"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ 590 'awk '$sq'BEGIN { s="\\\\0"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
597 's=\\\\0 591 's=\\\\0
598\\0|\\0 592\\0|\\0
599' \ 593' '' ''
594
595# The "b" in "abc" should not match <b* pattern.
596# Currently we use REG_STARTEND ("This flag is a BSD extension, not present in POSIX")
597# to implement the code to handle this correctly, but if your libc has no REG_STARTEND,
598# the alternative code mishandles this case.
599testing 'awk gsub erroneous word start match' \
600 "awk 'BEGIN { a=\"abc\"; gsub(/\<b*/,\"\",a); print a }'" \
601 'abc\n' \
600 '' '' 602 '' ''
601 603
602exit $FAILCOUNT 604exit $FAILCOUNT