awk: fix subst code to handle "start of word" pattern correctly (needs REG_STARTEND)

function                                             old     new   delta
awk_sub                                              637     714     +77

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
author: Denys Vlasenko <vda.linux@googlemail.com> 2023-06-08 10:42:39 +0200
committer: Denys Vlasenko <vda.linux@googlemail.com> 2023-06-08 10:42:39 +0200
commit: 2ca39ffd447ca874fcea933194829717d5573247 (patch)
tree: 6d9eb4ba80ad9feec70c3f4f25dd3f7629c5fe5a
parent: 113685fbcd4c3432ec9b640583d50ba8da2102e8 (diff)
download: busybox-w32-2ca39ffd447ca874fcea933194829717d5573247.tar.gz
busybox-w32-2ca39ffd447ca874fcea933194829717d5573247.tar.bz2
busybox-w32-2ca39ffd447ca874fcea933194829717d5573247.zip
2 files changed, 51 insertions, 26 deletions
diff --git a/editors/awk.c b/editors/awk.c
index df9b7fdc9..171f0a7ea 100644
--- a/editors/awk.c
+++ b/editors/awk.c
@@ -2504,17 +2504,46 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest /*,in
        regex_t sreg, *regex;
        /* True only if called to implement gensub(): */
        int subexp = (src != dest);
+#if defined(REG_STARTEND)
+        const char *src_string;
+        size_t src_strlen;
+        regexec_flags = REG_STARTEND;
+#else
+        regexec_flags = 0;
+#endif
        resbuf = NULL;
        residx = 0;
        match_no = 0;
-        regexec_flags = 0;
        regex = as_regex(rn, &sreg);
        sp = getvar_s(src ? src : intvar[F0]);
+#if defined(REG_STARTEND)
+        src_string = sp;
+        src_strlen = strlen(src_string);
+#endif
        replen = strlen(repl);
-        while (regexec(regex, sp, 10, pmatch, regexec_flags) == 0) {
+        for (;;) {
-                int so = pmatch[0].rm_so;
+                int so, eo;
-                int eo = pmatch[0].rm_eo;
+#if defined(REG_STARTEND)
+// REG_STARTEND: "This flag is a BSD extension, not present in POSIX"
+                size_t start_ofs = sp - src_string;
+                pmatch[0].rm_so = start_ofs;
+                pmatch[0].rm_eo = src_strlen;
+                if (regexec(regex, src_string, 10, pmatch, regexec_flags) != 0)
+                        break;
+                eo = pmatch[0].rm_eo - start_ofs;
+                so = pmatch[0].rm_so - start_ofs;
+#else
+// BUG:
+// gsub(/\<b*/,"") on "abc" matches empty string at "a...",
+// advances sp one char (see "Empty match" comment later) to "bc"
+// ... and erroneously matches "b" even though it is NOT at the word start.
+                enum { start_ofs = 0 };
+                if (regexec(regex, sp, 10, pmatch, regexec_flags) != 0)
+                        break;
+                so = pmatch[0].rm_so;
+                eo = pmatch[0].rm_eo;
+#endif
                //bb_error_msg("match %u: [%u,%u] '%s'%p", match_no+1, so, eo, sp,sp);
                resbuf = qrealloc(resbuf, residx + eo + replen, &resbufsize);
@@ -2543,7 +2572,7 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest /*,in
                                        }
                                        n = pmatch[j].rm_eo - pmatch[j].rm_so;
                                        resbuf = qrealloc(resbuf, residx + replen + n, &resbufsize);
-                                        memcpy(resbuf + residx, sp + pmatch[j].rm_so, n);
+                                        memcpy(resbuf + residx, sp + pmatch[j].rm_so - start_ofs, n);
                                        residx += n;
                                } else
                                        resbuf[residx++] = c;
@@ -2557,12 +2586,6 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest /*,in
                if (eo == so) {
                        /* Empty match (e.g. "b*" will match anywhere).
                         * Advance by one char. */
-//BUG (bug 1333):
-//gsub(/\<b*/,"") on "abc" will reach this point, advance to "bc"
-//... and will erroneously match "b" even though it is NOT at the word start.
-//we need REG_NOTBOW but it does not exist...
-//TODO: if EXTRA_COMPAT=y, use GNU matching and re_search,
-//it should be able to do it correctly.
                        /* Subtle: this is safe only because
                         * qrealloc allocated at least one extra byte */
                        resbuf[residx] = *sp;
@@ -2571,7 +2594,7 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest /*,in
                        sp++;
                        residx++;
                }
-                regexec_flags = REG_NOTBOL;
+                regexec_flags |= REG_NOTBOL;
        }
        resbuf = qrealloc(resbuf, residx + strlen(sp), &resbufsize);
diff --git a/testsuite/awk.tests b/testsuite/awk.tests
index c61d32947..5a792c241 100755
--- a/testsuite/awk.tests
+++ b/testsuite/awk.tests
@@ -557,14 +557,12 @@ testing 'awk gensub backslashes \' \
        'awk '$sq'BEGIN { s="\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
        's=\\
 \\|\\
-' \
+' '' ''
-        '' ''
 testing 'awk gensub backslashes \\' \
        'awk '$sq'BEGIN { s="\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
        's=\\\\
 \\|\\
-' \
+' '' ''
-        '' ''
 # gawk 5.1.1 handles trailing unpaired \ inconsistently.
 # If replace string is single \, it is used verbatim,
 # but if it is \\\ (three slashes), gawk uses "\<NUL>" (!!!), not "\\" as you would expect.
@@ -572,31 +570,35 @@ testing 'awk gensub backslashes \\\' \
        'awk '$sq'BEGIN { s="\\\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
        's=\\\\\\
 \\\\|\\\\
-' \
+' '' ''
-        '' ''
 testing 'awk gensub backslashes \\\\' \
        'awk '$sq'BEGIN { s="\\\\\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
        's=\\\\\\\\
 \\\\|\\\\
-' \
+' '' ''
-        '' ''
 testing 'awk gensub backslashes \&' \
        'awk '$sq'BEGIN { s="\\&"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
        's=\\&
 &|&
-' \
+' '' ''
-        '' ''
 testing 'awk gensub backslashes \0' \
        'awk '$sq'BEGIN { s="\\0"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
        's=\\0
 a|a
-' \
+' '' ''
-        '' ''
 testing 'awk gensub backslashes \\0' \
        'awk '$sq'BEGIN { s="\\\\0"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
        's=\\\\0
 \\0|\\0
-' \
+' '' ''
+# The "b" in "abc" should not match <b* pattern.
+# Currently we use REG_STARTEND ("This flag is a BSD extension, not present in POSIX")
+# to implement the code to handle this correctly, but if your libc has no REG_STARTEND,
+# the alternative code mishandles this case.
+testing 'awk gsub erroneous word start match' \
+        "awk 'BEGIN { a=\"abc\"; gsub(/\<b*/,\"\",a); print a }'" \
+        'abc\n' \
        '' ''
 exit $FAILCOUNT
author	Denys Vlasenko <vda.linux@googlemail.com>	2023-06-08 10:42:39 +0200
committer	Denys Vlasenko <vda.linux@googlemail.com>	2023-06-08 10:42:39 +0200
commit	2ca39ffd447ca874fcea933194829717d5573247 (patch)
tree	6d9eb4ba80ad9feec70c3f4f25dd3f7629c5fe5a
parent	113685fbcd4c3432ec9b640583d50ba8da2102e8 (diff)
download	busybox-w32-2ca39ffd447ca874fcea933194829717d5573247.tar.gz busybox-w32-2ca39ffd447ca874fcea933194829717d5573247.tar.bz2 busybox-w32-2ca39ffd447ca874fcea933194829717d5573247.zip

diff --git a/editors/awk.c b/editors/awk.c
index df9b7fdc9..171f0a7ea 100644
--- a/editors/awk.c
+++ b/editors/awk.c
@@ -2504,17 +2504,46 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest /*,in
2504	regex_t sreg, *regex;	2504	regex_t sreg, *regex;
2505	/* True only if called to implement gensub(): */	2505	/* True only if called to implement gensub(): */
2506	int subexp = (src != dest);	2506	int subexp = (src != dest);
2507		2507	#if defined(REG_STARTEND)
		2508	const char *src_string;
		2509	size_t src_strlen;
		2510	regexec_flags = REG_STARTEND;
		2511	#else
		2512	regexec_flags = 0;
		2513	#endif
2508	resbuf = NULL;	2514	resbuf = NULL;
2509	residx = 0;	2515	residx = 0;
2510	match_no = 0;	2516	match_no = 0;
2511	regexec_flags = 0;
2512	regex = as_regex(rn, &sreg);	2517	regex = as_regex(rn, &sreg);
2513	sp = getvar_s(src ? src : intvar[F0]);	2518	sp = getvar_s(src ? src : intvar[F0]);
		2519	#if defined(REG_STARTEND)
		2520	src_string = sp;
		2521	src_strlen = strlen(src_string);
		2522	#endif
2514	replen = strlen(repl);	2523	replen = strlen(repl);
2515	while (regexec(regex, sp, 10, pmatch, regexec_flags) == 0) {	2524	for (;;) {
2516	int so = pmatch[0].rm_so;	2525	int so, eo;
2517	int eo = pmatch[0].rm_eo;	2526
		2527	#if defined(REG_STARTEND)
		2528	// REG_STARTEND: "This flag is a BSD extension, not present in POSIX"
		2529	size_t start_ofs = sp - src_string;
		2530	pmatch[0].rm_so = start_ofs;
		2531	pmatch[0].rm_eo = src_strlen;
		2532	if (regexec(regex, src_string, 10, pmatch, regexec_flags) != 0)
		2533	break;
		2534	eo = pmatch[0].rm_eo - start_ofs;
		2535	so = pmatch[0].rm_so - start_ofs;
		2536	#else
		2537	// BUG:
		2538	// gsub(/\<b*/,"") on "abc" matches empty string at "a...",
		2539	// advances sp one char (see "Empty match" comment later) to "bc"
		2540	// ... and erroneously matches "b" even though it is NOT at the word start.
		2541	enum { start_ofs = 0 };
		2542	if (regexec(regex, sp, 10, pmatch, regexec_flags) != 0)
		2543	break;
		2544	so = pmatch[0].rm_so;
		2545	eo = pmatch[0].rm_eo;
		2546	#endif
2518		2547
2519	//bb_error_msg("match %u: [%u,%u] '%s'%p", match_no+1, so, eo, sp,sp);	2548	//bb_error_msg("match %u: [%u,%u] '%s'%p", match_no+1, so, eo, sp,sp);
2520	resbuf = qrealloc(resbuf, residx + eo + replen, &resbufsize);	2549	resbuf = qrealloc(resbuf, residx + eo + replen, &resbufsize);
@@ -2543,7 +2572,7 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest /*,in
2543	}	2572	}
2544	n = pmatch[j].rm_eo - pmatch[j].rm_so;	2573	n = pmatch[j].rm_eo - pmatch[j].rm_so;
2545	resbuf = qrealloc(resbuf, residx + replen + n, &resbufsize);	2574	resbuf = qrealloc(resbuf, residx + replen + n, &resbufsize);
2546	memcpy(resbuf + residx, sp + pmatch[j].rm_so, n);	2575	memcpy(resbuf + residx, sp + pmatch[j].rm_so - start_ofs, n);
2547	residx += n;	2576	residx += n;
2548	} else	2577	} else
2549	resbuf[residx++] = c;	2578	resbuf[residx++] = c;
@@ -2557,12 +2586,6 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest /*,in
2557	if (eo == so) {	2586	if (eo == so) {
2558	/* Empty match (e.g. "b*" will match anywhere).	2587	/* Empty match (e.g. "b*" will match anywhere).
2559	* Advance by one char. */	2588	* Advance by one char. */
2560	//BUG (bug 1333):
2561	//gsub(/\<b*/,"") on "abc" will reach this point, advance to "bc"
2562	//... and will erroneously match "b" even though it is NOT at the word start.
2563	//we need REG_NOTBOW but it does not exist...
2564	//TODO: if EXTRA_COMPAT=y, use GNU matching and re_search,
2565	//it should be able to do it correctly.
2566	/* Subtle: this is safe only because	2589	/* Subtle: this is safe only because
2567	* qrealloc allocated at least one extra byte */	2590	* qrealloc allocated at least one extra byte */
2568	resbuf[residx] = *sp;	2591	resbuf[residx] = *sp;
@@ -2571,7 +2594,7 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest /*,in
2571	sp++;	2594	sp++;
2572	residx++;	2595	residx++;
2573	}	2596	}
2574	regexec_flags = REG_NOTBOL;	2597	regexec_flags |= REG_NOTBOL;
2575	}	2598	}
2576		2599
2577	resbuf = qrealloc(resbuf, residx + strlen(sp), &resbufsize);	2600	resbuf = qrealloc(resbuf, residx + strlen(sp), &resbufsize);


diff --git a/testsuite/awk.tests b/testsuite/awk.tests
index c61d32947..5a792c241 100755
--- a/testsuite/awk.tests
+++ b/testsuite/awk.tests
@@ -557,14 +557,12 @@ testing 'awk gensub backslashes \' \
557	'awk '$sq'BEGIN { s="\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \	557	'awk '$sq'BEGIN { s="\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
558	's=\\	558	's=\\
559	\\|\\	559	\\|\\
560	' \	560	' '' ''
561	'' ''
562	testing 'awk gensub backslashes \\' \	561	testing 'awk gensub backslashes \\' \
563	'awk '$sq'BEGIN { s="\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \	562	'awk '$sq'BEGIN { s="\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
564	's=\\\\	563	's=\\\\
565	\\|\\	564	\\|\\
566	' \	565	' '' ''
567	'' ''
568	# gawk 5.1.1 handles trailing unpaired \ inconsistently.	566	# gawk 5.1.1 handles trailing unpaired \ inconsistently.
569	# If replace string is single \, it is used verbatim,	567	# If replace string is single \, it is used verbatim,
570	# but if it is \\\ (three slashes), gawk uses "\<NUL>" (!!!), not "\\" as you would expect.	568	# but if it is \\\ (three slashes), gawk uses "\<NUL>" (!!!), not "\\" as you would expect.
@@ -572,31 +570,35 @@ testing 'awk gensub backslashes \\\' \
572	'awk '$sq'BEGIN { s="\\\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \	570	'awk '$sq'BEGIN { s="\\\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
573	's=\\\\\\	571	's=\\\\\\
574	\\\\|\\\\	572	\\\\|\\\\
575	' \	573	' '' ''
576	'' ''
577	testing 'awk gensub backslashes \\\\' \	574	testing 'awk gensub backslashes \\\\' \
578	'awk '$sq'BEGIN { s="\\\\\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \	575	'awk '$sq'BEGIN { s="\\\\\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
579	's=\\\\\\\\	576	's=\\\\\\\\
580	\\\\|\\\\	577	\\\\|\\\\
581	' \	578	' '' ''
582	'' ''
583	testing 'awk gensub backslashes \&' \	579	testing 'awk gensub backslashes \&' \
584	'awk '$sq'BEGIN { s="\\&"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \	580	'awk '$sq'BEGIN { s="\\&"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
585	's=\\&	581	's=\\&
586	&|&	582	&|&
587	' \	583	' '' ''
588	'' ''
589	testing 'awk gensub backslashes \0' \	584	testing 'awk gensub backslashes \0' \
590	'awk '$sq'BEGIN { s="\\0"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \	585	'awk '$sq'BEGIN { s="\\0"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
591	's=\\0	586	's=\\0
592	a|a	587	a|a
593	' \	588	' '' ''
594	'' ''
595	testing 'awk gensub backslashes \\0' \	589	testing 'awk gensub backslashes \\0' \
596	'awk '$sq'BEGIN { s="\\\\0"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \	590	'awk '$sq'BEGIN { s="\\\\0"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
597	's=\\\\0	591	's=\\\\0
598	\\0|\\0	592	\\0|\\0
599	' \	593	' '' ''
		594
		595	# The "b" in "abc" should not match <b* pattern.
		596	# Currently we use REG_STARTEND ("This flag is a BSD extension, not present in POSIX")
		597	# to implement the code to handle this correctly, but if your libc has no REG_STARTEND,
		598	# the alternative code mishandles this case.
		599	testing 'awk gsub erroneous word start match' \
		600	"awk 'BEGIN { a=\"abc\"; gsub(/\<b*/,\"\",a); print a }'" \
		601	'abc\n' \
600	'' ''	602	'' ''
601		603
602	exit $FAILCOUNT	604	exit $FAILCOUNT