From 5f84c5633663f6ee8c9cc3a4608b86d4b56b39d6 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 3 Jun 2023 00:39:33 +0200 Subject: awk: fix backslash handling in sub() builtins function old new delta awk_sub 559 544 -15 Signed-off-by: Denys Vlasenko --- editors/awk.c | 41 +++++++++++++++++++---------------------- testsuite/awk.tests | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 22 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 0f062dcdb..f77573806 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -2492,7 +2492,7 @@ static char *awk_printf(node *n, size_t *len) * store result into (dest), return number of substitutions. * If nm = 0, replace all matches. * If src or dst is NULL, use $0. - * If subexp != 0, enable subexpression matching (\1-\9). + * If subexp != 0, enable subexpression matching (\0-\9). */ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest, int subexp) { @@ -2520,35 +2520,32 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest, int residx += eo; if (++match_no >= nm) { const char *s; - int nbs; + int bslash; /* replace */ residx -= (eo - so); - nbs = 0; + bslash = 0; for (s = repl; *s; s++) { - char c = resbuf[residx++] = *s; - if (c == '\\') { - nbs++; - continue; + char c = *s; + if (c == '\\' && s[1]) { + bslash ^= 1; + if (bslash) + continue; } - if (c == '&' || (subexp && c >= '0' && c <= '9')) { - int j; - residx -= ((nbs + 3) >> 1); - j = 0; + if ((!bslash && c == '&') + || (subexp && bslash && c >= '0' && c <= '9') + ) { + int n, j = 0; if (c != '&') { j = c - '0'; - nbs++; } - if (nbs % 2) { - resbuf[residx++] = c; - } else { - int n = pmatch[j].rm_eo - pmatch[j].rm_so; - resbuf = qrealloc(resbuf, residx + replen + n, &resbufsize); - memcpy(resbuf + residx, sp + pmatch[j].rm_so, n); - residx += n; - } - } - nbs = 0; + n = pmatch[j].rm_eo - pmatch[j].rm_so; + resbuf = qrealloc(resbuf, residx + replen + n, &resbufsize); + memcpy(resbuf + residx, sp + pmatch[j].rm_so, n); + residx += n; + } else + resbuf[residx++] = c; + bslash = 0; } } diff --git a/testsuite/awk.tests b/testsuite/awk.tests index cdab93d21..c61d32947 100755 --- a/testsuite/awk.tests +++ b/testsuite/awk.tests @@ -552,4 +552,51 @@ testing "awk = has higher precedence than == (despite what gawk manpage claims)" '0\n1\n2\n1\n3\n' \ '' '' +sq="'" +testing 'awk gensub backslashes \' \ + 'awk '$sq'BEGIN { s="\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ + 's=\\ +\\|\\ +' \ + '' '' +testing 'awk gensub backslashes \\' \ + 'awk '$sq'BEGIN { s="\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ + 's=\\\\ +\\|\\ +' \ + '' '' +# gawk 5.1.1 handles trailing unpaired \ inconsistently. +# If replace string is single \, it is used verbatim, +# but if it is \\\ (three slashes), gawk uses "\" (!!!), not "\\" as you would expect. +testing 'awk gensub backslashes \\\' \ + 'awk '$sq'BEGIN { s="\\\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ + 's=\\\\\\ +\\\\|\\\\ +' \ + '' '' +testing 'awk gensub backslashes \\\\' \ + 'awk '$sq'BEGIN { s="\\\\\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ + 's=\\\\\\\\ +\\\\|\\\\ +' \ + '' '' +testing 'awk gensub backslashes \&' \ + 'awk '$sq'BEGIN { s="\\&"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ + 's=\\& +&|& +' \ + '' '' +testing 'awk gensub backslashes \0' \ + 'awk '$sq'BEGIN { s="\\0"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ + 's=\\0 +a|a +' \ + '' '' +testing 'awk gensub backslashes \\0' \ + 'awk '$sq'BEGIN { s="\\\\0"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ + 's=\\\\0 +\\0|\\0 +' \ + '' '' + exit $FAILCOUNT -- cgit v1.2.3-55-g6feb