aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDenys Vlasenko <vda.linux@googlemail.com>2020-12-02 19:07:31 +0100
committerDenys Vlasenko <vda.linux@googlemail.com>2020-12-02 19:07:31 +0100
commit665a65953076ea21be49250b8279ddb1f0f99f38 (patch)
treebfb46738da9fec6715843197b5987ad56d4fcf76
parent50ead33c45919abffde35313daac4c2dfd8641ca (diff)
downloadbusybox-w32-665a65953076ea21be49250b8279ddb1f0f99f38.tar.gz
busybox-w32-665a65953076ea21be49250b8279ddb1f0f99f38.tar.bz2
busybox-w32-665a65953076ea21be49250b8279ddb1f0f99f38.zip
awk: FS regex matches only non-empty separators (gawk compat)
function                                             old     new   delta
awk_split                                            484     553     +69

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r--editors/awk.c33
-rwxr-xr-xtestsuite/awk.tests7
2 files changed, 32 insertions, 8 deletions
diff --git a/editors/awk.c b/editors/awk.c
index d56d6330d..2c15f9e4e 100644
--- a/editors/awk.c
+++ b/editors/awk.c
@@ -1763,6 +1763,29 @@ static void fsrealloc(int size)
1763 nfields = size; 1763 nfields = size;
1764} 1764}
1765 1765
/*
 * Wrapper around regexec(preg, s, 1, pmatch, 0) which does not accept
 * an empty match at the very start of s.
 *
 * If the initial match ends at offset 0, s is re-scanned starting from
 * each following offset until regexec() succeeds with a non-zero end
 * offset; pmatch[0] is then converted to offsets relative to s.
 *
 * Returns 0 with pmatch[0] filled in on success, REG_NOMATCH if the end
 * of s is reached without finding such a match, or the non-zero code
 * returned by the initial regexec() call.
 */
static int regexec1_nonempty(const regex_t *preg, const char *s, regmatch_t pmatch[])
{
	size_t ofs;
	int r = regexec(preg, s, 1, pmatch, 0);

	if (r != 0 || pmatch[0].rm_eo != 0)
		return r;

	/* For example, happens when FS can match
	 * an empty string (awk -F ' *'). Logically,
	 * this should split into one-char fields.
	 * However, gawk 5.0.1 searches for first
	 * _non-empty_ separator string match:
	 */

	/* Empty input: there is nothing after s[0] to scan, and s[1]
	 * must not be read.
	 */
	if (s[0] == '\0')
		return REG_NOMATCH;

	/* NOTE(review): the re-scan passes eflags 0 (no REG_NOTBOL), so '^'
	 * can match at s + ofs; confirm this is the intended behavior.
	 */
	ofs = 0;
	for (;;) {
		ofs++;
		if (s[ofs] == '\0')
			return REG_NOMATCH;
		/* Only look at pmatch[0] when this call actually matched;
		 * a failed call is treated like an empty match (keep going).
		 */
		if (regexec(preg, s + ofs, 1, pmatch, 0) == 0
		 && pmatch[0].rm_eo != 0
		) {
			break;
		}
	}
	pmatch[0].rm_so += ofs;
	pmatch[0].rm_eo += ofs;
	return 0;
}
1788
1766static int awk_split(const char *s, node *spl, char **slist) 1789static int awk_split(const char *s, node *spl, char **slist)
1767{ 1790{
1768 int n; 1791 int n;
@@ -1788,17 +1811,11 @@ static int awk_split(const char *s, node *spl, char **slist)
1788 regmatch_t pmatch[2]; // TODO: why [2]? [1] is enough... 1811 regmatch_t pmatch[2]; // TODO: why [2]? [1] is enough...
1789 1812
1790 l = strcspn(s, c+2); /* len till next NUL or \n */ 1813 l = strcspn(s, c+2); /* len till next NUL or \n */
1791 if (regexec(icase ? spl->r.ire : spl->l.re, s, 1, pmatch, 0) == 0 1814 if (regexec1_nonempty(icase ? spl->r.ire : spl->l.re, s, pmatch) == 0
1792 && pmatch[0].rm_so <= l 1815 && pmatch[0].rm_so <= l
1793 ) { 1816 ) {
1817 /* if (pmatch[0].rm_eo == 0) ... - impossible */
1794 l = pmatch[0].rm_so; 1818 l = pmatch[0].rm_so;
1795 if (pmatch[0].rm_eo == 0) {
1796 /* For example, happens when FS can match
1797 * an empthy string (awk -F ' *')
1798 */
1799 l++;
1800 pmatch[0].rm_eo++;
1801 }
1802 n++; /* we saw yet another delimiter */ 1819 n++; /* we saw yet another delimiter */
1803 } else { 1820 } else {
1804 pmatch[0].rm_eo = l; 1821 pmatch[0].rm_eo = l;
diff --git a/testsuite/awk.tests b/testsuite/awk.tests
index 87f6b5007..06a531d96 100755
--- a/testsuite/awk.tests
+++ b/testsuite/awk.tests
@@ -398,5 +398,12 @@ testing 'awk do not allow "str"++' \
398 '' \ 398 '' \
399 'anything' 399 'anything'
400 400
401#gawk compat: FS regex matches only non-empty separators:
402#with -*, the splitting is NOT f o o b a r, but foo bar:
403testing 'awk FS regex which can match empty string' \
404 "awk -F '-*' '{print \$1 \"-\" \$2 \"=\" \$3 \"*\" \$4}'" \
405 "foo-bar=*\n" \
406 '' \
407 'foo--bar'
401 408
402exit $FAILCOUNT 409exit $FAILCOUNT