diff options
author | Denys Vlasenko <vda.linux@googlemail.com> | 2020-12-02 19:07:31 +0100 |
---|---|---|
committer | Denys Vlasenko <vda.linux@googlemail.com> | 2020-12-02 19:07:31 +0100 |
commit | 665a65953076ea21be49250b8279ddb1f0f99f38 (patch) | |
tree | bfb46738da9fec6715843197b5987ad56d4fcf76 | |
parent | 50ead33c45919abffde35313daac4c2dfd8641ca (diff) | |
download | busybox-w32-665a65953076ea21be49250b8279ddb1f0f99f38.tar.gz busybox-w32-665a65953076ea21be49250b8279ddb1f0f99f38.tar.bz2 busybox-w32-665a65953076ea21be49250b8279ddb1f0f99f38.zip |
awk: FS regex matches only non-empty separators (gawk compat)
function old new delta
awk_split 484 553 +69
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r-- | editors/awk.c | 33 | ||||
-rwxr-xr-x | testsuite/awk.tests | 7 |
2 files changed, 32 insertions, 8 deletions
diff --git a/editors/awk.c b/editors/awk.c index d56d6330d..2c15f9e4e 100644 --- a/editors/awk.c +++ b/editors/awk.c | |||
@@ -1763,6 +1763,29 @@ static void fsrealloc(int size) | |||
1763 | nfields = size; | 1763 | nfields = size; |
1764 | } | 1764 | } |
1765 | 1765 | ||
1766 | static int regexec1_nonempty(const regex_t *preg, const char *s, regmatch_t pmatch[]) | ||
1767 | { | ||
1768 | int r = regexec(preg, s, 1, pmatch, 0); | ||
1769 | if (r == 0 && pmatch[0].rm_eo == 0) { | ||
1770 | /* For example, happens when FS can match | ||
1771 | * an empty string (awk -F ' *'). Logically, | ||
1772 | * this should split into one-char fields. | ||
1773 | * However, gawk 5.0.1 searches for first | ||
1774 | * _non-empty_ separator string match: | ||
1775 | */ | ||
1776 | size_t ofs = 0; | ||
1777 | do { | ||
1778 | ofs++; | ||
1779 | if (!s[ofs]) | ||
1780 | return REG_NOMATCH; | ||
1781 | regexec(preg, s + ofs, 1, pmatch, 0); | ||
1782 | } while (pmatch[0].rm_eo == 0); | ||
1783 | pmatch[0].rm_so += ofs; | ||
1784 | pmatch[0].rm_eo += ofs; | ||
1785 | } | ||
1786 | return r; | ||
1787 | } | ||
1788 | |||
1766 | static int awk_split(const char *s, node *spl, char **slist) | 1789 | static int awk_split(const char *s, node *spl, char **slist) |
1767 | { | 1790 | { |
1768 | int n; | 1791 | int n; |
@@ -1788,17 +1811,11 @@ static int awk_split(const char *s, node *spl, char **slist) | |||
1788 | regmatch_t pmatch[2]; // TODO: why [2]? [1] is enough... | 1811 | regmatch_t pmatch[2]; // TODO: why [2]? [1] is enough... |
1789 | 1812 | ||
1790 | l = strcspn(s, c+2); /* len till next NUL or \n */ | 1813 | l = strcspn(s, c+2); /* len till next NUL or \n */ |
1791 | if (regexec(icase ? spl->r.ire : spl->l.re, s, 1, pmatch, 0) == 0 | 1814 | if (regexec1_nonempty(icase ? spl->r.ire : spl->l.re, s, pmatch) == 0 |
1792 | && pmatch[0].rm_so <= l | 1815 | && pmatch[0].rm_so <= l |
1793 | ) { | 1816 | ) { |
1817 | /* if (pmatch[0].rm_eo == 0) ... - impossible */ | ||
1794 | l = pmatch[0].rm_so; | 1818 | l = pmatch[0].rm_so; |
1795 | if (pmatch[0].rm_eo == 0) { | ||
1796 | /* For example, happens when FS can match | ||
1797 | * an empthy string (awk -F ' *') | ||
1798 | */ | ||
1799 | l++; | ||
1800 | pmatch[0].rm_eo++; | ||
1801 | } | ||
1802 | n++; /* we saw yet another delimiter */ | 1819 | n++; /* we saw yet another delimiter */ |
1803 | } else { | 1820 | } else { |
1804 | pmatch[0].rm_eo = l; | 1821 | pmatch[0].rm_eo = l; |
diff --git a/testsuite/awk.tests b/testsuite/awk.tests index 87f6b5007..06a531d96 100755 --- a/testsuite/awk.tests +++ b/testsuite/awk.tests | |||
@@ -398,5 +398,12 @@ testing 'awk do not allow "str"++' \ | |||
398 | '' \ | 398 | '' \ |
399 | 'anything' | 399 | 'anything' |
400 | 400 | ||
401 | #gawk compat: FS regex matches only non-empty separators: | ||
402 | #with -*, the splitting is NOT f o o b a r, but foo bar: | ||
403 | testing 'awk FS regex which can match empty string' \ | ||
404 | "awk -F '-*' '{print \$1 \"-\" \$2 \"=\" \$3 \"*\" \$4}'" \ | ||
405 | "foo-bar=*\n" \ | ||
406 | '' \ | ||
407 | 'foo--bar' | ||
401 | 408 | ||
402 | exit $FAILCOUNT | 409 | exit $FAILCOUNT |