aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAvi Halachmi (:avih) <avihpit@yahoo.com>2026-04-14 11:05:53 +0300
committerRon Yorston <rmy@pobox.com>2026-04-16 10:42:13 +0100
commit85f8fb5f2841e732e01cb32340822cc124f63fb7 (patch)
tree4b3f2db40f19b9592f18f01c1114d308ead22fe1
parentfeca895aa0e76d1ec89fa11ce673746aefc29d64 (diff)
downloadbusybox-w32-85f8fb5f2841e732e01cb32340822cc124f63fb7.tar.gz
busybox-w32-85f8fb5f2841e732e01cb32340822cc124f63fb7.tar.bz2
busybox-w32-85f8fb5f2841e732e01cb32340822cc124f63fb7.zip
win32: fnmatch2.c: optimize last '*'
This adds two fairly cheap and effective optimizations at the tail of the pattern (after the last '*'). Speeds up many cases by ~ x2+. Details at the comment inside. Adds 192 bytes in x64.
-rw-r--r--win32/fnmatch2.c88
1 files changed, 87 insertions, 1 deletions
diff --git a/win32/fnmatch2.c b/win32/fnmatch2.c
index 2f0e1d3af..fbf5df81a 100644
--- a/win32/fnmatch2.c
+++ b/win32/fnmatch2.c
@@ -45,6 +45,56 @@
45 * https://research.swtch.com/glob 45 * https://research.swtch.com/glob
46 * 46 *
47 * 47 *
48 * Optimizations (notation: LPAT is len(pat), LSTR is len(str)):
49 *
50 * We have two relatively simple optimizations, and neither changes the worst
51 * case complexity, but in practice it can speed up some patterns considerably.
52 * Both optimizations are tested at most once after each '*' at the pattern.
53 *
54 * First optimization is trivial: if the pattern ends in '*', there's no need
55 * to test the rest of str. This has resonable return, and very cheap to test.
56 * A common scenario for this optimization is the trivial pattern "*", where
57 * we don't look at str at all. But it also helps with foo* or *foo*bar*, etc.
58 * Biggest save: O(LSTR) (advancing to the end of str in steps of O(1)).
59 *
60 * Second optimization is a bit more involved: if we've reached the last '*'
61 * (which we need to test by looking ahead in pattern), and know that the
62 * rest of pattern expects exactly N chars, then advance str to last N chars.
63 * Biggest save: O(LPAT*LSTR), e.g. pat *aaa and str aaaaaaaaaaaaaXYZ would
64 * have matched the whole pattern in every char of str (sans the end), but kept
65 * failing because str didn't end yet. Instead, it compares aaa only to XYZ.
66 *
67 * This lookahead can also end up as work-for-nothing. For instance with *foo*
68 * we lookahead after the 1st '*', realize that it was not the last '*', and
69 * then we end up at the 1st optimization - without value from the lookahead.
70 *
71 * However, it's still quite cheap because at most we lookahead the pattern
72 * once, i.e. worst case cost of O(LPAT), which is typically negligible.
73 * When it's not negligible (very short strings - not enough combos to save)
74 * it still typically ends up only barely worse than without it - not too bad.
75 *
76 * And the return on this optimization is very high. Most patterns with
77 * anything after the last '*' end up x2-x3... faster. So quite worth it.
78 * For instance, *foo is tested in O(LPAT+LSTR), which is very fast.
79 *
80 * Note aboute the conditions for the optimizations tests:
81 * While we could apply these optimizations with any combination of flags,
82 * we limit them only to cases where we don't need to do additional work due
83 * to the flags. For instance with FNM_PATHNAME, if the pattern ends in '*',
84 * we still need to test that there are no '/' in the rest of str. It's easy
85 * but we don't do that. Instead, we simply don't optimize with FNM_PATHNAME.
86 * Similarly with the lookahead, we disable the optimization with flags which
87 * would require extra work at the lookahead, to keep it simple - and fast.
88 * For reference, when used for shell pattern matching - no flag is set.
89 *
90 * Other optimizations: there are more cases which could complete faster,
91 * e.g. *x* could be tested as strchr(s,'x'), *foo* as strstr(s,"foo"), etc,
92 * but each case requires to identify it and maybe some setup - which won't
93 * always pay off, and each case adds code complexity and risk. Also, unlike
94 * regcomp, we don't have the luxury of spending much time on pre-processing
95 * the pattern - which will pay off when reused by regexec. So keep it simple.
96 *
97 *
48 * Flags: 98 * Flags:
49 * 99 *
50 * FNM_NOESCAPE Backslash doesn't quote the next char. 100 * FNM_NOESCAPE Backslash doesn't quote the next char.
@@ -65,6 +115,7 @@
65#if 1 115#if 1
66 116
67#include <ctype.h> /* to{lower,upper} */ 117#include <ctype.h> /* to{lower,upper} */
118#include <stdio.h> /* size_t */
68 119
69#include "actype.h" 120#include "actype.h"
70#include "fnmatch.h" 121#include "fnmatch.h"
@@ -98,7 +149,7 @@ static int bracket_matched(const char **pat, char testchar, const int flags)
98 const uchar *p = (const uchar*)(*pat) + 1; 149 const uchar *p = (const uchar*)(*pat) + 1;
99 int c = (uchar)testchar, cfold = c; 150 int c = (uchar)testchar, cfold = c;
100 int inv = 0, len; 151 int inv = 0, len;
101 int found = 0; 152 int found = !c; /* only when testing last '*': testchar==0 */
102 actype_t t; 153 actype_t t;
103 154
104 if (FLAG(CASEFOLD)) { 155 if (FLAG(CASEFOLD)) {
@@ -188,8 +239,20 @@ int fnmatch(const char *pat, const char *str, int flags)
188 if (*pat == '*') { /* skip, setup retry (max once per '*') */ 239 if (*pat == '*') { /* skip, setup retry (max once per '*') */
189 while (*++pat == '*'); /* all consecutive '*' */ 240 while (*++pat == '*'); /* all consecutive '*' */
190 241
242 /* optimization - pat ends in '*': O(str) -> O(1) */
243 if (!*pat && !FLAG(PATHNAME))
244 return 0; /* success */
245
191 p_star = pat - 1; 246 p_star = pat - 1;
192 s_head = str; /* on failure -> p_star eats s_head */ 247 s_head = str; /* on failure -> p_star eats s_head */
248
249 /* Optimization - last '*': O(pat*str) -> O(str)
250 * See optimization comment above. the "goto" is only
251 * for readability - avoid adding big code block here.
252 */
253 if (!(flags & (FNM_PATHNAME|FNM_NOESCAPE|FNM_LEADING_DIR)))
254 goto tailopt; /* continues at tailcont */
255 tailcont:;
193 } 256 }
194 257
195 if (!*pat) { 258 if (!*pat) {
@@ -221,6 +284,29 @@ int fnmatch(const char *pat, const char *str, int flags)
221 pat = p_star; 284 pat = p_star;
222 str = s_head++; /* pat/str increased by the loop */ 285 str = s_head++; /* pat/str increased by the loop */
223 } 286 }
287
288/* unreachable - except with goto */
289tailopt:
290 {
291 /* test whether there are more '*', and if not, advance str */
292 size_t n = 0; /* exact number of chars expected by pat */
293 const char *p = pat;
294 for (; *p; ++p, ++n) {
295 if (*p == '*') /* not last '*' */
296 goto tailcont;
297 if (!str[n]) /* str is too short */
298 return FNM_NOMATCH;
299 /* *p can be [ or ? or [escaped] literal */
300 if (*p == '[')
301 (void)bracket_matched(&p, 0, 0);
302 else if (*p == '\\' && !*++p)
303 return FNM_NOMATCH; /* trailing '\' */
304 }
305 /* no more '*' - advance str to last n chars, no retries */
306 for (; str[n]; ++str); /* same as: str += strlen(str+n) */
307 s_head = 0;
308 goto tailcont;
309 }
224} 310}
225 311
226#endif /* whole file */ 312#endif /* whole file */