summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Denis Vlasenko <vda.linux@googlemail.com> 2008-08-09 16:15:14 +0000
committer Denis Vlasenko <vda.linux@googlemail.com> 2008-08-09 16:15:14 +0000
commit 3fd15e197e21aa313ce56126ee814f0ebc884dee (patch)
tree 38ac32cdea89bff09017eda0a1836e60f2c06749
parent fb5902ca5cf802557eb1e3c56502a2f5e27242f4 (diff)
download busybox-w32-3fd15e197e21aa313ce56126ee814f0ebc884dee.tar.gz
busybox-w32-3fd15e197e21aa313ce56126ee814f0ebc884dee.tar.bz2
busybox-w32-3fd15e197e21aa313ce56126ee814f0ebc884dee.zip
grep: option to use GNU regex matching instead of POSIX one.
This fixes problems with NULs in files being scanned, but costs +800 bytes. The same can be done to sed (TODO).
-rw-r--r-- Config.in 9
-rw-r--r-- findutils/grep.c 129
-rw-r--r-- libbb/get_line_from_file.c 43
-rw-r--r-- libbb/xregcomp.c 2
-rwxr-xr-x testsuite/grep.tests 8
5 files changed, 155 insertions, 36 deletions
diff --git a/Config.in b/Config.in
index 5ad35ce20..c2005c78a 100644
--- a/Config.in
+++ b/Config.in
@@ -21,6 +21,15 @@ config DESKTOP
21 Select this only if you plan to use busybox on full-blown 21 Select this only if you plan to use busybox on full-blown
22 desktop machine with common Linux distro, not on an embedded box. 22 desktop machine with common Linux distro, not on an embedded box.
23 23
24config EXTRA_COMPAT
25 bool "Provide compatible behavior for rare corner cases (bigger code)"
26 default n
27 help
28 This option makes grep, sed etc handle rare corner cases
29 (embedded NUL bytes and such). This makes code bigger and uses
30 some GNU extensions in libc. You probably only need this option
31 if you plan to run busybox on desktop.
32
24config FEATURE_ASSUME_UNICODE 33config FEATURE_ASSUME_UNICODE
25 bool "Assume that 1:1 char/glyph correspondence is not true" 34 bool "Assume that 1:1 char/glyph correspondence is not true"
26 default n 35 default n
diff --git a/findutils/grep.c b/findutils/grep.c
index 030e62461..f2ed01e74 100644
--- a/findutils/grep.c
+++ b/findutils/grep.c
@@ -96,6 +96,7 @@ struct globals {
96 int lines_before; 96 int lines_before;
97 int lines_after; 97 int lines_after;
98 char **before_buf; 98 char **before_buf;
99 USE_EXTRA_COMPAT(size_t *before_buf_size;)
99 int last_line_printed; 100 int last_line_printed;
100#endif 101#endif
101 /* globals used internally */ 102 /* globals used internally */
@@ -117,6 +118,7 @@ struct globals {
117#define lines_before (G.lines_before ) 118#define lines_before (G.lines_before )
118#define lines_after (G.lines_after ) 119#define lines_after (G.lines_after )
119#define before_buf (G.before_buf ) 120#define before_buf (G.before_buf )
121#define before_buf_size (G.before_buf_size )
120#define last_line_printed (G.last_line_printed ) 122#define last_line_printed (G.last_line_printed )
121#define pattern_head (G.pattern_head ) 123#define pattern_head (G.pattern_head )
122#define cur_file (G.cur_file ) 124#define cur_file (G.cur_file )
@@ -124,14 +126,24 @@ struct globals {
124 126
125typedef struct grep_list_data_t { 127typedef struct grep_list_data_t {
126 char *pattern; 128 char *pattern;
127 regex_t preg; 129/* for GNU regex, matched_range must be persistent across grep_file() calls */
130#if !ENABLE_EXTRA_COMPAT
131 regex_t compiled_regex;
132 regmatch_t matched_range;
133#else
134 struct re_pattern_buffer compiled_regex;
135 struct re_registers matched_range;
136#endif
128#define ALLOCATED 1 137#define ALLOCATED 1
129#define COMPILED 2 138#define COMPILED 2
130 int flg_mem_alocated_compiled; 139 int flg_mem_alocated_compiled;
131} grep_list_data_t; 140} grep_list_data_t;
132 141
133 142#if !ENABLE_EXTRA_COMPAT
134static void print_line(const char *line, int linenum, char decoration) 143#define print_line(line, line_len, linenum, decoration) \
144 print_line(line, linenum, decoration)
145#endif
146static void print_line(const char *line, size_t line_len, int linenum, char decoration)
135{ 147{
136#if ENABLE_FEATURE_GREP_CONTEXT 148#if ENABLE_FEATURE_GREP_CONTEXT
137 /* Happens when we go to next file, immediately hit match 149 /* Happens when we go to next file, immediately hit match
@@ -139,8 +151,9 @@ static void print_line(const char *line, int linenum, char decoration)
139 if (linenum < 1) 151 if (linenum < 1)
140 return; 152 return;
141 /* possibly print the little '--' separator */ 153 /* possibly print the little '--' separator */
142 if ((lines_before || lines_after) && did_print_line && 154 if ((lines_before || lines_after) && did_print_line
143 last_line_printed != linenum - 1) { 155 && last_line_printed != linenum - 1
156 ) {
144 puts("--"); 157 puts("--");
145 } 158 }
146 /* guard against printing "--" before first line of first file */ 159 /* guard against printing "--" before first line of first file */
@@ -152,17 +165,50 @@ static void print_line(const char *line, int linenum, char decoration)
152 if (PRINT_LINE_NUM) 165 if (PRINT_LINE_NUM)
153 printf("%i%c", linenum, decoration); 166 printf("%i%c", linenum, decoration);
154 /* Emulate weird GNU grep behavior with -ov */ 167 /* Emulate weird GNU grep behavior with -ov */
155 if ((option_mask32 & (OPT_v|OPT_o)) != (OPT_v|OPT_o)) 168 if ((option_mask32 & (OPT_v|OPT_o)) != (OPT_v|OPT_o)) {
169#if !ENABLE_EXTRA_COMPAT
156 puts(line); 170 puts(line);
171#else
172 fwrite(line, 1, line_len, stdout);
173 putchar('\n');
174#endif
175 }
157} 176}
158 177
159static int grep_file(FILE *file) 178#if ENABLE_EXTRA_COMPAT
179/* Unlike getline, this one removes trailing '\n' */
180static ssize_t FAST_FUNC bb_getline(char **line_ptr, size_t *line_alloc_len, FILE *file)
160{ 181{
182 ssize_t res_sz;
161 char *line; 183 char *line;
184
185 res_sz = getline(line_ptr, line_alloc_len, file);
186 line = *line_ptr;
187
188 if (res_sz > 0) {
189 if (line[res_sz - 1] == '\n')
190 line[--res_sz] = '\0';
191 } else {
192 free(line); /* uclibc allocates a buffer even on EOF. WTF? */
193 }
194 return res_sz;
195}
196#endif
197
198static int grep_file(FILE *file)
199{
162 smalluint found; 200 smalluint found;
163 int linenum = 0; 201 int linenum = 0;
164 int nmatches = 0; 202 int nmatches = 0;
165 regmatch_t regmatch; 203#if !ENABLE_EXTRA_COMPAT
204 char *line;
205#else
206 char *line = NULL;
207 ssize_t line_len;
208 size_t line_alloc_len;
209#define rm_so start[0]
210#define rm_eo end[0]
211#endif
166#if ENABLE_FEATURE_GREP_CONTEXT 212#if ENABLE_FEATURE_GREP_CONTEXT
167 int print_n_lines_after = 0; 213 int print_n_lines_after = 0;
168 int curpos = 0; /* track where we are in the circular 'before' buffer */ 214 int curpos = 0; /* track where we are in the circular 'before' buffer */
@@ -171,7 +217,13 @@ static int grep_file(FILE *file)
171 enum { print_n_lines_after = 0 }; 217 enum { print_n_lines_after = 0 };
172#endif /* ENABLE_FEATURE_GREP_CONTEXT */ 218#endif /* ENABLE_FEATURE_GREP_CONTEXT */
173 219
174 while ((line = xmalloc_fgetline(file)) != NULL) { 220 while (
221#if !ENABLE_EXTRA_COMPAT
222 (line = xmalloc_fgetline(file)) != NULL
223#else
224 (line_len = bb_getline(&line, &line_alloc_len, file)) >= 0
225#endif
226 ) {
175 llist_t *pattern_ptr = pattern_head; 227 llist_t *pattern_ptr = pattern_head;
176 grep_list_data_t *gl = gl; /* for gcc */ 228 grep_list_data_t *gl = gl; /* for gcc */
177 229
@@ -184,19 +236,35 @@ static int grep_file(FILE *file)
184 } else { 236 } else {
185 if (!(gl->flg_mem_alocated_compiled & COMPILED)) { 237 if (!(gl->flg_mem_alocated_compiled & COMPILED)) {
186 gl->flg_mem_alocated_compiled |= COMPILED; 238 gl->flg_mem_alocated_compiled |= COMPILED;
187 xregcomp(&(gl->preg), gl->pattern, reflags); 239#if !ENABLE_EXTRA_COMPAT
240 xregcomp(&gl->compiled_regex, gl->pattern, reflags);
241#else
242 memset(&gl->compiled_regex, 0, sizeof(gl->compiled_regex));
243 if (re_compile_pattern(gl->pattern, strlen(gl->pattern), &gl->compiled_regex))
244 bb_error_msg_and_die("bad regex '%s'", gl->pattern);
245#endif
188 } 246 }
189 regmatch.rm_so = 0; 247#if !ENABLE_EXTRA_COMPAT
190 regmatch.rm_eo = 0; 248 gl->matched_range.rm_so = 0;
191 if (regexec(&(gl->preg), line, 1, &regmatch, 0) == 0) { 249 gl->matched_range.rm_eo = 0;
250#endif
251 if (
252#if !ENABLE_EXTRA_COMPAT
253 regexec(&gl->compiled_regex, line, 1, &gl->matched_range, 0) == 0
254#else
255 re_search(&gl->compiled_regex, line, line_len,
256 /*start:*/ 0, /*range:*/ line_len,
257 &gl->matched_range) >= 0
258#endif
259 ) {
192 if (!(option_mask32 & OPT_w)) 260 if (!(option_mask32 & OPT_w))
193 found = 1; 261 found = 1;
194 else { 262 else {
195 char c = ' '; 263 char c = ' ';
196 if (regmatch.rm_so) 264 if (gl->matched_range.rm_so)
197 c = line[regmatch.rm_so - 1]; 265 c = line[gl->matched_range.rm_so - 1];
198 if (!isalnum(c) && c != '_') { 266 if (!isalnum(c) && c != '_') {
199 c = line[regmatch.rm_eo]; 267 c = line[gl->matched_range.rm_eo];
200 if (!c || (!isalnum(c) && c != '_')) 268 if (!c || (!isalnum(c) && c != '_'))
201 found = 1; 269 found = 1;
202 } 270 }
@@ -261,7 +329,7 @@ static int grep_file(FILE *file)
261 329
262 /* now print each line in the buffer, clearing them as we go */ 330 /* now print each line in the buffer, clearing them as we go */
263 while (before_buf[idx] != NULL) { 331 while (before_buf[idx] != NULL) {
264 print_line(before_buf[idx], first_buf_entry_line_num, '-'); 332 print_line(before_buf[idx], before_buf_size[idx], first_buf_entry_line_num, '-');
265 free(before_buf[idx]); 333 free(before_buf[idx]);
266 before_buf[idx] = NULL; 334 before_buf[idx] = NULL;
267 idx = (idx + 1) % lines_before; 335 idx = (idx + 1) % lines_before;
@@ -277,13 +345,15 @@ static int grep_file(FILE *file)
277 /* -Fo just prints the pattern 345 /* -Fo just prints the pattern
278 * (unless -v: -Fov doesnt print anything at all) */ 346 * (unless -v: -Fov doesnt print anything at all) */
279 if (found) 347 if (found)
280 print_line(gl->pattern, linenum, ':'); 348 print_line(gl->pattern, strlen(gl->pattern), linenum, ':');
281 } else { 349 } else {
282 line[regmatch.rm_eo] = '\0'; 350 line[gl->matched_range.rm_eo] = '\0';
283 print_line(line + regmatch.rm_so, linenum, ':'); 351 print_line(line + gl->matched_range.rm_so,
352 gl->matched_range.rm_eo - gl->matched_range.rm_so,
353 linenum, ':');
284 } 354 }
285 } else { 355 } else {
286 print_line(line, linenum, ':'); 356 print_line(line, line_len, linenum, ':');
287 } 357 }
288 } 358 }
289 } 359 }
@@ -291,12 +361,13 @@ static int grep_file(FILE *file)
291 else { /* no match */ 361 else { /* no match */
292 /* if we need to print some context lines after the last match, do so */ 362 /* if we need to print some context lines after the last match, do so */
293 if (print_n_lines_after) { 363 if (print_n_lines_after) {
294 print_line(line, linenum, '-'); 364 print_line(line, strlen(line), linenum, '-');
295 print_n_lines_after--; 365 print_n_lines_after--;
296 } else if (lines_before) { 366 } else if (lines_before) {
297 /* Add the line to the circular 'before' buffer */ 367 /* Add the line to the circular 'before' buffer */
298 free(before_buf[curpos]); 368 free(before_buf[curpos]);
299 before_buf[curpos] = line; 369 before_buf[curpos] = line;
370 USE_EXTRA_COMPAT(before_buf_size[curpos] = line_len;)
300 curpos = (curpos + 1) % lines_before; 371 curpos = (curpos + 1) % lines_before;
301 /* avoid free(line) - we took the line */ 372 /* avoid free(line) - we took the line */
302 line = NULL; 373 line = NULL;
@@ -304,13 +375,14 @@ static int grep_file(FILE *file)
304 } 375 }
305 376
306#endif /* ENABLE_FEATURE_GREP_CONTEXT */ 377#endif /* ENABLE_FEATURE_GREP_CONTEXT */
378#if !ENABLE_EXTRA_COMPAT
307 free(line); 379 free(line);
308 380#endif
309 /* Did we print all context after last requested match? */ 381 /* Did we print all context after last requested match? */
310 if ((option_mask32 & OPT_m) 382 if ((option_mask32 & OPT_m)
311 && !print_n_lines_after && nmatches == max_matches) 383 && !print_n_lines_after && nmatches == max_matches)
312 break; 384 break;
313 } 385 } /* while (read line) */
314 386
315 /* special-case file post-processing for options where we don't print line 387 /* special-case file post-processing for options where we don't print line
316 * matches, just filenames and possibly match counts */ 388 * matches, just filenames and possibly match counts */
@@ -428,15 +500,16 @@ int grep_main(int argc, char **argv)
428 lines_after = Copt; 500 lines_after = Copt;
429 if (!(option_mask32 & OPT_B)) /* not overridden */ 501 if (!(option_mask32 & OPT_B)) /* not overridden */
430 lines_before = Copt; 502 lines_before = Copt;
431 //option_mask32 |= OPT_A|OPT_B; /* for parser */
432 } 503 }
433 /* sanity checks */ 504 /* sanity checks */
434 if (option_mask32 & (OPT_c|OPT_q|OPT_l|OPT_L)) { 505 if (option_mask32 & (OPT_c|OPT_q|OPT_l|OPT_L)) {
435 option_mask32 &= ~OPT_n; 506 option_mask32 &= ~OPT_n;
436 lines_before = 0; 507 lines_before = 0;
437 lines_after = 0; 508 lines_after = 0;
438 } else if (lines_before > 0) 509 } else if (lines_before > 0) {
439 before_buf = xzalloc(lines_before * sizeof(char *)); 510 before_buf = xzalloc(lines_before * sizeof(before_buf[0]));
511 USE_EXTRA_COMPAT(before_buf_size = xzalloc(lines_before * sizeof(before_buf_size[0]));)
512 }
440#else 513#else
441 /* with auto sanity checks */ 514 /* with auto sanity checks */
442 /* -H unsets -h; -c,-q or -l unset -n; -e,-f are lists; -m N */ 515 /* -H unsets -h; -c,-q or -l unset -n; -e,-f are lists; -m N */
@@ -537,7 +610,7 @@ int grep_main(int argc, char **argv)
537 if (gl->flg_mem_alocated_compiled & ALLOCATED) 610 if (gl->flg_mem_alocated_compiled & ALLOCATED)
538 free(gl->pattern); 611 free(gl->pattern);
539 if (gl->flg_mem_alocated_compiled & COMPILED) 612 if (gl->flg_mem_alocated_compiled & COMPILED)
540 regfree(&(gl->preg)); 613 regfree(&gl->compiled_regex);
541 free(gl); 614 free(gl);
542 free(pattern_head_ptr); 615 free(pattern_head_ptr);
543 } 616 }
diff --git a/libbb/get_line_from_file.c b/libbb/get_line_from_file.c
index 56761f941..968d7572d 100644
--- a/libbb/get_line_from_file.c
+++ b/libbb/get_line_from_file.c
@@ -9,6 +9,10 @@
9 * Licensed under GPLv2 or later, see file LICENSE in this tarball for details. 9 * Licensed under GPLv2 or later, see file LICENSE in this tarball for details.
10 */ 10 */
11 11
12/* for getline() [GNUism] */
13#ifndef _GNU_SOURCE
14#define _GNU_SOURCE 1
15#endif
12#include "libbb.h" 16#include "libbb.h"
13 17
14/* This function reads an entire line from a text file, up to a newline 18/* This function reads an entire line from a text file, up to a newline
@@ -55,7 +59,6 @@ char* FAST_FUNC xmalloc_fgets(FILE *file)
55 59
56 return bb_get_chunk_from_file(file, &i); 60 return bb_get_chunk_from_file(file, &i);
57} 61}
58
59/* Get line. Remove trailing \n */ 62/* Get line. Remove trailing \n */
60char* FAST_FUNC xmalloc_fgetline(FILE *file) 63char* FAST_FUNC xmalloc_fgetline(FILE *file)
61{ 64{
@@ -69,6 +72,44 @@ char* FAST_FUNC xmalloc_fgetline(FILE *file)
69} 72}
70 73
71#if 0 74#if 0
75
76/* GNUism getline() should be faster (not tested) than a loop with fgetc */
77
78/* Get line, including trailing \n if any */
79char* FAST_FUNC xmalloc_fgets(FILE *file)
80{
81 char *res_buf = NULL;
82 size_t res_sz;
83
84 if (getline(&res_buf, &res_sz, file) == -1) {
85 free(res_buf); /* uclibc allocates a buffer even on EOF. WTF? */
86 res_buf = NULL;
87 }
88//TODO: trimming to res_sz?
89 return res_buf;
90}
91/* Get line. Remove trailing \n */
92char* FAST_FUNC xmalloc_fgetline(FILE *file)
93{
94 char *res_buf = NULL;
95 size_t res_sz;
96
97 res_sz = getline(&res_buf, &res_sz, file);
98
99 if ((ssize_t)res_sz != -1) {
100 if (res_buf[res_sz - 1] == '\n')
101 res_buf[--res_sz] = '\0';
102//TODO: trimming to res_sz?
103 } else {
104 free(res_buf); /* uclibc allocates a buffer even on EOF. WTF? */
105 res_buf = NULL;
106 }
107 return res_buf;
108}
109
110#endif
111
112#if 0
72/* Faster routines (~twice as fast). +170 bytes. Unused as of 2008-07. 113/* Faster routines (~twice as fast). +170 bytes. Unused as of 2008-07.
73 * 114 *
74 * NB: they stop at NUL byte too. 115 * NB: they stop at NUL byte too.
diff --git a/libbb/xregcomp.c b/libbb/xregcomp.c
index abfa35ff1..61efb5bc6 100644
--- a/libbb/xregcomp.c
+++ b/libbb/xregcomp.c
@@ -27,6 +27,6 @@ void FAST_FUNC xregcomp(regex_t *preg, const char *regex, int cflags)
27{ 27{
28 char *errmsg = regcomp_or_errmsg(preg, regex, cflags); 28 char *errmsg = regcomp_or_errmsg(preg, regex, cflags);
29 if (errmsg) { 29 if (errmsg) {
30 bb_error_msg_and_die("xregcomp: %s", errmsg); 30 bb_error_msg_and_die("bad regex '%s': %s", regex, errmsg);
31 } 31 }
32} 32}
diff --git a/testsuite/grep.tests b/testsuite/grep.tests
index b2de2af54..8cee1b9ee 100755
--- a/testsuite/grep.tests
+++ b/testsuite/grep.tests
@@ -62,12 +62,8 @@ testing "grep -s nofile - (stdin and nonexisting file, match)" \
62 "grep -s domatch nonexistent - ; echo \$?" \ 62 "grep -s domatch nonexistent - ; echo \$?" \
63 "(standard input):domatch\n2\n" "" "nomatch\ndomatch\nend\n" 63 "(standard input):domatch\n2\n" "" "nomatch\ndomatch\nend\n"
64 64
65# This doesn't match GNU behaviour (Binary file input matches) 65testing "grep handles NUL in files" "grep -a foo input" "\0foo\n" "\0foo\n\n" ""
66# acts like GNU grep -a 66testing "grep handles NUL on stdin" "grep -a foo" "\0foo\n" "" "\0foo\n\n"
67testing "grep handles binary files" "grep foo input" "foo\n" "\0foo\n\n" ""
68# This doesn't match GNU behaviour (Binary file (standard input) matches)
69# acts like GNU grep -a
70testing "grep handles binary stdin" "grep foo" "foo\n" "" "\0foo\n\n"
71 67
72testing "grep matches NUL" "grep . input > /dev/null 2>&1 ; echo \$?" \ 68testing "grep matches NUL" "grep . input > /dev/null 2>&1 ; echo \$?" \
73 "0\n" "\0\n" "" 69 "0\n" "\0\n" ""