aboutsummaryrefslogtreecommitdiff
path: root/regexp.c
diff options
context:
space:
mode:
Diffstat (limited to 'regexp.c')
-rw-r--r--regexp.c826
1 files changed, 826 insertions, 0 deletions
diff --git a/regexp.c b/regexp.c
new file mode 100644
index 000000000..6017d79b4
--- /dev/null
+++ b/regexp.c
@@ -0,0 +1,826 @@
1/* regexp.c */
2
3#include "internal.h"
4#include "regexp.h"
5#include <setjmp.h>
6#include <stdio.h>
7#include <ctype.h>
8
9
10#if ( defined BB_GREP || defined BB_FIND )
11
12/* This also tries to find a needle in a haystack, but uses
13 * real regular expressions.... The fake regular expression
14 * version of find_match lives in utility.c. Using this version
15 * will add 3.9k to busybox...
16 * -Erik Andersen
17 */
18extern int find_match(char *haystack, char *needle, int ignoreCase)
19{
20 int status;
21 struct regexp* re;
22 re = regcomp( needle);
23 status = regexec(re, haystack, FALSE, ignoreCase);
24 free( re);
25 return( status);
26}
27
28
29/* code swiped from elvis-tiny 1.4 (a clone of vi) and adjusted to
30 * suit the needs of busybox by Erik Andersen.
31 *
32 * From the README:
33 * "Elvis is freely redistributable, in either source form or executable form.
34 * There are no restrictions on how you may use it".
35 * Elvis was written by Steve Kirkendall <kirkenda@cs.pdx.edu>
36 *
37 *
38 * This file contains the code that compiles regular expressions and executes
39 * them. It supports the same syntax and features as vi's regular expression
40 * code. Specifically, the meta characters are:
41 * ^ matches the beginning of a line
42 * $ matches the end of a line
43 * \< matches the beginning of a word
44 * \> matches the end of a word
45 * . matches any single character
46 * [] matches any character in a character class
47 * \( delimits the start of a subexpression
48 * \) delimits the end of a subexpression
49 * * repeats the preceding 0 or more times
50 * NOTE: You cannot follow a \) with a *.
51 *
52 * The physical structure of a compiled RE is as follows:
53 * - First, there is a one-byte value that says how many character classes
54 * are used in this regular expression
55 * - Next, each character class is stored as a bitmap that is 256 bits
56 * (32 bytes) long.
57 * - A mixture of literal characters and compiled meta characters follows.
58 * This begins with M_BEGIN(0) and ends with M_END(0). All meta chars
59 * are stored as a \n followed by a one-byte code, so they take up two
60 * bytes apiece. Literal characters take up one byte apiece. \n can't
61 * be used as a literal character.
62 *
63 */
64
65
66
67static char *previous; /* the previous regexp, used when null regexp is given */
68static char *previous1; /* a copy of the text from the previous substitution for regsub()*/
69
70
71/* These are used to classify or recognize meta-characters */
72#define META '\0'
73#define BASE_META(m) ((m) - 256)
74#define INT_META(c) ((c) + 256)
75#define IS_META(m) ((m) >= 256)
76#define IS_CLASS(m) ((m) >= M_CLASS(0) && (m) <= M_CLASS(9))
77#define IS_START(m) ((m) >= M_START(0) && (m) <= M_START(9))
78#define IS_END(m) ((m) >= M_END(0) && (m) <= M_END(9))
79#define IS_CLOSURE(m) ((m) >= M_SPLAT && (m) <= M_QMARK)
80#define ADD_META(s,m) (*(s)++ = META, *(s)++ = BASE_META(m))
81#define GET_META(s) (*(s) == META ? INT_META(*++(s)) : *s)
82
83/* These are the internal codes used for each type of meta-character */
84#define M_BEGLINE 256 /* internal code for ^ */
85#define M_ENDLINE 257 /* internal code for $ */
86#define M_BEGWORD 258 /* internal code for \< */
87#define M_ENDWORD 259 /* internal code for \> */
88#define M_ANY 260 /* internal code for . */
89#define M_SPLAT 261 /* internal code for * */
90#define M_PLUS 262 /* internal code for \+ */
91#define M_QMARK 263 /* internal code for \? */
92#define M_CLASS(n) (264+(n)) /* internal code for [] */
93#define M_START(n) (274+(n)) /* internal code for \( */
94#define M_END(n) (284+(n)) /* internal code for \) */
95
96/* These are used during compilation */
97static int class_cnt; /* used to assign class IDs */
98static int start_cnt; /* used to assign start IDs */
99static int end_stk[NSUBEXP];/* used to assign end IDs */
100static int end_sp;
101static char *retext; /* points to the text being compiled */
102
103/* error-handling stuff */
104jmp_buf errorhandler;
105#define FAIL(why) fprintf(stderr, why); longjmp(errorhandler, 1)
106
107
108
109
110/* This function builds a bitmap for a particular class */
111/* text -- start of the class */
112/* bmap -- the bitmap */
113static char *makeclass(char* text, char* bmap)
114{
115 int i;
116 int complement = 0;
117
118
119 /* zero the bitmap */
120 for (i = 0; bmap && i < 32; i++)
121 {
122 bmap[i] = 0;
123 }
124
125 /* see if we're going to complement this class */
126 if (*text == '^')
127 {
128 text++;
129 complement = 1;
130 }
131
132 /* add in the characters */
133 while (*text && *text != ']')
134 {
135 /* is this a span of characters? */
136 if (text[1] == '-' && text[2])
137 {
138 /* spans can't be backwards */
139 if (text[0] > text[2])
140 {
141 FAIL("Backwards span in []");
142 }
143
144 /* add each character in the span to the bitmap */
145 for (i = text[0]; bmap && i <= text[2]; i++)
146 {
147 bmap[i >> 3] |= (1 << (i & 7));
148 }
149
150 /* move past this span */
151 text += 3;
152 }
153 else
154 {
155 /* add this single character to the span */
156 i = *text++;
157 if (bmap)
158 {
159 bmap[i >> 3] |= (1 << (i & 7));
160 }
161 }
162 }
163
164 /* make sure the closing ] is missing */
165 if (*text++ != ']')
166 {
167 FAIL("] missing");
168 }
169
170 /* if we're supposed to complement this class, then do so */
171 if (complement && bmap)
172 {
173 for (i = 0; i < 32; i++)
174 {
175 bmap[i] = ~bmap[i];
176 }
177 }
178
179 return text;
180}
181
182
183
184
185/* This function gets the next character or meta character from a string.
186 * The pointer is incremented by 1, or by 2 for \-quoted characters. For [],
187 * a bitmap is generated via makeclass() (if re is given), and the
188 * character-class text is skipped.
189 */
190static int gettoken(sptr, re)
191 char **sptr;
192 regexp *re;
193{
194 int c;
195
196 c = **sptr;
197 ++*sptr;
198 if (c == '\\')
199 {
200 c = **sptr;
201 ++*sptr;
202 switch (c)
203 {
204 case '<':
205 return M_BEGWORD;
206
207 case '>':
208 return M_ENDWORD;
209
210 case '(':
211 if (start_cnt >= NSUBEXP)
212 {
213 FAIL("Too many \\(s");
214 }
215 end_stk[end_sp++] = start_cnt;
216 return M_START(start_cnt++);
217
218 case ')':
219 if (end_sp <= 0)
220 {
221 FAIL("Mismatched \\)");
222 }
223 return M_END(end_stk[--end_sp]);
224
225 case '*':
226 return M_SPLAT;
227
228 case '.':
229 return M_ANY;
230
231 case '+':
232 return M_PLUS;
233
234 case '?':
235 return M_QMARK;
236
237 default:
238 return c;
239 }
240 }
241 else {
242 switch (c)
243 {
244 case '^':
245 if (*sptr == retext + 1)
246 {
247 return M_BEGLINE;
248 }
249 return c;
250
251 case '$':
252 if (!**sptr)
253 {
254 return M_ENDLINE;
255 }
256 return c;
257
258 case '.':
259 return M_ANY;
260
261 case '*':
262 return M_SPLAT;
263
264 case '[':
265 /* make sure we don't have too many classes */
266 if (class_cnt >= 10)
267 {
268 FAIL("Too many []s");
269 }
270
271 /* process the character list for this class */
272 if (re)
273 {
274 /* generate the bitmap for this class */
275 *sptr = makeclass(*sptr, re->program + 1 + 32 * class_cnt);
276 }
277 else
278 {
279 /* skip to end of the class */
280 *sptr = makeclass(*sptr, (char *)0);
281 }
282 return M_CLASS(class_cnt++);
283
284 default:
285 return c;
286 }
287 }
288 /*NOTREACHED*/
289}
290
291
292
293
294/* This function calculates the number of bytes that will be needed for a
295 * compiled RE. Its argument is the uncompiled version. It is not clever
296 * about catching syntax errors; that is done in a later pass.
297 */
298static unsigned calcsize(text)
299 char *text;
300{
301 unsigned size;
302 int token;
303
304 retext = text;
305 class_cnt = 0;
306 start_cnt = 1;
307 end_sp = 0;
308 size = 5;
309 while ((token = gettoken(&text, (regexp *)0)) != 0)
310 {
311 if (IS_CLASS(token))
312 {
313 size += 34;
314 }
315 else if (IS_META(token))
316 {
317 size += 2;
318 }
319 else
320 {
321 size++;
322 }
323 }
324
325 return size;
326}
327
328
329
330/*---------------------------------------------------------------------------*/
331
332
333/* This function checks for a match between a character and a token which is
334 * known to represent a single character. It returns 0 if they match, or
335 * 1 if they don't.
336 */
337static int match1(regexp* re, char ch, int token, int ignoreCase)
338{
339 if (!ch)
340 {
341 /* the end of a line can't match any RE of width 1 */
342 return 1;
343 }
344 if (token == M_ANY)
345 {
346 return 0;
347 }
348 else if (IS_CLASS(token))
349 {
350 if (re->program[1 + 32 * (token - M_CLASS(0)) + (ch >> 3)] & (1 << (ch & 7)))
351 return 0;
352 }
353 else if (ch == token
354 || (ignoreCase==TRUE && isupper(ch) && tolower(ch) == token))
355 {
356 return 0;
357 }
358 return 1;
359}
360
361
362
363/* This function checks characters up to and including the next closure, at
364 * which point it does a recursive call to check the rest of it. This function
365 * returns 0 if everything matches, or 1 if something doesn't match.
366 */
367/* re -- the regular expression */
368/* str -- the string */
369/* prog -- a portion of re->program, an compiled RE */
370/* here -- a portion of str, the string to compare it to */
371static int match(regexp* re, char* str, char* prog, char* here, int ignoreCase)
372{
373 int token;
374 int nmatched;
375 int closure;
376
377 for (token = GET_META(prog); !IS_CLOSURE(token); prog++, token = GET_META(prog))
378 {
379 switch (token)
380 {
381 /*case M_BEGLINE: can't happen; re->bol is used instead */
382 case M_ENDLINE:
383 if (*here)
384 return 1;
385 break;
386
387 case M_BEGWORD:
388 if (here != str &&
389 (here[-1] == '_' ||
390 (isascii(here[-1]) && isalnum(here[-1]))))
391 return 1;
392 break;
393
394 case M_ENDWORD:
395 if ((here[0] == '_' || isascii(here[0])) && isalnum(here[0]))
396 return 1;
397 break;
398
399 case M_START(0):
400 case M_START(1):
401 case M_START(2):
402 case M_START(3):
403 case M_START(4):
404 case M_START(5):
405 case M_START(6):
406 case M_START(7):
407 case M_START(8):
408 case M_START(9):
409 re->startp[token - M_START(0)] = (char *)here;
410 break;
411
412 case M_END(0):
413 case M_END(1):
414 case M_END(2):
415 case M_END(3):
416 case M_END(4):
417 case M_END(5):
418 case M_END(6):
419 case M_END(7):
420 case M_END(8):
421 case M_END(9):
422 re->endp[token - M_END(0)] = (char *)here;
423 if (token == M_END(0))
424 {
425 return 0;
426 }
427 break;
428
429 default: /* literal, M_CLASS(n), or M_ANY */
430 if (match1(re, *here, token, ignoreCase) != 0)
431 return 1;
432 here++;
433 }
434 }
435
436 /* C L O S U R E */
437
438 /* step 1: see what we have to match against, and move "prog" to point
439 * the the remainder of the compiled RE.
440 */
441 closure = token;
442 prog++, token = GET_META(prog);
443 prog++;
444
445 /* step 2: see how many times we can match that token against the string */
446 for (nmatched = 0;
447 (closure != M_QMARK || nmatched < 1) && *here && match1(re, *here, token, ignoreCase) == 0;
448 nmatched++, here++)
449 {
450 }
451
452 /* step 3: try to match the remainder, and back off if it doesn't */
453 while (nmatched >= 0 && match(re, str, prog, here, ignoreCase) != 0)
454 {
455 nmatched--;
456 here--;
457 }
458
459 /* so how did it work out? */
460 if (nmatched >= ((closure == M_PLUS) ? 1 : 0))
461 return 0;
462 return 1;
463}
464
465
466/* This function compiles a regexp. */
467extern regexp *regcomp(char* text)
468{
469 int needfirst;
470 unsigned size;
471 int token;
472 int peek;
473 char *build;
474 regexp *re;
475
476
477 /* prepare for error handling */
478 re = (regexp *)0;
479 if (setjmp(errorhandler))
480 {
481 if (re)
482 {
483 free(re);
484 }
485 return (regexp *)0;
486 }
487
488 /* if an empty regexp string was given, use the previous one */
489 if (*text == 0)
490 {
491 if (!previous)
492 {
493 FAIL("No previous RE");
494 }
495 text = previous;
496 }
497 else /* non-empty regexp given, so remember it */
498 {
499 if (previous)
500 free(previous);
501 previous = (char *)malloc((unsigned)(strlen(text) + 1));
502 if (previous)
503 strcpy(previous, text);
504 }
505
506 /* allocate memory */
507 class_cnt = 0;
508 start_cnt = 1;
509 end_sp = 0;
510 retext = text;
511 size = calcsize(text) + sizeof(regexp);
512 re = (regexp *)malloc((unsigned)size);
513
514 if (!re)
515 {
516 FAIL("Not enough memory for this RE");
517 }
518
519 /* compile it */
520 build = &re->program[1 + 32 * class_cnt];
521 re->program[0] = class_cnt;
522 for (token = 0; token < NSUBEXP; token++)
523 {
524 re->startp[token] = re->endp[token] = (char *)0;
525 }
526 re->first = 0;
527 re->bol = 0;
528 re->minlen = 0;
529 needfirst = 1;
530 class_cnt = 0;
531 start_cnt = 1;
532 end_sp = 0;
533 retext = text;
534 for (token = M_START(0), peek = gettoken(&text, re);
535 token;
536 token = peek, peek = gettoken(&text, re))
537 {
538 /* special processing for the closure operator */
539 if (IS_CLOSURE(peek))
540 {
541 /* detect misuse of closure operator */
542 if (IS_START(token))
543 {
544 FAIL("* or \\+ or \\? follows nothing");
545 }
546 else if (IS_META(token) && token != M_ANY && !IS_CLASS(token))
547 {
548 FAIL("* or \\+ or \\? can only follow a normal character or . or []");
549 }
550
551 /* it is okay -- make it prefix instead of postfix */
552 ADD_META(build, peek);
553
554 /* take care of "needfirst" - is this the first char? */
555 if (needfirst && peek == M_PLUS && !IS_META(token))
556 {
557 re->first = token;
558 }
559 needfirst = 0;
560
561 /* we used "peek" -- need to refill it */
562 peek = gettoken(&text, re);
563 if (IS_CLOSURE(peek))
564 {
565 FAIL("* or \\+ or \\? doubled up");
566 }
567 }
568 else if (!IS_META(token))
569 {
570 /* normal char is NOT argument of closure */
571 if (needfirst)
572 {
573 re->first = token;
574 needfirst = 0;
575 }
576 re->minlen++;
577 }
578 else if (token == M_ANY || IS_CLASS(token))
579 {
580 /* . or [] is NOT argument of closure */
581 needfirst = 0;
582 re->minlen++;
583 }
584
585 /* the "token" character is not closure -- process it normally */
586 if (token == M_BEGLINE)
587 {
588 /* set the BOL flag instead of storing M_BEGLINE */
589 re->bol = 1;
590 }
591 else if (IS_META(token))
592 {
593 ADD_META(build, token);
594 }
595 else
596 {
597 *build++ = token;
598 }
599 }
600
601 /* end it with a \) which MUST MATCH the opening \( */
602 ADD_META(build, M_END(0));
603 if (end_sp > 0)
604 {
605 FAIL("Not enough \\)s");
606 }
607
608 return re;
609}
610
611
612
613
614/* This function searches through a string for text that matches an RE. */
615/* re -- the compiled regexp to search for */
616/* str -- the string to search through */
617/* bol -- does str start at the beginning of a line? (boolean) */
618/* ignoreCase -- ignoreCase or not */
619extern int regexec(struct regexp* re, char* str, int bol, int ignoreCase)
620{
621 char *prog; /* the entry point of re->program */
622 int len; /* length of the string */
623 char *here;
624
625 /* if must start at the beginning of a line, and this isn't, then fail */
626 if (re->bol && bol==TRUE)
627 {
628 return FALSE;
629 }
630
631 len = strlen(str);
632 prog = re->program + 1 + 32 * re->program[0];
633
634 /* search for the RE in the string */
635 if (re->bol)
636 {
637 /* must occur at BOL */
638 if ((re->first
639 && match1(re, *(char *)str, re->first, ignoreCase))/* wrong first letter? */
640 || len < re->minlen /* not long enough? */
641 || match(re, (char *)str, prog, str, ignoreCase)) /* doesn't match? */
642 return FALSE; /* THEN FAIL! */
643 }
644 else if (ignoreCase == FALSE)
645 {
646 /* can occur anywhere in the line, noignorecase */
647 for (here = (char *)str;
648 (re->first && re->first != *here)
649 || match(re, (char *)str, prog, here, ignoreCase);
650 here++, len--)
651 {
652 if (len < re->minlen)
653 return FALSE;
654 }
655 }
656 else
657 {
658 /* can occur anywhere in the line, ignorecase */
659 for (here = (char *)str;
660 (re->first && match1(re, *here, (int)re->first, ignoreCase))
661 || match(re, (char *)str, prog, here, ignoreCase);
662 here++, len--)
663 {
664 if (len < re->minlen)
665 return FALSE;
666 }
667 }
668
669 /* if we didn't fail, then we must have succeeded */
670 return TRUE;
671}
672
673
674
675
676
677/* This performs substitutions after a regexp match has been found. */
678extern void regsub(regexp* re, char* src, char* dst)
679{
680 char *cpy;
681 char *end;
682 char c;
683 char *start;
684 int mod;
685
686 mod = 0;
687
688 start = src;
689 while ((c = *src++) != '\0')
690 {
691 /* recognize any meta characters */
692 if (c == '&')
693 {
694 cpy = re->startp[0];
695 end = re->endp[0];
696 }
697 else if (c == '~')
698 {
699 cpy = previous1;
700 if (cpy)
701 end = cpy + strlen(cpy);
702 }
703 else
704 if (c == '\\')
705 {
706 c = *src++;
707 switch (c)
708 {
709 case '0':
710 case '1':
711 case '2':
712 case '3':
713 case '4':
714 case '5':
715 case '6':
716 case '7':
717 case '8':
718 case '9':
719 /* \0 thru \9 mean "copy subexpression" */
720 c -= '0';
721 cpy = re->startp[(int)c];
722 end = re->endp[(int)c];
723 break;
724 case 'U':
725 case 'u':
726 case 'L':
727 case 'l':
728 /* \U and \L mean "convert to upper/lowercase" */
729 mod = c;
730 continue;
731
732 case 'E':
733 case 'e':
734 /* \E ends the \U or \L */
735 mod = 0;
736 continue;
737 case '&':
738 /* "\&" means "original text" */
739 *dst++ = c;
740 continue;
741
742 case '~':
743 /* "\~" means "previous text, if any" */
744 *dst++ = c;
745 continue;
746 default:
747 /* ordinary char preceded by backslash */
748 *dst++ = c;
749 continue;
750 }
751 }
752 else
753 {
754 /* ordinary character, so just copy it */
755 *dst++ = c;
756 continue;
757 }
758
759 /* Note: to reach this point in the code, we must have evaded
760 * all "continue" statements. To do that, we must have hit
761 * a metacharacter that involves copying.
762 */
763
764 /* if there is nothing to copy, loop */
765 if (!cpy)
766 continue;
767
768 /* copy over a portion of the original */
769 while (cpy < end)
770 {
771 switch (mod)
772 {
773 case 'U':
774 case 'u':
775 /* convert to uppercase */
776 if (isascii(*cpy) && islower(*cpy))
777 {
778 *dst++ = toupper(*cpy);
779 cpy++;
780 }
781 else
782 {
783 *dst++ = *cpy++;
784 }
785 break;
786
787 case 'L':
788 case 'l':
789 /* convert to lowercase */
790 if (isascii(*cpy) && isupper(*cpy))
791 {
792 *dst++ = tolower(*cpy);
793 cpy++;
794 }
795 else
796 {
797 *dst++ = *cpy++;
798 }
799 break;
800
801 default:
802 /* copy without any conversion */
803 *dst++ = *cpy++;
804 }
805
806 /* \u and \l end automatically after the first char */
807 if (mod && (mod == 'u' || mod == 'l'))
808 {
809 mod = 0;
810 }
811 }
812 }
813 *dst = '\0';
814
815 /* remember what text we inserted this time */
816 if (previous1)
817 free(previous1);
818 previous1 = (char *)malloc((unsigned)(strlen(start) + 1));
819 if (previous1)
820 strcpy(previous1, start);
821}
822
823
824#endif /* BB_REGEXP */
825
826