1 files changed, 826 insertions, 0 deletions
diff --git a/regexp.c b/regexp.c
new file mode 100644
index 000000000..6017d79b4
--- /dev/null
+++ b/regexp.c
@@ -0,0 +1,826 @@
+/* regexp.c */
+#include "internal.h"
+#include "regexp.h"
+#include <setjmp.h>
+#include <stdio.h>
+#include <ctype.h>
+#if ( defined BB_GREP || defined BB_FIND )
+/* This also tries to find a needle in a haystack, but uses
+ * real regular expressions....  The fake regular expression
+ * version of find_match lives in utility.c.  Using this version
+ * will add 3.9k to busybox...
+ *  -Erik Andersen
+ */
+extern int find_match(char *haystack, char *needle, int ignoreCase)
+{
+    int status;
+    struct regexp*  re;
+    re = regcomp( needle);
+    status = regexec(re, haystack, FALSE, ignoreCase);
+    free( re);
+    return( status);
+}
+/* code swiped from elvis-tiny 1.4 (a clone of vi) and adjusted to 
+ * suit the needs of busybox by Erik Andersen.
+ *
+ * From the README:
+ * "Elvis is freely redistributable, in either source form or executable form.
+ * There are no restrictions on how you may use it".
+ * Elvis was written by Steve Kirkendall <kirkenda@cs.pdx.edu>
+ *
+ *
+ * This file contains the code that compiles regular expressions and executes
+ * them.  It supports the same syntax and features as vi's regular expression
+ * code.  Specifically, the meta characters are:
+ *      ^       matches the beginning of a line
+ *      $       matches the end of a line
+ *      \<      matches the beginning of a word
+ *      \>      matches the end of a word
+ *      .       matches any single character
+ *      []      matches any character in a character class
+ *      \(      delimits the start of a subexpression
+ *      \)      delimits the end of a subexpression
+ *      *       repeats the preceding 0 or more times
+ * NOTE: You cannot follow a \) with a *.
+ *
+ * The physical structure of a compiled RE is as follows:
+ *      - First, there is a one-byte value that says how many character classes
+ *        are used in this regular expression
+ *      - Next, each character class is stored as a bitmap that is 256 bits
+ *        (32 bytes) long.
+ *      - A mixture of literal characters and compiled meta characters follows.
+ *        This begins with M_BEGIN(0) and ends with M_END(0).  All meta chars
+ *        are stored as a \n followed by a one-byte code, so they take up two
+ *        bytes apiece.  Literal characters take up one byte apiece.  \n can't
+ *        be used as a literal character.
+ *
+ */
+static char *previous;  /* the previous regexp, used when null regexp is given */
+static char *previous1; /* a copy of the text from the previous substitution for regsub()*/
+/* These are used to classify or recognize meta-characters */
+#define META            '\0'
+#define BASE_META(m)    ((m) - 256)
+#define INT_META(c)     ((c) + 256)
+#define IS_META(m)      ((m) >= 256)
+#define IS_CLASS(m)     ((m) >= M_CLASS(0) && (m) <= M_CLASS(9))
+#define IS_START(m)     ((m) >= M_START(0) && (m) <= M_START(9))
+#define IS_END(m)       ((m) >= M_END(0) && (m) <= M_END(9))
+#define IS_CLOSURE(m)   ((m) >= M_SPLAT && (m) <= M_QMARK)
+#define ADD_META(s,m)   (*(s)++ = META, *(s)++ = BASE_META(m))
+#define GET_META(s)     (*(s) == META ? INT_META(*++(s)) : *s)
+/* These are the internal codes used for each type of meta-character */
+#define M_BEGLINE       256             /* internal code for ^ */
+#define M_ENDLINE       257             /* internal code for $ */
+#define M_BEGWORD       258             /* internal code for \< */
+#define M_ENDWORD       259             /* internal code for \> */
+#define M_ANY           260             /* internal code for . */
+#define M_SPLAT         261             /* internal code for * */
+#define M_PLUS          262             /* internal code for \+ */
+#define M_QMARK         263             /* internal code for \? */
+#define M_CLASS(n)      (264+(n))       /* internal code for [] */
+#define M_START(n)      (274+(n))       /* internal code for \( */
+#define M_END(n)        (284+(n))       /* internal code for \) */
+/* These are used during compilation */
+static int      class_cnt;      /* used to assign class IDs */
+static int      start_cnt;      /* used to assign start IDs */
+static int      end_stk[NSUBEXP];/* used to assign end IDs */
+static int      end_sp;
+static char     *retext;        /* points to the text being compiled */
+/* error-handling stuff */
+jmp_buf errorhandler;
+#define FAIL(why)       fprintf(stderr, why); longjmp(errorhandler, 1)
+/* This function builds a bitmap for a particular class */
+/* text -- start of the class */
+/* bmap -- the bitmap */
+static char *makeclass(char* text, char* bmap)
+{
+        int             i;
+        int             complement = 0;
+        /* zero the bitmap */
+        for (i = 0; bmap && i < 32; i++)
+        {
+                bmap[i] = 0;
+        }
+        /* see if we're going to complement this class */
+        if (*text == '^')
+        {
+                text++;
+                complement = 1;
+        }
+        /* add in the characters */
+        while (*text && *text != ']')
+        {
+                /* is this a span of characters? */
+                if (text[1] == '-' && text[2])
+                {
+                        /* spans can't be backwards */
+                        if (text[0] > text[2])
+                        {
+                                FAIL("Backwards span in []");
+                        }
+                        /* add each character in the span to the bitmap */
+                        for (i = text[0]; bmap && i <= text[2]; i++)
+                        {
+                                bmap[i >> 3] |= (1 << (i & 7));
+                        }
+                        /* move past this span */
+                        text += 3;
+                }
+                else
+                {
+                        /* add this single character to the span */
+                        i = *text++;
+                        if (bmap)
+                        {
+                                bmap[i >> 3] |= (1 << (i & 7));
+                        }
+                }
+        }
+        /* make sure the closing ] is missing */
+        if (*text++ != ']')
+        {
+                FAIL("] missing");
+        }
+        /* if we're supposed to complement this class, then do so */
+        if (complement && bmap)
+        {
+                for (i = 0; i < 32; i++)
+                {
+                        bmap[i] = ~bmap[i];
+                }
+        }
+        return text;
+}
+/* This function gets the next character or meta character from a string.
+ * The pointer is incremented by 1, or by 2 for \-quoted characters.  For [],
+ * a bitmap is generated via makeclass() (if re is given), and the
+ * character-class text is skipped.
+ */
+static int gettoken(sptr, re)
+        char    **sptr;
+        regexp  *re;
+{
+        int     c;
+        c = **sptr;
+        ++*sptr;
+        if (c == '\\')
+        {
+                c = **sptr;
+                ++*sptr;
+                switch (c)
+                {
+                  case '<':
+                        return M_BEGWORD;
+                  case '>':
+                        return M_ENDWORD;
+                  case '(':
+                        if (start_cnt >= NSUBEXP)
+                        {
+                                FAIL("Too many \\(s");
+                        }
+                        end_stk[end_sp++] = start_cnt;
+                        return M_START(start_cnt++);
+                  case ')':
+                        if (end_sp <= 0)
+                        {
+                                FAIL("Mismatched \\)");
+                        }
+                        return M_END(end_stk[--end_sp]);
+                  case '*':
+                        return M_SPLAT;
+                  case '.':
+                        return M_ANY;
+                  case '+':
+                        return M_PLUS;
+                  case '?':
+                        return M_QMARK;
+                  default:
+                        return c;
+                }
+        }
+        else {
+                switch (c)
+                {
+                  case '^':
+                        if (*sptr == retext + 1)
+                        {
+                                return M_BEGLINE;
+                        }
+                        return c;
+                  case '$':
+                        if (!**sptr)
+                        {
+                                return M_ENDLINE;
+                        }
+                        return c;
+                  case '.':
+                        return M_ANY;
+                  case '*':
+                        return M_SPLAT;
+                  case '[':
+                        /* make sure we don't have too many classes */
+                        if (class_cnt >= 10)
+                        {
+                                FAIL("Too many []s");
+                        }
+                        /* process the character list for this class */
+                        if (re)
+                        {
+                                /* generate the bitmap for this class */
+                                *sptr = makeclass(*sptr, re->program + 1 + 32 * class_cnt);
+                        }
+                        else
+                        {
+                                /* skip to end of the class */
+                                *sptr = makeclass(*sptr, (char *)0);
+                        }
+                        return M_CLASS(class_cnt++);
+                  default:
+                        return c;
+                }
+        }
+        /*NOTREACHED*/
+}
+/* This function calculates the number of bytes that will be needed for a
+ * compiled RE.  Its argument is the uncompiled version.  It is not clever
+ * about catching syntax errors; that is done in a later pass.
+ */
+static unsigned calcsize(text)
+        char            *text;
+{
+        unsigned        size;
+        int             token;
+        retext = text;
+        class_cnt = 0;
+        start_cnt = 1;
+        end_sp = 0;
+        size = 5;
+        while ((token = gettoken(&text, (regexp *)0)) != 0)
+        {
+                if (IS_CLASS(token))
+                {
+                        size += 34;
+                }
+                else if (IS_META(token))
+                {
+                        size += 2;
+                }
+                else
+                {
+                        size++;
+                }
+        }
+        return size;
+}
+/*---------------------------------------------------------------------------*/
+/* This function checks for a match between a character and a token which is
+ * known to represent a single character.  It returns 0 if they match, or
+ * 1 if they don't.
+ */
+static int match1(regexp* re, char ch, int token, int ignoreCase)
+{
+        if (!ch)
+        {
+                /* the end of a line can't match any RE of width 1 */
+                return 1;
+        }
+        if (token == M_ANY)
+        {
+                return 0;
+        }
+        else if (IS_CLASS(token))
+        {
+                if (re->program[1 + 32 * (token - M_CLASS(0)) + (ch >> 3)] & (1 << (ch & 7)))
+                        return 0;
+        }
+        else if (ch == token
+                || (ignoreCase==TRUE && isupper(ch) && tolower(ch) == token))
+        {
+                return 0;
+        }
+        return 1;
+}
+/* This function checks characters up to and including the next closure, at
+ * which point it does a recursive call to check the rest of it.  This function
+ * returns 0 if everything matches, or 1 if something doesn't match.
+ */
+/* re   -- the regular expression */
+/* str  -- the string */
+/* prog -- a portion of re->program, an compiled RE */
+/* here -- a portion of str, the string to compare it to */
+static int match(regexp* re, char* str, char* prog, char* here, int ignoreCase)
+{
+        int             token;
+        int             nmatched;
+        int             closure;
+        for (token = GET_META(prog); !IS_CLOSURE(token); prog++, token = GET_META(prog))
+        {
+                switch (token)
+                {
+                /*case M_BEGLINE: can't happen; re->bol is used instead */
+                  case M_ENDLINE:
+                        if (*here)
+                                return 1;
+                        break;
+                  case M_BEGWORD:
+                        if (here != str &&
+                           (here[-1] == '_' ||
+                             (isascii(here[-1]) && isalnum(here[-1]))))
+                                return 1;
+                        break;
+                  case M_ENDWORD:
+                        if ((here[0] == '_' || isascii(here[0])) && isalnum(here[0]))
+                                return 1;
+                        break;
+                  case M_START(0):
+                  case M_START(1):
+                  case M_START(2):
+                  case M_START(3):
+                  case M_START(4):
+                  case M_START(5):
+                  case M_START(6):
+                  case M_START(7):
+                  case M_START(8):
+                  case M_START(9):
+                        re->startp[token - M_START(0)] = (char *)here;
+                        break;
+                  case M_END(0):
+                  case M_END(1):
+                  case M_END(2):
+                  case M_END(3):
+                  case M_END(4):
+                  case M_END(5):
+                  case M_END(6):
+                  case M_END(7):
+                  case M_END(8):
+                  case M_END(9):
+                        re->endp[token - M_END(0)] = (char *)here;
+                        if (token == M_END(0))
+                        {
+                                return 0;
+                        }
+                        break;
+                  default: /* literal, M_CLASS(n), or M_ANY */
+                        if (match1(re, *here, token, ignoreCase) != 0)
+                                return 1;
+                        here++;
+                }
+        }
+        /* C L O S U R E */
+        /* step 1: see what we have to match against, and move "prog" to point
+         * the the remainder of the compiled RE.
+         */
+        closure = token;
+        prog++, token = GET_META(prog);
+        prog++;
+        /* step 2: see how many times we can match that token against the string */
+        for (nmatched = 0;
+             (closure != M_QMARK || nmatched < 1) && *here && match1(re, *here, token, ignoreCase) == 0;
+             nmatched++, here++)
+        {
+        }
+        /* step 3: try to match the remainder, and back off if it doesn't */
+        while (nmatched >= 0 && match(re, str, prog, here, ignoreCase) != 0)
+        {
+                nmatched--;
+                here--;
+        }
+        /* so how did it work out? */
+        if (nmatched >= ((closure == M_PLUS) ? 1 : 0))
+                return 0;
+        return 1;
+}
+/* This function compiles a regexp. */
+extern regexp *regcomp(char* text)
+{
+        int             needfirst;
+        unsigned        size;
+        int             token;
+        int             peek;
+        char            *build;
+        regexp          *re;
+        /* prepare for error handling */
+        re = (regexp *)0;
+        if (setjmp(errorhandler))
+        {
+                if (re)
+                {
+                        free(re);
+                }
+                return (regexp *)0;
+        }
+        /* if an empty regexp string was given, use the previous one */
+        if (*text == 0)
+        {
+                if (!previous)
+                {
+                        FAIL("No previous RE");
+                }
+                text = previous;
+        }
+        else /* non-empty regexp given, so remember it */
+        {
+                if (previous)
+                        free(previous);
+                previous = (char *)malloc((unsigned)(strlen(text) + 1));
+                if (previous)
+                        strcpy(previous, text);
+        }
+        /* allocate memory */
+        class_cnt = 0;
+        start_cnt = 1;
+        end_sp = 0;
+        retext = text;
+        size = calcsize(text) + sizeof(regexp);
+        re = (regexp *)malloc((unsigned)size);
+        if (!re)
+        {
+                FAIL("Not enough memory for this RE");
+        }
+        /* compile it */
+        build = &re->program[1 + 32 * class_cnt];
+        re->program[0] = class_cnt;
+        for (token = 0; token < NSUBEXP; token++)
+        {
+                re->startp[token] = re->endp[token] = (char *)0;
+        }
+        re->first = 0;
+        re->bol = 0;
+        re->minlen = 0;
+        needfirst = 1;
+        class_cnt = 0;
+        start_cnt = 1;
+        end_sp = 0;
+        retext = text;
+        for (token = M_START(0), peek = gettoken(&text, re);
+             token;
+             token = peek, peek = gettoken(&text, re))
+        {
+                /* special processing for the closure operator */
+                if (IS_CLOSURE(peek))
+                {
+                        /* detect misuse of closure operator */
+                        if (IS_START(token))
+                        {
+                                FAIL("* or \\+ or \\? follows nothing");
+                        }
+                        else if (IS_META(token) && token != M_ANY && !IS_CLASS(token))
+                        {
+                                FAIL("* or \\+ or \\? can only follow a normal character or . or []");
+                        }
+                        /* it is okay -- make it prefix instead of postfix */
+                        ADD_META(build, peek);
+                        /* take care of "needfirst" - is this the first char? */
+                        if (needfirst && peek == M_PLUS && !IS_META(token))
+                        {
+                                re->first = token;
+                        }
+                        needfirst = 0;
+                        /* we used "peek" -- need to refill it */
+                        peek = gettoken(&text, re);
+                        if (IS_CLOSURE(peek))
+                        {
+                                FAIL("* or \\+ or \\? doubled up");
+                        }
+                }
+                else if (!IS_META(token))
+                {
+                        /* normal char is NOT argument of closure */
+                        if (needfirst)
+                        {
+                                re->first = token;
+                                needfirst = 0;
+                        }
+                        re->minlen++;
+                }
+                else if (token == M_ANY || IS_CLASS(token))
+                {
+                        /* . or [] is NOT argument of closure */
+                        needfirst = 0;
+                        re->minlen++;
+                }
+                /* the "token" character is not closure -- process it normally */
+                if (token == M_BEGLINE)
+                {
+                        /* set the BOL flag instead of storing M_BEGLINE */
+                        re->bol = 1;
+                }
+                else if (IS_META(token))
+                {
+                        ADD_META(build, token);
+                }
+                else
+                {
+                        *build++ = token;
+                }
+        }
+        /* end it with a \) which MUST MATCH the opening \( */
+        ADD_META(build, M_END(0));
+        if (end_sp > 0)
+        {
+                FAIL("Not enough \\)s");
+        }
+        return re;
+}
+/* This function searches through a string for text that matches an RE. */
+/* re  -- the compiled regexp to search for */
+/* str -- the string to search through */
+/* bol -- does str start at the beginning of a line? (boolean) */
+/* ignoreCase -- ignoreCase or not */
+extern int regexec(struct regexp* re, char* str, int bol, int ignoreCase)
+{
+        char    *prog;  /* the entry point of re->program */
+        int     len;    /* length of the string */
+        char    *here;
+        /* if must start at the beginning of a line, and this isn't, then fail */
+        if (re->bol && bol==TRUE)
+        {
+                return FALSE;
+        }
+        len = strlen(str);
+        prog = re->program + 1 + 32 * re->program[0];
+        /* search for the RE in the string */
+        if (re->bol)
+        {
+                /* must occur at BOL */
+                if ((re->first
+                        && match1(re, *(char *)str, re->first, ignoreCase))/* wrong first letter? */
+                 || len < re->minlen                    /* not long enough? */
+                 || match(re, (char *)str, prog, str, ignoreCase))      /* doesn't match? */
+                        return FALSE;                   /* THEN FAIL! */
+        }
+        else if (ignoreCase == FALSE)
+        {
+                /* can occur anywhere in the line, noignorecase */
+                for (here = (char *)str;
+                     (re->first && re->first != *here)
+                        || match(re, (char *)str, prog, here, ignoreCase);
+                     here++, len--)
+                {
+                        if (len < re->minlen)
+                                return FALSE;
+                }
+        }
+        else
+        {
+                /* can occur anywhere in the line, ignorecase */
+                for (here = (char *)str;
+                     (re->first && match1(re, *here, (int)re->first, ignoreCase))
+                        || match(re, (char *)str, prog, here, ignoreCase);
+                     here++, len--)
+                {
+                        if (len < re->minlen)
+                                return FALSE;
+                }
+        }
+        /* if we didn't fail, then we must have succeeded */
+        return TRUE;
+}
+/* This performs substitutions after a regexp match has been found.  */
+extern void regsub(regexp* re, char* src, char* dst)
+{
+        char    *cpy;
+        char    *end;
+        char    c;
+        char            *start;
+        int             mod;
+        mod = 0;
+        start = src;
+        while ((c = *src++) != '\0')
+        {
+                /* recognize any meta characters */
+                if (c == '&')
+                {
+                        cpy = re->startp[0];
+                        end = re->endp[0];
+                }
+                else if (c == '~')
+                {
+                        cpy = previous1;
+                        if (cpy)
+                                end = cpy + strlen(cpy);
+                }
+                else
+                if (c == '\\')
+                {
+                        c = *src++;
+                        switch (c)
+                        {
+                          case '0':
+                          case '1':
+                          case '2':
+                          case '3':
+                          case '4':
+                          case '5':
+                          case '6':
+                          case '7':
+                          case '8':
+                          case '9':
+                                /* \0 thru \9 mean "copy subexpression" */
+                                c -= '0';
+                                cpy = re->startp[(int)c];
+                                end = re->endp[(int)c];
+                                break;
+                          case 'U':
+                          case 'u':
+                          case 'L':
+                          case 'l':
+                                /* \U and \L mean "convert to upper/lowercase" */
+                                mod = c;
+                                continue;
+                          case 'E':
+                          case 'e':
+                                /* \E ends the \U or \L */
+                                mod = 0;
+                                continue;
+                          case '&':
+                                /* "\&" means "original text" */
+                                *dst++ = c;
+                                continue;
+                          case '~':
+                                /* "\~" means "previous text, if any" */
+                                *dst++ = c;
+                                continue;
+                          default:
+                                /* ordinary char preceded by backslash */
+                                *dst++ = c;
+                                continue;
+                        }
+                }
+                else
+                {
+                        /* ordinary character, so just copy it */
+                        *dst++ = c;
+                        continue;
+                }
+                /* Note: to reach this point in the code, we must have evaded
+                 * all "continue" statements.  To do that, we must have hit
+                 * a metacharacter that involves copying.
+                 */
+                /* if there is nothing to copy, loop */
+                if (!cpy)
+                        continue;
+                /* copy over a portion of the original */
+                while (cpy < end)
+                {
+                        switch (mod)
+                        {
+                          case 'U':
+                          case 'u':
+                                /* convert to uppercase */
+                                if (isascii(*cpy) && islower(*cpy))
+                                {
+                                        *dst++ = toupper(*cpy);
+                                        cpy++;
+                                }
+                                else
+                                {
+                                        *dst++ = *cpy++;
+                                }
+                                break;
+                          case 'L':
+                          case 'l':
+                                /* convert to lowercase */
+                                if (isascii(*cpy) && isupper(*cpy))
+                                {
+                                        *dst++ = tolower(*cpy);
+                                        cpy++;
+                                }
+                                else
+                                {
+                                        *dst++ = *cpy++;
+                                }
+                                break;
+                          default:
+                                /* copy without any conversion */
+                                *dst++ = *cpy++;
+                        }
+                        /* \u and \l end automatically after the first char */
+                        if (mod && (mod == 'u' || mod == 'l'))
+                        {
+                                mod = 0;
+                        }
+                }
+        }
+        *dst = '\0';
+        /* remember what text we inserted this time */
+        if (previous1)
+                free(previous1);
+        previous1 = (char *)malloc((unsigned)(strlen(start) + 1));
+        if (previous1)
+                strcpy(previous1, start);
+}
+#endif /* BB_REGEXP */

diff --git a/regexp.c b/regexp.c new file mode 100644 index 000000000..6017d79b4 --- /dev/null +++ b/regexp.c
@@ -0,0 +1,826 @@
	1	/* regexp.c */
	2
	3	#include "internal.h"
	4	#include "regexp.h"
	5	#include <setjmp.h>
	6	#include <stdio.h>
	7	#include <ctype.h>
	8
	9
	10	#if ( defined BB_GREP \|\| defined BB_FIND )
	11
	12	/* This also tries to find a needle in a haystack, but uses
	13	* real regular expressions.... The fake regular expression
	14	* version of find_match lives in utility.c. Using this version
	15	* will add 3.9k to busybox...
	16	* -Erik Andersen
	17	*/
	18	extern int find_match(char haystack, char needle, int ignoreCase)
	19	{
	20	int status;
	21	struct regexp* re;
	22	re = regcomp( needle);
	23	status = regexec(re, haystack, FALSE, ignoreCase);
	24	free( re);
	25	return( status);
	26	}
	27
	28
	29	/* code swiped from elvis-tiny 1.4 (a clone of vi) and adjusted to
	30	* suit the needs of busybox by Erik Andersen.
	31	*
	32	* From the README:
	33	* "Elvis is freely redistributable, in either source form or executable form.
	34	* There are no restrictions on how you may use it".
	35	* Elvis was written by Steve Kirkendall <kirkenda@cs.pdx.edu>
	36	*
	37	*
	38	* This file contains the code that compiles regular expressions and executes
	39	* them. It supports the same syntax and features as vi's regular expression
	40	* code. Specifically, the meta characters are:
	41	* ^ matches the beginning of a line
	42	* $ matches the end of a line
	43	* \< matches the beginning of a word
	44	* \> matches the end of a word
	45	* . matches any single character
	46	* [] matches any character in a character class
	47	* \( delimits the start of a subexpression
	48	* \) delimits the end of a subexpression
	49	* * repeats the preceding 0 or more times
	50	* NOTE: You cannot follow a \) with a *.
	51	*
	52	* The physical structure of a compiled RE is as follows:
	53	* - First, there is a one-byte value that says how many character classes
	54	* are used in this regular expression
	55	* - Next, each character class is stored as a bitmap that is 256 bits
	56	* (32 bytes) long.
	57	* - A mixture of literal characters and compiled meta characters follows.
	58	* This begins with M_BEGIN(0) and ends with M_END(0). All meta chars
	59	* are stored as a \n followed by a one-byte code, so they take up two
	60	* bytes apiece. Literal characters take up one byte apiece. \n can't
	61	* be used as a literal character.
	62	*
	63	*/
	64
	65
	66
	67	static char previous; / the previous regexp, used when null regexp is given */
	68	static char previous1; / a copy of the text from the previous substitution for regsub()*/
	69
	70
	71	/* These are used to classify or recognize meta-characters */
	72	#define META '\0'
	73	#define BASE_META(m) ((m) - 256)
	74	#define INT_META(c) ((c) + 256)
	75	#define IS_META(m) ((m) >= 256)
	76	#define IS_CLASS(m) ((m) >= M_CLASS(0) && (m) <= M_CLASS(9))
	77	#define IS_START(m) ((m) >= M_START(0) && (m) <= M_START(9))
	78	#define IS_END(m) ((m) >= M_END(0) && (m) <= M_END(9))
	79	#define IS_CLOSURE(m) ((m) >= M_SPLAT && (m) <= M_QMARK)
	80	#define ADD_META(s,m) ((s)++ = META, (s)++ = BASE_META(m))
	81	#define GET_META(s) ((s) == META ? INT_META(++(s)) : *s)
	82
	83	/* These are the internal codes used for each type of meta-character */
	84	#define M_BEGLINE 256 /* internal code for ^ */
	85	#define M_ENDLINE 257 /* internal code for $ */
	86	#define M_BEGWORD 258 /* internal code for \< */
	87	#define M_ENDWORD 259 /* internal code for \> */
	88	#define M_ANY 260 /* internal code for . */
	89	#define M_SPLAT 261 /* internal code for * */
	90	#define M_PLUS 262 /* internal code for \+ */
	91	#define M_QMARK 263 /* internal code for \? */
	92	#define M_CLASS(n) (264+(n)) /* internal code for [] */
	93	#define M_START(n) (274+(n)) /* internal code for \( */
	94	#define M_END(n) (284+(n)) /* internal code for \) */
	95
	96	/* These are used during compilation */
	97	static int class_cnt; /* used to assign class IDs */
	98	static int start_cnt; /* used to assign start IDs */
	99	static int end_stk[NSUBEXP];/* used to assign end IDs */
	100	static int end_sp;
	101	static char retext; / points to the text being compiled */
	102
	103	/* error-handling stuff */
	104	jmp_buf errorhandler;
	105	#define FAIL(why) fprintf(stderr, why); longjmp(errorhandler, 1)
	106
	107
	108
	109
	110	/* This function builds a bitmap for a particular class */
	111	/* text -- start of the class */
	112	/* bmap -- the bitmap */
	113	static char makeclass(char text, char* bmap)
	114	{
	115	int i;
	116	int complement = 0;
	117
	118
	119	/* zero the bitmap */
	120	for (i = 0; bmap && i < 32; i++)
	121	{
	122	bmap[i] = 0;
	123	}
	124
	125	/* see if we're going to complement this class */
	126	if (*text == '^')
	127	{
	128	text++;
	129	complement = 1;
	130	}
	131
	132	/* add in the characters */
	133	while (text && text != ']')
	134	{
	135	/* is this a span of characters? */
	136	if (text[1] == '-' && text[2])
	137	{
	138	/* spans can't be backwards */
	139	if (text[0] > text[2])
	140	{
	141	FAIL("Backwards span in []");
	142	}
	143
	144	/* add each character in the span to the bitmap */
	145	for (i = text[0]; bmap && i <= text[2]; i++)
	146	{
	147	bmap[i >> 3] \|= (1 << (i & 7));
	148	}
	149
	150	/* move past this span */
	151	text += 3;
	152	}
	153	else
	154	{
	155	/* add this single character to the span */
	156	i = *text++;
	157	if (bmap)
	158	{
	159	bmap[i >> 3] \|= (1 << (i & 7));
	160	}
	161	}
	162	}
	163
	164	/* make sure the closing ] is missing */
	165	if (*text++ != ']')
	166	{
	167	FAIL("] missing");
	168	}
	169
	170	/* if we're supposed to complement this class, then do so */
	171	if (complement && bmap)
	172	{
	173	for (i = 0; i < 32; i++)
	174	{
	175	bmap[i] = ~bmap[i];
	176	}
	177	}
	178
	179	return text;
	180	}
	181
	182
	183
	184
	185	/* This function gets the next character or meta character from a string.
	186	* The pointer is incremented by 1, or by 2 for \-quoted characters. For [],
	187	* a bitmap is generated via makeclass() (if re is given), and the
	188	* character-class text is skipped.
	189	*/
	190	static int gettoken(sptr, re)
	191	char **sptr;
	192	regexp *re;
	193	{
	194	int c;
	195
	196	c = **sptr;
	197	++*sptr;
	198	if (c == '\\')
	199	{
	200	c = **sptr;
	201	++*sptr;
	202	switch (c)
	203	{
	204	case '<':
	205	return M_BEGWORD;
	206
	207	case '>':
	208	return M_ENDWORD;
	209
	210	case '(':
	211	if (start_cnt >= NSUBEXP)
	212	{
	213	FAIL("Too many \\(s");
	214	}
	215	end_stk[end_sp++] = start_cnt;
	216	return M_START(start_cnt++);
	217
	218	case ')':
	219	if (end_sp <= 0)
	220	{
	221	FAIL("Mismatched \\)");
	222	}
	223	return M_END(end_stk[--end_sp]);
	224
	225	case '*':
	226	return M_SPLAT;
	227
	228	case '.':
	229	return M_ANY;
	230
	231	case '+':
	232	return M_PLUS;
	233
	234	case '?':
	235	return M_QMARK;
	236
	237	default:
	238	return c;
	239	}
	240	}
	241	else {
	242	switch (c)
	243	{
	244	case '^':
	245	if (*sptr == retext + 1)
	246	{
	247	return M_BEGLINE;
	248	}
	249	return c;
	250
	251	case '$':
	252	if (!**sptr)
	253	{
	254	return M_ENDLINE;
	255	}
	256	return c;
	257
	258	case '.':
	259	return M_ANY;
	260
	261	case '*':
	262	return M_SPLAT;
	263
	264	case '[':
	265	/* make sure we don't have too many classes */
	266	if (class_cnt >= 10)
	267	{
	268	FAIL("Too many []s");
	269	}
	270
	271	/* process the character list for this class */
	272	if (re)
	273	{
	274	/* generate the bitmap for this class */
	275	sptr = makeclass(sptr, re->program + 1 + 32 * class_cnt);
	276	}
	277	else
	278	{
	279	/* skip to end of the class */
	280	sptr = makeclass(sptr, (char *)0);
	281	}
	282	return M_CLASS(class_cnt++);
	283
	284	default:
	285	return c;
	286	}
	287	}
	288	/NOTREACHED/
	289	}
	290
	291
	292
	293
	294	/* This function calculates the number of bytes that will be needed for a
	295	* compiled RE. Its argument is the uncompiled version. It is not clever
	296	* about catching syntax errors; that is done in a later pass.
	297	*/
	298	static unsigned calcsize(text)
	299	char *text;
	300	{
	301	unsigned size;
	302	int token;
	303
	304	retext = text;
	305	class_cnt = 0;
	306	start_cnt = 1;
	307	end_sp = 0;
	308	size = 5;
	309	while ((token = gettoken(&text, (regexp *)0)) != 0)
	310	{
	311	if (IS_CLASS(token))
	312	{
	313	size += 34;
	314	}
	315	else if (IS_META(token))
	316	{
	317	size += 2;
	318	}
	319	else
	320	{
	321	size++;
	322	}
	323	}
	324
	325	return size;
	326	}
	327
	328
	329
	330	/---------------------------------------------------------------------------/
	331
	332
	333	/* This function checks for a match between a character and a token which is
	334	* known to represent a single character. It returns 0 if they match, or
	335	* 1 if they don't.
	336	*/
	337	static int match1(regexp* re, char ch, int token, int ignoreCase)
	338	{
	339	if (!ch)
	340	{
	341	/* the end of a line can't match any RE of width 1 */
	342	return 1;
	343	}
	344	if (token == M_ANY)
	345	{
	346	return 0;
	347	}
	348	else if (IS_CLASS(token))
	349	{
	350	if (re->program[1 + 32 * (token - M_CLASS(0)) + (ch >> 3)] & (1 << (ch & 7)))
	351	return 0;
	352	}
	353	else if (ch == token
	354	\|\| (ignoreCase==TRUE && isupper(ch) && tolower(ch) == token))
	355	{
	356	return 0;
	357	}
	358	return 1;
	359	}
	360
	361
	362
	363	/* This function checks characters up to and including the next closure, at
	364	* which point it does a recursive call to check the rest of it. This function
	365	* returns 0 if everything matches, or 1 if something doesn't match.
	366	*/
	367	/* re -- the regular expression */
	368	/* str -- the string */
	369	/* prog -- a portion of re->program, an compiled RE */
	370	/* here -- a portion of str, the string to compare it to */
	371	static int match(regexp* re, char* str, char* prog, char* here, int ignoreCase)
	372	{
	373	int token;
	374	int nmatched;
	375	int closure;
	376
	377	for (token = GET_META(prog); !IS_CLOSURE(token); prog++, token = GET_META(prog))
	378	{
	379	switch (token)
	380	{
	381	/case M_BEGLINE: can't happen; re->bol is used instead /
	382	case M_ENDLINE:
	383	if (*here)
	384	return 1;
	385	break;
	386
	387	case M_BEGWORD:
	388	if (here != str &&
	389	(here[-1] == '_' \|\|
	390	(isascii(here[-1]) && isalnum(here[-1]))))
	391	return 1;
	392	break;
	393
	394	case M_ENDWORD:
	395	if ((here[0] == '_' \|\| isascii(here[0])) && isalnum(here[0]))
	396	return 1;
	397	break;
	398
	399	case M_START(0):
	400	case M_START(1):
	401	case M_START(2):
	402	case M_START(3):
	403	case M_START(4):
	404	case M_START(5):
	405	case M_START(6):
	406	case M_START(7):
	407	case M_START(8):
	408	case M_START(9):
	409	re->startp[token - M_START(0)] = (char *)here;
	410	break;
	411
	412	case M_END(0):
	413	case M_END(1):
	414	case M_END(2):
	415	case M_END(3):
	416	case M_END(4):
	417	case M_END(5):
	418	case M_END(6):
	419	case M_END(7):
	420	case M_END(8):
	421	case M_END(9):
	422	re->endp[token - M_END(0)] = (char *)here;
	423	if (token == M_END(0))
	424	{
	425	return 0;
	426	}
	427	break;
	428
	429	default: /* literal, M_CLASS(n), or M_ANY */
	430	if (match1(re, *here, token, ignoreCase) != 0)
	431	return 1;
	432	here++;
	433	}
	434	}
	435
	436	/* C L O S U R E */
	437
	438	/* step 1: see what we have to match against, and move "prog" to point
	439	* the the remainder of the compiled RE.
	440	*/
	441	closure = token;
	442	prog++, token = GET_META(prog);
	443	prog++;
	444
	445	/* step 2: see how many times we can match that token against the string */
	446	for (nmatched = 0;
	447	(closure != M_QMARK \|\| nmatched < 1) && here && match1(re, here, token, ignoreCase) == 0;
	448	nmatched++, here++)
	449	{
	450	}
	451
	452	/* step 3: try to match the remainder, and back off if it doesn't */
	453	while (nmatched >= 0 && match(re, str, prog, here, ignoreCase) != 0)
	454	{
	455	nmatched--;
	456	here--;
	457	}
	458
	459	/* so how did it work out? */
	460	if (nmatched >= ((closure == M_PLUS) ? 1 : 0))
	461	return 0;
	462	return 1;
	463	}
	464
	465
	466	/* This function compiles a regexp. */
	467	extern regexp regcomp(char text)
	468	{
	469	int needfirst;
	470	unsigned size;
	471	int token;
	472	int peek;
	473	char *build;
	474	regexp *re;
	475
	476
	477	/* prepare for error handling */
	478	re = (regexp *)0;
	479	if (setjmp(errorhandler))
	480	{
	481	if (re)
	482	{
	483	free(re);
	484	}
	485	return (regexp *)0;
	486	}
	487
	488	/* if an empty regexp string was given, use the previous one */
	489	if (*text == 0)
	490	{
	491	if (!previous)
	492	{
	493	FAIL("No previous RE");
	494	}
	495	text = previous;
	496	}
	497	else /* non-empty regexp given, so remember it */
	498	{
	499	if (previous)
	500	free(previous);
	501	previous = (char *)malloc((unsigned)(strlen(text) + 1));
	502	if (previous)
	503	strcpy(previous, text);
	504	}
	505
	506	/* allocate memory */
	507	class_cnt = 0;
	508	start_cnt = 1;
	509	end_sp = 0;
	510	retext = text;
	511	size = calcsize(text) + sizeof(regexp);
	512	re = (regexp *)malloc((unsigned)size);
	513
	514	if (!re)
	515	{
	516	FAIL("Not enough memory for this RE");
	517	}
	518
	519	/* compile it */
	520	build = &re->program[1 + 32 * class_cnt];
	521	re->program[0] = class_cnt;
	522	for (token = 0; token < NSUBEXP; token++)
	523	{
	524	re->startp[token] = re->endp[token] = (char *)0;
	525	}
	526	re->first = 0;
	527	re->bol = 0;
	528	re->minlen = 0;
	529	needfirst = 1;
	530	class_cnt = 0;
	531	start_cnt = 1;
	532	end_sp = 0;
	533	retext = text;
	534	for (token = M_START(0), peek = gettoken(&text, re);
	535	token;
	536	token = peek, peek = gettoken(&text, re))
	537	{
	538	/* special processing for the closure operator */
	539	if (IS_CLOSURE(peek))
	540	{
	541	/* detect misuse of closure operator */
	542	if (IS_START(token))
	543	{
	544	FAIL("* or \\+ or \\? follows nothing");
	545	}
	546	else if (IS_META(token) && token != M_ANY && !IS_CLASS(token))
	547	{
	548	FAIL("* or \\+ or \\? can only follow a normal character or . or []");
	549	}
	550
	551	/* it is okay -- make it prefix instead of postfix */
	552	ADD_META(build, peek);
	553
	554	/* take care of "needfirst" - is this the first char? */
	555	if (needfirst && peek == M_PLUS && !IS_META(token))
	556	{
	557	re->first = token;
	558	}
	559	needfirst = 0;
	560
	561	/* we used "peek" -- need to refill it */
	562	peek = gettoken(&text, re);
	563	if (IS_CLOSURE(peek))
	564	{
	565	FAIL("* or \\+ or \\? doubled up");
	566	}
	567	}
	568	else if (!IS_META(token))
	569	{
	570	/* normal char is NOT argument of closure */
	571	if (needfirst)
	572	{
	573	re->first = token;
	574	needfirst = 0;
	575	}
	576	re->minlen++;
	577	}
	578	else if (token == M_ANY \|\| IS_CLASS(token))
	579	{
	580	/* . or [] is NOT argument of closure */
	581	needfirst = 0;
	582	re->minlen++;
	583	}
	584
	585	/* the "token" character is not closure -- process it normally */
	586	if (token == M_BEGLINE)
	587	{
	588	/* set the BOL flag instead of storing M_BEGLINE */
	589	re->bol = 1;
	590	}
	591	else if (IS_META(token))
	592	{
	593	ADD_META(build, token);
	594	}
	595	else
	596	{
	597	*build++ = token;
	598	}
	599	}
	600
	601	/* end it with a \) which MUST MATCH the opening \( */
	602	ADD_META(build, M_END(0));
	603	if (end_sp > 0)
	604	{
	605	FAIL("Not enough \\)s");
	606	}
	607
	608	return re;
	609	}
	610
	611
	612
	613
	614	/* This function searches through a string for text that matches an RE. */
	615	/* re -- the compiled regexp to search for */
	616	/* str -- the string to search through */
	617	/* bol -- does str start at the beginning of a line? (boolean) */
	618	/* ignoreCase -- ignoreCase or not */
	619	extern int regexec(struct regexp* re, char* str, int bol, int ignoreCase)
	620	{
	621	char prog; / the entry point of re->program */
	622	int len; /* length of the string */
	623	char *here;
	624
	625	/* if must start at the beginning of a line, and this isn't, then fail */
	626	if (re->bol && bol==TRUE)
	627	{
	628	return FALSE;
	629	}
	630
	631	len = strlen(str);
	632	prog = re->program + 1 + 32 * re->program[0];
	633
	634	/* search for the RE in the string */
	635	if (re->bol)
	636	{
	637	/* must occur at BOL */
	638	if ((re->first
	639	&& match1(re, (char )str, re->first, ignoreCase))/* wrong first letter? */
	640	\|\| len < re->minlen /* not long enough? */
	641	\|\| match(re, (char )str, prog, str, ignoreCase)) / doesn't match? */
	642	return FALSE; /* THEN FAIL! */
	643	}
	644	else if (ignoreCase == FALSE)
	645	{
	646	/* can occur anywhere in the line, noignorecase */
	647	for (here = (char *)str;
	648	(re->first && re->first != *here)
	649	\|\| match(re, (char *)str, prog, here, ignoreCase);
	650	here++, len--)
	651	{
	652	if (len < re->minlen)
	653	return FALSE;
	654	}
	655	}
	656	else
	657	{
	658	/* can occur anywhere in the line, ignorecase */
	659	for (here = (char *)str;
	660	(re->first && match1(re, *here, (int)re->first, ignoreCase))
	661	\|\| match(re, (char *)str, prog, here, ignoreCase);
	662	here++, len--)
	663	{
	664	if (len < re->minlen)
	665	return FALSE;
	666	}
	667	}
	668
	669	/* if we didn't fail, then we must have succeeded */
	670	return TRUE;
	671	}
	672
	673
	674
	675
	676
	677	/* This performs substitutions after a regexp match has been found. */
	678	extern void regsub(regexp* re, char* src, char* dst)
	679	{
	680	char *cpy;
	681	char *end;
	682	char c;
	683	char *start;
	684	int mod;
	685
	686	mod = 0;
	687
	688	start = src;
	689	while ((c = *src++) != '\0')
	690	{
	691	/* recognize any meta characters */
	692	if (c == '&')
	693	{
	694	cpy = re->startp[0];
	695	end = re->endp[0];
	696	}
	697	else if (c == '~')
	698	{
	699	cpy = previous1;
	700	if (cpy)
	701	end = cpy + strlen(cpy);
	702	}
	703	else
	704	if (c == '\\')
	705	{
	706	c = *src++;
	707	switch (c)
	708	{
	709	case '0':
	710	case '1':
	711	case '2':
	712	case '3':
	713	case '4':
	714	case '5':
	715	case '6':
	716	case '7':
	717	case '8':
	718	case '9':
	719	/* \0 thru \9 mean "copy subexpression" */
	720	c -= '0';
	721	cpy = re->startp[(int)c];
	722	end = re->endp[(int)c];
	723	break;
	724	case 'U':
	725	case 'u':
	726	case 'L':
	727	case 'l':
	728	/* \U and \L mean "convert to upper/lowercase" */
	729	mod = c;
	730	continue;
	731
	732	case 'E':
	733	case 'e':
	734	/* \E ends the \U or \L */
	735	mod = 0;
	736	continue;
	737	case '&':
	738	/* "\&" means "original text" */
	739	*dst++ = c;
	740	continue;
	741
	742	case '~':
	743	/* "\~" means "previous text, if any" */
	744	*dst++ = c;
	745	continue;
	746	default:
	747	/* ordinary char preceded by backslash */
	748	*dst++ = c;
	749	continue;
	750	}
	751	}
	752	else
	753	{
	754	/* ordinary character, so just copy it */
	755	*dst++ = c;
	756	continue;
	757	}
	758
	759	/* Note: to reach this point in the code, we must have evaded
	760	* all "continue" statements. To do that, we must have hit
	761	* a metacharacter that involves copying.
	762	*/
	763
	764	/* if there is nothing to copy, loop */
	765	if (!cpy)
	766	continue;
	767
	768	/* copy over a portion of the original */
	769	while (cpy < end)
	770	{
	771	switch (mod)
	772	{
	773	case 'U':
	774	case 'u':
	775	/* convert to uppercase */
	776	if (isascii(cpy) && islower(cpy))
	777	{
	778	dst++ = toupper(cpy);
	779	cpy++;
	780	}
	781	else
	782	{
	783	dst++ = cpy++;
	784	}
	785	break;
	786
	787	case 'L':
	788	case 'l':
	789	/* convert to lowercase */
	790	if (isascii(cpy) && isupper(cpy))
	791	{
	792	dst++ = tolower(cpy);
	793	cpy++;
	794	}
	795	else
	796	{
	797	dst++ = cpy++;
	798	}
	799	break;
	800
	801	default:
	802	/* copy without any conversion */
	803	dst++ = cpy++;
	804	}
	805
	806	/* \u and \l end automatically after the first char */
	807	if (mod && (mod == 'u' \|\| mod == 'l'))
	808	{
	809	mod = 0;
	810	}
	811	}
	812	}
	813	*dst = '\0';
	814
	815	/* remember what text we inserted this time */
	816	if (previous1)
	817	free(previous1);
	818	previous1 = (char *)malloc((unsigned)(strlen(start) + 1));
	819	if (previous1)
	820	strcpy(previous1, start);
	821	}
	822
	823
	824	#endif /* BB_REGEXP */
	825
	826