aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohannes Schindelin <johannes.schindelin@gmx.de>2017-06-28 11:29:20 +0200
committerJohannes Schindelin <johannes.schindelin@gmx.de>2017-07-14 22:57:49 +0200
commitdacee10ce244e3da9f2d1f0f43a3224ab9e5d26b (patch)
tree1a57e6afd73fae8732b05641a6235593385a7464
parent8b66b26302cb1d5422605bd247f7df53e0d2bb26 (diff)
downloadbusybox-w32-dacee10ce244e3da9f2d1f0f43a3224ab9e5d26b.tar.gz
busybox-w32-dacee10ce244e3da9f2d1f0f43a3224ab9e5d26b.tar.bz2
busybox-w32-dacee10ce244e3da9f2d1f0f43a3224ab9e5d26b.zip
win32/regex: update to newest version in Git
In 909696f13 (win32: Import regex source, 2010-04-14), Git's compat/regex/ was imported wholesale, with one change (to avoid redefining _GNU_SOURCE). For the record, the git.git commit mentioned in that commit message refers to a transient commit made to git.git's `next` branch which is rewound with every major Git version, therefore it is long gone. Also for the record, the correct reference would be: 3632cfc2487 (Use compatibility regex library for OSX/Darwin, 2008-09-07), i.e. the compat/regex/ source code as of Git v1.6.0.2. This commit updates the regex source code to that of Git v2.13.2, or bd8f0055836 (regex: fix a SIZE_MAX macro redefinition warning, 2016-06-07) in git.git. Instead of the original fixup to avoid redefining _GNU_SOURCE, we now require these changes relative to Git's source code: > diff --git a/win32/regex.c b/win32/regex.c > index 5cb23e5d5..95e5d757a 100644 > --- a/win32/regex.c > +++ b/win32/regex.c > @@ -18,9 +18,11 @@ > Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > 02110-1301 USA. */ > > -#ifdef HAVE_CONFIG_H > -#include "config.h" > -#endif > +#define HAVE_LIBINTL_H 0 > +#define ENABLE_NLS 0 > +#define HAVE_ALLOCA 0 > +#define NO_MBSUPPORT 1 > +#define GAWK 1 > > /* Make sure no one compiles this code with a C++ compiler. */ > #ifdef __cplusplus Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
-rw-r--r--win32/regcomp.c3884
-rw-r--r--win32/regex.c5009
-rw-r--r--win32/regex.h462
-rw-r--r--win32/regex_internal.c1744
-rw-r--r--win32/regex_internal.h810
-rw-r--r--win32/regexec.c4369
6 files changed, 11169 insertions, 5109 deletions
diff --git a/win32/regcomp.c b/win32/regcomp.c
new file mode 100644
index 000000000..d8bde06f1
--- /dev/null
+++ b/win32/regcomp.c
@@ -0,0 +1,3884 @@
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA. */
20
21static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 size_t length, reg_syntax_t syntax);
23static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27#ifdef RE_ENABLE_I18N
28static void free_charset (re_charset_t *cset);
29#endif /* RE_ENABLE_I18N */
30static void free_workarea_compile (regex_t *preg);
31static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32#ifdef RE_ENABLE_I18N
33static void optimize_utf8 (re_dfa_t *dfa);
34#endif
35static reg_errcode_t analyze (regex_t *preg);
36static reg_errcode_t preorder (bin_tree_t *root,
37 reg_errcode_t (fn (void *, bin_tree_t *)),
38 void *extra);
39static reg_errcode_t postorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 bin_tree_t *node);
46static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
50static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
51 unsigned int constraint);
52static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 int node, int root);
55static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56static int fetch_number (re_string_t *input, re_token_t *token,
57 reg_syntax_t syntax);
58static int peek_token (re_token_t *token, re_string_t *input,
59 reg_syntax_t syntax) internal_function;
60static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 int nest, reg_errcode_t *err);
65static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 int nest, reg_errcode_t *err);
68static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 int nest, reg_errcode_t *err);
71static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 int nest, reg_errcode_t *err);
74static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
79 reg_errcode_t *err);
80static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_string_t *regexp,
82 re_token_t *token, int token_len,
83 re_dfa_t *dfa,
84 reg_syntax_t syntax,
85 int accept_hyphen);
86static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
87 re_string_t *regexp,
88 re_token_t *token);
89#ifdef RE_ENABLE_I18N
90static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 re_charset_t *mbcset,
92 int *equiv_class_alloc,
93 const unsigned char *name);
94static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
95 bitset_t sbcset,
96 re_charset_t *mbcset,
97 int *char_class_alloc,
98 const char *class_name,
99 reg_syntax_t syntax);
100#else /* not RE_ENABLE_I18N */
101static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 const unsigned char *name);
103static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 bitset_t sbcset,
105 const char *class_name,
106 reg_syntax_t syntax);
107#endif /* not RE_ENABLE_I18N */
108static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 RE_TRANSLATE_TYPE trans,
110 const char *class_name,
111 const char *extra,
112 int non_match, reg_errcode_t *err);
113static bin_tree_t *create_tree (re_dfa_t *dfa,
114 bin_tree_t *left, bin_tree_t *right,
115 re_token_type_t type);
116static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 const re_token_t *token);
119static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120static void free_token (re_token_t *node);
121static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123
124/* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
128
129const char __re_error_msgid[] attribute_hidden =
130 {
131#define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
133 "\0"
134#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
136 "\0"
137#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 "\0"
140#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 "\0"
143#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 "\0"
146#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 "\0"
149#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 "\0"
152#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 "\0"
155#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 "\0"
158#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 "\0"
161#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 "\0"
164#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 "\0"
167#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 "\0"
170#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 "\0"
173#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 "\0"
176#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 "\0"
179#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
181 };
182
183const size_t __re_error_msgid_idx[] attribute_hidden =
184 {
185 REG_NOERROR_IDX,
186 REG_NOMATCH_IDX,
187 REG_BADPAT_IDX,
188 REG_ECOLLATE_IDX,
189 REG_ECTYPE_IDX,
190 REG_EESCAPE_IDX,
191 REG_ESUBREG_IDX,
192 REG_EBRACK_IDX,
193 REG_EPAREN_IDX,
194 REG_EBRACE_IDX,
195 REG_BADBR_IDX,
196 REG_ERANGE_IDX,
197 REG_ESPACE_IDX,
198 REG_BADRPT_IDX,
199 REG_EEND_IDX,
200 REG_ESIZE_IDX,
201 REG_ERPAREN_IDX
202 };
203
204/* Entry points for GNU code. */
205
206
#ifdef ZOS_USS

/* For ZOS USS we must define btowc.

   Convert the single byte C to a wide character with mbtowc.  The result
   of mbtowc is checked: when the byte does not form a valid character on
   its own, WEOF is returned instead of reading a wide character that was
   never written.
   NOTE(review): the return type stays wchar_t to keep the existing
   interface; confirm that (wchar_t) WEOF still compares equal to WEOF on
   the target platform.  */

wchar_t
btowc (int c)
{
  wchar_t wtmp[2] = { 0, 0 };
  char tmp[2];

  tmp[0] = c;
  tmp[1] = 0;

  /* Only the first byte is examined (n == 1).  A negative result means the
     byte is not a complete, valid character.  */
  if (mbtowc (wtmp, tmp, 1) < 0)
    return (wchar_t) WEOF;
  return wtmp[0];
}
#endif
224
225/* re_compile_pattern is the GNU regular expression compiler: it
226 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
227 Returns 0 if the pattern was valid, otherwise an error string.
228
229 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
230 are set in BUFP on entry. */
231
232const char *
233re_compile_pattern (const char *pattern,
234 size_t length,
235 struct re_pattern_buffer *bufp)
236{
237 reg_errcode_t ret;
238
239 /* And GNU code determines whether or not to get register information
240 by passing null for the REGS argument to re_match, etc., not by
241 setting no_sub, unless RE_NO_SUB is set. */
242 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
243
244 /* Match anchors at newline. */
245 bufp->newline_anchor = 1;
246
247 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
248
249 if (!ret)
250 return NULL;
251 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
252}
253#ifdef _LIBC
254weak_alias (__re_compile_pattern, re_compile_pattern)
255#endif
256
257/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
258 also be assigned to arbitrarily: each pattern buffer stores its own
259 syntax, so it can be changed between regex compilations. */
260/* This has no initializer because initialized variables in Emacs
261 become read-only after dumping. */
262reg_syntax_t re_syntax_options;
263
264
265/* Specify the precise syntax of regexps for compilation. This provides
266 for compatibility for various utilities which historically have
267 different, incompatible syntaxes.
268
269 The argument SYNTAX is a bit mask comprised of the various bits
270 defined in regex.h. We return the old syntax. */
271
272reg_syntax_t
273re_set_syntax (reg_syntax_t syntax)
274{
275 reg_syntax_t ret = re_syntax_options;
276
277 re_syntax_options = syntax;
278 return ret;
279}
280#ifdef _LIBC
281weak_alias (__re_set_syntax, re_set_syntax)
282#endif
283
284int
285re_compile_fastmap (struct re_pattern_buffer *bufp)
286{
287 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
288 char *fastmap = bufp->fastmap;
289
290 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
291 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
292 if (dfa->init_state != dfa->init_state_word)
293 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
294 if (dfa->init_state != dfa->init_state_nl)
295 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
296 if (dfa->init_state != dfa->init_state_begbuf)
297 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
298 bufp->fastmap_accurate = 1;
299 return 0;
300}
301#ifdef _LIBC
302weak_alias (__re_compile_fastmap, re_compile_fastmap)
303#endif
304
/* Mark byte CH in FASTMAP.  When ICASE is nonzero, the entry for
   tolower (CH) is marked as well.  */
static inline void
__attribute ((always_inline))
re_set_fastmap (char *fastmap, int icase, int ch)
{
  fastmap[ch] = 1;
  if (!icase)
    return;
  fastmap[tolower (ch)] = 1;
}
313
314/* Helper function for re_compile_fastmap.
315 Compile fastmap for the initial_state INIT_STATE. */
316
317static void
318re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
319 char *fastmap)
320{
321 volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
322 int node_cnt;
323 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
324 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
325 {
326 int node = init_state->nodes.elems[node_cnt];
327 re_token_type_t type = dfa->nodes[node].type;
328
329 if (type == CHARACTER)
330 {
331 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
332#ifdef RE_ENABLE_I18N
333 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
334 {
335 unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
336 wchar_t wc;
337 mbstate_t state;
338
339 p = buf;
340 *p++ = dfa->nodes[node].opr.c;
341 while (++node < dfa->nodes_len
342 && dfa->nodes[node].type == CHARACTER
343 && dfa->nodes[node].mb_partial)
344 *p++ = dfa->nodes[node].opr.c;
345 memset (&state, '\0', sizeof (state));
346 if (__mbrtowc (&wc, (const char *) buf, p - buf,
347 &state) == p - buf
348 && (__wcrtomb ((char *) buf, towlower (wc), &state)
349 != (size_t) -1))
350 re_set_fastmap (fastmap, 0, buf[0]);
351 re_free (buf);
352 }
353#endif
354 }
355 else if (type == SIMPLE_BRACKET)
356 {
357 int i, ch;
358 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
359 {
360 int j;
361 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
362 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
363 if (w & ((bitset_word_t) 1 << j))
364 re_set_fastmap (fastmap, icase, ch);
365 }
366 }
367#ifdef RE_ENABLE_I18N
368 else if (type == COMPLEX_BRACKET)
369 {
370 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
371 int i;
372
373# ifdef _LIBC
374 /* See if we have to try all bytes which start multiple collation
375 elements.
376 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
377 collation element, and don't catch 'b' since 'b' is
378 the only collation element which starts from 'b' (and
379 it is caught by SIMPLE_BRACKET). */
380 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
381 && (cset->ncoll_syms || cset->nranges))
382 {
383 const int32_t *table = (const int32_t *)
384 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
385 for (i = 0; i < SBC_MAX; ++i)
386 if (table[i] < 0)
387 re_set_fastmap (fastmap, icase, i);
388 }
389# endif /* _LIBC */
390
391 /* See if we have to start the match at all multibyte characters,
392 i.e. where we would not find an invalid sequence. This only
393 applies to multibyte character sets; for single byte character
394 sets, the SIMPLE_BRACKET again suffices. */
395 if (dfa->mb_cur_max > 1
396 && (cset->nchar_classes || cset->non_match || cset->nranges
397# ifdef _LIBC
398 || cset->nequiv_classes
399# endif /* _LIBC */
400 ))
401 {
402 unsigned char c = 0;
403 do
404 {
405 mbstate_t mbs;
406 memset (&mbs, 0, sizeof (mbs));
407 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
408 re_set_fastmap (fastmap, false, (int) c);
409 }
410 while (++c != 0);
411 }
412
413 else
414 {
415 /* ... Else catch all bytes which can start the mbchars. */
416 for (i = 0; i < cset->nmbchars; ++i)
417 {
418 char buf[256];
419 mbstate_t state;
420 memset (&state, '\0', sizeof (state));
421 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
422 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
423 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
424 {
425 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
426 != (size_t) -1)
427 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
428 }
429 }
430 }
431 }
432#endif /* RE_ENABLE_I18N */
433 else if (type == OP_PERIOD
434#ifdef RE_ENABLE_I18N
435 || type == OP_UTF8_PERIOD
436#endif /* RE_ENABLE_I18N */
437 || type == END_OF_RE)
438 {
439 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
440 if (type == END_OF_RE)
441 bufp->can_be_null = 1;
442 return;
443 }
444 }
445}
446
447/* Entry point for POSIX code. */
448/* regcomp takes a regular expression as a string and compiles it.
449
450 PREG is a regex_t *. We do not expect any fields to be initialized,
451 since POSIX says we shouldn't. Thus, we set
452
453 `buffer' to the compiled pattern;
454 `used' to the length of the compiled pattern;
455 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
456 REG_EXTENDED bit in CFLAGS is set; otherwise, to
457 RE_SYNTAX_POSIX_BASIC;
458 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
459 `fastmap' to an allocated space for the fastmap;
460 `fastmap_accurate' to zero;
461 `re_nsub' to the number of subexpressions in PATTERN.
462
463 PATTERN is the address of the pattern string.
464
465 CFLAGS is a series of bits which affect compilation.
466
467 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
468 use POSIX basic syntax.
469
470 If REG_NEWLINE is set, then . and [^...] don't match newline.
471 Also, regexec will try a match beginning after every newline.
472
473 If REG_ICASE is set, then we considers upper- and lowercase
474 versions of letters to be equivalent when matching.
475
476 If REG_NOSUB is set, then when PREG is passed to regexec, that
477 routine will report only success or failure, and nothing about the
478 registers.
479
480 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
481 the return codes and their meanings.) */
482
483int
484regcomp (regex_t *__restrict preg,
485 const char *__restrict pattern,
486 int cflags)
487{
488 reg_errcode_t ret;
489 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
490 : RE_SYNTAX_POSIX_BASIC);
491
492 preg->buffer = NULL;
493 preg->allocated = 0;
494 preg->used = 0;
495
496 /* Try to allocate space for the fastmap. */
497 preg->fastmap = re_malloc (char, SBC_MAX);
498 if (BE (preg->fastmap == NULL, 0))
499 return REG_ESPACE;
500
501 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
502
503 /* If REG_NEWLINE is set, newlines are treated differently. */
504 if (cflags & REG_NEWLINE)
505 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
506 syntax &= ~RE_DOT_NEWLINE;
507 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
508 /* It also changes the matching behavior. */
509 preg->newline_anchor = 1;
510 }
511 else
512 preg->newline_anchor = 0;
513 preg->no_sub = !!(cflags & REG_NOSUB);
514 preg->translate = NULL;
515
516 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
517
518 /* POSIX doesn't distinguish between an unmatched open-group and an
519 unmatched close-group: both are REG_EPAREN. */
520 if (ret == REG_ERPAREN)
521 ret = REG_EPAREN;
522
523 /* We have already checked preg->fastmap != NULL. */
524 if (BE (ret == REG_NOERROR, 1))
525 /* Compute the fastmap now, since regexec cannot modify the pattern
526 buffer. This function never fails in this implementation. */
527 (void) re_compile_fastmap (preg);
528 else
529 {
530 /* Some error occurred while compiling the expression. */
531 re_free (preg->fastmap);
532 preg->fastmap = NULL;
533 }
534
535 return (int) ret;
536}
537#ifdef _LIBC
538weak_alias (__regcomp, regcomp)
539#endif
540
541/* Returns a message corresponding to an error code, ERRCODE, returned
542 from either regcomp or regexec. We don't use PREG here. */
543
544size_t
545regerror(int errcode, const regex_t *__restrict preg,
546 char *__restrict errbuf, size_t errbuf_size)
547{
548 const char *msg;
549 size_t msg_size;
550
551 if (BE (errcode < 0
552 || errcode >= (int) (sizeof (__re_error_msgid_idx)
553 / sizeof (__re_error_msgid_idx[0])), 0))
554 /* Only error codes returned by the rest of the code should be passed
555 to this routine. If we are given anything else, or if other regex
556 code generates an invalid error code, then the program has a bug.
557 Dump core so we can fix it. */
558 abort ();
559
560 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
561
562 msg_size = strlen (msg) + 1; /* Includes the null. */
563
564 if (BE (errbuf_size != 0, 1))
565 {
566 if (BE (msg_size > errbuf_size, 0))
567 {
568 memcpy (errbuf, msg, errbuf_size - 1);
569 errbuf[errbuf_size - 1] = 0;
570 }
571 else
572 memcpy (errbuf, msg, msg_size);
573 }
574
575 return msg_size;
576}
577#ifdef _LIBC
578weak_alias (__regerror, regerror)
579#endif
580
581
#ifdef RE_ENABLE_I18N
/* Single-byte-character map shared by every DFA compiled for UTF-8: the
   first 128 bits (0x00-0x7f) are set.  Sharing one static table avoids
   allocating and filling an identical map for each pattern, which is
   worthwhile because UTF-8 is the preferred encoding.  With GCC 3 or later
   the table is a compile-time constant; otherwise it is a writable array
   that init_dfa fills in at run time.  */
# if __GNUC__ >= 3
static const bitset_t utf8_sb_map = {
  /* Set the first 128 bits.  */
  [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
};
# else /* ! (__GNUC__ >= 3) */
static bitset_t utf8_sb_map;
# endif /* __GNUC__ >= 3 */
#endif /* RE_ENABLE_I18N */
596
597
598static void
599free_dfa_content (re_dfa_t *dfa)
600{
601 int i, j;
602
603 if (dfa->nodes)
604 for (i = 0; i < dfa->nodes_len; ++i)
605 free_token (dfa->nodes + i);
606 re_free (dfa->nexts);
607 for (i = 0; i < dfa->nodes_len; ++i)
608 {
609 if (dfa->eclosures != NULL)
610 re_node_set_free (dfa->eclosures + i);
611 if (dfa->inveclosures != NULL)
612 re_node_set_free (dfa->inveclosures + i);
613 if (dfa->edests != NULL)
614 re_node_set_free (dfa->edests + i);
615 }
616 re_free (dfa->edests);
617 re_free (dfa->eclosures);
618 re_free (dfa->inveclosures);
619 re_free (dfa->nodes);
620
621 if (dfa->state_table)
622 for (i = 0; i <= dfa->state_hash_mask; ++i)
623 {
624 struct re_state_table_entry *entry = dfa->state_table + i;
625 for (j = 0; j < entry->num; ++j)
626 {
627 re_dfastate_t *state = entry->array[j];
628 free_state (state);
629 }
630 re_free (entry->array);
631 }
632 re_free (dfa->state_table);
633#ifdef RE_ENABLE_I18N
634 if (dfa->sb_char != utf8_sb_map)
635 re_free (dfa->sb_char);
636#endif
637 re_free (dfa->subexp_map);
638#ifdef DEBUG
639 re_free (dfa->re_str);
640#endif
641
642 re_free (dfa);
643}
644
645
646/* Free dynamically allocated space used by PREG. */
647
648void
649regfree (regex_t *preg)
650{
651 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
652 if (BE (dfa != NULL, 1))
653 free_dfa_content (dfa);
654 preg->buffer = NULL;
655 preg->allocated = 0;
656
657 re_free (preg->fastmap);
658 preg->fastmap = NULL;
659
660 re_free (preg->translate);
661 preg->translate = NULL;
662}
663#ifdef _LIBC
664weak_alias (__regfree, regfree)
665#endif
666
/* Entry points compatible with 4.2 BSD regex library.  We don't define
   them unless specifically requested.  */

#if defined _REGEX_RE_COMP || defined _LIBC

/* BSD has one and only one pattern buffer.  */
static struct re_pattern_buffer re_comp_buf;

/* Compile S into the single BSD pattern buffer using the global syntax
   re_syntax_options.

   With a null S nothing is compiled: the call only checks that a
   previously compiled pattern exists.  Returns NULL on success, otherwise
   a (translated) error message.  The fastmap allocation is kept across
   recompilations.  */
char *
# ifdef _LIBC
/* Make these definitions weak in libc, so POSIX programs can redefine
   these names if they don't use our functions, and still use
   regcomp/regexec above without link errors.  */
weak_function
# endif
re_comp (const char *s)
{
  reg_errcode_t ret;
  char *fastmap;

  if (!s)
    {
      if (!re_comp_buf.buffer)
        /* Like the other returns below, this discards `const' when
           gettext yields a const string.  */
        return (char *) gettext ("No previous regular expression");
      return 0;
    }

  if (re_comp_buf.buffer)
    {
      /* Detach the fastmap so that __regfree does not release it, then
         reattach it to the cleared buffer.  */
      fastmap = re_comp_buf.fastmap;
      re_comp_buf.fastmap = NULL;
      __regfree (&re_comp_buf);
      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
      re_comp_buf.fastmap = fastmap;
    }

  if (re_comp_buf.fastmap == NULL)
    {
      re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
      if (re_comp_buf.fastmap == NULL)
        return (char *) gettext (__re_error_msgid
                                 + __re_error_msgid_idx[(int) REG_ESPACE]);
    }

  /* Since `re_exec' always passes NULL for the `regs' argument, we
     don't need to initialize the pattern buffer fields which affect it.  */

  /* Match anchors at newlines.  */
  re_comp_buf.newline_anchor = 1;

  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);

  if (!ret)
    return NULL;

  /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
  return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
}

#ifdef _LIBC
/* Release the BSD pattern buffer.  */
libc_freeres_fn (free_mem)
{
  __regfree (&re_comp_buf);
}
#endif

#endif /* _REGEX_RE_COMP */
735
736/* Internal entry point.
737 Compile the regular expression PATTERN, whose length is LENGTH.
738 SYNTAX indicate regular expression's syntax. */
739
740static reg_errcode_t
741re_compile_internal (regex_t *preg, const char * pattern, size_t length,
742 reg_syntax_t syntax)
743{
744 reg_errcode_t err = REG_NOERROR;
745 re_dfa_t *dfa;
746 re_string_t regexp;
747
748 /* Initialize the pattern buffer. */
749 preg->fastmap_accurate = 0;
750 preg->syntax = syntax;
751 preg->not_bol = preg->not_eol = 0;
752 preg->used = 0;
753 preg->re_nsub = 0;
754 preg->can_be_null = 0;
755 preg->regs_allocated = REGS_UNALLOCATED;
756
757 /* Initialize the dfa. */
758 dfa = (re_dfa_t *) preg->buffer;
759 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
760 {
761 /* If zero allocated, but buffer is non-null, try to realloc
762 enough space. This loses if buffer's address is bogus, but
763 that is the user's responsibility. If ->buffer is NULL this
764 is a simple allocation. */
765 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
766 if (dfa == NULL)
767 return REG_ESPACE;
768 preg->allocated = sizeof (re_dfa_t);
769 preg->buffer = (unsigned char *) dfa;
770 }
771 preg->used = sizeof (re_dfa_t);
772
773 err = init_dfa (dfa, length);
774 if (BE (err != REG_NOERROR, 0))
775 {
776 free_dfa_content (dfa);
777 preg->buffer = NULL;
778 preg->allocated = 0;
779 return err;
780 }
781#ifdef DEBUG
782 /* Note: length+1 will not overflow since it is checked in init_dfa. */
783 dfa->re_str = re_malloc (char, length + 1);
784 strncpy (dfa->re_str, pattern, length + 1);
785#endif
786
787 __libc_lock_init (dfa->lock);
788
789 err = re_string_construct (&regexp, pattern, length, preg->translate,
790 syntax & RE_ICASE, dfa);
791 if (BE (err != REG_NOERROR, 0))
792 {
793 re_compile_internal_free_return:
794 free_workarea_compile (preg);
795 re_string_destruct (&regexp);
796 free_dfa_content (dfa);
797 preg->buffer = NULL;
798 preg->allocated = 0;
799 return err;
800 }
801
802 /* Parse the regular expression, and build a structure tree. */
803 preg->re_nsub = 0;
804 dfa->str_tree = parse (&regexp, preg, syntax, &err);
805 if (BE (dfa->str_tree == NULL, 0))
806 goto re_compile_internal_free_return;
807
808 /* Analyze the tree and create the nfa. */
809 err = analyze (preg);
810 if (BE (err != REG_NOERROR, 0))
811 goto re_compile_internal_free_return;
812
813#ifdef RE_ENABLE_I18N
814 /* If possible, do searching in single byte encoding to speed things up. */
815 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
816 optimize_utf8 (dfa);
817#endif
818
819 /* Then create the initial state of the dfa. */
820 err = create_initial_state (dfa);
821
822 /* Release work areas. */
823 free_workarea_compile (preg);
824 re_string_destruct (&regexp);
825
826 if (BE (err != REG_NOERROR, 0))
827 {
828 free_dfa_content (dfa);
829 preg->buffer = NULL;
830 preg->allocated = 0;
831 }
832
833 return err;
834}
835
836/* Initialize DFA. We use the length of the regular expression PAT_LEN
837 as the initial length of some arrays. */
838
839static reg_errcode_t
840init_dfa (re_dfa_t *dfa, size_t pat_len)
841{
842 unsigned int table_size;
843#ifndef _LIBC
844 char *codeset_name;
845#endif
846
847 memset (dfa, '\0', sizeof (re_dfa_t));
848
849 /* Force allocation of str_tree_storage the first time. */
850 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
851
852 /* Avoid overflows. */
853 if (pat_len == SIZE_MAX)
854 return REG_ESPACE;
855
856 dfa->nodes_alloc = pat_len + 1;
857 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
858
859 /* table_size = 2 ^ ceil(log pat_len) */
860 for (table_size = 1; ; table_size <<= 1)
861 if (table_size > pat_len)
862 break;
863
864 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
865 dfa->state_hash_mask = table_size - 1;
866
867 dfa->mb_cur_max = MB_CUR_MAX;
868#ifdef _LIBC
869 if (dfa->mb_cur_max == 6
870 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
871 dfa->is_utf8 = 1;
872 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
873 != 0);
874#else
875# ifdef HAVE_LANGINFO_CODESET
876 codeset_name = nl_langinfo (CODESET);
877# else
878 codeset_name = getenv ("LC_ALL");
879 if (codeset_name == NULL || codeset_name[0] == '\0')
880 codeset_name = getenv ("LC_CTYPE");
881 if (codeset_name == NULL || codeset_name[0] == '\0')
882 codeset_name = getenv ("LANG");
883 if (codeset_name == NULL)
884 codeset_name = "";
885 else if (strchr (codeset_name, '.') != NULL)
886 codeset_name = strchr (codeset_name, '.') + 1;
887# endif
888
889 /* strcasecmp isn't a standard interface. brute force check */
890#if 0
891 if (strcasecmp (codeset_name, "UTF-8") == 0
892 || strcasecmp (codeset_name, "UTF8") == 0)
893 dfa->is_utf8 = 1;
894#else
895 if ( (codeset_name[0] == 'U' || codeset_name[0] == 'u')
896 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
897 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
898 && (codeset_name[3] == '-'
899 ? codeset_name[4] == '8' && codeset_name[5] == '\0'
900 : codeset_name[3] == '8' && codeset_name[4] == '\0'))
901 dfa->is_utf8 = 1;
902#endif
903
904 /* We check exhaustively in the loop below if this charset is a
905 superset of ASCII. */
906 dfa->map_notascii = 0;
907#endif
908
909#ifdef RE_ENABLE_I18N
910 if (dfa->mb_cur_max > 1)
911 {
912 if (dfa->is_utf8)
913 {
914#if !defined(__GNUC__) || __GNUC__ < 3
915 static short utf8_sb_map_inited = 0;
916
917 if (! utf8_sb_map_inited)
918 {
919 int i;
920
921 utf8_sb_map_inited = 0;
922 for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
923 utf8_sb_map[i] = BITSET_WORD_MAX;
924 }
925#endif
926 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
927 }
928 else
929 {
930 int i, j, ch;
931
932 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
933 if (BE (dfa->sb_char == NULL, 0))
934 return REG_ESPACE;
935
936 /* Set the bits corresponding to single byte chars. */
937 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
938 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
939 {
940 wint_t wch = __btowc (ch);
941 if (wch != WEOF)
942 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
943# ifndef _LIBC
944 if (isascii (ch) && wch != ch)
945 dfa->map_notascii = 1;
946# endif
947 }
948 }
949 }
950#endif
951
952 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
953 return REG_ESPACE;
954 return REG_NOERROR;
955}
956
957/* Initialize WORD_CHAR table, which indicate which character is
958 "word". In this case "word" means that it is the word construction
959 character used by some operators like "\<", "\>", etc. */
960
961static void
962internal_function
963init_word_char (re_dfa_t *dfa)
964{
965 int i, j, ch;
966 dfa->word_ops_used = 1;
967 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
968 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
969 if (isalnum (ch) || ch == '_')
970 dfa->word_char[i] |= (bitset_word_t) 1 << j;
971}
972
973/* Free the work area which are only used while compiling. */
974
975static void
976free_workarea_compile (regex_t *preg)
977{
978 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
979 bin_tree_storage_t *storage, *next;
980 for (storage = dfa->str_tree_storage; storage; storage = next)
981 {
982 next = storage->next;
983 re_free (storage);
984 }
985 dfa->str_tree_storage = NULL;
986 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
987 dfa->str_tree = NULL;
988 re_free (dfa->org_indices);
989 dfa->org_indices = NULL;
990}
991
992/* Create initial states for all contexts. */
993
994static reg_errcode_t
995create_initial_state (re_dfa_t *dfa)
996{
997 int first, i;
998 reg_errcode_t err;
999 re_node_set init_nodes;
1000
1001 /* Initial states have the epsilon closure of the node which is
1002 the first node of the regular expression. */
1003 first = dfa->str_tree->first->node_idx;
1004 dfa->init_node = first;
1005 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1006 if (BE (err != REG_NOERROR, 0))
1007 return err;
1008
1009 /* The back-references which are in initial states can epsilon transit,
1010 since in this case all of the subexpressions can be null.
1011 Then we add epsilon closures of the nodes which are the next nodes of
1012 the back-references. */
1013 if (dfa->nbackref > 0)
1014 for (i = 0; i < init_nodes.nelem; ++i)
1015 {
1016 int node_idx = init_nodes.elems[i];
1017 re_token_type_t type = dfa->nodes[node_idx].type;
1018
1019 int clexp_idx;
1020 if (type != OP_BACK_REF)
1021 continue;
1022 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1023 {
1024 re_token_t *clexp_node;
1025 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1026 if (clexp_node->type == OP_CLOSE_SUBEXP
1027 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1028 break;
1029 }
1030 if (clexp_idx == init_nodes.nelem)
1031 continue;
1032
1033 if (type == OP_BACK_REF)
1034 {
1035 int dest_idx = dfa->edests[node_idx].elems[0];
1036 if (!re_node_set_contains (&init_nodes, dest_idx))
1037 {
1038 reg_errcode_t err = re_node_set_merge (&init_nodes,
1039 dfa->eclosures
1040 + dest_idx);
1041 if (err != REG_NOERROR)
1042 return err;
1043 i = 0;
1044 }
1045 }
1046 }
1047
1048 /* It must be the first time to invoke acquire_state. */
1049 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1050 /* We don't check ERR here, since the initial state must not be NULL. */
1051 if (BE (dfa->init_state == NULL, 0))
1052 return err;
1053 if (dfa->init_state->has_constraint)
1054 {
1055 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1056 CONTEXT_WORD);
1057 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1058 CONTEXT_NEWLINE);
1059 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1060 &init_nodes,
1061 CONTEXT_NEWLINE
1062 | CONTEXT_BEGBUF);
1063 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1064 || dfa->init_state_begbuf == NULL, 0))
1065 return err;
1066 }
1067 else
1068 dfa->init_state_word = dfa->init_state_nl
1069 = dfa->init_state_begbuf = dfa->init_state;
1070
1071 re_node_set_free (&init_nodes);
1072 return REG_NOERROR;
1073}
1074
1075#ifdef RE_ENABLE_I18N
1076/* If it is possible to do searching in single byte encoding instead of UTF-8
1077 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1078 DFA nodes where needed. */
1079
1080static void
1081optimize_utf8 (re_dfa_t *dfa)
1082{
1083 int node, i, mb_chars = 0, has_period = 0;
1084
1085 for (node = 0; node < dfa->nodes_len; ++node)
1086 switch (dfa->nodes[node].type)
1087 {
1088 case CHARACTER:
1089 if (dfa->nodes[node].opr.c >= 0x80)
1090 mb_chars = 1;
1091 break;
1092 case ANCHOR:
1093 switch (dfa->nodes[node].opr.ctx_type)
1094 {
1095 case LINE_FIRST:
1096 case LINE_LAST:
1097 case BUF_FIRST:
1098 case BUF_LAST:
1099 break;
1100 default:
1101 /* Word anchors etc. cannot be handled. It's okay to test
1102 opr.ctx_type since constraints (for all DFA nodes) are
1103 created by ORing one or more opr.ctx_type values. */
1104 return;
1105 }
1106 break;
1107 case OP_PERIOD:
1108 has_period = 1;
1109 break;
1110 case OP_BACK_REF:
1111 case OP_ALT:
1112 case END_OF_RE:
1113 case OP_DUP_ASTERISK:
1114 case OP_OPEN_SUBEXP:
1115 case OP_CLOSE_SUBEXP:
1116 break;
1117 case COMPLEX_BRACKET:
1118 return;
1119 case SIMPLE_BRACKET:
1120 /* Just double check. The non-ASCII range starts at 0x80. */
1121 assert (0x80 % BITSET_WORD_BITS == 0);
1122 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1123 if (dfa->nodes[node].opr.sbcset[i])
1124 return;
1125 break;
1126 default:
1127 abort ();
1128 }
1129
1130 if (mb_chars || has_period)
1131 for (node = 0; node < dfa->nodes_len; ++node)
1132 {
1133 if (dfa->nodes[node].type == CHARACTER
1134 && dfa->nodes[node].opr.c >= 0x80)
1135 dfa->nodes[node].mb_partial = 0;
1136 else if (dfa->nodes[node].type == OP_PERIOD)
1137 dfa->nodes[node].type = OP_UTF8_PERIOD;
1138 }
1139
1140 /* The search can be in single byte locale. */
1141 dfa->mb_cur_max = 1;
1142 dfa->is_utf8 = 0;
1143 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1144}
1145#endif
1146
1147/* Analyze the structure tree, and calculate "first", "next", "edest",
1148 "eclosure", and "inveclosure". */
1149
1150static reg_errcode_t
1151analyze (regex_t *preg)
1152{
1153 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1154 reg_errcode_t ret;
1155
1156 /* Allocate arrays. */
1157 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1158 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1159 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1160 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1161 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1162 || dfa->eclosures == NULL, 0))
1163 return REG_ESPACE;
1164
1165 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1166 if (dfa->subexp_map != NULL)
1167 {
1168 int i;
1169 for (i = 0; i < preg->re_nsub; i++)
1170 dfa->subexp_map[i] = i;
1171 preorder (dfa->str_tree, optimize_subexps, dfa);
1172 for (i = 0; i < preg->re_nsub; i++)
1173 if (dfa->subexp_map[i] != i)
1174 break;
1175 if (i == preg->re_nsub)
1176 {
1177 free (dfa->subexp_map);
1178 dfa->subexp_map = NULL;
1179 }
1180 }
1181
1182 ret = postorder (dfa->str_tree, lower_subexps, preg);
1183 if (BE (ret != REG_NOERROR, 0))
1184 return ret;
1185 ret = postorder (dfa->str_tree, calc_first, dfa);
1186 if (BE (ret != REG_NOERROR, 0))
1187 return ret;
1188 preorder (dfa->str_tree, calc_next, dfa);
1189 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1190 if (BE (ret != REG_NOERROR, 0))
1191 return ret;
1192 ret = calc_eclosure (dfa);
1193 if (BE (ret != REG_NOERROR, 0))
1194 return ret;
1195
1196 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1197 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1198 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1199 || dfa->nbackref)
1200 {
1201 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1202 if (BE (dfa->inveclosures == NULL, 0))
1203 return REG_ESPACE;
1204 ret = calc_inveclosure (dfa);
1205 }
1206
1207 return ret;
1208}
1209
1210/* Our parse trees are very unbalanced, so we cannot use a stack to
1211 implement parse tree visits. Instead, we use parent pointers and
1212 some hairy code in these two functions. */
1213static reg_errcode_t
1214postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1215 void *extra)
1216{
1217 bin_tree_t *node, *prev;
1218
1219 for (node = root; ; )
1220 {
1221 /* Descend down the tree, preferably to the left (or to the right
1222 if that's the only child). */
1223 while (node->left || node->right)
1224 if (node->left)
1225 node = node->left;
1226 else
1227 node = node->right;
1228
1229 do
1230 {
1231 reg_errcode_t err = fn (extra, node);
1232 if (BE (err != REG_NOERROR, 0))
1233 return err;
1234 if (node->parent == NULL)
1235 return REG_NOERROR;
1236 prev = node;
1237 node = node->parent;
1238 }
1239 /* Go up while we have a node that is reached from the right. */
1240 while (node->right == prev || node->right == NULL);
1241 node = node->right;
1242 }
1243}
1244
1245static reg_errcode_t
1246preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1247 void *extra)
1248{
1249 bin_tree_t *node;
1250
1251 for (node = root; ; )
1252 {
1253 reg_errcode_t err = fn (extra, node);
1254 if (BE (err != REG_NOERROR, 0))
1255 return err;
1256
1257 /* Go to the left node, or up and to the right. */
1258 if (node->left)
1259 node = node->left;
1260 else
1261 {
1262 bin_tree_t *prev = NULL;
1263 while (node->right == prev || node->right == NULL)
1264 {
1265 prev = node;
1266 node = node->parent;
1267 if (!node)
1268 return REG_NOERROR;
1269 }
1270 node = node->right;
1271 }
1272 }
1273}
1274
1275/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1276 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1277 backreferences as well. Requires a preorder visit. */
1278static reg_errcode_t
1279optimize_subexps (void *extra, bin_tree_t *node)
1280{
1281 re_dfa_t *dfa = (re_dfa_t *) extra;
1282
1283 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1284 {
1285 int idx = node->token.opr.idx;
1286 node->token.opr.idx = dfa->subexp_map[idx];
1287 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1288 }
1289
1290 else if (node->token.type == SUBEXP
1291 && node->left && node->left->token.type == SUBEXP)
1292 {
1293 int other_idx = node->left->token.opr.idx;
1294
1295 node->left = node->left->left;
1296 if (node->left)
1297 node->left->parent = node;
1298
1299 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1300 if (other_idx < BITSET_WORD_BITS)
1301 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1302 }
1303
1304 return REG_NOERROR;
1305}
1306
1307/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1308 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1309static reg_errcode_t
1310lower_subexps (void *extra, bin_tree_t *node)
1311{
1312 regex_t *preg = (regex_t *) extra;
1313 reg_errcode_t err = REG_NOERROR;
1314
1315 if (node->left && node->left->token.type == SUBEXP)
1316 {
1317 node->left = lower_subexp (&err, preg, node->left);
1318 if (node->left)
1319 node->left->parent = node;
1320 }
1321 if (node->right && node->right->token.type == SUBEXP)
1322 {
1323 node->right = lower_subexp (&err, preg, node->right);
1324 if (node->right)
1325 node->right->parent = node;
1326 }
1327
1328 return err;
1329}
1330
1331static bin_tree_t *
1332lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1333{
1334 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1335 bin_tree_t *body = node->left;
1336 bin_tree_t *op, *cls, *tree1, *tree;
1337
1338 if (preg->no_sub
1339 /* We do not optimize empty subexpressions, because otherwise we may
1340 have bad CONCAT nodes with NULL children. This is obviously not
1341 very common, so we do not lose much. An example that triggers
1342 this case is the sed "script" /\(\)/x. */
1343 && node->left != NULL
1344 && (node->token.opr.idx >= BITSET_WORD_BITS
1345 || !(dfa->used_bkref_map
1346 & ((bitset_word_t) 1 << node->token.opr.idx))))
1347 return node->left;
1348
1349 /* Convert the SUBEXP node to the concatenation of an
1350 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1351 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1352 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1353 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1354 tree = create_tree (dfa, op, tree1, CONCAT);
1355 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1356 {
1357 *err = REG_ESPACE;
1358 return NULL;
1359 }
1360
1361 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1362 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1363 return tree;
1364}
1365
1366/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1367 nodes. Requires a postorder visit. */
1368static reg_errcode_t
1369calc_first (void *extra, bin_tree_t *node)
1370{
1371 re_dfa_t *dfa = (re_dfa_t *) extra;
1372 if (node->token.type == CONCAT)
1373 {
1374 node->first = node->left->first;
1375 node->node_idx = node->left->node_idx;
1376 }
1377 else
1378 {
1379 node->first = node;
1380 node->node_idx = re_dfa_add_node (dfa, node->token);
1381 if (BE (node->node_idx == -1, 0))
1382 return REG_ESPACE;
1383 if (node->token.type == ANCHOR)
1384 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1385 }
1386 return REG_NOERROR;
1387}
1388
1389/* Pass 2: compute NEXT on the tree. Preorder visit. */
1390static reg_errcode_t
1391calc_next (void *extra, bin_tree_t *node)
1392{
1393 switch (node->token.type)
1394 {
1395 case OP_DUP_ASTERISK:
1396 node->left->next = node;
1397 break;
1398 case CONCAT:
1399 node->left->next = node->right->first;
1400 node->right->next = node->next;
1401 break;
1402 default:
1403 if (node->left)
1404 node->left->next = node->next;
1405 if (node->right)
1406 node->right->next = node->next;
1407 break;
1408 }
1409 return REG_NOERROR;
1410}
1411
1412/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1413static reg_errcode_t
1414link_nfa_nodes (void *extra, bin_tree_t *node)
1415{
1416 re_dfa_t *dfa = (re_dfa_t *) extra;
1417 int idx = node->node_idx;
1418 reg_errcode_t err = REG_NOERROR;
1419
1420 switch (node->token.type)
1421 {
1422 case CONCAT:
1423 break;
1424
1425 case END_OF_RE:
1426 assert (node->next == NULL);
1427 break;
1428
1429 case OP_DUP_ASTERISK:
1430 case OP_ALT:
1431 {
1432 int left, right;
1433 dfa->has_plural_match = 1;
1434 if (node->left != NULL)
1435 left = node->left->first->node_idx;
1436 else
1437 left = node->next->node_idx;
1438 if (node->right != NULL)
1439 right = node->right->first->node_idx;
1440 else
1441 right = node->next->node_idx;
1442 assert (left > -1);
1443 assert (right > -1);
1444 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1445 }
1446 break;
1447
1448 case ANCHOR:
1449 case OP_OPEN_SUBEXP:
1450 case OP_CLOSE_SUBEXP:
1451 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1452 break;
1453
1454 case OP_BACK_REF:
1455 dfa->nexts[idx] = node->next->node_idx;
1456 if (node->token.type == OP_BACK_REF)
1457 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1458 break;
1459
1460 default:
1461 assert (!IS_EPSILON_NODE (node->token.type));
1462 dfa->nexts[idx] = node->next->node_idx;
1463 break;
1464 }
1465
1466 return err;
1467}
1468
1469/* Duplicate the epsilon closure of the node ROOT_NODE.
1470 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1471 to their own constraint. */
1472
1473static reg_errcode_t
1474internal_function
1475duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1476 int root_node, unsigned int init_constraint)
1477{
1478 int org_node, clone_node, ret;
1479 unsigned int constraint = init_constraint;
1480 for (org_node = top_org_node, clone_node = top_clone_node;;)
1481 {
1482 int org_dest, clone_dest;
1483 if (dfa->nodes[org_node].type == OP_BACK_REF)
1484 {
1485 /* If the back reference epsilon-transit, its destination must
1486 also have the constraint. Then duplicate the epsilon closure
1487 of the destination of the back reference, and store it in
1488 edests of the back reference. */
1489 org_dest = dfa->nexts[org_node];
1490 re_node_set_empty (dfa->edests + clone_node);
1491 clone_dest = duplicate_node (dfa, org_dest, constraint);
1492 if (BE (clone_dest == -1, 0))
1493 return REG_ESPACE;
1494 dfa->nexts[clone_node] = dfa->nexts[org_node];
1495 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1496 if (BE (ret < 0, 0))
1497 return REG_ESPACE;
1498 }
1499 else if (dfa->edests[org_node].nelem == 0)
1500 {
1501 /* In case of the node can't epsilon-transit, don't duplicate the
1502 destination and store the original destination as the
1503 destination of the node. */
1504 dfa->nexts[clone_node] = dfa->nexts[org_node];
1505 break;
1506 }
1507 else if (dfa->edests[org_node].nelem == 1)
1508 {
1509 /* In case of the node can epsilon-transit, and it has only one
1510 destination. */
1511 org_dest = dfa->edests[org_node].elems[0];
1512 re_node_set_empty (dfa->edests + clone_node);
1513 /* If the node is root_node itself, it means the epsilon clsoure
1514 has a loop. Then tie it to the destination of the root_node. */
1515 if (org_node == root_node && clone_node != org_node)
1516 {
1517 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1518 if (BE (ret < 0, 0))
1519 return REG_ESPACE;
1520 break;
1521 }
1522 /* In case of the node has another constraint, add it. */
1523 constraint |= dfa->nodes[org_node].constraint;
1524 clone_dest = duplicate_node (dfa, org_dest, constraint);
1525 if (BE (clone_dest == -1, 0))
1526 return REG_ESPACE;
1527 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1528 if (BE (ret < 0, 0))
1529 return REG_ESPACE;
1530 }
1531 else /* dfa->edests[org_node].nelem == 2 */
1532 {
1533 /* In case of the node can epsilon-transit, and it has two
1534 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1535 org_dest = dfa->edests[org_node].elems[0];
1536 re_node_set_empty (dfa->edests + clone_node);
1537 /* Search for a duplicated node which satisfies the constraint. */
1538 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1539 if (clone_dest == -1)
1540 {
1541 /* There is no such duplicated node, create a new one. */
1542 reg_errcode_t err;
1543 clone_dest = duplicate_node (dfa, org_dest, constraint);
1544 if (BE (clone_dest == -1, 0))
1545 return REG_ESPACE;
1546 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1547 if (BE (ret < 0, 0))
1548 return REG_ESPACE;
1549 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1550 root_node, constraint);
1551 if (BE (err != REG_NOERROR, 0))
1552 return err;
1553 }
1554 else
1555 {
1556 /* There is a duplicated node which satisfies the constraint,
1557 use it to avoid infinite loop. */
1558 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1559 if (BE (ret < 0, 0))
1560 return REG_ESPACE;
1561 }
1562
1563 org_dest = dfa->edests[org_node].elems[1];
1564 clone_dest = duplicate_node (dfa, org_dest, constraint);
1565 if (BE (clone_dest == -1, 0))
1566 return REG_ESPACE;
1567 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1568 if (BE (ret < 0, 0))
1569 return REG_ESPACE;
1570 }
1571 org_node = org_dest;
1572 clone_node = clone_dest;
1573 }
1574 return REG_NOERROR;
1575}
1576
1577/* Search for a node which is duplicated from the node ORG_NODE, and
1578 satisfies the constraint CONSTRAINT. */
1579
1580static int
1581search_duplicated_node (const re_dfa_t *dfa, int org_node,
1582 unsigned int constraint)
1583{
1584 int idx;
1585 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1586 {
1587 if (org_node == dfa->org_indices[idx]
1588 && constraint == dfa->nodes[idx].constraint)
1589 return idx; /* Found. */
1590 }
1591 return -1; /* Not found. */
1592}
1593
1594/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1595 Return the index of the new node, or -1 if insufficient storage is
1596 available. */
1597
1598static int
1599duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1600{
1601 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1602 if (BE (dup_idx != -1, 1))
1603 {
1604 dfa->nodes[dup_idx].constraint = constraint;
1605 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1606 dfa->nodes[dup_idx].duplicated = 1;
1607
1608 /* Store the index of the original node. */
1609 dfa->org_indices[dup_idx] = org_idx;
1610 }
1611 return dup_idx;
1612}
1613
1614static reg_errcode_t
1615calc_inveclosure (re_dfa_t *dfa)
1616{
1617 int src, idx, ret;
1618 for (idx = 0; idx < dfa->nodes_len; ++idx)
1619 re_node_set_init_empty (dfa->inveclosures + idx);
1620
1621 for (src = 0; src < dfa->nodes_len; ++src)
1622 {
1623 int *elems = dfa->eclosures[src].elems;
1624 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1625 {
1626 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1627 if (BE (ret == -1, 0))
1628 return REG_ESPACE;
1629 }
1630 }
1631
1632 return REG_NOERROR;
1633}
1634
1635/* Calculate "eclosure" for all the node in DFA. */
1636
1637static reg_errcode_t
1638calc_eclosure (re_dfa_t *dfa)
1639{
1640 int node_idx, incomplete;
1641#ifdef DEBUG
1642 assert (dfa->nodes_len > 0);
1643#endif
1644 incomplete = 0;
1645 /* For each nodes, calculate epsilon closure. */
1646 for (node_idx = 0; ; ++node_idx)
1647 {
1648 reg_errcode_t err;
1649 re_node_set eclosure_elem;
1650 if (node_idx == dfa->nodes_len)
1651 {
1652 if (!incomplete)
1653 break;
1654 incomplete = 0;
1655 node_idx = 0;
1656 }
1657
1658#ifdef DEBUG
1659 assert (dfa->eclosures[node_idx].nelem != -1);
1660#endif
1661
1662 /* If we have already calculated, skip it. */
1663 if (dfa->eclosures[node_idx].nelem != 0)
1664 continue;
1665 /* Calculate epsilon closure of `node_idx'. */
1666 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1667 if (BE (err != REG_NOERROR, 0))
1668 return err;
1669
1670 if (dfa->eclosures[node_idx].nelem == 0)
1671 {
1672 incomplete = 1;
1673 re_node_set_free (&eclosure_elem);
1674 }
1675 }
1676 return REG_NOERROR;
1677}
1678
1679/* Calculate epsilon closure of NODE. */
1680
1681static reg_errcode_t
1682calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1683{
1684 reg_errcode_t err;
1685 int i;
1686 re_node_set eclosure;
1687 int ret;
1688 int incomplete = 0;
1689 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1690 if (BE (err != REG_NOERROR, 0))
1691 return err;
1692
1693 /* This indicates that we are calculating this node now.
1694 We reference this value to avoid infinite loop. */
1695 dfa->eclosures[node].nelem = -1;
1696
1697 /* If the current node has constraints, duplicate all nodes
1698 since they must inherit the constraints. */
1699 if (dfa->nodes[node].constraint
1700 && dfa->edests[node].nelem
1701 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1702 {
1703 err = duplicate_node_closure (dfa, node, node, node,
1704 dfa->nodes[node].constraint);
1705 if (BE (err != REG_NOERROR, 0))
1706 return err;
1707 }
1708
1709 /* Expand each epsilon destination nodes. */
1710 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1711 for (i = 0; i < dfa->edests[node].nelem; ++i)
1712 {
1713 re_node_set eclosure_elem;
1714 int edest = dfa->edests[node].elems[i];
1715 /* If calculating the epsilon closure of `edest' is in progress,
1716 return intermediate result. */
1717 if (dfa->eclosures[edest].nelem == -1)
1718 {
1719 incomplete = 1;
1720 continue;
1721 }
1722 /* If we haven't calculated the epsilon closure of `edest' yet,
1723 calculate now. Otherwise use calculated epsilon closure. */
1724 if (dfa->eclosures[edest].nelem == 0)
1725 {
1726 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1727 if (BE (err != REG_NOERROR, 0))
1728 return err;
1729 }
1730 else
1731 eclosure_elem = dfa->eclosures[edest];
1732 /* Merge the epsilon closure of `edest'. */
1733 err = re_node_set_merge (&eclosure, &eclosure_elem);
1734 if (BE (err != REG_NOERROR, 0))
1735 return err;
1736 /* If the epsilon closure of `edest' is incomplete,
1737 the epsilon closure of this node is also incomplete. */
1738 if (dfa->eclosures[edest].nelem == 0)
1739 {
1740 incomplete = 1;
1741 re_node_set_free (&eclosure_elem);
1742 }
1743 }
1744
1745 /* An epsilon closure includes itself. */
1746 ret = re_node_set_insert (&eclosure, node);
1747 if (BE (ret < 0, 0))
1748 return REG_ESPACE;
1749 if (incomplete && !root)
1750 dfa->eclosures[node].nelem = 0;
1751 else
1752 dfa->eclosures[node] = eclosure;
1753 *new_set = eclosure;
1754 return REG_NOERROR;
1755}
1756
1757/* Functions for token which are used in the parser. */
1758
1759/* Fetch a token from INPUT.
1760 We must not use this function inside bracket expressions. */
1761
1762static void
1763internal_function
1764fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1765{
1766 re_string_skip_bytes (input, peek_token (result, input, syntax));
1767}
1768
1769/* Peek a token from INPUT, and return the length of the token.
1770 We must not use this function inside bracket expressions. */
1771
1772static int
1773internal_function
1774peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1775{
1776 unsigned char c;
1777
1778 if (re_string_eoi (input))
1779 {
1780 token->type = END_OF_RE;
1781 return 0;
1782 }
1783
1784 c = re_string_peek_byte (input, 0);
1785 token->opr.c = c;
1786
1787 token->word_char = 0;
1788#ifdef RE_ENABLE_I18N
1789 token->mb_partial = 0;
1790 if (input->mb_cur_max > 1 &&
1791 !re_string_first_byte (input, re_string_cur_idx (input)))
1792 {
1793 token->type = CHARACTER;
1794 token->mb_partial = 1;
1795 return 1;
1796 }
1797#endif
1798 if (c == '\\')
1799 {
1800 unsigned char c2;
1801 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1802 {
1803 token->type = BACK_SLASH;
1804 return 1;
1805 }
1806
1807 c2 = re_string_peek_byte_case (input, 1);
1808 token->opr.c = c2;
1809 token->type = CHARACTER;
1810#ifdef RE_ENABLE_I18N
1811 if (input->mb_cur_max > 1)
1812 {
1813 wint_t wc = re_string_wchar_at (input,
1814 re_string_cur_idx (input) + 1);
1815 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1816 }
1817 else
1818#endif
1819 token->word_char = IS_WORD_CHAR (c2) != 0;
1820
1821 switch (c2)
1822 {
1823 case '|':
1824 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1825 token->type = OP_ALT;
1826 break;
1827 case '1': case '2': case '3': case '4': case '5':
1828 case '6': case '7': case '8': case '9':
1829 if (!(syntax & RE_NO_BK_REFS))
1830 {
1831 token->type = OP_BACK_REF;
1832 token->opr.idx = c2 - '1';
1833 }
1834 break;
1835 case '<':
1836 if (!(syntax & RE_NO_GNU_OPS))
1837 {
1838 token->type = ANCHOR;
1839 token->opr.ctx_type = WORD_FIRST;
1840 }
1841 break;
1842 case '>':
1843 if (!(syntax & RE_NO_GNU_OPS))
1844 {
1845 token->type = ANCHOR;
1846 token->opr.ctx_type = WORD_LAST;
1847 }
1848 break;
1849 case 'b':
1850 if (!(syntax & RE_NO_GNU_OPS))
1851 {
1852 token->type = ANCHOR;
1853 token->opr.ctx_type = WORD_DELIM;
1854 }
1855 break;
1856 case 'B':
1857 if (!(syntax & RE_NO_GNU_OPS))
1858 {
1859 token->type = ANCHOR;
1860 token->opr.ctx_type = NOT_WORD_DELIM;
1861 }
1862 break;
1863 case 'w':
1864 if (!(syntax & RE_NO_GNU_OPS))
1865 token->type = OP_WORD;
1866 break;
1867 case 'W':
1868 if (!(syntax & RE_NO_GNU_OPS))
1869 token->type = OP_NOTWORD;
1870 break;
1871 case 's':
1872 if (!(syntax & RE_NO_GNU_OPS))
1873 token->type = OP_SPACE;
1874 break;
1875 case 'S':
1876 if (!(syntax & RE_NO_GNU_OPS))
1877 token->type = OP_NOTSPACE;
1878 break;
1879 case '`':
1880 if (!(syntax & RE_NO_GNU_OPS))
1881 {
1882 token->type = ANCHOR;
1883 token->opr.ctx_type = BUF_FIRST;
1884 }
1885 break;
1886 case '\'':
1887 if (!(syntax & RE_NO_GNU_OPS))
1888 {
1889 token->type = ANCHOR;
1890 token->opr.ctx_type = BUF_LAST;
1891 }
1892 break;
1893 case '(':
1894 if (!(syntax & RE_NO_BK_PARENS))
1895 token->type = OP_OPEN_SUBEXP;
1896 break;
1897 case ')':
1898 if (!(syntax & RE_NO_BK_PARENS))
1899 token->type = OP_CLOSE_SUBEXP;
1900 break;
1901 case '+':
1902 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1903 token->type = OP_DUP_PLUS;
1904 break;
1905 case '?':
1906 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1907 token->type = OP_DUP_QUESTION;
1908 break;
1909 case '{':
1910 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1911 token->type = OP_OPEN_DUP_NUM;
1912 break;
1913 case '}':
1914 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1915 token->type = OP_CLOSE_DUP_NUM;
1916 break;
1917 default:
1918 break;
1919 }
1920 return 2;
1921 }
1922
1923 token->type = CHARACTER;
1924#ifdef RE_ENABLE_I18N
1925 if (input->mb_cur_max > 1)
1926 {
1927 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1928 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1929 }
1930 else
1931#endif
1932 token->word_char = IS_WORD_CHAR (token->opr.c);
1933
1934 switch (c)
1935 {
1936 case '\n':
1937 if (syntax & RE_NEWLINE_ALT)
1938 token->type = OP_ALT;
1939 break;
1940 case '|':
1941 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1942 token->type = OP_ALT;
1943 break;
1944 case '*':
1945 token->type = OP_DUP_ASTERISK;
1946 break;
1947 case '+':
1948 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1949 token->type = OP_DUP_PLUS;
1950 break;
1951 case '?':
1952 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1953 token->type = OP_DUP_QUESTION;
1954 break;
1955 case '{':
1956 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1957 token->type = OP_OPEN_DUP_NUM;
1958 break;
1959 case '}':
1960 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1961 token->type = OP_CLOSE_DUP_NUM;
1962 break;
1963 case '(':
1964 if (syntax & RE_NO_BK_PARENS)
1965 token->type = OP_OPEN_SUBEXP;
1966 break;
1967 case ')':
1968 if (syntax & RE_NO_BK_PARENS)
1969 token->type = OP_CLOSE_SUBEXP;
1970 break;
1971 case '[':
1972 token->type = OP_OPEN_BRACKET;
1973 break;
1974 case '.':
1975 token->type = OP_PERIOD;
1976 break;
1977 case '^':
1978 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1979 re_string_cur_idx (input) != 0)
1980 {
1981 char prev = re_string_peek_byte (input, -1);
1982 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1983 break;
1984 }
1985 token->type = ANCHOR;
1986 token->opr.ctx_type = LINE_FIRST;
1987 break;
1988 case '$':
1989 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1990 re_string_cur_idx (input) + 1 != re_string_length (input))
1991 {
1992 re_token_t next;
1993 re_string_skip_bytes (input, 1);
1994 peek_token (&next, input, syntax);
1995 re_string_skip_bytes (input, -1);
1996 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1997 break;
1998 }
1999 token->type = ANCHOR;
2000 token->opr.ctx_type = LINE_LAST;
2001 break;
2002 default:
2003 break;
2004 }
2005 return 1;
2006}
2007
2008/* Peek a token from INPUT, and return the length of the token.
2009 We must not use this function out of bracket expressions. */
2010
2011static int
2012internal_function
2013peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2014{
2015 unsigned char c;
2016 if (re_string_eoi (input))
2017 {
2018 token->type = END_OF_RE;
2019 return 0;
2020 }
2021 c = re_string_peek_byte (input, 0);
2022 token->opr.c = c;
2023
2024#ifdef RE_ENABLE_I18N
2025 if (input->mb_cur_max > 1 &&
2026 !re_string_first_byte (input, re_string_cur_idx (input)))
2027 {
2028 token->type = CHARACTER;
2029 return 1;
2030 }
2031#endif /* RE_ENABLE_I18N */
2032
2033 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2034 && re_string_cur_idx (input) + 1 < re_string_length (input))
2035 {
2036 /* In this case, '\' escape a character. */
2037 unsigned char c2;
2038 re_string_skip_bytes (input, 1);
2039 c2 = re_string_peek_byte (input, 0);
2040 token->opr.c = c2;
2041 token->type = CHARACTER;
2042 return 1;
2043 }
2044 if (c == '[') /* '[' is a special char in a bracket exps. */
2045 {
2046 unsigned char c2;
2047 int token_len;
2048 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2049 c2 = re_string_peek_byte (input, 1);
2050 else
2051 c2 = 0;
2052 token->opr.c = c2;
2053 token_len = 2;
2054 switch (c2)
2055 {
2056 case '.':
2057 token->type = OP_OPEN_COLL_ELEM;
2058 break;
2059 case '=':
2060 token->type = OP_OPEN_EQUIV_CLASS;
2061 break;
2062 case ':':
2063 if (syntax & RE_CHAR_CLASSES)
2064 {
2065 token->type = OP_OPEN_CHAR_CLASS;
2066 break;
2067 }
2068 /* else fall through. */
2069 default:
2070 token->type = CHARACTER;
2071 token->opr.c = c;
2072 token_len = 1;
2073 break;
2074 }
2075 return token_len;
2076 }
2077 switch (c)
2078 {
2079 case '-':
2080 token->type = OP_CHARSET_RANGE;
2081 break;
2082 case ']':
2083 token->type = OP_CLOSE_BRACKET;
2084 break;
2085 case '^':
2086 token->type = OP_NON_MATCH_LIST;
2087 break;
2088 default:
2089 token->type = CHARACTER;
2090 }
2091 return 1;
2092}
2093
2094/* Functions for parser. */
2095
2096/* Entry point of the parser.
2097 Parse the regular expression REGEXP and return the structure tree.
2098 If an error has occurred, ERR is set by error code, and return NULL.
2099 This function build the following tree, from regular expression <reg_exp>:
2100 CAT
2101 / \
2102 / \
2103 <reg_exp> EOR
2104
2105 CAT means concatenation.
2106 EOR means end of regular expression. */
2107
2108static bin_tree_t *
2109parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2110 reg_errcode_t *err)
2111{
2112 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2113 bin_tree_t *tree, *eor, *root;
2114 re_token_t current_token;
2115 dfa->syntax = syntax;
2116 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2117 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2118 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2119 return NULL;
2120 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2121 if (tree != NULL)
2122 root = create_tree (dfa, tree, eor, CONCAT);
2123 else
2124 root = eor;
2125 if (BE (eor == NULL || root == NULL, 0))
2126 {
2127 *err = REG_ESPACE;
2128 return NULL;
2129 }
2130 return root;
2131}
2132
2133/* This function build the following tree, from regular expression
2134 <branch1>|<branch2>:
2135 ALT
2136 / \
2137 / \
2138 <branch1> <branch2>
2139
2140 ALT means alternative, which represents the operator `|'. */
2141
2142static bin_tree_t *
2143parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2144 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2145{
2146 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2147 bin_tree_t *tree, *branch = NULL;
2148 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2149 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2150 return NULL;
2151
2152 while (token->type == OP_ALT)
2153 {
2154 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2155 if (token->type != OP_ALT && token->type != END_OF_RE
2156 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2157 {
2158 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2159 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2160 return NULL;
2161 }
2162 else
2163 branch = NULL;
2164 tree = create_tree (dfa, tree, branch, OP_ALT);
2165 if (BE (tree == NULL, 0))
2166 {
2167 *err = REG_ESPACE;
2168 return NULL;
2169 }
2170 }
2171 return tree;
2172}
2173
2174/* This function build the following tree, from regular expression
2175 <exp1><exp2>:
2176 CAT
2177 / \
2178 / \
2179 <exp1> <exp2>
2180
2181 CAT means concatenation. */
2182
2183static bin_tree_t *
2184parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2185 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2186{
2187 bin_tree_t *tree, *exp;
2188 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2189 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2190 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2191 return NULL;
2192
2193 while (token->type != OP_ALT && token->type != END_OF_RE
2194 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2195 {
2196 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2197 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2198 {
2199 return NULL;
2200 }
2201 if (tree != NULL && exp != NULL)
2202 {
2203 tree = create_tree (dfa, tree, exp, CONCAT);
2204 if (tree == NULL)
2205 {
2206 *err = REG_ESPACE;
2207 return NULL;
2208 }
2209 }
2210 else if (tree == NULL)
2211 tree = exp;
2212 /* Otherwise exp == NULL, we don't need to create new tree. */
2213 }
2214 return tree;
2215}
2216
2217/* This function build the following tree, from regular expression a*:
2218 *
2219 |
2220 a
2221*/
2222
2223static bin_tree_t *
2224parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2225 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2226{
2227 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2228 bin_tree_t *tree;
2229 switch (token->type)
2230 {
2231 case CHARACTER:
2232 tree = create_token_tree (dfa, NULL, NULL, token);
2233 if (BE (tree == NULL, 0))
2234 {
2235 *err = REG_ESPACE;
2236 return NULL;
2237 }
2238#ifdef RE_ENABLE_I18N
2239 if (dfa->mb_cur_max > 1)
2240 {
2241 while (!re_string_eoi (regexp)
2242 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2243 {
2244 bin_tree_t *mbc_remain;
2245 fetch_token (token, regexp, syntax);
2246 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2247 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2248 if (BE (mbc_remain == NULL || tree == NULL, 0))
2249 {
2250 *err = REG_ESPACE;
2251 return NULL;
2252 }
2253 }
2254 }
2255#endif
2256 break;
2257 case OP_OPEN_SUBEXP:
2258 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2259 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2260 return NULL;
2261 break;
2262 case OP_OPEN_BRACKET:
2263 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2264 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2265 return NULL;
2266 break;
2267 case OP_BACK_REF:
2268 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2269 {
2270 *err = REG_ESUBREG;
2271 return NULL;
2272 }
2273 dfa->used_bkref_map |= 1 << token->opr.idx;
2274 tree = create_token_tree (dfa, NULL, NULL, token);
2275 if (BE (tree == NULL, 0))
2276 {
2277 *err = REG_ESPACE;
2278 return NULL;
2279 }
2280 ++dfa->nbackref;
2281 dfa->has_mb_node = 1;
2282 break;
2283 case OP_OPEN_DUP_NUM:
2284 if (syntax & RE_CONTEXT_INVALID_DUP)
2285 {
2286 *err = REG_BADRPT;
2287 return NULL;
2288 }
2289 /* FALLTHROUGH */
2290 case OP_DUP_ASTERISK:
2291 case OP_DUP_PLUS:
2292 case OP_DUP_QUESTION:
2293 if (syntax & RE_CONTEXT_INVALID_OPS)
2294 {
2295 *err = REG_BADRPT;
2296 return NULL;
2297 }
2298 else if (syntax & RE_CONTEXT_INDEP_OPS)
2299 {
2300 fetch_token (token, regexp, syntax);
2301 return parse_expression (regexp, preg, token, syntax, nest, err);
2302 }
2303 /* else fall through */
2304 case OP_CLOSE_SUBEXP:
2305 if ((token->type == OP_CLOSE_SUBEXP) &&
2306 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2307 {
2308 *err = REG_ERPAREN;
2309 return NULL;
2310 }
2311 /* else fall through */
2312 case OP_CLOSE_DUP_NUM:
2313 /* We treat it as a normal character. */
2314
2315 /* Then we can these characters as normal characters. */
2316 token->type = CHARACTER;
2317 /* mb_partial and word_char bits should be initialized already
2318 by peek_token. */
2319 tree = create_token_tree (dfa, NULL, NULL, token);
2320 if (BE (tree == NULL, 0))
2321 {
2322 *err = REG_ESPACE;
2323 return NULL;
2324 }
2325 break;
2326 case ANCHOR:
2327 if ((token->opr.ctx_type
2328 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2329 && dfa->word_ops_used == 0)
2330 init_word_char (dfa);
2331 if (token->opr.ctx_type == WORD_DELIM
2332 || token->opr.ctx_type == NOT_WORD_DELIM)
2333 {
2334 bin_tree_t *tree_first, *tree_last;
2335 if (token->opr.ctx_type == WORD_DELIM)
2336 {
2337 token->opr.ctx_type = WORD_FIRST;
2338 tree_first = create_token_tree (dfa, NULL, NULL, token);
2339 token->opr.ctx_type = WORD_LAST;
2340 }
2341 else
2342 {
2343 token->opr.ctx_type = INSIDE_WORD;
2344 tree_first = create_token_tree (dfa, NULL, NULL, token);
2345 token->opr.ctx_type = INSIDE_NOTWORD;
2346 }
2347 tree_last = create_token_tree (dfa, NULL, NULL, token);
2348 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2349 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2350 {
2351 *err = REG_ESPACE;
2352 return NULL;
2353 }
2354 }
2355 else
2356 {
2357 tree = create_token_tree (dfa, NULL, NULL, token);
2358 if (BE (tree == NULL, 0))
2359 {
2360 *err = REG_ESPACE;
2361 return NULL;
2362 }
2363 }
2364 /* We must return here, since ANCHORs can't be followed
2365 by repetition operators.
2366 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2367 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2368 fetch_token (token, regexp, syntax);
2369 return tree;
2370 case OP_PERIOD:
2371 tree = create_token_tree (dfa, NULL, NULL, token);
2372 if (BE (tree == NULL, 0))
2373 {
2374 *err = REG_ESPACE;
2375 return NULL;
2376 }
2377 if (dfa->mb_cur_max > 1)
2378 dfa->has_mb_node = 1;
2379 break;
2380 case OP_WORD:
2381 case OP_NOTWORD:
2382 tree = build_charclass_op (dfa, regexp->trans,
2383 "alnum",
2384 "_",
2385 token->type == OP_NOTWORD, err);
2386 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2387 return NULL;
2388 break;
2389 case OP_SPACE:
2390 case OP_NOTSPACE:
2391 tree = build_charclass_op (dfa, regexp->trans,
2392 "space",
2393 "",
2394 token->type == OP_NOTSPACE, err);
2395 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2396 return NULL;
2397 break;
2398 case OP_ALT:
2399 case END_OF_RE:
2400 return NULL;
2401 case BACK_SLASH:
2402 *err = REG_EESCAPE;
2403 return NULL;
2404 default:
2405 /* Must not happen? */
2406#ifdef DEBUG
2407 assert (0);
2408#endif
2409 return NULL;
2410 }
2411 fetch_token (token, regexp, syntax);
2412
2413 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2414 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2415 {
2416 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2417 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2418 return NULL;
2419 /* In BRE consecutive duplications are not allowed. */
2420 if ((syntax & RE_CONTEXT_INVALID_DUP)
2421 && (token->type == OP_DUP_ASTERISK
2422 || token->type == OP_OPEN_DUP_NUM))
2423 {
2424 *err = REG_BADRPT;
2425 return NULL;
2426 }
2427 }
2428
2429 return tree;
2430}
2431
2432/* This function build the following tree, from regular expression
2433 (<reg_exp>):
2434 SUBEXP
2435 |
2436 <reg_exp>
2437*/
2438
2439static bin_tree_t *
2440parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2441 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2442{
2443 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2444 bin_tree_t *tree;
2445 size_t cur_nsub;
2446 cur_nsub = preg->re_nsub++;
2447
2448 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2449
2450 /* The subexpression may be a null string. */
2451 if (token->type == OP_CLOSE_SUBEXP)
2452 tree = NULL;
2453 else
2454 {
2455 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2456 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2457 *err = REG_EPAREN;
2458 if (BE (*err != REG_NOERROR, 0))
2459 return NULL;
2460 }
2461
2462 if (cur_nsub <= '9' - '1')
2463 dfa->completed_bkref_map |= 1 << cur_nsub;
2464
2465 tree = create_tree (dfa, tree, NULL, SUBEXP);
2466 if (BE (tree == NULL, 0))
2467 {
2468 *err = REG_ESPACE;
2469 return NULL;
2470 }
2471 tree->token.opr.idx = cur_nsub;
2472 return tree;
2473}
2474
2475/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2476
2477static bin_tree_t *
2478parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2479 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2480{
2481 bin_tree_t *tree = NULL, *old_tree = NULL;
2482 int i, start, end, start_idx = re_string_cur_idx (regexp);
2483#ifndef RE_TOKEN_INIT_BUG
2484 re_token_t start_token = *token;
2485#else
2486 re_token_t start_token;
2487
2488 memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2489#endif
2490
2491 if (token->type == OP_OPEN_DUP_NUM)
2492 {
2493 end = 0;
2494 start = fetch_number (regexp, token, syntax);
2495 if (start == -1)
2496 {
2497 if (token->type == CHARACTER && token->opr.c == ',')
2498 start = 0; /* We treat "{,m}" as "{0,m}". */
2499 else
2500 {
2501 *err = REG_BADBR; /* <re>{} is invalid. */
2502 return NULL;
2503 }
2504 }
2505 if (BE (start != -2, 1))
2506 {
2507 /* We treat "{n}" as "{n,n}". */
2508 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2509 : ((token->type == CHARACTER && token->opr.c == ',')
2510 ? fetch_number (regexp, token, syntax) : -2));
2511 }
2512 if (BE (start == -2 || end == -2, 0))
2513 {
2514 /* Invalid sequence. */
2515 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2516 {
2517 if (token->type == END_OF_RE)
2518 *err = REG_EBRACE;
2519 else
2520 *err = REG_BADBR;
2521
2522 return NULL;
2523 }
2524
2525 /* If the syntax bit is set, rollback. */
2526 re_string_set_index (regexp, start_idx);
2527 *token = start_token;
2528 token->type = CHARACTER;
2529 /* mb_partial and word_char bits should be already initialized by
2530 peek_token. */
2531 return elem;
2532 }
2533
2534 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2535 {
2536 /* First number greater than second. */
2537 *err = REG_BADBR;
2538 return NULL;
2539 }
2540 }
2541 else
2542 {
2543 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2544 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2545 }
2546
2547 fetch_token (token, regexp, syntax);
2548
2549 if (BE (elem == NULL, 0))
2550 return NULL;
2551 if (BE (start == 0 && end == 0, 0))
2552 {
2553 postorder (elem, free_tree, NULL);
2554 return NULL;
2555 }
2556
2557 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2558 if (BE (start > 0, 0))
2559 {
2560 tree = elem;
2561 for (i = 2; i <= start; ++i)
2562 {
2563 elem = duplicate_tree (elem, dfa);
2564 tree = create_tree (dfa, tree, elem, CONCAT);
2565 if (BE (elem == NULL || tree == NULL, 0))
2566 goto parse_dup_op_espace;
2567 }
2568
2569 if (start == end)
2570 return tree;
2571
2572 /* Duplicate ELEM before it is marked optional. */
2573 elem = duplicate_tree (elem, dfa);
2574 old_tree = tree;
2575 }
2576 else
2577 old_tree = NULL;
2578
2579 if (elem->token.type == SUBEXP)
2580 postorder (elem, mark_opt_subexp, (void *) (intptr_t) elem->token.opr.idx);
2581
2582 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2583 if (BE (tree == NULL, 0))
2584 goto parse_dup_op_espace;
2585
2586 /* This loop is actually executed only when end != -1,
2587 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2588 already created the start+1-th copy. */
2589 for (i = start + 2; i <= end; ++i)
2590 {
2591 elem = duplicate_tree (elem, dfa);
2592 tree = create_tree (dfa, tree, elem, CONCAT);
2593 if (BE (elem == NULL || tree == NULL, 0))
2594 goto parse_dup_op_espace;
2595
2596 tree = create_tree (dfa, tree, NULL, OP_ALT);
2597 if (BE (tree == NULL, 0))
2598 goto parse_dup_op_espace;
2599 }
2600
2601 if (old_tree)
2602 tree = create_tree (dfa, old_tree, tree, CONCAT);
2603
2604 return tree;
2605
2606 parse_dup_op_espace:
2607 *err = REG_ESPACE;
2608 return NULL;
2609}
2610
2611/* Size of the names for collating symbol/equivalence_class/character_class.
2612 I'm not sure, but maybe enough. */
2613#define BRACKET_NAME_BUF_SIZE 32
2614
2615#ifndef _LIBC
2616 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2617 Build the range expression which starts from START_ELEM, and ends
2618 at END_ELEM. The result are written to MBCSET and SBCSET.
2619 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2620 mbcset->range_ends, is a pointer argument since we may
2621 update it. */
2622
2623static reg_errcode_t
2624internal_function
2625# ifdef RE_ENABLE_I18N
2626build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2627 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2628# else /* not RE_ENABLE_I18N */
2629build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2630 bracket_elem_t *end_elem)
2631# endif /* not RE_ENABLE_I18N */
2632{
2633 unsigned int start_ch, end_ch;
2634 /* Equivalence Classes and Character Classes can't be a range start/end. */
2635 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2636 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2637 0))
2638 return REG_ERANGE;
2639
2640 /* We can handle no multi character collating elements without libc
2641 support. */
2642 if (BE ((start_elem->type == COLL_SYM
2643 && strlen ((char *) start_elem->opr.name) > 1)
2644 || (end_elem->type == COLL_SYM
2645 && strlen ((char *) end_elem->opr.name) > 1), 0))
2646 return REG_ECOLLATE;
2647
2648# ifdef RE_ENABLE_I18N
2649 {
2650 wchar_t wc;
2651 wint_t start_wc;
2652 wint_t end_wc;
2653 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2654
2655 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2656 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2657 : 0));
2658 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2659 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2660 : 0));
2661#ifdef GAWK
2662 /*
2663 * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2664 * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2665 * unsigned, so we don't have sign extension problems.
2666 */
2667 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2668 ? start_ch : start_elem->opr.wch);
2669 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2670 ? end_ch : end_elem->opr.wch);
2671#else
2672 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2673 ? __btowc (start_ch) : start_elem->opr.wch);
2674 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2675 ? __btowc (end_ch) : end_elem->opr.wch);
2676#endif
2677 if (start_wc == WEOF || end_wc == WEOF)
2678 return REG_ECOLLATE;
2679 cmp_buf[0] = start_wc;
2680 cmp_buf[4] = end_wc;
2681 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2682 return REG_ERANGE;
2683
2684 /* Got valid collation sequence values, add them as a new entry.
2685 However, for !_LIBC we have no collation elements: if the
2686 character set is single byte, the single byte character set
2687 that we build below suffices. parse_bracket_exp passes
2688 no MBCSET if dfa->mb_cur_max == 1. */
2689 if (mbcset)
2690 {
2691 /* Check the space of the arrays. */
2692 if (BE (*range_alloc == mbcset->nranges, 0))
2693 {
2694 /* There is not enough space, need realloc. */
2695 wchar_t *new_array_start, *new_array_end;
2696 int new_nranges;
2697
2698 /* +1 in case of mbcset->nranges is 0. */
2699 new_nranges = 2 * mbcset->nranges + 1;
2700 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2701 are NULL if *range_alloc == 0. */
2702 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2703 new_nranges);
2704 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2705 new_nranges);
2706
2707 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2708 return REG_ESPACE;
2709
2710 mbcset->range_starts = new_array_start;
2711 mbcset->range_ends = new_array_end;
2712 *range_alloc = new_nranges;
2713 }
2714
2715 mbcset->range_starts[mbcset->nranges] = start_wc;
2716 mbcset->range_ends[mbcset->nranges++] = end_wc;
2717 }
2718
2719 /* Build the table for single byte characters. */
2720 for (wc = 0; wc < SBC_MAX; ++wc)
2721 {
2722 cmp_buf[2] = wc;
2723 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2724 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2725 bitset_set (sbcset, wc);
2726 }
2727 }
2728# else /* not RE_ENABLE_I18N */
2729 {
2730 unsigned int ch;
2731 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2732 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2733 : 0));
2734 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2735 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2736 : 0));
2737 if (start_ch > end_ch)
2738 return REG_ERANGE;
2739 /* Build the table for single byte characters. */
2740 for (ch = 0; ch < SBC_MAX; ++ch)
2741 if (start_ch <= ch && ch <= end_ch)
2742 bitset_set (sbcset, ch);
2743 }
2744# endif /* not RE_ENABLE_I18N */
2745 return REG_NOERROR;
2746}
2747#endif /* not _LIBC */
2748
2749#ifndef _LIBC
2750/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2751 Build the collating element which is represented by NAME.
2752 The result are written to MBCSET and SBCSET.
2753 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2754 pointer argument since we may update it. */
2755
2756static reg_errcode_t
2757internal_function
2758# ifdef RE_ENABLE_I18N
2759build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2760 int *coll_sym_alloc, const unsigned char *name)
2761# else /* not RE_ENABLE_I18N */
2762build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2763# endif /* not RE_ENABLE_I18N */
2764{
2765 size_t name_len = strlen ((const char *) name);
2766 if (BE (name_len != 1, 0))
2767 return REG_ECOLLATE;
2768 else
2769 {
2770 bitset_set (sbcset, name[0]);
2771 return REG_NOERROR;
2772 }
2773}
2774#endif /* not _LIBC */
2775
2776/* This function parse bracket expression like "[abc]", "[a-c]",
2777 "[[.a-a.]]" etc. */
2778
2779static bin_tree_t *
2780parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2781 reg_syntax_t syntax, reg_errcode_t *err)
2782{
2783#ifdef _LIBC
2784 const unsigned char *collseqmb;
2785 const char *collseqwc;
2786 uint32_t nrules;
2787 int32_t table_size;
2788 const int32_t *symb_table;
2789 const unsigned char *extra;
2790
2791 /* Local function for parse_bracket_exp used in _LIBC environment.
2792 Seek the collating symbol entry correspondings to NAME.
2793 Return the index of the symbol in the SYMB_TABLE. */
2794
2795 auto inline int32_t
2796 __attribute ((always_inline))
2797 seek_collating_symbol_entry (name, name_len)
2798 const unsigned char *name;
2799 size_t name_len;
2800 {
2801 int32_t hash = elem_hash ((const char *) name, name_len);
2802 int32_t elem = hash % table_size;
2803 if (symb_table[2 * elem] != 0)
2804 {
2805 int32_t second = hash % (table_size - 2) + 1;
2806
2807 do
2808 {
2809 /* First compare the hashing value. */
2810 if (symb_table[2 * elem] == hash
2811 /* Compare the length of the name. */
2812 && name_len == extra[symb_table[2 * elem + 1]]
2813 /* Compare the name. */
2814 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2815 name_len) == 0)
2816 {
2817 /* Yep, this is the entry. */
2818 break;
2819 }
2820
2821 /* Next entry. */
2822 elem += second;
2823 }
2824 while (symb_table[2 * elem] != 0);
2825 }
2826 return elem;
2827 }
2828
2829 /* Local function for parse_bracket_exp used in _LIBC environment.
2830 Look up the collation sequence value of BR_ELEM.
2831 Return the value if succeeded, UINT_MAX otherwise. */
2832
2833 auto inline unsigned int
2834 __attribute ((always_inline))
2835 lookup_collation_sequence_value (br_elem)
2836 bracket_elem_t *br_elem;
2837 {
2838 if (br_elem->type == SB_CHAR)
2839 {
2840 /*
2841 if (MB_CUR_MAX == 1)
2842 */
2843 if (nrules == 0)
2844 return collseqmb[br_elem->opr.ch];
2845 else
2846 {
2847 wint_t wc = __btowc (br_elem->opr.ch);
2848 return __collseq_table_lookup (collseqwc, wc);
2849 }
2850 }
2851 else if (br_elem->type == MB_CHAR)
2852 {
2853 if (nrules != 0)
2854 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2855 }
2856 else if (br_elem->type == COLL_SYM)
2857 {
2858 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2859 if (nrules != 0)
2860 {
2861 int32_t elem, idx;
2862 elem = seek_collating_symbol_entry (br_elem->opr.name,
2863 sym_name_len);
2864 if (symb_table[2 * elem] != 0)
2865 {
2866 /* We found the entry. */
2867 idx = symb_table[2 * elem + 1];
2868 /* Skip the name of collating element name. */
2869 idx += 1 + extra[idx];
2870 /* Skip the byte sequence of the collating element. */
2871 idx += 1 + extra[idx];
2872 /* Adjust for the alignment. */
2873 idx = (idx + 3) & ~3;
2874 /* Skip the multibyte collation sequence value. */
2875 idx += sizeof (unsigned int);
2876 /* Skip the wide char sequence of the collating element. */
2877 idx += sizeof (unsigned int) *
2878 (1 + *(unsigned int *) (extra + idx));
2879 /* Return the collation sequence value. */
2880 return *(unsigned int *) (extra + idx);
2881 }
2882 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2883 {
2884 /* No valid character. Match it as a single byte
2885 character. */
2886 return collseqmb[br_elem->opr.name[0]];
2887 }
2888 }
2889 else if (sym_name_len == 1)
2890 return collseqmb[br_elem->opr.name[0]];
2891 }
2892 return UINT_MAX;
2893 }
2894
2895 /* Local function for parse_bracket_exp used in _LIBC environment.
2896 Build the range expression which starts from START_ELEM, and ends
2897 at END_ELEM. The result are written to MBCSET and SBCSET.
2898 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2899 mbcset->range_ends, is a pointer argument since we may
2900 update it. */
2901
2902 auto inline reg_errcode_t
2903 __attribute ((always_inline))
2904 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2905 re_charset_t *mbcset;
2906 int *range_alloc;
2907 bitset_t sbcset;
2908 bracket_elem_t *start_elem, *end_elem;
2909 {
2910 unsigned int ch;
2911 uint32_t start_collseq;
2912 uint32_t end_collseq;
2913
2914 /* Equivalence Classes and Character Classes can't be a range
2915 start/end. */
2916 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2917 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2918 0))
2919 return REG_ERANGE;
2920
2921 start_collseq = lookup_collation_sequence_value (start_elem);
2922 end_collseq = lookup_collation_sequence_value (end_elem);
2923 /* Check start/end collation sequence values. */
2924 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2925 return REG_ECOLLATE;
2926 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2927 return REG_ERANGE;
2928
2929 /* Got valid collation sequence values, add them as a new entry.
2930 However, if we have no collation elements, and the character set
2931 is single byte, the single byte character set that we
2932 build below suffices. */
2933 if (nrules > 0 || dfa->mb_cur_max > 1)
2934 {
2935 /* Check the space of the arrays. */
2936 if (BE (*range_alloc == mbcset->nranges, 0))
2937 {
2938 /* There is not enough space, need realloc. */
2939 uint32_t *new_array_start;
2940 uint32_t *new_array_end;
2941 int new_nranges;
2942
2943 /* +1 in case of mbcset->nranges is 0. */
2944 new_nranges = 2 * mbcset->nranges + 1;
2945 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2946 new_nranges);
2947 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2948 new_nranges);
2949
2950 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2951 return REG_ESPACE;
2952
2953 mbcset->range_starts = new_array_start;
2954 mbcset->range_ends = new_array_end;
2955 *range_alloc = new_nranges;
2956 }
2957
2958 mbcset->range_starts[mbcset->nranges] = start_collseq;
2959 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2960 }
2961
2962 /* Build the table for single byte characters. */
2963 for (ch = 0; ch < SBC_MAX; ch++)
2964 {
2965 uint32_t ch_collseq;
2966 /*
2967 if (MB_CUR_MAX == 1)
2968 */
2969 if (nrules == 0)
2970 ch_collseq = collseqmb[ch];
2971 else
2972 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2973 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2974 bitset_set (sbcset, ch);
2975 }
2976 return REG_NOERROR;
2977 }
2978
2979 /* Local function for parse_bracket_exp used in _LIBC environment.
2980 Build the collating element which is represented by NAME.
2981 The result are written to MBCSET and SBCSET.
2982 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2983 pointer argument since we may update it. */
2984
2985 auto inline reg_errcode_t
2986 __attribute ((always_inline))
2987 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2988 re_charset_t *mbcset;
2989 int *coll_sym_alloc;
2990 bitset_t sbcset;
2991 const unsigned char *name;
2992 {
2993 int32_t elem, idx;
2994 size_t name_len = strlen ((const char *) name);
2995 if (nrules != 0)
2996 {
2997 elem = seek_collating_symbol_entry (name, name_len);
2998 if (symb_table[2 * elem] != 0)
2999 {
3000 /* We found the entry. */
3001 idx = symb_table[2 * elem + 1];
3002 /* Skip the name of collating element name. */
3003 idx += 1 + extra[idx];
3004 }
3005 else if (symb_table[2 * elem] == 0 && name_len == 1)
3006 {
3007 /* No valid character, treat it as a normal
3008 character. */
3009 bitset_set (sbcset, name[0]);
3010 return REG_NOERROR;
3011 }
3012 else
3013 return REG_ECOLLATE;
3014
3015 /* Got valid collation sequence, add it as a new entry. */
3016 /* Check the space of the arrays. */
3017 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3018 {
3019 /* Not enough, realloc it. */
3020 /* +1 in case of mbcset->ncoll_syms is 0. */
3021 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3022 /* Use realloc since mbcset->coll_syms is NULL
3023 if *alloc == 0. */
3024 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3025 new_coll_sym_alloc);
3026 if (BE (new_coll_syms == NULL, 0))
3027 return REG_ESPACE;
3028 mbcset->coll_syms = new_coll_syms;
3029 *coll_sym_alloc = new_coll_sym_alloc;
3030 }
3031 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3032 return REG_NOERROR;
3033 }
3034 else
3035 {
3036 if (BE (name_len != 1, 0))
3037 return REG_ECOLLATE;
3038 else
3039 {
3040 bitset_set (sbcset, name[0]);
3041 return REG_NOERROR;
3042 }
3043 }
3044 }
3045#endif
3046
3047 re_token_t br_token;
3048 re_bitset_ptr_t sbcset;
3049#ifdef RE_ENABLE_I18N
3050 re_charset_t *mbcset;
3051 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3052 int equiv_class_alloc = 0, char_class_alloc = 0;
3053#endif /* not RE_ENABLE_I18N */
3054 int non_match = 0;
3055 bin_tree_t *work_tree;
3056 int token_len;
3057 int first_round = 1;
3058#ifdef _LIBC
3059 collseqmb = (const unsigned char *)
3060 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3061 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3062 if (nrules)
3063 {
3064 /*
3065 if (MB_CUR_MAX > 1)
3066 */
3067 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3068 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3069 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3070 _NL_COLLATE_SYMB_TABLEMB);
3071 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3072 _NL_COLLATE_SYMB_EXTRAMB);
3073 }
3074#endif
3075 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3076#ifdef RE_ENABLE_I18N
3077 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3078#endif /* RE_ENABLE_I18N */
3079#ifdef RE_ENABLE_I18N
3080 if (BE (sbcset == NULL || mbcset == NULL, 0))
3081#else
3082 if (BE (sbcset == NULL, 0))
3083#endif /* RE_ENABLE_I18N */
3084 {
3085 *err = REG_ESPACE;
3086 return NULL;
3087 }
3088
3089 token_len = peek_token_bracket (token, regexp, syntax);
3090 if (BE (token->type == END_OF_RE, 0))
3091 {
3092 *err = REG_BADPAT;
3093 goto parse_bracket_exp_free_return;
3094 }
3095 if (token->type == OP_NON_MATCH_LIST)
3096 {
3097#ifdef RE_ENABLE_I18N
3098 mbcset->non_match = 1;
3099#endif /* not RE_ENABLE_I18N */
3100 non_match = 1;
3101 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3102 bitset_set (sbcset, '\n');
3103 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3104 token_len = peek_token_bracket (token, regexp, syntax);
3105 if (BE (token->type == END_OF_RE, 0))
3106 {
3107 *err = REG_BADPAT;
3108 goto parse_bracket_exp_free_return;
3109 }
3110 }
3111
3112 /* We treat the first ']' as a normal character. */
3113 if (token->type == OP_CLOSE_BRACKET)
3114 token->type = CHARACTER;
3115
3116 while (1)
3117 {
3118 bracket_elem_t start_elem, end_elem;
3119 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3120 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3121 reg_errcode_t ret;
3122 int token_len2 = 0, is_range_exp = 0;
3123 re_token_t token2;
3124
3125 start_elem.opr.name = start_name_buf;
3126 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3127 syntax, first_round);
3128 if (BE (ret != REG_NOERROR, 0))
3129 {
3130 *err = ret;
3131 goto parse_bracket_exp_free_return;
3132 }
3133 first_round = 0;
3134
3135 /* Get information about the next token. We need it in any case. */
3136 token_len = peek_token_bracket (token, regexp, syntax);
3137
3138 /* Do not check for ranges if we know they are not allowed. */
3139 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3140 {
3141 if (BE (token->type == END_OF_RE, 0))
3142 {
3143 *err = REG_EBRACK;
3144 goto parse_bracket_exp_free_return;
3145 }
3146 if (token->type == OP_CHARSET_RANGE)
3147 {
3148 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3149 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3150 if (BE (token2.type == END_OF_RE, 0))
3151 {
3152 *err = REG_EBRACK;
3153 goto parse_bracket_exp_free_return;
3154 }
3155 if (token2.type == OP_CLOSE_BRACKET)
3156 {
3157 /* We treat the last '-' as a normal character. */
3158 re_string_skip_bytes (regexp, -token_len);
3159 token->type = CHARACTER;
3160 }
3161 else
3162 is_range_exp = 1;
3163 }
3164 }
3165
3166 if (is_range_exp == 1)
3167 {
3168 end_elem.opr.name = end_name_buf;
3169 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3170 dfa, syntax, 1);
3171 if (BE (ret != REG_NOERROR, 0))
3172 {
3173 *err = ret;
3174 goto parse_bracket_exp_free_return;
3175 }
3176
3177 token_len = peek_token_bracket (token, regexp, syntax);
3178
3179#ifdef _LIBC
3180 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3181 &start_elem, &end_elem);
3182#else
3183# ifdef RE_ENABLE_I18N
3184 *err = build_range_exp (sbcset,
3185 dfa->mb_cur_max > 1 ? mbcset : NULL,
3186 &range_alloc, &start_elem, &end_elem);
3187# else
3188 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3189# endif
3190#endif /* RE_ENABLE_I18N */
3191 if (BE (*err != REG_NOERROR, 0))
3192 goto parse_bracket_exp_free_return;
3193 }
3194 else
3195 {
3196 switch (start_elem.type)
3197 {
3198 case SB_CHAR:
3199 bitset_set (sbcset, start_elem.opr.ch);
3200 break;
3201#ifdef RE_ENABLE_I18N
3202 case MB_CHAR:
3203 /* Check whether the array has enough space. */
3204 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3205 {
3206 wchar_t *new_mbchars;
3207 /* Not enough, realloc it. */
3208 /* +1 in case of mbcset->nmbchars is 0. */
3209 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3210 /* Use realloc since array is NULL if *alloc == 0. */
3211 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3212 mbchar_alloc);
3213 if (BE (new_mbchars == NULL, 0))
3214 goto parse_bracket_exp_espace;
3215 mbcset->mbchars = new_mbchars;
3216 }
3217 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3218 break;
3219#endif /* RE_ENABLE_I18N */
3220 case EQUIV_CLASS:
3221 *err = build_equiv_class (sbcset,
3222#ifdef RE_ENABLE_I18N
3223 mbcset, &equiv_class_alloc,
3224#endif /* RE_ENABLE_I18N */
3225 start_elem.opr.name);
3226 if (BE (*err != REG_NOERROR, 0))
3227 goto parse_bracket_exp_free_return;
3228 break;
3229 case COLL_SYM:
3230 *err = build_collating_symbol (sbcset,
3231#ifdef RE_ENABLE_I18N
3232 mbcset, &coll_sym_alloc,
3233#endif /* RE_ENABLE_I18N */
3234 start_elem.opr.name);
3235 if (BE (*err != REG_NOERROR, 0))
3236 goto parse_bracket_exp_free_return;
3237 break;
3238 case CHAR_CLASS:
3239 *err = build_charclass (regexp->trans, sbcset,
3240#ifdef RE_ENABLE_I18N
3241 mbcset, &char_class_alloc,
3242#endif /* RE_ENABLE_I18N */
3243 (const char *) start_elem.opr.name, syntax);
3244 if (BE (*err != REG_NOERROR, 0))
3245 goto parse_bracket_exp_free_return;
3246 break;
3247 default:
3248 assert (0);
3249 break;
3250 }
3251 }
3252 if (BE (token->type == END_OF_RE, 0))
3253 {
3254 *err = REG_EBRACK;
3255 goto parse_bracket_exp_free_return;
3256 }
3257 if (token->type == OP_CLOSE_BRACKET)
3258 break;
3259 }
3260
3261 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3262
3263 /* If it is non-matching list. */
3264 if (non_match)
3265 bitset_not (sbcset);
3266
3267#ifdef RE_ENABLE_I18N
3268 /* Ensure only single byte characters are set. */
3269 if (dfa->mb_cur_max > 1)
3270 bitset_mask (sbcset, dfa->sb_char);
3271
3272 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3273 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3274 || mbcset->non_match)))
3275 {
3276 bin_tree_t *mbc_tree;
3277 int sbc_idx;
3278 /* Build a tree for complex bracket. */
3279 dfa->has_mb_node = 1;
3280 br_token.type = COMPLEX_BRACKET;
3281 br_token.opr.mbcset = mbcset;
3282 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3283 if (BE (mbc_tree == NULL, 0))
3284 goto parse_bracket_exp_espace;
3285 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3286 if (sbcset[sbc_idx])
3287 break;
3288 /* If there are no bits set in sbcset, there is no point
3289 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3290 if (sbc_idx < BITSET_WORDS)
3291 {
3292 /* Build a tree for simple bracket. */
3293 br_token.type = SIMPLE_BRACKET;
3294 br_token.opr.sbcset = sbcset;
3295 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3296 if (BE (work_tree == NULL, 0))
3297 goto parse_bracket_exp_espace;
3298
3299 /* Then join them by ALT node. */
3300 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3301 if (BE (work_tree == NULL, 0))
3302 goto parse_bracket_exp_espace;
3303 }
3304 else
3305 {
3306 re_free (sbcset);
3307 work_tree = mbc_tree;
3308 }
3309 }
3310 else
3311#endif /* not RE_ENABLE_I18N */
3312 {
3313#ifdef RE_ENABLE_I18N
3314 free_charset (mbcset);
3315#endif
3316 /* Build a tree for simple bracket. */
3317 br_token.type = SIMPLE_BRACKET;
3318 br_token.opr.sbcset = sbcset;
3319 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3320 if (BE (work_tree == NULL, 0))
3321 goto parse_bracket_exp_espace;
3322 }
3323 return work_tree;
3324
3325 parse_bracket_exp_espace:
3326 *err = REG_ESPACE;
3327 parse_bracket_exp_free_return:
3328 re_free (sbcset);
3329#ifdef RE_ENABLE_I18N
3330 free_charset (mbcset);
3331#endif /* RE_ENABLE_I18N */
3332 return NULL;
3333}
3334
3335/* Parse an element in the bracket expression. */
3336
3337static reg_errcode_t
3338parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3339 re_token_t *token, int token_len, re_dfa_t *dfa,
3340 reg_syntax_t syntax, int accept_hyphen)
3341{
3342#ifdef RE_ENABLE_I18N
3343 int cur_char_size;
3344 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3345 if (cur_char_size > 1)
3346 {
3347 elem->type = MB_CHAR;
3348 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3349 re_string_skip_bytes (regexp, cur_char_size);
3350 return REG_NOERROR;
3351 }
3352#endif /* RE_ENABLE_I18N */
3353 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3354 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3355 || token->type == OP_OPEN_EQUIV_CLASS)
3356 return parse_bracket_symbol (elem, regexp, token);
3357 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3358 {
3359 /* A '-' must only appear as anything but a range indicator before
3360 the closing bracket. Everything else is an error. */
3361 re_token_t token2;
3362 (void) peek_token_bracket (&token2, regexp, syntax);
3363 if (token2.type != OP_CLOSE_BRACKET)
3364 /* The actual error value is not standardized since this whole
3365 case is undefined. But ERANGE makes good sense. */
3366 return REG_ERANGE;
3367 }
3368 elem->type = SB_CHAR;
3369 elem->opr.ch = token->opr.c;
3370 return REG_NOERROR;
3371}
3372
3373/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3374 such as [:<character_class>:], [.<collating_element>.], and
3375 [=<equivalent_class>=]. */
3376
3377static reg_errcode_t
3378parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3379 re_token_t *token)
3380{
3381 unsigned char ch, delim = token->opr.c;
3382 int i = 0;
3383 if (re_string_eoi(regexp))
3384 return REG_EBRACK;
3385 for (;; ++i)
3386 {
3387 if (i >= BRACKET_NAME_BUF_SIZE)
3388 return REG_EBRACK;
3389 if (token->type == OP_OPEN_CHAR_CLASS)
3390 ch = re_string_fetch_byte_case (regexp);
3391 else
3392 ch = re_string_fetch_byte (regexp);
3393 if (re_string_eoi(regexp))
3394 return REG_EBRACK;
3395 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3396 break;
3397 elem->opr.name[i] = ch;
3398 }
3399 re_string_skip_bytes (regexp, 1);
3400 elem->opr.name[i] = '\0';
3401 switch (token->type)
3402 {
3403 case OP_OPEN_COLL_ELEM:
3404 elem->type = COLL_SYM;
3405 break;
3406 case OP_OPEN_EQUIV_CLASS:
3407 elem->type = EQUIV_CLASS;
3408 break;
3409 case OP_OPEN_CHAR_CLASS:
3410 elem->type = CHAR_CLASS;
3411 break;
3412 default:
3413 break;
3414 }
3415 return REG_NOERROR;
3416}
3417
3418 /* Helper function for parse_bracket_exp.
3419 Build the equivalence class which is represented by NAME.
3420 The result are written to MBCSET and SBCSET.
3421 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3422 is a pointer argument since we may update it. */
3423
3424static reg_errcode_t
3425#ifdef RE_ENABLE_I18N
3426build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3427 int *equiv_class_alloc, const unsigned char *name)
3428#else /* not RE_ENABLE_I18N */
3429build_equiv_class (bitset_t sbcset, const unsigned char *name)
3430#endif /* not RE_ENABLE_I18N */
3431{
3432#ifdef _LIBC
3433 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3434 if (nrules != 0)
3435 {
3436 const int32_t *table, *indirect;
3437 const unsigned char *weights, *extra, *cp;
3438 unsigned char char_buf[2];
3439 int32_t idx1, idx2;
3440 unsigned int ch;
3441 size_t len;
3442 /* This #include defines a local function! */
3443# include <locale/weight.h>
3444 /* Calculate the index for equivalence class. */
3445 cp = name;
3446 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3447 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3448 _NL_COLLATE_WEIGHTMB);
3449 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3450 _NL_COLLATE_EXTRAMB);
3451 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3452 _NL_COLLATE_INDIRECTMB);
3453 idx1 = findidx (&cp);
3454 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3455 /* This isn't a valid character. */
3456 return REG_ECOLLATE;
3457
3458 /* Build single byte matcing table for this equivalence class. */
3459 char_buf[1] = (unsigned char) '\0';
3460 len = weights[idx1 & 0xffffff];
3461 for (ch = 0; ch < SBC_MAX; ++ch)
3462 {
3463 char_buf[0] = ch;
3464 cp = char_buf;
3465 idx2 = findidx (&cp);
3466/*
3467 idx2 = table[ch];
3468*/
3469 if (idx2 == 0)
3470 /* This isn't a valid character. */
3471 continue;
3472 /* Compare only if the length matches and the collation rule
3473 index is the same. */
3474 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3475 {
3476 int cnt = 0;
3477
3478 while (cnt <= len &&
3479 weights[(idx1 & 0xffffff) + 1 + cnt]
3480 == weights[(idx2 & 0xffffff) + 1 + cnt])
3481 ++cnt;
3482
3483 if (cnt > len)
3484 bitset_set (sbcset, ch);
3485 }
3486 }
3487 /* Check whether the array has enough space. */
3488 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3489 {
3490 /* Not enough, realloc it. */
3491 /* +1 in case of mbcset->nequiv_classes is 0. */
3492 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3493 /* Use realloc since the array is NULL if *alloc == 0. */
3494 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3495 int32_t,
3496 new_equiv_class_alloc);
3497 if (BE (new_equiv_classes == NULL, 0))
3498 return REG_ESPACE;
3499 mbcset->equiv_classes = new_equiv_classes;
3500 *equiv_class_alloc = new_equiv_class_alloc;
3501 }
3502 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3503 }
3504 else
3505#endif /* _LIBC */
3506 {
3507 if (BE (strlen ((const char *) name) != 1, 0))
3508 return REG_ECOLLATE;
3509 bitset_set (sbcset, *name);
3510 }
3511 return REG_NOERROR;
3512}
3513
3514 /* Helper function for parse_bracket_exp.
3515 Build the character class which is represented by NAME.
3516 The result are written to MBCSET and SBCSET.
3517 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3518 is a pointer argument since we may update it. */
3519
3520static reg_errcode_t
3521#ifdef RE_ENABLE_I18N
3522build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3523 re_charset_t *mbcset, int *char_class_alloc,
3524 const char *class_name, reg_syntax_t syntax)
3525#else /* not RE_ENABLE_I18N */
3526build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3527 const char *class_name, reg_syntax_t syntax)
3528#endif /* not RE_ENABLE_I18N */
3529{
3530 int i;
3531
3532 /* In case of REG_ICASE "upper" and "lower" match the both of
3533 upper and lower cases. */
3534 if ((syntax & RE_ICASE)
3535 && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3536 class_name = "alpha";
3537
3538#ifdef RE_ENABLE_I18N
3539 /* Check the space of the arrays. */
3540 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3541 {
3542 /* Not enough, realloc it. */
3543 /* +1 in case of mbcset->nchar_classes is 0. */
3544 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3545 /* Use realloc since array is NULL if *alloc == 0. */
3546 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3547 new_char_class_alloc);
3548 if (BE (new_char_classes == NULL, 0))
3549 return REG_ESPACE;
3550 mbcset->char_classes = new_char_classes;
3551 *char_class_alloc = new_char_class_alloc;
3552 }
3553 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3554#endif /* RE_ENABLE_I18N */
3555
3556#define BUILD_CHARCLASS_LOOP(ctype_func) \
3557 do { \
3558 if (BE (trans != NULL, 0)) \
3559 { \
3560 for (i = 0; i < SBC_MAX; ++i) \
3561 if (ctype_func (i)) \
3562 bitset_set (sbcset, trans[i]); \
3563 } \
3564 else \
3565 { \
3566 for (i = 0; i < SBC_MAX; ++i) \
3567 if (ctype_func (i)) \
3568 bitset_set (sbcset, i); \
3569 } \
3570 } while (0)
3571
3572 if (strcmp (class_name, "alnum") == 0)
3573 BUILD_CHARCLASS_LOOP (isalnum);
3574 else if (strcmp (class_name, "cntrl") == 0)
3575 BUILD_CHARCLASS_LOOP (iscntrl);
3576 else if (strcmp (class_name, "lower") == 0)
3577 BUILD_CHARCLASS_LOOP (islower);
3578 else if (strcmp (class_name, "space") == 0)
3579 BUILD_CHARCLASS_LOOP (isspace);
3580 else if (strcmp (class_name, "alpha") == 0)
3581 BUILD_CHARCLASS_LOOP (isalpha);
3582 else if (strcmp (class_name, "digit") == 0)
3583 BUILD_CHARCLASS_LOOP (isdigit);
3584 else if (strcmp (class_name, "print") == 0)
3585 BUILD_CHARCLASS_LOOP (isprint);
3586 else if (strcmp (class_name, "upper") == 0)
3587 BUILD_CHARCLASS_LOOP (isupper);
3588 else if (strcmp (class_name, "blank") == 0)
3589#ifndef GAWK
3590 BUILD_CHARCLASS_LOOP (isblank);
3591#else
3592 /* see comments above */
3593 BUILD_CHARCLASS_LOOP (is_blank);
3594#endif
3595 else if (strcmp (class_name, "graph") == 0)
3596 BUILD_CHARCLASS_LOOP (isgraph);
3597 else if (strcmp (class_name, "punct") == 0)
3598 BUILD_CHARCLASS_LOOP (ispunct);
3599 else if (strcmp (class_name, "xdigit") == 0)
3600 BUILD_CHARCLASS_LOOP (isxdigit);
3601 else
3602 return REG_ECTYPE;
3603
3604 return REG_NOERROR;
3605}
3606
3607static bin_tree_t *
3608build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3609 const char *class_name,
3610 const char *extra, int non_match,
3611 reg_errcode_t *err)
3612{
3613 re_bitset_ptr_t sbcset;
3614#ifdef RE_ENABLE_I18N
3615 re_charset_t *mbcset;
3616 int alloc = 0;
3617#endif /* not RE_ENABLE_I18N */
3618 reg_errcode_t ret;
3619 re_token_t br_token;
3620 bin_tree_t *tree;
3621
3622 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3623#ifdef RE_ENABLE_I18N
3624 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3625#endif /* RE_ENABLE_I18N */
3626
3627#ifdef RE_ENABLE_I18N
3628 if (BE (sbcset == NULL || mbcset == NULL, 0))
3629#else /* not RE_ENABLE_I18N */
3630 if (BE (sbcset == NULL, 0))
3631#endif /* not RE_ENABLE_I18N */
3632 {
3633 *err = REG_ESPACE;
3634 return NULL;
3635 }
3636
3637 if (non_match)
3638 {
3639#ifdef RE_ENABLE_I18N
3640 mbcset->non_match = 1;
3641#endif /* not RE_ENABLE_I18N */
3642 }
3643
3644 /* We don't care the syntax in this case. */
3645 ret = build_charclass (trans, sbcset,
3646#ifdef RE_ENABLE_I18N
3647 mbcset, &alloc,
3648#endif /* RE_ENABLE_I18N */
3649 class_name, 0);
3650
3651 if (BE (ret != REG_NOERROR, 0))
3652 {
3653 re_free (sbcset);
3654#ifdef RE_ENABLE_I18N
3655 free_charset (mbcset);
3656#endif /* RE_ENABLE_I18N */
3657 *err = ret;
3658 return NULL;
3659 }
3660 /* \w match '_' also. */
3661 for (; *extra; extra++)
3662 bitset_set (sbcset, *extra);
3663
3664 /* If it is non-matching list. */
3665 if (non_match)
3666 bitset_not (sbcset);
3667
3668#ifdef RE_ENABLE_I18N
3669 /* Ensure only single byte characters are set. */
3670 if (dfa->mb_cur_max > 1)
3671 bitset_mask (sbcset, dfa->sb_char);
3672#endif
3673
3674 /* Build a tree for simple bracket. */
3675 br_token.type = SIMPLE_BRACKET;
3676 br_token.opr.sbcset = sbcset;
3677 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3678 if (BE (tree == NULL, 0))
3679 goto build_word_op_espace;
3680
3681#ifdef RE_ENABLE_I18N
3682 if (dfa->mb_cur_max > 1)
3683 {
3684 bin_tree_t *mbc_tree;
3685 /* Build a tree for complex bracket. */
3686 br_token.type = COMPLEX_BRACKET;
3687 br_token.opr.mbcset = mbcset;
3688 dfa->has_mb_node = 1;
3689 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3690 if (BE (mbc_tree == NULL, 0))
3691 goto build_word_op_espace;
3692 /* Then join them by ALT node. */
3693 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3694 if (BE (mbc_tree != NULL, 1))
3695 return tree;
3696 }
3697 else
3698 {
3699 free_charset (mbcset);
3700 return tree;
3701 }
3702#else /* not RE_ENABLE_I18N */
3703 return tree;
3704#endif /* not RE_ENABLE_I18N */
3705
3706 build_word_op_espace:
3707 re_free (sbcset);
3708#ifdef RE_ENABLE_I18N
3709 free_charset (mbcset);
3710#endif /* RE_ENABLE_I18N */
3711 *err = REG_ESPACE;
3712 return NULL;
3713}
3714
3715/* This is intended for the expressions like "a{1,3}".
3716 Fetch a number from `input', and return the number.
3717 Return -1, if the number field is empty like "{,1}".
3718 Return -2, if an error has occurred. */
3719
3720static int
3721fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3722{
3723 int num = -1;
3724 unsigned char c;
3725 while (1)
3726 {
3727 fetch_token (token, input, syntax);
3728 c = token->opr.c;
3729 if (BE (token->type == END_OF_RE, 0))
3730 return -2;
3731 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3732 break;
3733 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3734 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3735 num = (num > RE_DUP_MAX) ? -2 : num;
3736 }
3737 return num;
3738}
3739
#ifdef RE_ENABLE_I18N
/* Release CSET together with the arrays it points to.  The collation
   and range arrays are only released in the _LIBC build.  */
static void
free_charset (re_charset_t *cset)
{
# ifdef _LIBC
  re_free (cset->range_ends);
  re_free (cset->range_starts);
  re_free (cset->equiv_classes);
  re_free (cset->coll_syms);
# endif
  re_free (cset->char_classes);
  re_free (cset->mbchars);
  re_free (cset);
}
#endif /* RE_ENABLE_I18N */
3755
3756/* Functions for binary tree operation. */
3757
3758/* Create a tree node. */
3759
3760static bin_tree_t *
3761create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3762 re_token_type_t type)
3763{
3764 re_token_t t;
3765 t.type = type;
3766 return create_token_tree (dfa, left, right, &t);
3767}
3768
3769static bin_tree_t *
3770create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3771 const re_token_t *token)
3772{
3773 bin_tree_t *tree;
3774 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3775 {
3776 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3777
3778 if (storage == NULL)
3779 return NULL;
3780 storage->next = dfa->str_tree_storage;
3781 dfa->str_tree_storage = storage;
3782 dfa->str_tree_storage_idx = 0;
3783 }
3784 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3785
3786 tree->parent = NULL;
3787 tree->left = left;
3788 tree->right = right;
3789 tree->token = *token;
3790 tree->token.duplicated = 0;
3791 tree->token.opt_subexp = 0;
3792 tree->first = NULL;
3793 tree->next = NULL;
3794 tree->node_idx = -1;
3795
3796 if (left != NULL)
3797 left->parent = tree;
3798 if (right != NULL)
3799 right->parent = tree;
3800 return tree;
3801}
3802
3803/* Mark the tree SRC as an optional subexpression.
3804 To be called from preorder or postorder. */
3805
3806static reg_errcode_t
3807mark_opt_subexp (void *extra, bin_tree_t *node)
3808{
3809 int idx = (int) (intptr_t) extra;
3810 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3811 node->token.opt_subexp = 1;
3812
3813 return REG_NOERROR;
3814}
3815
3816/* Free the allocated memory inside NODE. */
3817
3818static void
3819free_token (re_token_t *node)
3820{
3821#ifdef RE_ENABLE_I18N
3822 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3823 free_charset (node->opr.mbcset);
3824 else
3825#endif /* RE_ENABLE_I18N */
3826 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3827 re_free (node->opr.sbcset);
3828}
3829
3830/* Worker function for tree walking. Free the allocated memory inside NODE
3831 and its children. */
3832
3833static reg_errcode_t
3834free_tree (void *extra, bin_tree_t *node)
3835{
3836 free_token (&node->token);
3837 return REG_NOERROR;
3838}
3839
3840
3841/* Duplicate the node SRC, and return new node. This is a preorder
3842 visit similar to the one implemented by the generic visitor, but
3843 we need more infrastructure to maintain two parallel trees --- so,
3844 it's easier to duplicate. */
3845
3846static bin_tree_t *
3847duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3848{
3849 const bin_tree_t *node;
3850 bin_tree_t *dup_root;
3851 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3852
3853 for (node = root; ; )
3854 {
3855 /* Create a new tree and link it back to the current parent. */
3856 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3857 if (*p_new == NULL)
3858 return NULL;
3859 (*p_new)->parent = dup_node;
3860 (*p_new)->token.duplicated = 1;
3861 dup_node = *p_new;
3862
3863 /* Go to the left node, or up and to the right. */
3864 if (node->left)
3865 {
3866 node = node->left;
3867 p_new = &dup_node->left;
3868 }
3869 else
3870 {
3871 const bin_tree_t *prev = NULL;
3872 while (node->right == prev || node->right == NULL)
3873 {
3874 prev = node;
3875 node = node->parent;
3876 dup_node = dup_node->parent;
3877 if (!node)
3878 return dup_root;
3879 }
3880 node = node->right;
3881 p_new = &dup_node->right;
3882 }
3883 }
3884}
diff --git a/win32/regex.c b/win32/regex.c
index 2cca16934..95e5d757a 100644
--- a/win32/regex.c
+++ b/win32/regex.c
@@ -1,4929 +1,90 @@
1/* Extended regular expression matching and search library, 1/* Extended regular expression matching and search library.
2 version 0.12. 2 Copyright (C) 2002, 2003, 2005 Free Software Foundation, Inc.
3 (Implements POSIX draft P10003.2/D11.2, except for 3 This file is part of the GNU C Library.
4 internationalization features.) 4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 5
6 Copyright (C) 1993 Free Software Foundation, Inc. 6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
7 10
8 This program is free software; you can redistribute it and/or modify 11 The GNU C Library is distributed in the hope that it will be useful,
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 GNU General Public License for more details. 14 Lesser General Public License for more details.
17 15
18 You should have received a copy of the GNU General Public License 16 You should have received a copy of the GNU Lesser General Public
19 along with this program; if not, write to the Free Software 17 License along with the GNU C Library; if not, write to the Free
20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ 18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 19 02110-1301 USA. */
22/* AIX requires this to be the first thing in the file. */ 20
23#if defined (_AIX) && !defined (REGEX_MALLOC) 21#define HAVE_LIBINTL_H 0
24 #pragma alloca 22#define ENABLE_NLS 0
25#endif 23#define HAVE_ALLOCA 0
26 24#define NO_MBSUPPORT 1
27#ifndef _GNU_SOURCE 25#define GAWK 1
28#define _GNU_SOURCE 26
29#endif 27/* Make sure no one compiles this code with a C++ compiler. */
30 28#ifdef __cplusplus
31/* We need this for `regex.h', and perhaps for the Emacs include files. */ 29# error "This is C code, use a C compiler"
32#include <sys/types.h> 30#endif
33 31
34/* We used to test for `BSTRING' here, but only GCC and Emacs define 32#ifdef _LIBC
35 `BSTRING', as far as I know, and neither of them use this code. */ 33/* We have to keep the namespace clean. */
36#include <string.h> 34# define regfree(preg) __regfree (preg)
37#ifndef bcmp 35# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
38#define bcmp(s1, s2, n) memcmp ((s1), (s2), (n)) 36# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
39#endif 37# define regerror(errcode, preg, errbuf, errbuf_size) \
40#ifndef bcopy 38 __regerror(errcode, preg, errbuf, errbuf_size)
41#define bcopy(s, d, n) memcpy ((d), (s), (n)) 39# define re_set_registers(bu, re, nu, st, en) \
42#endif 40 __re_set_registers (bu, re, nu, st, en)
43#ifndef bzero 41# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
44#define bzero(s, n) memset ((s), 0, (n)) 42 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
45#endif 43# define re_match(bufp, string, size, pos, regs) \
46 44 __re_match (bufp, string, size, pos, regs)
47#include <stdlib.h> 45# define re_search(bufp, string, size, startpos, range, regs) \
48 46 __re_search (bufp, string, size, startpos, range, regs)
49 47# define re_compile_pattern(pattern, length, bufp) \
50/* Define the syntax stuff for \<, \>, etc. */ 48 __re_compile_pattern (pattern, length, bufp)
51 49# define re_set_syntax(syntax) __re_set_syntax (syntax)
52/* This must be nonzero for the wordchar and notwordchar pattern 50# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
53 commands in re_match_2. */ 51 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
54#ifndef Sword 52# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
55#define Sword 1 53
56#endif 54# include "../locale/localeinfo.h"
57 55#endif
58#ifdef SYNTAX_TABLE 56
59 57#if defined (_MSC_VER)
60extern char *re_syntax_table; 58#include <stdio.h> /* for size_t */
61 59#endif
62#else /* not SYNTAX_TABLE */ 60
63 61/* On some systems, limits.h sets RE_DUP_MAX to a lower value than
64/* How many characters in the character set. */ 62 GNU regex allows. Include it before <regex.h>, which correctly
65#define CHAR_SET_SIZE 256 63 #undefs RE_DUP_MAX and sets it to the right value. */
66 64#include <limits.h>
67static char re_syntax_table[CHAR_SET_SIZE]; 65#include <stdint.h>
68 66
69static void 67#ifdef GAWK
70init_syntax_once () 68#undef alloca
71{ 69#define alloca alloca_is_bad_you_should_never_use_it
72 register int c; 70#endif
73 static int done = 0; 71#include <regex.h>
74 72#include "regex_internal.h"
75 if (done) 73
76 return; 74#include "regex_internal.c"
77 75#ifdef GAWK
78 bzero (re_syntax_table, sizeof re_syntax_table); 76#define bool int
79 77#define true (1)
80 for (c = 'a'; c <= 'z'; c++) 78#define false (0)
81 re_syntax_table[c] = Sword; 79#endif
82 80#include "regcomp.c"
83 for (c = 'A'; c <= 'Z'; c++) 81#include "regexec.c"
84 re_syntax_table[c] = Sword; 82
85 83/* Binary backward compatibility. */
86 for (c = '0'; c <= '9'; c++) 84#if _LIBC
87 re_syntax_table[c] = Sword; 85# include <shlib-compat.h>
88 86# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3)
89 re_syntax_table['_'] = Sword; 87link_warning (re_max_failures, "the 're_max_failures' variable is obsolete and will go away.")
90
91 done = 1;
92}
93
94#endif /* not SYNTAX_TABLE */
95
96#define SYNTAX(c) re_syntax_table[c]
97
98
99/* Get the interface, including the syntax bits. */
100#include "regex.h"
101
102/* isalpha etc. are used for the character classes. */
103#include <ctype.h>
104
105#ifndef isascii
106#define isascii(c) 1
107#endif
108
109#ifdef isblank
110#define ISBLANK(c) (isascii (c) && isblank (c))
111#else
112#define ISBLANK(c) ((c) == ' ' || (c) == '\t')
113#endif
114#ifdef isgraph
115#define ISGRAPH(c) (isascii (c) && isgraph (c))
116#else
117#define ISGRAPH(c) (isascii (c) && isprint (c) && !isspace (c))
118#endif
119
120#define ISPRINT(c) (isascii (c) && isprint (c))
121#define ISDIGIT(c) (isascii (c) && isdigit (c))
122#define ISALNUM(c) (isascii (c) && isalnum (c))
123#define ISALPHA(c) (isascii (c) && isalpha (c))
124#define ISCNTRL(c) (isascii (c) && iscntrl (c))
125#define ISLOWER(c) (isascii (c) && islower (c))
126#define ISPUNCT(c) (isascii (c) && ispunct (c))
127#define ISSPACE(c) (isascii (c) && isspace (c))
128#define ISUPPER(c) (isascii (c) && isupper (c))
129#define ISXDIGIT(c) (isascii (c) && isxdigit (c))
130
131#ifndef NULL
132#define NULL 0
133#endif
134
135/* We remove any previous definition of `SIGN_EXTEND_CHAR',
136 since ours (we hope) works properly with all combinations of
137 machines, compilers, `char' and `unsigned char' argument types.
138 (Per Bothner suggested the basic approach.) */
139#undef SIGN_EXTEND_CHAR
140#if __STDC__
141#define SIGN_EXTEND_CHAR(c) ((signed char) (c))
142#else /* not __STDC__ */
143/* As in Harbison and Steele. */
144#define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
145#endif
146
147/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
148 use `alloca' instead of `malloc'. This is because using malloc in
149 re_search* or re_match* could cause memory leaks when C-g is used in
150 Emacs; also, malloc is slower and causes storage fragmentation. On
151 the other hand, malloc is more portable, and easier to debug.
152
153 Because we sometimes use alloca, some routines have to be macros,
154 not functions -- `alloca'-allocated space disappears at the end of the
155 function it is called in. */
156
157#ifdef REGEX_MALLOC
158
159#define REGEX_ALLOCATE malloc
160#define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
161
162#else /* not REGEX_MALLOC */
163
164/* Emacs already defines alloca, sometimes. */
165#ifndef alloca
166
167/* Make alloca work the best possible way. */
168#ifdef __GNUC__
169#define alloca __builtin_alloca
170#else /* not __GNUC__ */
171#if HAVE_ALLOCA_H
172#include <alloca.h>
173#else /* not __GNUC__ or HAVE_ALLOCA_H */
174#ifndef _AIX /* Already did AIX, up at the top. */
175char *alloca ();
176#endif /* not _AIX */
177#endif /* not HAVE_ALLOCA_H */
178#endif /* not __GNUC__ */
179
180#endif /* not alloca */
181
182#define REGEX_ALLOCATE alloca
183
184/* Assumes a `char *destination' variable. */
185#define REGEX_REALLOCATE(source, osize, nsize) \
186 (destination = (char *) alloca (nsize), \
187 bcopy (source, destination, osize), \
188 destination)
189
190#endif /* not REGEX_MALLOC */
191
192
193/* True if `size1' is non-NULL and PTR is pointing anywhere inside
194 `string1' or just past its end. This works if PTR is NULL, which is
195 a good thing. */
196#define FIRST_STRING_P(ptr) \
197 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
198
199/* (Re)Allocate N items of type T using malloc, or fail. */
200#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
201#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
202#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
203
204#define BYTEWIDTH 8 /* In bits. */
205
206#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
207
208#define MAX(a, b) ((a) > (b) ? (a) : (b))
209#define MIN(a, b) ((a) < (b) ? (a) : (b))
210
211typedef char boolean;
212#define false 0
213#define true 1
214
215/* These are the command codes that appear in compiled regular
216 expressions. Some opcodes are followed by argument bytes. A
217 command code can specify any interpretation whatsoever for its
218 arguments. Zero bytes may appear in the compiled regular expression.
219
220 The value of `exactn' is needed in search.c (search_buffer) in Emacs.
221 So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of
222 `exactn' we use here must also be 1. */
223
224typedef enum
225{
226 no_op = 0,
227
228 /* Followed by one byte giving n, then by n literal bytes. */
229 exactn = 1,
230
231 /* Matches any (more or less) character. */
232 anychar,
233
234 /* Matches any one char belonging to specified set. First
235 following byte is number of bitmap bytes. Then come bytes
236 for a bitmap saying which chars are in. Bits in each byte
237 are ordered low-bit-first. A character is in the set if its
238 bit is 1. A character too large to have a bit in the map is
239 automatically not in the set. */
240 charset,
241
242 /* Same parameters as charset, but match any character that is
243 not one of those specified. */
244 charset_not,
245
246 /* Start remembering the text that is matched, for storing in a
247 register. Followed by one byte with the register number, in
248 the range 0 to one less than the pattern buffer's re_nsub
249 field. Then followed by one byte with the number of groups
250 inner to this one. (This last has to be part of the
251 start_memory only because we need it in the on_failure_jump
252 of re_match_2.) */
253 start_memory,
254
255 /* Stop remembering the text that is matched and store it in a
256 memory register. Followed by one byte with the register
257 number, in the range 0 to one less than `re_nsub' in the
258 pattern buffer, and one byte with the number of inner groups,
259 just like `start_memory'. (We need the number of inner
260 groups here because we don't have any easy way of finding the
261 corresponding start_memory when we're at a stop_memory.) */
262 stop_memory,
263
264 /* Match a duplicate of something remembered. Followed by one
265 byte containing the register number. */
266 duplicate,
267
268 /* Fail unless at beginning of line. */
269 begline,
270
271 /* Fail unless at end of line. */
272 endline,
273
274 /* Succeeds if at beginning of buffer (if emacs) or at beginning
275 of string to be matched (if not). */
276 begbuf,
277
278 /* Analogously, for end of buffer/string. */
279 endbuf,
280
281 /* Followed by two byte relative address to which to jump. */
282 jump,
283
284 /* Same as jump, but marks the end of an alternative. */
285 jump_past_alt,
286
287 /* Followed by two-byte relative address of place to resume at
288 in case of failure. */
289 on_failure_jump,
290
291 /* Like on_failure_jump, but pushes a placeholder instead of the
292 current string position when executed. */
293 on_failure_keep_string_jump,
294
295 /* Throw away latest failure point and then jump to following
296 two-byte relative address. */
297 pop_failure_jump,
298
299 /* Change to pop_failure_jump if know won't have to backtrack to
300 match; otherwise change to jump. This is used to jump
301 back to the beginning of a repeat. If what follows this jump
302 clearly won't match what the repeat does, such that we can be
303 sure that there is no use backtracking out of repetitions
304 already matched, then we change it to a pop_failure_jump.
305 Followed by two-byte address. */
306 maybe_pop_jump,
307
308 /* Jump to following two-byte address, and push a dummy failure
309 point. This failure point will be thrown away if an attempt
310 is made to use it for a failure. A `+' construct makes this
311 before the first repeat. Also used as an intermediary kind
312 of jump when compiling an alternative. */
313 dummy_failure_jump,
314
315 /* Push a dummy failure point and continue. Used at the end of
316 alternatives. */
317 push_dummy_failure,
318
319 /* Followed by two-byte relative address and two-byte number n.
320 After matching N times, jump to the address upon failure. */
321 succeed_n,
322
323 /* Followed by two-byte relative address, and two-byte number n.
324 Jump to the address N times, then fail. */
325 jump_n,
326
327 /* Set the following two-byte relative address to the
328 subsequent two-byte number. The address *includes* the two
329 bytes of number. */
330 set_number_at,
331
332 wordchar, /* Matches any word-constituent character. */
333 notwordchar, /* Matches any char that is not a word-constituent. */
334
335 wordbeg, /* Succeeds if at word beginning. */
336 wordend, /* Succeeds if at word end. */
337
338 wordbound, /* Succeeds if at a word boundary. */
339 notwordbound /* Succeeds if not at a word boundary. */
340
341#ifdef emacs
342 ,before_dot, /* Succeeds if before point. */
343 at_dot, /* Succeeds if at point. */
344 after_dot, /* Succeeds if after point. */
345
346 /* Matches any character whose syntax is specified. Followed by
347 a byte which contains a syntax code, e.g., Sword. */
348 syntaxspec,
349
350 /* Matches any character whose syntax is not that specified. */
351 notsyntaxspec
352#endif /* emacs */
353} re_opcode_t;
354
355/* Common operations on the compiled pattern. */
356
357/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
358
359#define STORE_NUMBER(destination, number) \
360 do { \
361 (destination)[0] = (number) & 0377; \
362 (destination)[1] = (number) >> 8; \
363 } while (0)
364
365/* Same as STORE_NUMBER, except increment DESTINATION to
366 the byte after where the number is stored. Therefore, DESTINATION
367 must be an lvalue. */
368
369#define STORE_NUMBER_AND_INCR(destination, number) \
370 do { \
371 STORE_NUMBER (destination, number); \
372 (destination) += 2; \
373 } while (0)
374
/* Put into DESTINATION the number stored in two contiguous bytes starting
   at SOURCE: the low byte first, then a sign-extended high byte.  The
   high byte is scaled by multiplication rather than `<< 8', because
   left-shifting a negative value is undefined behavior in C.  */

#define EXTRACT_NUMBER(destination, source)                             \
  do {                                                                  \
    (destination) = *(source) & 0377;                                   \
    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) * (1 << 8);     \
  } while (0)
383
384#ifdef DEBUG
/* Function form of EXTRACT_NUMBER, used when debugging: store in *DEST
   the number held in the two bytes at SOURCE (low byte first, high byte
   sign-extended).  The high byte is scaled by multiplication because
   left-shifting a negative value is undefined behavior in C.  */
static void
extract_number (dest, source)
    int *dest;
    unsigned char *source;
{
  int high = SIGN_EXTEND_CHAR (*(source + 1));

  *dest = (*source & 0377) + high * (1 << 8);
}
394
395#ifndef EXTRACT_MACROS /* To debug the macros. */
396#undef EXTRACT_NUMBER
397#define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
398#endif /* not EXTRACT_MACROS */
399
400#endif /* DEBUG */
401
402/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
403 SOURCE must be an lvalue. */
404
405#define EXTRACT_NUMBER_AND_INCR(destination, source) \
406 do { \
407 EXTRACT_NUMBER (destination, source); \
408 (source) += 2; \
409 } while (0)
410
411#ifdef DEBUG
/* Function form of EXTRACT_NUMBER_AND_INCR: read the two-byte number at
   *SOURCE into *DESTINATION, then advance *SOURCE past those bytes.  */
static void
extract_number_and_incr (destination, source)
    int *destination;
    unsigned char **source;
{
  extract_number (destination, source[0]);
  source[0] = source[0] + 2;
}
420
421#ifndef EXTRACT_MACROS
422#undef EXTRACT_NUMBER_AND_INCR
423#define EXTRACT_NUMBER_AND_INCR(dest, src) \
424 extract_number_and_incr (&dest, &src)
425#endif /* not EXTRACT_MACROS */
426
427#endif /* DEBUG */
428
429/* If DEBUG is defined, Regex prints many voluminous messages about what
430 it is doing (if the variable `debug' is nonzero). If linked with the
431 main program in `iregex.c', you can enter patterns and strings
432 interactively. And if linked with the main program in `main.c' and
433 the other test files, you can run the already-written tests. */
434
435#ifdef DEBUG
436
437/* We use standard I/O for debugging. */
438#include <stdio.h>
439
440/* It is useful to test things that ``must'' be true when debugging. */
441#include <assert.h>
442
443static int debug = 0;
444
445#define DEBUG_STATEMENT(e) e
446#define DEBUG_PRINT1(x) if (debug) printf (x)
447#define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
448#define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
449#define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
450#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
451 if (debug) print_partial_compiled_pattern (s, e)
452#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
453 if (debug) print_double_string (w, s1, sz1, s2, sz2)
454
455
456extern void printchar ();
457
458/* Print the fastmap in human-readable form. */
459
460void
461print_fastmap (fastmap)
462 char *fastmap;
463{
464 unsigned was_a_range = 0;
465 unsigned i = 0;
466
467 while (i < (1 << BYTEWIDTH))
468 {
469 if (fastmap[i++])
470 {
471 was_a_range = 0;
472 printchar (i - 1);
473 while (i < (1 << BYTEWIDTH) && fastmap[i])
474 {
475 was_a_range = 1;
476 i++;
477 }
478 if (was_a_range)
479 {
480 printf ("-");
481 printchar (i - 1);
482 }
483 }
484 }
485 putchar ('\n');
486}
487
488
489/* Print a compiled pattern string in human-readable form, starting at
490 the START pointer into it and ending just before the pointer END. */
491
492void
493print_partial_compiled_pattern (start, end)
494 unsigned char *start;
495 unsigned char *end;
496{
497 int mcnt, mcnt2;
498 unsigned char *p = start;
499 unsigned char *pend = end;
500
501 if (start == NULL)
502 {
503 printf ("(null)\n");
504 return;
505 }
506
507 /* Loop over pattern commands. */
508 while (p < pend)
509 {
510 switch ((re_opcode_t) *p++)
511 {
512 case no_op:
513 printf ("/no_op");
514 break;
515
516 case exactn:
517 mcnt = *p++;
518 printf ("/exactn/%d", mcnt);
519 do
520 {
521 putchar ('/');
522 printchar (*p++);
523 }
524 while (--mcnt);
525 break;
526
527 case start_memory:
528 mcnt = *p++;
529 printf ("/start_memory/%d/%d", mcnt, *p++);
530 break;
531
532 case stop_memory:
533 mcnt = *p++;
534 printf ("/stop_memory/%d/%d", mcnt, *p++);
535 break;
536
537 case duplicate:
538 printf ("/duplicate/%d", *p++);
539 break;
540
541 case anychar:
542 printf ("/anychar");
543 break;
544
545 case charset:
546 case charset_not:
547 {
548 register int c;
549
550 printf ("/charset%s",
551 (re_opcode_t) *(p - 1) == charset_not ? "_not" : "");
552
553 assert (p + *p < pend);
554
555 for (c = 0; c < *p; c++)
556 {
557 unsigned bit;
558 unsigned char map_byte = p[1 + c];
559
560 putchar ('/');
561
562 for (bit = 0; bit < BYTEWIDTH; bit++)
563 if (map_byte & (1 << bit))
564 printchar (c * BYTEWIDTH + bit);
565 }
566 p += 1 + *p;
567 break;
568 }
569
570 case begline:
571 printf ("/begline");
572 break;
573
574 case endline:
575 printf ("/endline");
576 break;
577
578 case on_failure_jump:
579 extract_number_and_incr (&mcnt, &p);
580 printf ("/on_failure_jump/0/%d", mcnt);
581 break;
582
583 case on_failure_keep_string_jump:
584 extract_number_and_incr (&mcnt, &p);
585 printf ("/on_failure_keep_string_jump/0/%d", mcnt);
586 break;
587
588 case dummy_failure_jump:
589 extract_number_and_incr (&mcnt, &p);
590 printf ("/dummy_failure_jump/0/%d", mcnt);
591 break;
592
593 case push_dummy_failure:
594 printf ("/push_dummy_failure");
595 break;
596
597 case maybe_pop_jump:
598 extract_number_and_incr (&mcnt, &p);
599 printf ("/maybe_pop_jump/0/%d", mcnt);
600 break;
601
602 case pop_failure_jump:
603 extract_number_and_incr (&mcnt, &p);
604 printf ("/pop_failure_jump/0/%d", mcnt);
605 break;
606
607 case jump_past_alt:
608 extract_number_and_incr (&mcnt, &p);
609 printf ("/jump_past_alt/0/%d", mcnt);
610 break;
611
612 case jump:
613 extract_number_and_incr (&mcnt, &p);
614 printf ("/jump/0/%d", mcnt);
615 break;
616
617 case succeed_n:
618 extract_number_and_incr (&mcnt, &p);
619 extract_number_and_incr (&mcnt2, &p);
620 printf ("/succeed_n/0/%d/0/%d", mcnt, mcnt2);
621 break;
622
623 case jump_n:
624 extract_number_and_incr (&mcnt, &p);
625 extract_number_and_incr (&mcnt2, &p);
626 printf ("/jump_n/0/%d/0/%d", mcnt, mcnt2);
627 break;
628
629 case set_number_at:
630 extract_number_and_incr (&mcnt, &p);
631 extract_number_and_incr (&mcnt2, &p);
632 printf ("/set_number_at/0/%d/0/%d", mcnt, mcnt2);
633 break;
634
635 case wordbound:
636 printf ("/wordbound");
637 break;
638
639 case notwordbound:
640 printf ("/notwordbound");
641 break;
642
643 case wordbeg:
644 printf ("/wordbeg");
645 break;
646
647 case wordend:
648 printf ("/wordend");
649
650#ifdef emacs
651 case before_dot:
652 printf ("/before_dot");
653 break;
654
655 case at_dot:
656 printf ("/at_dot");
657 break;
658
659 case after_dot:
660 printf ("/after_dot");
661 break;
662
663 case syntaxspec:
664 printf ("/syntaxspec");
665 mcnt = *p++;
666 printf ("/%d", mcnt);
667 break;
668
669 case notsyntaxspec:
670 printf ("/notsyntaxspec");
671 mcnt = *p++;
672 printf ("/%d", mcnt);
673 break;
674#endif /* emacs */
675
676 case wordchar:
677 printf ("/wordchar");
678 break;
679
680 case notwordchar:
681 printf ("/notwordchar");
682 break;
683
684 case begbuf:
685 printf ("/begbuf");
686 break;
687
688 case endbuf:
689 printf ("/endbuf");
690 break;
691
692 default:
693 printf ("?%d", *(p-1));
694 }
695 }
696 printf ("/\n");
697}
698
699
700void
701print_compiled_pattern (bufp)
702 struct re_pattern_buffer *bufp;
703{
704 unsigned char *buffer = bufp->buffer;
705
706 print_partial_compiled_pattern (buffer, buffer + bufp->used);
707 printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated);
708
709 if (bufp->fastmap_accurate && bufp->fastmap)
710 {
711 printf ("fastmap: ");
712 print_fastmap (bufp->fastmap);
713 }
714
715 printf ("re_nsub: %d\t", bufp->re_nsub);
716 printf ("regs_alloc: %d\t", bufp->regs_allocated);
717 printf ("can_be_null: %d\t", bufp->can_be_null);
718 printf ("newline_anchor: %d\n", bufp->newline_anchor);
719 printf ("no_sub: %d\t", bufp->no_sub);
720 printf ("not_bol: %d\t", bufp->not_bol);
721 printf ("not_eol: %d\t", bufp->not_eol);
722 printf ("syntax: %d\n", bufp->syntax);
723 /* Perhaps we should print the translate table? */
724}
725
726
/* Print the text from WHERE to the end of the two-part string made of
   STRING1 (SIZE1 bytes) followed by STRING2 (SIZE2 bytes).  When WHERE
   lies in the first part, the rest of that part is printed and then all
   of the second part.  A NULL WHERE prints "(null)".  */
void
print_double_string (where, string1, size1, string2, size2)
    const char *where;
    const char *string1;
    const char *string2;
    int size1;
    int size2;
{
  unsigned idx;

  if (where == NULL)
    {
      printf ("(null)");
      return;
    }

  if (FIRST_STRING_P (where))
    {
      idx = where - string1;
      while (idx < size1)
        {
          printchar (string1[idx]);
          idx++;
        }

      /* Continue from the start of the second part.  */
      where = string2;
    }

  idx = where - string2;
  while (idx < size2)
    {
      printchar (string2[idx]);
      idx++;
    }
}
753
754#else /* not DEBUG */
755
756#undef assert
757#define assert(e)
758
759#define DEBUG_STATEMENT(e)
760#define DEBUG_PRINT1(x)
761#define DEBUG_PRINT2(x1, x2)
762#define DEBUG_PRINT3(x1, x2, x3)
763#define DEBUG_PRINT4(x1, x2, x3, x4)
764#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
765#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
766
767#endif /* not DEBUG */
768
769/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
770 also be assigned to arbitrarily: each pattern buffer stores its own
771 syntax, so it can be changed between regex compilations. */
772reg_syntax_t re_syntax_options = RE_SYNTAX_EMACS;
773
774
775/* Specify the precise syntax of regexps for compilation. This provides
776 for compatibility for various utilities which historically have
777 different, incompatible syntaxes.
778
779 The argument SYNTAX is a bit mask comprised of the various bits
780 defined in regex.h. We return the old syntax. */
781
782reg_syntax_t
783re_set_syntax (syntax)
784 reg_syntax_t syntax;
785{
786 reg_syntax_t ret = re_syntax_options;
787
788 re_syntax_options = syntax;
789 return ret;
790}
791
792/* This table gives an error message for each of the error codes listed
793 in regex.h. Obviously the order here has to be same as there. */
794
795static const char *re_error_msg[] =
796 { NULL, /* REG_NOERROR */
797 "No match", /* REG_NOMATCH */
798 "Invalid regular expression", /* REG_BADPAT */
799 "Invalid collation character", /* REG_ECOLLATE */
800 "Invalid character class name", /* REG_ECTYPE */
801 "Trailing backslash", /* REG_EESCAPE */
802 "Invalid back reference", /* REG_ESUBREG */
803 "Unmatched [ or [^", /* REG_EBRACK */
804 "Unmatched ( or \\(", /* REG_EPAREN */
805 "Unmatched \\{", /* REG_EBRACE */
806 "Invalid content of \\{\\}", /* REG_BADBR */
807 "Invalid range end", /* REG_ERANGE */
808 "Memory exhausted", /* REG_ESPACE */
809 "Invalid preceding regular expression", /* REG_BADRPT */
810 "Premature end of regular expression", /* REG_EEND */
811 "Regular expression too big", /* REG_ESIZE */
812 "Unmatched ) or \\)", /* REG_ERPAREN */
813 };
814
815/* Subroutine declarations and macros for regex_compile. */
816
817static void store_op1 (), store_op2 ();
818static void insert_op1 (), insert_op2 ();
819static boolean at_begline_loc_p (), at_endline_loc_p ();
820static boolean group_in_compile_stack ();
821static reg_errcode_t compile_range ();
822
823/* Fetch the next character in the uncompiled pattern---translating it
824 if necessary. Also cast from a signed character in the constant
825 string passed to us by the user to an unsigned char that we can use
826 as an array index (in, e.g., `translate'). */
827#define PATFETCH(c) \
828 do {if (p == pend) return REG_EEND; \
829 c = (unsigned char) *p++; \
830 if (translate) c = translate[c]; \
831 } while (0)
832
833/* Fetch the next character in the uncompiled pattern, with no
834 translation. */
835#define PATFETCH_RAW(c) \
836 do {if (p == pend) return REG_EEND; \
837 c = (unsigned char) *p++; \
838 } while (0)
839
840/* Go backwards one character in the pattern. */
841#define PATUNFETCH p--
842
843
844/* If `translate' is non-null, return translate[D], else just D. We
845 cast the subscript to translate because some data is declared as
846 `char *', to avoid warnings when a string constant is passed. But
847 when we use a character as a subscript we must make it unsigned. */
848#define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d))
849
850
851/* Macros for outputting the compiled pattern into `buffer'. */
852
853/* If the buffer isn't allocated when it comes in, use this. */
854#define INIT_BUF_SIZE 32
855
856/* Make sure we have at least N more bytes of space in buffer. */
857#define GET_BUFFER_SPACE(n) \
858 while (b - bufp->buffer + (n) > bufp->allocated) \
859 EXTEND_BUFFER ()
860
861/* Make sure we have one more byte of buffer space and then add C to it. */
862#define BUF_PUSH(c) \
863 do { \
864 GET_BUFFER_SPACE (1); \
865 *b++ = (unsigned char) (c); \
866 } while (0)
867
868
869/* Ensure we have two more bytes of buffer space and then append C1 and C2. */
870#define BUF_PUSH_2(c1, c2) \
871 do { \
872 GET_BUFFER_SPACE (2); \
873 *b++ = (unsigned char) (c1); \
874 *b++ = (unsigned char) (c2); \
875 } while (0)
876
877
878/* As with BUF_PUSH_2, except for three bytes. */
879#define BUF_PUSH_3(c1, c2, c3) \
880 do { \
881 GET_BUFFER_SPACE (3); \
882 *b++ = (unsigned char) (c1); \
883 *b++ = (unsigned char) (c2); \
884 *b++ = (unsigned char) (c3); \
885 } while (0)
886
887
888/* Store a jump with opcode OP at LOC to location TO. We store a
889 relative address offset by the three bytes the jump itself occupies. */
890#define STORE_JUMP(op, loc, to) \
891 store_op1 (op, loc, (to) - (loc) - 3)
892
893/* Likewise, for a two-argument jump. */
894#define STORE_JUMP2(op, loc, to, arg) \
895 store_op2 (op, loc, (to) - (loc) - 3, arg)
896
897/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
898#define INSERT_JUMP(op, loc, to) \
899 insert_op1 (op, loc, (to) - (loc) - 3, b)
900
901/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
902#define INSERT_JUMP2(op, loc, to, arg) \
903 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
904
905
906/* This is not an arbitrary limit: the arguments which represent offsets
907 into the pattern are two bytes long. So if 2^16 bytes turns out to
908 be too small, many things would have to change. */
909#define MAX_BUF_SIZE (1L << 16)
910
911
/* Double the size of the compiled-pattern buffer (capped at
   MAX_BUF_SIZE) via realloc and re-point `b', `begalt',
   `fixup_alt_jump', `laststart' and `pending_exact' into the new block.

   Makes the enclosing function return REG_ESIZE when the buffer is
   already MAX_BUF_SIZE bytes, and REG_ESPACE when realloc fails.  On
   either failure `bufp->buffer' and `bufp->allocated' are left exactly
   as they were, so the old block is neither leaked nor mis-sized.

   The pointers are saved as offsets before realloc is called, because
   the old address must not be used once the block has been moved.  */
#define EXTEND_BUFFER()                                                 \
  do {                                                                  \
    unsigned char *eb_old = bufp->buffer;                               \
    unsigned char *eb_new;                                              \
    unsigned long eb_size = bufp->allocated;                            \
    long eb_b = b - eb_old;                                             \
    long eb_begalt = begalt - eb_old;                                   \
    long eb_fixup = fixup_alt_jump ? fixup_alt_jump - eb_old : -1;      \
    long eb_last = laststart ? laststart - eb_old : -1;                 \
    long eb_exact = pending_exact ? pending_exact - eb_old : -1;        \
    if (eb_size == MAX_BUF_SIZE)                                        \
      return REG_ESIZE;                                                 \
    eb_size <<= 1;                                                      \
    if (eb_size > MAX_BUF_SIZE)                                         \
      eb_size = MAX_BUF_SIZE;                                           \
    eb_new = (unsigned char *) realloc (eb_old, eb_size);               \
    if (eb_new == NULL)                                                 \
      return REG_ESPACE;                                                \
    /* Commit only now that the new block exists.  */                   \
    bufp->buffer = eb_new;                                              \
    bufp->allocated = eb_size;                                          \
    b = eb_new + eb_b;                                                  \
    begalt = eb_new + eb_begalt;                                        \
    if (fixup_alt_jump)                                                 \
      fixup_alt_jump = eb_new + eb_fixup;                               \
    if (laststart)                                                      \
      laststart = eb_new + eb_last;                                     \
    if (pending_exact)                                                  \
      pending_exact = eb_new + eb_exact;                                \
  } while (0)
940
941
942/* Since we have one byte reserved for the register number argument to
943 {start,stop}_memory, the maximum number of groups we can report
944 things about is what fits in that byte. */
945#define MAX_REGNUM 255
946
947/* But patterns can have more than `MAX_REGNUM' registers. We just
948 ignore the excess. */
949typedef unsigned regnum_t;
950
951
952/* Macros for the compile stack. */
953
954/* Since offsets can go either forwards or backwards, this type needs to
955 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
956typedef int pattern_offset_t;
957
958typedef struct
959{
960 pattern_offset_t begalt_offset;
961 pattern_offset_t fixup_alt_jump;
962 pattern_offset_t inner_group_offset;
963 pattern_offset_t laststart_offset;
964 regnum_t regnum;
965} compile_stack_elt_t;
966
967
968typedef struct
969{
970 compile_stack_elt_t *stack;
971 unsigned size;
972 unsigned avail; /* Offset of next open position. */
973} compile_stack_type;
974
975
976#define INIT_COMPILE_STACK_SIZE 32
977
978#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
979#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
980
981/* The next available element. */
982#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
983
984
/* Set the bit for character C in a list.  The bitmap is addressed
   through the variable `b' of the expanding scope.  C is reduced to an
   `unsigned char' before it is split into a byte index and a bit
   index; both uses of C are parenthesized so that an expression
   argument is converted as a whole.  */
#define SET_LIST_BIT(c)                               \
  (b[((unsigned char) (c)) / BYTEWIDTH]               \
   |= 1 << (((unsigned char) (c)) % BYTEWIDTH))
989
990
/* Get the next unsigned number in the uncompiled pattern.

   Uses `p', `pend' and `c' of the expanding scope.  NUM is left
   untouched when no digit is found, so a caller can preset it to a
   negative value to detect a missing number.  On exit `c' holds the
   last character fetched.

   Once NUM exceeds RE_DUP_MAX the remaining digits are consumed but no
   longer accumulated: the value is already out of range, and stopping
   there keeps a long digit string from overflowing NUM.  */
#define GET_UNSIGNED_NUMBER(num)                                      \
  do                                                                  \
    {                                                                 \
      if (p != pend)                                                  \
        {                                                             \
          PATFETCH (c);                                               \
          while (ISDIGIT (c))                                         \
            {                                                         \
              if ((num) < 0)                                          \
                (num) = 0;                                            \
              if ((num) <= RE_DUP_MAX)                                \
                (num) = (num) * 10 + c - '0';                         \
              if (p == pend)                                          \
                break;                                                \
              PATFETCH (c);                                           \
            }                                                         \
        }                                                             \
    }                                                                 \
  while (0)
1007
/* Length of the longest character class name, namely `xdigit'.  */
#define CHAR_CLASS_MAX_LENGTH  6

/* Nonzero if STRING is exactly one of the twelve character class names
   accepted inside `[:' ... `:]'.  STRING is evaluated several times.  */
#define IS_CHAR_CLASS(string)                                         \
  (   STREQ (string, "alnum")  || STREQ (string, "alpha")             \
   || STREQ (string, "blank")  || STREQ (string, "cntrl")             \
   || STREQ (string, "digit")  || STREQ (string, "graph")             \
   || STREQ (string, "lower")  || STREQ (string, "print")             \
   || STREQ (string, "punct")  || STREQ (string, "space")             \
   || STREQ (string, "upper")  || STREQ (string, "xdigit"))
1017
1018/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1019 Returns one of error codes defined in `regex.h', or zero for success.
1020
1021 Assumes the `allocated' (and perhaps `buffer') and `translate'
1022 fields are set in BUFP on entry.
1023
1024 If it succeeds, results are put in BUFP (if it returns an error, the
1025 contents of BUFP are undefined):
1026 `buffer' is the compiled pattern;
1027 `syntax' is set to SYNTAX;
1028 `used' is set to the length of the compiled pattern;
1029 `fastmap_accurate' is zero;
1030 `re_nsub' is the number of subexpressions in PATTERN;
1031 `not_bol' and `not_eol' are zero;
1032
1033 The `fastmap' and `newline_anchor' fields are neither
1034 examined nor set. */
1035
1036static reg_errcode_t
1037regex_compile (pattern, size, syntax, bufp)
1038 const char *pattern;
1039 int size;
1040 reg_syntax_t syntax;
1041 struct re_pattern_buffer *bufp;
1042{
1043 /* We fetch characters from PATTERN here. Even though PATTERN is
1044 `char *' (i.e., signed), we declare these variables as unsigned, so
1045 they can be reliably used as array indices. */
1046 register unsigned char c, c1;
1047
1048 /* A random tempory spot in PATTERN. */
1049 const char *p1;
1050
1051 /* Points to the end of the buffer, where we should append. */
1052 register unsigned char *b;
1053
1054 /* Keeps track of unclosed groups. */
1055 compile_stack_type compile_stack;
1056
1057 /* Points to the current (ending) position in the pattern. */
1058 const char *p = pattern;
1059 const char *pend = pattern + size;
1060
1061 /* How to translate the characters in the pattern. */
1062 char *translate = bufp->translate;
1063
1064 /* Address of the count-byte of the most recently inserted `exactn'
1065 command. This makes it possible to tell if a new exact-match
1066 character can be added to that command or if the character requires
1067 a new `exactn' command. */
1068 unsigned char *pending_exact = 0;
1069
1070 /* Address of start of the most recently finished expression.
1071 This tells, e.g., postfix * where to find the start of its
1072 operand. Reset at the beginning of groups and alternatives. */
1073 unsigned char *laststart = 0;
1074
1075 /* Address of beginning of regexp, or inside of last group. */
1076 unsigned char *begalt;
1077
1078 /* Place in the uncompiled pattern (i.e., the {) to
1079 which to go back if the interval is invalid. */
1080 const char *beg_interval;
1081
1082 /* Address of the place where a forward jump should go to the end of
1083 the containing expression. Each alternative of an `or' -- except the
1084 last -- ends with a forward jump of this sort. */
1085 unsigned char *fixup_alt_jump = 0;
1086
1087 /* Counts open-groups as they are encountered. Remembered for the
1088 matching close-group on the compile stack, so the same register
1089 number is put in the stop_memory as the start_memory. */
1090 regnum_t regnum = 0;
1091
1092#ifdef DEBUG
1093 DEBUG_PRINT1 ("\nCompiling pattern: ");
1094 if (debug)
1095 {
1096 unsigned debug_count;
1097
1098 for (debug_count = 0; debug_count < size; debug_count++)
1099 printchar (pattern[debug_count]);
1100 putchar ('\n');
1101 }
1102#endif /* DEBUG */
1103
1104 /* Initialize the compile stack. */
1105 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
1106 if (compile_stack.stack == NULL)
1107 return REG_ESPACE;
1108
1109 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1110 compile_stack.avail = 0;
1111
1112 /* Initialize the pattern buffer. */
1113 bufp->syntax = syntax;
1114 bufp->fastmap_accurate = 0;
1115 bufp->not_bol = bufp->not_eol = 0;
1116
1117 /* Set `used' to zero, so that if we return an error, the pattern
1118 printer (for debugging) will think there's no pattern. We reset it
1119 at the end. */
1120 bufp->used = 0;
1121
1122 /* Always count groups, whether or not bufp->no_sub is set. */
1123 bufp->re_nsub = 0;
1124
1125#if !defined (emacs) && !defined (SYNTAX_TABLE)
1126 /* Initialize the syntax table. */
1127 init_syntax_once ();
1128#endif
1129
1130 if (bufp->allocated == 0)
1131 {
1132 if (bufp->buffer)
1133 { /* If zero allocated, but buffer is non-null, try to realloc
1134 enough space. This loses if buffer's address is bogus, but
1135 that is the user's responsibility. */
1136 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
1137 }
1138 else
1139 { /* Caller did not allocate a buffer. Do it for them. */
1140 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
1141 }
1142 if (!bufp->buffer) return REG_ESPACE;
1143
1144 bufp->allocated = INIT_BUF_SIZE;
1145 }
1146
1147 begalt = b = bufp->buffer;
1148
1149 /* Loop through the uncompiled pattern until we're at the end. */
1150 while (p != pend)
1151 {
1152 PATFETCH (c);
1153
1154 switch (c)
1155 {
1156 case '^':
1157 {
1158 if ( /* If at start of pattern, it's an operator. */
1159 p == pattern + 1
1160 /* If context independent, it's an operator. */
1161 || syntax & RE_CONTEXT_INDEP_ANCHORS
1162 /* Otherwise, depends on what's come before. */
1163 || at_begline_loc_p (pattern, p, syntax))
1164 BUF_PUSH (begline);
1165 else
1166 goto normal_char;
1167 }
1168 break;
1169
1170
1171 case '$':
1172 {
1173 if ( /* If at end of pattern, it's an operator. */
1174 p == pend
1175 /* If context independent, it's an operator. */
1176 || syntax & RE_CONTEXT_INDEP_ANCHORS
1177 /* Otherwise, depends on what's next. */
1178 || at_endline_loc_p (p, pend, syntax))
1179 BUF_PUSH (endline);
1180 else
1181 goto normal_char;
1182 }
1183 break;
1184
1185
1186 case '+':
1187 case '?':
1188 if ((syntax & RE_BK_PLUS_QM)
1189 || (syntax & RE_LIMITED_OPS))
1190 goto normal_char;
1191 handle_plus:
1192 case '*':
1193 /* If there is no previous pattern... */
1194 if (!laststart)
1195 {
1196 if (syntax & RE_CONTEXT_INVALID_OPS)
1197 return REG_BADRPT;
1198 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
1199 goto normal_char;
1200 }
1201
1202 {
1203 /* Are we optimizing this jump? */
1204 boolean keep_string_p = false;
1205
1206 /* 1 means zero (many) matches is allowed. */
1207 char zero_times_ok = 0, many_times_ok = 0;
1208
1209 /* If there is a sequence of repetition chars, collapse it
1210 down to just one (the right one). We can't combine
1211 interval operators with these because of, e.g., `a{2}*',
1212 which should only match an even number of `a's. */
1213
1214 for (;;)
1215 {
1216 zero_times_ok |= c != '+';
1217 many_times_ok |= c != '?';
1218
1219 if (p == pend)
1220 break;
1221
1222 PATFETCH (c);
1223
1224 if (c == '*'
1225 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
1226 ;
1227
1228 else if (syntax & RE_BK_PLUS_QM && c == '\\')
1229 {
1230 if (p == pend) return REG_EESCAPE;
1231
1232 PATFETCH (c1);
1233 if (!(c1 == '+' || c1 == '?'))
1234 {
1235 PATUNFETCH;
1236 PATUNFETCH;
1237 break;
1238 }
1239
1240 c = c1;
1241 }
1242 else
1243 {
1244 PATUNFETCH;
1245 break;
1246 }
1247
1248 /* If we get here, we found another repeat character. */
1249 }
1250
1251 /* Star, etc. applied to an empty pattern is equivalent
1252 to an empty pattern. */
1253 if (!laststart)
1254 break;
1255
1256 /* Now we know whether or not zero matches is allowed
1257 and also whether or not two or more matches is allowed. */
1258 if (many_times_ok)
1259 { /* More than one repetition is allowed, so put in at the
1260 end a backward relative jump from `b' to before the next
1261 jump we're going to put in below (which jumps from
1262 laststart to after this jump).
1263
1264 But if we are at the `*' in the exact sequence `.*\n',
1265 insert an unconditional jump backwards to the .,
1266 instead of the beginning of the loop. This way we only
1267 push a failure point once, instead of every time
1268 through the loop. */
1269 assert (p - 1 > pattern);
1270
1271 /* Allocate the space for the jump. */
1272 GET_BUFFER_SPACE (3);
1273
1274 /* We know we are not at the first character of the pattern,
1275 because laststart was nonzero. And we've already
1276 incremented `p', by the way, to be the character after
1277 the `*'. Do we have to do something analogous here
1278 for null bytes, because of RE_DOT_NOT_NULL? */
1279 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
1280 && zero_times_ok
1281 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
1282 && !(syntax & RE_DOT_NEWLINE))
1283 { /* We have .*\n. */
1284 STORE_JUMP (jump, b, laststart);
1285 keep_string_p = true;
1286 }
1287 else
1288 /* Anything else. */
1289 STORE_JUMP (maybe_pop_jump, b, laststart - 3);
1290
1291 /* We've added more stuff to the buffer. */
1292 b += 3;
1293 }
1294
1295 /* On failure, jump from laststart to b + 3, which will be the
1296 end of the buffer after this jump is inserted. */
1297 GET_BUFFER_SPACE (3);
1298 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
1299 : on_failure_jump,
1300 laststart, b + 3);
1301 pending_exact = 0;
1302 b += 3;
1303
1304 if (!zero_times_ok)
1305 {
1306 /* At least one repetition is required, so insert a
1307 `dummy_failure_jump' before the initial
1308 `on_failure_jump' instruction of the loop. This
1309 effects a skip over that instruction the first time
1310 we hit that loop. */
1311 GET_BUFFER_SPACE (3);
1312 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
1313 b += 3;
1314 }
1315 }
1316 break;
1317
1318
1319 case '.':
1320 laststart = b;
1321 BUF_PUSH (anychar);
1322 break;
1323
1324
1325 case '[':
1326 {
1327 boolean had_char_class = false;
1328
1329 if (p == pend) return REG_EBRACK;
1330
1331 /* Ensure that we have enough space to push a charset: the
1332 opcode, the length count, and the bitset; 34 bytes in all. */
1333 GET_BUFFER_SPACE (34);
1334
1335 laststart = b;
1336
1337 /* We test `*p == '^' twice, instead of using an if
1338 statement, so we only need one BUF_PUSH. */
1339 BUF_PUSH (*p == '^' ? charset_not : charset);
1340 if (*p == '^')
1341 p++;
1342
1343 /* Remember the first position in the bracket expression. */
1344 p1 = p;
1345
1346 /* Push the number of bytes in the bitmap. */
1347 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
1348
1349 /* Clear the whole map. */
1350 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
1351
1352 /* charset_not matches newline according to a syntax bit. */
1353 if ((re_opcode_t) b[-2] == charset_not
1354 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
1355 SET_LIST_BIT ('\n');
1356
1357 /* Read in characters and ranges, setting map bits. */
1358 for (;;)
1359 {
1360 if (p == pend) return REG_EBRACK;
1361
1362 PATFETCH (c);
1363
1364 /* \ might escape characters inside [...] and [^...]. */
1365 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
1366 {
1367 if (p == pend) return REG_EESCAPE;
1368
1369 PATFETCH (c1);
1370 SET_LIST_BIT (c1);
1371 continue;
1372 }
1373
1374 /* Could be the end of the bracket expression. If it's
1375 not (i.e., when the bracket expression is `[]' so
1376 far), the ']' character bit gets set way below. */
1377 if (c == ']' && p != p1 + 1)
1378 break;
1379
1380 /* Look ahead to see if it's a range when the last thing
1381 was a character class. */
1382 if (had_char_class && c == '-' && *p != ']')
1383 return REG_ERANGE;
1384
1385 /* Look ahead to see if it's a range when the last thing
1386 was a character: if this is a hyphen not at the
1387 beginning or the end of a list, then it's the range
1388 operator. */
1389 if (c == '-'
1390 && !(p - 2 >= pattern && p[-2] == '[')
1391 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
1392 && *p != ']')
1393 {
1394 reg_errcode_t ret
1395 = compile_range (&p, pend, translate, syntax, b);
1396 if (ret != REG_NOERROR) return ret;
1397 }
1398
1399 else if (p[0] == '-' && p[1] != ']')
1400 { /* This handles ranges made up of characters only. */
1401 reg_errcode_t ret;
1402
1403 /* Move past the `-'. */
1404 PATFETCH (c1);
1405
1406 ret = compile_range (&p, pend, translate, syntax, b);
1407 if (ret != REG_NOERROR) return ret;
1408 }
1409
1410 /* See if we're at the beginning of a possible character
1411 class. */
1412
1413 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
1414 { /* Leave room for the null. */
1415 char str[CHAR_CLASS_MAX_LENGTH + 1];
1416
1417 PATFETCH (c);
1418 c1 = 0;
1419
1420 /* If pattern is `[[:'. */
1421 if (p == pend) return REG_EBRACK;
1422
1423 for (;;)
1424 {
1425 PATFETCH (c);
1426 if (c == ':' || c == ']' || p == pend
1427 || c1 == CHAR_CLASS_MAX_LENGTH)
1428 break;
1429 str[c1++] = c;
1430 }
1431 str[c1] = '\0';
1432
1433 /* If isn't a word bracketed by `[:' and:`]':
1434 undo the ending character, the letters, and leave
1435 the leading `:' and `[' (but set bits for them). */
1436 if (c == ':' && *p == ']')
1437 {
1438 int ch;
1439 boolean is_alnum = STREQ (str, "alnum");
1440 boolean is_alpha = STREQ (str, "alpha");
1441 boolean is_blank = STREQ (str, "blank");
1442 boolean is_cntrl = STREQ (str, "cntrl");
1443 boolean is_digit = STREQ (str, "digit");
1444 boolean is_graph = STREQ (str, "graph");
1445 boolean is_lower = STREQ (str, "lower");
1446 boolean is_print = STREQ (str, "print");
1447 boolean is_punct = STREQ (str, "punct");
1448 boolean is_space = STREQ (str, "space");
1449 boolean is_upper = STREQ (str, "upper");
1450 boolean is_xdigit = STREQ (str, "xdigit");
1451
1452 if (!IS_CHAR_CLASS (str)) return REG_ECTYPE;
1453
1454 /* Throw away the ] at the end of the character
1455 class. */
1456 PATFETCH (c);
1457
1458 if (p == pend) return REG_EBRACK;
1459
1460 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
1461 {
1462 if ( (is_alnum && ISALNUM (ch))
1463 || (is_alpha && ISALPHA (ch))
1464 || (is_blank && ISBLANK (ch))
1465 || (is_cntrl && ISCNTRL (ch))
1466 || (is_digit && ISDIGIT (ch))
1467 || (is_graph && ISGRAPH (ch))
1468 || (is_lower && ISLOWER (ch))
1469 || (is_print && ISPRINT (ch))
1470 || (is_punct && ISPUNCT (ch))
1471 || (is_space && ISSPACE (ch))
1472 || (is_upper && ISUPPER (ch))
1473 || (is_xdigit && ISXDIGIT (ch)))
1474 SET_LIST_BIT (ch);
1475 }
1476 had_char_class = true;
1477 }
1478 else
1479 {
1480 c1++;
1481 while (c1--)
1482 PATUNFETCH;
1483 SET_LIST_BIT ('[');
1484 SET_LIST_BIT (':');
1485 had_char_class = false;
1486 }
1487 }
1488 else
1489 {
1490 had_char_class = false;
1491 SET_LIST_BIT (c);
1492 }
1493 }
1494
1495 /* Discard any (non)matching list bytes that are all 0 at the
1496 end of the map. Decrease the map-length byte too. */
1497 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
1498 b[-1]--;
1499 b += b[-1];
1500 }
1501 break;
1502
1503
1504 case '(':
1505 if (syntax & RE_NO_BK_PARENS)
1506 goto handle_open;
1507 else
1508 goto normal_char;
1509
1510
1511 case ')':
1512 if (syntax & RE_NO_BK_PARENS)
1513 goto handle_close;
1514 else
1515 goto normal_char;
1516
1517
1518 case '\n':
1519 if (syntax & RE_NEWLINE_ALT)
1520 goto handle_alt;
1521 else
1522 goto normal_char;
1523
1524
1525 case '|':
1526 if (syntax & RE_NO_BK_VBAR)
1527 goto handle_alt;
1528 else
1529 goto normal_char;
1530
1531
1532 case '{':
1533 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
1534 goto handle_interval;
1535 else
1536 goto normal_char;
1537
1538
1539 case '\\':
1540 if (p == pend) return REG_EESCAPE;
1541
1542 /* Do not translate the character after the \, so that we can
1543 distinguish, e.g., \B from \b, even if we normally would
1544 translate, e.g., B to b. */
1545 PATFETCH_RAW (c);
1546
1547 switch (c)
1548 {
1549 case '(':
1550 if (syntax & RE_NO_BK_PARENS)
1551 goto normal_backslash;
1552
1553 handle_open:
1554 bufp->re_nsub++;
1555 regnum++;
1556
1557 if (COMPILE_STACK_FULL)
1558 {
1559 RETALLOC (compile_stack.stack, compile_stack.size << 1,
1560 compile_stack_elt_t);
1561 if (compile_stack.stack == NULL) return REG_ESPACE;
1562
1563 compile_stack.size <<= 1;
1564 }
1565
1566 /* These are the values to restore when we hit end of this
1567 group. They are all relative offsets, so that if the
1568 whole pattern moves because of realloc, they will still
1569 be valid. */
1570 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
1571 COMPILE_STACK_TOP.fixup_alt_jump
1572 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
1573 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
1574 COMPILE_STACK_TOP.regnum = regnum;
1575
1576 /* We will eventually replace the 0 with the number of
1577 groups inner to this one. But do not push a
1578 start_memory for groups beyond the last one we can
1579 represent in the compiled pattern. */
1580 if (regnum <= MAX_REGNUM)
1581 {
1582 COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2;
1583 BUF_PUSH_3 (start_memory, regnum, 0);
1584 }
1585
1586 compile_stack.avail++;
1587
1588 fixup_alt_jump = 0;
1589 laststart = 0;
1590 begalt = b;
1591 /* If we've reached MAX_REGNUM groups, then this open
1592 won't actually generate any code, so we'll have to
1593 clear pending_exact explicitly. */
1594 pending_exact = 0;
1595 break;
1596
1597
1598 case ')':
1599 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
1600
1601 if (COMPILE_STACK_EMPTY)
1602 {
1603 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
1604 goto normal_backslash;
1605 else
1606 return REG_ERPAREN;
1607 }
1608
1609 handle_close:
1610 if (fixup_alt_jump)
1611 { /* Push a dummy failure point at the end of the
1612 alternative for a possible future
1613 `pop_failure_jump' to pop. See comments at
1614 `push_dummy_failure' in `re_match_2'. */
1615 BUF_PUSH (push_dummy_failure);
1616
1617 /* We allocated space for this jump when we assigned
1618 to `fixup_alt_jump', in the `handle_alt' case below. */
1619 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
1620 }
1621
1622 /* See similar code for backslashed left paren above. */
1623 if (COMPILE_STACK_EMPTY)
1624 {
1625 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
1626 goto normal_char;
1627 else
1628 return REG_ERPAREN;
1629 }
1630
1631 /* Since we just checked for an empty stack above, this
1632 ``can't happen''. */
1633 assert (compile_stack.avail != 0);
1634 {
1635 /* We don't just want to restore into `regnum', because
1636 later groups should continue to be numbered higher,
1637 as in `(ab)c(de)' -- the second group is #2. */
1638 regnum_t this_group_regnum;
1639
1640 compile_stack.avail--;
1641 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
1642 fixup_alt_jump
1643 = COMPILE_STACK_TOP.fixup_alt_jump
1644 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
1645 : 0;
1646 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
1647 this_group_regnum = COMPILE_STACK_TOP.regnum;
1648 /* If we've reached MAX_REGNUM groups, then this open
1649 won't actually generate any code, so we'll have to
1650 clear pending_exact explicitly. */
1651 pending_exact = 0;
1652
1653 /* We're at the end of the group, so now we know how many
1654 groups were inside this one. */
1655 if (this_group_regnum <= MAX_REGNUM)
1656 {
1657 unsigned char *inner_group_loc
1658 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
1659
1660 *inner_group_loc = regnum - this_group_regnum;
1661 BUF_PUSH_3 (stop_memory, this_group_regnum,
1662 regnum - this_group_regnum);
1663 }
1664 }
1665 break;
1666
1667
1668 case '|': /* `\|'. */
1669 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
1670 goto normal_backslash;
1671 handle_alt:
1672 if (syntax & RE_LIMITED_OPS)
1673 goto normal_char;
1674
1675 /* Insert before the previous alternative a jump which
1676 jumps to this alternative if the former fails. */
1677 GET_BUFFER_SPACE (3);
1678 INSERT_JUMP (on_failure_jump, begalt, b + 6);
1679 pending_exact = 0;
1680 b += 3;
1681
1682 /* The alternative before this one has a jump after it
1683 which gets executed if it gets matched. Adjust that
1684 jump so it will jump to this alternative's analogous
1685 jump (put in below, which in turn will jump to the next
1686 (if any) alternative's such jump, etc.). The last such
1687 jump jumps to the correct final destination. A picture:
1688 _____ _____
1689 | | | |
1690 | v | v
1691 a | b | c
1692
1693 If we are at `b', then fixup_alt_jump right now points to a
1694 three-byte space after `a'. We'll put in the jump, set
1695 fixup_alt_jump to right after `b', and leave behind three
1696 bytes which we'll fill in when we get to after `c'. */
1697
1698 if (fixup_alt_jump)
1699 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
1700
1701 /* Mark and leave space for a jump after this alternative,
1702 to be filled in later either by next alternative or
1703 when know we're at the end of a series of alternatives. */
1704 fixup_alt_jump = b;
1705 GET_BUFFER_SPACE (3);
1706 b += 3;
1707
1708 laststart = 0;
1709 begalt = b;
1710 break;
1711
1712
1713 case '{':
1714 /* If \{ is a literal. */
1715 if (!(syntax & RE_INTERVALS)
1716 /* If we're at `\{' and it's not the open-interval
1717 operator. */
1718 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1719 || (p - 2 == pattern && p == pend))
1720 goto normal_backslash;
1721
1722 handle_interval:
1723 {
1724 /* If got here, then the syntax allows intervals. */
1725
1726 /* At least (most) this many matches must be made. */
1727 int lower_bound = -1, upper_bound = -1;
1728
1729 beg_interval = p - 1;
1730
1731 if (p == pend)
1732 {
1733 if (syntax & RE_NO_BK_BRACES)
1734 goto unfetch_interval;
1735 else
1736 return REG_EBRACE;
1737 }
1738
1739 GET_UNSIGNED_NUMBER (lower_bound);
1740
1741 if (c == ',')
1742 {
1743 GET_UNSIGNED_NUMBER (upper_bound);
1744 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
1745 }
1746 else
1747 /* Interval such as `{1}' => match exactly once. */
1748 upper_bound = lower_bound;
1749
1750 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
1751 || lower_bound > upper_bound)
1752 {
1753 if (syntax & RE_NO_BK_BRACES)
1754 goto unfetch_interval;
1755 else
1756 return REG_BADBR;
1757 }
1758
1759 if (!(syntax & RE_NO_BK_BRACES))
1760 {
1761 if (c != '\\') return REG_EBRACE;
1762
1763 PATFETCH (c);
1764 }
1765
1766 if (c != '}')
1767 {
1768 if (syntax & RE_NO_BK_BRACES)
1769 goto unfetch_interval;
1770 else
1771 return REG_BADBR;
1772 }
1773
1774 /* We just parsed a valid interval. */
1775
1776 /* If it's invalid to have no preceding re. */
1777 if (!laststart)
1778 {
1779 if (syntax & RE_CONTEXT_INVALID_OPS)
1780 return REG_BADRPT;
1781 else if (syntax & RE_CONTEXT_INDEP_OPS)
1782 laststart = b;
1783 else
1784 goto unfetch_interval;
1785 }
1786
1787 /* If the upper bound is zero, don't want to succeed at
1788 all; jump from `laststart' to `b + 3', which will be
1789 the end of the buffer after we insert the jump. */
1790 if (upper_bound == 0)
1791 {
1792 GET_BUFFER_SPACE (3);
1793 INSERT_JUMP (jump, laststart, b + 3);
1794 b += 3;
1795 }
1796
1797 /* Otherwise, we have a nontrivial interval. When
1798 we're all done, the pattern will look like:
1799 set_number_at <jump count> <upper bound>
1800 set_number_at <succeed_n count> <lower bound>
1801 succeed_n <after jump addr> <succed_n count>
1802 <body of loop>
1803 jump_n <succeed_n addr> <jump count>
1804 (The upper bound and `jump_n' are omitted if
1805 `upper_bound' is 1, though.) */
1806 else
1807 { /* If the upper bound is > 1, we need to insert
1808 more at the end of the loop. */
1809 unsigned nbytes = 10 + (upper_bound > 1) * 10;
1810
1811 GET_BUFFER_SPACE (nbytes);
1812
1813 /* Initialize lower bound of the `succeed_n', even
1814 though it will be set during matching by its
1815 attendant `set_number_at' (inserted next),
1816 because `re_compile_fastmap' needs to know.
1817 Jump to the `jump_n' we might insert below. */
1818 INSERT_JUMP2 (succeed_n, laststart,
1819 b + 5 + (upper_bound > 1) * 5,
1820 lower_bound);
1821 b += 5;
1822
1823 /* Code to initialize the lower bound. Insert
1824 before the `succeed_n'. The `5' is the last two
1825 bytes of this `set_number_at', plus 3 bytes of
1826 the following `succeed_n'. */
1827 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
1828 b += 5;
1829
1830 if (upper_bound > 1)
1831 { /* More than one repetition is allowed, so
1832 append a backward jump to the `succeed_n'
1833 that starts this interval.
1834
1835 When we've reached this during matching,
1836 we'll have matched the interval once, so
1837 jump back only `upper_bound - 1' times. */
1838 STORE_JUMP2 (jump_n, b, laststart + 5,
1839 upper_bound - 1);
1840 b += 5;
1841
1842 /* The location we want to set is the second
1843 parameter of the `jump_n'; that is `b-2' as
1844 an absolute address. `laststart' will be
1845 the `set_number_at' we're about to insert;
1846 `laststart+3' the number to set, the source
1847 for the relative address. But we are
1848 inserting into the middle of the pattern --
1849 so everything is getting moved up by 5.
1850 Conclusion: (b - 2) - (laststart + 3) + 5,
1851 i.e., b - laststart.
1852
1853 We insert this at the beginning of the loop
1854 so that if we fail during matching, we'll
1855 reinitialize the bounds. */
1856 insert_op2 (set_number_at, laststart, b - laststart,
1857 upper_bound - 1, b);
1858 b += 5;
1859 }
1860 }
1861 pending_exact = 0;
1862 beg_interval = NULL;
1863 }
1864 break;
1865
1866 unfetch_interval:
1867 /* If an invalid interval, match the characters as literals. */
1868 assert (beg_interval);
1869 p = beg_interval;
1870 beg_interval = NULL;
1871
1872 /* normal_char and normal_backslash need `c'. */
1873 PATFETCH (c);
1874
1875 if (!(syntax & RE_NO_BK_BRACES))
1876 {
1877 if (p > pattern && p[-1] == '\\')
1878 goto normal_backslash;
1879 }
1880 goto normal_char;
1881
1882#ifdef emacs
1883 /* There is no way to specify the before_dot and after_dot
1884 operators. rms says this is ok. --karl */
1885 case '=':
1886 BUF_PUSH (at_dot);
1887 break;
1888
1889 case 's':
1890 laststart = b;
1891 PATFETCH (c);
1892 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
1893 break;
1894
1895 case 'S':
1896 laststart = b;
1897 PATFETCH (c);
1898 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
1899 break;
1900#endif /* emacs */
1901
1902
1903 case 'w':
1904 laststart = b;
1905 BUF_PUSH (wordchar);
1906 break;
1907
1908
1909 case 'W':
1910 laststart = b;
1911 BUF_PUSH (notwordchar);
1912 break;
1913
1914
1915 case '<':
1916 BUF_PUSH (wordbeg);
1917 break;
1918
1919 case '>':
1920 BUF_PUSH (wordend);
1921 break;
1922
1923 case 'b':
1924 BUF_PUSH (wordbound);
1925 break;
1926
1927 case 'B':
1928 BUF_PUSH (notwordbound);
1929 break;
1930
1931 case '`':
1932 BUF_PUSH (begbuf);
1933 break;
1934
1935 case '\'':
1936 BUF_PUSH (endbuf);
1937 break;
1938
1939 case '1': case '2': case '3': case '4': case '5':
1940 case '6': case '7': case '8': case '9':
1941 if (syntax & RE_NO_BK_REFS)
1942 goto normal_char;
1943
1944 c1 = c - '0';
1945
1946 if (c1 > regnum)
1947 return REG_ESUBREG;
1948
1949 /* Can't back reference to a subexpression if inside of it. */
1950 if (group_in_compile_stack (compile_stack, c1))
1951 goto normal_char;
1952
1953 laststart = b;
1954 BUF_PUSH_2 (duplicate, c1);
1955 break;
1956
1957
1958 case '+':
1959 case '?':
1960 if (syntax & RE_BK_PLUS_QM)
1961 goto handle_plus;
1962 else
1963 goto normal_backslash;
1964
1965 default:
1966 normal_backslash:
1967 /* You might think it would be useful for \ to mean
1968 not to translate; but if we don't translate it
1969 it will never match anything. */
1970 c = TRANSLATE (c);
1971 goto normal_char;
1972 }
1973 break;
1974
1975
1976 default:
1977 /* Expects the character in `c'. */
1978 normal_char:
1979 /* If no exactn currently being built. */
1980 if (!pending_exact
1981
1982 /* If last exactn not at current position. */
1983 || pending_exact + *pending_exact + 1 != b
1984
1985 /* We have only one byte following the exactn for the count. */
1986 || *pending_exact == (1 << BYTEWIDTH) - 1
1987
1988 /* If followed by a repetition operator. */
1989 || *p == '*' || *p == '^'
1990 || ((syntax & RE_BK_PLUS_QM)
1991 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
1992 : (*p == '+' || *p == '?'))
1993 || ((syntax & RE_INTERVALS)
1994 && ((syntax & RE_NO_BK_BRACES)
1995 ? *p == '{'
1996 : (p[0] == '\\' && p[1] == '{'))))
1997 {
1998 /* Start building a new exactn. */
1999
2000 laststart = b;
2001
2002 BUF_PUSH_2 (exactn, 0);
2003 pending_exact = b - 1;
2004 }
2005
2006 BUF_PUSH (c);
2007 (*pending_exact)++;
2008 break;
2009 } /* switch (c) */
2010 } /* while p != pend */
2011
2012
2013 /* Through the pattern now. */
2014
2015 if (fixup_alt_jump)
2016 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2017
2018 if (!COMPILE_STACK_EMPTY)
2019 return REG_EPAREN;
2020
2021 free (compile_stack.stack);
2022
2023 /* We have succeeded; set the length of the buffer. */
2024 bufp->used = b - bufp->buffer;
2025
2026#ifdef DEBUG
2027 if (debug)
2028 {
2029 DEBUG_PRINT1 ("\nCompiled pattern: ");
2030 print_compiled_pattern (bufp);
2031 }
2032#endif /* DEBUG */
2033
2034 return REG_NOERROR;
2035} /* regex_compile */
2036
2037/* Subroutines for `regex_compile'. */
2038
2039/* Store OP at LOC followed by two-byte integer parameter ARG. */
2040
2041static void
2042store_op1 (op, loc, arg)
2043 re_opcode_t op;
2044 unsigned char *loc;
2045 int arg;
2046{
2047 *loc = (unsigned char) op;
2048 STORE_NUMBER (loc + 1, arg);
2049}
2050
2051
2052/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
2053
2054static void
2055store_op2 (op, loc, arg1, arg2)
2056 re_opcode_t op;
2057 unsigned char *loc;
2058 int arg1, arg2;
2059{
2060 *loc = (unsigned char) op;
2061 STORE_NUMBER (loc + 1, arg1);
2062 STORE_NUMBER (loc + 3, arg2);
2063}
2064
2065
2066/* Copy the bytes from LOC to END to open up three bytes of space at LOC
2067 for OP followed by two-byte integer parameter ARG. */
2068
2069static void
2070insert_op1 (op, loc, arg, end)
2071 re_opcode_t op;
2072 unsigned char *loc;
2073 int arg;
2074 unsigned char *end;
2075{
2076 register unsigned char *pfrom = end;
2077 register unsigned char *pto = end + 3;
2078
2079 while (pfrom != loc)
2080 *--pto = *--pfrom;
2081
2082 store_op1 (op, loc, arg);
2083}
2084
2085
2086/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
2087
2088static void
2089insert_op2 (op, loc, arg1, arg2, end)
2090 re_opcode_t op;
2091 unsigned char *loc;
2092 int arg1, arg2;
2093 unsigned char *end;
2094{
2095 register unsigned char *pfrom = end;
2096 register unsigned char *pto = end + 5;
2097
2098 while (pfrom != loc)
2099 *--pto = *--pfrom;
2100
2101 store_op2 (op, loc, arg1, arg2);
2102}
2103
2104
2105/* P points to just after a ^ in PATTERN. Return true if that ^ comes
2106 after an alternative or a begin-subexpression. We assume there is at
2107 least one character before the ^. */
2108
2109static boolean
2110at_begline_loc_p (pattern, p, syntax)
2111 const char *pattern, *p;
2112 reg_syntax_t syntax;
2113{
2114 const char *prev = p - 2;
2115 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
2116
2117 return
2118 /* After a subexpression? */
2119 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
2120 /* After an alternative? */
2121 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
2122}
2123
2124
2125/* The dual of at_begline_loc_p. This one is for $. We assume there is
2126 at least one character after the $, i.e., `P < PEND'. */
2127
2128static boolean
2129at_endline_loc_p (p, pend, syntax)
2130 const char *p, *pend;
2131 int syntax;
2132{
2133 const char *next = p;
2134 boolean next_backslash = *next == '\\';
2135 const char *next_next = p + 1 < pend ? p + 1 : NULL;
2136
2137 return
2138 /* Before a subexpression? */
2139 (syntax & RE_NO_BK_PARENS ? *next == ')'
2140 : next_backslash && next_next && *next_next == ')')
2141 /* Before an alternative? */
2142 || (syntax & RE_NO_BK_VBAR ? *next == '|'
2143 : next_backslash && next_next && *next_next == '|');
2144}
2145
2146
2147/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
2148 false if it's not. */
2149
2150static boolean
2151group_in_compile_stack (compile_stack, regnum)
2152 compile_stack_type compile_stack;
2153 regnum_t regnum;
2154{
2155 int this_element;
2156
2157 for (this_element = compile_stack.avail - 1;
2158 this_element >= 0;
2159 this_element--)
2160 if (compile_stack.stack[this_element].regnum == regnum)
2161 return true;
2162
2163 return false;
2164}
2165
2166
2167/* Read the ending character of a range (in a bracket expression) from the
2168 uncompiled pattern *P_PTR (which ends at PEND). We assume the
2169 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
2170 Then we set the translation of all bits between the starting and
2171 ending characters (inclusive) in the compiled pattern B.
2172
2173 Return an error code.
2174
2175 We use these short variable names so we can use the same macros as
2176 `regex_compile' itself. */
2177
2178static reg_errcode_t
2179compile_range (p_ptr, pend, translate, syntax, b)
2180 const char **p_ptr, *pend;
2181 char *translate;
2182 reg_syntax_t syntax;
2183 unsigned char *b;
2184{
2185 unsigned this_char;
2186
2187 const char *p = *p_ptr;
2188 int range_start, range_end;
2189
2190 if (p == pend)
2191 return REG_ERANGE;
2192
2193 /* Even though the pattern is a signed `char *', we need to fetch
2194 with unsigned char *'s; if the high bit of the pattern character
2195 is set, the range endpoints will be negative if we fetch using a
2196 signed char *.
2197
2198 We also want to fetch the endpoints without translating them; the
2199 appropriate translation is done in the bit-setting loop below. */
2200 range_start = ((unsigned char *) p)[-2];
2201 range_end = ((unsigned char *) p)[0];
2202
2203 /* Have to increment the pointer into the pattern string, so the
2204 caller isn't still at the ending character. */
2205 (*p_ptr)++;
2206
2207 /* If the start is after the end, the range is empty. */
2208 if (range_start > range_end)
2209 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
2210
2211 /* Here we see why `this_char' has to be larger than an `unsigned
2212 char' -- the range is inclusive, so if `range_end' == 0xff
2213 (assuming 8-bit characters), we would otherwise go into an infinite
2214 loop, since all characters <= 0xff. */
2215 for (this_char = range_start; this_char <= range_end; this_char++)
2216 {
2217 SET_LIST_BIT (TRANSLATE (this_char));
2218 }
2219
2220 return REG_NOERROR;
2221}
2222
2223/* Failure stack declarations and macros; both re_compile_fastmap and
2224 re_match_2 use a failure stack. These have to be macros because of
2225 REGEX_ALLOCATE. */
2226
2227
2228/* Number of failure-stack slots for which INIT_FAIL_STACK initially
2229 allocates space. If this number is exceeded, we allocate more
2230 space (see DOUBLE_FAIL_STACK), so it is not a hard limit. The
 surrounding ifndef guard lets a build override the default by
 defining INIT_FAILURE_ALLOC before this point. */
2231#ifndef INIT_FAILURE_ALLOC
2232#define INIT_FAILURE_ALLOC 5
2233#endif
2234
/* Roughly the maximum number of failure points on the stack.  Would be
   exactly that if every failure point used MAX_FAILURE_ITEMS slots:
   DOUBLE_FAIL_STACK refuses to grow the stack once its size exceeds
   re_max_failures * MAX_FAILURE_ITEMS.  This is a variable only so users
   of regex can assign to it; we never change it ourselves.  */
int re_max_failures = 2000;

/* One slot of the failure stack. PUSH_FAILURE_ITEM casts every pushed
 item to this pointer type. */
2241typedef const unsigned char *fail_stack_elt_t;
2242
/* A growable stack of fail_stack_elt_t slots. */
2243typedef struct
2244{
2245 fail_stack_elt_t *stack; /* Slot storage; set by INIT_FAIL_STACK. */
2246 unsigned size; /* Number of slots allocated in `stack'. */
2247 unsigned avail; /* Offset of next open position. */
2248} fail_stack_type;
2249
/* State tests. FAIL_STACK_PTR_EMPTY assumes a pointer variable named
 `fail_stack_ptr'; the others assume a variable named `fail_stack'.
 NOTE(review): FAIL_STACK_TOP indexes `avail', which the struct comment
 describes as the next open position, i.e. one past the most recently
 pushed item -- confirm that is intended before relying on it. */
2250#define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
2251#define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
2252#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
2253#define FAIL_STACK_TOP() (fail_stack.stack[fail_stack.avail])
2254
2255
2256/* Initialize `fail_stack': obtain room for INIT_FAILURE_ALLOC slots
 from REGEX_ALLOCATE, record that size, and mark the stack empty.
 Does `return -2' from the enclosing function if the alloc fails, so
 the enclosing function must be able to return that value. */
2257
2258#define INIT_FAIL_STACK() \
2259 do { \
2260 fail_stack.stack = (fail_stack_elt_t *) \
2261 REGEX_ALLOCATE (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
2262 \
2263 if (fail_stack.stack == NULL) \
2264 return -2; \
2265 \
2266 fail_stack.size = INIT_FAILURE_ALLOC; \
2267 fail_stack.avail = 0; \
2268 } while (0)
2269
2270
2271/* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
2272
2273 Return 1 if succeeds, and 0 if either ran out of memory
2274 allocating space for it or it was already too large.
2275
 "Too large" means the current size already exceeds
 re_max_failures * MAX_FAILURE_ITEMS; MAX_FAILURE_ITEMS in turn needs a
 variable `num_regs' in scope. The result of REGEX_REALLOCATE is stored
 into (fail_stack).stack before it is tested, so after an allocation
 failure that member is NULL.

2276 REGEX_REALLOCATE requires `destination' be declared. */
2277
2278#define DOUBLE_FAIL_STACK(fail_stack) \
2279 ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
2280 ? 0 \
2281 : ((fail_stack).stack = (fail_stack_elt_t *) \
2282 REGEX_REALLOCATE ((fail_stack).stack, \
2283 (fail_stack).size * sizeof (fail_stack_elt_t), \
2284 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
2285 \
2286 (fail_stack).stack == NULL \
2287 ? 0 \
2288 : ((fail_stack).size <<= 1, \
2289 1)))
2290
2291
2292/* Push PATTERN_OP on FAIL_STACK.
2293
2294 Return 1 if was able to do so and 0 if ran out of memory allocating
2295 space to do so (or the stack was already too large to double).

 The fullness test goes through FAIL_STACK_FULL, which takes no
 argument and always looks at the variable named `fail_stack', so the
 FAIL_STACK argument should be that same variable. */
2296#define PUSH_PATTERN_OP(pattern_op, fail_stack) \
2297 ((FAIL_STACK_FULL () \
2298 && !DOUBLE_FAIL_STACK (fail_stack)) \
2299 ? 0 \
2300 : ((fail_stack).stack[(fail_stack).avail++] = pattern_op, \
2301 1))
2302
2303/* This pushes an item onto the failure stack. ITEM is cast to
2304 fail_stack_elt_t, so it must be a value that fits in one stack slot.
2305 Assumes the variable `fail_stack'. No check for free space is made
 here, so this probably should only be called from within
 `PUSH_FAILURE_POINT', which grows the stack first. */
2306#define PUSH_FAILURE_ITEM(item) \
2307 fail_stack.stack[fail_stack.avail++] = (fail_stack_elt_t) item
2308
2309/* The complement operation: decrement `avail' and yield the slot it then
 designates. Assumes `fail_stack' is nonempty. */
2310#define POP_FAILURE_ITEM() fail_stack.stack[--fail_stack.avail]
2311
2312/* Used to omit pushing failure point id's when we're not debugging.
 With DEBUG defined, DEBUG_PUSH is PUSH_FAILURE_ITEM and DEBUG_POP
 stores a popped item through ITEM_ADDR; otherwise both expand to
 nothing, so their arguments are not evaluated. */
2313#ifdef DEBUG
2314#define DEBUG_PUSH PUSH_FAILURE_ITEM
2315#define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_ITEM ()
2316#else
2317#define DEBUG_PUSH(item)
2318#define DEBUG_POP(item_addr)
2319#endif
2320
2321
2322/* Push the information about the state we will need
2323 if we ever fail back to it.
2324
 In push order: for each register from lowest_active_reg up to
 highest_active_reg its regstart, regend and reg_info word; then
 lowest_active_reg, highest_active_reg, PATTERN_PLACE and STRING_PLACE;
 and, when DEBUG is defined, the failure id. Before pushing, the stack
 is doubled as often as needed to hold NUM_FAILURE_ITEMS more slots.

2325 Requires variables fail_stack, regstart, regend, reg_info, and
2326 num_regs be declared (plus lowest_active_reg and highest_active_reg,
2327 which the expansion reads). DOUBLE_FAIL_STACK requires `destination'
 be declared; the expansion declares its own.
2328
2329 Does `return FAILURE_CODE' if DOUBLE_FAIL_STACK fails, i.e. if we run
 out of memory or the stack is already too large. */
2330
2331#define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
2332 do { \
2333 char *destination; \
2334 /* Must be int, so when we don't save any registers, the arithmetic \
2335 of 0 + -1 isn't done as unsigned. */ \
2336 int this_reg; \
2337 \
2338 DEBUG_STATEMENT (failure_id++); \
2339 DEBUG_STATEMENT (nfailure_points_pushed++); \
2340 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
2341 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
2342 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
2343 \
2344 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
2345 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
2346 \
2347 /* Ensure we have enough space allocated for what we will push. */ \
2348 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
2349 { \
2350 if (!DOUBLE_FAIL_STACK (fail_stack)) \
2351 return failure_code; \
2352 \
2353 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
2354 (fail_stack).size); \
2355 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
2356 } \
2357 \
2358 /* Push the info, starting with the registers. */ \
2359 DEBUG_PRINT1 ("\n"); \
2360 \
2361 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
2362 this_reg++) \
2363 { \
2364 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
2365 DEBUG_STATEMENT (num_regs_pushed++); \
2366 \
2367 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
2368 PUSH_FAILURE_ITEM (regstart[this_reg]); \
2369 \
2370 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
2371 PUSH_FAILURE_ITEM (regend[this_reg]); \
2372 \
2373 DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]); \
2374 DEBUG_PRINT2 (" match_null=%d", \
2375 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
2376 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
2377 DEBUG_PRINT2 (" matched_something=%d", \
2378 MATCHED_SOMETHING (reg_info[this_reg])); \
2379 DEBUG_PRINT2 (" ever_matched=%d", \
2380 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
2381 DEBUG_PRINT1 ("\n"); \
2382 PUSH_FAILURE_ITEM (reg_info[this_reg].word); \
2383 } \
2384 \
2385 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\
2386 PUSH_FAILURE_ITEM (lowest_active_reg); \
2387 \
2388 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\
2389 PUSH_FAILURE_ITEM (highest_active_reg); \
2390 \
2391 DEBUG_PRINT2 (" Pushing pattern 0x%x: ", pattern_place); \
2392 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
2393 PUSH_FAILURE_ITEM (pattern_place); \
2394 \
2395 DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place); \
2396 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
2397 size2); \
2398 DEBUG_PRINT1 ("'\n"); \
2399 PUSH_FAILURE_ITEM (string_place); \
2400 \
2401 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
2402 DEBUG_PUSH (failure_id); \
2403 } while (0)
2404
2405/* This is the number of items that are pushed and popped on the stack
2406 for each register: its regstart, regend and reg_info word. */
2407#define NUM_REG_ITEMS 3
2408
2409/* Individual items aside from the registers: the lowest and highest
 active register numbers, the pattern position and the string
 position, plus the failure point id when debugging. */
2410#ifdef DEBUG
2411#define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
2412#else
2413#define NUM_NONREG_ITEMS 4
2414#endif
2415
2416/* We push at most this many items on the stack. Reads the variable
 `num_regs', which must be in scope wherever this is expanded. */
2417#define MAX_FAILURE_ITEMS ((num_regs - 1) * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
2418
2419/* We actually push this many items. Reads the variables
 `highest_active_reg' and `lowest_active_reg'. */
2420#define NUM_FAILURE_ITEMS \
2421 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
2422 + NUM_NONREG_ITEMS)
2423
2424/* How many items can still be added to the stack without overflowing it. */
2425#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
2426
2427
2428/* Pops what PUSH_FAILURE_POINT pushes, in the reverse order.
2429
2430 We restore into the parameters, all of which should be lvalues:
2431 STR -- the saved data position.
2432 PAT -- the saved pattern position.
2433 LOW_REG, HIGH_REG -- the lowest and highest active registers.
2434 REGSTART, REGEND -- arrays of string positions.
2435 REG_INFO -- array of information about each subexpression.
2436
 If the saved data position is NULL, STR is left unchanged.

2437 Also assumes the variables `fail_stack' and (if debugging), `bufp',
2438 `pend', `string1', `size1', `string2', and `size2'.

 Unlike PUSH_FAILURE_POINT, the expansion is a bare brace block rather
 than a do-while statement. */
2439
2440#define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
2441{ \
2442 DEBUG_STATEMENT (fail_stack_elt_t failure_id;) \
2443 int this_reg; \
2444 const unsigned char *string_temp; \
2445 \
2446 assert (!FAIL_STACK_EMPTY ()); \
2447 \
2448 /* Remove failure points and point to how many regs pushed. */ \
2449 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
2450 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
2451 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
2452 \
2453 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
2454 \
2455 DEBUG_POP (&failure_id); \
2456 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
2457 \
2458 /* If the saved string location is NULL, it came from an \
2459 on_failure_keep_string_jump opcode, and we want to throw away the \
2460 saved NULL, thus retaining our current position in the string. */ \
2461 string_temp = POP_FAILURE_ITEM (); \
2462 if (string_temp != NULL) \
2463 str = (const char *) string_temp; \
2464 \
2465 DEBUG_PRINT2 (" Popping string 0x%x: `", str); \
2466 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
2467 DEBUG_PRINT1 ("'\n"); \
2468 \
2469 pat = (unsigned char *) POP_FAILURE_ITEM (); \
2470 DEBUG_PRINT2 (" Popping pattern 0x%x: ", pat); \
2471 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
2472 \
2473 /* Restore register info. */ \
2474 high_reg = (unsigned) POP_FAILURE_ITEM (); \
2475 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
2476 \
2477 low_reg = (unsigned) POP_FAILURE_ITEM (); \
2478 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
2479 \
2480 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
2481 { \
2482 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
2483 \
2484 reg_info[this_reg].word = POP_FAILURE_ITEM (); \
2485 DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]); \
2486 \
2487 regend[this_reg] = (const char *) POP_FAILURE_ITEM (); \
2488 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
2489 \
2490 regstart[this_reg] = (const char *) POP_FAILURE_ITEM (); \
2491 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
2492 } \
2493 \
2494 DEBUG_STATEMENT (nfailure_points_popped++); \
2495} /* POP_FAILURE_POINT */
2496
2497/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
2498 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
2499 characters can start a string that matches the pattern. This fastmap
2500 is used by re_search to skip quickly over impossible starting points.
2501
2502 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
2503 area as BUFP->fastmap.
2504
2505 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
2506 the pattern buffer.
2507
 The pattern is walked one path at a time; alternative paths still to
 be explored are remembered as pattern positions on a local failure
 stack.

2508 Returns 0 if we succeed, -2 if the failure stack cannot be allocated
 or grown. */
2509
2510int
2511re_compile_fastmap (bufp)
2512 struct re_pattern_buffer *bufp;
2513{
2514 int j, k;
2515 fail_stack_type fail_stack;
2516#ifndef REGEX_MALLOC
2517 char *destination;
2518#endif
2519 /* We don't push any register information onto the failure stack.
 The variable is still needed because DOUBLE_FAIL_STACK expands
 MAX_FAILURE_ITEMS, which reads `num_regs'. */
2520 unsigned num_regs = 0;
2521
2522 register char *fastmap = bufp->fastmap;
2523 unsigned char *pattern = bufp->buffer;
2524 unsigned long size = bufp->used;
2525 const unsigned char *p = pattern;
2526 register unsigned char *pend = pattern + size;
2527
2528 /* Assume that each path through the pattern can be null until
2529 proven otherwise. We set this false at the bottom of switch
2530 statement, to which we get only if a particular path doesn't
2531 match the empty string. */
2532 boolean path_can_be_null = true;
2533
2534 /* We aren't doing a `succeed_n' to begin with. */
2535 boolean succeed_n_p = false;
2536
2537 assert (fastmap != NULL && p != NULL);
2538
2539 INIT_FAIL_STACK ();
2540 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
2541 bufp->fastmap_accurate = 1; /* It will be when we're done. */
2542 bufp->can_be_null = 0;
2543
2544 while (p != pend || !FAIL_STACK_EMPTY ())
2545 {
2546 if (p == pend)
2547 {
 /* This path is finished; fold its result in and resume at the
 most recently saved alternative. */
2548 bufp->can_be_null |= path_can_be_null;
2549
2550 /* Reset for next path. */
2551 path_can_be_null = true;
2552
2553 p = fail_stack.stack[--fail_stack.avail];
2554 }
2555
2556 /* We should never be about to go beyond the end of the pattern. */
2557 assert (p < pend);
2558
2559#ifdef SWITCH_ENUM_BUG
2560 switch ((int) ((re_opcode_t) *p++))
2561#else
2562 switch ((re_opcode_t) *p++)
2563#endif
2564 {
2565
2566 /* I guess the idea here is to simply not bother with a fastmap
2567 if a backreference is used, since it's too hard to figure out
2568 the fastmap for the corresponding group. Setting
2569 `can_be_null' stops `re_search_2' from using the fastmap, so
2570 that is all we do. */
2571 case duplicate:
2572 bufp->can_be_null = 1;
2573 return 0;
2574
2575
2576 /* Following are the cases which match a character. These end
2577 with `break'. */
2578
2579 case exactn:
 /* P is now just past the opcode; presumably p[0] is the count
 byte and p[1] the first literal character -- confirm against
 the exactn encoding. */
2580 fastmap[p[1]] = 1;
2581 break;
2582
2583
2584 case charset:
 /* *p is the bitmap length in bytes; after the increment P
 points at the bitmap itself. */
2585 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
2586 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
2587 fastmap[j] = 1;
2588 break;
2589
2590
2591 case charset_not:
2592 /* Chars beyond end of map must be allowed. */
2593 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
2594 fastmap[j] = 1;
2595
2596 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
2597 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
2598 fastmap[j] = 1;
2599 break;
2600
2601
2602 case wordchar:
2603 for (j = 0; j < (1 << BYTEWIDTH); j++)
2604 if (SYNTAX (j) == Sword)
2605 fastmap[j] = 1;
2606 break;
2607
2608
2609 case notwordchar:
2610 for (j = 0; j < (1 << BYTEWIDTH); j++)
2611 if (SYNTAX (j) != Sword)
2612 fastmap[j] = 1;
2613 break;
2614
2615
2616 case anychar:
2617 /* `.' matches anything ... */
2618 for (j = 0; j < (1 << BYTEWIDTH); j++)
2619 fastmap[j] = 1;
2620
2621 /* ... except perhaps newline. */
2622 if (!(bufp->syntax & RE_DOT_NEWLINE))
2623 fastmap['\n'] = 0;
2624
2625 /* Return if we have already set `can_be_null'; if we have,
2626 then the fastmap is irrelevant. Something's wrong here.
 Note that this `else' pairs with the RE_DOT_NEWLINE test
 above, so the early return only happens when `.' also
 matches newline. */
2627 else if (bufp->can_be_null)
2628 return 0;
2629
2630 /* Otherwise, have to check alternative paths. */
2631 break;
2632
2633
2634#ifdef emacs
2635 case syntaxspec:
2636 k = *p++;
2637 for (j = 0; j < (1 << BYTEWIDTH); j++)
2638 if (SYNTAX (j) == (enum syntaxcode) k)
2639 fastmap[j] = 1;
2640 break;
2641
2642
2643 case notsyntaxspec:
2644 k = *p++;
2645 for (j = 0; j < (1 << BYTEWIDTH); j++)
2646 if (SYNTAX (j) != (enum syntaxcode) k)
2647 fastmap[j] = 1;
2648 break;
2649
2650
2651 /* All cases after this match the empty string. These end with
2652 `continue'. */
2653
2654
2655 case before_dot:
2656 case at_dot:
2657 case after_dot:
2658 continue;
2659#endif /* emacs */
2660
2661
2662 case no_op:
2663 case begline:
2664 case endline:
2665 case begbuf:
2666 case endbuf:
2667 case wordbound:
2668 case notwordbound:
2669 case wordbeg:
2670 case wordend:
2671 case push_dummy_failure:
2672 continue;
2673
2674
2675 case jump_n:
2676 case pop_failure_jump:
2677 case maybe_pop_jump:
2678 case jump:
2679 case jump_past_alt:
2680 case dummy_failure_jump:
2681 EXTRACT_NUMBER_AND_INCR (j, p);
2682 p += j;
2683 if (j > 0)
2684 continue;
2685
2686 /* Jump backward implies we just went through the body of a
2687 loop and matched nothing. Opcode jumped to should be
2688 `on_failure_jump' or `succeed_n'. Just treat it like an
2689 ordinary jump. For a * loop, it has pushed its failure
2690 point already; if so, discard that as redundant. */
2691 if ((re_opcode_t) *p != on_failure_jump
2692 && (re_opcode_t) *p != succeed_n)
2693 continue;
2694
2695 p++;
2696 EXTRACT_NUMBER_AND_INCR (j, p);
2697 p += j;
2698
2699 /* If what's on the stack is where we are now, pop it. */
2700 if (!FAIL_STACK_EMPTY ()
2701 && fail_stack.stack[fail_stack.avail - 1] == p)
2702 fail_stack.avail--;
2703
2704 continue;
2705
2706
2707 case on_failure_jump:
2708 case on_failure_keep_string_jump:
2709 handle_on_failure_jump:
2710 EXTRACT_NUMBER_AND_INCR (j, p);
2711
2712 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
2713 end of the pattern. We don't want to push such a point,
2714 since when we restore it above, entering the switch will
2715 increment `p' past the end of the pattern. We don't need
2716 to push such a point since we obviously won't find any more
2717 fastmap entries beyond `pend'. Such a pattern can match
2718 the null string, though. */
2719 if (p + j < pend)
2720 {
2721 if (!PUSH_PATTERN_OP (p + j, fail_stack))
2722 return -2;
2723 }
2724 else
2725 bufp->can_be_null = 1;
2726
2727 if (succeed_n_p)
2728 {
2729 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
2730 succeed_n_p = false;
2731 }
2732
2733 continue;
2734
2735
2736 case succeed_n:
2737 /* Get to the number of times to succeed. */
2738 p += 2;
2739
2740 /* Increment p past the n for when k != 0. */
2741 EXTRACT_NUMBER_AND_INCR (k, p);
2742 if (k == 0)
2743 {
 /* Rewind to just after the opcode so the shared code above
 can read the jump offset; `succeed_n_p' tells it to skip
 the count afterwards. */
2744 p -= 4;
2745 succeed_n_p = true; /* Spaghetti code alert. */
2746 goto handle_on_failure_jump;
2747 }
2748 continue;
2749
2750
2751 case set_number_at:
2752 p += 4;
2753 continue;
2754
2755
2756 case start_memory:
2757 case stop_memory:
2758 p += 2;
2759 continue;
2760
2761
2762 default:
2763 abort (); /* We have listed all the cases. */
2764 } /* switch *p++ */
2765
2766 /* Getting here means we have found the possible starting
2767 characters for one path of the pattern -- and that the empty
2768 string does not match. We need not follow this path further.
2769 Instead, look at the next alternative (remembered on the
2770 stack), or quit if no more. The test at the top of the loop
2771 does these things. */
2772 path_can_be_null = false;
2773 p = pend;
2774 } /* while p */
2775
2776 /* Set `can_be_null' for the last path (also the first path, if the
2777 pattern is empty). */
2778 bufp->can_be_null |= path_can_be_null;
2779 return 0;
2780} /* re_compile_fastmap */
2781
2782/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
2783 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
2784 this memory for recording register information. STARTS and ENDS
2785 must be allocated using the malloc library routine, and must each
2786 be at least NUM_REGS * sizeof (regoff_t) bytes long.
2787
2788 If NUM_REGS == 0, then subsequent matches should allocate their own
2789 register data.
2790
2791 Unless this function is called, the first search or match using
2792 PATTERN_BUFFER will allocate its own register data, without
2793 freeing the old data. */
2794
2795void
2796re_set_registers (bufp, regs, num_regs, starts, ends)
2797 struct re_pattern_buffer *bufp;
2798 struct re_registers *regs;
2799 unsigned num_regs;
2800 regoff_t *starts, *ends;
2801{
2802 if (num_regs)
2803 {
2804 bufp->regs_allocated = REGS_REALLOCATE;
2805 regs->num_regs = num_regs;
2806 regs->start = starts;
2807 regs->end = ends;
2808 }
2809 else
2810 {
2811 bufp->regs_allocated = REGS_UNALLOCATED;
2812 regs->num_regs = 0;
2813 regs->start = regs->end = (regoff_t) 0;
2814 }
2815}
2816
2817/* Searching routines. */
2818
2819/* Like re_search_2, below, but only one string is specified, and
2820 doesn't let you say where to stop matching. */
2821
2822int
2823re_search (bufp, string, size, startpos, range, regs)
2824 struct re_pattern_buffer *bufp;
2825 const char *string;
2826 int size, startpos, range;
2827 struct re_registers *regs;
2828{
2829 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
2830 regs, size);
2831}
2832
2833
2834/* Using the compiled pattern in BUFP->buffer, first tries to match the
2835 virtual concatenation of STRING1 and STRING2, starting first at index
2836 STARTPOS, then at STARTPOS + 1, and so on.
2837
2838 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
2839
2840 RANGE is how far to scan while trying to match. RANGE = 0 means try
2841 only at STARTPOS; in general, the last start tried is STARTPOS +
2842 RANGE. A negative RANGE searches backwards from STARTPOS.
2843
2844 In REGS, return the indices of the virtual concatenation of STRING1
2845 and STRING2 that matched the entire BUFP->buffer and its contained
2846 subexpressions.
2847
2848 Do not consider matching one past the index STOP in the virtual
2849 concatenation of STRING1 and STRING2.
2850
 If BUFP has a fastmap that is not marked accurate, it is recomputed
 with re_compile_fastmap first.

2851 We return either the position in the strings at which the match was
2852 found, -1 if no match, or -2 if error (such as failure
2853 stack overflow). */
2854
2855int
2856re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
2857 struct re_pattern_buffer *bufp;
2858 const char *string1, *string2;
2859 int size1, size2;
2860 int startpos;
2861 int range;
2862 struct re_registers *regs;
2863 int stop;
2864{
2865 int val;
2866 register char *fastmap = bufp->fastmap;
2867 register char *translate = bufp->translate;
2868 int total_size = size1 + size2;
 /* Computed before STARTPOS is validated below. */
2869 int endpos = startpos + range;
2870
2871 /* Check for out-of-range STARTPOS. */
2872 if (startpos < 0 || startpos > total_size)
2873 return -1;
2874
2875 /* Fix up RANGE if it might eventually take us outside
2876 the virtual concatenation of STRING1 and STRING2. */
2877 if (endpos < -1)
2878 range = -1 - startpos;
2879 else if (endpos > total_size)
2880 range = total_size - startpos;
2881
2882 /* If the search isn't to be a backwards one, don't waste time in a
2883 search for a pattern that must be anchored. */
2884 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
2885 {
2886 if (startpos > 0)
2887 return -1;
2888 else
2889 range = 1;
2890 }
2891
2892 /* Update the fastmap now if not correct already. */
2893 if (fastmap && !bufp->fastmap_accurate)
2894 if (re_compile_fastmap (bufp) == -2)
2895 return -2;
2896
2897 /* Loop through the string, looking for a place to start matching. */
2898 for (;;)
2899 {
2900 /* If a fastmap is supplied, skip quickly over characters that
2901 cannot be the start of a match. If the pattern can match the
2902 null string, however, we don't need to skip characters; we want
2903 the first null string. */
2904 if (fastmap && startpos < total_size && !bufp->can_be_null)
2905 {
2906 if (range > 0) /* Searching forwards. */
2907 {
2908 register const char *d;
2909 register int lim = 0;
2910 int irange = range;
2911
 /* When the scan starts in string1 and would run into string2,
 LIM stops the skip loop at the end of string1 so that D
 never walks off that string. */
2912 if (startpos < size1 && startpos + range >= size1)
2913 lim = range - (size1 - startpos);
2914
2915 d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
2916
2917 /* Written out as an if-else to avoid testing `translate'
2918 inside the loop. */
2919 if (translate)
2920 while (range > lim
2921 && !fastmap[(unsigned char)
2922 translate[(unsigned char) *d++]])
2923 range--;
2924 else
2925 while (range > lim && !fastmap[(unsigned char) *d++])
2926 range--;
2927
 /* Advance STARTPOS by the number of characters skipped. */
2928 startpos += irange - range;
2929 }
2930 else /* Searching backwards. */
2931 {
2932 register char c = (size1 == 0 || startpos >= size1
2933 ? string2[startpos - size1]
2934 : string1[startpos]);
2935
2936 if (!fastmap[(unsigned char) TRANSLATE (c)])
2937 goto advance;
2938 }
2939 }
2940
2941 /* If can't match the null string, and that's all we have left, fail. */
2942 if (range >= 0 && startpos == total_size && fastmap
2943 && !bufp->can_be_null)
2944 return -1;
2945
2946 val = re_match_2 (bufp, string1, size1, string2, size2,
2947 startpos, regs, stop);
 /* A non-negative result from re_match_2 means a match at STARTPOS;
 the search reports the position, not that result. */
2948 if (val >= 0)
2949 return startpos;
2950
2951 if (val == -2)
2952 return -2;
2953
2954 advance:
 /* Step one position in the search direction, or stop once the
 range is used up. */
2955 if (!range)
2956 break;
2957 else if (range > 0)
2958 {
2959 range--;
2960 startpos++;
2961 }
2962 else
2963 {
2964 range++;
2965 startpos--;
2966 }
2967 }
2968 return -1;
2969} /* re_search_2 */
2970
2971/* Declarations and macros for re_match_2. */
2972
/* Old-style forward declarations (no parameter lists) of helpers whose
 definitions are not in this part of the file. */
2973static int bcmp_translate ();
2974static boolean alt_match_null_string_p (),
2975 common_op_match_null_string_p (),
2976 group_match_null_string_p ();
2977
2978/* Structure for per-register (a.k.a. per-group) information.
2979 This must not be longer than one word, because we push this value
2980 onto the failure stack (PUSH_FAILURE_POINT pushes the `word' member
 and POP_FAILURE_POINT restores it). Other register information, such as the
2981 starting and ending positions (which are addresses), and the list of
2982 inner groups (which is a bits list) are maintained in separate
2983 variables.
2984
2985 We are making a (strictly speaking) nonportable assumption here: that
2986 the compiler will pack our bit fields into something that fits into
2987 the type of `word', i.e., is something that fits into one item on the
2988 failure stack. */
2989typedef union
2990{
2991 fail_stack_elt_t word;
2992 struct
2993 {
2994 /* This field is one if this group can match the empty string,
2995 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
2996#define MATCH_NULL_UNSET_VALUE 3
2997 unsigned match_null_string_p : 2;
2998 unsigned is_active : 1;
2999 unsigned matched_something : 1;
3000 unsigned ever_matched_something : 1;
3001 } bits;
3002} register_info_type;
3003
/* Accessors for the bit fields of a register_info_type R. Each expands
 to the field itself, so it can be read or assigned. */
3004#define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
3005#define IS_ACTIVE(R) ((R).bits.is_active)
3006#define MATCHED_SOMETHING(R) ((R).bits.matched_something)
3007#define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
3008
3009
3010/* Call this when have matched a real character; it sets `matched' flags
3011 for the subexpressions which we are currently inside. Also records
3012 that those subexprs have matched. Concretely, both MATCHED_SOMETHING
 and EVER_MATCHED_SOMETHING are set to 1 for every register from
 lowest_active_reg through highest_active_reg; it assumes those two
 variables and `reg_info' are in scope. */
3013#define SET_REGS_MATCHED() \
3014 do \
3015 { \
3016 unsigned r; \
3017 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
3018 { \
3019 MATCHED_SOMETHING (reg_info[r]) \
3020 = EVER_MATCHED_SOMETHING (reg_info[r]) \
3021 = 1; \
3022 } \
3023 } \
3024 while (0)
3025
3026
3027/* This converts PTR, a pointer into one of the search strings `string1'
3028 and `string2' into an offset from the beginning of that string.
 For a pointer into `string2' the result is additionally shifted by
 `size1', i.e. it is an offset into the virtual concatenation.
 Assumes `string1', `string2' and `size1' are in scope. */
3029#define POINTER_TO_OFFSET(ptr) \
3030 (FIRST_STRING_P (ptr) ? (ptr) - string1 : (ptr) - string2 + size1)
3031
3032/* Registers are set to a sentinel when they haven't yet matched. */
3033#define REG_UNSET_VALUE ((char *) -1)
/* Nonzero if E holds that sentinel. */
3034#define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
3035
3036
3037/* Macros for dealing with the split strings in re_match_2. */
3038
/* Nonzero while the current end pointer `dend' is the end of the part of
 string1 being matched. */
3039#define MATCHING_IN_FIRST_STRING (dend == end_match_1)
3040
3041/* Call before fetching a character with *d. This switches over to
3042 string2 if necessary. Does `goto fail' when D has reached the end
 of string2's matchable part, so a label `fail' must exist in the
 enclosing function; also assumes `d', `dend', `string2' and
 `end_match_2' are in scope. */
3043#define PREFETCH() \
3044 while (d == dend) \
3045 { \
3046 /* End of string2 => fail. */ \
3047 if (dend == end_match_2) \
3048 goto fail; \
3049 /* End of string1 => advance to string2. */ \
3050 d = string2; \
3051 dend = end_match_2; \
3052 }
3053
3054
3055/* Test if at very beginning or at very end of the virtual concatenation
3056 of `string1' and `string2'. If only one string, it's `string2'.
 NOTE(review): because of the trailing `|| !size2', AT_STRINGS_BEG is
 true for every D whenever size2 is zero -- confirm that is intended. */
3057#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
3058#define AT_STRINGS_END(d) ((d) == end2)
3059
3060
3061/* Test if D points to a character which is word-constituent. We have
3062 two special cases to check for: if past the end of string1, look at
3063 the first character in string2; and if before the beginning of
3064 string2, look at the last character in string1. */
3065#define WORDCHAR_P(d) \
3066 (SYNTAX ((d) == end1 ? *string2 \
3067 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
3068 == Sword)
3069
3070/* Test if the character before D and the one at D differ with respect
3071 to being word-constituent. Either end of the strings also counts as
 a boundary. */
3072#define AT_WORD_BOUNDARY(d) \
3073 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
3074 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
3075
3076
3077/* Free everything we malloc. FREE_VAR expands to two statements (a
 guarded free followed by an assignment of NULL) and is not wrapped in
 a block, so it is only safe where both statements may stand, as
 inside FREE_VARIABLES below. */
3078#ifdef REGEX_MALLOC
3079#define FREE_VAR(var) if (var) free (var); var = NULL
3080#define FREE_VARIABLES() \
3081 do { \
3082 FREE_VAR (fail_stack.stack); \
3083 FREE_VAR (regstart); \
3084 FREE_VAR (regend); \
3085 FREE_VAR (old_regstart); \
3086 FREE_VAR (old_regend); \
3087 FREE_VAR (best_regstart); \
3088 FREE_VAR (best_regend); \
3089 FREE_VAR (reg_info); \
3090 FREE_VAR (reg_dummy); \
3091 FREE_VAR (reg_info_dummy); \
3092 } while (0)
3093#else /* not REGEX_MALLOC */
3094/* Some MIPS systems (at least) want this to free alloca'd storage. */
3095#define FREE_VARIABLES() alloca (0)
3096#endif /* not REGEX_MALLOC */
3097
3098
3099/* These values must meet several constraints. They must not be valid
3100 register values; since we have a limit of 255 registers (because
3101 we use only one byte in the pattern for the register number), we can
3102 use numbers larger than 255. They must differ by 1, because of
3103 NUM_FAILURE_ITEMS above (with these two values its register term,
 highest - lowest + 1, comes out as zero). And the value for the lowest register must
3104 be larger than the value for the highest register, so we do not try
3105 to actually save any registers when none are active. */
3106#define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
3107#define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
3108
3109/* Matching routines. */
3110
#ifndef emacs /* Emacs never uses this.  */
/* Single-string front end to re_match_2.

   Matches the compiled pattern in BUFP against STRING (of length
   SIZE), starting at offset POS, and passes REGS through unchanged.
   The return value is whatever re_match_2 returns.  */

int
re_match (bufp, string, size, pos, regs)
     struct re_pattern_buffer *bufp;
     const char *string;
     int size, pos;
     struct re_registers *regs;
{
  /* There is no first string, so STRING is handed over as the second
     one, and the stop position is the end of STRING.  */
  const int no_first_size = 0;
  const int stop = size;

  return re_match_2 (bufp, NULL, no_first_size,
                     string, size, pos, regs, stop);
}
#endif /* not emacs */
3124
3125
/* re_match_2 matches the compiled pattern in BUFP against the
   (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
   and SIZE2, respectively).  We start matching at POS, and stop
   matching at STOP.

   If REGS is non-null and the `no_sub' field of BUFP is zero, we
   store offsets for the substring each group matched in REGS.  See the
   documentation for exactly how many groups we fill.

   We return -1 if no match, -2 if an internal error (such as the
   failure stack overflowing).  Otherwise, we return the length of the
   matched substring.  */
3138
3139int
3140re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
3141 struct re_pattern_buffer *bufp;
3142 const char *string1, *string2;
3143 int size1, size2;
3144 int pos;
3145 struct re_registers *regs;
3146 int stop;
3147{
3148 /* General temporaries. */
3149 int mcnt;
3150 unsigned char *p1;
3151
3152 /* Just past the end of the corresponding string. */
3153 const char *end1, *end2;
3154
3155 /* Pointers into string1 and string2, just past the last characters in
3156 each to consider matching. */
3157 const char *end_match_1, *end_match_2;
3158
3159 /* Where we are in the data, and the end of the current string. */
3160 const char *d, *dend;
3161
3162 /* Where we are in the pattern, and the end of the pattern. */
3163 unsigned char *p = bufp->buffer;
3164 register unsigned char *pend = p + bufp->used;
3165
3166 /* We use this to map every character in the string. */
3167 char *translate = bufp->translate;
3168
3169 /* Failure point stack. Each place that can handle a failure further
3170 down the line pushes a failure point on this stack. It consists of
3171 restart, regend, and reg_info for all registers corresponding to
3172 the subexpressions we're currently inside, plus the number of such
3173 registers, and, finally, two char *'s. The first char * is where
3174 to resume scanning the pattern; the second one is where to resume
3175 scanning the strings. If the latter is zero, the failure point is
3176 a ``dummy''; if a failure happens and the failure point is a dummy,
3177 it gets discarded and the next next one is tried. */
3178 fail_stack_type fail_stack;
3179#ifdef DEBUG
3180 static unsigned failure_id = 0;
3181 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
3182#endif 90#endif
3183
3184 /* We fill all the registers internally, independent of what we
3185 return, for use in backreferences. The number here includes
3186 an element for register zero. */
3187 unsigned num_regs = bufp->re_nsub + 1;
3188
3189 /* The currently active registers. */
3190 unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3191 unsigned highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3192
3193 /* Information on the contents of registers. These are pointers into
3194 the input strings; they record just what was matched (on this
3195 attempt) by a subexpression part of the pattern, that is, the
3196 regnum-th regstart pointer points to where in the pattern we began
3197 matching and the regnum-th regend points to right after where we
3198 stopped matching the regnum-th subexpression. (The zeroth register
3199 keeps track of what the whole pattern matches.) */
3200 const char **regstart = NULL, **regend = NULL;
3201
3202 /* If a group that's operated upon by a repetition operator fails to
3203 match anything, then the register for its start will need to be
3204 restored because it will have been set to wherever in the string we
3205 are when we last see its open-group operator. Similarly for a
3206 register's end. */
3207 const char **old_regstart = NULL, **old_regend = NULL;
3208
3209 /* The is_active field of reg_info helps us keep track of which (possibly
3210 nested) subexpressions we are currently in. The matched_something
3211 field of reg_info[reg_num] helps us tell whether or not we have
3212 matched any of the pattern so far this time through the reg_num-th
3213 subexpression. These two fields get reset each time through any
3214 loop their register is in. */
3215 register_info_type *reg_info = NULL;
3216
3217 /* The following record the register info as found in the above
3218 variables when we find a match better than any we've seen before.
3219 This happens as we backtrack through the failure points, which in
3220 turn happens only if we have not yet matched the entire string. */
3221 unsigned best_regs_set = false;
3222 const char **best_regstart = NULL, **best_regend = NULL;
3223
3224 /* Logically, this is `best_regend[0]'. But we don't want to have to
3225 allocate space for that if we're not allocating space for anything
3226 else (see below). Also, we never need info about register 0 for
3227 any of the other register vectors, and it seems rather a kludge to
3228 treat `best_regend' differently than the rest. So we keep track of
3229 the end of the best match so far in a separate variable. We
3230 initialize this to NULL so that when we backtrack the first time
3231 and need to test it, it's not garbage. */
3232 const char *match_end = NULL;
3233
3234 /* Used when we pop values we don't care about. */
3235 const char **reg_dummy = NULL;
3236 register_info_type *reg_info_dummy = NULL;
3237
3238#ifdef DEBUG
3239 /* Counts the total number of registers pushed. */
3240 unsigned num_regs_pushed = 0;
3241#endif
3242
3243 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
3244
3245 INIT_FAIL_STACK ();
3246
3247 /* Do not bother to initialize all the register variables if there are
3248 no groups in the pattern, as it takes a fair amount of time. If
3249 there are groups, we include space for register 0 (the whole
3250 pattern), even though we never use it, since it simplifies the
3251 array indexing. We should fix this. */
3252 if (bufp->re_nsub)
3253 {
3254 regstart = REGEX_TALLOC (num_regs, const char *);
3255 regend = REGEX_TALLOC (num_regs, const char *);
3256 old_regstart = REGEX_TALLOC (num_regs, const char *);
3257 old_regend = REGEX_TALLOC (num_regs, const char *);
3258 best_regstart = REGEX_TALLOC (num_regs, const char *);
3259 best_regend = REGEX_TALLOC (num_regs, const char *);
3260 reg_info = REGEX_TALLOC (num_regs, register_info_type);
3261 reg_dummy = REGEX_TALLOC (num_regs, const char *);
3262 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
3263
3264 if (!(regstart && regend && old_regstart && old_regend && reg_info
3265 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
3266 {
3267 FREE_VARIABLES ();
3268 return -2;
3269 }
3270 }
3271#ifdef REGEX_MALLOC
3272 else
3273 {
3274 /* We must initialize all our variables to NULL, so that
3275 `FREE_VARIABLES' doesn't try to free them. */
3276 regstart = regend = old_regstart = old_regend = best_regstart
3277 = best_regend = reg_dummy = NULL;
3278 reg_info = reg_info_dummy = (register_info_type *) NULL;
3279 }
3280#endif /* REGEX_MALLOC */
3281
3282 /* The starting position is bogus. */
3283 if (pos < 0 || pos > size1 + size2)
3284 {
3285 FREE_VARIABLES ();
3286 return -1;
3287 }
3288
3289 /* Initialize subexpression text positions to -1 to mark ones that no
3290 start_memory/stop_memory has been seen for. Also initialize the
3291 register information struct. */
3292 for (mcnt = 1; mcnt < num_regs; mcnt++)
3293 {
3294 regstart[mcnt] = regend[mcnt]
3295 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
3296
3297 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
3298 IS_ACTIVE (reg_info[mcnt]) = 0;
3299 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3300 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3301 }
3302
3303 /* We move `string1' into `string2' if the latter's empty -- but not if
3304 `string1' is null. */
3305 if (size2 == 0 && string1 != NULL)
3306 {
3307 string2 = string1;
3308 size2 = size1;
3309 string1 = 0;
3310 size1 = 0;
3311 }
3312 end1 = string1 + size1;
3313 end2 = string2 + size2;
3314
3315 /* Compute where to stop matching, within the two strings. */
3316 if (stop <= size1)
3317 {
3318 end_match_1 = string1 + stop;
3319 end_match_2 = string2;
3320 }
3321 else
3322 {
3323 end_match_1 = end1;
3324 end_match_2 = string2 + stop - size1;
3325 }
3326
3327 /* `p' scans through the pattern as `d' scans through the data.
3328 `dend' is the end of the input string that `d' points within. `d'
3329 is advanced into the following input string whenever necessary, but
3330 this happens before fetching; therefore, at the beginning of the
3331 loop, `d' can be pointing at the end of a string, but it cannot
3332 equal `string2'. */
3333 if (size1 > 0 && pos <= size1)
3334 {
3335 d = string1 + pos;
3336 dend = end_match_1;
3337 }
3338 else
3339 {
3340 d = string2 + pos - size1;
3341 dend = end_match_2;
3342 }
3343
3344 DEBUG_PRINT1 ("The compiled pattern is: ");
3345 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
3346 DEBUG_PRINT1 ("The string to match is: `");
3347 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
3348 DEBUG_PRINT1 ("'\n");
3349
3350 /* This loops over pattern commands. It exits by returning from the
3351 function if the match is complete, or it drops through if the match
3352 fails at this starting point in the input data. */
3353 for (;;)
3354 {
3355 DEBUG_PRINT2 ("\n0x%x: ", p);
3356
3357 if (p == pend)
3358 { /* End of pattern means we might have succeeded. */
3359 DEBUG_PRINT1 ("end of pattern ... ");
3360
3361 /* If we haven't matched the entire string, and we want the
3362 longest match, try backtracking. */
3363 if (d != end_match_2)
3364 {
3365 DEBUG_PRINT1 ("backtracking.\n");
3366
3367 if (!FAIL_STACK_EMPTY ())
3368 { /* More failure points to try. */
3369 boolean same_str_p = (FIRST_STRING_P (match_end)
3370 == MATCHING_IN_FIRST_STRING);
3371
3372 /* If exceeds best match so far, save it. */
3373 if (!best_regs_set
3374 || (same_str_p && d > match_end)
3375 || (!same_str_p && !MATCHING_IN_FIRST_STRING))
3376 {
3377 best_regs_set = true;
3378 match_end = d;
3379
3380 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
3381
3382 for (mcnt = 1; mcnt < num_regs; mcnt++)
3383 {
3384 best_regstart[mcnt] = regstart[mcnt];
3385 best_regend[mcnt] = regend[mcnt];
3386 }
3387 }
3388 goto fail;
3389 }
3390
3391 /* If no failure points, don't restore garbage. */
3392 else if (best_regs_set)
3393 {
3394 restore_best_regs:
3395 /* Restore best match. It may happen that `dend ==
3396 end_match_1' while the restored d is in string2.
3397 For example, the pattern `x.*y.*z' against the
3398 strings `x-' and `y-z-', if the two strings are
3399 not consecutive in memory. */
3400 DEBUG_PRINT1 ("Restoring best registers.\n");
3401
3402 d = match_end;
3403 dend = ((d >= string1 && d <= end1)
3404 ? end_match_1 : end_match_2);
3405
3406 for (mcnt = 1; mcnt < num_regs; mcnt++)
3407 {
3408 regstart[mcnt] = best_regstart[mcnt];
3409 regend[mcnt] = best_regend[mcnt];
3410 }
3411 }
3412 } /* d != end_match_2 */
3413
3414 DEBUG_PRINT1 ("Accepting match.\n");
3415
3416 /* If caller wants register contents data back, do it. */
3417 if (regs && !bufp->no_sub)
3418 {
3419 /* Have the register data arrays been allocated? */
3420 if (bufp->regs_allocated == REGS_UNALLOCATED)
3421 { /* No. So allocate them with malloc. We need one
3422 extra element beyond `num_regs' for the `-1' marker
3423 GNU code uses. */
3424 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
3425 regs->start = TALLOC (regs->num_regs, regoff_t);
3426 regs->end = TALLOC (regs->num_regs, regoff_t);
3427 if (regs->start == NULL || regs->end == NULL)
3428 return -2;
3429 bufp->regs_allocated = REGS_REALLOCATE;
3430 }
3431 else if (bufp->regs_allocated == REGS_REALLOCATE)
3432 { /* Yes. If we need more elements than were already
3433 allocated, reallocate them. If we need fewer, just
3434 leave it alone. */
3435 if (regs->num_regs < num_regs + 1)
3436 {
3437 regs->num_regs = num_regs + 1;
3438 RETALLOC (regs->start, regs->num_regs, regoff_t);
3439 RETALLOC (regs->end, regs->num_regs, regoff_t);
3440 if (regs->start == NULL || regs->end == NULL)
3441 return -2;
3442 }
3443 }
3444 else
3445 assert (bufp->regs_allocated == REGS_FIXED);
3446
3447 /* Convert the pointer data in `regstart' and `regend' to
3448 indices. Register zero has to be set differently,
3449 since we haven't kept track of any info for it. */
3450 if (regs->num_regs > 0)
3451 {
3452 regs->start[0] = pos;
3453 regs->end[0] = (MATCHING_IN_FIRST_STRING ? d - string1
3454 : d - string2 + size1);
3455 }
3456
3457 /* Go through the first `min (num_regs, regs->num_regs)'
3458 registers, since that is all we initialized. */
3459 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++)
3460 {
3461 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
3462 regs->start[mcnt] = regs->end[mcnt] = -1;
3463 else
3464 {
3465 regs->start[mcnt] = POINTER_TO_OFFSET (regstart[mcnt]);
3466 regs->end[mcnt] = POINTER_TO_OFFSET (regend[mcnt]);
3467 }
3468 }
3469
3470 /* If the regs structure we return has more elements than
3471 were in the pattern, set the extra elements to -1. If
3472 we (re)allocated the registers, this is the case,
3473 because we always allocate enough to have at least one
3474 -1 at the end. */
3475 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++)
3476 regs->start[mcnt] = regs->end[mcnt] = -1;
3477 } /* regs && !bufp->no_sub */
3478
3479 FREE_VARIABLES ();
3480 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
3481 nfailure_points_pushed, nfailure_points_popped,
3482 nfailure_points_pushed - nfailure_points_popped);
3483 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
3484
3485 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
3486 ? string1
3487 : string2 - size1);
3488
3489 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
3490
3491 return mcnt;
3492 }
3493
3494 /* Otherwise match next pattern command. */
3495#ifdef SWITCH_ENUM_BUG
3496 switch ((int) ((re_opcode_t) *p++))
3497#else
3498 switch ((re_opcode_t) *p++)
3499#endif
3500 {
3501 /* Ignore these. Used to ignore the n of succeed_n's which
3502 currently have n == 0. */
3503 case no_op:
3504 DEBUG_PRINT1 ("EXECUTING no_op.\n");
3505 break;
3506
3507
3508 /* Match the next n pattern characters exactly. The following
3509 byte in the pattern defines n, and the n bytes after that
3510 are the characters to match. */
3511 case exactn:
3512 mcnt = *p++;
3513 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
3514
3515 /* This is written out as an if-else so we don't waste time
3516 testing `translate' inside the loop. */
3517 if (translate)
3518 {
3519 do
3520 {
3521 PREFETCH ();
3522 if (translate[(unsigned char) *d++] != (char) *p++)
3523 goto fail;
3524 }
3525 while (--mcnt);
3526 }
3527 else
3528 {
3529 do
3530 {
3531 PREFETCH ();
3532 if (*d++ != (char) *p++) goto fail;
3533 }
3534 while (--mcnt);
3535 }
3536 SET_REGS_MATCHED ();
3537 break;
3538
3539
3540 /* Match any character except possibly a newline or a null. */
3541 case anychar:
3542 DEBUG_PRINT1 ("EXECUTING anychar.\n");
3543
3544 PREFETCH ();
3545
3546 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
3547 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
3548 goto fail;
3549
3550 SET_REGS_MATCHED ();
3551 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
3552 d++;
3553 break;
3554
3555
3556 case charset:
3557 case charset_not:
3558 {
3559 register unsigned char c;
3560 boolean not = (re_opcode_t) *(p - 1) == charset_not;
3561
3562 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
3563
3564 PREFETCH ();
3565 c = TRANSLATE (*d); /* The character to match. */
3566
3567 /* Cast to `unsigned' instead of `unsigned char' in case the
3568 bit list is a full 32 bytes long. */
3569 if (c < (unsigned) (*p * BYTEWIDTH)
3570 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
3571 not = !not;
3572
3573 p += 1 + *p;
3574
3575 if (!not) goto fail;
3576
3577 SET_REGS_MATCHED ();
3578 d++;
3579 break;
3580 }
3581
3582
3583 /* The beginning of a group is represented by start_memory.
3584 The arguments are the register number in the next byte, and the
3585 number of groups inner to this one in the next. The text
3586 matched within the group is recorded (in the internal
3587 registers data structure) under the register number. */
3588 case start_memory:
3589 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
3590
3591 /* Find out if this group can match the empty string. */
3592 p1 = p; /* To send to group_match_null_string_p. */
3593
3594 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
3595 REG_MATCH_NULL_STRING_P (reg_info[*p])
3596 = group_match_null_string_p (&p1, pend, reg_info);
3597
3598 /* Save the position in the string where we were the last time
3599 we were at this open-group operator in case the group is
3600 operated upon by a repetition operator, e.g., with `(a*)*b'
3601 against `ab'; then we want to ignore where we are now in
3602 the string in case this attempt to match fails. */
3603 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
3604 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
3605 : regstart[*p];
3606 DEBUG_PRINT2 (" old_regstart: %d\n",
3607 POINTER_TO_OFFSET (old_regstart[*p]));
3608
3609 regstart[*p] = d;
3610 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
3611
3612 IS_ACTIVE (reg_info[*p]) = 1;
3613 MATCHED_SOMETHING (reg_info[*p]) = 0;
3614
3615 /* This is the new highest active register. */
3616 highest_active_reg = *p;
3617
3618 /* If nothing was active before, this is the new lowest active
3619 register. */
3620 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
3621 lowest_active_reg = *p;
3622
3623 /* Move past the register number and inner group count. */
3624 p += 2;
3625 break;
3626
3627
3628 /* The stop_memory opcode represents the end of a group. Its
3629 arguments are the same as start_memory's: the register
3630 number, and the number of inner groups. */
3631 case stop_memory:
3632 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
3633
3634 /* We need to save the string position the last time we were at
3635 this close-group operator in case the group is operated
3636 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
3637 against `aba'; then we want to ignore where we are now in
3638 the string in case this attempt to match fails. */
3639 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
3640 ? REG_UNSET (regend[*p]) ? d : regend[*p]
3641 : regend[*p];
3642 DEBUG_PRINT2 (" old_regend: %d\n",
3643 POINTER_TO_OFFSET (old_regend[*p]));
3644
3645 regend[*p] = d;
3646 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
3647
3648 /* This register isn't active anymore. */
3649 IS_ACTIVE (reg_info[*p]) = 0;
3650
3651 /* If this was the only register active, nothing is active
3652 anymore. */
3653 if (lowest_active_reg == highest_active_reg)
3654 {
3655 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3656 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3657 }
3658 else
3659 { /* We must scan for the new highest active register, since
3660 it isn't necessarily one less than now: consider
3661 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
3662 new highest active register is 1. */
3663 unsigned char r = *p - 1;
3664 while (r > 0 && !IS_ACTIVE (reg_info[r]))
3665 r--;
3666
3667 /* If we end up at register zero, that means that we saved
3668 the registers as the result of an `on_failure_jump', not
3669 a `start_memory', and we jumped to past the innermost
3670 `stop_memory'. For example, in ((.)*) we save
3671 registers 1 and 2 as a result of the *, but when we pop
3672 back to the second ), we are at the stop_memory 1.
3673 Thus, nothing is active. */
3674 if (r == 0)
3675 {
3676 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3677 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3678 }
3679 else
3680 highest_active_reg = r;
3681 }
3682
3683 /* If just failed to match something this time around with a
3684 group that's operated on by a repetition operator, try to
3685 force exit from the ``loop'', and restore the register
3686 information for this group that we had before trying this
3687 last match. */
3688 if ((!MATCHED_SOMETHING (reg_info[*p])
3689 || (re_opcode_t) p[-3] == start_memory)
3690 && (p + 2) < pend)
3691 {
3692 boolean is_a_jump_n = false;
3693
3694 p1 = p + 2;
3695 mcnt = 0;
3696 switch ((re_opcode_t) *p1++)
3697 {
3698 case jump_n:
3699 is_a_jump_n = true;
3700 case pop_failure_jump:
3701 case maybe_pop_jump:
3702 case jump:
3703 case dummy_failure_jump:
3704 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
3705 if (is_a_jump_n)
3706 p1 += 2;
3707 break;
3708
3709 default:
3710 /* do nothing */ ;
3711 }
3712 p1 += mcnt;
3713
3714 /* If the next operation is a jump backwards in the pattern
3715 to an on_failure_jump right before the start_memory
3716 corresponding to this stop_memory, exit from the loop
3717 by forcing a failure after pushing on the stack the
3718 on_failure_jump's jump in the pattern, and d. */
3719 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
3720 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
3721 {
3722 /* If this group ever matched anything, then restore
3723 what its registers were before trying this last
3724 failed match, e.g., with `(a*)*b' against `ab' for
3725 regstart[1], and, e.g., with `((a*)*(b*)*)*'
3726 against `aba' for regend[3].
3727
3728 Also restore the registers for inner groups for,
3729 e.g., `((a*)(b*))*' against `aba' (register 3 would
3730 otherwise get trashed). */
3731
3732 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
3733 {
3734 unsigned r;
3735
3736 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
3737
3738 /* Restore this and inner groups' (if any) registers. */
3739 for (r = *p; r < *p + *(p + 1); r++)
3740 {
3741 regstart[r] = old_regstart[r];
3742
3743 /* xx why this test? */
3744 if ((int) old_regend[r] >= (int) regstart[r])
3745 regend[r] = old_regend[r];
3746 }
3747 }
3748 p1++;
3749 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
3750 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
3751
3752 goto fail;
3753 }
3754 }
3755
3756 /* Move past the register number and the inner group count. */
3757 p += 2;
3758 break;
3759
3760
3761 /* \<digit> has been turned into a `duplicate' command which is
3762 followed by the numeric value of <digit> as the register number. */
3763 case duplicate:
3764 {
3765 register const char *d2, *dend2;
3766 int regno = *p++; /* Get which register to match against. */
3767 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
3768
3769 /* Can't back reference a group which we've never matched. */
3770 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
3771 goto fail;
3772
3773 /* Where in input to try to start matching. */
3774 d2 = regstart[regno];
3775
3776 /* Where to stop matching; if both the place to start and
3777 the place to stop matching are in the same string, then
3778 set to the place to stop, otherwise, for now have to use
3779 the end of the first string. */
3780
3781 dend2 = ((FIRST_STRING_P (regstart[regno])
3782 == FIRST_STRING_P (regend[regno]))
3783 ? regend[regno] : end_match_1);
3784 for (;;)
3785 {
3786 /* If necessary, advance to next segment in register
3787 contents. */
3788 while (d2 == dend2)
3789 {
3790 if (dend2 == end_match_2) break;
3791 if (dend2 == regend[regno]) break;
3792
3793 /* End of string1 => advance to string2. */
3794 d2 = string2;
3795 dend2 = regend[regno];
3796 }
3797 /* At end of register contents => success */
3798 if (d2 == dend2) break;
3799
3800 /* If necessary, advance to next segment in data. */
3801 PREFETCH ();
3802
3803 /* How many characters left in this segment to match. */
3804 mcnt = dend - d;
3805
3806 /* Want how many consecutive characters we can match in
3807 one shot, so, if necessary, adjust the count. */
3808 if (mcnt > dend2 - d2)
3809 mcnt = dend2 - d2;
3810
3811 /* Compare that many; failure if mismatch, else move
3812 past them. */
3813 if (translate
3814 ? bcmp_translate (d, d2, mcnt, translate)
3815 : bcmp (d, d2, mcnt))
3816 goto fail;
3817 d += mcnt, d2 += mcnt;
3818 }
3819 }
3820 break;
3821
3822
3823 /* begline matches the empty string at the beginning of the string
3824 (unless `not_bol' is set in `bufp'), and, if
3825 `newline_anchor' is set, after newlines. */
3826 case begline:
3827 DEBUG_PRINT1 ("EXECUTING begline.\n");
3828
3829 if (AT_STRINGS_BEG (d))
3830 {
3831 if (!bufp->not_bol) break;
3832 }
3833 else if (d[-1] == '\n' && bufp->newline_anchor)
3834 {
3835 break;
3836 }
3837 /* In all other cases, we fail. */
3838 goto fail;
3839
3840
3841 /* endline is the dual of begline. */
3842 case endline:
3843 DEBUG_PRINT1 ("EXECUTING endline.\n");
3844
3845 if (AT_STRINGS_END (d))
3846 {
3847 if (!bufp->not_eol) break;
3848 }
3849
3850 /* We have to ``prefetch'' the next character. */
3851 else if ((d == end1 ? *string2 : *d) == '\n'
3852 && bufp->newline_anchor)
3853 {
3854 break;
3855 }
3856 goto fail;
3857
3858
3859 /* Match at the very beginning of the data. */
3860 case begbuf:
3861 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
3862 if (AT_STRINGS_BEG (d))
3863 break;
3864 goto fail;
3865
3866
3867 /* Match at the very end of the data. */
3868 case endbuf:
3869 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
3870 if (AT_STRINGS_END (d))
3871 break;
3872 goto fail;
3873
3874
3875 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
3876 pushes NULL as the value for the string on the stack. Then
3877 `pop_failure_point' will keep the current value for the
3878 string, instead of restoring it. To see why, consider
3879 matching `foo\nbar' against `.*\n'. The .* matches the foo;
3880 then the . fails against the \n. But the next thing we want
3881 to do is match the \n against the \n; if we restored the
3882 string value, we would be back at the foo.
3883
3884 Because this is used only in specific cases, we don't need to
3885 check all the things that `on_failure_jump' does, to make
3886 sure the right things get saved on the stack. Hence we don't
3887 share its code. The only reason to push anything on the
3888 stack at all is that otherwise we would have to change
3889 `anychar's code to do something besides goto fail in this
3890 case; that seems worse than this. */
3891 case on_failure_keep_string_jump:
3892 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
3893
3894 EXTRACT_NUMBER_AND_INCR (mcnt, p);
3895 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
3896
3897 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
3898 break;
3899
3900
3901 /* Uses of on_failure_jump:
3902
3903 Each alternative starts with an on_failure_jump that points
3904 to the beginning of the next alternative. Each alternative
3905 except the last ends with a jump that in effect jumps past
3906 the rest of the alternatives. (They really jump to the
3907 ending jump of the following alternative, because tensioning
3908 these jumps is a hassle.)
3909
3910 Repeats start with an on_failure_jump that points past both
3911 the repetition text and either the following jump or
3912 pop_failure_jump back to this on_failure_jump. */
3913 case on_failure_jump:
3914 on_failure:
3915 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
3916
3917 EXTRACT_NUMBER_AND_INCR (mcnt, p);
3918 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
3919
3920 /* If this on_failure_jump comes right before a group (i.e.,
3921 the original * applied to a group), save the information
3922 for that group and all inner ones, so that if we fail back
3923 to this point, the group's information will be correct.
3924 For example, in \(a*\)*\1, we need the preceding group,
3925 and in \(\(a*\)b*\)\2, we need the inner group. */
3926
3927 /* We can't use `p' to check ahead because we push
3928 a failure point to `p + mcnt' after we do this. */
3929 p1 = p;
3930
3931 /* We need to skip no_op's before we look for the
3932 start_memory in case this on_failure_jump is happening as
3933 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
3934 against aba. */
3935 while (p1 < pend && (re_opcode_t) *p1 == no_op)
3936 p1++;
3937
3938 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
3939 {
3940 /* We have a new highest active register now. This will
3941 get reset at the start_memory we are about to get to,
3942 but we will have saved all the registers relevant to
3943 this repetition op, as described above. */
3944 highest_active_reg = *(p1 + 1) + *(p1 + 2);
3945 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
3946 lowest_active_reg = *(p1 + 1);
3947 }
3948
3949 DEBUG_PRINT1 (":\n");
3950 PUSH_FAILURE_POINT (p + mcnt, d, -2);
3951 break;
3952
3953
3954 /* A smart repeat ends with `maybe_pop_jump'.
3955 We change it to either `pop_failure_jump' or `jump'. */
3956 case maybe_pop_jump:
3957 EXTRACT_NUMBER_AND_INCR (mcnt, p);
3958 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
3959 {
3960 register unsigned char *p2 = p;
3961
3962 /* Compare the beginning of the repeat with what in the
3963 pattern follows its end. If we can establish that there
3964 is nothing that they would both match, i.e., that we
3965 would have to backtrack because of (as in, e.g., `a*a')
3966 then we can change to pop_failure_jump, because we'll
3967 never have to backtrack.
3968
3969 This is not true in the case of alternatives: in
3970 `(a|ab)*' we do need to backtrack to the `ab' alternative
3971 (e.g., if the string was `ab'). But instead of trying to
3972 detect that here, the alternative has put on a dummy
3973 failure point which is what we will end up popping. */
3974
3975 /* Skip over open/close-group commands. */
3976 while (p2 + 2 < pend
3977 && ((re_opcode_t) *p2 == stop_memory
3978 || (re_opcode_t) *p2 == start_memory))
3979 p2 += 3; /* Skip over args, too. */
3980
3981 /* If we're at the end of the pattern, we can change. */
3982 if (p2 == pend)
3983 {
3984 /* Consider what happens when matching ":\(.*\)"
3985 against ":/". I don't really understand this code
3986 yet. */
3987 p[-3] = (unsigned char) pop_failure_jump;
3988 DEBUG_PRINT1
3989 (" End of pattern: change to `pop_failure_jump'.\n");
3990 }
3991
3992 else if ((re_opcode_t) *p2 == exactn
3993 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
3994 {
3995 register unsigned char c
3996 = *p2 == (unsigned char) endline ? '\n' : p2[2];
3997 p1 = p + mcnt;
3998
3999 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
4000 to the `maybe_finalize_jump' of this case. Examine what
4001 follows. */
4002 if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
4003 {
4004 p[-3] = (unsigned char) pop_failure_jump;
4005 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
4006 c, p1[5]);
4007 }
4008
4009 else if ((re_opcode_t) p1[3] == charset
4010 || (re_opcode_t) p1[3] == charset_not)
4011 {
4012 int not = (re_opcode_t) p1[3] == charset_not;
4013
4014 if (c < (unsigned char) (p1[4] * BYTEWIDTH)
4015 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4016 not = !not;
4017
4018 /* `not' is equal to 1 if c would match, which means
4019 that we can't change to pop_failure_jump. */
4020 if (!not)
4021 {
4022 p[-3] = (unsigned char) pop_failure_jump;
4023 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4024 }
4025 }
4026 }
4027 }
4028 p -= 2; /* Point at relative address again. */
4029 if ((re_opcode_t) p[-1] != pop_failure_jump)
4030 {
4031 p[-1] = (unsigned char) jump;
4032 DEBUG_PRINT1 (" Match => jump.\n");
4033 goto unconditional_jump;
4034 }
4035 /* Note fall through. */
4036
4037
4038 /* The end of a simple repeat has a pop_failure_jump back to
4039 its matching on_failure_jump, where the latter will push a
4040 failure point. The pop_failure_jump takes off failure
4041 points put on by this pop_failure_jump's matching
4042 on_failure_jump; we got through the pattern to here from the
4043 matching on_failure_jump, so didn't fail. */
4044 case pop_failure_jump:
4045 {
4046 /* We need to pass separate storage for the lowest and
4047 highest registers, even though we don't care about the
4048 actual values. Otherwise, we will restore only one
4049 register from the stack, since lowest will == highest in
4050 `pop_failure_point'. */
4051 unsigned dummy_low_reg, dummy_high_reg;
4052 unsigned char *pdummy;
4053 const char *sdummy;
4054
4055 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
4056 POP_FAILURE_POINT (sdummy, pdummy,
4057 dummy_low_reg, dummy_high_reg,
4058 reg_dummy, reg_dummy, reg_info_dummy);
4059 }
4060 /* Note fall through. */
4061
4062
4063 /* Unconditionally jump (without popping any failure points). */
4064 case jump:
4065 unconditional_jump:
4066 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
4067 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
4068 p += mcnt; /* Do the jump. */
4069 DEBUG_PRINT2 ("(to 0x%x).\n", p);
4070 break;
4071
4072
4073 /* We need this opcode so we can detect where alternatives end
4074 in `group_match_null_string_p' et al. */
4075 case jump_past_alt:
4076 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
4077 goto unconditional_jump;
4078
4079
4080 /* Normally, the on_failure_jump pushes a failure point, which
4081 then gets popped at pop_failure_jump. We will end up at
4082 pop_failure_jump, also, and with a pattern of, say, `a+', we
4083 are skipping over the on_failure_jump, so we have to push
4084 something meaningless for pop_failure_jump to pop. */
4085 case dummy_failure_jump:
4086 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
4087 /* It doesn't matter what we push for the string here. What
4088 the code at `fail' tests is the value for the pattern. */
4089 PUSH_FAILURE_POINT (0, 0, -2);
4090 goto unconditional_jump;
4091
4092
4093 /* At the end of an alternative, we need to push a dummy failure
4094 point in case we are followed by a `pop_failure_jump', because
4095 we don't want the failure point for the alternative to be
4096 popped. For example, matching `(a|ab)*' against `aab'
4097 requires that we match the `ab' alternative. */
4098 case push_dummy_failure:
4099 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
4100 /* See comments just above at `dummy_failure_jump' about the
4101 two zeroes. */
4102 PUSH_FAILURE_POINT (0, 0, -2);
4103 break;
4104
4105 /* Have to succeed matching what follows at least n times.
4106 After that, handle like `on_failure_jump'. */
4107 case succeed_n:
4108 EXTRACT_NUMBER (mcnt, p + 2);
4109 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
4110
4111 assert (mcnt >= 0);
4112 /* Originally, this is how many times we HAVE to succeed. */
4113 if (mcnt > 0)
4114 {
4115 mcnt--;
4116 p += 2;
4117 STORE_NUMBER_AND_INCR (p, mcnt);
4118 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p, mcnt);
4119 }
4120 else if (mcnt == 0)
4121 {
4122 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2);
4123 p[2] = (unsigned char) no_op;
4124 p[3] = (unsigned char) no_op;
4125 goto on_failure;
4126 }
4127 break;
4128
4129 case jump_n:
4130 EXTRACT_NUMBER (mcnt, p + 2);
4131 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
4132
4133 /* Originally, this is how many times we CAN jump. */
4134 if (mcnt)
4135 {
4136 mcnt--;
4137 STORE_NUMBER (p + 2, mcnt);
4138 goto unconditional_jump;
4139 }
4140 /* If don't have to jump any more, skip over the rest of command. */
4141 else
4142 p += 4;
4143 break;
4144
4145 case set_number_at:
4146 {
4147 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
4148
4149 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4150 p1 = p + mcnt;
4151 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4152 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
4153 STORE_NUMBER (p1, mcnt);
4154 break;
4155 }
4156
4157 case wordbound:
4158 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
4159 if (AT_WORD_BOUNDARY (d))
4160 break;
4161 goto fail;
4162
4163 case notwordbound:
4164 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
4165 if (AT_WORD_BOUNDARY (d))
4166 goto fail;
4167 break;
4168
4169 case wordbeg:
4170 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
4171 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
4172 break;
4173 goto fail;
4174
4175 case wordend:
4176 DEBUG_PRINT1 ("EXECUTING wordend.\n");
4177 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
4178 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
4179 break;
4180 goto fail;
4181
4182#ifdef emacs
4183#ifdef emacs19
4184 case before_dot:
4185 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
4186 if (PTR_CHAR_POS ((unsigned char *) d) >= point)
4187 goto fail;
4188 break;
4189
4190 case at_dot:
4191 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
4192 if (PTR_CHAR_POS ((unsigned char *) d) != point)
4193 goto fail;
4194 break;
4195
4196 case after_dot:
4197 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
4198 if (PTR_CHAR_POS ((unsigned char *) d) <= point)
4199 goto fail;
4200 break;
4201#else /* not emacs19 */
4202 case at_dot:
4203 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
4204 if (PTR_CHAR_POS ((unsigned char *) d) + 1 != point)
4205 goto fail;
4206 break;
4207#endif /* not emacs19 */
4208
4209 case syntaxspec:
4210 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
4211 mcnt = *p++;
4212 goto matchsyntax;
4213
4214 case wordchar:
4215 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
4216 mcnt = (int) Sword;
4217 matchsyntax:
4218 PREFETCH ();
4219 if (SYNTAX (*d++) != (enum syntaxcode) mcnt)
4220 goto fail;
4221 SET_REGS_MATCHED ();
4222 break;
4223
4224 case notsyntaxspec:
4225 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
4226 mcnt = *p++;
4227 goto matchnotsyntax;
4228
4229 case notwordchar:
4230 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
4231 mcnt = (int) Sword;
4232 matchnotsyntax:
4233 PREFETCH ();
4234 if (SYNTAX (*d++) == (enum syntaxcode) mcnt)
4235 goto fail;
4236 SET_REGS_MATCHED ();
4237 break;
4238
4239#else /* not emacs */
4240 case wordchar:
4241 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
4242 PREFETCH ();
4243 if (!WORDCHAR_P (d))
4244 goto fail;
4245 SET_REGS_MATCHED ();
4246 d++;
4247 break;
4248
4249 case notwordchar:
4250 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
4251 PREFETCH ();
4252 if (WORDCHAR_P (d))
4253 goto fail;
4254 SET_REGS_MATCHED ();
4255 d++;
4256 break;
4257#endif /* not emacs */
4258
4259 default:
4260 abort ();
4261 }
4262 continue; /* Successfully executed one pattern command; keep going. */
4263
4264
4265 /* We goto here if a matching operation fails. */
4266 fail:
4267 if (!FAIL_STACK_EMPTY ())
4268 { /* A restart point is known. Restore to that state. */
4269 DEBUG_PRINT1 ("\nFAIL:\n");
4270 POP_FAILURE_POINT (d, p,
4271 lowest_active_reg, highest_active_reg,
4272 regstart, regend, reg_info);
4273
4274 /* If this failure point is a dummy, try the next one. */
4275 if (!p)
4276 goto fail;
4277
4278 /* If we failed to the end of the pattern, don't examine *p. */
4279 assert (p <= pend);
4280 if (p < pend)
4281 {
4282 boolean is_a_jump_n = false;
4283
4284 /* If failed to a backwards jump that's part of a repetition
4285 loop, need to pop this failure point and use the next one. */
4286 switch ((re_opcode_t) *p)
4287 {
4288 case jump_n:
4289 is_a_jump_n = true;
4290 case maybe_pop_jump:
4291 case pop_failure_jump:
4292 case jump:
4293 p1 = p + 1;
4294 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4295 p1 += mcnt;
4296
4297 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
4298 || (!is_a_jump_n
4299 && (re_opcode_t) *p1 == on_failure_jump))
4300 goto fail;
4301 break;
4302 default:
4303 /* do nothing */ ;
4304 }
4305 }
4306
4307 if (d >= string1 && d <= end1)
4308 dend = end_match_1;
4309 }
4310 else
4311 break; /* Matching at this starting point really fails. */
4312 } /* for (;;) */
4313
4314 if (best_regs_set)
4315 goto restore_best_regs;
4316
4317 FREE_VARIABLES ();
4318
4319 return -1; /* Failure to match. */
4320} /* re_match_2 */
4321
4322/* Subroutine definitions for re_match_2. */
4323
4324
4325/* We are passed P pointing to a register number after a start_memory.
4326
4327 Return true if the pattern up to the corresponding stop_memory can
4328 match the empty string, and false otherwise.
4329
4330 If we find the matching stop_memory, sets P to point to one past its number.
4331 Otherwise, sets P to an undefined byte less than or equal to END.
4332
4333 We don't handle duplicates properly (yet). */
4334
4335static boolean
4336group_match_null_string_p (p, end, reg_info)
4337 unsigned char **p, *end;
4338 register_info_type *reg_info;
4339{
4340 int mcnt;
4341 /* Point to after the args to the start_memory. */
4342 unsigned char *p1 = *p + 2;
4343
4344 while (p1 < end)
4345 {
4346 /* Skip over opcodes that can match nothing, and return true or
4347 false, as appropriate, when we get to one that can't, or to the
4348 matching stop_memory. */
4349
4350 switch ((re_opcode_t) *p1)
4351 {
4352 /* Could be either a loop or a series of alternatives. */
4353 case on_failure_jump:
4354 p1++;
4355 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4356
4357 /* If the next operation is not a jump backwards in the
4358 pattern. */
4359
4360 if (mcnt >= 0)
4361 {
4362 /* Go through the on_failure_jumps of the alternatives,
4363 seeing if any of the alternatives cannot match nothing.
4364 The last alternative starts with only a jump,
4365 whereas the rest start with on_failure_jump and end
4366 with a jump, e.g., here is the pattern for `a|b|c':
4367
4368 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
4369 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
4370 /exactn/1/c
4371
4372 So, we have to first go through the first (n-1)
4373 alternatives and then deal with the last one separately. */
4374
4375
4376 /* Deal with the first (n-1) alternatives, which start
4377 with an on_failure_jump (see above) that jumps to right
4378 past a jump_past_alt. */
4379
4380 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
4381 {
4382 /* `mcnt' holds how many bytes long the alternative
4383 is, including the ending `jump_past_alt' and
4384 its number. */
4385
4386 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
4387 reg_info))
4388 return false;
4389
4390 /* Move to right after this alternative, including the
4391 jump_past_alt. */
4392 p1 += mcnt;
4393
4394 /* Break if it's the beginning of an n-th alternative
4395 that doesn't begin with an on_failure_jump. */
4396 if ((re_opcode_t) *p1 != on_failure_jump)
4397 break;
4398
4399 /* Still have to check that it's not an n-th
4400 alternative that starts with an on_failure_jump. */
4401 p1++;
4402 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4403 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
4404 {
4405 /* Get to the beginning of the n-th alternative. */
4406 p1 -= 3;
4407 break;
4408 }
4409 }
4410
4411 /* Deal with the last alternative: go back and get number
4412 of the `jump_past_alt' just before it. `mcnt' contains
4413 the length of the alternative. */
4414 EXTRACT_NUMBER (mcnt, p1 - 2);
4415
4416 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
4417 return false;
4418
4419 p1 += mcnt; /* Get past the n-th alternative. */
4420 } /* if mcnt > 0 */
4421 break;
4422
4423
4424 case stop_memory:
4425 assert (p1[1] == **p);
4426 *p = p1 + 2;
4427 return true;
4428
4429
4430 default:
4431 if (!common_op_match_null_string_p (&p1, end, reg_info))
4432 return false;
4433 }
4434 } /* while p1 < end */
4435
4436 return false;
4437} /* group_match_null_string_p */
4438
4439
4440/* Similar to group_match_null_string_p, but doesn't deal with alternatives:
4441 It expects P to be the first byte of a single alternative and END one
4442 byte past the last. The alternative can contain groups. */
4443
4444static boolean
4445alt_match_null_string_p (p, end, reg_info)
4446 unsigned char *p, *end;
4447 register_info_type *reg_info;
4448{
4449 int mcnt;
4450 unsigned char *p1 = p;
4451
4452 while (p1 < end)
4453 {
4454 /* Skip over opcodes that can match nothing, and break when we get
4455 to one that can't. */
4456
4457 switch ((re_opcode_t) *p1)
4458 {
4459 /* It's a loop. */
4460 case on_failure_jump:
4461 p1++;
4462 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4463 p1 += mcnt;
4464 break;
4465
4466 default:
4467 if (!common_op_match_null_string_p (&p1, end, reg_info))
4468 return false;
4469 }
4470 } /* while p1 < end */
4471
4472 return true;
4473} /* alt_match_null_string_p */
4474
4475
4476/* Deals with the ops common to group_match_null_string_p and
4477 alt_match_null_string_p.
4478
4479 Sets P to one after the op and its arguments, if any. */
4480
4481static boolean
4482common_op_match_null_string_p (p, end, reg_info)
4483 unsigned char **p, *end;
4484 register_info_type *reg_info;
4485{
4486 int mcnt;
4487 boolean ret;
4488 int reg_no;
4489 unsigned char *p1 = *p;
4490
4491 switch ((re_opcode_t) *p1++)
4492 {
4493 case no_op:
4494 case begline:
4495 case endline:
4496 case begbuf:
4497 case endbuf:
4498 case wordbeg:
4499 case wordend:
4500 case wordbound:
4501 case notwordbound:
4502#ifdef emacs
4503 case before_dot:
4504 case at_dot:
4505 case after_dot:
4506#endif
4507 break;
4508
4509 case start_memory:
4510 reg_no = *p1;
4511 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
4512 ret = group_match_null_string_p (&p1, end, reg_info);
4513
4514 /* Have to set this here in case we're checking a group which
4515 contains a group and a back reference to it. */
4516
4517 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
4518 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
4519
4520 if (!ret)
4521 return false;
4522 break;
4523
4524 /* If this is an optimized succeed_n for zero times, make the jump. */
4525 case jump:
4526 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4527 if (mcnt >= 0)
4528 p1 += mcnt;
4529 else
4530 return false;
4531 break;
4532
4533 case succeed_n:
4534 /* Get to the number of times to succeed. */
4535 p1 += 2;
4536 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4537
4538 if (mcnt == 0)
4539 {
4540 p1 -= 4;
4541 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4542 p1 += mcnt;
4543 }
4544 else
4545 return false;
4546 break;
4547
4548 case duplicate:
4549 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
4550 return false;
4551 break;
4552
4553 case set_number_at:
4554 p1 += 4;
4555
4556 default:
4557 /* All other opcodes mean we cannot match the empty string. */
4558 return false;
4559 }
4560
4561 *p = p1;
4562 return true;
4563} /* common_op_match_null_string_p */
4564
4565
/* Compare S1 and S2 byte-by-byte after mapping each byte through
   TRANSLATE.

   Returns zero if TRANSLATE[S1[i]] == TRANSLATE[S2[i]] for every i in
   [0, LEN); returns 1 at the first position where they differ.

   A LEN of zero or less compares nothing and returns zero.  (The loop
   is bounded with `<' rather than counting down to exactly zero, so a
   negative LEN can no longer run past the ends of the buffers.)  */

static int
bcmp_translate(
  unsigned char *s1,
  unsigned char *s2,
  int len,
  char *translate
)
{
  int i;

  for (i = 0; i < len; i++)
    {
      /* Bytes are unsigned, so they index TRANSLATE directly.  */
      if (translate[s1[i]] != translate[s2[i]])
        return 1;
    }
  return 0;
}
4585
4586/* Entry points for GNU code. */
4587
4588/* re_compile_pattern is the GNU regular expression compiler: it
4589 compiles PATTERN (of length SIZE) and puts the result in BUFP.
4590 Returns 0 if the pattern was valid, otherwise an error string.
4591
4592 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
4593 are set in BUFP on entry.
4594
4595 We call regex_compile to do the actual compilation. */
4596
4597const char *
4598re_compile_pattern (pattern, length, bufp)
4599 const char *pattern;
4600 int length;
4601 struct re_pattern_buffer *bufp;
4602{
4603 reg_errcode_t ret;
4604
4605 /* GNU code is written to assume at least RE_NREGS registers will be set
4606 (and at least one extra will be -1). */
4607 bufp->regs_allocated = REGS_UNALLOCATED;
4608
4609 /* And GNU code determines whether or not to get register information
4610 by passing null for the REGS argument to re_match, etc., not by
4611 setting no_sub. */
4612 bufp->no_sub = 0;
4613
4614 /* Match anchors at newline. */
4615 bufp->newline_anchor = 1;
4616
4617 ret = regex_compile (pattern, length, re_syntax_options, bufp);
4618
4619 return re_error_msg[(int) ret];
4620}
4621
4622/* Entry points compatible with 4.2 BSD regex library. We don't define
4623 them if this is an Emacs or POSIX compilation. */
4624
4625#if !defined (emacs) && !defined (_POSIX_SOURCE)
4626
4627/* BSD has one and only one pattern buffer. */
4628static struct re_pattern_buffer re_comp_buf;
4629
4630char *
4631re_comp (s)
4632 const char *s;
4633{
4634 reg_errcode_t ret;
4635
4636 if (!s)
4637 {
4638 if (!re_comp_buf.buffer)
4639 return "No previous regular expression";
4640 return 0;
4641 }
4642
4643 if (!re_comp_buf.buffer)
4644 {
4645 re_comp_buf.buffer = (unsigned char *) malloc (200);
4646 if (re_comp_buf.buffer == NULL)
4647 return "Memory exhausted";
4648 re_comp_buf.allocated = 200;
4649
4650 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
4651 if (re_comp_buf.fastmap == NULL)
4652 return "Memory exhausted";
4653 }
4654
4655 /* Since `re_exec' always passes NULL for the `regs' argument, we
4656 don't need to initialize the pattern buffer fields which affect it. */
4657
4658 /* Match anchors at newlines. */
4659 re_comp_buf.newline_anchor = 1;
4660
4661 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
4662
4663 /* Yes, we're discarding `const' here. */
4664 return (char *) re_error_msg[(int) ret];
4665}
4666
4667
4668int
4669re_exec (s)
4670 const char *s;
4671{
4672 const int len = strlen (s);
4673 return
4674 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
4675}
4676#endif /* not emacs and not _POSIX_SOURCE */
4677
4678/* POSIX.2 functions. Don't define these for Emacs. */
4679
4680#ifndef emacs
4681
4682/* regcomp takes a regular expression as a string and compiles it.
4683
4684 PREG is a regex_t *. We do not expect any fields to be initialized,
4685 since POSIX says we shouldn't. Thus, we set
4686
4687 `buffer' to the compiled pattern;
4688 `used' to the length of the compiled pattern;
4689 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
4690 REG_EXTENDED bit in CFLAGS is set; otherwise, to
4691 RE_SYNTAX_POSIX_BASIC;
4692 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
4693 `fastmap' and `fastmap_accurate' to zero;
4694 `re_nsub' to the number of subexpressions in PATTERN.
4695
4696 PATTERN is the address of the pattern string.
4697
4698 CFLAGS is a series of bits which affect compilation.
4699
4700 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
4701 use POSIX basic syntax.
4702
4703 If REG_NEWLINE is set, then . and [^...] don't match newline.
4704 Also, regexec will try a match beginning after every newline.
4705
4706 If REG_ICASE is set, then we considers upper- and lowercase
4707 versions of letters to be equivalent when matching.
4708
4709 If REG_NOSUB is set, then when PREG is passed to regexec, that
4710 routine will report only success or failure, and nothing about the
4711 registers.
4712
4713 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
4714 the return codes and their meanings.) */
4715
4716int
4717regcomp (preg, pattern, cflags)
4718 regex_t *preg;
4719 const char *pattern;
4720 int cflags;
4721{
4722 reg_errcode_t ret;
4723 unsigned syntax
4724 = (cflags & REG_EXTENDED) ?
4725 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
4726
4727 /* regex_compile will allocate the space for the compiled pattern. */
4728 preg->buffer = 0;
4729 preg->allocated = 0;
4730
4731 /* Don't bother to use a fastmap when searching. This simplifies the
4732 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
4733 characters after newlines into the fastmap. This way, we just try
4734 every character. */
4735 preg->fastmap = 0;
4736
4737 if (cflags & REG_ICASE)
4738 {
4739 unsigned i;
4740
4741 preg->translate = (char *) malloc (CHAR_SET_SIZE);
4742 if (preg->translate == NULL)
4743 return (int) REG_ESPACE;
4744
4745 /* Map uppercase characters to corresponding lowercase ones. */
4746 for (i = 0; i < CHAR_SET_SIZE; i++)
4747 preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
4748 }
4749 else
4750 preg->translate = NULL;
4751
4752 /* If REG_NEWLINE is set, newlines are treated differently. */
4753 if (cflags & REG_NEWLINE)
4754 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
4755 syntax &= ~RE_DOT_NEWLINE;
4756 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
4757 /* It also changes the matching behavior. */
4758 preg->newline_anchor = 1;
4759 }
4760 else
4761 preg->newline_anchor = 0;
4762
4763 preg->no_sub = !!(cflags & REG_NOSUB);
4764
4765 /* POSIX says a null character in the pattern terminates it, so we
4766 can use strlen here in compiling the pattern. */
4767 ret = regex_compile (pattern, strlen (pattern), syntax, preg);
4768
4769 /* POSIX doesn't distinguish between an unmatched open-group and an
4770 unmatched close-group: both are REG_EPAREN. */
4771 if (ret == REG_ERPAREN) ret = REG_EPAREN;
4772
4773 return (int) ret;
4774}
4775
4776
4777/* regexec searches for a given pattern, specified by PREG, in the
4778 string STRING.
4779
4780 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
4781 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
4782 least NMATCH elements, and we set them to the offsets of the
4783 corresponding matched substrings.
4784
4785 EFLAGS specifies `execution flags' which affect matching: if
4786 REG_NOTBOL is set, then ^ does not match at the beginning of the
4787 string; if REG_NOTEOL is set, then $ does not match at the end.
4788
4789 We return 0 if we find a match and REG_NOMATCH if not. */
4790
4791int
4792regexec (preg, string, nmatch, pmatch, eflags)
4793 const regex_t *preg;
4794 const char *string;
4795 size_t nmatch;
4796 regmatch_t pmatch[];
4797 int eflags;
4798{
4799 int ret;
4800 struct re_registers regs;
4801 regex_t private_preg;
4802 int len = strlen (string);
4803 boolean want_reg_info = !preg->no_sub && nmatch > 0;
4804
4805 private_preg = *preg;
4806
4807 private_preg.not_bol = !!(eflags & REG_NOTBOL);
4808 private_preg.not_eol = !!(eflags & REG_NOTEOL);
4809
4810 /* The user has told us exactly how many registers to return
4811 information about, via `nmatch'. We have to pass that on to the
4812 matching routines. */
4813 private_preg.regs_allocated = REGS_FIXED;
4814
4815 if (want_reg_info)
4816 {
4817 regs.num_regs = nmatch;
4818 regs.start = TALLOC (nmatch, regoff_t);
4819 regs.end = TALLOC (nmatch, regoff_t);
4820 if (regs.start == NULL || regs.end == NULL)
4821 return (int) REG_NOMATCH;
4822 }
4823
4824 /* Perform the searching operation. */
4825 ret = re_search (&private_preg, string, len,
4826 /* start: */ 0, /* range: */ len,
4827 want_reg_info ? &regs : (struct re_registers *) 0);
4828
4829 /* Copy the register information to the POSIX structure. */
4830 if (want_reg_info)
4831 {
4832 if (ret >= 0)
4833 {
4834 unsigned r;
4835
4836 for (r = 0; r < nmatch; r++)
4837 {
4838 pmatch[r].rm_so = regs.start[r];
4839 pmatch[r].rm_eo = regs.end[r];
4840 }
4841 }
4842
4843 /* If we needed the temporary register info, free the space now. */
4844 free (regs.start);
4845 free (regs.end);
4846 }
4847
4848 /* We want zero return to mean success, unlike `re_search'. */
4849 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
4850}
4851
4852
4853/* Returns a message corresponding to an error code, ERRCODE, returned
4854 from either regcomp or regexec. We don't use PREG here. */
4855
4856size_t
4857regerror (errcode, preg, errbuf, errbuf_size)
4858 int errcode;
4859 const regex_t *preg;
4860 char *errbuf;
4861 size_t errbuf_size;
4862{
4863 const char *msg;
4864 size_t msg_size;
4865
4866 if (errcode < 0
4867 || errcode >= (sizeof (re_error_msg) / sizeof (re_error_msg[0])))
4868 /* Only error codes returned by the rest of the code should be passed
4869 to this routine. If we are given anything else, or if other regex
4870 code generates an invalid error code, then the program has a bug.
4871 Dump core so we can fix it. */
4872 abort ();
4873
4874 msg = re_error_msg[errcode];
4875
4876 /* POSIX doesn't require that we do anything in this case, but why
4877 not be nice. */
4878 if (! msg)
4879 msg = "Success";
4880
4881 msg_size = strlen (msg) + 1; /* Includes the null. */
4882
4883 if (errbuf_size != 0)
4884 {
4885 if (msg_size > errbuf_size)
4886 {
4887 strncpy (errbuf, msg, errbuf_size - 1);
4888 errbuf[errbuf_size - 1] = 0;
4889 }
4890 else
4891 strcpy (errbuf, msg);
4892 }
4893
4894 return msg_size;
4895}
4896
4897
4898/* Free dynamically allocated space used by PREG. */
4899
4900void
4901regfree (preg)
4902 regex_t *preg;
4903{
4904 if (preg->buffer != NULL)
4905 free (preg->buffer);
4906 preg->buffer = NULL;
4907
4908 preg->allocated = 0;
4909 preg->used = 0;
4910
4911 if (preg->fastmap != NULL)
4912 free (preg->fastmap);
4913 preg->fastmap = NULL;
4914 preg->fastmap_accurate = 0;
4915
4916 if (preg->translate != NULL)
4917 free (preg->translate);
4918 preg->translate = NULL;
4919}
4920
4921#endif /* not emacs */
4922
4923/*
4924Local variables:
4925make-backup-files: t
4926version-control: t
4927trim-versions-without-asking: nil
4928End:
4929*/
diff --git a/win32/regex.h b/win32/regex.h
index 6eb64f140..61c968387 100644
--- a/win32/regex.h
+++ b/win32/regex.h
@@ -1,70 +1,90 @@
1/* Definitions for data structures and routines for the regular 1#include <stdio.h>
2 expression library, version 0.12. 2#include <stddef.h>
3 3
4 Copyright (C) 1985, 1989, 1990, 1991, 1992, 1993 Free Software Foundation, Inc. 4/* Definitions for data structures and routines for the regular
5 expression library.
6 Copyright (C) 1985,1989-93,1995-98,2000,2001,2002,2003,2005,2006,2008
7 Free Software Foundation, Inc.
8 This file is part of the GNU C Library.
5 9
6 This program is free software; you can redistribute it and/or modify 10 The GNU C Library is free software; you can redistribute it and/or
7 it under the terms of the GNU General Public License as published by 11 modify it under the terms of the GNU Lesser General Public
8 the Free Software Foundation; either version 2, or (at your option) 12 License as published by the Free Software Foundation; either
9 any later version. 13 version 2.1 of the License, or (at your option) any later version.
10 14
11 This program is distributed in the hope that it will be useful, 15 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of 16 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 GNU General Public License for more details. 18 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
19 19
20#ifndef __REGEXP_LIBRARY_H__ 20 You should have received a copy of the GNU Lesser General Public
21#define __REGEXP_LIBRARY_H__ 21 License along with the GNU C Library; if not, write to the Free
22 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23 02110-1301 USA. */
22 24
23/* POSIX says that <sys/types.h> must be included (by the caller) before 25#ifndef _REGEX_H
24 <regex.h>. */ 26#define _REGEX_H 1
25 27
26#ifdef VMS 28#ifdef HAVE_STDDEF_H
27/* VMS doesn't have `size_t' in <sys/types.h>, even though POSIX says it
28 should be there. */
29#include <stddef.h> 29#include <stddef.h>
30#endif 30#endif
31 31
32#ifdef HAVE_SYS_TYPES_H
33#include <sys/types.h>
34#endif
35
36#ifndef _LIBC
37#define __USE_GNU 1
38#endif
39
40/* Allow the use in C++ code. */
41#ifdef __cplusplus
42extern "C" {
43#endif
44
45/* The following two types have to be signed and unsigned integer type
46 wide enough to hold a value of a pointer. For most ANSI compilers
47 ptrdiff_t and size_t should be likely OK. Still size of these two
48 types is 2 for Microsoft C. Ugh... */
49typedef long int s_reg_t;
50typedef unsigned long int active_reg_t;
32 51
33/* The following bits are used to determine the regexp syntax we 52/* The following bits are used to determine the regexp syntax we
34 recognize. The set/not-set meanings are chosen so that Emacs syntax 53 recognize. The set/not-set meanings are chosen so that Emacs syntax
35 remains the value 0. The bits are given in alphabetical order, and 54 remains the value 0. The bits are given in alphabetical order, and
36 the definitions shifted by one from the previous bit; thus, when we 55 the definitions shifted by one from the previous bit; thus, when we
37 add or remove a bit, only one other definition need change. */ 56 add or remove a bit, only one other definition need change. */
38typedef unsigned reg_syntax_t; 57typedef unsigned long int reg_syntax_t;
39 58
59#ifdef __USE_GNU
40/* If this bit is not set, then \ inside a bracket expression is literal. 60/* If this bit is not set, then \ inside a bracket expression is literal.
41 If set, then such a \ quotes the following character. */ 61 If set, then such a \ quotes the following character. */
42#define RE_BACKSLASH_ESCAPE_IN_LISTS (1) 62# define RE_BACKSLASH_ESCAPE_IN_LISTS ((unsigned long int) 1)
43 63
44/* If this bit is not set, then + and ? are operators, and \+ and \? are 64/* If this bit is not set, then + and ? are operators, and \+ and \? are
45 literals. 65 literals.
46 If set, then \+ and \? are operators and + and ? are literals. */ 66 If set, then \+ and \? are operators and + and ? are literals. */
47#define RE_BK_PLUS_QM (RE_BACKSLASH_ESCAPE_IN_LISTS << 1) 67# define RE_BK_PLUS_QM (RE_BACKSLASH_ESCAPE_IN_LISTS << 1)
48 68
49/* If this bit is set, then character classes are supported. They are: 69/* If this bit is set, then character classes are supported. They are:
50 [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:], 70 [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:],
51 [:space:], [:print:], [:punct:], [:graph:], and [:cntrl:]. 71 [:space:], [:print:], [:punct:], [:graph:], and [:cntrl:].
52 If not set, then character classes are not supported. */ 72 If not set, then character classes are not supported. */
53#define RE_CHAR_CLASSES (RE_BK_PLUS_QM << 1) 73# define RE_CHAR_CLASSES (RE_BK_PLUS_QM << 1)
54 74
55/* If this bit is set, then ^ and $ are always anchors (outside bracket 75/* If this bit is set, then ^ and $ are always anchors (outside bracket
56 expressions, of course). 76 expressions, of course).
57 If this bit is not set, then it depends: 77 If this bit is not set, then it depends:
58 ^ is an anchor if it is at the beginning of a regular 78 ^ is an anchor if it is at the beginning of a regular
59 expression or after an open-group or an alternation operator; 79 expression or after an open-group or an alternation operator;
60 $ is an anchor if it is at the end of a regular expression, or 80 $ is an anchor if it is at the end of a regular expression, or
61 before a close-group or an alternation operator. 81 before a close-group or an alternation operator.
62 82
63 This bit could be (re)combined with RE_CONTEXT_INDEP_OPS, because 83 This bit could be (re)combined with RE_CONTEXT_INDEP_OPS, because
64 POSIX draft 11.2 says that * etc. in leading positions is undefined. 84 POSIX draft 11.2 says that * etc. in leading positions is undefined.
65 We already implemented a previous draft which made those constructs 85 We already implemented a previous draft which made those constructs
66 invalid, though, so we haven't changed the code back. */ 86 invalid, though, so we haven't changed the code back. */
67#define RE_CONTEXT_INDEP_ANCHORS (RE_CHAR_CLASSES << 1) 87# define RE_CONTEXT_INDEP_ANCHORS (RE_CHAR_CLASSES << 1)
68 88
69/* If this bit is set, then special characters are always special 89/* If this bit is set, then special characters are always special
70 regardless of where they are in the pattern. 90 regardless of where they are in the pattern.
@@ -72,63 +92,94 @@ typedef unsigned reg_syntax_t;
72 some contexts; otherwise they are ordinary. Specifically, 92 some contexts; otherwise they are ordinary. Specifically,
73 * + ? and intervals are only special when not after the beginning, 93 * + ? and intervals are only special when not after the beginning,
74 open-group, or alternation operator. */ 94 open-group, or alternation operator. */
75#define RE_CONTEXT_INDEP_OPS (RE_CONTEXT_INDEP_ANCHORS << 1) 95# define RE_CONTEXT_INDEP_OPS (RE_CONTEXT_INDEP_ANCHORS << 1)
76 96
77/* If this bit is set, then *, +, ?, and { cannot be first in an re or 97/* If this bit is set, then *, +, ?, and { cannot be first in an re or
78 immediately after an alternation or begin-group operator. */ 98 immediately after an alternation or begin-group operator. */
79#define RE_CONTEXT_INVALID_OPS (RE_CONTEXT_INDEP_OPS << 1) 99# define RE_CONTEXT_INVALID_OPS (RE_CONTEXT_INDEP_OPS << 1)
80 100
81/* If this bit is set, then . matches newline. 101/* If this bit is set, then . matches newline.
82 If not set, then it doesn't. */ 102 If not set, then it doesn't. */
83#define RE_DOT_NEWLINE (RE_CONTEXT_INVALID_OPS << 1) 103# define RE_DOT_NEWLINE (RE_CONTEXT_INVALID_OPS << 1)
84 104
85/* If this bit is set, then . doesn't match NUL. 105/* If this bit is set, then . doesn't match NUL.
86 If not set, then it does. */ 106 If not set, then it does. */
87#define RE_DOT_NOT_NULL (RE_DOT_NEWLINE << 1) 107# define RE_DOT_NOT_NULL (RE_DOT_NEWLINE << 1)
88 108
89/* If this bit is set, nonmatching lists [^...] do not match newline. 109/* If this bit is set, nonmatching lists [^...] do not match newline.
90 If not set, they do. */ 110 If not set, they do. */
91#define RE_HAT_LISTS_NOT_NEWLINE (RE_DOT_NOT_NULL << 1) 111# define RE_HAT_LISTS_NOT_NEWLINE (RE_DOT_NOT_NULL << 1)
92 112
93/* If this bit is set, either \{...\} or {...} defines an 113/* If this bit is set, either \{...\} or {...} defines an
94 interval, depending on RE_NO_BK_BRACES. 114 interval, depending on RE_NO_BK_BRACES.
95 If not set, \{, \}, {, and } are literals. */ 115 If not set, \{, \}, {, and } are literals. */
96#define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1) 116# define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1)
97 117
98/* If this bit is set, +, ? and | aren't recognized as operators. 118/* If this bit is set, +, ? and | aren't recognized as operators.
99 If not set, they are. */ 119 If not set, they are. */
100#define RE_LIMITED_OPS (RE_INTERVALS << 1) 120# define RE_LIMITED_OPS (RE_INTERVALS << 1)
101 121
102/* If this bit is set, newline is an alternation operator. 122/* If this bit is set, newline is an alternation operator.
103 If not set, newline is literal. */ 123 If not set, newline is literal. */
104#define RE_NEWLINE_ALT (RE_LIMITED_OPS << 1) 124# define RE_NEWLINE_ALT (RE_LIMITED_OPS << 1)
105 125
106/* If this bit is set, then `{...}' defines an interval, and \{ and \} 126/* If this bit is set, then `{...}' defines an interval, and \{ and \}
107 are literals. 127 are literals.
108 If not set, then `\{...\}' defines an interval. */ 128 If not set, then `\{...\}' defines an interval. */
109#define RE_NO_BK_BRACES (RE_NEWLINE_ALT << 1) 129# define RE_NO_BK_BRACES (RE_NEWLINE_ALT << 1)
110 130
111/* If this bit is set, (...) defines a group, and \( and \) are literals. 131/* If this bit is set, (...) defines a group, and \( and \) are literals.
112 If not set, \(...\) defines a group, and ( and ) are literals. */ 132 If not set, \(...\) defines a group, and ( and ) are literals. */
113#define RE_NO_BK_PARENS (RE_NO_BK_BRACES << 1) 133# define RE_NO_BK_PARENS (RE_NO_BK_BRACES << 1)
114 134
115/* If this bit is set, then \<digit> matches <digit>. 135/* If this bit is set, then \<digit> matches <digit>.
116 If not set, then \<digit> is a back-reference. */ 136 If not set, then \<digit> is a back-reference. */
117#define RE_NO_BK_REFS (RE_NO_BK_PARENS << 1) 137# define RE_NO_BK_REFS (RE_NO_BK_PARENS << 1)
118 138
119/* If this bit is set, then | is an alternation operator, and \| is literal. 139/* If this bit is set, then | is an alternation operator, and \| is literal.
120 If not set, then \| is an alternation operator, and | is literal. */ 140 If not set, then \| is an alternation operator, and | is literal. */
121#define RE_NO_BK_VBAR (RE_NO_BK_REFS << 1) 141# define RE_NO_BK_VBAR (RE_NO_BK_REFS << 1)
122 142
123/* If this bit is set, then an ending range point collating higher 143/* If this bit is set, then an ending range point collating higher
124 than the starting range point, as in [z-a], is invalid. 144 than the starting range point, as in [z-a], is invalid.
125 If not set, then when ending range point collates higher than the 145 If not set, then when ending range point collates higher than the
126 starting range point, the range is ignored. */ 146 starting range point, the range is ignored. */
127#define RE_NO_EMPTY_RANGES (RE_NO_BK_VBAR << 1) 147# define RE_NO_EMPTY_RANGES (RE_NO_BK_VBAR << 1)
128 148
129/* If this bit is set, then an unmatched ) is ordinary. 149/* If this bit is set, then an unmatched ) is ordinary.
130 If not set, then an unmatched ) is invalid. */ 150 If not set, then an unmatched ) is invalid. */
131#define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_EMPTY_RANGES << 1) 151# define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_EMPTY_RANGES << 1)
152
153/* If this bit is set, succeed as soon as we match the whole pattern,
154 without further backtracking. */
155# define RE_NO_POSIX_BACKTRACKING (RE_UNMATCHED_RIGHT_PAREN_ORD << 1)
156
157/* If this bit is set, do not process the GNU regex operators.
158 If not set, then the GNU regex operators are recognized. */
159# define RE_NO_GNU_OPS (RE_NO_POSIX_BACKTRACKING << 1)
160
161/* If this bit is set, a syntactically invalid interval is treated as
162 a string of ordinary characters. For example, the ERE 'a{1' is
163 treated as 'a\{1'. */
164# define RE_INVALID_INTERVAL_ORD (RE_NO_GNU_OPS << 1)
165
166/* If this bit is set, then ignore case when matching.
167 If not set, then case is significant. */
168# define RE_ICASE (RE_INVALID_INTERVAL_ORD << 1)
169
170/* This bit is used internally like RE_CONTEXT_INDEP_ANCHORS but only
171 for ^, because it is difficult to scan the regex backwards to find
172 whether ^ should be special. */
173# define RE_CARET_ANCHORS_HERE (RE_ICASE << 1)
174
175/* If this bit is set, then \{ cannot be first in a BRE or
176 immediately after an alternation or begin-group operator. */
177# define RE_CONTEXT_INVALID_DUP (RE_CARET_ANCHORS_HERE << 1)
178
179/* If this bit is set, then no_sub will be set to 1 during
180 re_compile_pattern. */
181#define RE_NO_SUB (RE_CONTEXT_INVALID_DUP << 1)
182#endif
132 183
133/* This global variable defines the particular regexp syntax to use (for 184/* This global variable defines the particular regexp syntax to use (for
134 some interfaces). When a regexp is compiled, the syntax used is 185 some interfaces). When a regexp is compiled, the syntax used is
@@ -136,6 +187,7 @@ typedef unsigned reg_syntax_t;
136 already-compiled regexps. */ 187 already-compiled regexps. */
137extern reg_syntax_t re_syntax_options; 188extern reg_syntax_t re_syntax_options;
138 189
190#ifdef __USE_GNU
139/* Define combinations of the above bits for the standard possibilities. 191/* Define combinations of the above bits for the standard possibilities.
140 (The [[[ comments delimit what gets put into the Texinfo file, so 192 (The [[[ comments delimit what gets put into the Texinfo file, so
141 don't delete them!) */ 193 don't delete them!) */
@@ -143,13 +195,22 @@ extern reg_syntax_t re_syntax_options;
143#define RE_SYNTAX_EMACS 0 195#define RE_SYNTAX_EMACS 0
144 196
145#define RE_SYNTAX_AWK \ 197#define RE_SYNTAX_AWK \
146 (RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL \ 198 (RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL \
147 | RE_NO_BK_PARENS | RE_NO_BK_REFS \ 199 | RE_NO_BK_PARENS | RE_NO_BK_REFS \
148 | RE_NO_BK_VBAR | RE_NO_EMPTY_RANGES \ 200 | RE_NO_BK_VBAR | RE_NO_EMPTY_RANGES \
149 | RE_UNMATCHED_RIGHT_PAREN_ORD) 201 | RE_DOT_NEWLINE | RE_CONTEXT_INDEP_ANCHORS \
150 202 | RE_UNMATCHED_RIGHT_PAREN_ORD | RE_NO_GNU_OPS)
151#define RE_SYNTAX_POSIX_AWK \ 203
152 (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS) 204#define RE_SYNTAX_GNU_AWK \
205 ((RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS \
206 | RE_INVALID_INTERVAL_ORD) \
207 & ~(RE_DOT_NOT_NULL | RE_CONTEXT_INDEP_OPS \
208 | RE_CONTEXT_INVALID_OPS ))
209
210#define RE_SYNTAX_POSIX_AWK \
211 (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS \
212 | RE_INTERVALS | RE_NO_GNU_OPS \
213 | RE_INVALID_INTERVAL_ORD)
153 214
154#define RE_SYNTAX_GREP \ 215#define RE_SYNTAX_GREP \
155 (RE_BK_PLUS_QM | RE_CHAR_CLASSES \ 216 (RE_BK_PLUS_QM | RE_CHAR_CLASSES \
@@ -163,7 +224,8 @@ extern reg_syntax_t re_syntax_options;
163 | RE_NO_BK_VBAR) 224 | RE_NO_BK_VBAR)
164 225
165#define RE_SYNTAX_POSIX_EGREP \ 226#define RE_SYNTAX_POSIX_EGREP \
166 (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES) 227 (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES \
228 | RE_INVALID_INTERVAL_ORD)
167 229
168/* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */ 230/* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */
169#define RE_SYNTAX_ED RE_SYNTAX_POSIX_BASIC 231#define RE_SYNTAX_ED RE_SYNTAX_POSIX_BASIC
@@ -176,7 +238,7 @@ extern reg_syntax_t re_syntax_options;
176 | RE_INTERVALS | RE_NO_EMPTY_RANGES) 238 | RE_INTERVALS | RE_NO_EMPTY_RANGES)
177 239
178#define RE_SYNTAX_POSIX_BASIC \ 240#define RE_SYNTAX_POSIX_BASIC \
179 (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM) 241 (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM | RE_CONTEXT_INVALID_DUP)
180 242
181/* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes 243/* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes
182 RE_LIMITED_OPS, i.e., \? \+ \| are not recognized. Actually, this 244 RE_LIMITED_OPS, i.e., \? \+ \| are not recognized. Actually, this
@@ -185,13 +247,13 @@ extern reg_syntax_t re_syntax_options;
185 (_RE_SYNTAX_POSIX_COMMON | RE_LIMITED_OPS) 247 (_RE_SYNTAX_POSIX_COMMON | RE_LIMITED_OPS)
186 248
187#define RE_SYNTAX_POSIX_EXTENDED \ 249#define RE_SYNTAX_POSIX_EXTENDED \
188 (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ 250 (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \
189 | RE_CONTEXT_INDEP_OPS | RE_NO_BK_BRACES \ 251 | RE_CONTEXT_INDEP_OPS | RE_NO_BK_BRACES \
190 | RE_NO_BK_PARENS | RE_NO_BK_VBAR \ 252 | RE_NO_BK_PARENS | RE_NO_BK_VBAR \
191 | RE_UNMATCHED_RIGHT_PAREN_ORD) 253 | RE_CONTEXT_INVALID_OPS | RE_UNMATCHED_RIGHT_PAREN_ORD)
192 254
193/* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS 255/* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INDEP_OPS is
194 replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. */ 256 removed and RE_NO_BK_REFS is added. */
195#define RE_SYNTAX_POSIX_MINIMAL_EXTENDED \ 257#define RE_SYNTAX_POSIX_MINIMAL_EXTENDED \
196 (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ 258 (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \
197 | RE_CONTEXT_INVALID_OPS | RE_NO_BK_BRACES \ 259 | RE_CONTEXT_INVALID_OPS | RE_NO_BK_BRACES \
@@ -202,10 +264,12 @@ extern reg_syntax_t re_syntax_options;
202/* Maximum number of duplicates an interval can allow. Some systems 264/* Maximum number of duplicates an interval can allow. Some systems
203 (erroneously) define this in other header files, but we want our 265 (erroneously) define this in other header files, but we want our
204 value, so remove any previous define. */ 266 value, so remove any previous define. */
205#ifdef RE_DUP_MAX 267# ifdef RE_DUP_MAX
206#undef RE_DUP_MAX 268# undef RE_DUP_MAX
269# endif
270/* If sizeof(int) == 2, then ((1 << 15) - 1) overflows. */
271# define RE_DUP_MAX (0x7fff)
207#endif 272#endif
208#define RE_DUP_MAX ((1 << 15) - 1)
209 273
210 274
211/* POSIX `cflags' bits (i.e., information for `regcomp'). */ 275/* POSIX `cflags' bits (i.e., information for `regcomp'). */
@@ -240,18 +304,26 @@ extern reg_syntax_t re_syntax_options;
240/* Like REG_NOTBOL, except for the end-of-line. */ 304/* Like REG_NOTBOL, except for the end-of-line. */
241#define REG_NOTEOL (1 << 1) 305#define REG_NOTEOL (1 << 1)
242 306
307/* Use PMATCH[0] to delimit the start and end of the search in the
308 buffer. */
309#define REG_STARTEND (1 << 2)
310
243 311
244/* If any error codes are removed, changed, or added, update the 312/* If any error codes are removed, changed, or added, update the
245 `re_error_msg' table in regex.c. */ 313 `re_error_msg' table in regex.c. */
246typedef enum 314typedef enum
247{ 315{
316#if defined _XOPEN_SOURCE || defined __USE_XOPEN2K
317 REG_ENOSYS = -1, /* This will never happen for this implementation. */
318#endif
319
248 REG_NOERROR = 0, /* Success. */ 320 REG_NOERROR = 0, /* Success. */
249 REG_NOMATCH, /* Didn't find a match (for regexec). */ 321 REG_NOMATCH, /* Didn't find a match (for regexec). */
250 322
251 /* POSIX regcomp return error codes. (In the order listed in the 323 /* POSIX regcomp return error codes. (In the order listed in the
252 standard.) */ 324 standard.) */
253 REG_BADPAT, /* Invalid pattern. */ 325 REG_BADPAT, /* Invalid pattern. */
254 REG_ECOLLATE, /* Not implemented. */ 326 REG_ECOLLATE, /* Invalid collating element. */
255 REG_ECTYPE, /* Invalid character class name. */ 327 REG_ECTYPE, /* Invalid character class name. */
256 REG_EESCAPE, /* Trailing backslash. */ 328 REG_EESCAPE, /* Trailing backslash. */
257 REG_ESUBREG, /* Invalid back reference. */ 329 REG_ESUBREG, /* Invalid back reference. */
@@ -275,85 +347,92 @@ typedef enum
275 compiled, the `re_nsub' field is available. All other fields are 347 compiled, the `re_nsub' field is available. All other fields are
276 private to the regex routines. */ 348 private to the regex routines. */
277 349
350#ifndef RE_TRANSLATE_TYPE
351# define __RE_TRANSLATE_TYPE unsigned char *
352# ifdef __USE_GNU
353# define RE_TRANSLATE_TYPE __RE_TRANSLATE_TYPE
354# endif
355#endif
356
357#ifdef __USE_GNU
358# define __REPB_PREFIX(name) name
359#else
360# define __REPB_PREFIX(name) __##name
361#endif
362
278struct re_pattern_buffer 363struct re_pattern_buffer
279{ 364{
280/* [[[begin pattern_buffer]]] */ 365 /* Space that holds the compiled pattern. It is declared as
281 /* Space that holds the compiled pattern. It is declared as 366 `unsigned char *' because its elements are sometimes used as
282 `unsigned char *' because its elements are 367 array indexes. */
283 sometimes used as array indexes. */ 368 unsigned char *__REPB_PREFIX(buffer);
284 unsigned char *buffer;
285 369
286 /* Number of bytes to which `buffer' points. */ 370 /* Number of bytes to which `buffer' points. */
287 unsigned long allocated; 371 unsigned long int __REPB_PREFIX(allocated);
288 372
289 /* Number of bytes actually used in `buffer'. */ 373 /* Number of bytes actually used in `buffer'. */
290 unsigned long used; 374 unsigned long int __REPB_PREFIX(used);
291 375
292 /* Syntax setting with which the pattern was compiled. */ 376 /* Syntax setting with which the pattern was compiled. */
293 reg_syntax_t syntax; 377 reg_syntax_t __REPB_PREFIX(syntax);
294 378
295 /* Pointer to a fastmap, if any, otherwise zero. re_search uses 379 /* Pointer to a fastmap, if any, otherwise zero. re_search uses the
296 the fastmap, if there is one, to skip over impossible 380 fastmap, if there is one, to skip over impossible starting points
297 starting points for matches. */ 381 for matches. */
298 char *fastmap; 382 char *__REPB_PREFIX(fastmap);
299 383
300 /* Either a translate table to apply to all characters before 384 /* Either a translate table to apply to all characters before
301 comparing them, or zero for no translation. The translation 385 comparing them, or zero for no translation. The translation is
302 is applied to a pattern when it is compiled and to a string 386 applied to a pattern when it is compiled and to a string when it
303 when it is matched. */ 387 is matched. */
304 char *translate; 388 __RE_TRANSLATE_TYPE __REPB_PREFIX(translate);
305 389
306 /* Number of subexpressions found by the compiler. */ 390 /* Number of subexpressions found by the compiler. */
307 size_t re_nsub; 391 size_t re_nsub;
308 392
309 /* Zero if this pattern cannot match the empty string, one else. 393 /* Zero if this pattern cannot match the empty string, one else.
310 Well, in truth it's used only in `re_search_2', to see 394 Well, in truth it's used only in `re_search_2', to see whether or
311 whether or not we should use the fastmap, so we don't set 395 not we should use the fastmap, so we don't set this absolutely
312 this absolutely perfectly; see `re_compile_fastmap' (the 396 perfectly; see `re_compile_fastmap' (the `duplicate' case). */
313 `duplicate' case). */ 397 unsigned __REPB_PREFIX(can_be_null) : 1;
314 unsigned can_be_null : 1; 398
315 399 /* If REGS_UNALLOCATED, allocate space in the `regs' structure
316 /* If REGS_UNALLOCATED, allocate space in the `regs' structure 400 for `max (RE_NREGS, re_nsub + 1)' groups.
317 for `max (RE_NREGS, re_nsub + 1)' groups. 401 If REGS_REALLOCATE, reallocate space if necessary.
318 If REGS_REALLOCATE, reallocate space if necessary. 402 If REGS_FIXED, use what's there. */
319 If REGS_FIXED, use what's there. */ 403#ifdef __USE_GNU
320#define REGS_UNALLOCATED 0 404# define REGS_UNALLOCATED 0
321#define REGS_REALLOCATE 1 405# define REGS_REALLOCATE 1
322#define REGS_FIXED 2 406# define REGS_FIXED 2
323 unsigned regs_allocated : 2; 407#endif
324 408 unsigned __REPB_PREFIX(regs_allocated) : 2;
325 /* Set to zero when `regex_compile' compiles a pattern; set to one
326 by `re_compile_fastmap' if it updates the fastmap. */
327 unsigned fastmap_accurate : 1;
328
329 /* If set, `re_match_2' does not return information about
330 subexpressions. */
331 unsigned no_sub : 1;
332
333 /* If set, a beginning-of-line anchor doesn't match at the
334 beginning of the string. */
335 unsigned not_bol : 1;
336
337 /* Similarly for an end-of-line anchor. */
338 unsigned not_eol : 1;
339
340 /* If true, an anchor at a newline matches. */
341 unsigned newline_anchor : 1;
342
343/* [[[end pattern_buffer]]] */
344};
345 409
346typedef struct re_pattern_buffer regex_t; 410 /* Set to zero when `regex_compile' compiles a pattern; set to one
411 by `re_compile_fastmap' if it updates the fastmap. */
412 unsigned __REPB_PREFIX(fastmap_accurate) : 1;
413
414 /* If set, `re_match_2' does not return information about
415 subexpressions. */
416 unsigned __REPB_PREFIX(no_sub) : 1;
417
418 /* If set, a beginning-of-line anchor doesn't match at the beginning
419 of the string. */
420 unsigned __REPB_PREFIX(not_bol) : 1;
421
422 /* Similarly for an end-of-line anchor. */
423 unsigned __REPB_PREFIX(not_eol) : 1;
347 424
425 /* If true, an anchor at a newline matches. */
426 unsigned __REPB_PREFIX(newline_anchor) : 1;
427};
348 428
349/* search.c (search_buffer) in Emacs needs this one opcode value. It is 429typedef struct re_pattern_buffer regex_t;
350 defined both in `regex.c' and here. */
351#define RE_EXACTN_VALUE 1
352 430
353/* Type for byte offsets within the string. POSIX mandates this. */ 431/* Type for byte offsets within the string. POSIX mandates this. */
354typedef int regoff_t; 432typedef int regoff_t;
355 433
356 434
435#ifdef __USE_GNU
357/* This is the structure we store register match data in. See 436/* This is the structure we store register match data in. See
358 regex.texinfo for a full description of what registers match. */ 437 regex.texinfo for a full description of what registers match. */
359struct re_registers 438struct re_registers
@@ -367,8 +446,9 @@ struct re_registers
367/* If `regs_allocated' is REGS_UNALLOCATED in the pattern buffer, 446/* If `regs_allocated' is REGS_UNALLOCATED in the pattern buffer,
368 `re_match_2' returns information about at least this many registers 447 `re_match_2' returns information about at least this many registers
369 the first time a `regs' structure is passed. */ 448 the first time a `regs' structure is passed. */
370#ifndef RE_NREGS 449# ifndef RE_NREGS
371#define RE_NREGS 30 450# define RE_NREGS 30
451# endif
372#endif 452#endif
373 453
374 454
@@ -383,38 +463,22 @@ typedef struct
383 463
384/* Declarations for routines. */ 464/* Declarations for routines. */
385 465
386/* To avoid duplicating every routine declaration -- once with a 466#ifdef __USE_GNU
387 prototype (if we are ANSI), and once without (if we aren't) -- we
388 use the following macro to declare argument types. This
389 unfortunately clutters up the declarations a bit, but I think it's
390 worth it. */
391
392#if __STDC__
393
394#define _RE_ARGS(args) args
395
396#else /* not __STDC__ */
397
398#define _RE_ARGS(args) ()
399
400#endif /* not __STDC__ */
401
402/* Sets the current default syntax to SYNTAX, and return the old syntax. 467/* Sets the current default syntax to SYNTAX, and return the old syntax.
403 You can also simply assign to the `re_syntax_options' variable. */ 468 You can also simply assign to the `re_syntax_options' variable. */
404extern reg_syntax_t re_set_syntax _RE_ARGS ((reg_syntax_t syntax)); 469extern reg_syntax_t re_set_syntax (reg_syntax_t __syntax);
405 470
406/* Compile the regular expression PATTERN, with length LENGTH 471/* Compile the regular expression PATTERN, with length LENGTH
407 and syntax given by the global `re_syntax_options', into the buffer 472 and syntax given by the global `re_syntax_options', into the buffer
408 BUFFER. Return NULL if successful, and an error string if not. */ 473 BUFFER. Return NULL if successful, and an error string if not. */
409extern const char *re_compile_pattern 474extern const char *re_compile_pattern (const char *__pattern, size_t __length,
410 _RE_ARGS ((const char *pattern, int length, 475 struct re_pattern_buffer *__buffer);
411 struct re_pattern_buffer *buffer));
412 476
413 477
414/* Compile a fastmap for the compiled pattern in BUFFER; used to 478/* Compile a fastmap for the compiled pattern in BUFFER; used to
415 accelerate searches. Return 0 if successful and -2 if there was an 479 accelerate searches. Return 0 if successful and -2 if there was an
416 internal error. */ 480 internal error. */
417extern int re_compile_fastmap _RE_ARGS ((struct re_pattern_buffer *buffer)); 481extern int re_compile_fastmap (struct re_pattern_buffer *__buffer);
418 482
419 483
420/* Search in the string STRING (with length LENGTH) for the pattern 484/* Search in the string STRING (with length LENGTH) for the pattern
@@ -422,31 +486,30 @@ extern int re_compile_fastmap _RE_ARGS ((struct re_pattern_buffer *buffer));
422 characters. Return the starting position of the match, -1 for no 486 characters. Return the starting position of the match, -1 for no
423 match, or -2 for an internal error. Also return register 487 match, or -2 for an internal error. Also return register
424 information in REGS (if REGS is nonzero and BUFFER->no_sub is zero). */ 488 information in REGS (if REGS is nonzero and BUFFER->no_sub is zero). */
425extern int re_search 489extern int re_search (struct re_pattern_buffer *__buffer, const char *__cstring,
426 _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string, 490 int __length, int __start, int __range,
427 int length, int start, int range, struct re_registers *regs)); 491 struct re_registers *__regs);
428 492
429 493
430/* Like `re_search', but search in the concatenation of STRING1 and 494/* Like `re_search', but search in the concatenation of STRING1 and
431 STRING2. Also, stop searching at index START + STOP. */ 495 STRING2. Also, stop searching at index START + STOP. */
432extern int re_search_2 496extern int re_search_2 (struct re_pattern_buffer *__buffer,
433 _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string1, 497 const char *__string1, int __length1,
434 int length1, const char *string2, int length2, 498 const char *__string2, int __length2, int __start,
435 int start, int range, struct re_registers *regs, int stop)); 499 int __range, struct re_registers *__regs, int __stop);
436 500
437 501
438/* Like `re_search', but return how many characters in STRING the regexp 502/* Like `re_search', but return how many characters in STRING the regexp
439 in BUFFER matched, starting at position START. */ 503 in BUFFER matched, starting at position START. */
440extern int re_match 504extern int re_match (struct re_pattern_buffer *__buffer, const char *__cstring,
441 _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string, 505 int __length, int __start, struct re_registers *__regs);
442 int length, int start, struct re_registers *regs));
443 506
444 507
445/* Relates to `re_match' as `re_search_2' relates to `re_search'. */ 508/* Relates to `re_match' as `re_search_2' relates to `re_search'. */
446extern int re_match_2 509extern int re_match_2 (struct re_pattern_buffer *__buffer,
447 _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string1, 510 const char *__string1, int __length1,
448 int length1, const char *string2, int length2, 511 const char *__string2, int __length2, int __start,
449 int start, struct re_registers *regs, int stop)); 512 struct re_registers *__regs, int __stop);
450 513
451 514
452/* Set REGS to hold NUM_REGS registers, storing them in STARTS and 515/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
@@ -461,30 +524,59 @@ extern int re_match_2
461 Unless this function is called, the first search or match using 524 Unless this function is called, the first search or match using
462 PATTERN_BUFFER will allocate its own register data, without 525 PATTERN_BUFFER will allocate its own register data, without
463 freeing the old data. */ 526 freeing the old data. */
464extern void re_set_registers 527extern void re_set_registers (struct re_pattern_buffer *__buffer,
465 _RE_ARGS ((struct re_pattern_buffer *buffer, struct re_registers *regs, 528 struct re_registers *__regs,
466 unsigned num_regs, regoff_t *starts, regoff_t *ends)); 529 unsigned int __num_regs,
467 530 regoff_t *__starts, regoff_t *__ends);
531#endif /* Use GNU */
532
533#if defined _REGEX_RE_COMP || (defined _LIBC && defined __USE_BSD)
534# ifndef _CRAY
468/* 4.2 bsd compatibility. */ 535/* 4.2 bsd compatibility. */
469extern char *re_comp _RE_ARGS ((const char *)); 536extern char *re_comp (const char *);
470extern int re_exec _RE_ARGS ((const char *)); 537extern int re_exec (const char *);
538# endif
539#endif
540
541/* GCC 2.95 and later have "__restrict"; C99 compilers have
542 "restrict", and "configure" may have defined "restrict". */
543#ifndef __restrict
544# if ! (2 < __GNUC__ || (2 == __GNUC__ && 95 <= __GNUC_MINOR__))
545# if defined restrict || 199901L <= __STDC_VERSION__
546# define __restrict restrict
547# else
548# define __restrict
549# endif
550# endif
551#endif
552/* gcc 3.1 and up support the [restrict] syntax. */
553#ifndef __restrict_arr
554# if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) \
555 && !defined __GNUG__
556# define __restrict_arr __restrict
557# else
558# define __restrict_arr
559# endif
560#endif
471 561
472/* POSIX compatibility. */ 562/* POSIX compatibility. */
473extern int regcomp _RE_ARGS ((regex_t *preg, const char *pattern, int cflags)); 563extern int regcomp (regex_t *__restrict __preg,
474extern int regexec 564 const char *__restrict __pattern,
475 _RE_ARGS ((const regex_t *preg, const char *string, size_t nmatch, 565 int __cflags);
476 regmatch_t pmatch[], int eflags)); 566
477extern size_t regerror 567extern int regexec (const regex_t *__restrict __preg,
478 _RE_ARGS ((int errcode, const regex_t *preg, char *errbuf, 568 const char *__restrict __cstring, size_t __nmatch,
479 size_t errbuf_size)); 569 regmatch_t __pmatch[__restrict_arr],
480extern void regfree _RE_ARGS ((regex_t *preg)); 570 int __eflags);
481 571
482#endif /* not __REGEXP_LIBRARY_H__ */ 572extern size_t regerror (int __errcode, const regex_t *__restrict __preg,
483 573 char *__restrict __errbuf, size_t __errbuf_size);
484/* 574
485Local variables: 575extern void regfree (regex_t *__preg);
486make-backup-files: t 576
487version-control: t 577
488trim-versions-without-asking: nil 578#ifdef __cplusplus
489End: 579}
490*/ 580#endif /* C++ */
581
582#endif /* regex.h */
diff --git a/win32/regex_internal.c b/win32/regex_internal.c
new file mode 100644
index 000000000..d4121f2f4
--- /dev/null
+++ b/win32/regex_internal.c
@@ -0,0 +1,1744 @@
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2006, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA. */
20
/* Prototypes for file-local helpers.  re_string_construct_common is
   called by re_string_allocate and re_string_construct before its own
   definition appears, hence the early declaration.  */
21static void re_string_construct_common (const char *str, int len,
22 re_string_t *pstr,
23 RE_TRANSLATE_TYPE trans, int icase,
24 const re_dfa_t *dfa) internal_function;
25static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
26 const re_node_set *nodes,
27 unsigned int hash) internal_function;
28static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
29 const re_node_set *nodes,
30 unsigned int context,
31 unsigned int hash) internal_function;
32
33#ifdef GAWK
34#undef MAX /* safety */
/* Return the larger of A and B.  The size_t result is converted to the
   int return type.  */
static int
MAX(size_t a, size_t b)
{
  if (a > b)
    return a;
  return b;
}
40#endif
41
42/* Functions for string operation. */
43
44/* This function allocate the buffers. It is necessary to call
45 re_string_reconstruct before using the object. */
46
47static reg_errcode_t
48internal_function
49re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
50 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
51{
52 reg_errcode_t ret;
53 int init_buf_len;
54
55 /* Ensure at least one character fits into the buffers. */
56 if (init_len < dfa->mb_cur_max)
57 init_len = dfa->mb_cur_max;
58 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
59 re_string_construct_common (str, len, pstr, trans, icase, dfa);
60
61 ret = re_string_realloc_buffers (pstr, init_buf_len);
62 if (BE (ret != REG_NOERROR, 0))
63 return ret;
64
65 pstr->word_char = dfa->word_char;
66 pstr->word_ops_used = dfa->word_ops_used;
67 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
68 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
69 pstr->valid_raw_len = pstr->valid_len;
70 return REG_NOERROR;
71}
72
73/* This function allocate the buffers, and initialize them. */
74
75static reg_errcode_t
76internal_function
77re_string_construct (re_string_t *pstr, const char *str, int len,
78 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
79{
80 reg_errcode_t ret;
81 memset (pstr, '\0', sizeof (re_string_t));
82 re_string_construct_common (str, len, pstr, trans, icase, dfa);
83
84 if (len > 0)
85 {
86 ret = re_string_realloc_buffers (pstr, len + 1);
87 if (BE (ret != REG_NOERROR, 0))
88 return ret;
89 }
90 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
91
92 if (icase)
93 {
94#ifdef RE_ENABLE_I18N
95 if (dfa->mb_cur_max > 1)
96 {
97 while (1)
98 {
99 ret = build_wcs_upper_buffer (pstr);
100 if (BE (ret != REG_NOERROR, 0))
101 return ret;
102 if (pstr->valid_raw_len >= len)
103 break;
104 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
105 break;
106 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
107 if (BE (ret != REG_NOERROR, 0))
108 return ret;
109 }
110 }
111 else
112#endif /* RE_ENABLE_I18N */
113 build_upper_buffer (pstr);
114 }
115 else
116 {
117#ifdef RE_ENABLE_I18N
118 if (dfa->mb_cur_max > 1)
119 build_wcs_buffer (pstr);
120 else
121#endif /* RE_ENABLE_I18N */
122 {
123 if (trans != NULL)
124 re_string_translate_buffer (pstr);
125 else
126 {
127 pstr->valid_len = pstr->bufs_len;
128 pstr->valid_raw_len = pstr->bufs_len;
129 }
130 }
131 }
132
133 return REG_NOERROR;
134}
135
136/* Helper functions for re_string_allocate, and re_string_construct. */
137
138static reg_errcode_t
139internal_function
140re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
141{
142#ifdef RE_ENABLE_I18N
143 if (pstr->mb_cur_max > 1)
144 {
145 wint_t *new_wcs;
146
147 /* Avoid overflow in realloc. */
148 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (int));
149 if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
150 return REG_ESPACE;
151
152 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
153 if (BE (new_wcs == NULL, 0))
154 return REG_ESPACE;
155 pstr->wcs = new_wcs;
156 if (pstr->offsets != NULL)
157 {
158 int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
159 if (BE (new_offsets == NULL, 0))
160 return REG_ESPACE;
161 pstr->offsets = new_offsets;
162 }
163 }
164#endif /* RE_ENABLE_I18N */
165 if (pstr->mbs_allocated)
166 {
167 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
168 new_buf_len);
169 if (BE (new_mbs == NULL, 0))
170 return REG_ESPACE;
171 pstr->mbs = new_mbs;
172 }
173 pstr->bufs_len = new_buf_len;
174 return REG_NOERROR;
175}
176
177
178static void
179internal_function
180re_string_construct_common (const char *str, int len, re_string_t *pstr,
181 RE_TRANSLATE_TYPE trans, int icase,
182 const re_dfa_t *dfa)
183{
184 pstr->raw_mbs = (const unsigned char *) str;
185 pstr->len = len;
186 pstr->raw_len = len;
187 pstr->trans = trans;
188 pstr->icase = icase ? 1 : 0;
189 pstr->mbs_allocated = (trans != NULL || icase);
190 pstr->mb_cur_max = dfa->mb_cur_max;
191 pstr->is_utf8 = dfa->is_utf8;
192 pstr->map_notascii = dfa->map_notascii;
193 pstr->stop = pstr->len;
194 pstr->raw_stop = pstr->stop;
195}
196
197#ifdef RE_ENABLE_I18N
198
199/* Build wide character buffer PSTR->WCS.
200 If the byte sequence of the string are:
201 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
202 Then wide character buffer will be:
203 <wc1> , WEOF , <wc2> , WEOF , <wc3>
204 We use WEOF for padding, they indicate that the position isn't
205 a first byte of a multibyte character.
206
207 Note that this function assumes PSTR->VALID_LEN elements are already
208 built and starts from PSTR->VALID_LEN. */
209
210static void
211internal_function
212build_wcs_buffer (re_string_t *pstr)
213{
214#ifdef _LIBC
215 unsigned char buf[MB_LEN_MAX];
216 assert (MB_LEN_MAX >= pstr->mb_cur_max);
217#else
218 unsigned char buf[64];
219#endif
220 mbstate_t prev_st;
221 int byte_idx, end_idx, remain_len;
222 size_t mbclen;
223
224 /* Build the buffers from pstr->valid_len to either pstr->len or
225 pstr->bufs_len. */
226 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
227 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
228 {
229 wchar_t wc;
230 const char *p;
231
232 remain_len = end_idx - byte_idx;
233 prev_st = pstr->cur_state;
234 /* Apply the translation if we need. */
235 if (BE (pstr->trans != NULL, 0))
236 {
237 int i, ch;
238
239 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
240 {
241 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
242 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
243 }
244 p = (const char *) buf;
245 }
246 else
247 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
248 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
249 if (BE (mbclen == (size_t) -2, 0))
250 {
251 /* The buffer doesn't have enough space, finish to build. */
252 pstr->cur_state = prev_st;
253 break;
254 }
255 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
256 {
257 /* We treat these cases as a singlebyte character. */
258 mbclen = 1;
259 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
260 if (BE (pstr->trans != NULL, 0))
261 wc = pstr->trans[wc];
262 pstr->cur_state = prev_st;
263 }
264
265 /* Write wide character and padding. */
266 pstr->wcs[byte_idx++] = wc;
267 /* Write paddings. */
268 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
269 pstr->wcs[byte_idx++] = WEOF;
270 }
271 pstr->valid_len = byte_idx;
272 pstr->valid_raw_len = byte_idx;
273}
274
275/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
276 but for REG_ICASE. */
277
278static reg_errcode_t
279internal_function
280build_wcs_upper_buffer (re_string_t *pstr)
281{
282 mbstate_t prev_st;
283 int src_idx, byte_idx, end_idx, remain_len;
284 size_t mbclen;
285#ifdef _LIBC
286 char buf[MB_LEN_MAX];
287 assert (MB_LEN_MAX >= pstr->mb_cur_max);
288#else
289 char buf[64];
290#endif
291
292 byte_idx = pstr->valid_len;
293 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
294
295 /* The following optimization assumes that ASCII characters can be
296 mapped to wide characters with a simple cast. */
297 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
298 {
299 while (byte_idx < end_idx)
300 {
301 wchar_t wc;
302
303 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
304 && mbsinit (&pstr->cur_state))
305 {
306 /* In case of a singlebyte character. */
307 pstr->mbs[byte_idx]
308 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
309 /* The next step uses the assumption that wchar_t is encoded
310 ASCII-safe: all ASCII values can be converted like this. */
311 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
312 ++byte_idx;
313 continue;
314 }
315
316 remain_len = end_idx - byte_idx;
317 prev_st = pstr->cur_state;
318 mbclen = __mbrtowc (&wc,
319 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
320 + byte_idx), remain_len, &pstr->cur_state);
321 if (BE (mbclen + 2 > 2, 1))
322 {
323 wchar_t wcu = wc;
324 if (iswlower (wc))
325 {
326 size_t mbcdlen;
327
328 wcu = towupper (wc);
329 mbcdlen = wcrtomb (buf, wcu, &prev_st);
330 if (BE (mbclen == mbcdlen, 1))
331 memcpy (pstr->mbs + byte_idx, buf, mbclen);
332 else
333 {
334 src_idx = byte_idx;
335 goto offsets_needed;
336 }
337 }
338 else
339 memcpy (pstr->mbs + byte_idx,
340 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
341 pstr->wcs[byte_idx++] = wcu;
342 /* Write paddings. */
343 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
344 pstr->wcs[byte_idx++] = WEOF;
345 }
346 else if (mbclen == (size_t) -1 || mbclen == 0)
347 {
348 /* It is an invalid character or '\0'. Just use the byte. */
349 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
350 pstr->mbs[byte_idx] = ch;
351 /* And also cast it to wide char. */
352 pstr->wcs[byte_idx++] = (wchar_t) ch;
353 if (BE (mbclen == (size_t) -1, 0))
354 pstr->cur_state = prev_st;
355 }
356 else
357 {
358 /* The buffer doesn't have enough space, finish to build. */
359 pstr->cur_state = prev_st;
360 break;
361 }
362 }
363 pstr->valid_len = byte_idx;
364 pstr->valid_raw_len = byte_idx;
365 return REG_NOERROR;
366 }
367 else
368 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
369 {
370 wchar_t wc;
371 const char *p;
372 offsets_needed:
373 remain_len = end_idx - byte_idx;
374 prev_st = pstr->cur_state;
375 if (BE (pstr->trans != NULL, 0))
376 {
377 int i, ch;
378
379 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
380 {
381 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
382 buf[i] = pstr->trans[ch];
383 }
384 p = (const char *) buf;
385 }
386 else
387 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
388 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
389 if (BE (mbclen + 2 > 2, 1))
390 {
391 wchar_t wcu = wc;
392 if (iswlower (wc))
393 {
394 size_t mbcdlen;
395
396 wcu = towupper (wc);
397 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
398 if (BE (mbclen == mbcdlen, 1))
399 memcpy (pstr->mbs + byte_idx, buf, mbclen);
400 else if (mbcdlen != (size_t) -1)
401 {
402 size_t i;
403
404 if (byte_idx + mbcdlen > pstr->bufs_len)
405 {
406 pstr->cur_state = prev_st;
407 break;
408 }
409
410 if (pstr->offsets == NULL)
411 {
412 pstr->offsets = re_malloc (int, pstr->bufs_len);
413
414 if (pstr->offsets == NULL)
415 return REG_ESPACE;
416 }
417 if (!pstr->offsets_needed)
418 {
419 for (i = 0; i < (size_t) byte_idx; ++i)
420 pstr->offsets[i] = i;
421 pstr->offsets_needed = 1;
422 }
423
424 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
425 pstr->wcs[byte_idx] = wcu;
426 pstr->offsets[byte_idx] = src_idx;
427 for (i = 1; i < mbcdlen; ++i)
428 {
429 pstr->offsets[byte_idx + i]
430 = src_idx + (i < mbclen ? i : mbclen - 1);
431 pstr->wcs[byte_idx + i] = WEOF;
432 }
433 pstr->len += mbcdlen - mbclen;
434 if (pstr->raw_stop > src_idx)
435 pstr->stop += mbcdlen - mbclen;
436 end_idx = (pstr->bufs_len > pstr->len)
437 ? pstr->len : pstr->bufs_len;
438 byte_idx += mbcdlen;
439 src_idx += mbclen;
440 continue;
441 }
442 else
443 memcpy (pstr->mbs + byte_idx, p, mbclen);
444 }
445 else
446 memcpy (pstr->mbs + byte_idx, p, mbclen);
447
448 if (BE (pstr->offsets_needed != 0, 0))
449 {
450 size_t i;
451 for (i = 0; i < mbclen; ++i)
452 pstr->offsets[byte_idx + i] = src_idx + i;
453 }
454 src_idx += mbclen;
455
456 pstr->wcs[byte_idx++] = wcu;
457 /* Write paddings. */
458 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
459 pstr->wcs[byte_idx++] = WEOF;
460 }
461 else if (mbclen == (size_t) -1 || mbclen == 0)
462 {
463 /* It is an invalid character or '\0'. Just use the byte. */
464 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
465
466 if (BE (pstr->trans != NULL, 0))
467 ch = pstr->trans [ch];
468 pstr->mbs[byte_idx] = ch;
469
470 if (BE (pstr->offsets_needed != 0, 0))
471 pstr->offsets[byte_idx] = src_idx;
472 ++src_idx;
473
474 /* And also cast it to wide char. */
475 pstr->wcs[byte_idx++] = (wchar_t) ch;
476 if (BE (mbclen == (size_t) -1, 0))
477 pstr->cur_state = prev_st;
478 }
479 else
480 {
481 /* The buffer doesn't have enough space, finish to build. */
482 pstr->cur_state = prev_st;
483 break;
484 }
485 }
486 pstr->valid_len = byte_idx;
487 pstr->valid_raw_len = src_idx;
488 return REG_NOERROR;
489}
490
491/* Skip characters until the index becomes greater than NEW_RAW_IDX.
492 Return the index. */
493
494static int
495internal_function
496re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
497{
498 mbstate_t prev_st;
499 int rawbuf_idx;
500 size_t mbclen;
501 wint_t wc = WEOF;
502
503 /* Skip the characters which are not necessary to check. */
504 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
505 rawbuf_idx < new_raw_idx;)
506 {
507 wchar_t wc2;
508 int remain_len = pstr->len - rawbuf_idx;
509 prev_st = pstr->cur_state;
510 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
511 remain_len, &pstr->cur_state);
512 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
513 {
514 /* We treat these cases as a single byte character. */
515 if (mbclen == 0 || remain_len == 0)
516 wc = L'\0';
517 else
518 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
519 mbclen = 1;
520 pstr->cur_state = prev_st;
521 }
522 else
523 wc = (wint_t) wc2;
524 /* Then proceed the next character. */
525 rawbuf_idx += mbclen;
526 }
527 *last_wc = (wint_t) wc;
528 return rawbuf_idx;
529}
530#endif /* RE_ENABLE_I18N */
531
532/* Build the buffer PSTR->MBS, and apply the translation if we need.
533 This function is used in case of REG_ICASE. */
534
535static void
536internal_function
537build_upper_buffer (re_string_t *pstr)
538{
539 int char_idx, end_idx;
540 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
541
542 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
543 {
544 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
545 if (BE (pstr->trans != NULL, 0))
546 ch = pstr->trans[ch];
547 if (islower (ch))
548 pstr->mbs[char_idx] = toupper (ch);
549 else
550 pstr->mbs[char_idx] = ch;
551 }
552 pstr->valid_len = char_idx;
553 pstr->valid_raw_len = char_idx;
554}
555
556/* Apply TRANS to the buffer in PSTR. */
557
558static void
559internal_function
560re_string_translate_buffer (re_string_t *pstr)
561{
562 int buf_idx, end_idx;
563 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
564
565 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
566 {
567 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
568 pstr->mbs[buf_idx] = pstr->trans[ch];
569 }
570
571 pstr->valid_len = buf_idx;
572 pstr->valid_raw_len = buf_idx;
573}
574
575/* This function re-construct the buffers.
576 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
577 convert to upper case in case of REG_ICASE, apply translation. */
578
579static reg_errcode_t
580internal_function
581re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
582{
583 int offset = idx - pstr->raw_mbs_idx;
584 if (BE (offset < 0, 0))
585 {
586 /* Reset buffer. */
587#ifdef RE_ENABLE_I18N
588 if (pstr->mb_cur_max > 1)
589 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
590#endif /* RE_ENABLE_I18N */
591 pstr->len = pstr->raw_len;
592 pstr->stop = pstr->raw_stop;
593 pstr->valid_len = 0;
594 pstr->raw_mbs_idx = 0;
595 pstr->valid_raw_len = 0;
596 pstr->offsets_needed = 0;
597 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
598 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
599 if (!pstr->mbs_allocated)
600 pstr->mbs = (unsigned char *) pstr->raw_mbs;
601 offset = idx;
602 }
603
604 if (BE (offset != 0, 1))
605 {
606 /* Should the already checked characters be kept? */
607 if (BE (offset < pstr->valid_raw_len, 1))
608 {
609 /* Yes, move them to the front of the buffer. */
610#ifdef RE_ENABLE_I18N
611 if (BE (pstr->offsets_needed, 0))
612 {
613 int low = 0, high = pstr->valid_len, mid;
614 do
615 {
616 mid = (high + low) / 2;
617 if (pstr->offsets[mid] > offset)
618 high = mid;
619 else if (pstr->offsets[mid] < offset)
620 low = mid + 1;
621 else
622 break;
623 }
624 while (low < high);
625 if (pstr->offsets[mid] < offset)
626 ++mid;
627 pstr->tip_context = re_string_context_at (pstr, mid - 1,
628 eflags);
629 /* This can be quite complicated, so handle specially
630 only the common and easy case where the character with
631 different length representation of lower and upper
632 case is present at or after offset. */
633 if (pstr->valid_len > offset
634 && mid == offset && pstr->offsets[mid] == offset)
635 {
636 memmove (pstr->wcs, pstr->wcs + offset,
637 (pstr->valid_len - offset) * sizeof (wint_t));
638 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
639 pstr->valid_len -= offset;
640 pstr->valid_raw_len -= offset;
641 for (low = 0; low < pstr->valid_len; low++)
642 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
643 }
644 else
645 {
646 /* Otherwise, just find out how long the partial multibyte
647 character at offset is and fill it with WEOF/255. */
648 pstr->len = pstr->raw_len - idx + offset;
649 pstr->stop = pstr->raw_stop - idx + offset;
650 pstr->offsets_needed = 0;
651 while (mid > 0 && pstr->offsets[mid - 1] == offset)
652 --mid;
653 while (mid < pstr->valid_len)
654 if (pstr->wcs[mid] != WEOF)
655 break;
656 else
657 ++mid;
658 if (mid == pstr->valid_len)
659 pstr->valid_len = 0;
660 else
661 {
662 pstr->valid_len = pstr->offsets[mid] - offset;
663 if (pstr->valid_len)
664 {
665 for (low = 0; low < pstr->valid_len; ++low)
666 pstr->wcs[low] = WEOF;
667 memset (pstr->mbs, 255, pstr->valid_len);
668 }
669 }
670 pstr->valid_raw_len = pstr->valid_len;
671 }
672 }
673 else
674#endif
675 {
676 pstr->tip_context = re_string_context_at (pstr, offset - 1,
677 eflags);
678#ifdef RE_ENABLE_I18N
679 if (pstr->mb_cur_max > 1)
680 memmove (pstr->wcs, pstr->wcs + offset,
681 (pstr->valid_len - offset) * sizeof (wint_t));
682#endif /* RE_ENABLE_I18N */
683 if (BE (pstr->mbs_allocated, 0))
684 memmove (pstr->mbs, pstr->mbs + offset,
685 pstr->valid_len - offset);
686 pstr->valid_len -= offset;
687 pstr->valid_raw_len -= offset;
688#if DEBUG
689 assert (pstr->valid_len > 0);
690#endif
691 }
692 }
693 else
694 {
695#ifdef RE_ENABLE_I18N
696 /* No, skip all characters until IDX. */
697 int prev_valid_len = pstr->valid_len;
698
699 if (BE (pstr->offsets_needed, 0))
700 {
701 pstr->len = pstr->raw_len - idx + offset;
702 pstr->stop = pstr->raw_stop - idx + offset;
703 pstr->offsets_needed = 0;
704 }
705#endif
706 pstr->valid_len = 0;
707#ifdef RE_ENABLE_I18N
708 if (pstr->mb_cur_max > 1)
709 {
710 int wcs_idx;
711 wint_t wc = WEOF;
712
713 if (pstr->is_utf8)
714 {
715 const unsigned char *raw, *p, *end;
716
717 /* Special case UTF-8. Multi-byte chars start with any
718 byte other than 0x80 - 0xbf. */
719 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
720 end = raw + (offset - pstr->mb_cur_max);
721 if (end < pstr->raw_mbs)
722 end = pstr->raw_mbs;
723 p = raw + offset - 1;
724#ifdef _LIBC
725 /* We know the wchar_t encoding is UCS4, so for the simple
726 case, ASCII characters, skip the conversion step. */
727 if (isascii (*p) && BE (pstr->trans == NULL, 1))
728 {
729 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
730 /* pstr->valid_len = 0; */
731 wc = (wchar_t) *p;
732 }
733 else
734#endif
735 for (; p >= end; --p)
736 if ((*p & 0xc0) != 0x80)
737 {
738 mbstate_t cur_state;
739 wchar_t wc2;
740 int mlen = raw + pstr->len - p;
741 unsigned char buf[6];
742 size_t mbclen;
743
744 if (BE (pstr->trans != NULL, 0))
745 {
746 int i = mlen < 6 ? mlen : 6;
747 while (--i >= 0)
748 buf[i] = pstr->trans[p[i]];
749 }
750 /* XXX Don't use mbrtowc, we know which conversion
751 to use (UTF-8 -> UCS4). */
752 memset (&cur_state, 0, sizeof (cur_state));
753 mbclen = __mbrtowc (&wc2, (const char *) p, mlen,
754 &cur_state);
755 if (raw + offset - p <= mbclen
756 && mbclen < (size_t) -2)
757 {
758 memset (&pstr->cur_state, '\0',
759 sizeof (mbstate_t));
760 pstr->valid_len = mbclen - (raw + offset - p);
761 wc = wc2;
762 }
763 break;
764 }
765 }
766
767 if (wc == WEOF)
768 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
769 if (wc == WEOF)
770 pstr->tip_context
771 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
772 else
773 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
774 && IS_WIDE_WORD_CHAR (wc))
775 ? CONTEXT_WORD
776 : ((IS_WIDE_NEWLINE (wc)
777 && pstr->newline_anchor)
778 ? CONTEXT_NEWLINE : 0));
779 if (BE (pstr->valid_len, 0))
780 {
781 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
782 pstr->wcs[wcs_idx] = WEOF;
783 if (pstr->mbs_allocated)
784 memset (pstr->mbs, 255, pstr->valid_len);
785 }
786 pstr->valid_raw_len = pstr->valid_len;
787 }
788 else
789#endif /* RE_ENABLE_I18N */
790 {
791 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
792 pstr->valid_raw_len = 0;
793 if (pstr->trans)
794 c = pstr->trans[c];
795 pstr->tip_context = (bitset_contain (pstr->word_char, c)
796 ? CONTEXT_WORD
797 : ((IS_NEWLINE (c) && pstr->newline_anchor)
798 ? CONTEXT_NEWLINE : 0));
799 }
800 }
801 if (!BE (pstr->mbs_allocated, 0))
802 pstr->mbs += offset;
803 }
804 pstr->raw_mbs_idx = idx;
805 pstr->len -= offset;
806 pstr->stop -= offset;
807
808 /* Then build the buffers. */
809#ifdef RE_ENABLE_I18N
810 if (pstr->mb_cur_max > 1)
811 {
812 if (pstr->icase)
813 {
814 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
815 if (BE (ret != REG_NOERROR, 0))
816 return ret;
817 }
818 else
819 build_wcs_buffer (pstr);
820 }
821 else
822#endif /* RE_ENABLE_I18N */
823 if (BE (pstr->mbs_allocated, 0))
824 {
825 if (pstr->icase)
826 build_upper_buffer (pstr);
827 else if (pstr->trans != NULL)
828 re_string_translate_buffer (pstr);
829 }
830 else
831 pstr->valid_len = pstr->len;
832
833 pstr->cur_idx = 0;
834 return REG_NOERROR;
835}
836
837static unsigned char
838internal_function __attribute ((pure))
839re_string_peek_byte_case (const re_string_t *pstr, int idx)
840{
841 int ch, off;
842
843 /* Handle the common (easiest) cases first. */
844 if (BE (!pstr->mbs_allocated, 1))
845 return re_string_peek_byte (pstr, idx);
846
847#ifdef RE_ENABLE_I18N
848 if (pstr->mb_cur_max > 1
849 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
850 return re_string_peek_byte (pstr, idx);
851#endif
852
853 off = pstr->cur_idx + idx;
854#ifdef RE_ENABLE_I18N
855 if (pstr->offsets_needed)
856 off = pstr->offsets[off];
857#endif
858
859 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
860
861#ifdef RE_ENABLE_I18N
862 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
863 this function returns CAPITAL LETTER I instead of first byte of
864 DOTLESS SMALL LETTER I. The latter would confuse the parser,
865 since peek_byte_case doesn't advance cur_idx in any way. */
866 if (pstr->offsets_needed && !isascii (ch))
867 return re_string_peek_byte (pstr, idx);
868#endif
869
870 return ch;
871}
872
873static unsigned char
874internal_function __attribute ((pure))
875re_string_fetch_byte_case (re_string_t *pstr)
876{
877 if (BE (!pstr->mbs_allocated, 1))
878 return re_string_fetch_byte (pstr);
879
880#ifdef RE_ENABLE_I18N
881 if (pstr->offsets_needed)
882 {
883 int off, ch;
884
885 /* For tr_TR.UTF-8 [[:islower:]] there is
886 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
887 in that case the whole multi-byte character and return
888 the original letter. On the other side, with
889 [[: DOTLESS SMALL LETTER I return [[:I, as doing
890 anything else would complicate things too much. */
891
892 if (!re_string_first_byte (pstr, pstr->cur_idx))
893 return re_string_fetch_byte (pstr);
894
895 off = pstr->offsets[pstr->cur_idx];
896 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
897
898 if (! isascii (ch))
899 return re_string_fetch_byte (pstr);
900
901 re_string_skip_bytes (pstr,
902 re_string_char_size_at (pstr, pstr->cur_idx));
903 return ch;
904 }
905#endif
906
907 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
908}
909
910static void
911internal_function
912re_string_destruct (re_string_t *pstr)
913{
914#ifdef RE_ENABLE_I18N
915 re_free (pstr->wcs);
916 re_free (pstr->offsets);
917#endif /* RE_ENABLE_I18N */
918 if (pstr->mbs_allocated)
919 re_free (pstr->mbs);
920}
921
922/* Return the context at IDX in INPUT. */
923
924static unsigned int
925internal_function
926re_string_context_at (const re_string_t *input, int idx, int eflags)
927{
928 int c;
929 if (BE (idx < 0, 0))
930 /* In this case, we use the value stored in input->tip_context,
931 since we can't know the character in input->mbs[-1] here. */
932 return input->tip_context;
933 if (BE (idx == input->len, 0))
934 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
935 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
936#ifdef RE_ENABLE_I18N
937 if (input->mb_cur_max > 1)
938 {
939 wint_t wc;
940 int wc_idx = idx;
941 while(input->wcs[wc_idx] == WEOF)
942 {
943#ifdef DEBUG
944 /* It must not happen. */
945 assert (wc_idx >= 0);
946#endif
947 --wc_idx;
948 if (wc_idx < 0)
949 return input->tip_context;
950 }
951 wc = input->wcs[wc_idx];
952 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
953 return CONTEXT_WORD;
954 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
955 ? CONTEXT_NEWLINE : 0);
956 }
957 else
958#endif
959 {
960 c = re_string_byte_at (input, idx);
961 if (bitset_contain (input->word_char, c))
962 return CONTEXT_WORD;
963 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
964 }
965}
966
967/* Functions for set operation. */
968
969static reg_errcode_t
970internal_function
971re_node_set_alloc (re_node_set *set, int size)
972{
973 /*
974 * ADR: valgrind says size can be 0, which then doesn't
975 * free the block of size 0. Harumph. This seems
976 * to work ok, though.
977 */
978 if (size == 0)
979 {
980 memset(set, 0, sizeof(*set));
981 return REG_NOERROR;
982 }
983 set->alloc = size;
984 set->nelem = 0;
985 set->elems = re_malloc (int, size);
986 if (BE (set->elems == NULL, 0))
987 return REG_ESPACE;
988 return REG_NOERROR;
989}
990
991static reg_errcode_t
992internal_function
993re_node_set_init_1 (re_node_set *set, int elem)
994{
995 set->alloc = 1;
996 set->nelem = 1;
997 set->elems = re_malloc (int, 1);
998 if (BE (set->elems == NULL, 0))
999 {
1000 set->alloc = set->nelem = 0;
1001 return REG_ESPACE;
1002 }
1003 set->elems[0] = elem;
1004 return REG_NOERROR;
1005}
1006
1007static reg_errcode_t
1008internal_function
1009re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
1010{
1011 set->alloc = 2;
1012 set->elems = re_malloc (int, 2);
1013 if (BE (set->elems == NULL, 0))
1014 return REG_ESPACE;
1015 if (elem1 == elem2)
1016 {
1017 set->nelem = 1;
1018 set->elems[0] = elem1;
1019 }
1020 else
1021 {
1022 set->nelem = 2;
1023 if (elem1 < elem2)
1024 {
1025 set->elems[0] = elem1;
1026 set->elems[1] = elem2;
1027 }
1028 else
1029 {
1030 set->elems[0] = elem2;
1031 set->elems[1] = elem1;
1032 }
1033 }
1034 return REG_NOERROR;
1035}
1036
1037static reg_errcode_t
1038internal_function
1039re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1040{
1041 dest->nelem = src->nelem;
1042 if (src->nelem > 0)
1043 {
1044 dest->alloc = dest->nelem;
1045 dest->elems = re_malloc (int, dest->alloc);
1046 if (BE (dest->elems == NULL, 0))
1047 {
1048 dest->alloc = dest->nelem = 0;
1049 return REG_ESPACE;
1050 }
1051 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1052 }
1053 else
1054 re_node_set_init_empty (dest);
1055 return REG_NOERROR;
1056}
1057
1058/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1059 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1060 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1061
1062static reg_errcode_t
1063internal_function
1064re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1065 const re_node_set *src2)
1066{
1067 int i1, i2, is, id, delta, sbase;
1068 if (src1->nelem == 0 || src2->nelem == 0)
1069 return REG_NOERROR;
1070
1071 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1072 conservative estimate. */
1073 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1074 {
1075 int new_alloc = src1->nelem + src2->nelem + dest->alloc;
1076 int *new_elems = re_realloc (dest->elems, int, new_alloc);
1077 if (BE (new_elems == NULL, 0))
1078 return REG_ESPACE;
1079 dest->elems = new_elems;
1080 dest->alloc = new_alloc;
1081 }
1082
1083 /* Find the items in the intersection of SRC1 and SRC2, and copy
1084 into the top of DEST those that are not already in DEST itself. */
1085 sbase = dest->nelem + src1->nelem + src2->nelem;
1086 i1 = src1->nelem - 1;
1087 i2 = src2->nelem - 1;
1088 id = dest->nelem - 1;
1089 for (;;)
1090 {
1091 if (src1->elems[i1] == src2->elems[i2])
1092 {
1093 /* Try to find the item in DEST. Maybe we could binary search? */
1094 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1095 --id;
1096
1097 if (id < 0 || dest->elems[id] != src1->elems[i1])
1098 dest->elems[--sbase] = src1->elems[i1];
1099
1100 if (--i1 < 0 || --i2 < 0)
1101 break;
1102 }
1103
1104 /* Lower the highest of the two items. */
1105 else if (src1->elems[i1] < src2->elems[i2])
1106 {
1107 if (--i2 < 0)
1108 break;
1109 }
1110 else
1111 {
1112 if (--i1 < 0)
1113 break;
1114 }
1115 }
1116
1117 id = dest->nelem - 1;
1118 is = dest->nelem + src1->nelem + src2->nelem - 1;
1119 delta = is - sbase + 1;
1120
1121 /* Now copy. When DELTA becomes zero, the remaining
1122 DEST elements are already in place; this is more or
1123 less the same loop that is in re_node_set_merge. */
1124 dest->nelem += delta;
1125 if (delta > 0 && id >= 0)
1126 for (;;)
1127 {
1128 if (dest->elems[is] > dest->elems[id])
1129 {
1130 /* Copy from the top. */
1131 dest->elems[id + delta--] = dest->elems[is--];
1132 if (delta == 0)
1133 break;
1134 }
1135 else
1136 {
1137 /* Slide from the bottom. */
1138 dest->elems[id + delta] = dest->elems[id];
1139 if (--id < 0)
1140 break;
1141 }
1142 }
1143
1144 /* Copy remaining SRC elements. */
1145 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1146
1147 return REG_NOERROR;
1148}
1149
1150/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1151 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1152
1153static reg_errcode_t
1154internal_function
1155re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1156 const re_node_set *src2)
1157{
1158 int i1, i2, id;
1159 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1160 {
1161 dest->alloc = src1->nelem + src2->nelem;
1162 dest->elems = re_malloc (int, dest->alloc);
1163 if (BE (dest->elems == NULL, 0))
1164 return REG_ESPACE;
1165 }
1166 else
1167 {
1168 if (src1 != NULL && src1->nelem > 0)
1169 return re_node_set_init_copy (dest, src1);
1170 else if (src2 != NULL && src2->nelem > 0)
1171 return re_node_set_init_copy (dest, src2);
1172 else
1173 re_node_set_init_empty (dest);
1174 return REG_NOERROR;
1175 }
1176 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1177 {
1178 if (src1->elems[i1] > src2->elems[i2])
1179 {
1180 dest->elems[id++] = src2->elems[i2++];
1181 continue;
1182 }
1183 if (src1->elems[i1] == src2->elems[i2])
1184 ++i2;
1185 dest->elems[id++] = src1->elems[i1++];
1186 }
1187 if (i1 < src1->nelem)
1188 {
1189 memcpy (dest->elems + id, src1->elems + i1,
1190 (src1->nelem - i1) * sizeof (int));
1191 id += src1->nelem - i1;
1192 }
1193 else if (i2 < src2->nelem)
1194 {
1195 memcpy (dest->elems + id, src2->elems + i2,
1196 (src2->nelem - i2) * sizeof (int));
1197 id += src2->nelem - i2;
1198 }
1199 dest->nelem = id;
1200 return REG_NOERROR;
1201}
1202
1203/* Calculate the union set of the sets DEST and SRC. And store it to
1204 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1205
1206static reg_errcode_t
1207internal_function
1208re_node_set_merge (re_node_set *dest, const re_node_set *src)
1209{
1210 int is, id, sbase, delta;
1211 if (src == NULL || src->nelem == 0)
1212 return REG_NOERROR;
1213 if (dest->alloc < 2 * src->nelem + dest->nelem)
1214 {
1215 int new_alloc = 2 * (src->nelem + dest->alloc);
1216 int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1217 if (BE (new_buffer == NULL, 0))
1218 return REG_ESPACE;
1219 dest->elems = new_buffer;
1220 dest->alloc = new_alloc;
1221 }
1222
1223 if (BE (dest->nelem == 0, 0))
1224 {
1225 dest->nelem = src->nelem;
1226 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1227 return REG_NOERROR;
1228 }
1229
1230 /* Copy into the top of DEST the items of SRC that are not
1231 found in DEST. Maybe we could binary search in DEST? */
1232 for (sbase = dest->nelem + 2 * src->nelem,
1233 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1234 {
1235 if (dest->elems[id] == src->elems[is])
1236 is--, id--;
1237 else if (dest->elems[id] < src->elems[is])
1238 dest->elems[--sbase] = src->elems[is--];
1239 else /* if (dest->elems[id] > src->elems[is]) */
1240 --id;
1241 }
1242
1243 if (is >= 0)
1244 {
1245 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1246 sbase -= is + 1;
1247 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1248 }
1249
1250 id = dest->nelem - 1;
1251 is = dest->nelem + 2 * src->nelem - 1;
1252 delta = is - sbase + 1;
1253 if (delta == 0)
1254 return REG_NOERROR;
1255
1256 /* Now copy. When DELTA becomes zero, the remaining
1257 DEST elements are already in place. */
1258 dest->nelem += delta;
1259 for (;;)
1260 {
1261 if (dest->elems[is] > dest->elems[id])
1262 {
1263 /* Copy from the top. */
1264 dest->elems[id + delta--] = dest->elems[is--];
1265 if (delta == 0)
1266 break;
1267 }
1268 else
1269 {
1270 /* Slide from the bottom. */
1271 dest->elems[id + delta] = dest->elems[id];
1272 if (--id < 0)
1273 {
1274 /* Copy remaining SRC elements. */
1275 memcpy (dest->elems, dest->elems + sbase,
1276 delta * sizeof (int));
1277 break;
1278 }
1279 }
1280 }
1281
1282 return REG_NOERROR;
1283}
1284
1285/* Insert the new element ELEM to the re_node_set* SET.
1286 SET should not already have ELEM.
1287 return -1 if an error has occurred, return 1 otherwise. */
1288
1289static int
1290internal_function
1291re_node_set_insert (re_node_set *set, int elem)
1292{
1293 int idx;
1294 /* In case the set is empty. */
1295 if (set->alloc == 0)
1296 {
1297 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1298 return 1;
1299 else
1300 return -1;
1301 }
1302
1303 if (BE (set->nelem, 0) == 0)
1304 {
1305 /* We already guaranteed above that set->alloc != 0. */
1306 set->elems[0] = elem;
1307 ++set->nelem;
1308 return 1;
1309 }
1310
1311 /* Realloc if we need. */
1312 if (set->alloc == set->nelem)
1313 {
1314 int *new_elems;
1315 set->alloc = set->alloc * 2;
1316 new_elems = re_realloc (set->elems, int, set->alloc);
1317 if (BE (new_elems == NULL, 0))
1318 return -1;
1319 set->elems = new_elems;
1320 }
1321
1322 /* Move the elements which follows the new element. Test the
1323 first element separately to skip a check in the inner loop. */
1324 if (elem < set->elems[0])
1325 {
1326 idx = 0;
1327 for (idx = set->nelem; idx > 0; idx--)
1328 set->elems[idx] = set->elems[idx - 1];
1329 }
1330 else
1331 {
1332 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1333 set->elems[idx] = set->elems[idx - 1];
1334 }
1335
1336 /* Insert the new element. */
1337 set->elems[idx] = elem;
1338 ++set->nelem;
1339 return 1;
1340}
1341
1342/* Insert the new element ELEM to the re_node_set* SET.
1343 SET should not already have any element greater than or equal to ELEM.
1344 Return -1 if an error has occurred, return 1 otherwise. */
1345
1346static int
1347internal_function
1348re_node_set_insert_last (re_node_set *set, int elem)
1349{
1350 /* Realloc if we need. */
1351 if (set->alloc == set->nelem)
1352 {
1353 int *new_elems;
1354 set->alloc = (set->alloc + 1) * 2;
1355 new_elems = re_realloc (set->elems, int, set->alloc);
1356 if (BE (new_elems == NULL, 0))
1357 return -1;
1358 set->elems = new_elems;
1359 }
1360
1361 /* Insert the new element. */
1362 set->elems[set->nelem++] = elem;
1363 return 1;
1364}
1365
1366/* Compare two node sets SET1 and SET2.
1367 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1368
1369static int
1370internal_function __attribute ((pure))
1371re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1372{
1373 int i;
1374 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1375 return 0;
1376 for (i = set1->nelem ; --i >= 0 ; )
1377 if (set1->elems[i] != set2->elems[i])
1378 return 0;
1379 return 1;
1380}
1381
1382/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1383
1384static int
1385internal_function __attribute ((pure))
1386re_node_set_contains (const re_node_set *set, int elem)
1387{
1388 unsigned int idx, right, mid;
1389 if (set->nelem <= 0)
1390 return 0;
1391
1392 /* Binary search the element. */
1393 idx = 0;
1394 right = set->nelem - 1;
1395 while (idx < right)
1396 {
1397 mid = (idx + right) / 2;
1398 if (set->elems[mid] < elem)
1399 idx = mid + 1;
1400 else
1401 right = mid;
1402 }
1403 return set->elems[idx] == elem ? idx + 1 : 0;
1404}
1405
1406static void
1407internal_function
1408re_node_set_remove_at (re_node_set *set, int idx)
1409{
1410 if (idx < 0 || idx >= set->nelem)
1411 return;
1412 --set->nelem;
1413 for (; idx < set->nelem; idx++)
1414 set->elems[idx] = set->elems[idx + 1];
1415}
1416
1417
1418/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1419 Or return -1, if an error has occurred. */
1420
1421static int
1422internal_function
1423re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1424{
1425 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1426 {
1427 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1428 int *new_nexts, *new_indices;
1429 re_node_set *new_edests, *new_eclosures;
1430 re_token_t *new_nodes;
1431
1432 /* Avoid overflows in realloc. */
1433 const size_t max_object_size = MAX (sizeof (re_token_t),
1434 MAX (sizeof (re_node_set),
1435 sizeof (int)));
1436 if (BE (SIZE_MAX / max_object_size < new_nodes_alloc, 0))
1437 return -1;
1438
1439 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1440 if (BE (new_nodes == NULL, 0))
1441 return -1;
1442 dfa->nodes = new_nodes;
1443 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1444 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1445 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1446 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1447 if (BE (new_nexts == NULL || new_indices == NULL
1448 || new_edests == NULL || new_eclosures == NULL, 0))
1449 return -1;
1450 dfa->nexts = new_nexts;
1451 dfa->org_indices = new_indices;
1452 dfa->edests = new_edests;
1453 dfa->eclosures = new_eclosures;
1454 dfa->nodes_alloc = new_nodes_alloc;
1455 }
1456 dfa->nodes[dfa->nodes_len] = token;
1457 dfa->nodes[dfa->nodes_len].constraint = 0;
1458#ifdef RE_ENABLE_I18N
1459 dfa->nodes[dfa->nodes_len].accept_mb =
1460 (token.type == OP_PERIOD && dfa->mb_cur_max > 1) || token.type == COMPLEX_BRACKET;
1461#endif
1462 dfa->nexts[dfa->nodes_len] = -1;
1463 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1464 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1465 return dfa->nodes_len++;
1466}
1467
1468static inline unsigned int
1469internal_function
1470calc_state_hash (const re_node_set *nodes, unsigned int context)
1471{
1472 unsigned int hash = nodes->nelem + context;
1473 int i;
1474 for (i = 0 ; i < nodes->nelem ; i++)
1475 hash += nodes->elems[i];
1476 return hash;
1477}
1478
1479/* Search for the state whose node_set is equivalent to NODES.
1480 Return the pointer to the state, if we found it in the DFA.
1481 Otherwise create the new one and return it. In case of an error
1482 return NULL and set the error code in ERR.
1483 Note: - We assume NULL as the invalid state, then it is possible that
1484 return value is NULL and ERR is REG_NOERROR.
1485 - We never return non-NULL value in case of any errors, it is for
1486 optimization. */
1487
1488static re_dfastate_t *
1489internal_function
1490re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1491 const re_node_set *nodes)
1492{
1493 unsigned int hash;
1494 re_dfastate_t *new_state;
1495 struct re_state_table_entry *spot;
1496 int i;
1497 if (BE (nodes->nelem == 0, 0))
1498 {
1499 *err = REG_NOERROR;
1500 return NULL;
1501 }
1502 hash = calc_state_hash (nodes, 0);
1503 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1504
1505 for (i = 0 ; i < spot->num ; i++)
1506 {
1507 re_dfastate_t *state = spot->array[i];
1508 if (hash != state->hash)
1509 continue;
1510 if (re_node_set_compare (&state->nodes, nodes))
1511 return state;
1512 }
1513
1514 /* There are no appropriate state in the dfa, create the new one. */
1515 new_state = create_ci_newstate (dfa, nodes, hash);
1516 if (BE (new_state == NULL, 0))
1517 *err = REG_ESPACE;
1518
1519 return new_state;
1520}
1521
1522/* Search for the state whose node_set is equivalent to NODES and
1523 whose context is equivalent to CONTEXT.
1524 Return the pointer to the state, if we found it in the DFA.
1525 Otherwise create the new one and return it. In case of an error
1526 return NULL and set the error code in ERR.
1527 Note: - We assume NULL as the invalid state, then it is possible that
1528 return value is NULL and ERR is REG_NOERROR.
1529 - We never return non-NULL value in case of any errors, it is for
1530 optimization. */
1531
1532static re_dfastate_t *
1533internal_function
1534re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1535 const re_node_set *nodes, unsigned int context)
1536{
1537 unsigned int hash;
1538 re_dfastate_t *new_state;
1539 struct re_state_table_entry *spot;
1540 int i;
1541 if (nodes->nelem == 0)
1542 {
1543 *err = REG_NOERROR;
1544 return NULL;
1545 }
1546 hash = calc_state_hash (nodes, context);
1547 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1548
1549 for (i = 0 ; i < spot->num ; i++)
1550 {
1551 re_dfastate_t *state = spot->array[i];
1552 if (state->hash == hash
1553 && state->context == context
1554 && re_node_set_compare (state->entrance_nodes, nodes))
1555 return state;
1556 }
1557 /* There are no appropriate state in `dfa', create the new one. */
1558 new_state = create_cd_newstate (dfa, nodes, context, hash);
1559 if (BE (new_state == NULL, 0))
1560 *err = REG_ESPACE;
1561
1562 return new_state;
1563}
1564
1565/* Finish initialization of the new state NEWSTATE, and using its hash value
1566 HASH put in the appropriate bucket of DFA's state table. Return value
1567 indicates the error code if failed. */
1568
1569static reg_errcode_t
1570register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1571 unsigned int hash)
1572{
1573 struct re_state_table_entry *spot;
1574 reg_errcode_t err;
1575 int i;
1576
1577 newstate->hash = hash;
1578 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1579 if (BE (err != REG_NOERROR, 0))
1580 return REG_ESPACE;
1581 for (i = 0; i < newstate->nodes.nelem; i++)
1582 {
1583 int elem = newstate->nodes.elems[i];
1584 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1585 if (re_node_set_insert_last (&newstate->non_eps_nodes, elem) < 0)
1586 return REG_ESPACE;
1587 }
1588
1589 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1590 if (BE (spot->alloc <= spot->num, 0))
1591 {
1592 int new_alloc = 2 * spot->num + 2;
1593 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1594 new_alloc);
1595 if (BE (new_array == NULL, 0))
1596 return REG_ESPACE;
1597 spot->array = new_array;
1598 spot->alloc = new_alloc;
1599 }
1600 spot->array[spot->num++] = newstate;
1601 return REG_NOERROR;
1602}
1603
1604static void
1605free_state (re_dfastate_t *state)
1606{
1607 re_node_set_free (&state->non_eps_nodes);
1608 re_node_set_free (&state->inveclosure);
1609 if (state->entrance_nodes != &state->nodes)
1610 {
1611 re_node_set_free (state->entrance_nodes);
1612 re_free (state->entrance_nodes);
1613 }
1614 re_node_set_free (&state->nodes);
1615 re_free (state->word_trtable);
1616 re_free (state->trtable);
1617 re_free (state);
1618}
1619
1620/* Create the new state which is independ of contexts.
1621 Return the new state if succeeded, otherwise return NULL. */
1622
1623static re_dfastate_t *
1624internal_function
1625create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1626 unsigned int hash)
1627{
1628 int i;
1629 reg_errcode_t err;
1630 re_dfastate_t *newstate;
1631
1632 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1633 if (BE (newstate == NULL, 0))
1634 return NULL;
1635 err = re_node_set_init_copy (&newstate->nodes, nodes);
1636 if (BE (err != REG_NOERROR, 0))
1637 {
1638 re_free (newstate);
1639 return NULL;
1640 }
1641
1642 newstate->entrance_nodes = &newstate->nodes;
1643 for (i = 0 ; i < nodes->nelem ; i++)
1644 {
1645 re_token_t *node = dfa->nodes + nodes->elems[i];
1646 re_token_type_t type = node->type;
1647 if (type == CHARACTER && !node->constraint)
1648 continue;
1649#ifdef RE_ENABLE_I18N
1650 newstate->accept_mb |= node->accept_mb;
1651#endif /* RE_ENABLE_I18N */
1652
1653 /* If the state has the halt node, the state is a halt state. */
1654 if (type == END_OF_RE)
1655 newstate->halt = 1;
1656 else if (type == OP_BACK_REF)
1657 newstate->has_backref = 1;
1658 else if (type == ANCHOR || node->constraint)
1659 newstate->has_constraint = 1;
1660 }
1661 err = register_state (dfa, newstate, hash);
1662 if (BE (err != REG_NOERROR, 0))
1663 {
1664 free_state (newstate);
1665 newstate = NULL;
1666 }
1667 return newstate;
1668}
1669
1670/* Create the new state which is depend on the context CONTEXT.
1671 Return the new state if succeeded, otherwise return NULL. */
1672
1673static re_dfastate_t *
1674internal_function
1675create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1676 unsigned int context, unsigned int hash)
1677{
1678 int i, nctx_nodes = 0;
1679 reg_errcode_t err;
1680 re_dfastate_t *newstate;
1681
1682 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1683 if (BE (newstate == NULL, 0))
1684 return NULL;
1685 err = re_node_set_init_copy (&newstate->nodes, nodes);
1686 if (BE (err != REG_NOERROR, 0))
1687 {
1688 re_free (newstate);
1689 return NULL;
1690 }
1691
1692 newstate->context = context;
1693 newstate->entrance_nodes = &newstate->nodes;
1694
1695 for (i = 0 ; i < nodes->nelem ; i++)
1696 {
1697 re_token_t *node = dfa->nodes + nodes->elems[i];
1698 re_token_type_t type = node->type;
1699 unsigned int constraint = node->constraint;
1700
1701 if (type == CHARACTER && !constraint)
1702 continue;
1703#ifdef RE_ENABLE_I18N
1704 newstate->accept_mb |= node->accept_mb;
1705#endif /* RE_ENABLE_I18N */
1706
1707 /* If the state has the halt node, the state is a halt state. */
1708 if (type == END_OF_RE)
1709 newstate->halt = 1;
1710 else if (type == OP_BACK_REF)
1711 newstate->has_backref = 1;
1712
1713 if (constraint)
1714 {
1715 if (newstate->entrance_nodes == &newstate->nodes)
1716 {
1717 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1718 if (BE (newstate->entrance_nodes == NULL, 0))
1719 {
1720 free_state (newstate);
1721 return NULL;
1722 }
1723 if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1724 != REG_NOERROR)
1725 return NULL;
1726 nctx_nodes = 0;
1727 newstate->has_constraint = 1;
1728 }
1729
1730 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1731 {
1732 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1733 ++nctx_nodes;
1734 }
1735 }
1736 }
1737 err = register_state (dfa, newstate, hash);
1738 if (BE (err != REG_NOERROR, 0))
1739 {
1740 free_state (newstate);
1741 newstate = NULL;
1742 }
1743 return newstate;
1744}
diff --git a/win32/regex_internal.h b/win32/regex_internal.h
new file mode 100644
index 000000000..4184d7f5a
--- /dev/null
+++ b/win32/regex_internal.h
@@ -0,0 +1,810 @@
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2005, 2007, 2008, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21#ifndef _REGEX_INTERNAL_H
22#define _REGEX_INTERNAL_H 1
23
24#include <assert.h>
25#include <ctype.h>
26#include <stdio.h>
27#include <stdlib.h>
28#include <string.h>
29
30#if defined HAVE_LANGINFO_H || defined HAVE_LANGINFO_CODESET || defined _LIBC
31# include <langinfo.h>
32#endif
33#if defined HAVE_LOCALE_H || defined _LIBC
34# include <locale.h>
35#endif
36#if defined HAVE_WCHAR_H || defined _LIBC
37# include <wchar.h>
38#endif /* HAVE_WCHAR_H || _LIBC */
39#if defined HAVE_WCTYPE_H || defined _LIBC
40# include <wctype.h>
41#endif /* HAVE_WCTYPE_H || _LIBC */
42#if defined HAVE_STDBOOL_H || defined _LIBC
43# include <stdbool.h>
44#endif /* HAVE_STDBOOL_H || _LIBC */
45#if !defined(ZOS_USS)
46#if defined HAVE_STDINT_H || defined _LIBC
47# include <stdint.h>
48#endif /* HAVE_STDINT_H || _LIBC */
49#endif /* !ZOS_USS */
50#if defined _LIBC
51# include <bits/libc-lock.h>
52#else
53# define __libc_lock_define(CLASS,NAME)
54# define __libc_lock_init(NAME) do { } while (0)
55# define __libc_lock_lock(NAME) do { } while (0)
56# define __libc_lock_unlock(NAME) do { } while (0)
57#endif
58
59#ifndef GAWK
60/* In case that the system doesn't have isblank(). */
61#if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
62# define isblank(ch) ((ch) == ' ' || (ch) == '\t')
63#endif
64#else /* GAWK */
65/*
66 * This is a freaking mess. On glibc systems you have to define
67 * a magic constant to get isblank() out of <ctype.h>, since it's
68 * a C99 function. To heck with all that and borrow a page from
69 * dfa.c's book.
70 */
71
/* Return 1 if C is a space or a horizontal tab, 0 otherwise.  */
static int
is_blank (int c)
{
  switch (c)
    {
    case ' ':
    case '\t':
      return 1;
    default:
      return 0;
    }
}
77#endif /* GAWK */
78
79#ifdef _LIBC
80# ifndef _RE_DEFINE_LOCALE_FUNCTIONS
81# define _RE_DEFINE_LOCALE_FUNCTIONS 1
82# include <locale/localeinfo.h>
83# include <locale/elem-hash.h>
84# include <locale/coll-lookup.h>
85# endif
86#endif
87
88/* This is for other GNU distributions with internationalized messages. */
89#if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC
90# include <libintl.h>
91# ifdef _LIBC
92# undef gettext
93# define gettext(msgid) \
94 INTUSE(__dcgettext) (_libc_intl_domainname, msgid, LC_MESSAGES)
95# endif
96#else
97# define gettext(msgid) (msgid)
98#endif
99
100#ifndef gettext_noop
101/* This define is so xgettext can find the internationalizable
102 strings. */
103# define gettext_noop(String) String
104#endif
105
106/* For loser systems without the definition. */
107#ifndef SIZE_MAX
108# define SIZE_MAX ((size_t) -1)
109#endif
110
111#ifndef NO_MBSUPPORT
112#include "mbsupport.h" /* gawk */
113#endif
114#ifndef MB_CUR_MAX
115#define MB_CUR_MAX 1
116#endif
117
118#if (defined MBS_SUPPORT) || _LIBC
119# define RE_ENABLE_I18N
120#endif
121
122#if __GNUC__ >= 3
123# define BE(expr, val) __builtin_expect (expr, val)
124#else
125# define BE(expr, val) (expr)
126# ifdef inline
127# undef inline
128# endif
129# define inline
130#endif
131
132/* Number of single byte character. */
133#define SBC_MAX 256
134
135#define COLL_ELEM_LEN_MAX 8
136
137/* The character which represents newline. */
138#define NEWLINE_CHAR '\n'
139#define WIDE_NEWLINE_CHAR L'\n'
140
141/* Rename to standard API for using out of glibc. */
142#ifndef _LIBC
143# ifdef __wctype
144# undef __wctype
145# endif
146# define __wctype wctype
147# ifdef __iswctype
148# undef __iswctype
149# endif
150# define __iswctype iswctype
151# define __btowc btowc
152# define __mbrtowc mbrtowc
153#undef __mempcpy /* GAWK */
154# define __mempcpy mempcpy
155# define __wcrtomb wcrtomb
156# define __regfree regfree
157# define attribute_hidden
158#endif /* not _LIBC */
159
160#ifdef __GNUC__
161# define __attribute(arg) __attribute__ (arg)
162#else
163# define __attribute(arg)
164#endif
165
166extern const char __re_error_msgid[] attribute_hidden;
167extern const size_t __re_error_msgid_idx[] attribute_hidden;
168
169/* An integer used to represent a set of bits. It must be unsigned,
170 and must be at least as wide as unsigned int. */
171typedef unsigned long int bitset_word_t;
172/* All bits set in a bitset_word_t. */
173#define BITSET_WORD_MAX ULONG_MAX
174/* Number of bits in a bitset_word_t. */
175#define BITSET_WORD_BITS (sizeof (bitset_word_t) * CHAR_BIT)
176/* Number of bitset_word_t in a bit_set. */
177#define BITSET_WORDS (SBC_MAX / BITSET_WORD_BITS)
178typedef bitset_word_t bitset_t[BITSET_WORDS];
179typedef bitset_word_t *re_bitset_ptr_t;
180typedef const bitset_word_t *re_const_bitset_ptr_t;
181
/* Bit-set helpers.  SET is a bitset_t (or a pointer to its words) and
   I is a bit number.  Every macro argument is parenthesized so that
   expressions such as `base + n' can be passed safely.  */
#define bitset_set(set,i) \
  ((set)[(i) / BITSET_WORD_BITS] |= (bitset_word_t) 1 << ((i) % BITSET_WORD_BITS))
#define bitset_clear(set,i) \
  ((set)[(i) / BITSET_WORD_BITS] &= ~((bitset_word_t) 1 << ((i) % BITSET_WORD_BITS)))
#define bitset_contain(set,i) \
  ((set)[(i) / BITSET_WORD_BITS] & ((bitset_word_t) 1 << ((i) % BITSET_WORD_BITS)))
#define bitset_empty(set) memset ((set), '\0', sizeof (bitset_t))
#define bitset_set_all(set) memset ((set), '\xff', sizeof (bitset_t))
#define bitset_copy(dest,src) memcpy ((dest), (src), sizeof (bitset_t))
191
192#define PREV_WORD_CONSTRAINT 0x0001
193#define PREV_NOTWORD_CONSTRAINT 0x0002
194#define NEXT_WORD_CONSTRAINT 0x0004
195#define NEXT_NOTWORD_CONSTRAINT 0x0008
196#define PREV_NEWLINE_CONSTRAINT 0x0010
197#define NEXT_NEWLINE_CONSTRAINT 0x0020
198#define PREV_BEGBUF_CONSTRAINT 0x0040
199#define NEXT_ENDBUF_CONSTRAINT 0x0080
200#define WORD_DELIM_CONSTRAINT 0x0100
201#define NOT_WORD_DELIM_CONSTRAINT 0x0200
202
203typedef enum
204{
205 INSIDE_WORD = PREV_WORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
206 WORD_FIRST = PREV_NOTWORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
207 WORD_LAST = PREV_WORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT,
208 INSIDE_NOTWORD = PREV_NOTWORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT,
209 LINE_FIRST = PREV_NEWLINE_CONSTRAINT,
210 LINE_LAST = NEXT_NEWLINE_CONSTRAINT,
211 BUF_FIRST = PREV_BEGBUF_CONSTRAINT,
212 BUF_LAST = NEXT_ENDBUF_CONSTRAINT,
213 WORD_DELIM = WORD_DELIM_CONSTRAINT,
214 NOT_WORD_DELIM = NOT_WORD_DELIM_CONSTRAINT
215} re_context_type;
216
/* A set of node indices kept in a growable int array.  The lookup and
   insertion routines for this type (re_node_set_contains,
   re_node_set_insert, re_node_set_insert_last) rely on ELEMS being in
   ascending order.  */
217typedef struct
218{
219 int alloc; /* Number of ints the ELEMS buffer has room for. */
220 int nelem; /* Number of entries of ELEMS currently in use. */
221 int *elems; /* The entries themselves. */
222} re_node_set;
223
224typedef enum
225{
226 NON_TYPE = 0,
227
228 /* Node type, These are used by token, node, tree. */
229 CHARACTER = 1,
230 END_OF_RE = 2,
231 SIMPLE_BRACKET = 3,
232 OP_BACK_REF = 4,
233 OP_PERIOD = 5,
234#ifdef RE_ENABLE_I18N
235 COMPLEX_BRACKET = 6,
236 OP_UTF8_PERIOD = 7,
237#endif /* RE_ENABLE_I18N */
238
239 /* We define EPSILON_BIT as a macro so that OP_OPEN_SUBEXP is used
240 when the debugger shows values of this enum type. */
241#define EPSILON_BIT 8
242 OP_OPEN_SUBEXP = EPSILON_BIT | 0,
243 OP_CLOSE_SUBEXP = EPSILON_BIT | 1,
244 OP_ALT = EPSILON_BIT | 2,
245 OP_DUP_ASTERISK = EPSILON_BIT | 3,
246 ANCHOR = EPSILON_BIT | 4,
247
248 /* Tree type, these are used only by tree. */
249 CONCAT = 16,
250 SUBEXP = 17,
251
252 /* Token type, these are used only by token. */
253 OP_DUP_PLUS = 18,
254 OP_DUP_QUESTION,
255 OP_OPEN_BRACKET,
256 OP_CLOSE_BRACKET,
257 OP_CHARSET_RANGE,
258 OP_OPEN_DUP_NUM,
259 OP_CLOSE_DUP_NUM,
260 OP_NON_MATCH_LIST,
261 OP_OPEN_COLL_ELEM,
262 OP_CLOSE_COLL_ELEM,
263 OP_OPEN_EQUIV_CLASS,
264 OP_CLOSE_EQUIV_CLASS,
265 OP_OPEN_CHAR_CLASS,
266 OP_CLOSE_CHAR_CLASS,
267 OP_WORD,
268 OP_NOTWORD,
269 OP_SPACE,
270 OP_NOTSPACE,
271 BACK_SLASH
272
273} re_token_type_t;
274
275#ifdef RE_ENABLE_I18N
276typedef struct
277{
278 /* Multibyte characters. */
279 wchar_t *mbchars;
280
281 /* Collating symbols. */
282# ifdef _LIBC
283 int32_t *coll_syms;
284# endif
285
286 /* Equivalence classes. */
287# ifdef _LIBC
288 int32_t *equiv_classes;
289# endif
290
291 /* Range expressions. */
292# ifdef _LIBC
293 uint32_t *range_starts;
294 uint32_t *range_ends;
295# else /* not _LIBC */
296 wchar_t *range_starts;
297 wchar_t *range_ends;
298# endif /* not _LIBC */
299
300 /* Character classes. */
301 wctype_t *char_classes;
302
303 /* If this character set is the non-matching list. */
304 unsigned int non_match : 1;
305
306 /* # of multibyte characters. */
307 int nmbchars;
308
309 /* # of collating symbols. */
310 int ncoll_syms;
311
312 /* # of equivalence classes. */
313 int nequiv_classes;
314
315 /* # of range expressions. */
316 int nranges;
317
318 /* # of character classes. */
319 int nchar_classes;
320} re_charset_t;
321#endif /* RE_ENABLE_I18N */
322
323typedef struct
324{
325 union
326 {
327 unsigned char c; /* for CHARACTER */
328 re_bitset_ptr_t sbcset; /* for SIMPLE_BRACKET */
329#ifdef RE_ENABLE_I18N
330 re_charset_t *mbcset; /* for COMPLEX_BRACKET */
331#endif /* RE_ENABLE_I18N */
332 int idx; /* for BACK_REF */
333 re_context_type ctx_type; /* for ANCHOR */
334 } opr;
335#if __GNUC__ >= 2
336 re_token_type_t type : 8;
337#else
338 re_token_type_t type;
339#endif
340 unsigned int constraint : 10; /* context constraint */
341 unsigned int duplicated : 1;
342 unsigned int opt_subexp : 1;
343#ifdef RE_ENABLE_I18N
344 unsigned int accept_mb : 1;
345 /* These 2 bits can be moved into the union if needed (e.g. if running out
346 of bits; move opr.c to opr.c.c and move the flags to opr.c.flags). */
347 unsigned int mb_partial : 1;
348#endif
349 unsigned int word_char : 1;
350} re_token_t;
351
352#define IS_EPSILON_NODE(type) ((type) & EPSILON_BIT)
353
354struct re_string_t
355{
356 /* Indicate the raw buffer which is the original string passed as an
357 argument of regexec(), re_search(), etc.. */
358 const unsigned char *raw_mbs;
359 /* Store the multibyte string. In case of "case insensitive mode" like
360 REG_ICASE, upper cases of the string are stored, otherwise MBS points
361 the same address that RAW_MBS points. */
362 unsigned char *mbs;
363#ifdef RE_ENABLE_I18N
364 /* Store the wide character string which is corresponding to MBS. */
365 wint_t *wcs;
366 int *offsets;
367 mbstate_t cur_state;
368#endif
369 /* Index in RAW_MBS. Each character mbs[i] corresponds to
370 raw_mbs[raw_mbs_idx + i]. */
371 int raw_mbs_idx;
372 /* The length of the valid characters in the buffers. */
373 int valid_len;
374 /* The corresponding number of bytes in raw_mbs array. */
375 int valid_raw_len;
376 /* The length of the buffers MBS and WCS. */
377 int bufs_len;
378 /* The index in MBS, which is updated by re_string_fetch_byte. */
379 int cur_idx;
380 /* length of RAW_MBS array. */
381 int raw_len;
382 /* This is RAW_LEN - RAW_MBS_IDX + VALID_LEN - VALID_RAW_LEN. */
383 int len;
384 /* End of the buffer may be shorter than its length in the cases such
385 as re_match_2, re_search_2. Then, we use STOP for end of the buffer
386 instead of LEN. */
387 int raw_stop;
388 /* This is RAW_STOP - RAW_MBS_IDX adjusted through OFFSETS. */
389 int stop;
390
391 /* The context of mbs[0]. We store the context independently, since
392 the context of mbs[0] may be different from raw_mbs[0], which is
393 the beginning of the input string. */
394 unsigned int tip_context;
395 /* The translation passed as a part of an argument of re_compile_pattern. */
396 RE_TRANSLATE_TYPE trans;
397 /* Copy of re_dfa_t's word_char. */
398 re_const_bitset_ptr_t word_char;
399 /* 1 if REG_ICASE. */
400 unsigned char icase;
401 unsigned char is_utf8;
402 unsigned char map_notascii;
403 unsigned char mbs_allocated;
404 unsigned char offsets_needed;
405 unsigned char newline_anchor;
406 unsigned char word_ops_used;
407 int mb_cur_max;
408};
409typedef struct re_string_t re_string_t;
410
411
412struct re_dfa_t;
413typedef struct re_dfa_t re_dfa_t;
414
415#ifndef _LIBC
416# ifdef __i386__
417# define internal_function __attribute ((regparm (3), stdcall))
418# else
419# define internal_function
420# endif
421#endif
422
423#ifndef NOT_IN_libc
424static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
425 int new_buf_len)
426 internal_function;
427# ifdef RE_ENABLE_I18N
428static void build_wcs_buffer (re_string_t *pstr) internal_function;
429static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr)
430 internal_function;
431# endif /* RE_ENABLE_I18N */
432static void build_upper_buffer (re_string_t *pstr) internal_function;
433static void re_string_translate_buffer (re_string_t *pstr) internal_function;
434static unsigned int re_string_context_at (const re_string_t *input, int idx,
435 int eflags)
436 internal_function __attribute ((pure));
437#endif
/* Byte of PSTR's buffer located OFFSET bytes after the current index,
   without advancing.  OFFSET is parenthesized so that any expression
   may be passed.  */
#define re_string_peek_byte(pstr, offset) \
  ((pstr)->mbs[(pstr)->cur_idx + (offset)])
440#define re_string_fetch_byte(pstr) \
441 ((pstr)->mbs[(pstr)->cur_idx++])
442#define re_string_first_byte(pstr, idx) \
443 ((idx) == (pstr)->valid_len || (pstr)->wcs[idx] != WEOF)
444#define re_string_is_single_byte_char(pstr, idx) \
445 ((pstr)->wcs[idx] != WEOF && ((pstr)->valid_len == (idx) + 1 \
446 || (pstr)->wcs[(idx) + 1] != WEOF))
447#define re_string_eoi(pstr) ((pstr)->stop <= (pstr)->cur_idx)
448#define re_string_cur_idx(pstr) ((pstr)->cur_idx)
449#define re_string_get_buffer(pstr) ((pstr)->mbs)
450#define re_string_length(pstr) ((pstr)->len)
451#define re_string_byte_at(pstr,idx) ((pstr)->mbs[idx])
452#define re_string_skip_bytes(pstr,idx) ((pstr)->cur_idx += (idx))
453#define re_string_set_index(pstr,idx) ((pstr)->cur_idx = (idx))
454
#ifndef _LIBC
/* Outside libc, decide here whether an allocation of N bytes may be
   served by alloca.  */
# if !HAVE_ALLOCA
/* alloca is implemented with malloc, so just use malloc.  */
#  define __libc_use_alloca(n) 0
# elif _MSC_VER
#  include <malloc.h>
#  define __libc_use_alloca(n) 0
# else
#  include <alloca.h>
/* The OS usually guarantees only one guard page at the bottom of the
   stack, and a page size can be as small as 4096 bytes.  So we cannot
   safely allocate anything larger than 4096 bytes.  Also allow for a few
   compiler-allocated temporary stack slots.  */
#  define __libc_use_alloca(n) ((n) < 4032)
# endif
#endif
473
/* Typed allocation wrappers: T is the element type, N the element count.
   NOTE: (n) * sizeof (t) is not checked for overflow here; callers have
   to bound N themselves.  */
#define re_malloc(t,n) ((t *) malloc ((n) * sizeof (t)))
/* SunOS 4.1.x realloc doesn't accept null pointers: pre-Standard C.  Sigh.
   A null P is therefore routed to calloc, which also means the fresh
   block is zero-filled in that case.  P is evaluated twice, so it must
   not have side effects.  */
#define re_realloc(p,t,n) \
  (((p) != NULL) ? (t *) realloc ((p), (n) * sizeof (t)) \
		 : (t *) calloc ((n), sizeof (t)))
#define re_free(p) free (p)
478
479struct bin_tree_t
480{
481 struct bin_tree_t *parent;
482 struct bin_tree_t *left;
483 struct bin_tree_t *right;
484 struct bin_tree_t *first;
485 struct bin_tree_t *next;
486
487 re_token_t token;
488
489 /* `node_idx' is the index in dfa->nodes, if `type' == 0.
490 Otherwise `type' indicate the type of this node. */
491 int node_idx;
492};
493typedef struct bin_tree_t bin_tree_t;
494
495#define BIN_TREE_STORAGE_SIZE \
496 ((1024 - sizeof (void *)) / sizeof (bin_tree_t))
497
498struct bin_tree_storage_t
499{
500 struct bin_tree_storage_t *next;
501 bin_tree_t data[BIN_TREE_STORAGE_SIZE];
502};
503typedef struct bin_tree_storage_t bin_tree_storage_t;
504
/* Context flags; each one occupies its own bit.  */
#define CONTEXT_WORD 1
#define CONTEXT_NEWLINE (1 << 1)
#define CONTEXT_BEGBUF (1 << 2)
#define CONTEXT_ENDBUF (1 << 3)

/* Tests on a context value C.  The first four yield the masked bit
   (nonzero when set), not a normalized 0/1.  */
#define IS_WORD_CONTEXT(c) ((c) & CONTEXT_WORD)
#define IS_NEWLINE_CONTEXT(c) ((c) & CONTEXT_NEWLINE)
#define IS_BEGBUF_CONTEXT(c) ((c) & CONTEXT_BEGBUF)
#define IS_ENDBUF_CONTEXT(c) ((c) & CONTEXT_ENDBUF)
/* True when no context bit at all is set.  */
#define IS_ORDINARY_CONTEXT(c) (!(c))

/* Character classification.  A word character is an alphanumeric or
   an underscore.  CH is evaluated more than once.  */
#define IS_WORD_CHAR(ch) (isalnum (ch) || (ch) == '_')
#define IS_NEWLINE(ch) ((ch) == NEWLINE_CHAR)
#define IS_WIDE_WORD_CHAR(ch) (iswalnum (ch) || (ch) == L'_')
#define IS_WIDE_NEWLINE(ch) ((ch) == WIDE_NEWLINE_CHAR)
520
/* Nonzero if CONTEXT (the context before the current position) violates
   one of the PREV_* requirements present in CONSTRAINT.  CONSTRAINT is
   parenthesized in every term so that any expression can be passed; both
   arguments are evaluated more than once.  */
#define NOT_SATISFY_PREV_CONSTRAINT(constraint,context) \
 ((((constraint) & PREV_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \
  || (((constraint) & PREV_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \
  || (((constraint) & PREV_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context)) \
  || (((constraint) & PREV_BEGBUF_CONSTRAINT) && !IS_BEGBUF_CONTEXT (context)))

/* Same test for the NEXT_* requirements, against the context after the
   current position.  */
#define NOT_SATISFY_NEXT_CONSTRAINT(constraint,context) \
 ((((constraint) & NEXT_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \
  || (((constraint) & NEXT_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \
  || (((constraint) & NEXT_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context)) \
  || (((constraint) & NEXT_ENDBUF_CONSTRAINT) && !IS_ENDBUF_CONTEXT (context)))
532
533struct re_dfastate_t
534{
535 unsigned int hash;
536 re_node_set nodes;
537 re_node_set non_eps_nodes;
538 re_node_set inveclosure;
539 re_node_set *entrance_nodes;
540 struct re_dfastate_t **trtable, **word_trtable;
541 unsigned int context : 4;
542 unsigned int halt : 1;
543 /* If this state can accept `multi byte'.
544 Note that we refer to multibyte characters, and multi character
545 collating elements as `multi byte'. */
546 unsigned int accept_mb : 1;
547 /* If this state has backreference node(s). */
548 unsigned int has_backref : 1;
549 unsigned int has_constraint : 1;
550};
551typedef struct re_dfastate_t re_dfastate_t;
552
553struct re_state_table_entry
554{
555 int num;
556 int alloc;
557 re_dfastate_t **array;
558};
559
560/* Array type used in re_sub_match_last_t and re_sub_match_top_t. */
561
562typedef struct
563{
564 int next_idx;
565 int alloc;
566 re_dfastate_t **array;
567} state_array_t;
568
569/* Store information about the node NODE whose type is OP_CLOSE_SUBEXP. */
570
571typedef struct
572{
573 int node;
574 int str_idx; /* The position NODE match at. */
575 state_array_t path;
576} re_sub_match_last_t;
577
578/* Store information about the node NODE whose type is OP_OPEN_SUBEXP.
579 And information about the node, whose type is OP_CLOSE_SUBEXP,
580 corresponding to NODE is stored in LASTS. */
581
582typedef struct
583{
584 int str_idx;
585 int node;
586 state_array_t *path;
587 int alasts; /* Allocation size of LASTS. */
588 int nlasts; /* The number of LASTS. */
589 re_sub_match_last_t **lasts;
590} re_sub_match_top_t;
591
/* One entry of the back-reference cache kept in re_match_context_t.  */
struct re_backref_cache_entry
{
  int node;
  int str_idx;
  /* Bounds of the referenced subexpression match.  */
  int subexp_from;
  int subexp_to;
  char more;
  char unused;
  unsigned short eps_reachable_subexps_map;
};
602
603typedef struct
604{
605 /* The string object corresponding to the input string. */
606 re_string_t input;
607#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
608 const re_dfa_t *const dfa;
609#else
610 const re_dfa_t *dfa;
611#endif
612 /* EFLAGS of the argument of regexec. */
613 int eflags;
614 /* Where the matching ends. */
615 int match_last;
616 int last_node;
617 /* The state log used by the matcher. */
618 re_dfastate_t **state_log;
619 int state_log_top;
620 /* Back reference cache. */
621 int nbkref_ents;
622 int abkref_ents;
623 struct re_backref_cache_entry *bkref_ents;
624 int max_mb_elem_len;
625 int nsub_tops;
626 int asub_tops;
627 re_sub_match_top_t **sub_tops;
628} re_match_context_t;
629
630typedef struct
631{
632 re_dfastate_t **sifted_states;
633 re_dfastate_t **limited_states;
634 int last_node;
635 int last_str_idx;
636 re_node_set limits;
637} re_sift_context_t;
638
639struct re_fail_stack_ent_t
640{
641 int idx;
642 int node;
643 regmatch_t *regs;
644 re_node_set eps_via_nodes;
645};
646
/* The fail stack: an array of re_fail_stack_ent_t entries together
   with two counters, `num' and `alloc'.  */
struct re_fail_stack_t
{
  int num;
  int alloc;
  struct re_fail_stack_ent_t *stack;
};
653
654struct re_dfa_t
655{
656 re_token_t *nodes;
657 size_t nodes_alloc;
658 size_t nodes_len;
659 int *nexts;
660 int *org_indices;
661 re_node_set *edests;
662 re_node_set *eclosures;
663 re_node_set *inveclosures;
664 struct re_state_table_entry *state_table;
665 re_dfastate_t *init_state;
666 re_dfastate_t *init_state_word;
667 re_dfastate_t *init_state_nl;
668 re_dfastate_t *init_state_begbuf;
669 bin_tree_t *str_tree;
670 bin_tree_storage_t *str_tree_storage;
671 re_bitset_ptr_t sb_char;
672 int str_tree_storage_idx;
673
674 /* number of subexpressions `re_nsub' is in regex_t. */
675 unsigned int state_hash_mask;
676 int init_node;
677 int nbackref; /* The number of backreference in this dfa. */
678
679 /* Bitmap expressing which backreference is used. */
680 bitset_word_t used_bkref_map;
681 bitset_word_t completed_bkref_map;
682
683 unsigned int has_plural_match : 1;
684 /* If this dfa has "multibyte node", which is a backreference or
685 a node which can accept multibyte character or multi character
686 collating element. */
687 unsigned int has_mb_node : 1;
688 unsigned int is_utf8 : 1;
689 unsigned int map_notascii : 1;
690 unsigned int word_ops_used : 1;
691 int mb_cur_max;
692 bitset_t word_char;
693 reg_syntax_t syntax;
694 int *subexp_map;
695#ifdef DEBUG
696 char* re_str;
697#endif
698#if defined _LIBC
699 __libc_lock_define (, lock)
700#endif
701};
702
/* Small re_node_set helpers; SET / P is a pointer to a re_node_set.  */

/* Zero-fill the whole set object.  */
#define re_node_set_init_empty(set) memset ((set), 0, sizeof (re_node_set))
/* Remove ID by handing the result of re_node_set_contains, minus one,
   to re_node_set_remove_at.  */
#define re_node_set_remove(set,id) \
  (re_node_set_remove_at (set, re_node_set_contains (set, id) - 1))
/* Reset the element count; the element array is left as it is.  */
#define re_node_set_empty(p) ((p)->nelem = 0)
/* Release the element array.  */
#define re_node_set_free(set) re_free ((set)->elems)
708
709
/* Kind of element held in a bracket_elem_t.  */
typedef enum
{
  SB_CHAR = 0,
  MB_CHAR = 1,
  EQUIV_CLASS = 2,
  COLL_SYM = 3,
  CHAR_CLASS = 4
} bracket_elem_type;
718
719typedef struct
720{
721 bracket_elem_type type;
722 union
723 {
724 unsigned char ch;
725 unsigned char *name;
726 wchar_t wch;
727 } opr;
728} bracket_elem_t;
729
730
731/* Inline functions for bitset operation. */
732static inline void
733bitset_not (bitset_t set)
734{
735 int bitset_i;
736 for (bitset_i = 0; bitset_i < BITSET_WORDS; ++bitset_i)
737 set[bitset_i] = ~set[bitset_i];
738}
739
740static inline void
741bitset_merge (bitset_t dest, const bitset_t src)
742{
743 int bitset_i;
744 for (bitset_i = 0; bitset_i < BITSET_WORDS; ++bitset_i)
745 dest[bitset_i] |= src[bitset_i];
746}
747
748static inline void
749bitset_mask (bitset_t dest, const bitset_t src)
750{
751 int bitset_i;
752 for (bitset_i = 0; bitset_i < BITSET_WORDS; ++bitset_i)
753 dest[bitset_i] &= src[bitset_i];
754}
755
756#ifdef RE_ENABLE_I18N
757/* Inline functions for re_string. */
758static inline int
759internal_function __attribute ((pure))
760re_string_char_size_at (const re_string_t *pstr, int idx)
761{
762 int byte_idx;
763 if (pstr->mb_cur_max == 1)
764 return 1;
765 for (byte_idx = 1; idx + byte_idx < pstr->valid_len; ++byte_idx)
766 if (pstr->wcs[idx + byte_idx] != WEOF)
767 break;
768 return byte_idx;
769}
770
771static inline wint_t
772internal_function __attribute ((pure))
773re_string_wchar_at (const re_string_t *pstr, int idx)
774{
775 if (pstr->mb_cur_max == 1)
776 return (wint_t) pstr->mbs[idx];
777 return (wint_t) pstr->wcs[idx];
778}
779
780# ifndef NOT_IN_libc
781static int
782internal_function __attribute ((pure))
783re_string_elem_size_at (const re_string_t *pstr, int idx)
784{
785# ifdef _LIBC
786 const unsigned char *p, *extra;
787 const int32_t *table, *indirect;
788 int32_t tmp;
789# include <locale/weight.h>
790 uint_fast32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
791
792 if (nrules != 0)
793 {
794 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
795 extra = (const unsigned char *)
796 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
797 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
798 _NL_COLLATE_INDIRECTMB);
799 p = pstr->mbs + idx;
800 tmp = findidx (&p);
801 return p - pstr->mbs - idx;
802 }
803 else
804# endif /* _LIBC */
805 return 1;
806}
807# endif
808#endif /* RE_ENABLE_I18N */
809
810#endif /* _REGEX_INTERNAL_H */
diff --git a/win32/regexec.c b/win32/regexec.c
new file mode 100644
index 000000000..eb5e1d443
--- /dev/null
+++ b/win32/regexec.c
@@ -0,0 +1,4369 @@
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2005, 2007, 2009, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA. */
20
21static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22 int n) internal_function;
23static void match_ctx_clean (re_match_context_t *mctx) internal_function;
24static void match_ctx_free (re_match_context_t *cache) internal_function;
25static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
26 int str_idx, int from, int to)
27 internal_function;
28static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
29 internal_function;
30static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
31 int str_idx) internal_function;
32static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
33 int node, int str_idx)
34 internal_function;
35static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
36 re_dfastate_t **limited_sts, int last_node,
37 int last_str_idx)
38 internal_function;
39static reg_errcode_t re_search_internal (const regex_t *preg,
40 const char *string, int length,
41 int start, int range, int stop,
42 size_t nmatch, regmatch_t pmatch[],
43 int eflags);
44static int re_search_2_stub (struct re_pattern_buffer *bufp,
45 const char *string1, int length1,
46 const char *string2, int length2,
47 int start, int range, struct re_registers *regs,
48 int stop, int ret_len);
49static int re_search_stub (struct re_pattern_buffer *bufp,
50 const char *string, int length, int start,
51 int range, int stop, struct re_registers *regs,
52 int ret_len);
53static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
54 int nregs, int regs_allocated);
55static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
56static int check_matching (re_match_context_t *mctx, int fl_longest_match,
57 int *p_match_first) internal_function;
58static int check_halt_state_context (const re_match_context_t *mctx,
59 const re_dfastate_t *state, int idx)
60 internal_function;
61static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
62 regmatch_t *prev_idx_match, int cur_node,
63 int cur_idx, int nmatch) internal_function;
64static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
65 int str_idx, int dest_node, int nregs,
66 regmatch_t *regs,
67 re_node_set *eps_via_nodes)
68 internal_function;
69static reg_errcode_t set_regs (const regex_t *preg,
70 const re_match_context_t *mctx,
71 size_t nmatch, regmatch_t *pmatch,
72 int fl_backtrack) internal_function;
73static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
74 internal_function;
75
76#ifdef RE_ENABLE_I18N
77static int sift_states_iter_mb (const re_match_context_t *mctx,
78 re_sift_context_t *sctx,
79 int node_idx, int str_idx, int max_str_idx)
80 internal_function;
81#endif /* RE_ENABLE_I18N */
82static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
83 re_sift_context_t *sctx)
84 internal_function;
85static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
86 re_sift_context_t *sctx, int str_idx,
87 re_node_set *cur_dest)
88 internal_function;
89static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
90 re_sift_context_t *sctx,
91 int str_idx,
92 re_node_set *dest_nodes)
93 internal_function;
94static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
95 re_node_set *dest_nodes,
96 const re_node_set *candidates)
97 internal_function;
98static int check_dst_limits (const re_match_context_t *mctx,
99 re_node_set *limits,
100 int dst_node, int dst_idx, int src_node,
101 int src_idx) internal_function;
102static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
103 int boundaries, int subexp_idx,
104 int from_node, int bkref_idx)
105 internal_function;
106static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
107 int limit, int subexp_idx,
108 int node, int str_idx,
109 int bkref_idx) internal_function;
110static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
111 re_node_set *dest_nodes,
112 const re_node_set *candidates,
113 re_node_set *limits,
114 struct re_backref_cache_entry *bkref_ents,
115 int str_idx) internal_function;
116static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
117 re_sift_context_t *sctx,
118 int str_idx, const re_node_set *candidates)
119 internal_function;
120static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
121 re_dfastate_t **dst,
122 re_dfastate_t **src, int num)
123 internal_function;
124static re_dfastate_t *find_recover_state (reg_errcode_t *err,
125 re_match_context_t *mctx) internal_function;
126static re_dfastate_t *transit_state (reg_errcode_t *err,
127 re_match_context_t *mctx,
128 re_dfastate_t *state) internal_function;
129static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
130 re_match_context_t *mctx,
131 re_dfastate_t *next_state)
132 internal_function;
133static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
134 re_node_set *cur_nodes,
135 int str_idx) internal_function;
136#if 0
137static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
138 re_match_context_t *mctx,
139 re_dfastate_t *pstate)
140 internal_function;
141#endif
142#ifdef RE_ENABLE_I18N
143static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
144 re_dfastate_t *pstate)
145 internal_function;
146#endif /* RE_ENABLE_I18N */
147static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
148 const re_node_set *nodes)
149 internal_function;
150static reg_errcode_t get_subexp (re_match_context_t *mctx,
151 int bkref_node, int bkref_str_idx)
152 internal_function;
153static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
154 const re_sub_match_top_t *sub_top,
155 re_sub_match_last_t *sub_last,
156 int bkref_node, int bkref_str)
157 internal_function;
158static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
159 int subexp_idx, int type) internal_function;
160static reg_errcode_t check_arrival (re_match_context_t *mctx,
161 state_array_t *path, int top_node,
162 int top_str, int last_node, int last_str,
163 int type) internal_function;
164static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
165 int str_idx,
166 re_node_set *cur_nodes,
167 re_node_set *next_nodes)
168 internal_function;
169static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
170 re_node_set *cur_nodes,
171 int ex_subexp, int type)
172 internal_function;
173static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
174 re_node_set *dst_nodes,
175 int target, int ex_subexp,
176 int type) internal_function;
177static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
178 re_node_set *cur_nodes, int cur_str,
179 int subexp_num, int type)
180 internal_function;
181static int build_trtable (const re_dfa_t *dfa,
182 re_dfastate_t *state) internal_function;
183#ifdef RE_ENABLE_I18N
184static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
185 const re_string_t *input, int idx)
186 internal_function;
187# ifdef _LIBC
188static unsigned int find_collation_sequence_value (const unsigned char *mbs,
189 size_t name_len)
190 internal_function;
191# endif /* _LIBC */
192#endif /* RE_ENABLE_I18N */
193static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
194 const re_dfastate_t *state,
195 re_node_set *states_node,
196 bitset_t *states_ch) internal_function;
197static int check_node_accept (const re_match_context_t *mctx,
198 const re_token_t *node, int idx)
199 internal_function;
200static reg_errcode_t extend_buffers (re_match_context_t *mctx)
201 internal_function;
202
203/* Entry point for POSIX code. */
204
205/* regexec searches for a given pattern, specified by PREG, in the
206 string STRING.
207
208 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
209 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
210 least NMATCH elements, and we set them to the offsets of the
211 corresponding matched substrings.
212
213 EFLAGS specifies `execution flags' which affect matching: if
214 REG_NOTBOL is set, then ^ does not match at the beginning of the
215 string; if REG_NOTEOL is set, then $ does not match at the end.
216
217 We return 0 if we find a match and REG_NOMATCH if not. */
218
219int
220regexec (
221 const regex_t *__restrict preg,
222 const char *__restrict string,
223 size_t nmatch,
224 regmatch_t pmatch[],
225 int eflags)
226{
227 reg_errcode_t err;
228 int start, length;
229
230 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
231 return REG_BADPAT;
232
233 if (eflags & REG_STARTEND)
234 {
235 start = pmatch[0].rm_so;
236 length = pmatch[0].rm_eo;
237 }
238 else
239 {
240 start = 0;
241 length = strlen (string);
242 }
243
244 __libc_lock_lock (dfa->lock);
245 if (preg->no_sub)
246 err = re_search_internal (preg, string, length, start, length - start,
247 length, 0, NULL, eflags);
248 else
249 err = re_search_internal (preg, string, length, start, length - start,
250 length, nmatch, pmatch, eflags);
251 __libc_lock_unlock (dfa->lock);
252 return err != REG_NOERROR;
253}
254
#ifdef _LIBC
# include <shlib-compat.h>
versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);

# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
__typeof__ (__regexec) __compat_regexec;

/* Compatibility entry point bound to the GLIBC_2_0 symbol version:
   it forwards to regexec after dropping every execution flag except
   REG_NOTBOL and REG_NOTEOL.  */
int
attribute_compat_text_section
__compat_regexec (const regex_t *__restrict preg,
		  const char *__restrict string, size_t nmatch,
		  regmatch_t pmatch[], int eflags)
{
  const int legacy_flags = eflags & (REG_NOTBOL | REG_NOTEOL);

  return regexec (preg, string, nmatch, pmatch, legacy_flags);
}
compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
# endif
#endif
274
275/* Entry points for GNU code. */
276
/* re_match, re_search, re_match_2, re_search_2

   The first two functions operate on STRING with length LENGTH,
   while the last two operate on the concatenation of STRING1 and STRING2
   with lengths LENGTH1 and LENGTH2, respectively.

   re_match() matches the compiled pattern in BUFP against the string,
   starting at index START.

   re_search() first tries matching at index START, then it tries to match
   starting from index START + 1, and so on.  The last start position tried
   is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
   way as re_match().)

   The parameter STOP of re_{match,search}_2 specifies that no match
   extending beyond the first STOP characters of the concatenation of the
   strings is considered.

   If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
   and of all groups are stored in REGS.  (For the "_2" variants, the
   offsets are computed relative to the concatenation, not relative to the
   individual strings.)

   On success, re_match* functions return the length of the match, re_search*
   return the position of the start of the match.  Return value -1 means no
   match was found and -2 indicates an internal error.  */

int
re_match (struct re_pattern_buffer *bufp,
	  const char *string,
	  int length,
	  int start,
	  struct re_registers *regs)
{
  /* A range of 0 tries START only; STOP is the end of the string; a
     nonzero RET_LEN asks for the match length.  */
  const int range = 0;
  const int want_length = 1;

  return re_search_stub (bufp, string, length, start, range, length, regs,
			 want_length);
}
#ifdef _LIBC
weak_alias (__re_match, re_match)
#endif
316
/* Search STRING (LENGTH bytes) for the pattern in BUFP, trying the
   start positions START .. START + RANGE; the result of re_search_stub,
   called with RET_LEN 0 (report the match position), is passed on.  */
int
re_search (struct re_pattern_buffer *bufp,
	   const char *string,
	   int length, int start, int range,
	   struct re_registers *regs)
{
  const int want_length = 0;

  return re_search_stub (bufp, string, length, start, range, length, regs,
			 want_length);
}
#ifdef _LIBC
weak_alias (__re_search, re_search)
#endif
328
/* Like re_match, but on the concatenation of STRING1 and STRING2 and
   with an explicit STOP; forwards to re_search_2_stub with RANGE 0 and
   RET_LEN 1 (report the match length).  */
int
re_match_2 (struct re_pattern_buffer *bufp,
	    const char *string1, int length1,
	    const char *string2, int length2, int start,
	    struct re_registers *regs, int stop)
{
  const int range = 0;
  const int want_length = 1;

  return re_search_2_stub (bufp, string1, length1, string2, length2,
			   start, range, regs, stop, want_length);
}
#ifdef _LIBC
weak_alias (__re_match_2, re_match_2)
#endif
341
/* Like re_search, but on the concatenation of STRING1 and STRING2 and
   with an explicit STOP; forwards to re_search_2_stub with RET_LEN 0
   (report the match position).  */
int
re_search_2 (struct re_pattern_buffer *bufp,
	     const char *string1, int length1,
	     const char *string2, int length2, int start,
	     int range, struct re_registers *regs, int stop)
{
  const int want_length = 0;

  return re_search_2_stub (bufp, string1, length1, string2, length2,
			   start, range, regs, stop, want_length);
}
#ifdef _LIBC
weak_alias (__re_search_2, re_search_2)
#endif
354
355static int
356re_search_2_stub (struct re_pattern_buffer *bufp,
357 const char *string1, int length1,
358 const char *string2, int length2, int start,
359 int range, struct re_registers *regs,
360 int stop, int ret_len)
361{
362 const char *str;
363 int rval;
364 int len = length1 + length2;
365 int free_str = 0;
366
367 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
368 return -2;
369
370 /* Concatenate the strings. */
371 if (length2 > 0)
372 if (length1 > 0)
373 {
374 char *s = re_malloc (char, len);
375
376 if (BE (s == NULL, 0))
377 return -2;
378 memcpy (s, string1, length1);
379 memcpy (s + length1, string2, length2);
380 str = s;
381 free_str = 1;
382 }
383 else
384 str = string2;
385 else
386 str = string1;
387
388 rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
389 if (free_str)
390 re_free ((char *) str);
391 return rval;
392}
393
394/* The parameters have the same meaning as those of re_search.
395 Additional parameters:
396 If RET_LEN is nonzero the length of the match is returned (re_match style);
397 otherwise the position of the match is returned. */
398
399static int
400re_search_stub (struct re_pattern_buffer *bufp,
401 const char *string, int length, int start,
402 int range, int stop,
403 struct re_registers *regs, int ret_len)
404{
405 reg_errcode_t result;
406 regmatch_t *pmatch;
407 int nregs, rval;
408 int eflags = 0;
409
410 /* Check for out-of-range. */
411 if (BE (start < 0 || start > length, 0))
412 return -1;
413 if (BE (start + range > length, 0))
414 range = length - start;
415 else if (BE (start + range < 0, 0))
416 range = -start;
417
418 __libc_lock_lock (dfa->lock);
419
420 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
421 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
422
423 /* Compile fastmap if we haven't yet. */
424 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
425 re_compile_fastmap (bufp);
426
427 if (BE (bufp->no_sub, 0))
428 regs = NULL;
429
430 /* We need at least 1 register. */
431 if (regs == NULL)
432 nregs = 1;
433 else if (BE (bufp->regs_allocated == REGS_FIXED &&
434 regs->num_regs < bufp->re_nsub + 1, 0))
435 {
436 nregs = regs->num_regs;
437 if (BE (nregs < 1, 0))
438 {
439 /* Nothing can be copied to regs. */
440 regs = NULL;
441 nregs = 1;
442 }
443 }
444 else
445 nregs = bufp->re_nsub + 1;
446 pmatch = re_malloc (regmatch_t, nregs);
447 if (BE (pmatch == NULL, 0))
448 {
449 rval = -2;
450 goto out;
451 }
452
453 result = re_search_internal (bufp, string, length, start, range, stop,
454 nregs, pmatch, eflags);
455
456 rval = 0;
457
458 /* I hope we needn't fill their regs with -1's when no match was found. */
459 if (result != REG_NOERROR)
460 rval = -1;
461 else if (regs != NULL)
462 {
463 /* If caller wants register contents data back, copy them. */
464 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
465 bufp->regs_allocated);
466 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
467 rval = -2;
468 }
469
470 if (BE (rval == 0, 1))
471 {
472 if (ret_len)
473 {
474 assert (pmatch[0].rm_so == start);
475 rval = pmatch[0].rm_eo - start;
476 }
477 else
478 rval = pmatch[0].rm_so;
479 }
480 re_free (pmatch);
481 out:
482 __libc_lock_unlock (dfa->lock);
483 return rval;
484}
485
486static unsigned
487re_copy_regs (struct re_registers *regs,
488 regmatch_t *pmatch,
489 int nregs, int regs_allocated)
490{
491 int rval = REGS_REALLOCATE;
492 int i;
493 int need_regs = nregs + 1;
494 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
495 uses. */
496
497 /* Have the register data arrays been allocated? */
498 if (regs_allocated == REGS_UNALLOCATED)
499 { /* No. So allocate them with malloc. */
500 regs->start = re_malloc (regoff_t, need_regs);
501 if (BE (regs->start == NULL, 0))
502 return REGS_UNALLOCATED;
503 regs->end = re_malloc (regoff_t, need_regs);
504 if (BE (regs->end == NULL, 0))
505 {
506 re_free (regs->start);
507 return REGS_UNALLOCATED;
508 }
509 regs->num_regs = need_regs;
510 }
511 else if (regs_allocated == REGS_REALLOCATE)
512 { /* Yes. If we need more elements than were already
513 allocated, reallocate them. If we need fewer, just
514 leave it alone. */
515 if (BE (need_regs > regs->num_regs, 0))
516 {
517 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
518 regoff_t *new_end;
519 if (BE (new_start == NULL, 0))
520 return REGS_UNALLOCATED;
521 new_end = re_realloc (regs->end, regoff_t, need_regs);
522 if (BE (new_end == NULL, 0))
523 {
524 re_free (new_start);
525 return REGS_UNALLOCATED;
526 }
527 regs->start = new_start;
528 regs->end = new_end;
529 regs->num_regs = need_regs;
530 }
531 }
532 else
533 {
534 assert (regs_allocated == REGS_FIXED);
535 /* This function may not be called with REGS_FIXED and nregs too big. */
536 assert (regs->num_regs >= nregs);
537 rval = REGS_FIXED;
538 }
539
540 /* Copy the regs. */
541 for (i = 0; i < nregs; ++i)
542 {
543 regs->start[i] = pmatch[i].rm_so;
544 regs->end[i] = pmatch[i].rm_eo;
545 }
546 for ( ; i < regs->num_regs; ++i)
547 regs->start[i] = regs->end[i] = -1;
548
549 return rval;
550}
551
552/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
553 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
554 this memory for recording register information. STARTS and ENDS
555 must be allocated using the malloc library routine, and must each
556 be at least NUM_REGS * sizeof (regoff_t) bytes long.
557
558 If NUM_REGS == 0, then subsequent matches should allocate their own
559 register data.
560
561 Unless this function is called, the first search or match using
562 PATTERN_BUFFER will allocate its own register data, without
563 freeing the old data. */
564
565void
566re_set_registers (struct re_pattern_buffer *bufp,
567 struct re_registers *regs,
568 unsigned num_regs,
569 regoff_t *starts,
570 regoff_t *ends)
571{
572 if (num_regs)
573 {
574 bufp->regs_allocated = REGS_REALLOCATE;
575 regs->num_regs = num_regs;
576 regs->start = starts;
577 regs->end = ends;
578 }
579 else
580 {
581 bufp->regs_allocated = REGS_UNALLOCATED;
582 regs->num_regs = 0;
583 regs->start = regs->end = (regoff_t *) 0;
584 }
585}
586#ifdef _LIBC
587weak_alias (__re_set_registers, re_set_registers)
588#endif
589
/* Entry points compatible with 4.2 BSD regex library.  We don't define
   them unless specifically requested.  */

#if defined _REGEX_RE_COMP || defined _LIBC
/* Match S against the pattern held in re_comp_buf.  Returns 1 if
   regexec reports a match (returns 0), and 0 otherwise.  */
int
# ifdef _LIBC
weak_function
# endif
re_exec (const char *s)
{
  return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
}
#endif /* _REGEX_RE_COMP */
604
605/* Internal entry point. */
606
607/* Searches for a compiled pattern PREG in the string STRING, whose
608 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
609 meanings as in regexec. START and RANGE have the same meanings
610 as in re_search.
611 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
612 otherwise return the error code.
613 Note: We assume front end functions already check ranges.
614 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
615
/* Core search driver.  Candidate start offsets are visited from START
   towards START + RANGE (stepping by INCR, which is -1 when RANGE is
   negative); check_matching is run at each plausible offset.  On success
   the registers in PMATCH are filled and REG_NOERROR is returned; when the
   candidates are exhausted REG_NOMATCH is returned; an error code reported
   by a helper is passed through unchanged.  */
616static reg_errcode_t
617re_search_internal (const regex_t *preg,
618 const char *string,
619 int length, int start, int range, int stop,
620 size_t nmatch, regmatch_t pmatch[],
621 int eflags)
622{
623 reg_errcode_t err;
624 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
625 int left_lim, right_lim, incr;
626 int fl_longest_match, match_first, match_kind, match_last = -1;
627 int extra_nmatch;
628 int sb, ch;
629#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
630 re_match_context_t mctx = { .dfa = dfa };
631#else
632 re_match_context_t mctx;
633#endif
/* The fastmap is only consulted when it exists, is marked accurate, RANGE
   is non-zero and the pattern is not flagged as able to match the empty
   string; otherwise FASTMAP stays NULL.  */
634 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
635 && range && !preg->can_be_null) ? preg->fastmap : NULL;
636 RE_TRANSLATE_TYPE t = preg->translate;
637
638#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
639 memset (&mctx, '\0', sizeof (re_match_context_t));
640 mctx.dfa = dfa;
641#endif
642
/* PMATCH slots beyond the pattern's re_nsub + 1 registers are counted in
   EXTRA_NMATCH and removed from NMATCH; they are set to -1 by the
   register-filling code near the end of this function.  */
643 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
644 nmatch -= extra_nmatch;
645
646 /* Check if the DFA haven't been compiled. */
647 if (BE (preg->used == 0 || dfa->init_state == NULL
648 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
649 || dfa->init_state_begbuf == NULL, 0))
650 return REG_NOMATCH;
651
652#ifdef DEBUG
653 /* We assume front-end functions already check them. */
654 assert (start + range >= 0 && start + range <= length);
655#endif
656
657 /* If initial states with non-begbuf contexts have no elements,
658 the regex must be anchored. If preg->newline_anchor is set,
659 we'll never use init_state_nl, so do not check it. */
660 if (dfa->init_state->nodes.nelem == 0
661 && dfa->init_state_word->nodes.nelem == 0
662 && (dfa->init_state_nl->nodes.nelem == 0
663 || !preg->newline_anchor))
664 {
665 if (start != 0 && start + range != 0)
666 return REG_NOMATCH;
667 start = range = 0;
668 }
669
670 /* We must check the longest matching, if nmatch > 0. */
671 fl_longest_match = (nmatch != 0 || dfa->nbackref);
672
673 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
674 preg->translate, preg->syntax & RE_ICASE, dfa);
675 if (BE (err != REG_NOERROR, 0))
676 goto free_return;
677 mctx.input.stop = stop;
678 mctx.input.raw_stop = stop;
679 mctx.input.newline_anchor = preg->newline_anchor;
680
681 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
682 if (BE (err != REG_NOERROR, 0))
683 goto free_return;
684
685 /* We will log all the DFA states through which the dfa pass,
686 if nmatch > 1, or this dfa has "multibyte node", which is a
687 back-reference or a node which can accept multibyte character or
688 multi character collating element. */
689 if (nmatch > 1 || dfa->has_mb_node)
690 {
691 /* Avoid overflow. */
692 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
693 {
694 err = REG_ESPACE;
695 goto free_return;
696 }
697
698 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
699 if (BE (mctx.state_log == NULL, 0))
700 {
701 err = REG_ESPACE;
702 goto free_return;
703 }
704 }
705 else
706 mctx.state_log = NULL;
707
708 match_first = start;
709 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
710 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
711
712 /* Check incrementally whether of not the input string match. */
713 incr = (range < 0) ? -1 : 1;
714 left_lim = (range < 0) ? start + range : start;
715 right_lim = (range < 0) ? start : start + range;
716 sb = dfa->mb_cur_max == 1;
/* MATCH_KIND picks the scanning strategy used by the switch below.
   8 means there is no fastmap.  Otherwise it is a bit mask: 4 is set when
   the DFA is single-byte or neither RE_ICASE nor a translate table is in
   effect, 2 when searching forward (RANGE >= 0), and 1 when a translate
   table is present.  */
717 match_kind =
718 (fastmap
719 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
720 | (range >= 0 ? 2 : 0)
721 | (t != NULL ? 1 : 0))
722 : 8);
723
724 for (;; match_first += incr)
725 {
/* ERR is preset so that every "goto free_return" taken while scanning
   for a start position reports REG_NOMATCH.  */
726 err = REG_NOMATCH;
727 if (match_first < left_lim || right_lim < match_first)
728 goto free_return;
729
730 /* Advance as rapidly as possible through the string, until we
731 find a plausible place to start matching. This may be done
732 with varying efficiency, so there are various possibilities:
733 only the most common of them are specialized, in order to
734 save on code size. We use a switch statement for speed. */
735 switch (match_kind)
736 {
737 case 8:
738 /* No fastmap. */
739 break;
740
741 case 7:
742 /* Fastmap with single-byte translation, match forward. */
743 while (BE (match_first < right_lim, 1)
744 && !fastmap[t[(unsigned char) string[match_first]]])
745 ++match_first;
746 goto forward_match_found_start_or_reached_end;
747
748 case 6:
749 /* Fastmap without translation, match forward. */
750 while (BE (match_first < right_lim, 1)
751 && !fastmap[(unsigned char) string[match_first]])
752 ++match_first;
753
/* Shared tail of cases 7 and 6: when the scan stopped at RIGHT_LIM, test
   that position too, treating an offset at or past LENGTH as byte 0.  */
754 forward_match_found_start_or_reached_end:
755 if (BE (match_first == right_lim, 0))
756 {
757 ch = match_first >= length
758 ? 0 : (unsigned char) string[match_first];
759 if (!fastmap[t ? t[ch] : ch])
760 goto free_return;
761 }
762 break;
763
764 case 4:
765 case 5:
766 /* Fastmap without multi-byte translation, match backwards. */
767 while (match_first >= left_lim)
768 {
769 ch = match_first >= length
770 ? 0 : (unsigned char) string[match_first];
771 if (fastmap[t ? t[ch] : ch])
772 break;
773 --match_first;
774 }
775 if (match_first < left_lim)
776 goto free_return;
777 break;
778
779 default:
780 /* In this case, we can't determine easily the current byte,
781 since it might be a component byte of a multibyte
782 character. Then we use the constructed buffer instead. */
783 for (;;)
784 {
785 /* If MATCH_FIRST is out of the valid range, reconstruct the
786 buffers. */
787 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
788 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
789 {
790 err = re_string_reconstruct (&mctx.input, match_first,
791 eflags);
792 if (BE (err != REG_NOERROR, 0))
793 goto free_return;
794
795 offset = match_first - mctx.input.raw_mbs_idx;
796 }
797 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
798 Note that MATCH_FIRST must not be smaller than 0. */
799 ch = (match_first >= length
800 ? 0 : re_string_byte_at (&mctx.input, offset));
801 if (fastmap[ch])
802 break;
803 match_first += incr;
804 if (match_first < left_lim || match_first > right_lim)
805 {
806 err = REG_NOMATCH;
807 goto free_return;
808 }
809 }
810 break;
811 }
812
813 /* Reconstruct the buffers so that the matcher can assume that
814 the matching starts from the beginning of the buffer. */
815 err = re_string_reconstruct (&mctx.input, match_first, eflags);
816 if (BE (err != REG_NOERROR, 0))
817 goto free_return;
818
819#ifdef RE_ENABLE_I18N
820 /* Don't consider this char as a possible match start if it part,
821 yet isn't the head, of a multibyte character. */
822 if (!sb && !re_string_first_byte (&mctx.input, 0))
823 continue;
824#endif
825
826 /* It seems to be appropriate one, then use the matcher. */
827 /* We assume that the matching starts from 0. */
828 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
/* MATCH_FIRST is handed to check_matching only for forward searches, so
   that it may advance the next start position when the attempt fails.  */
829 match_last = check_matching (&mctx, fl_longest_match,
830 range >= 0 ? &match_first : NULL);
831 if (match_last != -1)
832 {
/* -2 is check_matching's error indication; it is reported as REG_ESPACE.  */
833 if (BE (match_last == -2, 0))
834 {
835 err = REG_ESPACE;
836 goto free_return;
837 }
838 else
839 {
840 mctx.match_last = match_last;
841 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
842 {
843 re_dfastate_t *pstate = mctx.state_log[match_last];
844 mctx.last_node = check_halt_state_context (&mctx, pstate,
845 match_last);
846 }
847 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
848 || dfa->nbackref)
849 {
850 err = prune_impossible_nodes (&mctx);
851 if (err == REG_NOERROR)
852 break;
853 if (BE (err != REG_NOMATCH, 0))
854 goto free_return;
/* Pruning rejected this candidate; fall through and try the next start.  */
855 match_last = -1;
856 }
857 else
858 break; /* We found a match. */
859 }
860 }
861
/* This start position did not yield a match; clean the match context
   before the next attempt.  */
862 match_ctx_clean (&mctx);
863 }
864
865#ifdef DEBUG
866 assert (match_last != -1);
867 assert (err == REG_NOERROR);
868#endif
869
870 /* Set pmatch[] if we need. */
871 if (nmatch > 0)
872 {
873 int reg_idx;
874
875 /* Initialize registers. */
876 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
877 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
878
879 /* Set the points where matching start/end. */
880 pmatch[0].rm_so = 0;
881 pmatch[0].rm_eo = mctx.match_last;
882
883 if (!preg->no_sub && nmatch > 1)
884 {
885 err = set_regs (preg, &mctx, nmatch, pmatch,
886 dfa->has_plural_match && dfa->nbackref > 0);
887 if (BE (err != REG_NOERROR, 0))
888 goto free_return;
889 }
890
891 /* At last, add the offset to the each registers, since we slided
892 the buffers so that we could assume that the matching starts
893 from 0. */
894 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
895 if (pmatch[reg_idx].rm_so != -1)
896 {
897#ifdef RE_ENABLE_I18N
898 if (BE (mctx.input.offsets_needed != 0, 0))
899 {
900 pmatch[reg_idx].rm_so =
901 (pmatch[reg_idx].rm_so == mctx.input.valid_len
902 ? mctx.input.valid_raw_len
903 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
904 pmatch[reg_idx].rm_eo =
905 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
906 ? mctx.input.valid_raw_len
907 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
908 }
909#else
910 assert (mctx.input.offsets_needed == 0);
911#endif
912 pmatch[reg_idx].rm_so += match_first;
913 pmatch[reg_idx].rm_eo += match_first;
914 }
/* Mark the surplus caller-supplied registers as unused.  */
915 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
916 {
917 pmatch[nmatch + reg_idx].rm_so = -1;
918 pmatch[nmatch + reg_idx].rm_eo = -1;
919 }
920
/* Where subexp_map redirects a subexpression to another one, copy the
   mapped register's offsets into this register.  */
921 if (dfa->subexp_map)
922 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
923 if (dfa->subexp_map[reg_idx] != reg_idx)
924 {
925 pmatch[reg_idx + 1].rm_so
926 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
927 pmatch[reg_idx + 1].rm_eo
928 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
929 }
930 }
931
/* Common exit for success and failure: release the state log, the match
   context (freed only when the pattern has back references) and the
   input string, then return ERR.  */
932 free_return:
933 re_free (mctx.state_log);
934 if (dfa->nbackref)
935 match_ctx_free (&mctx);
936 re_string_destruct (&mctx.input);
937 return err;
938}
939
940static reg_errcode_t
941prune_impossible_nodes (re_match_context_t *mctx)
942{
943 const re_dfa_t *const dfa = mctx->dfa;
944 int halt_node, match_last;
945 reg_errcode_t ret;
946 re_dfastate_t **sifted_states;
947 re_dfastate_t **lim_states = NULL;
948 re_sift_context_t sctx;
949#ifdef DEBUG
950 assert (mctx->state_log != NULL);
951#endif
952 match_last = mctx->match_last;
953 halt_node = mctx->last_node;
954
955 /* Avoid overflow. */
956 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
957 return REG_ESPACE;
958
959 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
960 if (BE (sifted_states == NULL, 0))
961 {
962 ret = REG_ESPACE;
963 goto free_return;
964 }
965 if (dfa->nbackref)
966 {
967 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
968 if (BE (lim_states == NULL, 0))
969 {
970 ret = REG_ESPACE;
971 goto free_return;
972 }
973 while (1)
974 {
975 memset (lim_states, '\0',
976 sizeof (re_dfastate_t *) * (match_last + 1));
977 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
978 match_last);
979 ret = sift_states_backward (mctx, &sctx);
980 re_node_set_free (&sctx.limits);
981 if (BE (ret != REG_NOERROR, 0))
982 goto free_return;
983 if (sifted_states[0] != NULL || lim_states[0] != NULL)
984 break;
985 do
986 {
987 --match_last;
988 if (match_last < 0)
989 {
990 ret = REG_NOMATCH;
991 goto free_return;
992 }
993 } while (mctx->state_log[match_last] == NULL
994 || !mctx->state_log[match_last]->halt);
995 halt_node = check_halt_state_context (mctx,
996 mctx->state_log[match_last],
997 match_last);
998 }
999 ret = merge_state_array (dfa, sifted_states, lim_states,
1000 match_last + 1);
1001 re_free (lim_states);
1002 lim_states = NULL;
1003 if (BE (ret != REG_NOERROR, 0))
1004 goto free_return;
1005 }
1006 else
1007 {
1008 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1009 ret = sift_states_backward (mctx, &sctx);
1010 re_node_set_free (&sctx.limits);
1011 if (BE (ret != REG_NOERROR, 0))
1012 goto free_return;
1013 if (sifted_states[0] == NULL)
1014 {
1015 ret = REG_NOMATCH;
1016 goto free_return;
1017 }
1018 }
1019 re_free (mctx->state_log);
1020 mctx->state_log = sifted_states;
1021 sifted_states = NULL;
1022 mctx->last_node = halt_node;
1023 mctx->match_last = match_last;
1024 ret = REG_NOERROR;
1025 free_return:
1026 re_free (sifted_states);
1027 re_free (lim_states);
1028 return ret;
1029}
1030
1031/* Acquire an initial state and return it.
1032 We must select appropriate initial state depending on the context,
1033 since initial states may have constraints like "\<", "^", etc.. */
1034
1035static inline re_dfastate_t *
1036__attribute ((always_inline)) internal_function
1037acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1038 int idx)
1039{
1040 const re_dfa_t *const dfa = mctx->dfa;
1041 if (dfa->init_state->has_constraint)
1042 {
1043 unsigned int context;
1044 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1045 if (IS_WORD_CONTEXT (context))
1046 return dfa->init_state_word;
1047 else if (IS_ORDINARY_CONTEXT (context))
1048 return dfa->init_state;
1049 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1050 return dfa->init_state_begbuf;
1051 else if (IS_NEWLINE_CONTEXT (context))
1052 return dfa->init_state_nl;
1053 else if (IS_BEGBUF_CONTEXT (context))
1054 {
1055 /* It is relatively rare case, then calculate on demand. */
1056 return re_acquire_state_context (err, dfa,
1057 dfa->init_state->entrance_nodes,
1058 context);
1059 }
1060 else
1061 /* Must not happen? */
1062 return dfa->init_state;
1063 }
1064 else
1065 return dfa->init_state;
1066}
1067
1068/* Check whether the regular expression match input string INPUT or not,
1069 and return the index where the matching end, return -1 if not match,
1070 or return -2 in case of an error.
1071 FL_LONGEST_MATCH means we want the POSIX longest matching.
1072 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1073 next place where we may want to try matching.
1074 Note that the matcher assume that the matching starts from the current
1075 index of the buffer. */
1076
1077static int
1078internal_function
1079check_matching (re_match_context_t *mctx, int fl_longest_match,
1080 int *p_match_first)
1081{
1082 const re_dfa_t *const dfa = mctx->dfa;
1083 reg_errcode_t err;
1084 int match = 0;
1085 int match_last = -1;
1086 int cur_str_idx = re_string_cur_idx (&mctx->input);
1087 re_dfastate_t *cur_state;
1088 int at_init_state = p_match_first != NULL;
1089 int next_start_idx = cur_str_idx;
1090
1091 err = REG_NOERROR;
1092 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1093 /* An initial state must not be NULL (invalid). */
1094 if (BE (cur_state == NULL, 0))
1095 {
1096 assert (err == REG_ESPACE);
1097 return -2;
1098 }
1099
1100 if (mctx->state_log != NULL)
1101 {
1102 mctx->state_log[cur_str_idx] = cur_state;
1103
1104 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1105 later. E.g. Processing back references. */
1106 if (BE (dfa->nbackref, 0))
1107 {
1108 at_init_state = 0;
1109 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1110 if (BE (err != REG_NOERROR, 0))
1111 return err;
1112
1113 if (cur_state->has_backref)
1114 {
1115 err = transit_state_bkref (mctx, &cur_state->nodes);
1116 if (BE (err != REG_NOERROR, 0))
1117 return err;
1118 }
1119 }
1120 }
1121
1122 /* If the RE accepts NULL string. */
1123 if (BE (cur_state->halt, 0))
1124 {
1125 if (!cur_state->has_constraint
1126 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1127 {
1128 if (!fl_longest_match)
1129 return cur_str_idx;
1130 else
1131 {
1132 match_last = cur_str_idx;
1133 match = 1;
1134 }
1135 }
1136 }
1137
1138 while (!re_string_eoi (&mctx->input))
1139 {
1140 re_dfastate_t *old_state = cur_state;
1141 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1142
1143 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1144 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1145 && mctx->input.valid_len < mctx->input.len))
1146 {
1147 err = extend_buffers (mctx);
1148 if (BE (err != REG_NOERROR, 0))
1149 {
1150 assert (err == REG_ESPACE);
1151 return -2;
1152 }
1153 }
1154
1155 cur_state = transit_state (&err, mctx, cur_state);
1156 if (mctx->state_log != NULL)
1157 cur_state = merge_state_with_log (&err, mctx, cur_state);
1158
1159 if (cur_state == NULL)
1160 {
1161 /* Reached the invalid state or an error. Try to recover a valid
1162 state using the state log, if available and if we have not
1163 already found a valid (even if not the longest) match. */
1164 if (BE (err != REG_NOERROR, 0))
1165 return -2;
1166
1167 if (mctx->state_log == NULL
1168 || (match && !fl_longest_match)
1169 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1170 break;
1171 }
1172
1173 if (BE (at_init_state, 0))
1174 {
1175 if (old_state == cur_state)
1176 next_start_idx = next_char_idx;
1177 else
1178 at_init_state = 0;
1179 }
1180
1181 if (cur_state->halt)
1182 {
1183 /* Reached a halt state.
1184 Check the halt state can satisfy the current context. */
1185 if (!cur_state->has_constraint
1186 || check_halt_state_context (mctx, cur_state,
1187 re_string_cur_idx (&mctx->input)))
1188 {
1189 /* We found an appropriate halt state. */
1190 match_last = re_string_cur_idx (&mctx->input);
1191 match = 1;
1192
1193 /* We found a match, do not modify match_first below. */
1194 p_match_first = NULL;
1195 if (!fl_longest_match)
1196 break;
1197 }
1198 }
1199 }
1200
1201 if (p_match_first)
1202 *p_match_first += next_start_idx;
1203
1204 return match_last;
1205}
1206
1207/* Check NODE match the current context. */
1208
1209static int
1210internal_function
1211check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1212{
1213 re_token_type_t type = dfa->nodes[node].type;
1214 unsigned int constraint = dfa->nodes[node].constraint;
1215 if (type != END_OF_RE)
1216 return 0;
1217 if (!constraint)
1218 return 1;
1219 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1220 return 0;
1221 return 1;
1222}
1223
1224/* Check the halt state STATE match the current context.
1225 Return 0 if not match, if the node, STATE has, is a halt node and
1226 match the context, return the node. */
1227
1228static int
1229internal_function
1230check_halt_state_context (const re_match_context_t *mctx,
1231 const re_dfastate_t *state, int idx)
1232{
1233 int i;
1234 unsigned int context;
1235#ifdef DEBUG
1236 assert (state->halt);
1237#endif
1238 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1239 for (i = 0; i < state->nodes.nelem; ++i)
1240 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1241 return state->nodes.elems[i];
1242 return 0;
1243}
1244
1245/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1246 corresponding to the DFA).
1247 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1248 of errors. */
1249
1250static int
1251internal_function
1252proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1253 int *pidx, int node, re_node_set *eps_via_nodes,
1254 struct re_fail_stack_t *fs)
1255{
1256 const re_dfa_t *const dfa = mctx->dfa;
1257 int i, err;
1258 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1259 {
1260 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1261 re_node_set *edests = &dfa->edests[node];
1262 int dest_node;
1263 err = re_node_set_insert (eps_via_nodes, node);
1264 if (BE (err < 0, 0))
1265 return -2;
1266 /* Pick up a valid destination, or return -1 if none is found. */
1267 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1268 {
1269 int candidate = edests->elems[i];
1270 if (!re_node_set_contains (cur_nodes, candidate))
1271 continue;
1272 if (dest_node == -1)
1273 dest_node = candidate;
1274
1275 else
1276 {
1277 /* In order to avoid infinite loop like "(a*)*", return the second
1278 epsilon-transition if the first was already considered. */
1279 if (re_node_set_contains (eps_via_nodes, dest_node))
1280 return candidate;
1281
1282 /* Otherwise, push the second epsilon-transition on the fail stack. */
1283 else if (fs != NULL
1284 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1285 eps_via_nodes))
1286 return -2;
1287
1288 /* We know we are going to exit. */
1289 break;
1290 }
1291 }
1292 return dest_node;
1293 }
1294 else
1295 {
1296 int naccepted = 0;
1297 re_token_type_t type = dfa->nodes[node].type;
1298
1299#ifdef RE_ENABLE_I18N
1300 if (dfa->nodes[node].accept_mb)
1301 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1302 else
1303#endif /* RE_ENABLE_I18N */
1304 if (type == OP_BACK_REF)
1305 {
1306 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1307 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1308 if (fs != NULL)
1309 {
1310 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1311 return -1;
1312 else if (naccepted)
1313 {
1314 char *buf = (char *) re_string_get_buffer (&mctx->input);
1315 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1316 naccepted) != 0)
1317 return -1;
1318 }
1319 }
1320
1321 if (naccepted == 0)
1322 {
1323 int dest_node;
1324 err = re_node_set_insert (eps_via_nodes, node);
1325 if (BE (err < 0, 0))
1326 return -2;
1327 dest_node = dfa->edests[node].elems[0];
1328 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1329 dest_node))
1330 return dest_node;
1331 }
1332 }
1333
1334 if (naccepted != 0
1335 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1336 {
1337 int dest_node = dfa->nexts[node];
1338 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1339 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1340 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1341 dest_node)))
1342 return -1;
1343 re_node_set_empty (eps_via_nodes);
1344 return dest_node;
1345 }
1346 }
1347 return -1;
1348}
1349
1350static reg_errcode_t
1351internal_function
1352push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1353 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1354{
1355 reg_errcode_t err;
1356 int num = fs->num++;
1357 if (fs->num == fs->alloc)
1358 {
1359 struct re_fail_stack_ent_t *new_array;
1360 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1361 * fs->alloc * 2));
1362 if (new_array == NULL)
1363 return REG_ESPACE;
1364 fs->alloc *= 2;
1365 fs->stack = new_array;
1366 }
1367 fs->stack[num].idx = str_idx;
1368 fs->stack[num].node = dest_node;
1369 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1370 if (fs->stack[num].regs == NULL)
1371 return REG_ESPACE;
1372 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1373 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1374 return err;
1375}
1376
1377static int
1378internal_function
1379pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1380 regmatch_t *regs, re_node_set *eps_via_nodes)
1381{
1382 int num = --fs->num;
1383 assert (num >= 0);
1384 *pidx = fs->stack[num].idx;
1385 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1386 re_node_set_free (eps_via_nodes);
1387 re_free (fs->stack[num].regs);
1388 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1389 return fs->stack[num].node;
1390}
1391
1392/* Set the positions where the subexpressions are starts/ends to registers
1393 PMATCH.
1394 Note: We assume that pmatch[0] is already set, and
1395 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1396
1397static reg_errcode_t
1398internal_function
1399set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1400 regmatch_t *pmatch, int fl_backtrack)
1401{
1402 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1403 int idx, cur_node;
1404 re_node_set eps_via_nodes;
1405 struct re_fail_stack_t *fs;
1406 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1407 regmatch_t *prev_idx_match;
1408 int prev_idx_match_malloced = 0;
1409
1410#ifdef DEBUG
1411 assert (nmatch > 1);
1412 assert (mctx->state_log != NULL);
1413#endif
1414 if (fl_backtrack)
1415 {
1416 fs = &fs_body;
1417 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1418 if (fs->stack == NULL)
1419 return REG_ESPACE;
1420 }
1421 else
1422 fs = NULL;
1423
1424 cur_node = dfa->init_node;
1425 re_node_set_init_empty (&eps_via_nodes);
1426
1427#ifdef HAVE_ALLOCA
1428 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1429 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1430 else
1431#endif
1432 {
1433 prev_idx_match = re_malloc (regmatch_t, nmatch);
1434 if (prev_idx_match == NULL)
1435 {
1436 free_fail_stack_return (fs);
1437 return REG_ESPACE;
1438 }
1439 prev_idx_match_malloced = 1;
1440 }
1441 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1442
1443 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1444 {
1445 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1446
1447 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1448 {
1449 int reg_idx;
1450 if (fs)
1451 {
1452 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1453 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1454 break;
1455 if (reg_idx == nmatch)
1456 {
1457 re_node_set_free (&eps_via_nodes);
1458 if (prev_idx_match_malloced)
1459 re_free (prev_idx_match);
1460 return free_fail_stack_return (fs);
1461 }
1462 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1463 &eps_via_nodes);
1464 }
1465 else
1466 {
1467 re_node_set_free (&eps_via_nodes);
1468 if (prev_idx_match_malloced)
1469 re_free (prev_idx_match);
1470 return REG_NOERROR;
1471 }
1472 }
1473
1474 /* Proceed to next node. */
1475 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1476 &eps_via_nodes, fs);
1477
1478 if (BE (cur_node < 0, 0))
1479 {
1480 if (BE (cur_node == -2, 0))
1481 {
1482 re_node_set_free (&eps_via_nodes);
1483 if (prev_idx_match_malloced)
1484 re_free (prev_idx_match);
1485 free_fail_stack_return (fs);
1486 return REG_ESPACE;
1487 }
1488 if (fs)
1489 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1490 &eps_via_nodes);
1491 else
1492 {
1493 re_node_set_free (&eps_via_nodes);
1494 if (prev_idx_match_malloced)
1495 re_free (prev_idx_match);
1496 return REG_NOMATCH;
1497 }
1498 }
1499 }
1500 re_node_set_free (&eps_via_nodes);
1501 if (prev_idx_match_malloced)
1502 re_free (prev_idx_match);
1503 return free_fail_stack_return (fs);
1504}
1505
1506static reg_errcode_t
1507internal_function
1508free_fail_stack_return (struct re_fail_stack_t *fs)
1509{
1510 if (fs)
1511 {
1512 int fs_idx;
1513 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1514 {
1515 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1516 re_free (fs->stack[fs_idx].regs);
1517 }
1518 re_free (fs->stack);
1519 }
1520 return REG_NOERROR;
1521}
1522
1523static void
1524internal_function
1525update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1526 regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1527{
1528 int type = dfa->nodes[cur_node].type;
1529 if (type == OP_OPEN_SUBEXP)
1530 {
1531 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1532
1533 /* We are at the first node of this sub expression. */
1534 if (reg_num < nmatch)
1535 {
1536 pmatch[reg_num].rm_so = cur_idx;
1537 pmatch[reg_num].rm_eo = -1;
1538 }
1539 }
1540 else if (type == OP_CLOSE_SUBEXP)
1541 {
1542 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1543 if (reg_num < nmatch)
1544 {
1545 /* We are at the last node of this sub expression. */
1546 if (pmatch[reg_num].rm_so < cur_idx)
1547 {
1548 pmatch[reg_num].rm_eo = cur_idx;
1549 /* This is a non-empty match or we are not inside an optional
1550 subexpression. Accept this right away. */
1551 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1552 }
1553 else
1554 {
1555 if (dfa->nodes[cur_node].opt_subexp
1556 && prev_idx_match[reg_num].rm_so != -1)
1557 /* We transited through an empty match for an optional
1558 subexpression, like (a?)*, and this is not the subexp's
1559 first match. Copy back the old content of the registers
1560 so that matches of an inner subexpression are undone as
1561 well, like in ((a?))*. */
1562 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1563 else
1564 /* We completed a subexpression, but it may be part of
1565 an optional one, so do not update PREV_IDX_MATCH. */
1566 pmatch[reg_num].rm_eo = cur_idx;
1567 }
1568 }
1569 }
1570}
1571
1572/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1573 and sift the nodes in each states according to the following rules.
1574 Updated state_log will be wrote to STATE_LOG.
1575
1576 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1577 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1578 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1579 the LAST_NODE, we throw away the node `a'.
1580 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1581 string `s' and transit to `b':
1582 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1583 away the node `a'.
1584 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1585 thrown away, we throw away the node `a'.
1586 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1587 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1588 node `a'.
1589 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1590 we throw away the node `a'. */
1591
1592#define STATE_NODE_CONTAINS(state,node) \
1593 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1594
1595static reg_errcode_t
1596internal_function
1597sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1598{
1599 reg_errcode_t err;
1600 int null_cnt = 0;
1601 int str_idx = sctx->last_str_idx;
1602 re_node_set cur_dest;
1603
1604#ifdef DEBUG
1605 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1606#endif
1607
1608 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1609 transit to the last_node and the last_node itself. */
1610 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1611 if (BE (err != REG_NOERROR, 0))
1612 return err;
1613 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1614 if (BE (err != REG_NOERROR, 0))
1615 goto free_return;
1616
1617 /* Then check each states in the state_log. */
1618 while (str_idx > 0)
1619 {
1620 /* Update counters. */
1621 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1622 if (null_cnt > mctx->max_mb_elem_len)
1623 {
1624 memset (sctx->sifted_states, '\0',
1625 sizeof (re_dfastate_t *) * str_idx);
1626 re_node_set_free (&cur_dest);
1627 return REG_NOERROR;
1628 }
1629 re_node_set_empty (&cur_dest);
1630 --str_idx;
1631
1632 if (mctx->state_log[str_idx])
1633 {
1634 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1635 if (BE (err != REG_NOERROR, 0))
1636 goto free_return;
1637 }
1638
1639 /* Add all the nodes which satisfy the following conditions:
1640 - It can epsilon transit to a node in CUR_DEST.
1641 - It is in CUR_SRC.
1642 And update state_log. */
1643 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1644 if (BE (err != REG_NOERROR, 0))
1645 goto free_return;
1646 }
1647 err = REG_NOERROR;
1648 free_return:
1649 re_node_set_free (&cur_dest);
1650 return err;
1651}
1652
1653static reg_errcode_t
1654internal_function
1655build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1656 int str_idx, re_node_set *cur_dest)
1657{
1658 const re_dfa_t *const dfa = mctx->dfa;
1659 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1660 int i;
1661
1662 /* Then build the next sifted state.
1663 We build the next sifted state on `cur_dest', and update
1664 `sifted_states[str_idx]' with `cur_dest'.
1665 Note:
1666 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1667 `cur_src' points the node_set of the old `state_log[str_idx]'
1668 (with the epsilon nodes pre-filtered out). */
1669 for (i = 0; i < cur_src->nelem; i++)
1670 {
1671 int prev_node = cur_src->elems[i];
1672 int naccepted = 0;
1673 int ret;
1674
1675#ifdef DEBUG
1676 re_token_type_t type = dfa->nodes[prev_node].type;
1677 assert (!IS_EPSILON_NODE (type));
1678#endif
1679#ifdef RE_ENABLE_I18N
1680 /* If the node may accept `multi byte'. */
1681 if (dfa->nodes[prev_node].accept_mb)
1682 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1683 str_idx, sctx->last_str_idx);
1684#endif /* RE_ENABLE_I18N */
1685
1686 /* We don't check backreferences here.
1687 See update_cur_sifted_state(). */
1688 if (!naccepted
1689 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1690 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1691 dfa->nexts[prev_node]))
1692 naccepted = 1;
1693
1694 if (naccepted == 0)
1695 continue;
1696
1697 if (sctx->limits.nelem)
1698 {
1699 int to_idx = str_idx + naccepted;
1700 if (check_dst_limits (mctx, &sctx->limits,
1701 dfa->nexts[prev_node], to_idx,
1702 prev_node, str_idx))
1703 continue;
1704 }
1705 ret = re_node_set_insert (cur_dest, prev_node);
1706 if (BE (ret == -1, 0))
1707 return REG_ESPACE;
1708 }
1709
1710 return REG_NOERROR;
1711}
1712
1713/* Helper functions. */
1714
1715static reg_errcode_t
1716internal_function
1717clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1718{
1719 int top = mctx->state_log_top;
1720
1721 if (next_state_log_idx >= mctx->input.bufs_len
1722 || (next_state_log_idx >= mctx->input.valid_len
1723 && mctx->input.valid_len < mctx->input.len))
1724 {
1725 reg_errcode_t err;
1726 err = extend_buffers (mctx);
1727 if (BE (err != REG_NOERROR, 0))
1728 return err;
1729 }
1730
1731 if (top < next_state_log_idx)
1732 {
1733 memset (mctx->state_log + top + 1, '\0',
1734 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1735 mctx->state_log_top = next_state_log_idx;
1736 }
1737 return REG_NOERROR;
1738}
1739
1740static reg_errcode_t
1741internal_function
1742merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1743 re_dfastate_t **src, int num)
1744{
1745 int st_idx;
1746 reg_errcode_t err;
1747 for (st_idx = 0; st_idx < num; ++st_idx)
1748 {
1749 if (dst[st_idx] == NULL)
1750 dst[st_idx] = src[st_idx];
1751 else if (src[st_idx] != NULL)
1752 {
1753 re_node_set merged_set;
1754 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1755 &src[st_idx]->nodes);
1756 if (BE (err != REG_NOERROR, 0))
1757 return err;
1758 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1759 re_node_set_free (&merged_set);
1760 if (BE (err != REG_NOERROR, 0))
1761 return err;
1762 }
1763 }
1764 return REG_NOERROR;
1765}
1766
1767static reg_errcode_t
1768internal_function
1769update_cur_sifted_state (const re_match_context_t *mctx,
1770 re_sift_context_t *sctx, int str_idx,
1771 re_node_set *dest_nodes)
1772{
1773 const re_dfa_t *const dfa = mctx->dfa;
1774 reg_errcode_t err = REG_NOERROR;
1775 const re_node_set *candidates;
1776 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1777 : &mctx->state_log[str_idx]->nodes);
1778
1779 if (dest_nodes->nelem == 0)
1780 sctx->sifted_states[str_idx] = NULL;
1781 else
1782 {
1783 if (candidates)
1784 {
1785 /* At first, add the nodes which can epsilon transit to a node in
1786 DEST_NODE. */
1787 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1788 if (BE (err != REG_NOERROR, 0))
1789 return err;
1790
1791 /* Then, check the limitations in the current sift_context. */
1792 if (sctx->limits.nelem)
1793 {
1794 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1795 mctx->bkref_ents, str_idx);
1796 if (BE (err != REG_NOERROR, 0))
1797 return err;
1798 }
1799 }
1800
1801 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1802 if (BE (err != REG_NOERROR, 0))
1803 return err;
1804 }
1805
1806 if (candidates && mctx->state_log[str_idx]->has_backref)
1807 {
1808 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1809 if (BE (err != REG_NOERROR, 0))
1810 return err;
1811 }
1812 return REG_NOERROR;
1813}
1814
1815static reg_errcode_t
1816internal_function
1817add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1818 const re_node_set *candidates)
1819{
1820 reg_errcode_t err = REG_NOERROR;
1821 int i;
1822
1823 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1824 if (BE (err != REG_NOERROR, 0))
1825 return err;
1826
1827 if (!state->inveclosure.alloc)
1828 {
1829 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1830 if (BE (err != REG_NOERROR, 0))
1831 return REG_ESPACE;
1832 for (i = 0; i < dest_nodes->nelem; i++)
1833 {
1834 err = re_node_set_merge (&state->inveclosure,
1835 dfa->inveclosures + dest_nodes->elems[i]);
1836 if (BE (err != REG_NOERROR, 0))
1837 return REG_ESPACE;
1838 }
1839 }
1840 return re_node_set_add_intersect (dest_nodes, candidates,
1841 &state->inveclosure);
1842}
1843
1844static reg_errcode_t
1845internal_function
1846sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1847 const re_node_set *candidates)
1848{
1849 int ecl_idx;
1850 reg_errcode_t err;
1851 re_node_set *inv_eclosure = dfa->inveclosures + node;
1852 re_node_set except_nodes;
1853 re_node_set_init_empty (&except_nodes);
1854 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1855 {
1856 int cur_node = inv_eclosure->elems[ecl_idx];
1857 if (cur_node == node)
1858 continue;
1859 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1860 {
1861 int edst1 = dfa->edests[cur_node].elems[0];
1862 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1863 ? dfa->edests[cur_node].elems[1] : -1);
1864 if ((!re_node_set_contains (inv_eclosure, edst1)
1865 && re_node_set_contains (dest_nodes, edst1))
1866 || (edst2 > 0
1867 && !re_node_set_contains (inv_eclosure, edst2)
1868 && re_node_set_contains (dest_nodes, edst2)))
1869 {
1870 err = re_node_set_add_intersect (&except_nodes, candidates,
1871 dfa->inveclosures + cur_node);
1872 if (BE (err != REG_NOERROR, 0))
1873 {
1874 re_node_set_free (&except_nodes);
1875 return err;
1876 }
1877 }
1878 }
1879 }
1880 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1881 {
1882 int cur_node = inv_eclosure->elems[ecl_idx];
1883 if (!re_node_set_contains (&except_nodes, cur_node))
1884 {
1885 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1886 re_node_set_remove_at (dest_nodes, idx);
1887 }
1888 }
1889 re_node_set_free (&except_nodes);
1890 return REG_NOERROR;
1891}
1892
1893static int
1894internal_function
1895check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1896 int dst_node, int dst_idx, int src_node, int src_idx)
1897{
1898 const re_dfa_t *const dfa = mctx->dfa;
1899 int lim_idx, src_pos, dst_pos;
1900
1901 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1902 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1903 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1904 {
1905 int subexp_idx;
1906 struct re_backref_cache_entry *ent;
1907 ent = mctx->bkref_ents + limits->elems[lim_idx];
1908 subexp_idx = dfa->nodes[ent->node].opr.idx;
1909
1910 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1911 subexp_idx, dst_node, dst_idx,
1912 dst_bkref_idx);
1913 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1914 subexp_idx, src_node, src_idx,
1915 src_bkref_idx);
1916
1917 /* In case of:
1918 <src> <dst> ( <subexp> )
1919 ( <subexp> ) <src> <dst>
1920 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1921 if (src_pos == dst_pos)
1922 continue; /* This is unrelated limitation. */
1923 else
1924 return 1;
1925 }
1926 return 0;
1927}
1928
1929static int
1930internal_function
1931check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1932 int subexp_idx, int from_node, int bkref_idx)
1933{
1934 const re_dfa_t *const dfa = mctx->dfa;
1935 const re_node_set *eclosures = dfa->eclosures + from_node;
1936 int node_idx;
1937
1938 /* Else, we are on the boundary: examine the nodes on the epsilon
1939 closure. */
1940 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1941 {
1942 int node = eclosures->elems[node_idx];
1943 switch (dfa->nodes[node].type)
1944 {
1945 case OP_BACK_REF:
1946 if (bkref_idx != -1)
1947 {
1948 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1949 do
1950 {
1951 int dst, cpos;
1952
1953 if (ent->node != node)
1954 continue;
1955
1956 if (subexp_idx < BITSET_WORD_BITS
1957 && !(ent->eps_reachable_subexps_map
1958 & ((bitset_word_t) 1 << subexp_idx)))
1959 continue;
1960
1961 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1962 OP_CLOSE_SUBEXP cases below. But, if the
1963 destination node is the same node as the source
1964 node, don't recurse because it would cause an
1965 infinite loop: a regex that exhibits this behavior
1966 is ()\1*\1* */
1967 dst = dfa->edests[node].elems[0];
1968 if (dst == from_node)
1969 {
1970 if (boundaries & 1)
1971 return -1;
1972 else /* if (boundaries & 2) */
1973 return 0;
1974 }
1975
1976 cpos =
1977 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1978 dst, bkref_idx);
1979 if (cpos == -1 /* && (boundaries & 1) */)
1980 return -1;
1981 if (cpos == 0 && (boundaries & 2))
1982 return 0;
1983
1984 if (subexp_idx < BITSET_WORD_BITS)
1985 ent->eps_reachable_subexps_map
1986 &= ~((bitset_word_t) 1 << subexp_idx);
1987 }
1988 while (ent++->more);
1989 }
1990 break;
1991
1992 case OP_OPEN_SUBEXP:
1993 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1994 return -1;
1995 break;
1996
1997 case OP_CLOSE_SUBEXP:
1998 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1999 return 0;
2000 break;
2001
2002 default:
2003 break;
2004 }
2005 }
2006
2007 return (boundaries & 2) ? 1 : 0;
2008}
2009
2010static int
2011internal_function
2012check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2013 int subexp_idx, int from_node, int str_idx,
2014 int bkref_idx)
2015{
2016 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2017 int boundaries;
2018
2019 /* If we are outside the range of the subexpression, return -1 or 1. */
2020 if (str_idx < lim->subexp_from)
2021 return -1;
2022
2023 if (lim->subexp_to < str_idx)
2024 return 1;
2025
2026 /* If we are within the subexpression, return 0. */
2027 boundaries = (str_idx == lim->subexp_from);
2028 boundaries |= (str_idx == lim->subexp_to) << 1;
2029 if (boundaries == 0)
2030 return 0;
2031
2032 /* Else, examine epsilon closure. */
2033 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2034 from_node, bkref_idx);
2035}
2036
2037/* Check the limitations of sub expressions LIMITS, and remove the nodes
2038 which are against limitations from DEST_NODES. */
2039
2040static reg_errcode_t
2041internal_function
2042check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2043 const re_node_set *candidates, re_node_set *limits,
2044 struct re_backref_cache_entry *bkref_ents, int str_idx)
2045{
2046 reg_errcode_t err;
2047 int node_idx, lim_idx;
2048
2049 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2050 {
2051 int subexp_idx;
2052 struct re_backref_cache_entry *ent;
2053 ent = bkref_ents + limits->elems[lim_idx];
2054
2055 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2056 continue; /* This is unrelated limitation. */
2057
2058 subexp_idx = dfa->nodes[ent->node].opr.idx;
2059 if (ent->subexp_to == str_idx)
2060 {
2061 int ops_node = -1;
2062 int cls_node = -1;
2063 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2064 {
2065 int node = dest_nodes->elems[node_idx];
2066 re_token_type_t type = dfa->nodes[node].type;
2067 if (type == OP_OPEN_SUBEXP
2068 && subexp_idx == dfa->nodes[node].opr.idx)
2069 ops_node = node;
2070 else if (type == OP_CLOSE_SUBEXP
2071 && subexp_idx == dfa->nodes[node].opr.idx)
2072 cls_node = node;
2073 }
2074
2075 /* Check the limitation of the open subexpression. */
2076 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2077 if (ops_node >= 0)
2078 {
2079 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2080 candidates);
2081 if (BE (err != REG_NOERROR, 0))
2082 return err;
2083 }
2084
2085 /* Check the limitation of the close subexpression. */
2086 if (cls_node >= 0)
2087 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2088 {
2089 int node = dest_nodes->elems[node_idx];
2090 if (!re_node_set_contains (dfa->inveclosures + node,
2091 cls_node)
2092 && !re_node_set_contains (dfa->eclosures + node,
2093 cls_node))
2094 {
2095 /* It is against this limitation.
2096 Remove it form the current sifted state. */
2097 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2098 candidates);
2099 if (BE (err != REG_NOERROR, 0))
2100 return err;
2101 --node_idx;
2102 }
2103 }
2104 }
2105 else /* (ent->subexp_to != str_idx) */
2106 {
2107 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2108 {
2109 int node = dest_nodes->elems[node_idx];
2110 re_token_type_t type = dfa->nodes[node].type;
2111 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2112 {
2113 if (subexp_idx != dfa->nodes[node].opr.idx)
2114 continue;
2115 /* It is against this limitation.
2116 Remove it form the current sifted state. */
2117 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2118 candidates);
2119 if (BE (err != REG_NOERROR, 0))
2120 return err;
2121 }
2122 }
2123 }
2124 }
2125 return REG_NOERROR;
2126}
2127
2128static reg_errcode_t
2129internal_function
2130sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2131 int str_idx, const re_node_set *candidates)
2132{
2133 const re_dfa_t *const dfa = mctx->dfa;
2134 reg_errcode_t err;
2135 int node_idx, node;
2136 re_sift_context_t local_sctx;
2137 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2138
2139 if (first_idx == -1)
2140 return REG_NOERROR;
2141
2142 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2143
2144 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2145 {
2146 int enabled_idx;
2147 re_token_type_t type;
2148 struct re_backref_cache_entry *entry;
2149 node = candidates->elems[node_idx];
2150 type = dfa->nodes[node].type;
2151 /* Avoid infinite loop for the REs like "()\1+". */
2152 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2153 continue;
2154 if (type != OP_BACK_REF)
2155 continue;
2156
2157 entry = mctx->bkref_ents + first_idx;
2158 enabled_idx = first_idx;
2159 do
2160 {
2161 int subexp_len;
2162 int to_idx;
2163 int dst_node;
2164 int ret;
2165 re_dfastate_t *cur_state;
2166
2167 if (entry->node != node)
2168 continue;
2169 subexp_len = entry->subexp_to - entry->subexp_from;
2170 to_idx = str_idx + subexp_len;
2171 dst_node = (subexp_len ? dfa->nexts[node]
2172 : dfa->edests[node].elems[0]);
2173
2174 if (to_idx > sctx->last_str_idx
2175 || sctx->sifted_states[to_idx] == NULL
2176 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2177 || check_dst_limits (mctx, &sctx->limits, node,
2178 str_idx, dst_node, to_idx))
2179 continue;
2180
2181 if (local_sctx.sifted_states == NULL)
2182 {
2183 local_sctx = *sctx;
2184 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2185 if (BE (err != REG_NOERROR, 0))
2186 goto free_return;
2187 }
2188 local_sctx.last_node = node;
2189 local_sctx.last_str_idx = str_idx;
2190 ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2191 if (BE (ret < 0, 0))
2192 {
2193 err = REG_ESPACE;
2194 goto free_return;
2195 }
2196 cur_state = local_sctx.sifted_states[str_idx];
2197 err = sift_states_backward (mctx, &local_sctx);
2198 if (BE (err != REG_NOERROR, 0))
2199 goto free_return;
2200 if (sctx->limited_states != NULL)
2201 {
2202 err = merge_state_array (dfa, sctx->limited_states,
2203 local_sctx.sifted_states,
2204 str_idx + 1);
2205 if (BE (err != REG_NOERROR, 0))
2206 goto free_return;
2207 }
2208 local_sctx.sifted_states[str_idx] = cur_state;
2209 re_node_set_remove (&local_sctx.limits, enabled_idx);
2210
2211 /* mctx->bkref_ents may have changed, reload the pointer. */
2212 entry = mctx->bkref_ents + enabled_idx;
2213 }
2214 while (enabled_idx++, entry++->more);
2215 }
2216 err = REG_NOERROR;
2217 free_return:
2218 if (local_sctx.sifted_states != NULL)
2219 {
2220 re_node_set_free (&local_sctx.limits);
2221 }
2222
2223 return err;
2224}
2225
2226
2227#ifdef RE_ENABLE_I18N
2228static int
2229internal_function
2230sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2231 int node_idx, int str_idx, int max_str_idx)
2232{
2233 const re_dfa_t *const dfa = mctx->dfa;
2234 int naccepted;
2235 /* Check the node can accept `multi byte'. */
2236 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2237 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2238 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2239 dfa->nexts[node_idx]))
2240 /* The node can't accept the `multi byte', or the
2241 destination was already thrown away, then the node
2242 couldn't accept the current input `multi byte'. */
2243 naccepted = 0;
2244 /* Otherwise, it is sure that the node could accept
2245 `naccepted' bytes input. */
2246 return naccepted;
2247}
2248#endif /* RE_ENABLE_I18N */
2249
2250
2251/* Functions for state transition. */
2252
2253/* Return the next state to which the current state STATE will transit by
2254 accepting the current input byte, and update STATE_LOG if necessary.
2255 If STATE can accept a multibyte char/collating element/back reference
2256 update the destination of STATE_LOG. */
2257
2258static re_dfastate_t *
2259internal_function
2260transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2261 re_dfastate_t *state)
2262{
2263 re_dfastate_t **trtable;
2264 unsigned char ch;
2265
2266#ifdef RE_ENABLE_I18N
2267 /* If the current state can accept multibyte. */
2268 if (BE (state->accept_mb, 0))
2269 {
2270 *err = transit_state_mb (mctx, state);
2271 if (BE (*err != REG_NOERROR, 0))
2272 return NULL;
2273 }
2274#endif /* RE_ENABLE_I18N */
2275
2276 /* Then decide the next state with the single byte. */
2277#if 0
2278 if (0)
2279 /* don't use transition table */
2280 return transit_state_sb (err, mctx, state);
2281#endif
2282
2283 /* Use transition table */
2284 ch = re_string_fetch_byte (&mctx->input);
2285 for (;;)
2286 {
2287 trtable = state->trtable;
2288 if (BE (trtable != NULL, 1))
2289 return trtable[ch];
2290
2291 trtable = state->word_trtable;
2292 if (BE (trtable != NULL, 1))
2293 {
2294 unsigned int context;
2295 context
2296 = re_string_context_at (&mctx->input,
2297 re_string_cur_idx (&mctx->input) - 1,
2298 mctx->eflags);
2299 if (IS_WORD_CONTEXT (context))
2300 return trtable[ch + SBC_MAX];
2301 else
2302 return trtable[ch];
2303 }
2304
2305 if (!build_trtable (mctx->dfa, state))
2306 {
2307 *err = REG_ESPACE;
2308 return NULL;
2309 }
2310
2311 /* Retry, we now have a transition table. */
2312 }
2313}
2314
2315/* Update the state_log if we need */
2316static re_dfastate_t *
2317internal_function
2318merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2319 re_dfastate_t *next_state)
2320{
2321 const re_dfa_t *const dfa = mctx->dfa;
2322 int cur_idx = re_string_cur_idx (&mctx->input);
2323
2324 if (cur_idx > mctx->state_log_top)
2325 {
2326 mctx->state_log[cur_idx] = next_state;
2327 mctx->state_log_top = cur_idx;
2328 }
2329 else if (mctx->state_log[cur_idx] == NULL)
2330 {
2331 mctx->state_log[cur_idx] = next_state;
2332 }
2333 else
2334 {
2335 re_dfastate_t *pstate;
2336 unsigned int context;
2337 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2338 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2339 the destination of a multibyte char/collating element/
2340 back reference. Then the next state is the union set of
2341 these destinations and the results of the transition table. */
2342 pstate = mctx->state_log[cur_idx];
2343 log_nodes = pstate->entrance_nodes;
2344 if (next_state != NULL)
2345 {
2346 table_nodes = next_state->entrance_nodes;
2347 *err = re_node_set_init_union (&next_nodes, table_nodes,
2348 log_nodes);
2349 if (BE (*err != REG_NOERROR, 0))
2350 return NULL;
2351 }
2352 else
2353 next_nodes = *log_nodes;
2354 /* Note: We already add the nodes of the initial state,
2355 then we don't need to add them here. */
2356
2357 context = re_string_context_at (&mctx->input,
2358 re_string_cur_idx (&mctx->input) - 1,
2359 mctx->eflags);
2360 next_state = mctx->state_log[cur_idx]
2361 = re_acquire_state_context (err, dfa, &next_nodes, context);
2362 /* We don't need to check errors here, since the return value of
2363 this function is next_state and ERR is already set. */
2364
2365 if (table_nodes != NULL)
2366 re_node_set_free (&next_nodes);
2367 }
2368
2369 if (BE (dfa->nbackref, 0) && next_state != NULL)
2370 {
2371 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2372 later. We must check them here, since the back references in the
2373 next state might use them. */
2374 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2375 cur_idx);
2376 if (BE (*err != REG_NOERROR, 0))
2377 return NULL;
2378
2379 /* If the next state has back references. */
2380 if (next_state->has_backref)
2381 {
2382 *err = transit_state_bkref (mctx, &next_state->nodes);
2383 if (BE (*err != REG_NOERROR, 0))
2384 return NULL;
2385 next_state = mctx->state_log[cur_idx];
2386 }
2387 }
2388
2389 return next_state;
2390}
2391
2392/* Skip bytes in the input that correspond to part of a
2393 multi-byte match, then look in the log for a state
2394 from which to restart matching. */
2395static re_dfastate_t *
2396internal_function
2397find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2398{
2399 re_dfastate_t *cur_state;
2400 do
2401 {
2402 int max = mctx->state_log_top;
2403 int cur_str_idx = re_string_cur_idx (&mctx->input);
2404
2405 do
2406 {
2407 if (++cur_str_idx > max)
2408 return NULL;
2409 re_string_skip_bytes (&mctx->input, 1);
2410 }
2411 while (mctx->state_log[cur_str_idx] == NULL);
2412
2413 cur_state = merge_state_with_log (err, mctx, NULL);
2414 }
2415 while (*err == REG_NOERROR && cur_state == NULL);
2416 return cur_state;
2417}
2418
2419/* Helper functions for transit_state. */
2420
2421/* From the node set CUR_NODES, pick up the nodes whose types are
2422 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2423 expression. And register them to use them later for evaluating the
2424 correspoding back references. */
2425
2426static reg_errcode_t
2427internal_function
2428check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2429 int str_idx)
2430{
2431 const re_dfa_t *const dfa = mctx->dfa;
2432 int node_idx;
2433 reg_errcode_t err;
2434
2435 /* TODO: This isn't efficient.
2436 Because there might be more than one nodes whose types are
2437 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2438 nodes.
2439 E.g. RE: (a){2} */
2440 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2441 {
2442 int node = cur_nodes->elems[node_idx];
2443 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2444 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2445 && (dfa->used_bkref_map
2446 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2447 {
2448 err = match_ctx_add_subtop (mctx, node, str_idx);
2449 if (BE (err != REG_NOERROR, 0))
2450 return err;
2451 }
2452 }
2453 return REG_NOERROR;
2454}
2455
2456#if 0
2457/* Return the next state to which the current state STATE will transit by
2458 accepting the current input byte. */
2459
2460static re_dfastate_t *
2461transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2462 re_dfastate_t *state)
2463{
2464 const re_dfa_t *const dfa = mctx->dfa;
2465 re_node_set next_nodes;
2466 re_dfastate_t *next_state;
2467 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2468 unsigned int context;
2469
2470 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2471 if (BE (*err != REG_NOERROR, 0))
2472 return NULL;
2473 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2474 {
2475 int cur_node = state->nodes.elems[node_cnt];
2476 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2477 {
2478 *err = re_node_set_merge (&next_nodes,
2479 dfa->eclosures + dfa->nexts[cur_node]);
2480 if (BE (*err != REG_NOERROR, 0))
2481 {
2482 re_node_set_free (&next_nodes);
2483 return NULL;
2484 }
2485 }
2486 }
2487 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2488 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2489 /* We don't need to check errors here, since the return value of
2490 this function is next_state and ERR is already set. */
2491
2492 re_node_set_free (&next_nodes);
2493 re_string_skip_bytes (&mctx->input, 1);
2494 return next_state;
2495}
2496#endif
2497
2498#ifdef RE_ENABLE_I18N
2499static reg_errcode_t
2500internal_function
2501transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2502{
2503 const re_dfa_t *const dfa = mctx->dfa;
2504 reg_errcode_t err;
2505 int i;
2506
2507 for (i = 0; i < pstate->nodes.nelem; ++i)
2508 {
2509 re_node_set dest_nodes, *new_nodes;
2510 int cur_node_idx = pstate->nodes.elems[i];
2511 int naccepted, dest_idx;
2512 unsigned int context;
2513 re_dfastate_t *dest_state;
2514
2515 if (!dfa->nodes[cur_node_idx].accept_mb)
2516 continue;
2517
2518 if (dfa->nodes[cur_node_idx].constraint)
2519 {
2520 context = re_string_context_at (&mctx->input,
2521 re_string_cur_idx (&mctx->input),
2522 mctx->eflags);
2523 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2524 context))
2525 continue;
2526 }
2527
2528 /* How many bytes the node can accept? */
2529 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2530 re_string_cur_idx (&mctx->input));
2531 if (naccepted == 0)
2532 continue;
2533
2534 /* The node can accepts `naccepted' bytes. */
2535 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2536 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2537 : mctx->max_mb_elem_len);
2538 err = clean_state_log_if_needed (mctx, dest_idx);
2539 if (BE (err != REG_NOERROR, 0))
2540 return err;
2541#ifdef DEBUG
2542 assert (dfa->nexts[cur_node_idx] != -1);
2543#endif
2544 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2545
2546 dest_state = mctx->state_log[dest_idx];
2547 if (dest_state == NULL)
2548 dest_nodes = *new_nodes;
2549 else
2550 {
2551 err = re_node_set_init_union (&dest_nodes,
2552 dest_state->entrance_nodes, new_nodes);
2553 if (BE (err != REG_NOERROR, 0))
2554 return err;
2555 }
2556 context = re_string_context_at (&mctx->input, dest_idx - 1,
2557 mctx->eflags);
2558 mctx->state_log[dest_idx]
2559 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2560 if (dest_state != NULL)
2561 re_node_set_free (&dest_nodes);
2562 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2563 return err;
2564 }
2565 return REG_NOERROR;
2566}
2567#endif /* RE_ENABLE_I18N */
2568
2569static reg_errcode_t
2570internal_function
2571transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2572{
2573 const re_dfa_t *const dfa = mctx->dfa;
2574 reg_errcode_t err;
2575 int i;
2576 int cur_str_idx = re_string_cur_idx (&mctx->input);
2577
2578 for (i = 0; i < nodes->nelem; ++i)
2579 {
2580 int dest_str_idx, prev_nelem, bkc_idx;
2581 int node_idx = nodes->elems[i];
2582 unsigned int context;
2583 const re_token_t *node = dfa->nodes + node_idx;
2584 re_node_set *new_dest_nodes;
2585
2586 /* Check whether `node' is a backreference or not. */
2587 if (node->type != OP_BACK_REF)
2588 continue;
2589
2590 if (node->constraint)
2591 {
2592 context = re_string_context_at (&mctx->input, cur_str_idx,
2593 mctx->eflags);
2594 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2595 continue;
2596 }
2597
2598 /* `node' is a backreference.
2599 Check the substring which the substring matched. */
2600 bkc_idx = mctx->nbkref_ents;
2601 err = get_subexp (mctx, node_idx, cur_str_idx);
2602 if (BE (err != REG_NOERROR, 0))
2603 goto free_return;
2604
2605 /* And add the epsilon closures (which is `new_dest_nodes') of
2606 the backreference to appropriate state_log. */
2607#ifdef DEBUG
2608 assert (dfa->nexts[node_idx] != -1);
2609#endif
2610 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2611 {
2612 int subexp_len;
2613 re_dfastate_t *dest_state;
2614 struct re_backref_cache_entry *bkref_ent;
2615 bkref_ent = mctx->bkref_ents + bkc_idx;
2616 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2617 continue;
2618 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2619 new_dest_nodes = (subexp_len == 0
2620 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2621 : dfa->eclosures + dfa->nexts[node_idx]);
2622 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2623 - bkref_ent->subexp_from);
2624 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2625 mctx->eflags);
2626 dest_state = mctx->state_log[dest_str_idx];
2627 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2628 : mctx->state_log[cur_str_idx]->nodes.nelem);
2629 /* Add `new_dest_node' to state_log. */
2630 if (dest_state == NULL)
2631 {
2632 mctx->state_log[dest_str_idx]
2633 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2634 context);
2635 if (BE (mctx->state_log[dest_str_idx] == NULL
2636 && err != REG_NOERROR, 0))
2637 goto free_return;
2638 }
2639 else
2640 {
2641 re_node_set dest_nodes;
2642 err = re_node_set_init_union (&dest_nodes,
2643 dest_state->entrance_nodes,
2644 new_dest_nodes);
2645 if (BE (err != REG_NOERROR, 0))
2646 {
2647 re_node_set_free (&dest_nodes);
2648 goto free_return;
2649 }
2650 mctx->state_log[dest_str_idx]
2651 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2652 re_node_set_free (&dest_nodes);
2653 if (BE (mctx->state_log[dest_str_idx] == NULL
2654 && err != REG_NOERROR, 0))
2655 goto free_return;
2656 }
2657 /* We need to check recursively if the backreference can epsilon
2658 transit. */
2659 if (subexp_len == 0
2660 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2661 {
2662 err = check_subexp_matching_top (mctx, new_dest_nodes,
2663 cur_str_idx);
2664 if (BE (err != REG_NOERROR, 0))
2665 goto free_return;
2666 err = transit_state_bkref (mctx, new_dest_nodes);
2667 if (BE (err != REG_NOERROR, 0))
2668 goto free_return;
2669 }
2670 }
2671 }
2672 err = REG_NOERROR;
2673 free_return:
2674 return err;
2675}
2676
2677/* Enumerate all the candidates which the backreference BKREF_NODE can match
2678 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2679 Note that we might collect inappropriate candidates here.
2680 However, the cost of checking them strictly here is too high, then we
2681 delay these checking for prune_impossible_nodes(). */
2682
2683static reg_errcode_t
2684internal_function
2685get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2686{
2687 const re_dfa_t *const dfa = mctx->dfa;
2688 int subexp_num, sub_top_idx;
2689 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2690 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2691 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2692 if (cache_idx != -1)
2693 {
2694 const struct re_backref_cache_entry *entry
2695 = mctx->bkref_ents + cache_idx;
2696 do
2697 if (entry->node == bkref_node)
2698 return REG_NOERROR; /* We already checked it. */
2699 while (entry++->more);
2700 }
2701
2702 subexp_num = dfa->nodes[bkref_node].opr.idx;
2703
2704 /* For each sub expression */
2705 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2706 {
2707 reg_errcode_t err;
2708 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2709 re_sub_match_last_t *sub_last;
2710 int sub_last_idx, sl_str, bkref_str_off;
2711
2712 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2713 continue; /* It isn't related. */
2714
2715 sl_str = sub_top->str_idx;
2716 bkref_str_off = bkref_str_idx;
2717 /* At first, check the last node of sub expressions we already
2718 evaluated. */
2719 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2720 {
2721 int sl_str_diff;
2722 sub_last = sub_top->lasts[sub_last_idx];
2723 sl_str_diff = sub_last->str_idx - sl_str;
2724 /* The matched string by the sub expression match with the substring
2725 at the back reference? */
2726 if (sl_str_diff > 0)
2727 {
2728 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2729 {
2730 /* Not enough chars for a successful match. */
2731 if (bkref_str_off + sl_str_diff > mctx->input.len)
2732 break;
2733
2734 err = clean_state_log_if_needed (mctx,
2735 bkref_str_off
2736 + sl_str_diff);
2737 if (BE (err != REG_NOERROR, 0))
2738 return err;
2739 buf = (const char *) re_string_get_buffer (&mctx->input);
2740 }
2741 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2742 /* We don't need to search this sub expression any more. */
2743 break;
2744 }
2745 bkref_str_off += sl_str_diff;
2746 sl_str += sl_str_diff;
2747 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2748 bkref_str_idx);
2749
2750 /* Reload buf, since the preceding call might have reallocated
2751 the buffer. */
2752 buf = (const char *) re_string_get_buffer (&mctx->input);
2753
2754 if (err == REG_NOMATCH)
2755 continue;
2756 if (BE (err != REG_NOERROR, 0))
2757 return err;
2758 }
2759
2760 if (sub_last_idx < sub_top->nlasts)
2761 continue;
2762 if (sub_last_idx > 0)
2763 ++sl_str;
2764 /* Then, search for the other last nodes of the sub expression. */
2765 for (; sl_str <= bkref_str_idx; ++sl_str)
2766 {
2767 int cls_node, sl_str_off;
2768 const re_node_set *nodes;
2769 sl_str_off = sl_str - sub_top->str_idx;
2770 /* The matched string by the sub expression match with the substring
2771 at the back reference? */
2772 if (sl_str_off > 0)
2773 {
2774 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2775 {
2776 /* If we are at the end of the input, we cannot match. */
2777 if (bkref_str_off >= mctx->input.len)
2778 break;
2779
2780 err = extend_buffers (mctx);
2781 if (BE (err != REG_NOERROR, 0))
2782 return err;
2783
2784 buf = (const char *) re_string_get_buffer (&mctx->input);
2785 }
2786 if (buf [bkref_str_off++] != buf[sl_str - 1])
2787 break; /* We don't need to search this sub expression
2788 any more. */
2789 }
2790 if (mctx->state_log[sl_str] == NULL)
2791 continue;
2792 /* Does this state have a ')' of the sub expression? */
2793 nodes = &mctx->state_log[sl_str]->nodes;
2794 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2795 OP_CLOSE_SUBEXP);
2796 if (cls_node == -1)
2797 continue; /* No. */
2798 if (sub_top->path == NULL)
2799 {
2800 sub_top->path = calloc (sizeof (state_array_t),
2801 sl_str - sub_top->str_idx + 1);
2802 if (sub_top->path == NULL)
2803 return REG_ESPACE;
2804 }
2805 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2806 in the current context? */
2807 err = check_arrival (mctx, sub_top->path, sub_top->node,
2808 sub_top->str_idx, cls_node, sl_str,
2809 OP_CLOSE_SUBEXP);
2810 if (err == REG_NOMATCH)
2811 continue;
2812 if (BE (err != REG_NOERROR, 0))
2813 return err;
2814 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2815 if (BE (sub_last == NULL, 0))
2816 return REG_ESPACE;
2817 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2818 bkref_str_idx);
2819 if (err == REG_NOMATCH)
2820 continue;
2821 }
2822 }
2823 return REG_NOERROR;
2824}
2825
2826/* Helper functions for get_subexp(). */
2827
2828/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2829 If it can arrive, register the sub expression expressed with SUB_TOP
2830 and SUB_LAST. */
2831
2832static reg_errcode_t
2833internal_function
2834get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2835 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2836{
2837 reg_errcode_t err;
2838 int to_idx;
2839 /* Can the subexpression arrive the back reference? */
2840 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2841 sub_last->str_idx, bkref_node, bkref_str,
2842 OP_OPEN_SUBEXP);
2843 if (err != REG_NOERROR)
2844 return err;
2845 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2846 sub_last->str_idx);
2847 if (BE (err != REG_NOERROR, 0))
2848 return err;
2849 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2850 return clean_state_log_if_needed (mctx, to_idx);
2851}
2852
2853/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2854 Search '(' if FL_OPEN, or search ')' otherwise.
2855 TODO: This function isn't efficient...
2856 Because there might be more than one nodes whose types are
2857 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2858 nodes.
2859 E.g. RE: (a){2} */
2860
2861static int
2862internal_function
2863find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2864 int subexp_idx, int type)
2865{
2866 int cls_idx;
2867 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2868 {
2869 int cls_node = nodes->elems[cls_idx];
2870 const re_token_t *node = dfa->nodes + cls_node;
2871 if (node->type == type
2872 && node->opr.idx == subexp_idx)
2873 return cls_node;
2874 }
2875 return -1;
2876}
2877
2878/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2879 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2880 heavily reused.
2881 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2882
2883static reg_errcode_t
2884internal_function
2885check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2886 int top_str, int last_node, int last_str, int type)
2887{
2888 const re_dfa_t *const dfa = mctx->dfa;
2889 reg_errcode_t err = REG_NOERROR;
2890 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2891 re_dfastate_t *cur_state = NULL;
2892 re_node_set *cur_nodes, next_nodes;
2893 re_dfastate_t **backup_state_log;
2894 unsigned int context;
2895
2896 subexp_num = dfa->nodes[top_node].opr.idx;
2897 /* Extend the buffer if we need. */
2898 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2899 {
2900 re_dfastate_t **new_array;
2901 int old_alloc = path->alloc;
2902 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2903 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2904 if (BE (new_array == NULL, 0))
2905 {
2906 path->alloc = old_alloc;
2907 return REG_ESPACE;
2908 }
2909 path->array = new_array;
2910 memset (new_array + old_alloc, '\0',
2911 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2912 }
2913
2914 str_idx = path->next_idx ? path->next_idx : top_str;
2915
2916 /* Temporary modify MCTX. */
2917 backup_state_log = mctx->state_log;
2918 backup_cur_idx = mctx->input.cur_idx;
2919 mctx->state_log = path->array;
2920 mctx->input.cur_idx = str_idx;
2921
2922 /* Setup initial node set. */
2923 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2924 if (str_idx == top_str)
2925 {
2926 err = re_node_set_init_1 (&next_nodes, top_node);
2927 if (BE (err != REG_NOERROR, 0))
2928 return err;
2929 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2930 if (BE (err != REG_NOERROR, 0))
2931 {
2932 re_node_set_free (&next_nodes);
2933 return err;
2934 }
2935 }
2936 else
2937 {
2938 cur_state = mctx->state_log[str_idx];
2939 if (cur_state && cur_state->has_backref)
2940 {
2941 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2942 if (BE (err != REG_NOERROR, 0))
2943 return err;
2944 }
2945 else
2946 re_node_set_init_empty (&next_nodes);
2947 }
2948 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2949 {
2950 if (next_nodes.nelem)
2951 {
2952 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2953 subexp_num, type);
2954 if (BE (err != REG_NOERROR, 0))
2955 {
2956 re_node_set_free (&next_nodes);
2957 return err;
2958 }
2959 }
2960 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2961 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2962 {
2963 re_node_set_free (&next_nodes);
2964 return err;
2965 }
2966 mctx->state_log[str_idx] = cur_state;
2967 }
2968
2969 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2970 {
2971 re_node_set_empty (&next_nodes);
2972 if (mctx->state_log[str_idx + 1])
2973 {
2974 err = re_node_set_merge (&next_nodes,
2975 &mctx->state_log[str_idx + 1]->nodes);
2976 if (BE (err != REG_NOERROR, 0))
2977 {
2978 re_node_set_free (&next_nodes);
2979 return err;
2980 }
2981 }
2982 if (cur_state)
2983 {
2984 err = check_arrival_add_next_nodes (mctx, str_idx,
2985 &cur_state->non_eps_nodes,
2986 &next_nodes);
2987 if (BE (err != REG_NOERROR, 0))
2988 {
2989 re_node_set_free (&next_nodes);
2990 return err;
2991 }
2992 }
2993 ++str_idx;
2994 if (next_nodes.nelem)
2995 {
2996 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2997 if (BE (err != REG_NOERROR, 0))
2998 {
2999 re_node_set_free (&next_nodes);
3000 return err;
3001 }
3002 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3003 subexp_num, type);
3004 if (BE (err != REG_NOERROR, 0))
3005 {
3006 re_node_set_free (&next_nodes);
3007 return err;
3008 }
3009 }
3010 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3011 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3012 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3013 {
3014 re_node_set_free (&next_nodes);
3015 return err;
3016 }
3017 mctx->state_log[str_idx] = cur_state;
3018 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3019 }
3020 re_node_set_free (&next_nodes);
3021 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3022 : &mctx->state_log[last_str]->nodes);
3023 path->next_idx = str_idx;
3024
3025 /* Fix MCTX. */
3026 mctx->state_log = backup_state_log;
3027 mctx->input.cur_idx = backup_cur_idx;
3028
3029 /* Then check the current node set has the node LAST_NODE. */
3030 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3031 return REG_NOERROR;
3032
3033 return REG_NOMATCH;
3034}
3035
3036/* Helper functions for check_arrival. */
3037
3038/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3039 to NEXT_NODES.
3040 TODO: This function is similar to the functions transit_state*(),
3041 however this function has many additional works.
3042 Can't we unify them? */
3043
3044static reg_errcode_t
3045internal_function
3046check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3047 re_node_set *cur_nodes, re_node_set *next_nodes)
3048{
3049 const re_dfa_t *const dfa = mctx->dfa;
3050 int result;
3051 int cur_idx;
3052#ifdef RE_ENABLE_I18N
3053 reg_errcode_t err = REG_NOERROR;
3054#endif
3055 re_node_set union_set;
3056 re_node_set_init_empty (&union_set);
3057 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3058 {
3059 int naccepted = 0;
3060 int cur_node = cur_nodes->elems[cur_idx];
3061#ifdef DEBUG
3062 re_token_type_t type = dfa->nodes[cur_node].type;
3063 assert (!IS_EPSILON_NODE (type));
3064#endif
3065#ifdef RE_ENABLE_I18N
3066 /* If the node may accept `multi byte'. */
3067 if (dfa->nodes[cur_node].accept_mb)
3068 {
3069 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3070 str_idx);
3071 if (naccepted > 1)
3072 {
3073 re_dfastate_t *dest_state;
3074 int next_node = dfa->nexts[cur_node];
3075 int next_idx = str_idx + naccepted;
3076 dest_state = mctx->state_log[next_idx];
3077 re_node_set_empty (&union_set);
3078 if (dest_state)
3079 {
3080 err = re_node_set_merge (&union_set, &dest_state->nodes);
3081 if (BE (err != REG_NOERROR, 0))
3082 {
3083 re_node_set_free (&union_set);
3084 return err;
3085 }
3086 }
3087 result = re_node_set_insert (&union_set, next_node);
3088 if (BE (result < 0, 0))
3089 {
3090 re_node_set_free (&union_set);
3091 return REG_ESPACE;
3092 }
3093 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3094 &union_set);
3095 if (BE (mctx->state_log[next_idx] == NULL
3096 && err != REG_NOERROR, 0))
3097 {
3098 re_node_set_free (&union_set);
3099 return err;
3100 }
3101 }
3102 }
3103#endif /* RE_ENABLE_I18N */
3104 if (naccepted
3105 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3106 {
3107 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3108 if (BE (result < 0, 0))
3109 {
3110 re_node_set_free (&union_set);
3111 return REG_ESPACE;
3112 }
3113 }
3114 }
3115 re_node_set_free (&union_set);
3116 return REG_NOERROR;
3117}
3118
3119/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3120 CUR_NODES, however exclude the nodes which are:
3121 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3122 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3123*/
3124
3125static reg_errcode_t
3126internal_function
3127check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3128 int ex_subexp, int type)
3129{
3130 reg_errcode_t err;
3131 int idx, outside_node;
3132 re_node_set new_nodes;
3133#ifdef DEBUG
3134 assert (cur_nodes->nelem);
3135#endif
3136 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3137 if (BE (err != REG_NOERROR, 0))
3138 return err;
3139 /* Create a new node set NEW_NODES with the nodes which are epsilon
3140 closures of the node in CUR_NODES. */
3141
3142 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3143 {
3144 int cur_node = cur_nodes->elems[idx];
3145 const re_node_set *eclosure = dfa->eclosures + cur_node;
3146 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3147 if (outside_node == -1)
3148 {
3149 /* There are no problematic nodes, just merge them. */
3150 err = re_node_set_merge (&new_nodes, eclosure);
3151 if (BE (err != REG_NOERROR, 0))
3152 {
3153 re_node_set_free (&new_nodes);
3154 return err;
3155 }
3156 }
3157 else
3158 {
3159 /* There are problematic nodes, re-calculate incrementally. */
3160 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3161 ex_subexp, type);
3162 if (BE (err != REG_NOERROR, 0))
3163 {
3164 re_node_set_free (&new_nodes);
3165 return err;
3166 }
3167 }
3168 }
3169 re_node_set_free (cur_nodes);
3170 *cur_nodes = new_nodes;
3171 return REG_NOERROR;
3172}
3173
3174/* Helper function for check_arrival_expand_ecl.
3175 Check incrementally the epsilon closure of TARGET, and if it isn't
3176 problematic append it to DST_NODES. */
3177
3178static reg_errcode_t
3179internal_function
3180check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3181 int target, int ex_subexp, int type)
3182{
3183 int cur_node;
3184 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3185 {
3186 int err;
3187
3188 if (dfa->nodes[cur_node].type == type
3189 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3190 {
3191 if (type == OP_CLOSE_SUBEXP)
3192 {
3193 err = re_node_set_insert (dst_nodes, cur_node);
3194 if (BE (err == -1, 0))
3195 return REG_ESPACE;
3196 }
3197 break;
3198 }
3199 err = re_node_set_insert (dst_nodes, cur_node);
3200 if (BE (err == -1, 0))
3201 return REG_ESPACE;
3202 if (dfa->edests[cur_node].nelem == 0)
3203 break;
3204 if (dfa->edests[cur_node].nelem == 2)
3205 {
3206 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3207 dfa->edests[cur_node].elems[1],
3208 ex_subexp, type);
3209 if (BE (err != REG_NOERROR, 0))
3210 return err;
3211 }
3212 cur_node = dfa->edests[cur_node].elems[0];
3213 }
3214 return REG_NOERROR;
3215}
3216
3217
3218/* For all the back references in the current state, calculate the
3219 destination of the back references by the appropriate entry
3220 in MCTX->BKREF_ENTS. */
3221
3222static reg_errcode_t
3223internal_function
3224expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3225 int cur_str, int subexp_num, int type)
3226{
3227 const re_dfa_t *const dfa = mctx->dfa;
3228 reg_errcode_t err;
3229 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3230 struct re_backref_cache_entry *ent;
3231
3232 if (cache_idx_start == -1)
3233 return REG_NOERROR;
3234
3235 restart:
3236 ent = mctx->bkref_ents + cache_idx_start;
3237 do
3238 {
3239 int to_idx, next_node;
3240
3241 /* Is this entry ENT is appropriate? */
3242 if (!re_node_set_contains (cur_nodes, ent->node))
3243 continue; /* No. */
3244
3245 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3246 /* Calculate the destination of the back reference, and append it
3247 to MCTX->STATE_LOG. */
3248 if (to_idx == cur_str)
3249 {
3250 /* The backreference did epsilon transit, we must re-check all the
3251 node in the current state. */
3252 re_node_set new_dests;
3253 reg_errcode_t err2, err3;
3254 next_node = dfa->edests[ent->node].elems[0];
3255 if (re_node_set_contains (cur_nodes, next_node))
3256 continue;
3257 err = re_node_set_init_1 (&new_dests, next_node);
3258 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3259 err3 = re_node_set_merge (cur_nodes, &new_dests);
3260 re_node_set_free (&new_dests);
3261 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3262 || err3 != REG_NOERROR, 0))
3263 {
3264 err = (err != REG_NOERROR ? err
3265 : (err2 != REG_NOERROR ? err2 : err3));
3266 return err;
3267 }
3268 /* TODO: It is still inefficient... */
3269 goto restart;
3270 }
3271 else
3272 {
3273 re_node_set union_set;
3274 next_node = dfa->nexts[ent->node];
3275 if (mctx->state_log[to_idx])
3276 {
3277 int ret;
3278 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3279 next_node))
3280 continue;
3281 err = re_node_set_init_copy (&union_set,
3282 &mctx->state_log[to_idx]->nodes);
3283 ret = re_node_set_insert (&union_set, next_node);
3284 if (BE (err != REG_NOERROR || ret < 0, 0))
3285 {
3286 re_node_set_free (&union_set);
3287 err = err != REG_NOERROR ? err : REG_ESPACE;
3288 return err;
3289 }
3290 }
3291 else
3292 {
3293 err = re_node_set_init_1 (&union_set, next_node);
3294 if (BE (err != REG_NOERROR, 0))
3295 return err;
3296 }
3297 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3298 re_node_set_free (&union_set);
3299 if (BE (mctx->state_log[to_idx] == NULL
3300 && err != REG_NOERROR, 0))
3301 return err;
3302 }
3303 }
3304 while (ent++->more);
3305 return REG_NOERROR;
3306}
3307
3308/* Build transition table for the state.
3309 Return 1 if succeeded, otherwise return NULL. */
3310
3311static int
3312internal_function
3313build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3314{
3315 reg_errcode_t err;
3316 int i, j, ch, need_word_trtable = 0;
3317 bitset_word_t elem, mask;
3318 bool dests_node_malloced = false;
3319 bool dest_states_malloced = false;
3320 int ndests; /* Number of the destination states from `state'. */
3321 re_dfastate_t **trtable;
3322 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3323 re_node_set follows, *dests_node;
3324 bitset_t *dests_ch;
3325 bitset_t acceptable;
3326
3327 struct dests_alloc
3328 {
3329 re_node_set dests_node[SBC_MAX];
3330 bitset_t dests_ch[SBC_MAX];
3331 } *dests_alloc;
3332
3333 /* We build DFA states which corresponds to the destination nodes
3334 from `state'. `dests_node[i]' represents the nodes which i-th
3335 destination state contains, and `dests_ch[i]' represents the
3336 characters which i-th destination state accepts. */
3337#ifdef HAVE_ALLOCA
3338 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3339 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3340 else
3341#endif
3342 {
3343 dests_alloc = re_malloc (struct dests_alloc, 1);
3344 if (BE (dests_alloc == NULL, 0))
3345 return 0;
3346 dests_node_malloced = true;
3347 }
3348 dests_node = dests_alloc->dests_node;
3349 dests_ch = dests_alloc->dests_ch;
3350
3351 /* Initialize transiton table. */
3352 state->word_trtable = state->trtable = NULL;
3353
3354 /* At first, group all nodes belonging to `state' into several
3355 destinations. */
3356 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3357 if (BE (ndests <= 0, 0))
3358 {
3359 if (dests_node_malloced)
3360 free (dests_alloc);
3361 /* Return 0 in case of an error, 1 otherwise. */
3362 if (ndests == 0)
3363 {
3364 state->trtable = (re_dfastate_t **)
3365 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3366 return 1;
3367 }
3368 return 0;
3369 }
3370
3371 err = re_node_set_alloc (&follows, ndests + 1);
3372 if (BE (err != REG_NOERROR, 0))
3373 goto out_free;
3374
3375 /* Avoid arithmetic overflow in size calculation. */
3376 if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3377 / (3 * sizeof (re_dfastate_t *)))
3378 < ndests),
3379 0))
3380 goto out_free;
3381
3382#ifdef HAVE_ALLOCA
3383 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3384 + ndests * 3 * sizeof (re_dfastate_t *)))
3385 dest_states = (re_dfastate_t **)
3386 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3387 else
3388#endif
3389 {
3390 dest_states = (re_dfastate_t **)
3391 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3392 if (BE (dest_states == NULL, 0))
3393 {
3394out_free:
3395 if (dest_states_malloced)
3396 free (dest_states);
3397 re_node_set_free (&follows);
3398 for (i = 0; i < ndests; ++i)
3399 re_node_set_free (dests_node + i);
3400 if (dests_node_malloced)
3401 free (dests_alloc);
3402 return 0;
3403 }
3404 dest_states_malloced = true;
3405 }
3406 dest_states_word = dest_states + ndests;
3407 dest_states_nl = dest_states_word + ndests;
3408 bitset_empty (acceptable);
3409
3410 /* Then build the states for all destinations. */
3411 for (i = 0; i < ndests; ++i)
3412 {
3413 int next_node;
3414 re_node_set_empty (&follows);
3415 /* Merge the follows of this destination states. */
3416 for (j = 0; j < dests_node[i].nelem; ++j)
3417 {
3418 next_node = dfa->nexts[dests_node[i].elems[j]];
3419 if (next_node != -1)
3420 {
3421 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3422 if (BE (err != REG_NOERROR, 0))
3423 goto out_free;
3424 }
3425 }
3426 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3427 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3428 goto out_free;
3429 /* If the new state has context constraint,
3430 build appropriate states for these contexts. */
3431 if (dest_states[i]->has_constraint)
3432 {
3433 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3434 CONTEXT_WORD);
3435 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3436 goto out_free;
3437
3438 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3439 need_word_trtable = 1;
3440
3441 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3442 CONTEXT_NEWLINE);
3443 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3444 goto out_free;
3445 }
3446 else
3447 {
3448 dest_states_word[i] = dest_states[i];
3449 dest_states_nl[i] = dest_states[i];
3450 }
3451 bitset_merge (acceptable, dests_ch[i]);
3452 }
3453
3454 if (!BE (need_word_trtable, 0))
3455 {
3456 /* We don't care about whether the following character is a word
3457 character, or we are in a single-byte character set so we can
3458 discern by looking at the character code: allocate a
3459 256-entry transition table. */
3460 trtable = state->trtable =
3461 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3462 if (BE (trtable == NULL, 0))
3463 goto out_free;
3464
3465 /* For all characters ch...: */
3466 for (i = 0; i < BITSET_WORDS; ++i)
3467 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3468 elem;
3469 mask <<= 1, elem >>= 1, ++ch)
3470 if (BE (elem & 1, 0))
3471 {
3472 /* There must be exactly one destination which accepts
3473 character ch. See group_nodes_into_DFAstates. */
3474 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3475 ;
3476
3477 /* j-th destination accepts the word character ch. */
3478 if (dfa->word_char[i] & mask)
3479 trtable[ch] = dest_states_word[j];
3480 else
3481 trtable[ch] = dest_states[j];
3482 }
3483 }
3484 else
3485 {
3486 /* We care about whether the following character is a word
3487 character, and we are in a multi-byte character set: discern
3488 by looking at the character code: build two 256-entry
3489 transition tables, one starting at trtable[0] and one
3490 starting at trtable[SBC_MAX]. */
3491 trtable = state->word_trtable =
3492 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3493 if (BE (trtable == NULL, 0))
3494 goto out_free;
3495
3496 /* For all characters ch...: */
3497 for (i = 0; i < BITSET_WORDS; ++i)
3498 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3499 elem;
3500 mask <<= 1, elem >>= 1, ++ch)
3501 if (BE (elem & 1, 0))
3502 {
3503 /* There must be exactly one destination which accepts
3504 character ch. See group_nodes_into_DFAstates. */
3505 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3506 ;
3507
3508 /* j-th destination accepts the word character ch. */
3509 trtable[ch] = dest_states[j];
3510 trtable[ch + SBC_MAX] = dest_states_word[j];
3511 }
3512 }
3513
3514 /* new line */
3515 if (bitset_contain (acceptable, NEWLINE_CHAR))
3516 {
3517 /* The current state accepts newline character. */
3518 for (j = 0; j < ndests; ++j)
3519 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3520 {
3521 /* k-th destination accepts newline character. */
3522 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3523 if (need_word_trtable)
3524 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3525 /* There must be only one destination which accepts
3526 newline. See group_nodes_into_DFAstates. */
3527 break;
3528 }
3529 }
3530
3531 if (dest_states_malloced)
3532 free (dest_states);
3533
3534 re_node_set_free (&follows);
3535 for (i = 0; i < ndests; ++i)
3536 re_node_set_free (dests_node + i);
3537
3538 if (dests_node_malloced)
3539 free (dests_alloc);
3540
3541 return 1;
3542}
3543
3544/* Group all nodes belonging to STATE into several destinations.
3545 Then for all destinations, set the nodes belonging to the destination
3546 to DESTS_NODE[i] and set the characters accepted by the destination
3547 to DEST_CH[i]. This function return the number of destinations. */
3548
3549static int
3550internal_function
3551group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3552 re_node_set *dests_node, bitset_t *dests_ch)
3553{
3554 reg_errcode_t err;
3555 int result;
3556 int i, j, k;
3557 int ndests; /* Number of the destinations from `state'. */
3558 bitset_t accepts; /* Characters a node can accept. */
3559 const re_node_set *cur_nodes = &state->nodes;
3560 bitset_empty (accepts);
3561 ndests = 0;
3562
3563 /* For all the nodes belonging to `state', */
3564 for (i = 0; i < cur_nodes->nelem; ++i)
3565 {
3566 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3567 re_token_type_t type = node->type;
3568 unsigned int constraint = node->constraint;
3569
3570 /* Enumerate all single byte character this node can accept. */
3571 if (type == CHARACTER)
3572 bitset_set (accepts, node->opr.c);
3573 else if (type == SIMPLE_BRACKET)
3574 {
3575 bitset_merge (accepts, node->opr.sbcset);
3576 }
3577 else if (type == OP_PERIOD)
3578 {
3579#ifdef RE_ENABLE_I18N
3580 if (dfa->mb_cur_max > 1)
3581 bitset_merge (accepts, dfa->sb_char);
3582 else
3583#endif
3584 bitset_set_all (accepts);
3585 if (!(dfa->syntax & RE_DOT_NEWLINE))
3586 bitset_clear (accepts, '\n');
3587 if (dfa->syntax & RE_DOT_NOT_NULL)
3588 bitset_clear (accepts, '\0');
3589 }
3590#ifdef RE_ENABLE_I18N
3591 else if (type == OP_UTF8_PERIOD)
3592 {
3593 memset (accepts, '\xff', sizeof (bitset_t) / 2);
3594 if (!(dfa->syntax & RE_DOT_NEWLINE))
3595 bitset_clear (accepts, '\n');
3596 if (dfa->syntax & RE_DOT_NOT_NULL)
3597 bitset_clear (accepts, '\0');
3598 }
3599#endif
3600 else
3601 continue;
3602
3603 /* Check the `accepts' and sift the characters which are not
3604 match it the context. */
3605 if (constraint)
3606 {
3607 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3608 {
3609 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3610 bitset_empty (accepts);
3611 if (accepts_newline)
3612 bitset_set (accepts, NEWLINE_CHAR);
3613 else
3614 continue;
3615 }
3616 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3617 {
3618 bitset_empty (accepts);
3619 continue;
3620 }
3621
3622 if (constraint & NEXT_WORD_CONSTRAINT)
3623 {
3624 bitset_word_t any_set = 0;
3625 if (type == CHARACTER && !node->word_char)
3626 {
3627 bitset_empty (accepts);
3628 continue;
3629 }
3630#ifdef RE_ENABLE_I18N
3631 if (dfa->mb_cur_max > 1)
3632 for (j = 0; j < BITSET_WORDS; ++j)
3633 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3634 else
3635#endif
3636 for (j = 0; j < BITSET_WORDS; ++j)
3637 any_set |= (accepts[j] &= dfa->word_char[j]);
3638 if (!any_set)
3639 continue;
3640 }
3641 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3642 {
3643 bitset_word_t any_set = 0;
3644 if (type == CHARACTER && node->word_char)
3645 {
3646 bitset_empty (accepts);
3647 continue;
3648 }
3649#ifdef RE_ENABLE_I18N
3650 if (dfa->mb_cur_max > 1)
3651 for (j = 0; j < BITSET_WORDS; ++j)
3652 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3653 else
3654#endif
3655 for (j = 0; j < BITSET_WORDS; ++j)
3656 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3657 if (!any_set)
3658 continue;
3659 }
3660 }
3661
3662 /* Then divide `accepts' into DFA states, or create a new
3663 state. Above, we make sure that accepts is not empty. */
3664 for (j = 0; j < ndests; ++j)
3665 {
3666 bitset_t intersec; /* Intersection sets, see below. */
3667 bitset_t remains;
3668 /* Flags, see below. */
3669 bitset_word_t has_intersec, not_subset, not_consumed;
3670
3671 /* Optimization, skip if this state doesn't accept the character. */
3672 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3673 continue;
3674
3675 /* Enumerate the intersection set of this state and `accepts'. */
3676 has_intersec = 0;
3677 for (k = 0; k < BITSET_WORDS; ++k)
3678 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3679 /* And skip if the intersection set is empty. */
3680 if (!has_intersec)
3681 continue;
3682
3683 /* Then check if this state is a subset of `accepts'. */
3684 not_subset = not_consumed = 0;
3685 for (k = 0; k < BITSET_WORDS; ++k)
3686 {
3687 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3688 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3689 }
3690
3691 /* If this state isn't a subset of `accepts', create a
3692 new group state, which has the `remains'. */
3693 if (not_subset)
3694 {
3695 bitset_copy (dests_ch[ndests], remains);
3696 bitset_copy (dests_ch[j], intersec);
3697 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3698 if (BE (err != REG_NOERROR, 0))
3699 goto error_return;
3700 ++ndests;
3701 }
3702
3703 /* Put the position in the current group. */
3704 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3705 if (BE (result < 0, 0))
3706 goto error_return;
3707
3708 /* If all characters are consumed, go to next node. */
3709 if (!not_consumed)
3710 break;
3711 }
3712 /* Some characters remain, create a new group. */
3713 if (j == ndests)
3714 {
3715 bitset_copy (dests_ch[ndests], accepts);
3716 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3717 if (BE (err != REG_NOERROR, 0))
3718 goto error_return;
3719 ++ndests;
3720 bitset_empty (accepts);
3721 }
3722 }
3723 return ndests;
3724 error_return:
3725 for (j = 0; j < ndests; ++j)
3726 re_node_set_free (dests_node + j);
3727 return -1;
3728}
3729
3730#ifdef RE_ENABLE_I18N
3731/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3732 Return the number of the bytes the node accepts.
3733 STR_IDX is the current index of the input string.
3734
3735 This function handles the nodes which can accept one character, or
3736 one collating element like '.', '[a-z]', opposite to the other nodes
3737 can only accept one byte. */
3738
3739static int
3740internal_function
3741check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3742 const re_string_t *input, int str_idx)
3743{
3744 const re_token_t *node = dfa->nodes + node_idx;
3745 int char_len, elem_len;
3746 int i;
3747 wint_t wc;
3748
3749 if (BE (node->type == OP_UTF8_PERIOD, 0))
3750 {
3751 unsigned char c = re_string_byte_at (input, str_idx), d;
3752 if (BE (c < 0xc2, 1))
3753 return 0;
3754
3755 if (str_idx + 2 > input->len)
3756 return 0;
3757
3758 d = re_string_byte_at (input, str_idx + 1);
3759 if (c < 0xe0)
3760 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3761 else if (c < 0xf0)
3762 {
3763 char_len = 3;
3764 if (c == 0xe0 && d < 0xa0)
3765 return 0;
3766 }
3767 else if (c < 0xf8)
3768 {
3769 char_len = 4;
3770 if (c == 0xf0 && d < 0x90)
3771 return 0;
3772 }
3773 else if (c < 0xfc)
3774 {
3775 char_len = 5;
3776 if (c == 0xf8 && d < 0x88)
3777 return 0;
3778 }
3779 else if (c < 0xfe)
3780 {
3781 char_len = 6;
3782 if (c == 0xfc && d < 0x84)
3783 return 0;
3784 }
3785 else
3786 return 0;
3787
3788 if (str_idx + char_len > input->len)
3789 return 0;
3790
3791 for (i = 1; i < char_len; ++i)
3792 {
3793 d = re_string_byte_at (input, str_idx + i);
3794 if (d < 0x80 || d > 0xbf)
3795 return 0;
3796 }
3797 return char_len;
3798 }
3799
3800 char_len = re_string_char_size_at (input, str_idx);
3801 if (node->type == OP_PERIOD)
3802 {
3803 if (char_len <= 1)
3804 return 0;
3805 /* FIXME: I don't think this if is needed, as both '\n'
3806 and '\0' are char_len == 1. */
3807 /* '.' accepts any one character except the following two cases. */
3808 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3809 re_string_byte_at (input, str_idx) == '\n') ||
3810 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3811 re_string_byte_at (input, str_idx) == '\0'))
3812 return 0;
3813 return char_len;
3814 }
3815
3816 elem_len = re_string_elem_size_at (input, str_idx);
3817 wc = __btowc(*(input->mbs+str_idx));
3818 if (((elem_len <= 1 && char_len <= 1) || char_len == 0) && (wc != WEOF && wc < SBC_MAX))
3819 return 0;
3820
3821 if (node->type == COMPLEX_BRACKET)
3822 {
3823 const re_charset_t *cset = node->opr.mbcset;
3824# ifdef _LIBC
3825 const unsigned char *pin
3826 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3827 int j;
3828 uint32_t nrules;
3829# endif /* _LIBC */
3830 int match_len = 0;
3831 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3832 ? re_string_wchar_at (input, str_idx) : 0);
3833
3834 /* match with multibyte character? */
3835 for (i = 0; i < cset->nmbchars; ++i)
3836 if (wc == cset->mbchars[i])
3837 {
3838 match_len = char_len;
3839 goto check_node_accept_bytes_match;
3840 }
3841 /* match with character_class? */
3842 for (i = 0; i < cset->nchar_classes; ++i)
3843 {
3844 wctype_t wt = cset->char_classes[i];
3845 if (__iswctype (wc, wt))
3846 {
3847 match_len = char_len;
3848 goto check_node_accept_bytes_match;
3849 }
3850 }
3851
3852# ifdef _LIBC
3853 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3854 if (nrules != 0)
3855 {
3856 unsigned int in_collseq = 0;
3857 const int32_t *table, *indirect;
3858 const unsigned char *weights, *extra;
3859 const char *collseqwc;
3860 /* This #include defines a local function! */
3861# include <locale/weight.h>
3862
3863 /* match with collating_symbol? */
3864 if (cset->ncoll_syms)
3865 extra = (const unsigned char *)
3866 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3867 for (i = 0; i < cset->ncoll_syms; ++i)
3868 {
3869 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3870 /* Compare the length of input collating element and
3871 the length of current collating element. */
3872 if (*coll_sym != elem_len)
3873 continue;
3874 /* Compare each bytes. */
3875 for (j = 0; j < *coll_sym; j++)
3876 if (pin[j] != coll_sym[1 + j])
3877 break;
3878 if (j == *coll_sym)
3879 {
3880 /* Match if every bytes is equal. */
3881 match_len = j;
3882 goto check_node_accept_bytes_match;
3883 }
3884 }
3885
3886 if (cset->nranges)
3887 {
3888 if (elem_len <= char_len)
3889 {
3890 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3891 in_collseq = __collseq_table_lookup (collseqwc, wc);
3892 }
3893 else
3894 in_collseq = find_collation_sequence_value (pin, elem_len);
3895 }
3896 /* match with range expression? */
3897 for (i = 0; i < cset->nranges; ++i)
3898 if (cset->range_starts[i] <= in_collseq
3899 && in_collseq <= cset->range_ends[i])
3900 {
3901 match_len = elem_len;
3902 goto check_node_accept_bytes_match;
3903 }
3904
3905 /* match with equivalence_class? */
3906 if (cset->nequiv_classes)
3907 {
3908 const unsigned char *cp = pin;
3909 table = (const int32_t *)
3910 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3911 weights = (const unsigned char *)
3912 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3913 extra = (const unsigned char *)
3914 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3915 indirect = (const int32_t *)
3916 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3917 int32_t idx = findidx (&cp);
3918 if (idx > 0)
3919 for (i = 0; i < cset->nequiv_classes; ++i)
3920 {
3921 int32_t equiv_class_idx = cset->equiv_classes[i];
3922 size_t weight_len = weights[idx & 0xffffff];
3923 if (weight_len == weights[equiv_class_idx & 0xffffff]
3924 && (idx >> 24) == (equiv_class_idx >> 24))
3925 {
3926 int cnt = 0;
3927
3928 idx &= 0xffffff;
3929 equiv_class_idx &= 0xffffff;
3930
3931 while (cnt <= weight_len
3932 && (weights[equiv_class_idx + 1 + cnt]
3933 == weights[idx + 1 + cnt]))
3934 ++cnt;
3935 if (cnt > weight_len)
3936 {
3937 match_len = elem_len;
3938 goto check_node_accept_bytes_match;
3939 }
3940 }
3941 }
3942 }
3943 }
3944 else
3945# endif /* _LIBC */
3946 {
3947 /* match with range expression? */
3948#if __GNUC__ >= 2
3949 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3950#else
3951 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3952 cmp_buf[2] = wc;
3953#endif
3954 for (i = 0; i < cset->nranges; ++i)
3955 {
3956 cmp_buf[0] = cset->range_starts[i];
3957 cmp_buf[4] = cset->range_ends[i];
3958 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3959 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3960 {
3961 match_len = char_len;
3962 goto check_node_accept_bytes_match;
3963 }
3964 }
3965 }
3966 check_node_accept_bytes_match:
3967 if (!cset->non_match)
3968 return match_len;
3969 else
3970 {
3971 if (match_len > 0)
3972 return 0;
3973 else
3974 return (elem_len > char_len) ? elem_len : char_len;
3975 }
3976 }
3977 return 0;
3978}
3979
3980# ifdef _LIBC
3981static unsigned int
3982internal_function
3983find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3984{
3985 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3986 if (nrules == 0)
3987 {
3988 if (mbs_len == 1)
3989 {
3990 /* No valid character. Match it as a single byte character. */
3991 const unsigned char *collseq = (const unsigned char *)
3992 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3993 return collseq[mbs[0]];
3994 }
3995 return UINT_MAX;
3996 }
3997 else
3998 {
3999 int32_t idx;
4000 const unsigned char *extra = (const unsigned char *)
4001 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4002 int32_t extrasize = (const unsigned char *)
4003 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4004
4005 for (idx = 0; idx < extrasize;)
4006 {
4007 int mbs_cnt, found = 0;
4008 int32_t elem_mbs_len;
4009 /* Skip the name of collating element name. */
4010 idx = idx + extra[idx] + 1;
4011 elem_mbs_len = extra[idx++];
4012 if (mbs_len == elem_mbs_len)
4013 {
4014 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4015 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4016 break;
4017 if (mbs_cnt == elem_mbs_len)
4018 /* Found the entry. */
4019 found = 1;
4020 }
4021 /* Skip the byte sequence of the collating element. */
4022 idx += elem_mbs_len;
4023 /* Adjust for the alignment. */
4024 idx = (idx + 3) & ~3;
4025 /* Skip the collation sequence value. */
4026 idx += sizeof (uint32_t);
4027 /* Skip the wide char sequence of the collating element. */
4028 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4029 /* If we found the entry, return the sequence value. */
4030 if (found)
4031 return *(uint32_t *) (extra + idx);
4032 /* Skip the collation sequence value. */
4033 idx += sizeof (uint32_t);
4034 }
4035 return UINT_MAX;
4036 }
4037}
4038# endif /* _LIBC */
4039#endif /* RE_ENABLE_I18N */
4040
4041/* Check whether the node accepts the byte which is IDX-th
4042 byte of the INPUT. */
4043
4044static int
4045internal_function
4046check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4047 int idx)
4048{
4049 unsigned char ch;
4050 ch = re_string_byte_at (&mctx->input, idx);
4051 switch (node->type)
4052 {
4053 case CHARACTER:
4054 if (node->opr.c != ch)
4055 return 0;
4056 break;
4057
4058 case SIMPLE_BRACKET:
4059 if (!bitset_contain (node->opr.sbcset, ch))
4060 return 0;
4061 break;
4062
4063#ifdef RE_ENABLE_I18N
4064 case OP_UTF8_PERIOD:
4065 if (ch >= 0x80)
4066 return 0;
4067 /* FALLTHROUGH */
4068#endif
4069 case OP_PERIOD:
4070 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4071 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4072 return 0;
4073 break;
4074
4075 default:
4076 return 0;
4077 }
4078
4079 if (node->constraint)
4080 {
4081 /* The node has constraints. Check whether the current context
4082 satisfies the constraints. */
4083 unsigned int context = re_string_context_at (&mctx->input, idx,
4084 mctx->eflags);
4085 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4086 return 0;
4087 }
4088
4089 return 1;
4090}
4091
4092/* Extend the buffers, if the buffers have run out. */
4093
4094static reg_errcode_t
4095internal_function
4096extend_buffers (re_match_context_t *mctx)
4097{
4098 reg_errcode_t ret;
4099 re_string_t *pstr = &mctx->input;
4100
4101 /* Avoid overflow. */
4102 if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4103 return REG_ESPACE;
4104
4105 /* Double the lengthes of the buffers. */
4106 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4107 if (BE (ret != REG_NOERROR, 0))
4108 return ret;
4109
4110 if (mctx->state_log != NULL)
4111 {
4112 /* And double the length of state_log. */
4113 /* XXX We have no indication of the size of this buffer. If this
4114 allocation fail we have no indication that the state_log array
4115 does not have the right size. */
4116 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4117 pstr->bufs_len + 1);
4118 if (BE (new_array == NULL, 0))
4119 return REG_ESPACE;
4120 mctx->state_log = new_array;
4121 }
4122
4123 /* Then reconstruct the buffers. */
4124 if (pstr->icase)
4125 {
4126#ifdef RE_ENABLE_I18N
4127 if (pstr->mb_cur_max > 1)
4128 {
4129 ret = build_wcs_upper_buffer (pstr);
4130 if (BE (ret != REG_NOERROR, 0))
4131 return ret;
4132 }
4133 else
4134#endif /* RE_ENABLE_I18N */
4135 build_upper_buffer (pstr);
4136 }
4137 else
4138 {
4139#ifdef RE_ENABLE_I18N
4140 if (pstr->mb_cur_max > 1)
4141 build_wcs_buffer (pstr);
4142 else
4143#endif /* RE_ENABLE_I18N */
4144 {
4145 if (pstr->trans != NULL)
4146 re_string_translate_buffer (pstr);
4147 }
4148 }
4149 return REG_NOERROR;
4150}
4151
4152
4153/* Functions for matching context. */
4154
4155/* Initialize MCTX. */
4156
4157static reg_errcode_t
4158internal_function
4159match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4160{
4161 mctx->eflags = eflags;
4162 mctx->match_last = -1;
4163 if (n > 0)
4164 {
4165 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4166 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4167 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4168 return REG_ESPACE;
4169 }
4170 /* Already zero-ed by the caller.
4171 else
4172 mctx->bkref_ents = NULL;
4173 mctx->nbkref_ents = 0;
4174 mctx->nsub_tops = 0; */
4175 mctx->abkref_ents = n;
4176 mctx->max_mb_elem_len = 1;
4177 mctx->asub_tops = n;
4178 return REG_NOERROR;
4179}
4180
4181/* Clean the entries which depend on the current input in MCTX.
4182 This function must be invoked when the matcher changes the start index
4183 of the input, or changes the input string. */
4184
4185static void
4186internal_function
4187match_ctx_clean (re_match_context_t *mctx)
4188{
4189 int st_idx;
4190 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4191 {
4192 int sl_idx;
4193 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4194 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4195 {
4196 re_sub_match_last_t *last = top->lasts[sl_idx];
4197 re_free (last->path.array);
4198 re_free (last);
4199 }
4200 re_free (top->lasts);
4201 if (top->path)
4202 {
4203 re_free (top->path->array);
4204 re_free (top->path);
4205 }
4206 free (top);
4207 }
4208
4209 mctx->nsub_tops = 0;
4210 mctx->nbkref_ents = 0;
4211}
4212
4213/* Free all the memory associated with MCTX. */
4214
4215static void
4216internal_function
4217match_ctx_free (re_match_context_t *mctx)
4218{
4219 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4220 match_ctx_clean (mctx);
4221 re_free (mctx->sub_tops);
4222 re_free (mctx->bkref_ents);
4223}
4224
4225/* Add a new backreference entry to MCTX.
4226 Note that we assume that caller never call this function with duplicate
4227 entry, and call with STR_IDX which isn't smaller than any existing entry.
4228*/
4229
4230static reg_errcode_t
4231internal_function
4232match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4233 int to)
4234{
4235 if (mctx->nbkref_ents >= mctx->abkref_ents)
4236 {
4237 struct re_backref_cache_entry* new_entry;
4238 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4239 mctx->abkref_ents * 2);
4240 if (BE (new_entry == NULL, 0))
4241 {
4242 re_free (mctx->bkref_ents);
4243 return REG_ESPACE;
4244 }
4245 mctx->bkref_ents = new_entry;
4246 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4247 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4248 mctx->abkref_ents *= 2;
4249 }
4250 if (mctx->nbkref_ents > 0
4251 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4252 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4253
4254 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4255 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4256 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4257 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4258
4259 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4260 If bit N is clear, means that this entry won't epsilon-transition to
4261 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4262 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4263 such node.
4264
4265 A backreference does not epsilon-transition unless it is empty, so set
4266 to all zeros if FROM != TO. */
4267 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4268 = (from == to ? ~0 : 0);
4269
4270 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4271 if (mctx->max_mb_elem_len < to - from)
4272 mctx->max_mb_elem_len = to - from;
4273 return REG_NOERROR;
4274}
4275
4276/* Search for the first entry which has the same str_idx, or -1 if none is
4277 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4278
4279static int
4280internal_function
4281search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4282{
4283 int left, right, mid, last;
4284 last = right = mctx->nbkref_ents;
4285 for (left = 0; left < right;)
4286 {
4287 mid = (left + right) / 2;
4288 if (mctx->bkref_ents[mid].str_idx < str_idx)
4289 left = mid + 1;
4290 else
4291 right = mid;
4292 }
4293 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4294 return left;
4295 else
4296 return -1;
4297}
4298
4299/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4300 at STR_IDX. */
4301
4302static reg_errcode_t
4303internal_function
4304match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4305{
4306#ifdef DEBUG
4307 assert (mctx->sub_tops != NULL);
4308 assert (mctx->asub_tops > 0);
4309#endif
4310 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4311 {
4312 int new_asub_tops = mctx->asub_tops * 2;
4313 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4314 re_sub_match_top_t *,
4315 new_asub_tops);
4316 if (BE (new_array == NULL, 0))
4317 return REG_ESPACE;
4318 mctx->sub_tops = new_array;
4319 mctx->asub_tops = new_asub_tops;
4320 }
4321 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4322 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4323 return REG_ESPACE;
4324 mctx->sub_tops[mctx->nsub_tops]->node = node;
4325 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4326 return REG_NOERROR;
4327}
4328
4329/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4330 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4331
4332static re_sub_match_last_t *
4333internal_function
4334match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4335{
4336 re_sub_match_last_t *new_entry;
4337 if (BE (subtop->nlasts == subtop->alasts, 0))
4338 {
4339 int new_alasts = 2 * subtop->alasts + 1;
4340 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4341 re_sub_match_last_t *,
4342 new_alasts);
4343 if (BE (new_array == NULL, 0))
4344 return NULL;
4345 subtop->lasts = new_array;
4346 subtop->alasts = new_alasts;
4347 }
4348 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4349 if (BE (new_entry != NULL, 1))
4350 {
4351 subtop->lasts[subtop->nlasts] = new_entry;
4352 new_entry->node = node;
4353 new_entry->str_idx = str_idx;
4354 ++subtop->nlasts;
4355 }
4356 return new_entry;
4357}
4358
4359static void
4360internal_function
4361sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4362 re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4363{
4364 sctx->sifted_states = sifted_sts;
4365 sctx->limited_states = limited_sts;
4366 sctx->last_node = last_node;
4367 sctx->last_str_idx = last_str_idx;
4368 re_node_set_init_empty (&sctx->limits);
4369}