aboutsummaryrefslogtreecommitdiff
path: root/win32/regcomp.c
diff options
context:
space:
mode:
Diffstat (limited to 'win32/regcomp.c')
-rw-r--r--win32/regcomp.c3886
1 files changed, 3886 insertions, 0 deletions
diff --git a/win32/regcomp.c b/win32/regcomp.c
new file mode 100644
index 000000000..dca7e6ef3
--- /dev/null
+++ b/win32/regcomp.c
@@ -0,0 +1,3886 @@
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA. */
20
21#define UNUSED_PARAM __attribute__ ((__unused__))
22
23static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
24 size_t length, reg_syntax_t syntax);
25static void re_compile_fastmap_iter (regex_t *bufp,
26 const re_dfastate_t *init_state,
27 char *fastmap);
28static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
29#ifdef RE_ENABLE_I18N
30static void free_charset (re_charset_t *cset);
31#endif /* RE_ENABLE_I18N */
32static void free_workarea_compile (regex_t *preg);
33static reg_errcode_t create_initial_state (re_dfa_t *dfa);
34#ifdef RE_ENABLE_I18N
35static void optimize_utf8 (re_dfa_t *dfa);
36#endif
37static reg_errcode_t analyze (regex_t *preg);
38static reg_errcode_t preorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
40 void *extra);
41static reg_errcode_t postorder (bin_tree_t *root,
42 reg_errcode_t (fn (void *, bin_tree_t *)),
43 void *extra);
44static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
45static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
46static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
47 bin_tree_t *node);
48static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
49static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
50static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
51static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
52static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
53 unsigned int constraint);
54static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
55static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
56 int node, int root);
57static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
58static int fetch_number (re_string_t *input, re_token_t *token,
59 reg_syntax_t syntax);
60static int peek_token (re_token_t *token, re_string_t *input,
61 reg_syntax_t syntax) internal_function;
62static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
63 reg_syntax_t syntax, reg_errcode_t *err);
64static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 int nest, reg_errcode_t *err);
67static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
70static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
73static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
74 re_token_t *token, reg_syntax_t syntax,
75 int nest, reg_errcode_t *err);
76static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
77 re_dfa_t *dfa, re_token_t *token,
78 reg_syntax_t syntax, reg_errcode_t *err);
79static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
80 re_token_t *token, reg_syntax_t syntax,
81 reg_errcode_t *err);
82static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
83 re_string_t *regexp,
84 re_token_t *token, int token_len,
85 re_dfa_t *dfa,
86 reg_syntax_t syntax,
87 int accept_hyphen);
88static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
89 re_string_t *regexp,
90 re_token_t *token);
91#ifdef RE_ENABLE_I18N
92static reg_errcode_t build_equiv_class (bitset_t sbcset,
93 re_charset_t *mbcset,
94 int *equiv_class_alloc,
95 const unsigned char *name);
96static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
97 bitset_t sbcset,
98 re_charset_t *mbcset,
99 int *char_class_alloc,
100 const char *class_name,
101 reg_syntax_t syntax);
102#else /* not RE_ENABLE_I18N */
103static reg_errcode_t build_equiv_class (bitset_t sbcset,
104 const unsigned char *name);
105static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
106 bitset_t sbcset,
107 const char *class_name,
108 reg_syntax_t syntax);
109#endif /* not RE_ENABLE_I18N */
110static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
111 RE_TRANSLATE_TYPE trans,
112 const char *class_name,
113 const char *extra,
114 int non_match, reg_errcode_t *err);
115static bin_tree_t *create_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 re_token_type_t type);
118static bin_tree_t *create_token_tree (re_dfa_t *dfa,
119 bin_tree_t *left, bin_tree_t *right,
120 const re_token_t *token);
121static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
122static void free_token (re_token_t *node);
123static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
124static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
125
126/* This table gives an error message for each of the error codes listed
127 in regex.h. Obviously the order here has to be same as there.
128 POSIX doesn't require that we do anything for REG_NOERROR,
129 but why not be nice? */
130
131const char __re_error_msgid[] attribute_hidden =
132 {
133#define REG_NOERROR_IDX 0
134 gettext_noop ("Success") /* REG_NOERROR */
135 "\0"
136#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
137 gettext_noop ("No match") /* REG_NOMATCH */
138 "\0"
139#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
140 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
141 "\0"
142#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
143 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
144 "\0"
145#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
146 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
147 "\0"
148#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
149 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
150 "\0"
151#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
152 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
153 "\0"
154#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
155 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
156 "\0"
157#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
158 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
159 "\0"
160#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
161 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
162 "\0"
163#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
164 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
165 "\0"
166#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
167 gettext_noop ("Invalid range end") /* REG_ERANGE */
168 "\0"
169#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
170 gettext_noop ("Memory exhausted") /* REG_ESPACE */
171 "\0"
172#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
173 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
174 "\0"
175#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
176 gettext_noop ("Premature end of regular expression") /* REG_EEND */
177 "\0"
178#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
179 gettext_noop ("Regular expression too big") /* REG_ESIZE */
180 "\0"
181#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
182 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
183 };
184
185const size_t __re_error_msgid_idx[] attribute_hidden =
186 {
187 REG_NOERROR_IDX,
188 REG_NOMATCH_IDX,
189 REG_BADPAT_IDX,
190 REG_ECOLLATE_IDX,
191 REG_ECTYPE_IDX,
192 REG_EESCAPE_IDX,
193 REG_ESUBREG_IDX,
194 REG_EBRACK_IDX,
195 REG_EPAREN_IDX,
196 REG_EBRACE_IDX,
197 REG_BADBR_IDX,
198 REG_ERANGE_IDX,
199 REG_ESPACE_IDX,
200 REG_BADRPT_IDX,
201 REG_EEND_IDX,
202 REG_ESIZE_IDX,
203 REG_ERPAREN_IDX
204 };
205
206/* Entry points for GNU code. */
207
208
209#ifdef ZOS_USS
210
211/* For ZOS USS we must define btowc */
212
213wchar_t
214btowc (int c)
215{
216 wchar_t wtmp[2];
217 char tmp[2];
218
219 tmp[0] = c;
220 tmp[1] = 0;
221
222 mbtowc (wtmp, tmp, 1);
223 return wtmp[0];
224}
225#endif
226
227/* re_compile_pattern is the GNU regular expression compiler: it
228 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
229 Returns 0 if the pattern was valid, otherwise an error string.
230
231 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
232 are set in BUFP on entry. */
233
234const char *
235re_compile_pattern (const char *pattern,
236 size_t length,
237 struct re_pattern_buffer *bufp)
238{
239 reg_errcode_t ret;
240
241 /* And GNU code determines whether or not to get register information
242 by passing null for the REGS argument to re_match, etc., not by
243 setting no_sub, unless RE_NO_SUB is set. */
244 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
245
246 /* Match anchors at newline. */
247 bufp->newline_anchor = 1;
248
249 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
250
251 if (!ret)
252 return NULL;
253 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
254}
255#ifdef _LIBC
256weak_alias (__re_compile_pattern, re_compile_pattern)
257#endif
258
259/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
260 also be assigned to arbitrarily: each pattern buffer stores its own
261 syntax, so it can be changed between regex compilations. */
262/* This has no initializer because initialized variables in Emacs
263 become read-only after dumping. */
264reg_syntax_t re_syntax_options;
265
266
267/* Specify the precise syntax of regexps for compilation. This provides
268 for compatibility for various utilities which historically have
269 different, incompatible syntaxes.
270
271 The argument SYNTAX is a bit mask comprised of the various bits
272 defined in regex.h. We return the old syntax. */
273
274reg_syntax_t
275re_set_syntax (reg_syntax_t syntax)
276{
277 reg_syntax_t ret = re_syntax_options;
278
279 re_syntax_options = syntax;
280 return ret;
281}
282#ifdef _LIBC
283weak_alias (__re_set_syntax, re_set_syntax)
284#endif
285
286int
287re_compile_fastmap (struct re_pattern_buffer *bufp)
288{
289 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
290 char *fastmap = bufp->fastmap;
291
292 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
293 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
294 if (dfa->init_state != dfa->init_state_word)
295 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
296 if (dfa->init_state != dfa->init_state_nl)
297 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
298 if (dfa->init_state != dfa->init_state_begbuf)
299 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
300 bufp->fastmap_accurate = 1;
301 return 0;
302}
303#ifdef _LIBC
304weak_alias (__re_compile_fastmap, re_compile_fastmap)
305#endif
306
307static inline void
308__attribute ((always_inline))
309re_set_fastmap (char *fastmap, int icase, int ch)
310{
311 fastmap[ch] = 1;
312 if (icase)
313 fastmap[tolower (ch)] = 1;
314}
315
316/* Helper function for re_compile_fastmap.
317 Compile fastmap for the initial_state INIT_STATE. */
318
319static void
320re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
321 char *fastmap)
322{
323 volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
324 int node_cnt;
325 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
326 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
327 {
328 int node = init_state->nodes.elems[node_cnt];
329 re_token_type_t type = dfa->nodes[node].type;
330
331 if (type == CHARACTER)
332 {
333 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
334#ifdef RE_ENABLE_I18N
335 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
336 {
337 unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
338 wchar_t wc;
339 mbstate_t state;
340
341 p = buf;
342 *p++ = dfa->nodes[node].opr.c;
343 while (++node < dfa->nodes_len
344 && dfa->nodes[node].type == CHARACTER
345 && dfa->nodes[node].mb_partial)
346 *p++ = dfa->nodes[node].opr.c;
347 memset (&state, '\0', sizeof (state));
348 if (__mbrtowc (&wc, (const char *) buf, p - buf,
349 &state) == p - buf
350 && (__wcrtomb ((char *) buf, towlower (wc), &state)
351 != (size_t) -1))
352 re_set_fastmap (fastmap, 0, buf[0]);
353 re_free (buf);
354 }
355#endif
356 }
357 else if (type == SIMPLE_BRACKET)
358 {
359 int i, ch;
360 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
361 {
362 int j;
363 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
364 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
365 if (w & ((bitset_word_t) 1 << j))
366 re_set_fastmap (fastmap, icase, ch);
367 }
368 }
369#ifdef RE_ENABLE_I18N
370 else if (type == COMPLEX_BRACKET)
371 {
372 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
373 int i;
374
375# ifdef _LIBC
376 /* See if we have to try all bytes which start multiple collation
377 elements.
378 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
379 collation element, and don't catch 'b' since 'b' is
380 the only collation element which starts from 'b' (and
381 it is caught by SIMPLE_BRACKET). */
382 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
383 && (cset->ncoll_syms || cset->nranges))
384 {
385 const int32_t *table = (const int32_t *)
386 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
387 for (i = 0; i < SBC_MAX; ++i)
388 if (table[i] < 0)
389 re_set_fastmap (fastmap, icase, i);
390 }
391# endif /* _LIBC */
392
393 /* See if we have to start the match at all multibyte characters,
394 i.e. where we would not find an invalid sequence. This only
395 applies to multibyte character sets; for single byte character
396 sets, the SIMPLE_BRACKET again suffices. */
397 if (dfa->mb_cur_max > 1
398 && (cset->nchar_classes || cset->non_match || cset->nranges
399# ifdef _LIBC
400 || cset->nequiv_classes
401# endif /* _LIBC */
402 ))
403 {
404 unsigned char c = 0;
405 do
406 {
407 mbstate_t mbs;
408 memset (&mbs, 0, sizeof (mbs));
409 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
410 re_set_fastmap (fastmap, false, (int) c);
411 }
412 while (++c != 0);
413 }
414
415 else
416 {
417 /* ... Else catch all bytes which can start the mbchars. */
418 for (i = 0; i < cset->nmbchars; ++i)
419 {
420 char buf[256];
421 mbstate_t state;
422 memset (&state, '\0', sizeof (state));
423 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
424 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
425 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
426 {
427 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
428 != (size_t) -1)
429 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
430 }
431 }
432 }
433 }
434#endif /* RE_ENABLE_I18N */
435 else if (type == OP_PERIOD
436#ifdef RE_ENABLE_I18N
437 || type == OP_UTF8_PERIOD
438#endif /* RE_ENABLE_I18N */
439 || type == END_OF_RE)
440 {
441 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
442 if (type == END_OF_RE)
443 bufp->can_be_null = 1;
444 return;
445 }
446 }
447}
448
449/* Entry point for POSIX code. */
450/* regcomp takes a regular expression as a string and compiles it.
451
452 PREG is a regex_t *. We do not expect any fields to be initialized,
453 since POSIX says we shouldn't. Thus, we set
454
455 `buffer' to the compiled pattern;
456 `used' to the length of the compiled pattern;
457 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
458 REG_EXTENDED bit in CFLAGS is set; otherwise, to
459 RE_SYNTAX_POSIX_BASIC;
460 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
461 `fastmap' to an allocated space for the fastmap;
462 `fastmap_accurate' to zero;
463 `re_nsub' to the number of subexpressions in PATTERN.
464
465 PATTERN is the address of the pattern string.
466
467 CFLAGS is a series of bits which affect compilation.
468
469 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
470 use POSIX basic syntax.
471
472 If REG_NEWLINE is set, then . and [^...] don't match newline.
473 Also, regexec will try a match beginning after every newline.
474
475 If REG_ICASE is set, then we considers upper- and lowercase
476 versions of letters to be equivalent when matching.
477
478 If REG_NOSUB is set, then when PREG is passed to regexec, that
479 routine will report only success or failure, and nothing about the
480 registers.
481
482 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
483 the return codes and their meanings.) */
484
485int
486regcomp (regex_t *__restrict preg,
487 const char *__restrict pattern,
488 int cflags)
489{
490 reg_errcode_t ret;
491 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
492 : RE_SYNTAX_POSIX_BASIC);
493
494 preg->buffer = NULL;
495 preg->allocated = 0;
496 preg->used = 0;
497
498 /* Try to allocate space for the fastmap. */
499 preg->fastmap = re_malloc (char, SBC_MAX);
500 if (BE (preg->fastmap == NULL, 0))
501 return REG_ESPACE;
502
503 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
504
505 /* If REG_NEWLINE is set, newlines are treated differently. */
506 if (cflags & REG_NEWLINE)
507 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
508 syntax &= ~RE_DOT_NEWLINE;
509 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
510 /* It also changes the matching behavior. */
511 preg->newline_anchor = 1;
512 }
513 else
514 preg->newline_anchor = 0;
515 preg->no_sub = !!(cflags & REG_NOSUB);
516 preg->translate = NULL;
517
518 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
519
520 /* POSIX doesn't distinguish between an unmatched open-group and an
521 unmatched close-group: both are REG_EPAREN. */
522 if (ret == REG_ERPAREN)
523 ret = REG_EPAREN;
524
525 /* We have already checked preg->fastmap != NULL. */
526 if (BE (ret == REG_NOERROR, 1))
527 /* Compute the fastmap now, since regexec cannot modify the pattern
528 buffer. This function never fails in this implementation. */
529 (void) re_compile_fastmap (preg);
530 else
531 {
532 /* Some error occurred while compiling the expression. */
533 re_free (preg->fastmap);
534 preg->fastmap = NULL;
535 }
536
537 return (int) ret;
538}
539#ifdef _LIBC
540weak_alias (__regcomp, regcomp)
541#endif
542
543/* Returns a message corresponding to an error code, ERRCODE, returned
544 from either regcomp or regexec. We don't use PREG here. */
545
546size_t
547regerror(int errcode, UNUSED_PARAM const regex_t *__restrict preg,
548 char *__restrict errbuf, size_t errbuf_size)
549{
550 const char *msg;
551 size_t msg_size;
552
553 if (BE (errcode < 0
554 || errcode >= (int) (sizeof (__re_error_msgid_idx)
555 / sizeof (__re_error_msgid_idx[0])), 0))
556 /* Only error codes returned by the rest of the code should be passed
557 to this routine. If we are given anything else, or if other regex
558 code generates an invalid error code, then the program has a bug.
559 Dump core so we can fix it. */
560 abort ();
561
562 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
563
564 msg_size = strlen (msg) + 1; /* Includes the null. */
565
566 if (BE (errbuf_size != 0, 1))
567 {
568 if (BE (msg_size > errbuf_size, 0))
569 {
570 memcpy (errbuf, msg, errbuf_size - 1);
571 errbuf[errbuf_size - 1] = 0;
572 }
573 else
574 memcpy (errbuf, msg, msg_size);
575 }
576
577 return msg_size;
578}
579#ifdef _LIBC
580weak_alias (__regerror, regerror)
581#endif
582
583
584#ifdef RE_ENABLE_I18N
585/* This static array is used for the map to single-byte characters when
586 UTF-8 is used. Otherwise we would allocate memory just to initialize
587 it the same all the time. UTF-8 is the preferred encoding so this is
588 a worthwhile optimization. */
589#if __GNUC__ >= 3
590static const bitset_t utf8_sb_map = {
591 /* Set the first 128 bits. */
592 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
593};
594#else /* ! (__GNUC__ >= 3) */
595static bitset_t utf8_sb_map;
596#endif /* __GNUC__ >= 3 */
597#endif /* RE_ENABLE_I18N */
598
599
600static void
601free_dfa_content (re_dfa_t *dfa)
602{
603 int i, j;
604
605 if (dfa->nodes)
606 for (i = 0; i < dfa->nodes_len; ++i)
607 free_token (dfa->nodes + i);
608 re_free (dfa->nexts);
609 for (i = 0; i < dfa->nodes_len; ++i)
610 {
611 if (dfa->eclosures != NULL)
612 re_node_set_free (dfa->eclosures + i);
613 if (dfa->inveclosures != NULL)
614 re_node_set_free (dfa->inveclosures + i);
615 if (dfa->edests != NULL)
616 re_node_set_free (dfa->edests + i);
617 }
618 re_free (dfa->edests);
619 re_free (dfa->eclosures);
620 re_free (dfa->inveclosures);
621 re_free (dfa->nodes);
622
623 if (dfa->state_table)
624 for (i = 0; i <= dfa->state_hash_mask; ++i)
625 {
626 struct re_state_table_entry *entry = dfa->state_table + i;
627 for (j = 0; j < entry->num; ++j)
628 {
629 re_dfastate_t *state = entry->array[j];
630 free_state (state);
631 }
632 re_free (entry->array);
633 }
634 re_free (dfa->state_table);
635#ifdef RE_ENABLE_I18N
636 if (dfa->sb_char != utf8_sb_map)
637 re_free (dfa->sb_char);
638#endif
639 re_free (dfa->subexp_map);
640#ifdef DEBUG
641 re_free (dfa->re_str);
642#endif
643
644 re_free (dfa);
645}
646
647
648/* Free dynamically allocated space used by PREG. */
649
650void
651regfree (regex_t *preg)
652{
653 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
654 if (BE (dfa != NULL, 1))
655 free_dfa_content (dfa);
656 preg->buffer = NULL;
657 preg->allocated = 0;
658
659 re_free (preg->fastmap);
660 preg->fastmap = NULL;
661
662 re_free (preg->translate);
663 preg->translate = NULL;
664}
665#ifdef _LIBC
666weak_alias (__regfree, regfree)
667#endif
668
669/* Entry points compatible with 4.2 BSD regex library. We don't define
670 them unless specifically requested. */
671
672#if defined _REGEX_RE_COMP || defined _LIBC
673
674/* BSD has one and only one pattern buffer. */
675static struct re_pattern_buffer re_comp_buf;
676
677char *
678# ifdef _LIBC
679/* Make these definitions weak in libc, so POSIX programs can redefine
680 these names if they don't use our functions, and still use
681 regcomp/regexec above without link errors. */
682weak_function
683# endif
684re_comp (s)
685 const char *s;
686{
687 reg_errcode_t ret;
688 char *fastmap;
689
690 if (!s)
691 {
692 if (!re_comp_buf.buffer)
693 return gettext ("No previous regular expression");
694 return 0;
695 }
696
697 if (re_comp_buf.buffer)
698 {
699 fastmap = re_comp_buf.fastmap;
700 re_comp_buf.fastmap = NULL;
701 __regfree (&re_comp_buf);
702 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
703 re_comp_buf.fastmap = fastmap;
704 }
705
706 if (re_comp_buf.fastmap == NULL)
707 {
708 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
709 if (re_comp_buf.fastmap == NULL)
710 return (char *) gettext (__re_error_msgid
711 + __re_error_msgid_idx[(int) REG_ESPACE]);
712 }
713
714 /* Since `re_exec' always passes NULL for the `regs' argument, we
715 don't need to initialize the pattern buffer fields which affect it. */
716
717 /* Match anchors at newlines. */
718 re_comp_buf.newline_anchor = 1;
719
720 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
721
722 if (!ret)
723 return NULL;
724
725 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
726 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
727}
728
729#ifdef _LIBC
730libc_freeres_fn (free_mem)
731{
732 __regfree (&re_comp_buf);
733}
734#endif
735
736#endif /* _REGEX_RE_COMP */
737
738/* Internal entry point.
739 Compile the regular expression PATTERN, whose length is LENGTH.
740 SYNTAX indicate regular expression's syntax. */
741
742static reg_errcode_t
743re_compile_internal (regex_t *preg, const char * pattern, size_t length,
744 reg_syntax_t syntax)
745{
746 reg_errcode_t err = REG_NOERROR;
747 re_dfa_t *dfa;
748 re_string_t regexp;
749
750 /* Initialize the pattern buffer. */
751 preg->fastmap_accurate = 0;
752 preg->syntax = syntax;
753 preg->not_bol = preg->not_eol = 0;
754 preg->used = 0;
755 preg->re_nsub = 0;
756 preg->can_be_null = 0;
757 preg->regs_allocated = REGS_UNALLOCATED;
758
759 /* Initialize the dfa. */
760 dfa = (re_dfa_t *) preg->buffer;
761 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
762 {
763 /* If zero allocated, but buffer is non-null, try to realloc
764 enough space. This loses if buffer's address is bogus, but
765 that is the user's responsibility. If ->buffer is NULL this
766 is a simple allocation. */
767 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
768 if (dfa == NULL)
769 return REG_ESPACE;
770 preg->allocated = sizeof (re_dfa_t);
771 preg->buffer = (unsigned char *) dfa;
772 }
773 preg->used = sizeof (re_dfa_t);
774
775 err = init_dfa (dfa, length);
776 if (BE (err != REG_NOERROR, 0))
777 {
778 free_dfa_content (dfa);
779 preg->buffer = NULL;
780 preg->allocated = 0;
781 return err;
782 }
783#ifdef DEBUG
784 /* Note: length+1 will not overflow since it is checked in init_dfa. */
785 dfa->re_str = re_malloc (char, length + 1);
786 strncpy (dfa->re_str, pattern, length + 1);
787#endif
788
789 __libc_lock_init (dfa->lock);
790
791 err = re_string_construct (&regexp, pattern, length, preg->translate,
792 syntax & RE_ICASE, dfa);
793 if (BE (err != REG_NOERROR, 0))
794 {
795 re_compile_internal_free_return:
796 free_workarea_compile (preg);
797 re_string_destruct (&regexp);
798 free_dfa_content (dfa);
799 preg->buffer = NULL;
800 preg->allocated = 0;
801 return err;
802 }
803
804 /* Parse the regular expression, and build a structure tree. */
805 preg->re_nsub = 0;
806 dfa->str_tree = parse (&regexp, preg, syntax, &err);
807 if (BE (dfa->str_tree == NULL, 0))
808 goto re_compile_internal_free_return;
809
810 /* Analyze the tree and create the nfa. */
811 err = analyze (preg);
812 if (BE (err != REG_NOERROR, 0))
813 goto re_compile_internal_free_return;
814
815#ifdef RE_ENABLE_I18N
816 /* If possible, do searching in single byte encoding to speed things up. */
817 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
818 optimize_utf8 (dfa);
819#endif
820
821 /* Then create the initial state of the dfa. */
822 err = create_initial_state (dfa);
823
824 /* Release work areas. */
825 free_workarea_compile (preg);
826 re_string_destruct (&regexp);
827
828 if (BE (err != REG_NOERROR, 0))
829 {
830 free_dfa_content (dfa);
831 preg->buffer = NULL;
832 preg->allocated = 0;
833 }
834
835 return err;
836}
837
838/* Initialize DFA. We use the length of the regular expression PAT_LEN
839 as the initial length of some arrays. */
840
841static reg_errcode_t
842init_dfa (re_dfa_t *dfa, size_t pat_len)
843{
844 unsigned int table_size;
845#ifndef _LIBC
846 const char *codeset_name;
847#endif
848
849 memset (dfa, '\0', sizeof (re_dfa_t));
850
851 /* Force allocation of str_tree_storage the first time. */
852 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
853
854 /* Avoid overflows. */
855 if (pat_len == SIZE_MAX)
856 return REG_ESPACE;
857
858 dfa->nodes_alloc = pat_len + 1;
859 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
860
861 /* table_size = 2 ^ ceil(log pat_len) */
862 for (table_size = 1; ; table_size <<= 1)
863 if (table_size > pat_len)
864 break;
865
866 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
867 dfa->state_hash_mask = table_size - 1;
868
869 dfa->mb_cur_max = MB_CUR_MAX;
870#ifdef _LIBC
871 if (dfa->mb_cur_max == 6
872 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
873 dfa->is_utf8 = 1;
874 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
875 != 0);
876#else
877# ifdef HAVE_LANGINFO_CODESET
878 codeset_name = nl_langinfo (CODESET);
879# else
880 codeset_name = getenv ("LC_ALL");
881 if (codeset_name == NULL || codeset_name[0] == '\0')
882 codeset_name = getenv ("LC_CTYPE");
883 if (codeset_name == NULL || codeset_name[0] == '\0')
884 codeset_name = getenv ("LANG");
885 if (codeset_name == NULL)
886 codeset_name = "";
887 else if (strchr (codeset_name, '.') != NULL)
888 codeset_name = strchr (codeset_name, '.') + 1;
889# endif
890
891 /* strcasecmp isn't a standard interface. brute force check */
892#if 0
893 if (strcasecmp (codeset_name, "UTF-8") == 0
894 || strcasecmp (codeset_name, "UTF8") == 0)
895 dfa->is_utf8 = 1;
896#else
897 if ( (codeset_name[0] == 'U' || codeset_name[0] == 'u')
898 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
899 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
900 && (codeset_name[3] == '-'
901 ? codeset_name[4] == '8' && codeset_name[5] == '\0'
902 : codeset_name[3] == '8' && codeset_name[4] == '\0'))
903 dfa->is_utf8 = 1;
904#endif
905
906 /* We check exhaustively in the loop below if this charset is a
907 superset of ASCII. */
908 dfa->map_notascii = 0;
909#endif
910
911#ifdef RE_ENABLE_I18N
912 if (dfa->mb_cur_max > 1)
913 {
914 if (dfa->is_utf8)
915 {
916#if !defined(__GNUC__) || __GNUC__ < 3
917 static short utf8_sb_map_inited = 0;
918
919 if (! utf8_sb_map_inited)
920 {
921 int i;
922
923 utf8_sb_map_inited = 0;
924 for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
925 utf8_sb_map[i] = BITSET_WORD_MAX;
926 }
927#endif
928 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
929 }
930 else
931 {
932 int i, j, ch;
933
934 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
935 if (BE (dfa->sb_char == NULL, 0))
936 return REG_ESPACE;
937
938 /* Set the bits corresponding to single byte chars. */
939 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
940 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
941 {
942 wint_t wch = __btowc (ch);
943 if (wch != WEOF)
944 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
945# ifndef _LIBC
946 if (isascii (ch) && wch != ch)
947 dfa->map_notascii = 1;
948# endif
949 }
950 }
951 }
952#endif
953
954 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
955 return REG_ESPACE;
956 return REG_NOERROR;
957}
958
959/* Initialize WORD_CHAR table, which indicate which character is
960 "word". In this case "word" means that it is the word construction
961 character used by some operators like "\<", "\>", etc. */
962
963static void
964internal_function
965init_word_char (re_dfa_t *dfa)
966{
967 int i, j, ch;
968 dfa->word_ops_used = 1;
969 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
970 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
971 if (isalnum (ch) || ch == '_')
972 dfa->word_char[i] |= (bitset_word_t) 1 << j;
973}
974
975/* Free the work area which are only used while compiling. */
976
977static void
978free_workarea_compile (regex_t *preg)
979{
980 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
981 bin_tree_storage_t *storage, *next;
982 for (storage = dfa->str_tree_storage; storage; storage = next)
983 {
984 next = storage->next;
985 re_free (storage);
986 }
987 dfa->str_tree_storage = NULL;
988 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
989 dfa->str_tree = NULL;
990 re_free (dfa->org_indices);
991 dfa->org_indices = NULL;
992}
993
994/* Create initial states for all contexts. */
995
996static reg_errcode_t
997create_initial_state (re_dfa_t *dfa)
998{
999 int first, i;
1000 reg_errcode_t err;
1001 re_node_set init_nodes;
1002
1003 /* Initial states have the epsilon closure of the node which is
1004 the first node of the regular expression. */
1005 first = dfa->str_tree->first->node_idx;
1006 dfa->init_node = first;
1007 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1008 if (BE (err != REG_NOERROR, 0))
1009 return err;
1010
1011 /* The back-references which are in initial states can epsilon transit,
1012 since in this case all of the subexpressions can be null.
1013 Then we add epsilon closures of the nodes which are the next nodes of
1014 the back-references. */
1015 if (dfa->nbackref > 0)
1016 for (i = 0; i < init_nodes.nelem; ++i)
1017 {
1018 int node_idx = init_nodes.elems[i];
1019 re_token_type_t type = dfa->nodes[node_idx].type;
1020
1021 int clexp_idx;
1022 if (type != OP_BACK_REF)
1023 continue;
1024 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1025 {
1026 re_token_t *clexp_node;
1027 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1028 if (clexp_node->type == OP_CLOSE_SUBEXP
1029 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1030 break;
1031 }
1032 if (clexp_idx == init_nodes.nelem)
1033 continue;
1034
1035 if (type == OP_BACK_REF)
1036 {
1037 int dest_idx = dfa->edests[node_idx].elems[0];
1038 if (!re_node_set_contains (&init_nodes, dest_idx))
1039 {
1040 err = re_node_set_merge (&init_nodes,
1041 dfa->eclosures + dest_idx);
1042 if (err != REG_NOERROR)
1043 return err;
1044 i = 0;
1045 }
1046 }
1047 }
1048
1049 /* It must be the first time to invoke acquire_state. */
1050 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1051 /* We don't check ERR here, since the initial state must not be NULL. */
1052 if (BE (dfa->init_state == NULL, 0))
1053 return err;
1054 if (dfa->init_state->has_constraint)
1055 {
1056 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1057 CONTEXT_WORD);
1058 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1059 CONTEXT_NEWLINE);
1060 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1061 &init_nodes,
1062 CONTEXT_NEWLINE
1063 | CONTEXT_BEGBUF);
1064 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1065 || dfa->init_state_begbuf == NULL, 0))
1066 return err;
1067 }
1068 else
1069 dfa->init_state_word = dfa->init_state_nl
1070 = dfa->init_state_begbuf = dfa->init_state;
1071
1072 re_node_set_free (&init_nodes);
1073 return REG_NOERROR;
1074}
1075
1076#ifdef RE_ENABLE_I18N
1077/* If it is possible to do searching in single byte encoding instead of UTF-8
1078 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1079 DFA nodes where needed. */
1080
1081static void
1082optimize_utf8 (re_dfa_t *dfa)
1083{
1084 int node, i, mb_chars = 0, has_period = 0;
1085
1086 for (node = 0; node < dfa->nodes_len; ++node)
1087 switch (dfa->nodes[node].type)
1088 {
1089 case CHARACTER:
1090 if (dfa->nodes[node].opr.c >= 0x80)
1091 mb_chars = 1;
1092 break;
1093 case ANCHOR:
1094 switch (dfa->nodes[node].opr.ctx_type)
1095 {
1096 case LINE_FIRST:
1097 case LINE_LAST:
1098 case BUF_FIRST:
1099 case BUF_LAST:
1100 break;
1101 default:
1102 /* Word anchors etc. cannot be handled. It's okay to test
1103 opr.ctx_type since constraints (for all DFA nodes) are
1104 created by ORing one or more opr.ctx_type values. */
1105 return;
1106 }
1107 break;
1108 case OP_PERIOD:
1109 has_period = 1;
1110 break;
1111 case OP_BACK_REF:
1112 case OP_ALT:
1113 case END_OF_RE:
1114 case OP_DUP_ASTERISK:
1115 case OP_OPEN_SUBEXP:
1116 case OP_CLOSE_SUBEXP:
1117 break;
1118 case COMPLEX_BRACKET:
1119 return;
1120 case SIMPLE_BRACKET:
1121 /* Just double check. The non-ASCII range starts at 0x80. */
1122 assert (0x80 % BITSET_WORD_BITS == 0);
1123 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1124 if (dfa->nodes[node].opr.sbcset[i])
1125 return;
1126 break;
1127 default:
1128 abort ();
1129 }
1130
1131 if (mb_chars || has_period)
1132 for (node = 0; node < dfa->nodes_len; ++node)
1133 {
1134 if (dfa->nodes[node].type == CHARACTER
1135 && dfa->nodes[node].opr.c >= 0x80)
1136 dfa->nodes[node].mb_partial = 0;
1137 else if (dfa->nodes[node].type == OP_PERIOD)
1138 dfa->nodes[node].type = OP_UTF8_PERIOD;
1139 }
1140
1141 /* The search can be in single byte locale. */
1142 dfa->mb_cur_max = 1;
1143 dfa->is_utf8 = 0;
1144 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1145}
1146#endif
1147
1148/* Analyze the structure tree, and calculate "first", "next", "edest",
1149 "eclosure", and "inveclosure". */
1150
1151static reg_errcode_t
1152analyze (regex_t *preg)
1153{
1154 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1155 reg_errcode_t ret;
1156
1157 /* Allocate arrays. */
1158 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1159 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1160 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1161 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1162 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1163 || dfa->eclosures == NULL, 0))
1164 return REG_ESPACE;
1165
1166 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1167 if (dfa->subexp_map != NULL)
1168 {
1169 int i;
1170 for (i = 0; i < preg->re_nsub; i++)
1171 dfa->subexp_map[i] = i;
1172 preorder (dfa->str_tree, optimize_subexps, dfa);
1173 for (i = 0; i < preg->re_nsub; i++)
1174 if (dfa->subexp_map[i] != i)
1175 break;
1176 if (i == preg->re_nsub)
1177 {
1178 free (dfa->subexp_map);
1179 dfa->subexp_map = NULL;
1180 }
1181 }
1182
1183 ret = postorder (dfa->str_tree, lower_subexps, preg);
1184 if (BE (ret != REG_NOERROR, 0))
1185 return ret;
1186 ret = postorder (dfa->str_tree, calc_first, dfa);
1187 if (BE (ret != REG_NOERROR, 0))
1188 return ret;
1189 preorder (dfa->str_tree, calc_next, dfa);
1190 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1191 if (BE (ret != REG_NOERROR, 0))
1192 return ret;
1193 ret = calc_eclosure (dfa);
1194 if (BE (ret != REG_NOERROR, 0))
1195 return ret;
1196
1197 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1198 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1199 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1200 || dfa->nbackref)
1201 {
1202 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1203 if (BE (dfa->inveclosures == NULL, 0))
1204 return REG_ESPACE;
1205 ret = calc_inveclosure (dfa);
1206 }
1207
1208 return ret;
1209}
1210
1211/* Our parse trees are very unbalanced, so we cannot use a stack to
1212 implement parse tree visits. Instead, we use parent pointers and
1213 some hairy code in these two functions. */
1214static reg_errcode_t
1215postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1216 void *extra)
1217{
1218 bin_tree_t *node, *prev;
1219
1220 for (node = root; ; )
1221 {
1222 /* Descend down the tree, preferably to the left (or to the right
1223 if that's the only child). */
1224 while (node->left || node->right)
1225 if (node->left)
1226 node = node->left;
1227 else
1228 node = node->right;
1229
1230 do
1231 {
1232 reg_errcode_t err = fn (extra, node);
1233 if (BE (err != REG_NOERROR, 0))
1234 return err;
1235 if (node->parent == NULL)
1236 return REG_NOERROR;
1237 prev = node;
1238 node = node->parent;
1239 }
1240 /* Go up while we have a node that is reached from the right. */
1241 while (node->right == prev || node->right == NULL);
1242 node = node->right;
1243 }
1244}
1245
1246static reg_errcode_t
1247preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1248 void *extra)
1249{
1250 bin_tree_t *node;
1251
1252 for (node = root; ; )
1253 {
1254 reg_errcode_t err = fn (extra, node);
1255 if (BE (err != REG_NOERROR, 0))
1256 return err;
1257
1258 /* Go to the left node, or up and to the right. */
1259 if (node->left)
1260 node = node->left;
1261 else
1262 {
1263 bin_tree_t *prev = NULL;
1264 while (node->right == prev || node->right == NULL)
1265 {
1266 prev = node;
1267 node = node->parent;
1268 if (!node)
1269 return REG_NOERROR;
1270 }
1271 node = node->right;
1272 }
1273 }
1274}
1275
1276/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1277 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1278 backreferences as well. Requires a preorder visit. */
1279static reg_errcode_t
1280optimize_subexps (void *extra, bin_tree_t *node)
1281{
1282 re_dfa_t *dfa = (re_dfa_t *) extra;
1283
1284 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1285 {
1286 int idx = node->token.opr.idx;
1287 node->token.opr.idx = dfa->subexp_map[idx];
1288 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1289 }
1290
1291 else if (node->token.type == SUBEXP
1292 && node->left && node->left->token.type == SUBEXP)
1293 {
1294 int other_idx = node->left->token.opr.idx;
1295
1296 node->left = node->left->left;
1297 if (node->left)
1298 node->left->parent = node;
1299
1300 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1301 if (other_idx < BITSET_WORD_BITS)
1302 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1303 }
1304
1305 return REG_NOERROR;
1306}
1307
1308/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1309 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1310static reg_errcode_t
1311lower_subexps (void *extra, bin_tree_t *node)
1312{
1313 regex_t *preg = (regex_t *) extra;
1314 reg_errcode_t err = REG_NOERROR;
1315
1316 if (node->left && node->left->token.type == SUBEXP)
1317 {
1318 node->left = lower_subexp (&err, preg, node->left);
1319 if (node->left)
1320 node->left->parent = node;
1321 }
1322 if (node->right && node->right->token.type == SUBEXP)
1323 {
1324 node->right = lower_subexp (&err, preg, node->right);
1325 if (node->right)
1326 node->right->parent = node;
1327 }
1328
1329 return err;
1330}
1331
1332static bin_tree_t *
1333lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1334{
1335 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1336 bin_tree_t *body = node->left;
1337 bin_tree_t *op, *cls, *tree1, *tree;
1338
1339 if (preg->no_sub
1340 /* We do not optimize empty subexpressions, because otherwise we may
1341 have bad CONCAT nodes with NULL children. This is obviously not
1342 very common, so we do not lose much. An example that triggers
1343 this case is the sed "script" /\(\)/x. */
1344 && node->left != NULL
1345 && (node->token.opr.idx >= BITSET_WORD_BITS
1346 || !(dfa->used_bkref_map
1347 & ((bitset_word_t) 1 << node->token.opr.idx))))
1348 return node->left;
1349
1350 /* Convert the SUBEXP node to the concatenation of an
1351 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1352 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1353 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1354 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1355 tree = create_tree (dfa, op, tree1, CONCAT);
1356 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1357 {
1358 *err = REG_ESPACE;
1359 return NULL;
1360 }
1361
1362 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1363 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1364 return tree;
1365}
1366
1367/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1368 nodes. Requires a postorder visit. */
1369static reg_errcode_t
1370calc_first (void *extra, bin_tree_t *node)
1371{
1372 re_dfa_t *dfa = (re_dfa_t *) extra;
1373 if (node->token.type == CONCAT)
1374 {
1375 node->first = node->left->first;
1376 node->node_idx = node->left->node_idx;
1377 }
1378 else
1379 {
1380 node->first = node;
1381 node->node_idx = re_dfa_add_node (dfa, node->token);
1382 if (BE (node->node_idx == -1, 0))
1383 return REG_ESPACE;
1384 if (node->token.type == ANCHOR)
1385 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1386 }
1387 return REG_NOERROR;
1388}
1389
1390/* Pass 2: compute NEXT on the tree. Preorder visit. */
1391static reg_errcode_t
1392calc_next (UNUSED_PARAM void *extra, bin_tree_t *node)
1393{
1394 switch (node->token.type)
1395 {
1396 case OP_DUP_ASTERISK:
1397 node->left->next = node;
1398 break;
1399 case CONCAT:
1400 node->left->next = node->right->first;
1401 node->right->next = node->next;
1402 break;
1403 default:
1404 if (node->left)
1405 node->left->next = node->next;
1406 if (node->right)
1407 node->right->next = node->next;
1408 break;
1409 }
1410 return REG_NOERROR;
1411}
1412
1413/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1414static reg_errcode_t
1415link_nfa_nodes (void *extra, bin_tree_t *node)
1416{
1417 re_dfa_t *dfa = (re_dfa_t *) extra;
1418 int idx = node->node_idx;
1419 reg_errcode_t err = REG_NOERROR;
1420
1421 switch (node->token.type)
1422 {
1423 case CONCAT:
1424 break;
1425
1426 case END_OF_RE:
1427 assert (node->next == NULL);
1428 break;
1429
1430 case OP_DUP_ASTERISK:
1431 case OP_ALT:
1432 {
1433 int left, right;
1434 dfa->has_plural_match = 1;
1435 if (node->left != NULL)
1436 left = node->left->first->node_idx;
1437 else
1438 left = node->next->node_idx;
1439 if (node->right != NULL)
1440 right = node->right->first->node_idx;
1441 else
1442 right = node->next->node_idx;
1443 assert (left > -1);
1444 assert (right > -1);
1445 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1446 }
1447 break;
1448
1449 case ANCHOR:
1450 case OP_OPEN_SUBEXP:
1451 case OP_CLOSE_SUBEXP:
1452 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1453 break;
1454
1455 case OP_BACK_REF:
1456 dfa->nexts[idx] = node->next->node_idx;
1457 if (node->token.type == OP_BACK_REF)
1458 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1459 break;
1460
1461 default:
1462 assert (!IS_EPSILON_NODE (node->token.type));
1463 dfa->nexts[idx] = node->next->node_idx;
1464 break;
1465 }
1466
1467 return err;
1468}
1469
1470/* Duplicate the epsilon closure of the node ROOT_NODE.
1471 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1472 to their own constraint. */
1473
1474static reg_errcode_t
1475internal_function
1476duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1477 int root_node, unsigned int init_constraint)
1478{
1479 int org_node, clone_node, ret;
1480 unsigned int constraint = init_constraint;
1481 for (org_node = top_org_node, clone_node = top_clone_node;;)
1482 {
1483 int org_dest, clone_dest;
1484 if (dfa->nodes[org_node].type == OP_BACK_REF)
1485 {
1486 /* If the back reference epsilon-transit, its destination must
1487 also have the constraint. Then duplicate the epsilon closure
1488 of the destination of the back reference, and store it in
1489 edests of the back reference. */
1490 org_dest = dfa->nexts[org_node];
1491 re_node_set_empty (dfa->edests + clone_node);
1492 clone_dest = duplicate_node (dfa, org_dest, constraint);
1493 if (BE (clone_dest == -1, 0))
1494 return REG_ESPACE;
1495 dfa->nexts[clone_node] = dfa->nexts[org_node];
1496 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1497 if (BE (ret < 0, 0))
1498 return REG_ESPACE;
1499 }
1500 else if (dfa->edests[org_node].nelem == 0)
1501 {
1502 /* In case of the node can't epsilon-transit, don't duplicate the
1503 destination and store the original destination as the
1504 destination of the node. */
1505 dfa->nexts[clone_node] = dfa->nexts[org_node];
1506 break;
1507 }
1508 else if (dfa->edests[org_node].nelem == 1)
1509 {
1510 /* In case of the node can epsilon-transit, and it has only one
1511 destination. */
1512 org_dest = dfa->edests[org_node].elems[0];
1513 re_node_set_empty (dfa->edests + clone_node);
1514 /* If the node is root_node itself, it means the epsilon clsoure
1515 has a loop. Then tie it to the destination of the root_node. */
1516 if (org_node == root_node && clone_node != org_node)
1517 {
1518 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1519 if (BE (ret < 0, 0))
1520 return REG_ESPACE;
1521 break;
1522 }
1523 /* In case of the node has another constraint, add it. */
1524 constraint |= dfa->nodes[org_node].constraint;
1525 clone_dest = duplicate_node (dfa, org_dest, constraint);
1526 if (BE (clone_dest == -1, 0))
1527 return REG_ESPACE;
1528 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1529 if (BE (ret < 0, 0))
1530 return REG_ESPACE;
1531 }
1532 else /* dfa->edests[org_node].nelem == 2 */
1533 {
1534 /* In case of the node can epsilon-transit, and it has two
1535 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1536 org_dest = dfa->edests[org_node].elems[0];
1537 re_node_set_empty (dfa->edests + clone_node);
1538 /* Search for a duplicated node which satisfies the constraint. */
1539 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1540 if (clone_dest == -1)
1541 {
1542 /* There is no such duplicated node, create a new one. */
1543 reg_errcode_t err;
1544 clone_dest = duplicate_node (dfa, org_dest, constraint);
1545 if (BE (clone_dest == -1, 0))
1546 return REG_ESPACE;
1547 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1548 if (BE (ret < 0, 0))
1549 return REG_ESPACE;
1550 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1551 root_node, constraint);
1552 if (BE (err != REG_NOERROR, 0))
1553 return err;
1554 }
1555 else
1556 {
1557 /* There is a duplicated node which satisfies the constraint,
1558 use it to avoid infinite loop. */
1559 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1560 if (BE (ret < 0, 0))
1561 return REG_ESPACE;
1562 }
1563
1564 org_dest = dfa->edests[org_node].elems[1];
1565 clone_dest = duplicate_node (dfa, org_dest, constraint);
1566 if (BE (clone_dest == -1, 0))
1567 return REG_ESPACE;
1568 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1569 if (BE (ret < 0, 0))
1570 return REG_ESPACE;
1571 }
1572 org_node = org_dest;
1573 clone_node = clone_dest;
1574 }
1575 return REG_NOERROR;
1576}
1577
1578/* Search for a node which is duplicated from the node ORG_NODE, and
1579 satisfies the constraint CONSTRAINT. */
1580
1581static int
1582search_duplicated_node (const re_dfa_t *dfa, int org_node,
1583 unsigned int constraint)
1584{
1585 int idx;
1586 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1587 {
1588 if (org_node == dfa->org_indices[idx]
1589 && constraint == dfa->nodes[idx].constraint)
1590 return idx; /* Found. */
1591 }
1592 return -1; /* Not found. */
1593}
1594
1595/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1596 Return the index of the new node, or -1 if insufficient storage is
1597 available. */
1598
1599static int
1600duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1601{
1602 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1603 if (BE (dup_idx != -1, 1))
1604 {
1605 dfa->nodes[dup_idx].constraint = constraint;
1606 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1607 dfa->nodes[dup_idx].duplicated = 1;
1608
1609 /* Store the index of the original node. */
1610 dfa->org_indices[dup_idx] = org_idx;
1611 }
1612 return dup_idx;
1613}
1614
1615static reg_errcode_t
1616calc_inveclosure (re_dfa_t *dfa)
1617{
1618 int src, idx, ret;
1619 for (idx = 0; idx < dfa->nodes_len; ++idx)
1620 re_node_set_init_empty (dfa->inveclosures + idx);
1621
1622 for (src = 0; src < dfa->nodes_len; ++src)
1623 {
1624 int *elems = dfa->eclosures[src].elems;
1625 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1626 {
1627 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1628 if (BE (ret == -1, 0))
1629 return REG_ESPACE;
1630 }
1631 }
1632
1633 return REG_NOERROR;
1634}
1635
1636/* Calculate "eclosure" for all the node in DFA. */
1637
1638static reg_errcode_t
1639calc_eclosure (re_dfa_t *dfa)
1640{
1641 int node_idx, incomplete;
1642#ifdef DEBUG
1643 assert (dfa->nodes_len > 0);
1644#endif
1645 incomplete = 0;
1646 /* For each nodes, calculate epsilon closure. */
1647 for (node_idx = 0; ; ++node_idx)
1648 {
1649 reg_errcode_t err;
1650 re_node_set eclosure_elem;
1651 if (node_idx == dfa->nodes_len)
1652 {
1653 if (!incomplete)
1654 break;
1655 incomplete = 0;
1656 node_idx = 0;
1657 }
1658
1659#ifdef DEBUG
1660 assert (dfa->eclosures[node_idx].nelem != -1);
1661#endif
1662
1663 /* If we have already calculated, skip it. */
1664 if (dfa->eclosures[node_idx].nelem != 0)
1665 continue;
1666 /* Calculate epsilon closure of `node_idx'. */
1667 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1668 if (BE (err != REG_NOERROR, 0))
1669 return err;
1670
1671 if (dfa->eclosures[node_idx].nelem == 0)
1672 {
1673 incomplete = 1;
1674 re_node_set_free (&eclosure_elem);
1675 }
1676 }
1677 return REG_NOERROR;
1678}
1679
1680/* Calculate epsilon closure of NODE. */
1681
1682static reg_errcode_t
1683calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1684{
1685 reg_errcode_t err;
1686 int i;
1687 re_node_set eclosure;
1688 int ret;
1689 int incomplete = 0;
1690 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1691 if (BE (err != REG_NOERROR, 0))
1692 return err;
1693
1694 /* This indicates that we are calculating this node now.
1695 We reference this value to avoid infinite loop. */
1696 dfa->eclosures[node].nelem = -1;
1697
1698 /* If the current node has constraints, duplicate all nodes
1699 since they must inherit the constraints. */
1700 if (dfa->nodes[node].constraint
1701 && dfa->edests[node].nelem
1702 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1703 {
1704 err = duplicate_node_closure (dfa, node, node, node,
1705 dfa->nodes[node].constraint);
1706 if (BE (err != REG_NOERROR, 0))
1707 return err;
1708 }
1709
1710 /* Expand each epsilon destination nodes. */
1711 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1712 for (i = 0; i < dfa->edests[node].nelem; ++i)
1713 {
1714 re_node_set eclosure_elem;
1715 int edest = dfa->edests[node].elems[i];
1716 /* If calculating the epsilon closure of `edest' is in progress,
1717 return intermediate result. */
1718 if (dfa->eclosures[edest].nelem == -1)
1719 {
1720 incomplete = 1;
1721 continue;
1722 }
1723 /* If we haven't calculated the epsilon closure of `edest' yet,
1724 calculate now. Otherwise use calculated epsilon closure. */
1725 if (dfa->eclosures[edest].nelem == 0)
1726 {
1727 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1728 if (BE (err != REG_NOERROR, 0))
1729 return err;
1730 }
1731 else
1732 eclosure_elem = dfa->eclosures[edest];
1733 /* Merge the epsilon closure of `edest'. */
1734 err = re_node_set_merge (&eclosure, &eclosure_elem);
1735 if (BE (err != REG_NOERROR, 0))
1736 return err;
1737 /* If the epsilon closure of `edest' is incomplete,
1738 the epsilon closure of this node is also incomplete. */
1739 if (dfa->eclosures[edest].nelem == 0)
1740 {
1741 incomplete = 1;
1742 re_node_set_free (&eclosure_elem);
1743 }
1744 }
1745
1746 /* An epsilon closure includes itself. */
1747 ret = re_node_set_insert (&eclosure, node);
1748 if (BE (ret < 0, 0))
1749 return REG_ESPACE;
1750 if (incomplete && !root)
1751 dfa->eclosures[node].nelem = 0;
1752 else
1753 dfa->eclosures[node] = eclosure;
1754 *new_set = eclosure;
1755 return REG_NOERROR;
1756}
1757
1758/* Functions for token which are used in the parser. */
1759
1760/* Fetch a token from INPUT.
1761 We must not use this function inside bracket expressions. */
1762
1763static void
1764internal_function
1765fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1766{
1767 re_string_skip_bytes (input, peek_token (result, input, syntax));
1768}
1769
1770/* Peek a token from INPUT, and return the length of the token.
1771 We must not use this function inside bracket expressions. */
1772
1773static int
1774internal_function
1775peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1776{
1777 unsigned char c;
1778
1779 if (re_string_eoi (input))
1780 {
1781 token->type = END_OF_RE;
1782 return 0;
1783 }
1784
1785 c = re_string_peek_byte (input, 0);
1786 token->opr.c = c;
1787
1788 token->word_char = 0;
1789#ifdef RE_ENABLE_I18N
1790 token->mb_partial = 0;
1791 if (input->mb_cur_max > 1 &&
1792 !re_string_first_byte (input, re_string_cur_idx (input)))
1793 {
1794 token->type = CHARACTER;
1795 token->mb_partial = 1;
1796 return 1;
1797 }
1798#endif
1799 if (c == '\\')
1800 {
1801 unsigned char c2;
1802 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1803 {
1804 token->type = BACK_SLASH;
1805 return 1;
1806 }
1807
1808 c2 = re_string_peek_byte_case (input, 1);
1809 token->opr.c = c2;
1810 token->type = CHARACTER;
1811#ifdef RE_ENABLE_I18N
1812 if (input->mb_cur_max > 1)
1813 {
1814 wint_t wc = re_string_wchar_at (input,
1815 re_string_cur_idx (input) + 1);
1816 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1817 }
1818 else
1819#endif
1820 token->word_char = IS_WORD_CHAR (c2) != 0;
1821
1822 switch (c2)
1823 {
1824 case '|':
1825 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1826 token->type = OP_ALT;
1827 break;
1828 case '1': case '2': case '3': case '4': case '5':
1829 case '6': case '7': case '8': case '9':
1830 if (!(syntax & RE_NO_BK_REFS))
1831 {
1832 token->type = OP_BACK_REF;
1833 token->opr.idx = c2 - '1';
1834 }
1835 break;
1836 case '<':
1837 if (!(syntax & RE_NO_GNU_OPS))
1838 {
1839 token->type = ANCHOR;
1840 token->opr.ctx_type = WORD_FIRST;
1841 }
1842 break;
1843 case '>':
1844 if (!(syntax & RE_NO_GNU_OPS))
1845 {
1846 token->type = ANCHOR;
1847 token->opr.ctx_type = WORD_LAST;
1848 }
1849 break;
1850 case 'b':
1851 if (!(syntax & RE_NO_GNU_OPS))
1852 {
1853 token->type = ANCHOR;
1854 token->opr.ctx_type = WORD_DELIM;
1855 }
1856 break;
1857 case 'B':
1858 if (!(syntax & RE_NO_GNU_OPS))
1859 {
1860 token->type = ANCHOR;
1861 token->opr.ctx_type = NOT_WORD_DELIM;
1862 }
1863 break;
1864 case 'w':
1865 if (!(syntax & RE_NO_GNU_OPS))
1866 token->type = OP_WORD;
1867 break;
1868 case 'W':
1869 if (!(syntax & RE_NO_GNU_OPS))
1870 token->type = OP_NOTWORD;
1871 break;
1872 case 's':
1873 if (!(syntax & RE_NO_GNU_OPS))
1874 token->type = OP_SPACE;
1875 break;
1876 case 'S':
1877 if (!(syntax & RE_NO_GNU_OPS))
1878 token->type = OP_NOTSPACE;
1879 break;
1880 case '`':
1881 if (!(syntax & RE_NO_GNU_OPS))
1882 {
1883 token->type = ANCHOR;
1884 token->opr.ctx_type = BUF_FIRST;
1885 }
1886 break;
1887 case '\'':
1888 if (!(syntax & RE_NO_GNU_OPS))
1889 {
1890 token->type = ANCHOR;
1891 token->opr.ctx_type = BUF_LAST;
1892 }
1893 break;
1894 case '(':
1895 if (!(syntax & RE_NO_BK_PARENS))
1896 token->type = OP_OPEN_SUBEXP;
1897 break;
1898 case ')':
1899 if (!(syntax & RE_NO_BK_PARENS))
1900 token->type = OP_CLOSE_SUBEXP;
1901 break;
1902 case '+':
1903 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1904 token->type = OP_DUP_PLUS;
1905 break;
1906 case '?':
1907 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1908 token->type = OP_DUP_QUESTION;
1909 break;
1910 case '{':
1911 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1912 token->type = OP_OPEN_DUP_NUM;
1913 break;
1914 case '}':
1915 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1916 token->type = OP_CLOSE_DUP_NUM;
1917 break;
1918 default:
1919 break;
1920 }
1921 return 2;
1922 }
1923
1924 token->type = CHARACTER;
1925#ifdef RE_ENABLE_I18N
1926 if (input->mb_cur_max > 1)
1927 {
1928 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1929 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1930 }
1931 else
1932#endif
1933 token->word_char = IS_WORD_CHAR (token->opr.c);
1934
1935 switch (c)
1936 {
1937 case '\n':
1938 if (syntax & RE_NEWLINE_ALT)
1939 token->type = OP_ALT;
1940 break;
1941 case '|':
1942 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1943 token->type = OP_ALT;
1944 break;
1945 case '*':
1946 token->type = OP_DUP_ASTERISK;
1947 break;
1948 case '+':
1949 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1950 token->type = OP_DUP_PLUS;
1951 break;
1952 case '?':
1953 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1954 token->type = OP_DUP_QUESTION;
1955 break;
1956 case '{':
1957 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1958 token->type = OP_OPEN_DUP_NUM;
1959 break;
1960 case '}':
1961 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1962 token->type = OP_CLOSE_DUP_NUM;
1963 break;
1964 case '(':
1965 if (syntax & RE_NO_BK_PARENS)
1966 token->type = OP_OPEN_SUBEXP;
1967 break;
1968 case ')':
1969 if (syntax & RE_NO_BK_PARENS)
1970 token->type = OP_CLOSE_SUBEXP;
1971 break;
1972 case '[':
1973 token->type = OP_OPEN_BRACKET;
1974 break;
1975 case '.':
1976 token->type = OP_PERIOD;
1977 break;
1978 case '^':
1979 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1980 re_string_cur_idx (input) != 0)
1981 {
1982 char prev = re_string_peek_byte (input, -1);
1983 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1984 break;
1985 }
1986 token->type = ANCHOR;
1987 token->opr.ctx_type = LINE_FIRST;
1988 break;
1989 case '$':
1990 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1991 re_string_cur_idx (input) + 1 != re_string_length (input))
1992 {
1993 re_token_t next;
1994 re_string_skip_bytes (input, 1);
1995 peek_token (&next, input, syntax);
1996 re_string_skip_bytes (input, -1);
1997 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1998 break;
1999 }
2000 token->type = ANCHOR;
2001 token->opr.ctx_type = LINE_LAST;
2002 break;
2003 default:
2004 break;
2005 }
2006 return 1;
2007}
2008
2009/* Peek a token from INPUT, and return the length of the token.
2010 We must not use this function out of bracket expressions. */
2011
2012static int
2013internal_function
2014peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2015{
2016 unsigned char c;
2017 if (re_string_eoi (input))
2018 {
2019 token->type = END_OF_RE;
2020 return 0;
2021 }
2022 c = re_string_peek_byte (input, 0);
2023 token->opr.c = c;
2024
2025#ifdef RE_ENABLE_I18N
2026 if (input->mb_cur_max > 1 &&
2027 !re_string_first_byte (input, re_string_cur_idx (input)))
2028 {
2029 token->type = CHARACTER;
2030 return 1;
2031 }
2032#endif /* RE_ENABLE_I18N */
2033
2034 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2035 && re_string_cur_idx (input) + 1 < re_string_length (input))
2036 {
2037 /* In this case, '\' escape a character. */
2038 unsigned char c2;
2039 re_string_skip_bytes (input, 1);
2040 c2 = re_string_peek_byte (input, 0);
2041 token->opr.c = c2;
2042 token->type = CHARACTER;
2043 return 1;
2044 }
2045 if (c == '[') /* '[' is a special char in a bracket exps. */
2046 {
2047 unsigned char c2;
2048 int token_len;
2049 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2050 c2 = re_string_peek_byte (input, 1);
2051 else
2052 c2 = 0;
2053 token->opr.c = c2;
2054 token_len = 2;
2055 switch (c2)
2056 {
2057 case '.':
2058 token->type = OP_OPEN_COLL_ELEM;
2059 break;
2060 case '=':
2061 token->type = OP_OPEN_EQUIV_CLASS;
2062 break;
2063 case ':':
2064 if (syntax & RE_CHAR_CLASSES)
2065 {
2066 token->type = OP_OPEN_CHAR_CLASS;
2067 break;
2068 }
2069 /* else fall through. */
2070 default:
2071 token->type = CHARACTER;
2072 token->opr.c = c;
2073 token_len = 1;
2074 break;
2075 }
2076 return token_len;
2077 }
2078 switch (c)
2079 {
2080 case '-':
2081 token->type = OP_CHARSET_RANGE;
2082 break;
2083 case ']':
2084 token->type = OP_CLOSE_BRACKET;
2085 break;
2086 case '^':
2087 token->type = OP_NON_MATCH_LIST;
2088 break;
2089 default:
2090 token->type = CHARACTER;
2091 }
2092 return 1;
2093}
2094
2095/* Functions for parser. */
2096
2097/* Entry point of the parser.
2098 Parse the regular expression REGEXP and return the structure tree.
2099 If an error has occurred, ERR is set by error code, and return NULL.
2100 This function build the following tree, from regular expression <reg_exp>:
2101 CAT
2102 / \
2103 / \
2104 <reg_exp> EOR
2105
2106 CAT means concatenation.
2107 EOR means end of regular expression. */
2108
2109static bin_tree_t *
2110parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2111 reg_errcode_t *err)
2112{
2113 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2114 bin_tree_t *tree, *eor, *root;
2115 re_token_t current_token;
2116 dfa->syntax = syntax;
2117 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2118 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2119 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2120 return NULL;
2121 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2122 if (tree != NULL)
2123 root = create_tree (dfa, tree, eor, CONCAT);
2124 else
2125 root = eor;
2126 if (BE (eor == NULL || root == NULL, 0))
2127 {
2128 *err = REG_ESPACE;
2129 return NULL;
2130 }
2131 return root;
2132}
2133
2134/* This function build the following tree, from regular expression
2135 <branch1>|<branch2>:
2136 ALT
2137 / \
2138 / \
2139 <branch1> <branch2>
2140
2141 ALT means alternative, which represents the operator `|'. */
2142
2143static bin_tree_t *
2144parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2145 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2146{
2147 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2148 bin_tree_t *tree, *branch = NULL;
2149 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2150 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2151 return NULL;
2152
2153 while (token->type == OP_ALT)
2154 {
2155 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2156 if (token->type != OP_ALT && token->type != END_OF_RE
2157 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2158 {
2159 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2160 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2161 return NULL;
2162 }
2163 else
2164 branch = NULL;
2165 tree = create_tree (dfa, tree, branch, OP_ALT);
2166 if (BE (tree == NULL, 0))
2167 {
2168 *err = REG_ESPACE;
2169 return NULL;
2170 }
2171 }
2172 return tree;
2173}
2174
2175/* This function build the following tree, from regular expression
2176 <exp1><exp2>:
2177 CAT
2178 / \
2179 / \
2180 <exp1> <exp2>
2181
2182 CAT means concatenation. */
2183
2184static bin_tree_t *
2185parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2186 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2187{
2188 bin_tree_t *tree, *exp;
2189 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2190 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2191 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2192 return NULL;
2193
2194 while (token->type != OP_ALT && token->type != END_OF_RE
2195 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2196 {
2197 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2198 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2199 {
2200 return NULL;
2201 }
2202 if (tree != NULL && exp != NULL)
2203 {
2204 tree = create_tree (dfa, tree, exp, CONCAT);
2205 if (tree == NULL)
2206 {
2207 *err = REG_ESPACE;
2208 return NULL;
2209 }
2210 }
2211 else if (tree == NULL)
2212 tree = exp;
2213 /* Otherwise exp == NULL, we don't need to create new tree. */
2214 }
2215 return tree;
2216}
2217
2218/* This function build the following tree, from regular expression a*:
2219 *
2220 |
2221 a
2222*/
2223
2224static bin_tree_t *
2225parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2226 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2227{
2228 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2229 bin_tree_t *tree;
2230 switch (token->type)
2231 {
2232 case CHARACTER:
2233 tree = create_token_tree (dfa, NULL, NULL, token);
2234 if (BE (tree == NULL, 0))
2235 {
2236 *err = REG_ESPACE;
2237 return NULL;
2238 }
2239#ifdef RE_ENABLE_I18N
2240 if (dfa->mb_cur_max > 1)
2241 {
2242 while (!re_string_eoi (regexp)
2243 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2244 {
2245 bin_tree_t *mbc_remain;
2246 fetch_token (token, regexp, syntax);
2247 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2248 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2249 if (BE (mbc_remain == NULL || tree == NULL, 0))
2250 {
2251 *err = REG_ESPACE;
2252 return NULL;
2253 }
2254 }
2255 }
2256#endif
2257 break;
2258 case OP_OPEN_SUBEXP:
2259 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2260 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2261 return NULL;
2262 break;
2263 case OP_OPEN_BRACKET:
2264 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2265 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2266 return NULL;
2267 break;
2268 case OP_BACK_REF:
2269 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2270 {
2271 *err = REG_ESUBREG;
2272 return NULL;
2273 }
2274 dfa->used_bkref_map |= 1 << token->opr.idx;
2275 tree = create_token_tree (dfa, NULL, NULL, token);
2276 if (BE (tree == NULL, 0))
2277 {
2278 *err = REG_ESPACE;
2279 return NULL;
2280 }
2281 ++dfa->nbackref;
2282 dfa->has_mb_node = 1;
2283 break;
2284 case OP_OPEN_DUP_NUM:
2285 if (syntax & RE_CONTEXT_INVALID_DUP)
2286 {
2287 *err = REG_BADRPT;
2288 return NULL;
2289 }
2290 /* FALLTHROUGH */
2291 case OP_DUP_ASTERISK:
2292 case OP_DUP_PLUS:
2293 case OP_DUP_QUESTION:
2294 if (syntax & RE_CONTEXT_INVALID_OPS)
2295 {
2296 *err = REG_BADRPT;
2297 return NULL;
2298 }
2299 else if (syntax & RE_CONTEXT_INDEP_OPS)
2300 {
2301 fetch_token (token, regexp, syntax);
2302 return parse_expression (regexp, preg, token, syntax, nest, err);
2303 }
2304 /* else fall through */
2305 case OP_CLOSE_SUBEXP:
2306 if ((token->type == OP_CLOSE_SUBEXP) &&
2307 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2308 {
2309 *err = REG_ERPAREN;
2310 return NULL;
2311 }
2312 /* else fall through */
2313 case OP_CLOSE_DUP_NUM:
2314 /* We treat it as a normal character. */
2315
2316 /* Then we can these characters as normal characters. */
2317 token->type = CHARACTER;
2318 /* mb_partial and word_char bits should be initialized already
2319 by peek_token. */
2320 tree = create_token_tree (dfa, NULL, NULL, token);
2321 if (BE (tree == NULL, 0))
2322 {
2323 *err = REG_ESPACE;
2324 return NULL;
2325 }
2326 break;
2327 case ANCHOR:
2328 if ((token->opr.ctx_type
2329 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2330 && dfa->word_ops_used == 0)
2331 init_word_char (dfa);
2332 if (token->opr.ctx_type == WORD_DELIM
2333 || token->opr.ctx_type == NOT_WORD_DELIM)
2334 {
2335 bin_tree_t *tree_first, *tree_last;
2336 if (token->opr.ctx_type == WORD_DELIM)
2337 {
2338 token->opr.ctx_type = WORD_FIRST;
2339 tree_first = create_token_tree (dfa, NULL, NULL, token);
2340 token->opr.ctx_type = WORD_LAST;
2341 }
2342 else
2343 {
2344 token->opr.ctx_type = INSIDE_WORD;
2345 tree_first = create_token_tree (dfa, NULL, NULL, token);
2346 token->opr.ctx_type = INSIDE_NOTWORD;
2347 }
2348 tree_last = create_token_tree (dfa, NULL, NULL, token);
2349 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2350 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2351 {
2352 *err = REG_ESPACE;
2353 return NULL;
2354 }
2355 }
2356 else
2357 {
2358 tree = create_token_tree (dfa, NULL, NULL, token);
2359 if (BE (tree == NULL, 0))
2360 {
2361 *err = REG_ESPACE;
2362 return NULL;
2363 }
2364 }
2365 /* We must return here, since ANCHORs can't be followed
2366 by repetition operators.
2367 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2368 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2369 fetch_token (token, regexp, syntax);
2370 return tree;
2371 case OP_PERIOD:
2372 tree = create_token_tree (dfa, NULL, NULL, token);
2373 if (BE (tree == NULL, 0))
2374 {
2375 *err = REG_ESPACE;
2376 return NULL;
2377 }
2378 if (dfa->mb_cur_max > 1)
2379 dfa->has_mb_node = 1;
2380 break;
2381 case OP_WORD:
2382 case OP_NOTWORD:
2383 tree = build_charclass_op (dfa, regexp->trans,
2384 "alnum",
2385 "_",
2386 token->type == OP_NOTWORD, err);
2387 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2388 return NULL;
2389 break;
2390 case OP_SPACE:
2391 case OP_NOTSPACE:
2392 tree = build_charclass_op (dfa, regexp->trans,
2393 "space",
2394 "",
2395 token->type == OP_NOTSPACE, err);
2396 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2397 return NULL;
2398 break;
2399 case OP_ALT:
2400 case END_OF_RE:
2401 return NULL;
2402 case BACK_SLASH:
2403 *err = REG_EESCAPE;
2404 return NULL;
2405 default:
2406 /* Must not happen? */
2407#ifdef DEBUG
2408 assert (0);
2409#endif
2410 return NULL;
2411 }
2412 fetch_token (token, regexp, syntax);
2413
2414 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2415 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2416 {
2417 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2418 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2419 return NULL;
2420 /* In BRE consecutive duplications are not allowed. */
2421 if ((syntax & RE_CONTEXT_INVALID_DUP)
2422 && (token->type == OP_DUP_ASTERISK
2423 || token->type == OP_OPEN_DUP_NUM))
2424 {
2425 *err = REG_BADRPT;
2426 return NULL;
2427 }
2428 }
2429
2430 return tree;
2431}
2432
2433/* This function build the following tree, from regular expression
2434 (<reg_exp>):
2435 SUBEXP
2436 |
2437 <reg_exp>
2438*/
2439
2440static bin_tree_t *
2441parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2442 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2443{
2444 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2445 bin_tree_t *tree;
2446 size_t cur_nsub;
2447 cur_nsub = preg->re_nsub++;
2448
2449 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2450
2451 /* The subexpression may be a null string. */
2452 if (token->type == OP_CLOSE_SUBEXP)
2453 tree = NULL;
2454 else
2455 {
2456 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2457 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2458 *err = REG_EPAREN;
2459 if (BE (*err != REG_NOERROR, 0))
2460 return NULL;
2461 }
2462
2463 if (cur_nsub <= '9' - '1')
2464 dfa->completed_bkref_map |= 1 << cur_nsub;
2465
2466 tree = create_tree (dfa, tree, NULL, SUBEXP);
2467 if (BE (tree == NULL, 0))
2468 {
2469 *err = REG_ESPACE;
2470 return NULL;
2471 }
2472 tree->token.opr.idx = cur_nsub;
2473 return tree;
2474}
2475
2476/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2477
2478static bin_tree_t *
2479parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2480 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2481{
2482 bin_tree_t *tree = NULL, *old_tree = NULL;
2483 int i, start, end, start_idx = re_string_cur_idx (regexp);
2484#ifndef RE_TOKEN_INIT_BUG
2485 re_token_t start_token = *token;
2486#else
2487 re_token_t start_token;
2488
2489 memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2490#endif
2491
2492 if (token->type == OP_OPEN_DUP_NUM)
2493 {
2494 end = 0;
2495 start = fetch_number (regexp, token, syntax);
2496 if (start == -1)
2497 {
2498 if (token->type == CHARACTER && token->opr.c == ',')
2499 start = 0; /* We treat "{,m}" as "{0,m}". */
2500 else
2501 {
2502 *err = REG_BADBR; /* <re>{} is invalid. */
2503 return NULL;
2504 }
2505 }
2506 if (BE (start != -2, 1))
2507 {
2508 /* We treat "{n}" as "{n,n}". */
2509 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2510 : ((token->type == CHARACTER && token->opr.c == ',')
2511 ? fetch_number (regexp, token, syntax) : -2));
2512 }
2513 if (BE (start == -2 || end == -2, 0))
2514 {
2515 /* Invalid sequence. */
2516 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2517 {
2518 if (token->type == END_OF_RE)
2519 *err = REG_EBRACE;
2520 else
2521 *err = REG_BADBR;
2522
2523 return NULL;
2524 }
2525
2526 /* If the syntax bit is set, rollback. */
2527 re_string_set_index (regexp, start_idx);
2528 *token = start_token;
2529 token->type = CHARACTER;
2530 /* mb_partial and word_char bits should be already initialized by
2531 peek_token. */
2532 return elem;
2533 }
2534
2535 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2536 {
2537 /* First number greater than second. */
2538 *err = REG_BADBR;
2539 return NULL;
2540 }
2541 }
2542 else
2543 {
2544 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2545 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2546 }
2547
2548 fetch_token (token, regexp, syntax);
2549
2550 if (BE (elem == NULL, 0))
2551 return NULL;
2552 if (BE (start == 0 && end == 0, 0))
2553 {
2554 postorder (elem, free_tree, NULL);
2555 return NULL;
2556 }
2557
2558 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2559 if (BE (start > 0, 0))
2560 {
2561 tree = elem;
2562 for (i = 2; i <= start; ++i)
2563 {
2564 elem = duplicate_tree (elem, dfa);
2565 tree = create_tree (dfa, tree, elem, CONCAT);
2566 if (BE (elem == NULL || tree == NULL, 0))
2567 goto parse_dup_op_espace;
2568 }
2569
2570 if (start == end)
2571 return tree;
2572
2573 /* Duplicate ELEM before it is marked optional. */
2574 elem = duplicate_tree (elem, dfa);
2575 old_tree = tree;
2576 }
2577 else
2578 old_tree = NULL;
2579
2580 if (elem->token.type == SUBEXP)
2581 postorder (elem, mark_opt_subexp, (void *) (intptr_t) elem->token.opr.idx);
2582
2583 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2584 if (BE (tree == NULL, 0))
2585 goto parse_dup_op_espace;
2586
2587 /* This loop is actually executed only when end != -1,
2588 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2589 already created the start+1-th copy. */
2590 for (i = start + 2; i <= end; ++i)
2591 {
2592 elem = duplicate_tree (elem, dfa);
2593 tree = create_tree (dfa, tree, elem, CONCAT);
2594 if (BE (elem == NULL || tree == NULL, 0))
2595 goto parse_dup_op_espace;
2596
2597 tree = create_tree (dfa, tree, NULL, OP_ALT);
2598 if (BE (tree == NULL, 0))
2599 goto parse_dup_op_espace;
2600 }
2601
2602 if (old_tree)
2603 tree = create_tree (dfa, old_tree, tree, CONCAT);
2604
2605 return tree;
2606
2607 parse_dup_op_espace:
2608 *err = REG_ESPACE;
2609 return NULL;
2610}
2611
2612/* Size of the names for collating symbol/equivalence_class/character_class.
2613 I'm not sure, but maybe enough. */
2614#define BRACKET_NAME_BUF_SIZE 32
2615
2616#ifndef _LIBC
2617 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2618 Build the range expression which starts from START_ELEM, and ends
2619 at END_ELEM. The result are written to MBCSET and SBCSET.
2620 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2621 mbcset->range_ends, is a pointer argument since we may
2622 update it. */
2623
2624static reg_errcode_t
2625internal_function
2626# ifdef RE_ENABLE_I18N
2627build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2628 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2629# else /* not RE_ENABLE_I18N */
2630build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2631 bracket_elem_t *end_elem)
2632# endif /* not RE_ENABLE_I18N */
2633{
2634 unsigned int start_ch, end_ch;
2635 /* Equivalence Classes and Character Classes can't be a range start/end. */
2636 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2637 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2638 0))
2639 return REG_ERANGE;
2640
2641 /* We can handle no multi character collating elements without libc
2642 support. */
2643 if (BE ((start_elem->type == COLL_SYM
2644 && strlen ((char *) start_elem->opr.name) > 1)
2645 || (end_elem->type == COLL_SYM
2646 && strlen ((char *) end_elem->opr.name) > 1), 0))
2647 return REG_ECOLLATE;
2648
2649# ifdef RE_ENABLE_I18N
2650 {
2651 wchar_t wc;
2652 wint_t start_wc;
2653 wint_t end_wc;
2654 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2655
2656 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2657 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2658 : 0));
2659 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2660 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2661 : 0));
2662#ifdef GAWK
2663 /*
2664 * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2665 * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2666 * unsigned, so we don't have sign extension problems.
2667 */
2668 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2669 ? start_ch : start_elem->opr.wch);
2670 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2671 ? end_ch : end_elem->opr.wch);
2672#else
2673 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2674 ? __btowc (start_ch) : start_elem->opr.wch);
2675 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2676 ? __btowc (end_ch) : end_elem->opr.wch);
2677#endif
2678 if (start_wc == WEOF || end_wc == WEOF)
2679 return REG_ECOLLATE;
2680 cmp_buf[0] = start_wc;
2681 cmp_buf[4] = end_wc;
2682 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2683 return REG_ERANGE;
2684
2685 /* Got valid collation sequence values, add them as a new entry.
2686 However, for !_LIBC we have no collation elements: if the
2687 character set is single byte, the single byte character set
2688 that we build below suffices. parse_bracket_exp passes
2689 no MBCSET if dfa->mb_cur_max == 1. */
2690 if (mbcset)
2691 {
2692 /* Check the space of the arrays. */
2693 if (BE (*range_alloc == mbcset->nranges, 0))
2694 {
2695 /* There is not enough space, need realloc. */
2696 wchar_t *new_array_start, *new_array_end;
2697 int new_nranges;
2698
2699 /* +1 in case of mbcset->nranges is 0. */
2700 new_nranges = 2 * mbcset->nranges + 1;
2701 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2702 are NULL if *range_alloc == 0. */
2703 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2704 new_nranges);
2705 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2706 new_nranges);
2707
2708 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2709 return REG_ESPACE;
2710
2711 mbcset->range_starts = new_array_start;
2712 mbcset->range_ends = new_array_end;
2713 *range_alloc = new_nranges;
2714 }
2715
2716 mbcset->range_starts[mbcset->nranges] = start_wc;
2717 mbcset->range_ends[mbcset->nranges++] = end_wc;
2718 }
2719
2720 /* Build the table for single byte characters. */
2721 for (wc = 0; wc < SBC_MAX; ++wc)
2722 {
2723 cmp_buf[2] = wc;
2724 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2725 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2726 bitset_set (sbcset, wc);
2727 }
2728 }
2729# else /* not RE_ENABLE_I18N */
2730 {
2731 unsigned int ch;
2732 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2733 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2734 : 0));
2735 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2736 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2737 : 0));
2738 if (start_ch > end_ch)
2739 return REG_ERANGE;
2740 /* Build the table for single byte characters. */
2741 for (ch = 0; ch < SBC_MAX; ++ch)
2742 if (start_ch <= ch && ch <= end_ch)
2743 bitset_set (sbcset, ch);
2744 }
2745# endif /* not RE_ENABLE_I18N */
2746 return REG_NOERROR;
2747}
2748#endif /* not _LIBC */
2749
2750#ifndef _LIBC
2751/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2752 Build the collating element which is represented by NAME.
2753 The result are written to MBCSET and SBCSET.
2754 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2755 pointer argument since we may update it. */
2756
2757static reg_errcode_t
2758internal_function
2759# ifdef RE_ENABLE_I18N
2760build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2761 int *coll_sym_alloc, const unsigned char *name)
2762# else /* not RE_ENABLE_I18N */
2763build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2764# endif /* not RE_ENABLE_I18N */
2765{
2766 size_t name_len = strlen ((const char *) name);
2767 if (BE (name_len != 1, 0))
2768 return REG_ECOLLATE;
2769 else
2770 {
2771 bitset_set (sbcset, name[0]);
2772 return REG_NOERROR;
2773 }
2774}
2775#endif /* not _LIBC */
2776
2777/* This function parse bracket expression like "[abc]", "[a-c]",
2778 "[[.a-a.]]" etc. */
2779
2780static bin_tree_t *
2781parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2782 reg_syntax_t syntax, reg_errcode_t *err)
2783{
2784#ifdef _LIBC
2785 const unsigned char *collseqmb;
2786 const char *collseqwc;
2787 uint32_t nrules;
2788 int32_t table_size;
2789 const int32_t *symb_table;
2790 const unsigned char *extra;
2791
2792 /* Local function for parse_bracket_exp used in _LIBC environment.
2793 Seek the collating symbol entry correspondings to NAME.
2794 Return the index of the symbol in the SYMB_TABLE. */
2795
2796 auto inline int32_t
2797 __attribute ((always_inline))
2798 seek_collating_symbol_entry (name, name_len)
2799 const unsigned char *name;
2800 size_t name_len;
2801 {
2802 int32_t hash = elem_hash ((const char *) name, name_len);
2803 int32_t elem = hash % table_size;
2804 if (symb_table[2 * elem] != 0)
2805 {
2806 int32_t second = hash % (table_size - 2) + 1;
2807
2808 do
2809 {
2810 /* First compare the hashing value. */
2811 if (symb_table[2 * elem] == hash
2812 /* Compare the length of the name. */
2813 && name_len == extra[symb_table[2 * elem + 1]]
2814 /* Compare the name. */
2815 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2816 name_len) == 0)
2817 {
2818 /* Yep, this is the entry. */
2819 break;
2820 }
2821
2822 /* Next entry. */
2823 elem += second;
2824 }
2825 while (symb_table[2 * elem] != 0);
2826 }
2827 return elem;
2828 }
2829
2830 /* Local function for parse_bracket_exp used in _LIBC environment.
2831 Look up the collation sequence value of BR_ELEM.
2832 Return the value if succeeded, UINT_MAX otherwise. */
2833
2834 auto inline unsigned int
2835 __attribute ((always_inline))
2836 lookup_collation_sequence_value (br_elem)
2837 bracket_elem_t *br_elem;
2838 {
2839 if (br_elem->type == SB_CHAR)
2840 {
2841 /*
2842 if (MB_CUR_MAX == 1)
2843 */
2844 if (nrules == 0)
2845 return collseqmb[br_elem->opr.ch];
2846 else
2847 {
2848 wint_t wc = __btowc (br_elem->opr.ch);
2849 return __collseq_table_lookup (collseqwc, wc);
2850 }
2851 }
2852 else if (br_elem->type == MB_CHAR)
2853 {
2854 if (nrules != 0)
2855 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2856 }
2857 else if (br_elem->type == COLL_SYM)
2858 {
2859 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2860 if (nrules != 0)
2861 {
2862 int32_t elem, idx;
2863 elem = seek_collating_symbol_entry (br_elem->opr.name,
2864 sym_name_len);
2865 if (symb_table[2 * elem] != 0)
2866 {
2867 /* We found the entry. */
2868 idx = symb_table[2 * elem + 1];
2869 /* Skip the name of collating element name. */
2870 idx += 1 + extra[idx];
2871 /* Skip the byte sequence of the collating element. */
2872 idx += 1 + extra[idx];
2873 /* Adjust for the alignment. */
2874 idx = (idx + 3) & ~3;
2875 /* Skip the multibyte collation sequence value. */
2876 idx += sizeof (unsigned int);
2877 /* Skip the wide char sequence of the collating element. */
2878 idx += sizeof (unsigned int) *
2879 (1 + *(unsigned int *) (extra + idx));
2880 /* Return the collation sequence value. */
2881 return *(unsigned int *) (extra + idx);
2882 }
2883 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2884 {
2885 /* No valid character. Match it as a single byte
2886 character. */
2887 return collseqmb[br_elem->opr.name[0]];
2888 }
2889 }
2890 else if (sym_name_len == 1)
2891 return collseqmb[br_elem->opr.name[0]];
2892 }
2893 return UINT_MAX;
2894 }
2895
2896 /* Local function for parse_bracket_exp used in _LIBC environment.
2897 Build the range expression which starts from START_ELEM, and ends
2898 at END_ELEM. The result are written to MBCSET and SBCSET.
2899 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2900 mbcset->range_ends, is a pointer argument since we may
2901 update it. */
2902
2903 auto inline reg_errcode_t
2904 __attribute ((always_inline))
2905 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2906 re_charset_t *mbcset;
2907 int *range_alloc;
2908 bitset_t sbcset;
2909 bracket_elem_t *start_elem, *end_elem;
2910 {
2911 unsigned int ch;
2912 uint32_t start_collseq;
2913 uint32_t end_collseq;
2914
2915 /* Equivalence Classes and Character Classes can't be a range
2916 start/end. */
2917 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2918 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2919 0))
2920 return REG_ERANGE;
2921
2922 start_collseq = lookup_collation_sequence_value (start_elem);
2923 end_collseq = lookup_collation_sequence_value (end_elem);
2924 /* Check start/end collation sequence values. */
2925 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2926 return REG_ECOLLATE;
2927 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2928 return REG_ERANGE;
2929
2930 /* Got valid collation sequence values, add them as a new entry.
2931 However, if we have no collation elements, and the character set
2932 is single byte, the single byte character set that we
2933 build below suffices. */
2934 if (nrules > 0 || dfa->mb_cur_max > 1)
2935 {
2936 /* Check the space of the arrays. */
2937 if (BE (*range_alloc == mbcset->nranges, 0))
2938 {
2939 /* There is not enough space, need realloc. */
2940 uint32_t *new_array_start;
2941 uint32_t *new_array_end;
2942 int new_nranges;
2943
2944 /* +1 in case of mbcset->nranges is 0. */
2945 new_nranges = 2 * mbcset->nranges + 1;
2946 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2947 new_nranges);
2948 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2949 new_nranges);
2950
2951 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2952 return REG_ESPACE;
2953
2954 mbcset->range_starts = new_array_start;
2955 mbcset->range_ends = new_array_end;
2956 *range_alloc = new_nranges;
2957 }
2958
2959 mbcset->range_starts[mbcset->nranges] = start_collseq;
2960 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2961 }
2962
2963 /* Build the table for single byte characters. */
2964 for (ch = 0; ch < SBC_MAX; ch++)
2965 {
2966 uint32_t ch_collseq;
2967 /*
2968 if (MB_CUR_MAX == 1)
2969 */
2970 if (nrules == 0)
2971 ch_collseq = collseqmb[ch];
2972 else
2973 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2974 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2975 bitset_set (sbcset, ch);
2976 }
2977 return REG_NOERROR;
2978 }
2979
2980 /* Local function for parse_bracket_exp used in _LIBC environment.
2981 Build the collating element which is represented by NAME.
2982 The result are written to MBCSET and SBCSET.
2983 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2984 pointer argument since we may update it. */
2985
2986 auto inline reg_errcode_t
2987 __attribute ((always_inline))
2988 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2989 re_charset_t *mbcset;
2990 int *coll_sym_alloc;
2991 bitset_t sbcset;
2992 const unsigned char *name;
2993 {
2994 int32_t elem, idx;
2995 size_t name_len = strlen ((const char *) name);
2996 if (nrules != 0)
2997 {
2998 elem = seek_collating_symbol_entry (name, name_len);
2999 if (symb_table[2 * elem] != 0)
3000 {
3001 /* We found the entry. */
3002 idx = symb_table[2 * elem + 1];
3003 /* Skip the name of collating element name. */
3004 idx += 1 + extra[idx];
3005 }
3006 else if (symb_table[2 * elem] == 0 && name_len == 1)
3007 {
3008 /* No valid character, treat it as a normal
3009 character. */
3010 bitset_set (sbcset, name[0]);
3011 return REG_NOERROR;
3012 }
3013 else
3014 return REG_ECOLLATE;
3015
3016 /* Got valid collation sequence, add it as a new entry. */
3017 /* Check the space of the arrays. */
3018 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3019 {
3020 /* Not enough, realloc it. */
3021 /* +1 in case of mbcset->ncoll_syms is 0. */
3022 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3023 /* Use realloc since mbcset->coll_syms is NULL
3024 if *alloc == 0. */
3025 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3026 new_coll_sym_alloc);
3027 if (BE (new_coll_syms == NULL, 0))
3028 return REG_ESPACE;
3029 mbcset->coll_syms = new_coll_syms;
3030 *coll_sym_alloc = new_coll_sym_alloc;
3031 }
3032 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3033 return REG_NOERROR;
3034 }
3035 else
3036 {
3037 if (BE (name_len != 1, 0))
3038 return REG_ECOLLATE;
3039 else
3040 {
3041 bitset_set (sbcset, name[0]);
3042 return REG_NOERROR;
3043 }
3044 }
3045 }
3046#endif
3047
3048 re_token_t br_token;
3049 re_bitset_ptr_t sbcset;
3050#ifdef RE_ENABLE_I18N
3051 re_charset_t *mbcset;
3052 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3053 int equiv_class_alloc = 0, char_class_alloc = 0;
3054#endif /* not RE_ENABLE_I18N */
3055 int non_match = 0;
3056 bin_tree_t *work_tree;
3057 int token_len;
3058 int first_round = 1;
3059#ifdef _LIBC
3060 collseqmb = (const unsigned char *)
3061 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3062 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3063 if (nrules)
3064 {
3065 /*
3066 if (MB_CUR_MAX > 1)
3067 */
3068 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3069 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3070 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3071 _NL_COLLATE_SYMB_TABLEMB);
3072 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3073 _NL_COLLATE_SYMB_EXTRAMB);
3074 }
3075#endif
3076 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3077#ifdef RE_ENABLE_I18N
3078 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3079#endif /* RE_ENABLE_I18N */
3080#ifdef RE_ENABLE_I18N
3081 if (BE (sbcset == NULL || mbcset == NULL, 0))
3082#else
3083 if (BE (sbcset == NULL, 0))
3084#endif /* RE_ENABLE_I18N */
3085 {
3086 *err = REG_ESPACE;
3087 return NULL;
3088 }
3089
3090 token_len = peek_token_bracket (token, regexp, syntax);
3091 if (BE (token->type == END_OF_RE, 0))
3092 {
3093 *err = REG_BADPAT;
3094 goto parse_bracket_exp_free_return;
3095 }
3096 if (token->type == OP_NON_MATCH_LIST)
3097 {
3098#ifdef RE_ENABLE_I18N
3099 mbcset->non_match = 1;
3100#endif /* not RE_ENABLE_I18N */
3101 non_match = 1;
3102 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3103 bitset_set (sbcset, '\n');
3104 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3105 token_len = peek_token_bracket (token, regexp, syntax);
3106 if (BE (token->type == END_OF_RE, 0))
3107 {
3108 *err = REG_BADPAT;
3109 goto parse_bracket_exp_free_return;
3110 }
3111 }
3112
3113 /* We treat the first ']' as a normal character. */
3114 if (token->type == OP_CLOSE_BRACKET)
3115 token->type = CHARACTER;
3116
3117 while (1)
3118 {
3119 bracket_elem_t start_elem, end_elem;
3120 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3121 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3122 reg_errcode_t ret;
3123 int token_len2 = 0, is_range_exp = 0;
3124 re_token_t token2;
3125
3126 start_elem.opr.name = start_name_buf;
3127 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3128 syntax, first_round);
3129 if (BE (ret != REG_NOERROR, 0))
3130 {
3131 *err = ret;
3132 goto parse_bracket_exp_free_return;
3133 }
3134 first_round = 0;
3135
3136 /* Get information about the next token. We need it in any case. */
3137 token_len = peek_token_bracket (token, regexp, syntax);
3138
3139 /* Do not check for ranges if we know they are not allowed. */
3140 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3141 {
3142 if (BE (token->type == END_OF_RE, 0))
3143 {
3144 *err = REG_EBRACK;
3145 goto parse_bracket_exp_free_return;
3146 }
3147 if (token->type == OP_CHARSET_RANGE)
3148 {
3149 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3150 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3151 if (BE (token2.type == END_OF_RE, 0))
3152 {
3153 *err = REG_EBRACK;
3154 goto parse_bracket_exp_free_return;
3155 }
3156 if (token2.type == OP_CLOSE_BRACKET)
3157 {
3158 /* We treat the last '-' as a normal character. */
3159 re_string_skip_bytes (regexp, -token_len);
3160 token->type = CHARACTER;
3161 }
3162 else
3163 is_range_exp = 1;
3164 }
3165 }
3166
3167 if (is_range_exp == 1)
3168 {
3169 end_elem.opr.name = end_name_buf;
3170 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3171 dfa, syntax, 1);
3172 if (BE (ret != REG_NOERROR, 0))
3173 {
3174 *err = ret;
3175 goto parse_bracket_exp_free_return;
3176 }
3177
3178 token_len = peek_token_bracket (token, regexp, syntax);
3179
3180#ifdef _LIBC
3181 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3182 &start_elem, &end_elem);
3183#else
3184# ifdef RE_ENABLE_I18N
3185 *err = build_range_exp (sbcset,
3186 dfa->mb_cur_max > 1 ? mbcset : NULL,
3187 &range_alloc, &start_elem, &end_elem);
3188# else
3189 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3190# endif
3191#endif /* RE_ENABLE_I18N */
3192 if (BE (*err != REG_NOERROR, 0))
3193 goto parse_bracket_exp_free_return;
3194 }
3195 else
3196 {
3197 switch (start_elem.type)
3198 {
3199 case SB_CHAR:
3200 bitset_set (sbcset, start_elem.opr.ch);
3201 break;
3202#ifdef RE_ENABLE_I18N
3203 case MB_CHAR:
3204 /* Check whether the array has enough space. */
3205 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3206 {
3207 wchar_t *new_mbchars;
3208 /* Not enough, realloc it. */
3209 /* +1 in case of mbcset->nmbchars is 0. */
3210 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3211 /* Use realloc since array is NULL if *alloc == 0. */
3212 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3213 mbchar_alloc);
3214 if (BE (new_mbchars == NULL, 0))
3215 goto parse_bracket_exp_espace;
3216 mbcset->mbchars = new_mbchars;
3217 }
3218 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3219 break;
3220#endif /* RE_ENABLE_I18N */
3221 case EQUIV_CLASS:
3222 *err = build_equiv_class (sbcset,
3223#ifdef RE_ENABLE_I18N
3224 mbcset, &equiv_class_alloc,
3225#endif /* RE_ENABLE_I18N */
3226 start_elem.opr.name);
3227 if (BE (*err != REG_NOERROR, 0))
3228 goto parse_bracket_exp_free_return;
3229 break;
3230 case COLL_SYM:
3231 *err = build_collating_symbol (sbcset,
3232#ifdef RE_ENABLE_I18N
3233 mbcset, &coll_sym_alloc,
3234#endif /* RE_ENABLE_I18N */
3235 start_elem.opr.name);
3236 if (BE (*err != REG_NOERROR, 0))
3237 goto parse_bracket_exp_free_return;
3238 break;
3239 case CHAR_CLASS:
3240 *err = build_charclass (regexp->trans, sbcset,
3241#ifdef RE_ENABLE_I18N
3242 mbcset, &char_class_alloc,
3243#endif /* RE_ENABLE_I18N */
3244 (const char *) start_elem.opr.name, syntax);
3245 if (BE (*err != REG_NOERROR, 0))
3246 goto parse_bracket_exp_free_return;
3247 break;
3248 default:
3249 assert (0);
3250 break;
3251 }
3252 }
3253 if (BE (token->type == END_OF_RE, 0))
3254 {
3255 *err = REG_EBRACK;
3256 goto parse_bracket_exp_free_return;
3257 }
3258 if (token->type == OP_CLOSE_BRACKET)
3259 break;
3260 }
3261
3262 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3263
3264 /* If it is non-matching list. */
3265 if (non_match)
3266 bitset_not (sbcset);
3267
3268#ifdef RE_ENABLE_I18N
3269 /* Ensure only single byte characters are set. */
3270 if (dfa->mb_cur_max > 1)
3271 bitset_mask (sbcset, dfa->sb_char);
3272
3273 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3274 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3275 || mbcset->non_match)))
3276 {
3277 bin_tree_t *mbc_tree;
3278 int sbc_idx;
3279 /* Build a tree for complex bracket. */
3280 dfa->has_mb_node = 1;
3281 br_token.type = COMPLEX_BRACKET;
3282 br_token.opr.mbcset = mbcset;
3283 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3284 if (BE (mbc_tree == NULL, 0))
3285 goto parse_bracket_exp_espace;
3286 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3287 if (sbcset[sbc_idx])
3288 break;
3289 /* If there are no bits set in sbcset, there is no point
3290 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3291 if (sbc_idx < BITSET_WORDS)
3292 {
3293 /* Build a tree for simple bracket. */
3294 br_token.type = SIMPLE_BRACKET;
3295 br_token.opr.sbcset = sbcset;
3296 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3297 if (BE (work_tree == NULL, 0))
3298 goto parse_bracket_exp_espace;
3299
3300 /* Then join them by ALT node. */
3301 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3302 if (BE (work_tree == NULL, 0))
3303 goto parse_bracket_exp_espace;
3304 }
3305 else
3306 {
3307 re_free (sbcset);
3308 work_tree = mbc_tree;
3309 }
3310 }
3311 else
3312#endif /* not RE_ENABLE_I18N */
3313 {
3314#ifdef RE_ENABLE_I18N
3315 free_charset (mbcset);
3316#endif
3317 /* Build a tree for simple bracket. */
3318 br_token.type = SIMPLE_BRACKET;
3319 br_token.opr.sbcset = sbcset;
3320 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3321 if (BE (work_tree == NULL, 0))
3322 goto parse_bracket_exp_espace;
3323 }
3324 return work_tree;
3325
3326 parse_bracket_exp_espace:
3327 *err = REG_ESPACE;
3328 parse_bracket_exp_free_return:
3329 re_free (sbcset);
3330#ifdef RE_ENABLE_I18N
3331 free_charset (mbcset);
3332#endif /* RE_ENABLE_I18N */
3333 return NULL;
3334}
3335
3336/* Parse an element in the bracket expression. */
3337
3338static reg_errcode_t
3339parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3340 re_token_t *token, int token_len,
3341 UNUSED_PARAM re_dfa_t *dfa, reg_syntax_t syntax,
3342 int accept_hyphen)
3343{
3344#ifdef RE_ENABLE_I18N
3345 int cur_char_size;
3346 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3347 if (cur_char_size > 1)
3348 {
3349 elem->type = MB_CHAR;
3350 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3351 re_string_skip_bytes (regexp, cur_char_size);
3352 return REG_NOERROR;
3353 }
3354#endif /* RE_ENABLE_I18N */
3355 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3356 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3357 || token->type == OP_OPEN_EQUIV_CLASS)
3358 return parse_bracket_symbol (elem, regexp, token);
3359 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3360 {
3361 /* A '-' must only appear as anything but a range indicator before
3362 the closing bracket. Everything else is an error. */
3363 re_token_t token2;
3364 (void) peek_token_bracket (&token2, regexp, syntax);
3365 if (token2.type != OP_CLOSE_BRACKET)
3366 /* The actual error value is not standardized since this whole
3367 case is undefined. But ERANGE makes good sense. */
3368 return REG_ERANGE;
3369 }
3370 elem->type = SB_CHAR;
3371 elem->opr.ch = token->opr.c;
3372 return REG_NOERROR;
3373}
3374
3375/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3376 such as [:<character_class>:], [.<collating_element>.], and
3377 [=<equivalent_class>=]. */
3378
3379static reg_errcode_t
3380parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3381 re_token_t *token)
3382{
3383 unsigned char ch, delim = token->opr.c;
3384 int i = 0;
3385 if (re_string_eoi(regexp))
3386 return REG_EBRACK;
3387 for (;; ++i)
3388 {
3389 if (i >= BRACKET_NAME_BUF_SIZE)
3390 return REG_EBRACK;
3391 if (token->type == OP_OPEN_CHAR_CLASS)
3392 ch = re_string_fetch_byte_case (regexp);
3393 else
3394 ch = re_string_fetch_byte (regexp);
3395 if (re_string_eoi(regexp))
3396 return REG_EBRACK;
3397 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3398 break;
3399 elem->opr.name[i] = ch;
3400 }
3401 re_string_skip_bytes (regexp, 1);
3402 elem->opr.name[i] = '\0';
3403 switch (token->type)
3404 {
3405 case OP_OPEN_COLL_ELEM:
3406 elem->type = COLL_SYM;
3407 break;
3408 case OP_OPEN_EQUIV_CLASS:
3409 elem->type = EQUIV_CLASS;
3410 break;
3411 case OP_OPEN_CHAR_CLASS:
3412 elem->type = CHAR_CLASS;
3413 break;
3414 default:
3415 break;
3416 }
3417 return REG_NOERROR;
3418}
3419
3420 /* Helper function for parse_bracket_exp.
3421 Build the equivalence class which is represented by NAME.
3422 The result are written to MBCSET and SBCSET.
3423 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3424 is a pointer argument since we may update it. */
3425
3426static reg_errcode_t
3427#ifdef RE_ENABLE_I18N
3428build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3429 int *equiv_class_alloc, const unsigned char *name)
3430#else /* not RE_ENABLE_I18N */
3431build_equiv_class (bitset_t sbcset, const unsigned char *name)
3432#endif /* not RE_ENABLE_I18N */
3433{
3434#ifdef _LIBC
3435 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3436 if (nrules != 0)
3437 {
3438 const int32_t *table, *indirect;
3439 const unsigned char *weights, *extra, *cp;
3440 unsigned char char_buf[2];
3441 int32_t idx1, idx2;
3442 unsigned int ch;
3443 size_t len;
3444 /* This #include defines a local function! */
3445# include <locale/weight.h>
3446 /* Calculate the index for equivalence class. */
3447 cp = name;
3448 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3449 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3450 _NL_COLLATE_WEIGHTMB);
3451 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3452 _NL_COLLATE_EXTRAMB);
3453 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3454 _NL_COLLATE_INDIRECTMB);
3455 idx1 = findidx (&cp);
3456 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3457 /* This isn't a valid character. */
3458 return REG_ECOLLATE;
3459
3460 /* Build single byte matcing table for this equivalence class. */
3461 char_buf[1] = (unsigned char) '\0';
3462 len = weights[idx1 & 0xffffff];
3463 for (ch = 0; ch < SBC_MAX; ++ch)
3464 {
3465 char_buf[0] = ch;
3466 cp = char_buf;
3467 idx2 = findidx (&cp);
3468/*
3469 idx2 = table[ch];
3470*/
3471 if (idx2 == 0)
3472 /* This isn't a valid character. */
3473 continue;
3474 /* Compare only if the length matches and the collation rule
3475 index is the same. */
3476 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3477 {
3478 int cnt = 0;
3479
3480 while (cnt <= len &&
3481 weights[(idx1 & 0xffffff) + 1 + cnt]
3482 == weights[(idx2 & 0xffffff) + 1 + cnt])
3483 ++cnt;
3484
3485 if (cnt > len)
3486 bitset_set (sbcset, ch);
3487 }
3488 }
3489 /* Check whether the array has enough space. */
3490 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3491 {
3492 /* Not enough, realloc it. */
3493 /* +1 in case of mbcset->nequiv_classes is 0. */
3494 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3495 /* Use realloc since the array is NULL if *alloc == 0. */
3496 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3497 int32_t,
3498 new_equiv_class_alloc);
3499 if (BE (new_equiv_classes == NULL, 0))
3500 return REG_ESPACE;
3501 mbcset->equiv_classes = new_equiv_classes;
3502 *equiv_class_alloc = new_equiv_class_alloc;
3503 }
3504 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3505 }
3506 else
3507#endif /* _LIBC */
3508 {
3509 if (BE (strlen ((const char *) name) != 1, 0))
3510 return REG_ECOLLATE;
3511 bitset_set (sbcset, *name);
3512 }
3513 return REG_NOERROR;
3514}
3515
3516 /* Helper function for parse_bracket_exp.
3517 Build the character class which is represented by NAME.
3518 The result are written to MBCSET and SBCSET.
3519 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3520 is a pointer argument since we may update it. */
3521
3522static reg_errcode_t
3523#ifdef RE_ENABLE_I18N
3524build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3525 re_charset_t *mbcset, int *char_class_alloc,
3526 const char *class_name, reg_syntax_t syntax)
3527#else /* not RE_ENABLE_I18N */
3528build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3529 const char *class_name, reg_syntax_t syntax)
3530#endif /* not RE_ENABLE_I18N */
3531{
3532 int i;
3533
3534 /* In case of REG_ICASE "upper" and "lower" match the both of
3535 upper and lower cases. */
3536 if ((syntax & RE_ICASE)
3537 && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3538 class_name = "alpha";
3539
3540#ifdef RE_ENABLE_I18N
3541 /* Check the space of the arrays. */
3542 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3543 {
3544 /* Not enough, realloc it. */
3545 /* +1 in case of mbcset->nchar_classes is 0. */
3546 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3547 /* Use realloc since array is NULL if *alloc == 0. */
3548 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3549 new_char_class_alloc);
3550 if (BE (new_char_classes == NULL, 0))
3551 return REG_ESPACE;
3552 mbcset->char_classes = new_char_classes;
3553 *char_class_alloc = new_char_class_alloc;
3554 }
3555 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3556#endif /* RE_ENABLE_I18N */
3557
3558#define BUILD_CHARCLASS_LOOP(ctype_func) \
3559 do { \
3560 if (BE (trans != NULL, 0)) \
3561 { \
3562 for (i = 0; i < SBC_MAX; ++i) \
3563 if (ctype_func (i)) \
3564 bitset_set (sbcset, trans[i]); \
3565 } \
3566 else \
3567 { \
3568 for (i = 0; i < SBC_MAX; ++i) \
3569 if (ctype_func (i)) \
3570 bitset_set (sbcset, i); \
3571 } \
3572 } while (0)
3573
3574 if (strcmp (class_name, "alnum") == 0)
3575 BUILD_CHARCLASS_LOOP (isalnum);
3576 else if (strcmp (class_name, "cntrl") == 0)
3577 BUILD_CHARCLASS_LOOP (iscntrl);
3578 else if (strcmp (class_name, "lower") == 0)
3579 BUILD_CHARCLASS_LOOP (islower);
3580 else if (strcmp (class_name, "space") == 0)
3581 BUILD_CHARCLASS_LOOP (isspace);
3582 else if (strcmp (class_name, "alpha") == 0)
3583 BUILD_CHARCLASS_LOOP (isalpha);
3584 else if (strcmp (class_name, "digit") == 0)
3585 BUILD_CHARCLASS_LOOP (isdigit);
3586 else if (strcmp (class_name, "print") == 0)
3587 BUILD_CHARCLASS_LOOP (isprint);
3588 else if (strcmp (class_name, "upper") == 0)
3589 BUILD_CHARCLASS_LOOP (isupper);
3590 else if (strcmp (class_name, "blank") == 0)
3591#ifndef GAWK
3592 BUILD_CHARCLASS_LOOP (isblank);
3593#else
3594 /* see comments above */
3595 BUILD_CHARCLASS_LOOP (is_blank);
3596#endif
3597 else if (strcmp (class_name, "graph") == 0)
3598 BUILD_CHARCLASS_LOOP (isgraph);
3599 else if (strcmp (class_name, "punct") == 0)
3600 BUILD_CHARCLASS_LOOP (ispunct);
3601 else if (strcmp (class_name, "xdigit") == 0)
3602 BUILD_CHARCLASS_LOOP (isxdigit);
3603 else
3604 return REG_ECTYPE;
3605
3606 return REG_NOERROR;
3607}
3608
3609static bin_tree_t *
3610build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3611 const char *class_name,
3612 const char *extra, int non_match,
3613 reg_errcode_t *err)
3614{
3615 re_bitset_ptr_t sbcset;
3616#ifdef RE_ENABLE_I18N
3617 re_charset_t *mbcset;
3618 int alloc = 0;
3619#endif /* not RE_ENABLE_I18N */
3620 reg_errcode_t ret;
3621 re_token_t br_token;
3622 bin_tree_t *tree;
3623
3624 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3625#ifdef RE_ENABLE_I18N
3626 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3627#endif /* RE_ENABLE_I18N */
3628
3629#ifdef RE_ENABLE_I18N
3630 if (BE (sbcset == NULL || mbcset == NULL, 0))
3631#else /* not RE_ENABLE_I18N */
3632 if (BE (sbcset == NULL, 0))
3633#endif /* not RE_ENABLE_I18N */
3634 {
3635 *err = REG_ESPACE;
3636 return NULL;
3637 }
3638
3639 if (non_match)
3640 {
3641#ifdef RE_ENABLE_I18N
3642 mbcset->non_match = 1;
3643#endif /* not RE_ENABLE_I18N */
3644 }
3645
3646 /* We don't care the syntax in this case. */
3647 ret = build_charclass (trans, sbcset,
3648#ifdef RE_ENABLE_I18N
3649 mbcset, &alloc,
3650#endif /* RE_ENABLE_I18N */
3651 class_name, 0);
3652
3653 if (BE (ret != REG_NOERROR, 0))
3654 {
3655 re_free (sbcset);
3656#ifdef RE_ENABLE_I18N
3657 free_charset (mbcset);
3658#endif /* RE_ENABLE_I18N */
3659 *err = ret;
3660 return NULL;
3661 }
3662 /* \w match '_' also. */
3663 for (; *extra; extra++)
3664 bitset_set (sbcset, *extra);
3665
3666 /* If it is non-matching list. */
3667 if (non_match)
3668 bitset_not (sbcset);
3669
3670#ifdef RE_ENABLE_I18N
3671 /* Ensure only single byte characters are set. */
3672 if (dfa->mb_cur_max > 1)
3673 bitset_mask (sbcset, dfa->sb_char);
3674#endif
3675
3676 /* Build a tree for simple bracket. */
3677 br_token.type = SIMPLE_BRACKET;
3678 br_token.opr.sbcset = sbcset;
3679 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3680 if (BE (tree == NULL, 0))
3681 goto build_word_op_espace;
3682
3683#ifdef RE_ENABLE_I18N
3684 if (dfa->mb_cur_max > 1)
3685 {
3686 bin_tree_t *mbc_tree;
3687 /* Build a tree for complex bracket. */
3688 br_token.type = COMPLEX_BRACKET;
3689 br_token.opr.mbcset = mbcset;
3690 dfa->has_mb_node = 1;
3691 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3692 if (BE (mbc_tree == NULL, 0))
3693 goto build_word_op_espace;
3694 /* Then join them by ALT node. */
3695 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3696 if (BE (mbc_tree != NULL, 1))
3697 return tree;
3698 }
3699 else
3700 {
3701 free_charset (mbcset);
3702 return tree;
3703 }
3704#else /* not RE_ENABLE_I18N */
3705 return tree;
3706#endif /* not RE_ENABLE_I18N */
3707
3708 build_word_op_espace:
3709 re_free (sbcset);
3710#ifdef RE_ENABLE_I18N
3711 free_charset (mbcset);
3712#endif /* RE_ENABLE_I18N */
3713 *err = REG_ESPACE;
3714 return NULL;
3715}
3716
3717/* This is intended for the expressions like "a{1,3}".
3718 Fetch a number from `input', and return the number.
3719 Return -1, if the number field is empty like "{,1}".
3720 Return -2, if an error has occurred. */
3721
3722static int
3723fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3724{
3725 int num = -1;
3726 unsigned char c;
3727 while (1)
3728 {
3729 fetch_token (token, input, syntax);
3730 c = token->opr.c;
3731 if (BE (token->type == END_OF_RE, 0))
3732 return -2;
3733 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3734 break;
3735 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3736 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3737 num = (num > RE_DUP_MAX) ? -2 : num;
3738 }
3739 return num;
3740}
3741
3742#ifdef RE_ENABLE_I18N
3743static void
3744free_charset (re_charset_t *cset)
3745{
3746 re_free (cset->mbchars);
3747# ifdef _LIBC
3748 re_free (cset->coll_syms);
3749 re_free (cset->equiv_classes);
3750 re_free (cset->range_starts);
3751 re_free (cset->range_ends);
3752# endif
3753 re_free (cset->char_classes);
3754 re_free (cset);
3755}
3756#endif /* RE_ENABLE_I18N */
3757
3758/* Functions for binary tree operation. */
3759
3760/* Create a tree node. */
3761
3762static bin_tree_t *
3763create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3764 re_token_type_t type)
3765{
3766 re_token_t t;
3767 t.type = type;
3768 return create_token_tree (dfa, left, right, &t);
3769}
3770
3771static bin_tree_t *
3772create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3773 const re_token_t *token)
3774{
3775 bin_tree_t *tree;
3776 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3777 {
3778 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3779
3780 if (storage == NULL)
3781 return NULL;
3782 storage->next = dfa->str_tree_storage;
3783 dfa->str_tree_storage = storage;
3784 dfa->str_tree_storage_idx = 0;
3785 }
3786 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3787
3788 tree->parent = NULL;
3789 tree->left = left;
3790 tree->right = right;
3791 tree->token = *token;
3792 tree->token.duplicated = 0;
3793 tree->token.opt_subexp = 0;
3794 tree->first = NULL;
3795 tree->next = NULL;
3796 tree->node_idx = -1;
3797
3798 if (left != NULL)
3799 left->parent = tree;
3800 if (right != NULL)
3801 right->parent = tree;
3802 return tree;
3803}
3804
3805/* Mark the tree SRC as an optional subexpression.
3806 To be called from preorder or postorder. */
3807
3808static reg_errcode_t
3809mark_opt_subexp (void *extra, bin_tree_t *node)
3810{
3811 int idx = (int) (intptr_t) extra;
3812 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3813 node->token.opt_subexp = 1;
3814
3815 return REG_NOERROR;
3816}
3817
3818/* Free the allocated memory inside NODE. */
3819
3820static void
3821free_token (re_token_t *node)
3822{
3823#ifdef RE_ENABLE_I18N
3824 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3825 free_charset (node->opr.mbcset);
3826 else
3827#endif /* RE_ENABLE_I18N */
3828 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3829 re_free (node->opr.sbcset);
3830}
3831
3832/* Worker function for tree walking. Free the allocated memory inside NODE
3833 and its children. */
3834
3835static reg_errcode_t
3836free_tree (UNUSED_PARAM void *extra, bin_tree_t *node)
3837{
3838 free_token (&node->token);
3839 return REG_NOERROR;
3840}
3841
3842
3843/* Duplicate the node SRC, and return new node. This is a preorder
3844 visit similar to the one implemented by the generic visitor, but
3845 we need more infrastructure to maintain two parallel trees --- so,
3846 it's easier to duplicate. */
3847
3848static bin_tree_t *
3849duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3850{
3851 const bin_tree_t *node;
3852 bin_tree_t *dup_root;
3853 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3854
3855 for (node = root; ; )
3856 {
3857 /* Create a new tree and link it back to the current parent. */
3858 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3859 if (*p_new == NULL)
3860 return NULL;
3861 (*p_new)->parent = dup_node;
3862 (*p_new)->token.duplicated = 1;
3863 dup_node = *p_new;
3864
3865 /* Go to the left node, or up and to the right. */
3866 if (node->left)
3867 {
3868 node = node->left;
3869 p_new = &dup_node->left;
3870 }
3871 else
3872 {
3873 const bin_tree_t *prev = NULL;
3874 while (node->right == prev || node->right == NULL)
3875 {
3876 prev = node;
3877 node = node->parent;
3878 dup_node = dup_node->parent;
3879 if (!node)
3880 return dup_root;
3881 }
3882 node = node->right;
3883 p_new = &dup_node->right;
3884 }
3885 }
3886}