aboutsummaryrefslogtreecommitdiff
path: root/win32/regcomp.c
diff options
context:
space:
mode:
Diffstat (limited to 'win32/regcomp.c')
-rw-r--r--win32/regcomp.c3936
1 files changed, 3936 insertions, 0 deletions
diff --git a/win32/regcomp.c b/win32/regcomp.c
new file mode 100644
index 000000000..e1692d341
--- /dev/null
+++ b/win32/regcomp.c
@@ -0,0 +1,3936 @@
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA. */
20
21#include "match_class.h"
22
23#define UNUSED_PARAM __attribute__ ((__unused__))
24
25static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
26 size_t length, reg_syntax_t syntax);
27static void re_compile_fastmap_iter (regex_t *bufp,
28 const re_dfastate_t *init_state,
29 char *fastmap);
30static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
31#ifdef RE_ENABLE_I18N
32static void free_charset (re_charset_t *cset);
33#endif /* RE_ENABLE_I18N */
34static void free_workarea_compile (regex_t *preg);
35static reg_errcode_t create_initial_state (re_dfa_t *dfa);
36#ifdef RE_ENABLE_I18N
37static void optimize_utf8 (re_dfa_t *dfa);
38#endif
39static reg_errcode_t analyze (regex_t *preg);
40static reg_errcode_t preorder (bin_tree_t *root,
41 reg_errcode_t (fn (void *, bin_tree_t *)),
42 void *extra);
43static reg_errcode_t postorder (bin_tree_t *root,
44 reg_errcode_t (fn (void *, bin_tree_t *)),
45 void *extra);
46static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
47static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
48static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
49 bin_tree_t *node);
50static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
51static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
52static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
53static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
54static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
55 unsigned int constraint);
56static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
57static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
58 int node, int root);
59static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
60static int fetch_number (re_string_t *input, re_token_t *token,
61 reg_syntax_t syntax);
62static int peek_token (re_token_t *token, re_string_t *input,
63 reg_syntax_t syntax) internal_function;
64static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
65 reg_syntax_t syntax, reg_errcode_t *err);
66static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
67 re_token_t *token, reg_syntax_t syntax,
68 int nest, reg_errcode_t *err);
69static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
70 re_token_t *token, reg_syntax_t syntax,
71 int nest, reg_errcode_t *err);
72static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
73 re_token_t *token, reg_syntax_t syntax,
74 int nest, reg_errcode_t *err);
75static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
76 re_token_t *token, reg_syntax_t syntax,
77 int nest, reg_errcode_t *err);
78static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
79 re_dfa_t *dfa, re_token_t *token,
80 reg_syntax_t syntax, reg_errcode_t *err);
81static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
82 re_token_t *token, reg_syntax_t syntax,
83 reg_errcode_t *err);
84static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
85 re_string_t *regexp,
86 re_token_t *token, int token_len,
87 re_dfa_t *dfa,
88 reg_syntax_t syntax,
89 int accept_hyphen);
90static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
91 re_string_t *regexp,
92 re_token_t *token);
93#ifdef RE_ENABLE_I18N
94static reg_errcode_t build_equiv_class (bitset_t sbcset,
95 re_charset_t *mbcset,
96 int *equiv_class_alloc,
97 const unsigned char *name);
98static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
99 bitset_t sbcset,
100 re_charset_t *mbcset,
101 int *char_class_alloc,
102 const char *class_name,
103 reg_syntax_t syntax);
104#else /* not RE_ENABLE_I18N */
105static reg_errcode_t build_equiv_class (bitset_t sbcset,
106 const unsigned char *name);
107static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
108 bitset_t sbcset,
109 const char *class_name,
110 reg_syntax_t syntax);
111#endif /* not RE_ENABLE_I18N */
112static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
113 RE_TRANSLATE_TYPE trans,
114 const char *class_name,
115 const char *extra,
116 int non_match, reg_errcode_t *err);
117static bin_tree_t *create_tree (re_dfa_t *dfa,
118 bin_tree_t *left, bin_tree_t *right,
119 re_token_type_t type);
120static bin_tree_t *create_token_tree (re_dfa_t *dfa,
121 bin_tree_t *left, bin_tree_t *right,
122 const re_token_t *token);
123static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
124static void free_token (re_token_t *node);
125static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
126static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
127
128/* This table gives an error message for each of the error codes listed
129 in regex.h. Obviously the order here has to be same as there.
130 POSIX doesn't require that we do anything for REG_NOERROR,
131 but why not be nice? */
132
133const char __re_error_msgid[] attribute_hidden =
134 {
135#define REG_NOERROR_IDX 0
136 gettext_noop ("Success") /* REG_NOERROR */
137 "\0"
138#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
139 gettext_noop ("No match") /* REG_NOMATCH */
140 "\0"
141#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
142 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
143 "\0"
144#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
145 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
146 "\0"
147#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
148 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
149 "\0"
150#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
151 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
152 "\0"
153#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
154 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
155 "\0"
156#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
157 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
158 "\0"
159#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
160 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
161 "\0"
162#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
163 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
164 "\0"
165#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
166 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
167 "\0"
168#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
169 gettext_noop ("Invalid range end") /* REG_ERANGE */
170 "\0"
171#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
172 gettext_noop ("Memory exhausted") /* REG_ESPACE */
173 "\0"
174#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
175 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
176 "\0"
177#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
178 gettext_noop ("Premature end of regular expression") /* REG_EEND */
179 "\0"
180#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
181 gettext_noop ("Regular expression too big") /* REG_ESIZE */
182 "\0"
183#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
184 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
185 };
186
187const size_t __re_error_msgid_idx[] attribute_hidden =
188 {
189 REG_NOERROR_IDX,
190 REG_NOMATCH_IDX,
191 REG_BADPAT_IDX,
192 REG_ECOLLATE_IDX,
193 REG_ECTYPE_IDX,
194 REG_EESCAPE_IDX,
195 REG_ESUBREG_IDX,
196 REG_EBRACK_IDX,
197 REG_EPAREN_IDX,
198 REG_EBRACE_IDX,
199 REG_BADBR_IDX,
200 REG_ERANGE_IDX,
201 REG_ESPACE_IDX,
202 REG_BADRPT_IDX,
203 REG_EEND_IDX,
204 REG_ESIZE_IDX,
205 REG_ERPAREN_IDX
206 };
207
208/* Entry points for GNU code. */
209
210
211#ifdef ZOS_USS
212
213/* For ZOS USS we must define btowc */
214
215wchar_t
216btowc (int c)
217{
218 wchar_t wtmp[2];
219 char tmp[2];
220
221 tmp[0] = c;
222 tmp[1] = 0;
223
224 mbtowc (wtmp, tmp, 1);
225 return wtmp[0];
226}
227#endif
228
229/* re_compile_pattern is the GNU regular expression compiler: it
230 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
231 Returns 0 if the pattern was valid, otherwise an error string.
232
233 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
234 are set in BUFP on entry. */
235
236const char *
237re_compile_pattern (const char *pattern,
238 size_t length,
239 struct re_pattern_buffer *bufp)
240{
241 reg_errcode_t ret;
242
243 /* And GNU code determines whether or not to get register information
244 by passing null for the REGS argument to re_match, etc., not by
245 setting no_sub, unless RE_NO_SUB is set. */
246 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
247
248 /* Match anchors at newline. */
249 bufp->newline_anchor = 1;
250
251 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
252
253 if (!ret)
254 return NULL;
255 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
256}
257#ifdef _LIBC
258weak_alias (__re_compile_pattern, re_compile_pattern)
259#endif
260
261/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
262 also be assigned to arbitrarily: each pattern buffer stores its own
263 syntax, so it can be changed between regex compilations. */
264/* This has no initializer because initialized variables in Emacs
265 become read-only after dumping. */
266reg_syntax_t re_syntax_options;
267
268
269/* Specify the precise syntax of regexps for compilation. This provides
270 for compatibility for various utilities which historically have
271 different, incompatible syntaxes.
272
273 The argument SYNTAX is a bit mask comprised of the various bits
274 defined in regex.h. We return the old syntax. */
275
276reg_syntax_t
277re_set_syntax (reg_syntax_t syntax)
278{
279 reg_syntax_t ret = re_syntax_options;
280
281 re_syntax_options = syntax;
282 return ret;
283}
284#ifdef _LIBC
285weak_alias (__re_set_syntax, re_set_syntax)
286#endif
287
288int
289re_compile_fastmap (struct re_pattern_buffer *bufp)
290{
291 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
292 char *fastmap = bufp->fastmap;
293
294 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
295 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
296 if (dfa->init_state != dfa->init_state_word)
297 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
298 if (dfa->init_state != dfa->init_state_nl)
299 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
300 if (dfa->init_state != dfa->init_state_begbuf)
301 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
302 bufp->fastmap_accurate = 1;
303 return 0;
304}
305#ifdef _LIBC
306weak_alias (__re_compile_fastmap, re_compile_fastmap)
307#endif
308
309static inline void
310__attribute ((always_inline))
311re_set_fastmap (char *fastmap, int icase, int ch)
312{
313 fastmap[ch] = 1;
314 if (icase)
315 fastmap[tolower (ch)] = 1;
316}
317
318/* Helper function for re_compile_fastmap.
319 Compile fastmap for the initial_state INIT_STATE. */
320
321static void
322re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
323 char *fastmap)
324{
325 volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
326 int node_cnt;
327 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
328 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
329 {
330 int node = init_state->nodes.elems[node_cnt];
331 re_token_type_t type = dfa->nodes[node].type;
332
333 if (type == CHARACTER)
334 {
335 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
336#ifdef RE_ENABLE_I18N
337 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
338 {
339 unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
340 wchar_t wc;
341 mbstate_t state;
342
343 p = buf;
344 *p++ = dfa->nodes[node].opr.c;
345 while (++node < dfa->nodes_len
346 && dfa->nodes[node].type == CHARACTER
347 && dfa->nodes[node].mb_partial)
348 *p++ = dfa->nodes[node].opr.c;
349 memset (&state, '\0', sizeof (state));
350 if (__mbrtowc (&wc, (const char *) buf, p - buf,
351 &state) == p - buf
352 && (__wcrtomb ((char *) buf, towlower (wc), &state)
353 != (size_t) -1))
354 re_set_fastmap (fastmap, 0, buf[0]);
355 re_free (buf);
356 }
357#endif
358 }
359 else if (type == SIMPLE_BRACKET)
360 {
361 int i, ch;
362 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
363 {
364 int j;
365 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
366 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
367 if (w & ((bitset_word_t) 1 << j))
368 re_set_fastmap (fastmap, icase, ch);
369 }
370 }
371#ifdef RE_ENABLE_I18N
372 else if (type == COMPLEX_BRACKET)
373 {
374 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
375 int i;
376
377# ifdef _LIBC
378 /* See if we have to try all bytes which start multiple collation
379 elements.
380 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
381 collation element, and don't catch 'b' since 'b' is
382 the only collation element which starts from 'b' (and
383 it is caught by SIMPLE_BRACKET). */
384 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
385 && (cset->ncoll_syms || cset->nranges))
386 {
387 const int32_t *table = (const int32_t *)
388 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
389 for (i = 0; i < SBC_MAX; ++i)
390 if (table[i] < 0)
391 re_set_fastmap (fastmap, icase, i);
392 }
393# endif /* _LIBC */
394
395 /* See if we have to start the match at all multibyte characters,
396 i.e. where we would not find an invalid sequence. This only
397 applies to multibyte character sets; for single byte character
398 sets, the SIMPLE_BRACKET again suffices. */
399 if (dfa->mb_cur_max > 1
400 && (cset->nchar_classes || cset->non_match || cset->nranges
401# ifdef _LIBC
402 || cset->nequiv_classes
403# endif /* _LIBC */
404 ))
405 {
406 unsigned char c = 0;
407 do
408 {
409 mbstate_t mbs;
410 memset (&mbs, 0, sizeof (mbs));
411 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
412 re_set_fastmap (fastmap, false, (int) c);
413 }
414 while (++c != 0);
415 }
416
417 else
418 {
419 /* ... Else catch all bytes which can start the mbchars. */
420 for (i = 0; i < cset->nmbchars; ++i)
421 {
422 char buf[256];
423 mbstate_t state;
424 memset (&state, '\0', sizeof (state));
425 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
426 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
427 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
428 {
429 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
430 != (size_t) -1)
431 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
432 }
433 }
434 }
435 }
436#endif /* RE_ENABLE_I18N */
437 else if (type == OP_PERIOD
438#ifdef RE_ENABLE_I18N
439 || type == OP_UTF8_PERIOD
440#endif /* RE_ENABLE_I18N */
441 || type == END_OF_RE)
442 {
443 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
444 if (type == END_OF_RE)
445 bufp->can_be_null = 1;
446 return;
447 }
448 }
449}
450
451/* Entry point for POSIX code. */
452/* regcomp takes a regular expression as a string and compiles it.
453
454 PREG is a regex_t *. We do not expect any fields to be initialized,
455 since POSIX says we shouldn't. Thus, we set
456
457 `buffer' to the compiled pattern;
458 `used' to the length of the compiled pattern;
459 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
460 REG_EXTENDED bit in CFLAGS is set; otherwise, to
461 RE_SYNTAX_POSIX_BASIC;
462 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
463 `fastmap' to an allocated space for the fastmap;
464 `fastmap_accurate' to zero;
465 `re_nsub' to the number of subexpressions in PATTERN.
466
467 PATTERN is the address of the pattern string.
468
469 CFLAGS is a series of bits which affect compilation.
470
471 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
472 use POSIX basic syntax.
473
474 If REG_NEWLINE is set, then . and [^...] don't match newline.
475 Also, regexec will try a match beginning after every newline.
476
477 If REG_ICASE is set, then we considers upper- and lowercase
478 versions of letters to be equivalent when matching.
479
480 If REG_NOSUB is set, then when PREG is passed to regexec, that
481 routine will report only success or failure, and nothing about the
482 registers.
483
484 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
485 the return codes and their meanings.) */
486
487int
488regcomp (regex_t *__restrict preg,
489 const char *__restrict pattern,
490 int cflags)
491{
492 reg_errcode_t ret;
493 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
494 : RE_SYNTAX_POSIX_BASIC);
495
496 preg->buffer = NULL;
497 preg->allocated = 0;
498 preg->used = 0;
499
500 /* Try to allocate space for the fastmap. */
501 preg->fastmap = re_malloc (char, SBC_MAX);
502 if (BE (preg->fastmap == NULL, 0))
503 return REG_ESPACE;
504
505 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
506
507 /* If REG_NEWLINE is set, newlines are treated differently. */
508 if (cflags & REG_NEWLINE)
509 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
510 syntax &= ~RE_DOT_NEWLINE;
511 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
512 /* It also changes the matching behavior. */
513 preg->newline_anchor = 1;
514 }
515 else
516 preg->newline_anchor = 0;
517 preg->no_sub = !!(cflags & REG_NOSUB);
518 preg->translate = NULL;
519
520 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
521
522 /* POSIX doesn't distinguish between an unmatched open-group and an
523 unmatched close-group: both are REG_EPAREN. */
524 if (ret == REG_ERPAREN)
525 ret = REG_EPAREN;
526
527 /* We have already checked preg->fastmap != NULL. */
528 if (BE (ret == REG_NOERROR, 1))
529 /* Compute the fastmap now, since regexec cannot modify the pattern
530 buffer. This function never fails in this implementation. */
531 (void) re_compile_fastmap (preg);
532 else
533 {
534 /* Some error occurred while compiling the expression. */
535 re_free (preg->fastmap);
536 preg->fastmap = NULL;
537 }
538
539 return (int) ret;
540}
541#ifdef _LIBC
542weak_alias (__regcomp, regcomp)
543#endif
544
545/* Returns a message corresponding to an error code, ERRCODE, returned
546 from either regcomp or regexec. We don't use PREG here. */
547
548size_t
549regerror(int errcode, UNUSED_PARAM const regex_t *__restrict preg,
550 char *__restrict errbuf, size_t errbuf_size)
551{
552 const char *msg;
553 size_t msg_size;
554
555 if (BE (errcode < 0
556 || errcode >= (int) (sizeof (__re_error_msgid_idx)
557 / sizeof (__re_error_msgid_idx[0])), 0))
558 /* Only error codes returned by the rest of the code should be passed
559 to this routine. If we are given anything else, or if other regex
560 code generates an invalid error code, then the program has a bug.
561 Dump core so we can fix it. */
562 abort ();
563
564 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
565
566 msg_size = strlen (msg) + 1; /* Includes the null. */
567
568 if (BE (errbuf_size != 0, 1))
569 {
570 if (BE (msg_size > errbuf_size, 0))
571 {
572 memcpy (errbuf, msg, errbuf_size - 1);
573 errbuf[errbuf_size - 1] = 0;
574 }
575 else
576 memcpy (errbuf, msg, msg_size);
577 }
578
579 return msg_size;
580}
581#ifdef _LIBC
582weak_alias (__regerror, regerror)
583#endif
584
585
586#ifdef RE_ENABLE_I18N
587/* This static array is used for the map to single-byte characters when
588 UTF-8 is used. Otherwise we would allocate memory just to initialize
589 it the same all the time. UTF-8 is the preferred encoding so this is
590 a worthwhile optimization. */
591#if __GNUC__ >= 3
592static const bitset_t utf8_sb_map = {
593 /* Set the first 128 bits. */
594 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
595};
596#else /* ! (__GNUC__ >= 3) */
597static bitset_t utf8_sb_map;
598#endif /* __GNUC__ >= 3 */
599#endif /* RE_ENABLE_I18N */
600
601
602static void
603free_dfa_content (re_dfa_t *dfa)
604{
605 int i, j;
606
607 if (dfa->nodes)
608 for (i = 0; i < dfa->nodes_len; ++i)
609 free_token (dfa->nodes + i);
610 re_free (dfa->nexts);
611 for (i = 0; i < dfa->nodes_len; ++i)
612 {
613 if (dfa->eclosures != NULL)
614 re_node_set_free (dfa->eclosures + i);
615 if (dfa->inveclosures != NULL)
616 re_node_set_free (dfa->inveclosures + i);
617 if (dfa->edests != NULL)
618 re_node_set_free (dfa->edests + i);
619 }
620 re_free (dfa->edests);
621 re_free (dfa->eclosures);
622 re_free (dfa->inveclosures);
623 re_free (dfa->nodes);
624
625 if (dfa->state_table)
626 for (i = 0; i <= dfa->state_hash_mask; ++i)
627 {
628 struct re_state_table_entry *entry = dfa->state_table + i;
629 for (j = 0; j < entry->num; ++j)
630 {
631 re_dfastate_t *state = entry->array[j];
632 free_state (state);
633 }
634 re_free (entry->array);
635 }
636 re_free (dfa->state_table);
637#ifdef RE_ENABLE_I18N
638 if (dfa->sb_char != utf8_sb_map)
639 re_free (dfa->sb_char);
640#endif
641 re_free (dfa->subexp_map);
642#ifdef DEBUG
643 re_free (dfa->re_str);
644#endif
645
646 re_free (dfa);
647}
648
649
650/* Free dynamically allocated space used by PREG. */
651
652void
653regfree (regex_t *preg)
654{
655 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
656 if (BE (dfa != NULL, 1))
657 free_dfa_content (dfa);
658 preg->buffer = NULL;
659 preg->allocated = 0;
660
661 re_free (preg->fastmap);
662 preg->fastmap = NULL;
663
664 re_free (preg->translate);
665 preg->translate = NULL;
666}
667#ifdef _LIBC
668weak_alias (__regfree, regfree)
669#endif
670
671/* Entry points compatible with 4.2 BSD regex library. We don't define
672 them unless specifically requested. */
673
674#if defined _REGEX_RE_COMP || defined _LIBC
675
676/* BSD has one and only one pattern buffer. */
677static struct re_pattern_buffer re_comp_buf;
678
679char *
680# ifdef _LIBC
681/* Make these definitions weak in libc, so POSIX programs can redefine
682 these names if they don't use our functions, and still use
683 regcomp/regexec above without link errors. */
684weak_function
685# endif
686re_comp (s)
687 const char *s;
688{
689 reg_errcode_t ret;
690 char *fastmap;
691
692 if (!s)
693 {
694 if (!re_comp_buf.buffer)
695 return gettext ("No previous regular expression");
696 return 0;
697 }
698
699 if (re_comp_buf.buffer)
700 {
701 fastmap = re_comp_buf.fastmap;
702 re_comp_buf.fastmap = NULL;
703 __regfree (&re_comp_buf);
704 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
705 re_comp_buf.fastmap = fastmap;
706 }
707
708 if (re_comp_buf.fastmap == NULL)
709 {
710 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
711 if (re_comp_buf.fastmap == NULL)
712 return (char *) gettext (__re_error_msgid
713 + __re_error_msgid_idx[(int) REG_ESPACE]);
714 }
715
716 /* Since `re_exec' always passes NULL for the `regs' argument, we
717 don't need to initialize the pattern buffer fields which affect it. */
718
719 /* Match anchors at newlines. */
720 re_comp_buf.newline_anchor = 1;
721
722 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
723
724 if (!ret)
725 return NULL;
726
727 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
728 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
729}
730
731#ifdef _LIBC
732libc_freeres_fn (free_mem)
733{
734 __regfree (&re_comp_buf);
735}
736#endif
737
738#endif /* _REGEX_RE_COMP */
739
740/* Internal entry point.
741 Compile the regular expression PATTERN, whose length is LENGTH.
742 SYNTAX indicate regular expression's syntax. */
743
744static reg_errcode_t
745re_compile_internal (regex_t *preg, const char * pattern, size_t length,
746 reg_syntax_t syntax)
747{
748 reg_errcode_t err = REG_NOERROR;
749 re_dfa_t *dfa;
750 re_string_t regexp;
751
752 /* Initialize the pattern buffer. */
753 preg->fastmap_accurate = 0;
754 preg->syntax = syntax;
755 preg->not_bol = preg->not_eol = 0;
756 preg->used = 0;
757 preg->re_nsub = 0;
758 preg->can_be_null = 0;
759 preg->regs_allocated = REGS_UNALLOCATED;
760
761 /* Initialize the dfa. */
762 dfa = (re_dfa_t *) preg->buffer;
763 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
764 {
765 /* If zero allocated, but buffer is non-null, try to realloc
766 enough space. This loses if buffer's address is bogus, but
767 that is the user's responsibility. If ->buffer is NULL this
768 is a simple allocation. */
769 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
770 if (dfa == NULL)
771 return REG_ESPACE;
772 preg->allocated = sizeof (re_dfa_t);
773 preg->buffer = (unsigned char *) dfa;
774 }
775 preg->used = sizeof (re_dfa_t);
776
777 err = init_dfa (dfa, length);
778 if (BE (err != REG_NOERROR, 0))
779 {
780 free_dfa_content (dfa);
781 preg->buffer = NULL;
782 preg->allocated = 0;
783 return err;
784 }
785#ifdef DEBUG
786 /* Note: length+1 will not overflow since it is checked in init_dfa. */
787 dfa->re_str = re_malloc (char, length + 1);
788 strncpy (dfa->re_str, pattern, length + 1);
789#endif
790
791 __libc_lock_init (dfa->lock);
792
793 err = re_string_construct (&regexp, pattern, length, preg->translate,
794 syntax & RE_ICASE, dfa);
795 if (BE (err != REG_NOERROR, 0))
796 {
797 re_compile_internal_free_return:
798 free_workarea_compile (preg);
799 re_string_destruct (&regexp);
800 free_dfa_content (dfa);
801 preg->buffer = NULL;
802 preg->allocated = 0;
803 return err;
804 }
805
806 /* Parse the regular expression, and build a structure tree. */
807 preg->re_nsub = 0;
808 dfa->str_tree = parse (&regexp, preg, syntax, &err);
809 if (BE (dfa->str_tree == NULL, 0))
810 goto re_compile_internal_free_return;
811
812 /* Analyze the tree and create the nfa. */
813 err = analyze (preg);
814 if (BE (err != REG_NOERROR, 0))
815 goto re_compile_internal_free_return;
816
817#ifdef RE_ENABLE_I18N
818 /* If possible, do searching in single byte encoding to speed things up. */
819 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
820 optimize_utf8 (dfa);
821#endif
822
823 /* Then create the initial state of the dfa. */
824 err = create_initial_state (dfa);
825
826 /* Release work areas. */
827 free_workarea_compile (preg);
828 re_string_destruct (&regexp);
829
830 if (BE (err != REG_NOERROR, 0))
831 {
832 free_dfa_content (dfa);
833 preg->buffer = NULL;
834 preg->allocated = 0;
835 }
836
837 return err;
838}
839
840/* Initialize DFA. We use the length of the regular expression PAT_LEN
841 as the initial length of some arrays. */
842
843static reg_errcode_t
844init_dfa (re_dfa_t *dfa, size_t pat_len)
845{
846 unsigned int table_size;
847#ifndef _LIBC
848 const char *codeset_name;
849#endif
850
851 memset (dfa, '\0', sizeof (re_dfa_t));
852
853 /* Force allocation of str_tree_storage the first time. */
854 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
855
856 /* Avoid overflows. */
857 if (pat_len == SIZE_MAX)
858 return REG_ESPACE;
859
860 dfa->nodes_alloc = pat_len + 1;
861 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
862
863 /* table_size = 2 ^ ceil(log pat_len) */
864 for (table_size = 1; ; table_size <<= 1)
865 if (table_size > pat_len)
866 break;
867
868 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
869 dfa->state_hash_mask = table_size - 1;
870
871 dfa->mb_cur_max = MB_CUR_MAX;
872#ifdef _LIBC
873 if (dfa->mb_cur_max == 6
874 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
875 dfa->is_utf8 = 1;
876 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
877 != 0);
878#else
879# ifdef HAVE_LANGINFO_CODESET
880 codeset_name = nl_langinfo (CODESET);
881# else
882 codeset_name = getenv ("LC_ALL");
883 if (codeset_name == NULL || codeset_name[0] == '\0')
884 codeset_name = getenv ("LC_CTYPE");
885 if (codeset_name == NULL || codeset_name[0] == '\0')
886 codeset_name = getenv ("LANG");
887 if (codeset_name == NULL)
888 codeset_name = "";
889 else if (strchr (codeset_name, '.') != NULL)
890 codeset_name = strchr (codeset_name, '.') + 1;
891# endif
892
893 /* strcasecmp isn't a standard interface. brute force check */
894#if 0
895 if (strcasecmp (codeset_name, "UTF-8") == 0
896 || strcasecmp (codeset_name, "UTF8") == 0)
897 dfa->is_utf8 = 1;
898#else
899 if ( (codeset_name[0] == 'U' || codeset_name[0] == 'u')
900 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
901 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
902 && (codeset_name[3] == '-'
903 ? codeset_name[4] == '8' && codeset_name[5] == '\0'
904 : codeset_name[3] == '8' && codeset_name[4] == '\0'))
905 dfa->is_utf8 = 1;
906#endif
907
908 /* We check exhaustively in the loop below if this charset is a
909 superset of ASCII. */
910 dfa->map_notascii = 0;
911#endif
912
913#ifdef RE_ENABLE_I18N
914 if (dfa->mb_cur_max > 1)
915 {
916 if (dfa->is_utf8)
917 {
918#if !defined(__GNUC__) || __GNUC__ < 3
919 static short utf8_sb_map_inited = 0;
920
921 if (! utf8_sb_map_inited)
922 {
923 int i;
924
925 utf8_sb_map_inited = 0;
926 for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
927 utf8_sb_map[i] = BITSET_WORD_MAX;
928 }
929#endif
930 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
931 }
932 else
933 {
934 int i, j, ch;
935
936 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
937 if (BE (dfa->sb_char == NULL, 0))
938 return REG_ESPACE;
939
940 /* Set the bits corresponding to single byte chars. */
941 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
942 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
943 {
944 wint_t wch = __btowc (ch);
945 if (wch != WEOF)
946 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
947# ifndef _LIBC
948 if (isascii (ch) && wch != ch)
949 dfa->map_notascii = 1;
950# endif
951 }
952 }
953 }
954#endif
955
956 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
957 return REG_ESPACE;
958 return REG_NOERROR;
959}
960
961/* Initialize WORD_CHAR table, which indicate which character is
962 "word". In this case "word" means that it is the word construction
963 character used by some operators like "\<", "\>", etc. */
964
965static void
966internal_function
967init_word_char (re_dfa_t *dfa)
968{
969 int i, j, ch;
970 dfa->word_ops_used = 1;
971 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
972 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
973 if (isalnum (ch) || ch == '_')
974 dfa->word_char[i] |= (bitset_word_t) 1 << j;
975}
976
977/* Free the work area which are only used while compiling. */
978
979static void
980free_workarea_compile (regex_t *preg)
981{
982 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
983 bin_tree_storage_t *storage, *next;
984 for (storage = dfa->str_tree_storage; storage; storage = next)
985 {
986 next = storage->next;
987 re_free (storage);
988 }
989 dfa->str_tree_storage = NULL;
990 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
991 dfa->str_tree = NULL;
992 re_free (dfa->org_indices);
993 dfa->org_indices = NULL;
994}
995
996/* Create initial states for all contexts. */
997
998static reg_errcode_t
999create_initial_state (re_dfa_t *dfa)
1000{
1001 int first, i;
1002 reg_errcode_t err;
1003 re_node_set init_nodes;
1004
1005 /* Initial states have the epsilon closure of the node which is
1006 the first node of the regular expression. */
1007 first = dfa->str_tree->first->node_idx;
1008 dfa->init_node = first;
1009 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1010 if (BE (err != REG_NOERROR, 0))
1011 return err;
1012
1013 /* The back-references which are in initial states can epsilon transit,
1014 since in this case all of the subexpressions can be null.
1015 Then we add epsilon closures of the nodes which are the next nodes of
1016 the back-references. */
1017 if (dfa->nbackref > 0)
1018 for (i = 0; i < init_nodes.nelem; ++i)
1019 {
1020 int node_idx = init_nodes.elems[i];
1021 re_token_type_t type = dfa->nodes[node_idx].type;
1022
1023 int clexp_idx;
1024 if (type != OP_BACK_REF)
1025 continue;
1026 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1027 {
1028 re_token_t *clexp_node;
1029 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1030 if (clexp_node->type == OP_CLOSE_SUBEXP
1031 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1032 break;
1033 }
1034 if (clexp_idx == init_nodes.nelem)
1035 continue;
1036
1037 if (type == OP_BACK_REF)
1038 {
1039 int dest_idx = dfa->edests[node_idx].elems[0];
1040 if (!re_node_set_contains (&init_nodes, dest_idx))
1041 {
1042 err = re_node_set_merge (&init_nodes,
1043 dfa->eclosures + dest_idx);
1044 if (err != REG_NOERROR)
1045 return err;
1046 i = 0;
1047 }
1048 }
1049 }
1050
1051 /* It must be the first time to invoke acquire_state. */
1052 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1053 /* We don't check ERR here, since the initial state must not be NULL. */
1054 if (BE (dfa->init_state == NULL, 0))
1055 return err;
1056 if (dfa->init_state->has_constraint)
1057 {
1058 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1059 CONTEXT_WORD);
1060 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1061 CONTEXT_NEWLINE);
1062 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1063 &init_nodes,
1064 CONTEXT_NEWLINE
1065 | CONTEXT_BEGBUF);
1066 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1067 || dfa->init_state_begbuf == NULL, 0))
1068 return err;
1069 }
1070 else
1071 dfa->init_state_word = dfa->init_state_nl
1072 = dfa->init_state_begbuf = dfa->init_state;
1073
1074 re_node_set_free (&init_nodes);
1075 return REG_NOERROR;
1076}
1077
1078#ifdef RE_ENABLE_I18N
1079/* If it is possible to do searching in single byte encoding instead of UTF-8
1080 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1081 DFA nodes where needed. */
1082
1083static void
1084optimize_utf8 (re_dfa_t *dfa)
1085{
1086 int node, i, mb_chars = 0, has_period = 0;
1087
1088 for (node = 0; node < dfa->nodes_len; ++node)
1089 switch (dfa->nodes[node].type)
1090 {
1091 case CHARACTER:
1092 if (dfa->nodes[node].opr.c >= 0x80)
1093 mb_chars = 1;
1094 break;
1095 case ANCHOR:
1096 switch (dfa->nodes[node].opr.ctx_type)
1097 {
1098 case LINE_FIRST:
1099 case LINE_LAST:
1100 case BUF_FIRST:
1101 case BUF_LAST:
1102 break;
1103 default:
1104 /* Word anchors etc. cannot be handled. It's okay to test
1105 opr.ctx_type since constraints (for all DFA nodes) are
1106 created by ORing one or more opr.ctx_type values. */
1107 return;
1108 }
1109 break;
1110 case OP_PERIOD:
1111 has_period = 1;
1112 break;
1113 case OP_BACK_REF:
1114 case OP_ALT:
1115 case END_OF_RE:
1116 case OP_DUP_ASTERISK:
1117 case OP_OPEN_SUBEXP:
1118 case OP_CLOSE_SUBEXP:
1119 break;
1120 case COMPLEX_BRACKET:
1121 return;
1122 case SIMPLE_BRACKET:
1123 /* Just double check. The non-ASCII range starts at 0x80. */
1124 assert (0x80 % BITSET_WORD_BITS == 0);
1125 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1126 if (dfa->nodes[node].opr.sbcset[i])
1127 return;
1128 break;
1129 default:
1130 abort ();
1131 }
1132
1133 if (mb_chars || has_period)
1134 for (node = 0; node < dfa->nodes_len; ++node)
1135 {
1136 if (dfa->nodes[node].type == CHARACTER
1137 && dfa->nodes[node].opr.c >= 0x80)
1138 dfa->nodes[node].mb_partial = 0;
1139 else if (dfa->nodes[node].type == OP_PERIOD)
1140 dfa->nodes[node].type = OP_UTF8_PERIOD;
1141 }
1142
1143 /* The search can be in single byte locale. */
1144 dfa->mb_cur_max = 1;
1145 dfa->is_utf8 = 0;
1146 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1147}
1148#endif
1149
1150/* Analyze the structure tree, and calculate "first", "next", "edest",
1151 "eclosure", and "inveclosure". */
1152
1153static reg_errcode_t
1154analyze (regex_t *preg)
1155{
1156 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1157 reg_errcode_t ret;
1158
1159 /* Allocate arrays. */
1160 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1161 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1162 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1163 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1164 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1165 || dfa->eclosures == NULL, 0))
1166 return REG_ESPACE;
1167
1168 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1169 if (dfa->subexp_map != NULL)
1170 {
1171 int i;
1172 for (i = 0; i < preg->re_nsub; i++)
1173 dfa->subexp_map[i] = i;
1174 preorder (dfa->str_tree, optimize_subexps, dfa);
1175 for (i = 0; i < preg->re_nsub; i++)
1176 if (dfa->subexp_map[i] != i)
1177 break;
1178 if (i == preg->re_nsub)
1179 {
1180 free (dfa->subexp_map);
1181 dfa->subexp_map = NULL;
1182 }
1183 }
1184
1185 ret = postorder (dfa->str_tree, lower_subexps, preg);
1186 if (BE (ret != REG_NOERROR, 0))
1187 return ret;
1188 ret = postorder (dfa->str_tree, calc_first, dfa);
1189 if (BE (ret != REG_NOERROR, 0))
1190 return ret;
1191 preorder (dfa->str_tree, calc_next, dfa);
1192 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1193 if (BE (ret != REG_NOERROR, 0))
1194 return ret;
1195 ret = calc_eclosure (dfa);
1196 if (BE (ret != REG_NOERROR, 0))
1197 return ret;
1198
1199 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1200 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1201 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1202 || dfa->nbackref)
1203 {
1204 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1205 if (BE (dfa->inveclosures == NULL, 0))
1206 return REG_ESPACE;
1207 ret = calc_inveclosure (dfa);
1208 }
1209
1210 return ret;
1211}
1212
1213/* Our parse trees are very unbalanced, so we cannot use a stack to
1214 implement parse tree visits. Instead, we use parent pointers and
1215 some hairy code in these two functions. */
1216static reg_errcode_t
1217postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1218 void *extra)
1219{
1220 bin_tree_t *node, *prev;
1221
1222 for (node = root; ; )
1223 {
1224 /* Descend down the tree, preferably to the left (or to the right
1225 if that's the only child). */
1226 while (node->left || node->right)
1227 if (node->left)
1228 node = node->left;
1229 else
1230 node = node->right;
1231
1232 do
1233 {
1234 reg_errcode_t err = fn (extra, node);
1235 if (BE (err != REG_NOERROR, 0))
1236 return err;
1237 if (node->parent == NULL)
1238 return REG_NOERROR;
1239 prev = node;
1240 node = node->parent;
1241 }
1242 /* Go up while we have a node that is reached from the right. */
1243 while (node->right == prev || node->right == NULL);
1244 node = node->right;
1245 }
1246}
1247
1248static reg_errcode_t
1249preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1250 void *extra)
1251{
1252 bin_tree_t *node;
1253
1254 for (node = root; ; )
1255 {
1256 reg_errcode_t err = fn (extra, node);
1257 if (BE (err != REG_NOERROR, 0))
1258 return err;
1259
1260 /* Go to the left node, or up and to the right. */
1261 if (node->left)
1262 node = node->left;
1263 else
1264 {
1265 bin_tree_t *prev = NULL;
1266 while (node->right == prev || node->right == NULL)
1267 {
1268 prev = node;
1269 node = node->parent;
1270 if (!node)
1271 return REG_NOERROR;
1272 }
1273 node = node->right;
1274 }
1275 }
1276}
1277
1278/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1279 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1280 backreferences as well. Requires a preorder visit. */
1281static reg_errcode_t
1282optimize_subexps (void *extra, bin_tree_t *node)
1283{
1284 re_dfa_t *dfa = (re_dfa_t *) extra;
1285
1286 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1287 {
1288 int idx = node->token.opr.idx;
1289 node->token.opr.idx = dfa->subexp_map[idx];
1290 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1291 }
1292
1293 else if (node->token.type == SUBEXP
1294 && node->left && node->left->token.type == SUBEXP)
1295 {
1296 int other_idx = node->left->token.opr.idx;
1297
1298 node->left = node->left->left;
1299 if (node->left)
1300 node->left->parent = node;
1301
1302 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1303 if (other_idx < BITSET_WORD_BITS)
1304 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1305 }
1306
1307 return REG_NOERROR;
1308}
1309
1310/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1311 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1312static reg_errcode_t
1313lower_subexps (void *extra, bin_tree_t *node)
1314{
1315 regex_t *preg = (regex_t *) extra;
1316 reg_errcode_t err = REG_NOERROR;
1317
1318 if (node->left && node->left->token.type == SUBEXP)
1319 {
1320 node->left = lower_subexp (&err, preg, node->left);
1321 if (node->left)
1322 node->left->parent = node;
1323 }
1324 if (node->right && node->right->token.type == SUBEXP)
1325 {
1326 node->right = lower_subexp (&err, preg, node->right);
1327 if (node->right)
1328 node->right->parent = node;
1329 }
1330
1331 return err;
1332}
1333
1334static bin_tree_t *
1335lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1336{
1337 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1338 bin_tree_t *body = node->left;
1339 bin_tree_t *op, *cls, *tree1, *tree;
1340
1341 if (preg->no_sub
1342 /* We do not optimize empty subexpressions, because otherwise we may
1343 have bad CONCAT nodes with NULL children. This is obviously not
1344 very common, so we do not lose much. An example that triggers
1345 this case is the sed "script" /\(\)/x. */
1346 && node->left != NULL
1347 && (node->token.opr.idx >= BITSET_WORD_BITS
1348 || !(dfa->used_bkref_map
1349 & ((bitset_word_t) 1 << node->token.opr.idx))))
1350 return node->left;
1351
1352 /* Convert the SUBEXP node to the concatenation of an
1353 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1354 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1355 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1356 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1357 tree = create_tree (dfa, op, tree1, CONCAT);
1358 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1359 {
1360 *err = REG_ESPACE;
1361 return NULL;
1362 }
1363
1364 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1365 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1366 return tree;
1367}
1368
1369/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1370 nodes. Requires a postorder visit. */
1371static reg_errcode_t
1372calc_first (void *extra, bin_tree_t *node)
1373{
1374 re_dfa_t *dfa = (re_dfa_t *) extra;
1375 if (node->token.type == CONCAT)
1376 {
1377 node->first = node->left->first;
1378 node->node_idx = node->left->node_idx;
1379 }
1380 else
1381 {
1382 node->first = node;
1383 node->node_idx = re_dfa_add_node (dfa, node->token);
1384 if (BE (node->node_idx == -1, 0))
1385 return REG_ESPACE;
1386 if (node->token.type == ANCHOR)
1387 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1388 }
1389 return REG_NOERROR;
1390}
1391
1392/* Pass 2: compute NEXT on the tree. Preorder visit. */
1393static reg_errcode_t
1394calc_next (UNUSED_PARAM void *extra, bin_tree_t *node)
1395{
1396 switch (node->token.type)
1397 {
1398 case OP_DUP_ASTERISK:
1399 node->left->next = node;
1400 break;
1401 case CONCAT:
1402 node->left->next = node->right->first;
1403 node->right->next = node->next;
1404 break;
1405 default:
1406 if (node->left)
1407 node->left->next = node->next;
1408 if (node->right)
1409 node->right->next = node->next;
1410 break;
1411 }
1412 return REG_NOERROR;
1413}
1414
1415/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1416static reg_errcode_t
1417link_nfa_nodes (void *extra, bin_tree_t *node)
1418{
1419 re_dfa_t *dfa = (re_dfa_t *) extra;
1420 int idx = node->node_idx;
1421 reg_errcode_t err = REG_NOERROR;
1422
1423 switch (node->token.type)
1424 {
1425 case CONCAT:
1426 break;
1427
1428 case END_OF_RE:
1429 assert (node->next == NULL);
1430 break;
1431
1432 case OP_DUP_ASTERISK:
1433 case OP_ALT:
1434 {
1435 int left, right;
1436 dfa->has_plural_match = 1;
1437 if (node->left != NULL)
1438 left = node->left->first->node_idx;
1439 else
1440 left = node->next->node_idx;
1441 if (node->right != NULL)
1442 right = node->right->first->node_idx;
1443 else
1444 right = node->next->node_idx;
1445 assert (left > -1);
1446 assert (right > -1);
1447 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1448 }
1449 break;
1450
1451 case ANCHOR:
1452 case OP_OPEN_SUBEXP:
1453 case OP_CLOSE_SUBEXP:
1454 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1455 break;
1456
1457 case OP_BACK_REF:
1458 dfa->nexts[idx] = node->next->node_idx;
1459 if (node->token.type == OP_BACK_REF)
1460 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1461 break;
1462
1463 default:
1464 assert (!IS_EPSILON_NODE (node->token.type));
1465 dfa->nexts[idx] = node->next->node_idx;
1466 break;
1467 }
1468
1469 return err;
1470}
1471
1472/* Duplicate the epsilon closure of the node ROOT_NODE.
1473 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1474 to their own constraint. */
1475
1476static reg_errcode_t
1477internal_function
1478duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1479 int root_node, unsigned int init_constraint)
1480{
1481 int org_node, clone_node, ret;
1482 unsigned int constraint = init_constraint;
1483 for (org_node = top_org_node, clone_node = top_clone_node;;)
1484 {
1485 int org_dest, clone_dest;
1486 if (dfa->nodes[org_node].type == OP_BACK_REF)
1487 {
1488 /* If the back reference epsilon-transit, its destination must
1489 also have the constraint. Then duplicate the epsilon closure
1490 of the destination of the back reference, and store it in
1491 edests of the back reference. */
1492 org_dest = dfa->nexts[org_node];
1493 re_node_set_empty (dfa->edests + clone_node);
1494 clone_dest = duplicate_node (dfa, org_dest, constraint);
1495 if (BE (clone_dest == -1, 0))
1496 return REG_ESPACE;
1497 dfa->nexts[clone_node] = dfa->nexts[org_node];
1498 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1499 if (BE (ret < 0, 0))
1500 return REG_ESPACE;
1501 }
1502 else if (dfa->edests[org_node].nelem == 0)
1503 {
1504 /* In case of the node can't epsilon-transit, don't duplicate the
1505 destination and store the original destination as the
1506 destination of the node. */
1507 dfa->nexts[clone_node] = dfa->nexts[org_node];
1508 break;
1509 }
1510 else if (dfa->edests[org_node].nelem == 1)
1511 {
1512 /* In case of the node can epsilon-transit, and it has only one
1513 destination. */
1514 org_dest = dfa->edests[org_node].elems[0];
1515 re_node_set_empty (dfa->edests + clone_node);
1516 /* If the node is root_node itself, it means the epsilon clsoure
1517 has a loop. Then tie it to the destination of the root_node. */
1518 if (org_node == root_node && clone_node != org_node)
1519 {
1520 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1521 if (BE (ret < 0, 0))
1522 return REG_ESPACE;
1523 break;
1524 }
1525 /* In case of the node has another constraint, add it. */
1526 constraint |= dfa->nodes[org_node].constraint;
1527 clone_dest = duplicate_node (dfa, org_dest, constraint);
1528 if (BE (clone_dest == -1, 0))
1529 return REG_ESPACE;
1530 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1531 if (BE (ret < 0, 0))
1532 return REG_ESPACE;
1533 }
1534 else /* dfa->edests[org_node].nelem == 2 */
1535 {
1536 /* In case of the node can epsilon-transit, and it has two
1537 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1538 org_dest = dfa->edests[org_node].elems[0];
1539 re_node_set_empty (dfa->edests + clone_node);
1540 /* Search for a duplicated node which satisfies the constraint. */
1541 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1542 if (clone_dest == -1)
1543 {
1544 /* There is no such duplicated node, create a new one. */
1545 reg_errcode_t err;
1546 clone_dest = duplicate_node (dfa, org_dest, constraint);
1547 if (BE (clone_dest == -1, 0))
1548 return REG_ESPACE;
1549 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1550 if (BE (ret < 0, 0))
1551 return REG_ESPACE;
1552 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1553 root_node, constraint);
1554 if (BE (err != REG_NOERROR, 0))
1555 return err;
1556 }
1557 else
1558 {
1559 /* There is a duplicated node which satisfies the constraint,
1560 use it to avoid infinite loop. */
1561 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1562 if (BE (ret < 0, 0))
1563 return REG_ESPACE;
1564 }
1565
1566 org_dest = dfa->edests[org_node].elems[1];
1567 clone_dest = duplicate_node (dfa, org_dest, constraint);
1568 if (BE (clone_dest == -1, 0))
1569 return REG_ESPACE;
1570 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1571 if (BE (ret < 0, 0))
1572 return REG_ESPACE;
1573 }
1574 org_node = org_dest;
1575 clone_node = clone_dest;
1576 }
1577 return REG_NOERROR;
1578}
1579
1580/* Search for a node which is duplicated from the node ORG_NODE, and
1581 satisfies the constraint CONSTRAINT. */
1582
1583static int
1584search_duplicated_node (const re_dfa_t *dfa, int org_node,
1585 unsigned int constraint)
1586{
1587 int idx;
1588 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1589 {
1590 if (org_node == dfa->org_indices[idx]
1591 && constraint == dfa->nodes[idx].constraint)
1592 return idx; /* Found. */
1593 }
1594 return -1; /* Not found. */
1595}
1596
1597/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1598 Return the index of the new node, or -1 if insufficient storage is
1599 available. */
1600
1601static int
1602duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1603{
1604 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1605 if (BE (dup_idx != -1, 1))
1606 {
1607 dfa->nodes[dup_idx].constraint = constraint;
1608 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1609 dfa->nodes[dup_idx].duplicated = 1;
1610
1611 /* Store the index of the original node. */
1612 dfa->org_indices[dup_idx] = org_idx;
1613 }
1614 return dup_idx;
1615}
1616
1617static reg_errcode_t
1618calc_inveclosure (re_dfa_t *dfa)
1619{
1620 int src, idx, ret;
1621 for (idx = 0; idx < dfa->nodes_len; ++idx)
1622 re_node_set_init_empty (dfa->inveclosures + idx);
1623
1624 for (src = 0; src < dfa->nodes_len; ++src)
1625 {
1626 int *elems = dfa->eclosures[src].elems;
1627 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1628 {
1629 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1630 if (BE (ret == -1, 0))
1631 return REG_ESPACE;
1632 }
1633 }
1634
1635 return REG_NOERROR;
1636}
1637
1638/* Calculate "eclosure" for all the node in DFA. */
1639
1640static reg_errcode_t
1641calc_eclosure (re_dfa_t *dfa)
1642{
1643 int node_idx, incomplete;
1644#ifdef DEBUG
1645 assert (dfa->nodes_len > 0);
1646#endif
1647 incomplete = 0;
1648 /* For each nodes, calculate epsilon closure. */
1649 for (node_idx = 0; ; ++node_idx)
1650 {
1651 reg_errcode_t err;
1652 re_node_set eclosure_elem;
1653 if (node_idx == dfa->nodes_len)
1654 {
1655 if (!incomplete)
1656 break;
1657 incomplete = 0;
1658 node_idx = 0;
1659 }
1660
1661#ifdef DEBUG
1662 assert (dfa->eclosures[node_idx].nelem != -1);
1663#endif
1664
1665 /* If we have already calculated, skip it. */
1666 if (dfa->eclosures[node_idx].nelem != 0)
1667 continue;
1668 /* Calculate epsilon closure of `node_idx'. */
1669 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1670 if (BE (err != REG_NOERROR, 0))
1671 return err;
1672
1673 if (dfa->eclosures[node_idx].nelem == 0)
1674 {
1675 incomplete = 1;
1676 re_node_set_free (&eclosure_elem);
1677 }
1678 }
1679 return REG_NOERROR;
1680}
1681
1682/* Calculate epsilon closure of NODE. */
1683
1684static reg_errcode_t
1685calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1686{
1687 reg_errcode_t err;
1688 int i;
1689 re_node_set eclosure;
1690 int ret;
1691 int incomplete = 0;
1692 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1693 if (BE (err != REG_NOERROR, 0))
1694 return err;
1695
1696 /* This indicates that we are calculating this node now.
1697 We reference this value to avoid infinite loop. */
1698 dfa->eclosures[node].nelem = -1;
1699
1700 /* If the current node has constraints, duplicate all nodes
1701 since they must inherit the constraints. */
1702 if (dfa->nodes[node].constraint
1703 && dfa->edests[node].nelem
1704 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1705 {
1706 err = duplicate_node_closure (dfa, node, node, node,
1707 dfa->nodes[node].constraint);
1708 if (BE (err != REG_NOERROR, 0))
1709 return err;
1710 }
1711
1712 /* Expand each epsilon destination nodes. */
1713 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1714 for (i = 0; i < dfa->edests[node].nelem; ++i)
1715 {
1716 re_node_set eclosure_elem;
1717 int edest = dfa->edests[node].elems[i];
1718 /* If calculating the epsilon closure of `edest' is in progress,
1719 return intermediate result. */
1720 if (dfa->eclosures[edest].nelem == -1)
1721 {
1722 incomplete = 1;
1723 continue;
1724 }
1725 /* If we haven't calculated the epsilon closure of `edest' yet,
1726 calculate now. Otherwise use calculated epsilon closure. */
1727 if (dfa->eclosures[edest].nelem == 0)
1728 {
1729 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1730 if (BE (err != REG_NOERROR, 0))
1731 return err;
1732 }
1733 else
1734 eclosure_elem = dfa->eclosures[edest];
1735 /* Merge the epsilon closure of `edest'. */
1736 err = re_node_set_merge (&eclosure, &eclosure_elem);
1737 if (BE (err != REG_NOERROR, 0))
1738 return err;
1739 /* If the epsilon closure of `edest' is incomplete,
1740 the epsilon closure of this node is also incomplete. */
1741 if (dfa->eclosures[edest].nelem == 0)
1742 {
1743 incomplete = 1;
1744 re_node_set_free (&eclosure_elem);
1745 }
1746 }
1747
1748 /* An epsilon closure includes itself. */
1749 ret = re_node_set_insert (&eclosure, node);
1750 if (BE (ret < 0, 0))
1751 return REG_ESPACE;
1752 if (incomplete && !root)
1753 dfa->eclosures[node].nelem = 0;
1754 else
1755 dfa->eclosures[node] = eclosure;
1756 *new_set = eclosure;
1757 return REG_NOERROR;
1758}
1759
1760/* Functions for token which are used in the parser. */
1761
1762/* Fetch a token from INPUT.
1763 We must not use this function inside bracket expressions. */
1764
1765static void
1766internal_function
1767fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1768{
1769 re_string_skip_bytes (input, peek_token (result, input, syntax));
1770}
1771
1772/* Peek a token from INPUT, and return the length of the token.
1773 We must not use this function inside bracket expressions. */
1774
1775static int
1776internal_function
1777peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1778{
1779 unsigned char c;
1780
1781 if (re_string_eoi (input))
1782 {
1783 token->type = END_OF_RE;
1784 return 0;
1785 }
1786
1787 c = re_string_peek_byte (input, 0);
1788 token->opr.c = c;
1789
1790 token->word_char = 0;
1791#ifdef RE_ENABLE_I18N
1792 token->mb_partial = 0;
1793 if (input->mb_cur_max > 1 &&
1794 !re_string_first_byte (input, re_string_cur_idx (input)))
1795 {
1796 token->type = CHARACTER;
1797 token->mb_partial = 1;
1798 return 1;
1799 }
1800#endif
1801 if (c == '\\')
1802 {
1803 unsigned char c2;
1804 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1805 {
1806 token->type = BACK_SLASH;
1807 return 1;
1808 }
1809
1810 c2 = re_string_peek_byte_case (input, 1);
1811 token->opr.c = c2;
1812 token->type = CHARACTER;
1813#ifdef RE_ENABLE_I18N
1814 if (input->mb_cur_max > 1)
1815 {
1816 wint_t wc = re_string_wchar_at (input,
1817 re_string_cur_idx (input) + 1);
1818 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1819 }
1820 else
1821#endif
1822 token->word_char = IS_WORD_CHAR (c2) != 0;
1823
1824 switch (c2)
1825 {
1826 case '|':
1827 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1828 token->type = OP_ALT;
1829 break;
1830 case '1': case '2': case '3': case '4': case '5':
1831 case '6': case '7': case '8': case '9':
1832 if (!(syntax & RE_NO_BK_REFS))
1833 {
1834 token->type = OP_BACK_REF;
1835 token->opr.idx = c2 - '1';
1836 }
1837 break;
1838 case '<':
1839 if (!(syntax & RE_NO_GNU_OPS))
1840 {
1841 token->type = ANCHOR;
1842 token->opr.ctx_type = WORD_FIRST;
1843 }
1844 break;
1845 case '>':
1846 if (!(syntax & RE_NO_GNU_OPS))
1847 {
1848 token->type = ANCHOR;
1849 token->opr.ctx_type = WORD_LAST;
1850 }
1851 break;
1852 case 'b':
1853 if (!(syntax & RE_NO_GNU_OPS))
1854 {
1855 token->type = ANCHOR;
1856 token->opr.ctx_type = WORD_DELIM;
1857 }
1858 break;
1859 case 'B':
1860 if (!(syntax & RE_NO_GNU_OPS))
1861 {
1862 token->type = ANCHOR;
1863 token->opr.ctx_type = NOT_WORD_DELIM;
1864 }
1865 break;
1866 case 'w':
1867 if (!(syntax & RE_NO_GNU_OPS))
1868 token->type = OP_WORD;
1869 break;
1870 case 'W':
1871 if (!(syntax & RE_NO_GNU_OPS))
1872 token->type = OP_NOTWORD;
1873 break;
1874 case 's':
1875 if (!(syntax & RE_NO_GNU_OPS))
1876 token->type = OP_SPACE;
1877 break;
1878 case 'S':
1879 if (!(syntax & RE_NO_GNU_OPS))
1880 token->type = OP_NOTSPACE;
1881 break;
1882 case '`':
1883 if (!(syntax & RE_NO_GNU_OPS))
1884 {
1885 token->type = ANCHOR;
1886 token->opr.ctx_type = BUF_FIRST;
1887 }
1888 break;
1889 case '\'':
1890 if (!(syntax & RE_NO_GNU_OPS))
1891 {
1892 token->type = ANCHOR;
1893 token->opr.ctx_type = BUF_LAST;
1894 }
1895 break;
1896 case '(':
1897 if (!(syntax & RE_NO_BK_PARENS))
1898 token->type = OP_OPEN_SUBEXP;
1899 break;
1900 case ')':
1901 if (!(syntax & RE_NO_BK_PARENS))
1902 token->type = OP_CLOSE_SUBEXP;
1903 break;
1904 case '+':
1905 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1906 token->type = OP_DUP_PLUS;
1907 break;
1908 case '?':
1909 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1910 token->type = OP_DUP_QUESTION;
1911 break;
1912 case '{':
1913 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1914 token->type = OP_OPEN_DUP_NUM;
1915 break;
1916 case '}':
1917 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1918 token->type = OP_CLOSE_DUP_NUM;
1919 break;
1920 default:
1921 break;
1922 }
1923 return 2;
1924 }
1925
1926 token->type = CHARACTER;
1927#ifdef RE_ENABLE_I18N
1928 if (input->mb_cur_max > 1)
1929 {
1930 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1931 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1932 }
1933 else
1934#endif
1935 token->word_char = IS_WORD_CHAR (token->opr.c);
1936
1937 switch (c)
1938 {
1939 case '\n':
1940 if (syntax & RE_NEWLINE_ALT)
1941 token->type = OP_ALT;
1942 break;
1943 case '|':
1944 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1945 token->type = OP_ALT;
1946 break;
1947 case '*':
1948 token->type = OP_DUP_ASTERISK;
1949 break;
1950 case '+':
1951 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1952 token->type = OP_DUP_PLUS;
1953 break;
1954 case '?':
1955 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1956 token->type = OP_DUP_QUESTION;
1957 break;
1958 case '{':
1959 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1960 token->type = OP_OPEN_DUP_NUM;
1961 break;
1962 case '}':
1963 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1964 token->type = OP_CLOSE_DUP_NUM;
1965 break;
1966 case '(':
1967 if (syntax & RE_NO_BK_PARENS)
1968 token->type = OP_OPEN_SUBEXP;
1969 break;
1970 case ')':
1971 if (syntax & RE_NO_BK_PARENS)
1972 token->type = OP_CLOSE_SUBEXP;
1973 break;
1974 case '[':
1975 token->type = OP_OPEN_BRACKET;
1976 break;
1977 case '.':
1978 token->type = OP_PERIOD;
1979 break;
1980 case '^':
1981 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1982 re_string_cur_idx (input) != 0)
1983 {
1984 char prev = re_string_peek_byte (input, -1);
1985 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1986 break;
1987 }
1988 token->type = ANCHOR;
1989 token->opr.ctx_type = LINE_FIRST;
1990 break;
1991 case '$':
1992 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1993 re_string_cur_idx (input) + 1 != re_string_length (input))
1994 {
1995 re_token_t next;
1996 re_string_skip_bytes (input, 1);
1997 peek_token (&next, input, syntax);
1998 re_string_skip_bytes (input, -1);
1999 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2000 break;
2001 }
2002 token->type = ANCHOR;
2003 token->opr.ctx_type = LINE_LAST;
2004 break;
2005 default:
2006 break;
2007 }
2008 return 1;
2009}
2010
2011/* Peek a token from INPUT, and return the length of the token.
2012 We must not use this function out of bracket expressions. */
2013
2014static int
2015internal_function
2016peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2017{
2018 unsigned char c;
2019 if (re_string_eoi (input))
2020 {
2021 token->type = END_OF_RE;
2022 return 0;
2023 }
2024 c = re_string_peek_byte (input, 0);
2025 token->opr.c = c;
2026
2027#ifdef RE_ENABLE_I18N
2028 if (input->mb_cur_max > 1 &&
2029 !re_string_first_byte (input, re_string_cur_idx (input)))
2030 {
2031 token->type = CHARACTER;
2032 return 1;
2033 }
2034#endif /* RE_ENABLE_I18N */
2035
2036 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2037 && re_string_cur_idx (input) + 1 < re_string_length (input))
2038 {
2039 /* In this case, '\' escape a character. */
2040 unsigned char c2;
2041 re_string_skip_bytes (input, 1);
2042 c2 = re_string_peek_byte (input, 0);
2043 token->opr.c = c2;
2044 token->type = CHARACTER;
2045 return 1;
2046 }
2047 if (c == '[') /* '[' is a special char in a bracket exps. */
2048 {
2049 unsigned char c2;
2050 int token_len;
2051 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2052 c2 = re_string_peek_byte (input, 1);
2053 else
2054 c2 = 0;
2055 token->opr.c = c2;
2056 token_len = 2;
2057 switch (c2)
2058 {
2059 case '.':
2060 token->type = OP_OPEN_COLL_ELEM;
2061 break;
2062 case '=':
2063 token->type = OP_OPEN_EQUIV_CLASS;
2064 break;
2065 case ':':
2066 if (syntax & RE_CHAR_CLASSES)
2067 {
2068 token->type = OP_OPEN_CHAR_CLASS;
2069 break;
2070 }
2071 /* else fall through. */
2072 default:
2073 token->type = CHARACTER;
2074 token->opr.c = c;
2075 token_len = 1;
2076 break;
2077 }
2078 return token_len;
2079 }
2080 switch (c)
2081 {
2082 case '-':
2083 token->type = OP_CHARSET_RANGE;
2084 break;
2085 case ']':
2086 token->type = OP_CLOSE_BRACKET;
2087 break;
2088 case '^':
2089 token->type = OP_NON_MATCH_LIST;
2090 break;
2091 default:
2092 token->type = CHARACTER;
2093 }
2094 return 1;
2095}
2096
2097/* Functions for parser. */
2098
2099/* Entry point of the parser.
2100 Parse the regular expression REGEXP and return the structure tree.
2101 If an error has occurred, ERR is set by error code, and return NULL.
2102 This function build the following tree, from regular expression <reg_exp>:
2103 CAT
2104 / \
2105 / \
2106 <reg_exp> EOR
2107
2108 CAT means concatenation.
2109 EOR means end of regular expression. */
2110
2111static bin_tree_t *
2112parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2113 reg_errcode_t *err)
2114{
2115 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2116 bin_tree_t *tree, *eor, *root;
2117 re_token_t current_token;
2118 dfa->syntax = syntax;
2119 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2120 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2121 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2122 return NULL;
2123 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2124 if (tree != NULL)
2125 root = create_tree (dfa, tree, eor, CONCAT);
2126 else
2127 root = eor;
2128 if (BE (eor == NULL || root == NULL, 0))
2129 {
2130 *err = REG_ESPACE;
2131 return NULL;
2132 }
2133 return root;
2134}
2135
2136/* This function build the following tree, from regular expression
2137 <branch1>|<branch2>:
2138 ALT
2139 / \
2140 / \
2141 <branch1> <branch2>
2142
2143 ALT means alternative, which represents the operator `|'. */
2144
2145static bin_tree_t *
2146parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2147 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2148{
2149 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2150 bin_tree_t *tree, *branch = NULL;
2151 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2152 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2153 return NULL;
2154
2155 while (token->type == OP_ALT)
2156 {
2157 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2158 if (token->type != OP_ALT && token->type != END_OF_RE
2159 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2160 {
2161 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2162 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2163 return NULL;
2164 }
2165 else
2166 branch = NULL;
2167 tree = create_tree (dfa, tree, branch, OP_ALT);
2168 if (BE (tree == NULL, 0))
2169 {
2170 *err = REG_ESPACE;
2171 return NULL;
2172 }
2173 }
2174 return tree;
2175}
2176
2177/* This function build the following tree, from regular expression
2178 <exp1><exp2>:
2179 CAT
2180 / \
2181 / \
2182 <exp1> <exp2>
2183
2184 CAT means concatenation. */
2185
2186static bin_tree_t *
2187parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2188 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2189{
2190 bin_tree_t *tree, *exp;
2191 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2192 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2193 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2194 return NULL;
2195
2196 while (token->type != OP_ALT && token->type != END_OF_RE
2197 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2198 {
2199 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2200 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2201 {
2202 return NULL;
2203 }
2204 if (tree != NULL && exp != NULL)
2205 {
2206 tree = create_tree (dfa, tree, exp, CONCAT);
2207 if (tree == NULL)
2208 {
2209 *err = REG_ESPACE;
2210 return NULL;
2211 }
2212 }
2213 else if (tree == NULL)
2214 tree = exp;
2215 /* Otherwise exp == NULL, we don't need to create new tree. */
2216 }
2217 return tree;
2218}
2219
2220/* This function build the following tree, from regular expression a*:
2221 *
2222 |
2223 a
2224*/
2225
2226static bin_tree_t *
2227parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2228 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2229{
2230 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2231 bin_tree_t *tree;
2232 switch (token->type)
2233 {
2234 case CHARACTER:
2235 tree = create_token_tree (dfa, NULL, NULL, token);
2236 if (BE (tree == NULL, 0))
2237 {
2238 *err = REG_ESPACE;
2239 return NULL;
2240 }
2241#ifdef RE_ENABLE_I18N
2242 if (dfa->mb_cur_max > 1)
2243 {
2244 while (!re_string_eoi (regexp)
2245 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2246 {
2247 bin_tree_t *mbc_remain;
2248 fetch_token (token, regexp, syntax);
2249 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2250 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2251 if (BE (mbc_remain == NULL || tree == NULL, 0))
2252 {
2253 *err = REG_ESPACE;
2254 return NULL;
2255 }
2256 }
2257 }
2258#endif
2259 break;
2260 case OP_OPEN_SUBEXP:
2261 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2262 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2263 return NULL;
2264 break;
2265 case OP_OPEN_BRACKET:
2266 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2267 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2268 return NULL;
2269 break;
2270 case OP_BACK_REF:
2271 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2272 {
2273 *err = REG_ESUBREG;
2274 return NULL;
2275 }
2276 dfa->used_bkref_map |= 1 << token->opr.idx;
2277 tree = create_token_tree (dfa, NULL, NULL, token);
2278 if (BE (tree == NULL, 0))
2279 {
2280 *err = REG_ESPACE;
2281 return NULL;
2282 }
2283 ++dfa->nbackref;
2284 dfa->has_mb_node = 1;
2285 break;
2286 case OP_OPEN_DUP_NUM:
2287 if (syntax & RE_CONTEXT_INVALID_DUP)
2288 {
2289 *err = REG_BADRPT;
2290 return NULL;
2291 }
2292 /* FALLTHROUGH */
2293 case OP_DUP_ASTERISK:
2294 case OP_DUP_PLUS:
2295 case OP_DUP_QUESTION:
2296 if (syntax & RE_CONTEXT_INVALID_OPS)
2297 {
2298 *err = REG_BADRPT;
2299 return NULL;
2300 }
2301 else if (syntax & RE_CONTEXT_INDEP_OPS)
2302 {
2303 fetch_token (token, regexp, syntax);
2304 return parse_expression (regexp, preg, token, syntax, nest, err);
2305 }
2306 /* else fall through */
2307 case OP_CLOSE_SUBEXP:
2308 if ((token->type == OP_CLOSE_SUBEXP) &&
2309 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2310 {
2311 *err = REG_ERPAREN;
2312 return NULL;
2313 }
2314 /* else fall through */
2315 case OP_CLOSE_DUP_NUM:
2316 /* We treat it as a normal character. */
2317
2318 /* Then we can these characters as normal characters. */
2319 token->type = CHARACTER;
2320 /* mb_partial and word_char bits should be initialized already
2321 by peek_token. */
2322 tree = create_token_tree (dfa, NULL, NULL, token);
2323 if (BE (tree == NULL, 0))
2324 {
2325 *err = REG_ESPACE;
2326 return NULL;
2327 }
2328 break;
2329 case ANCHOR:
2330 if ((token->opr.ctx_type
2331 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2332 && dfa->word_ops_used == 0)
2333 init_word_char (dfa);
2334 if (token->opr.ctx_type == WORD_DELIM
2335 || token->opr.ctx_type == NOT_WORD_DELIM)
2336 {
2337 bin_tree_t *tree_first, *tree_last;
2338 if (token->opr.ctx_type == WORD_DELIM)
2339 {
2340 token->opr.ctx_type = WORD_FIRST;
2341 tree_first = create_token_tree (dfa, NULL, NULL, token);
2342 token->opr.ctx_type = WORD_LAST;
2343 }
2344 else
2345 {
2346 token->opr.ctx_type = INSIDE_WORD;
2347 tree_first = create_token_tree (dfa, NULL, NULL, token);
2348 token->opr.ctx_type = INSIDE_NOTWORD;
2349 }
2350 tree_last = create_token_tree (dfa, NULL, NULL, token);
2351 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2352 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2353 {
2354 *err = REG_ESPACE;
2355 return NULL;
2356 }
2357 }
2358 else
2359 {
2360 tree = create_token_tree (dfa, NULL, NULL, token);
2361 if (BE (tree == NULL, 0))
2362 {
2363 *err = REG_ESPACE;
2364 return NULL;
2365 }
2366 }
2367 /* We must return here, since ANCHORs can't be followed
2368 by repetition operators.
2369 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2370 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2371 fetch_token (token, regexp, syntax);
2372 return tree;
2373 case OP_PERIOD:
2374 tree = create_token_tree (dfa, NULL, NULL, token);
2375 if (BE (tree == NULL, 0))
2376 {
2377 *err = REG_ESPACE;
2378 return NULL;
2379 }
2380 if (dfa->mb_cur_max > 1)
2381 dfa->has_mb_node = 1;
2382 break;
2383 case OP_WORD:
2384 case OP_NOTWORD:
2385 tree = build_charclass_op (dfa, regexp->trans,
2386 "alnum",
2387 "_",
2388 token->type == OP_NOTWORD, err);
2389 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2390 return NULL;
2391 break;
2392 case OP_SPACE:
2393 case OP_NOTSPACE:
2394 tree = build_charclass_op (dfa, regexp->trans,
2395 "space",
2396 "",
2397 token->type == OP_NOTSPACE, err);
2398 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2399 return NULL;
2400 break;
2401 case OP_ALT:
2402 case END_OF_RE:
2403 return NULL;
2404 case BACK_SLASH:
2405 *err = REG_EESCAPE;
2406 return NULL;
2407 default:
2408 /* Must not happen? */
2409#ifdef DEBUG
2410 assert (0);
2411#endif
2412 return NULL;
2413 }
2414 fetch_token (token, regexp, syntax);
2415
2416 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2417 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2418 {
2419 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2420 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2421 return NULL;
2422 /* In BRE consecutive duplications are not allowed. */
2423 if ((syntax & RE_CONTEXT_INVALID_DUP)
2424 && (token->type == OP_DUP_ASTERISK
2425 || token->type == OP_OPEN_DUP_NUM))
2426 {
2427 *err = REG_BADRPT;
2428 return NULL;
2429 }
2430 }
2431
2432 return tree;
2433}
2434
2435/* This function build the following tree, from regular expression
2436 (<reg_exp>):
2437 SUBEXP
2438 |
2439 <reg_exp>
2440*/
2441
2442static bin_tree_t *
2443parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2444 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2445{
2446 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2447 bin_tree_t *tree;
2448 size_t cur_nsub;
2449 cur_nsub = preg->re_nsub++;
2450
2451 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2452
2453 /* The subexpression may be a null string. */
2454 if (token->type == OP_CLOSE_SUBEXP)
2455 tree = NULL;
2456 else
2457 {
2458 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2459 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2460 *err = REG_EPAREN;
2461 if (BE (*err != REG_NOERROR, 0))
2462 return NULL;
2463 }
2464
2465 if (cur_nsub <= '9' - '1')
2466 dfa->completed_bkref_map |= 1 << cur_nsub;
2467
2468 tree = create_tree (dfa, tree, NULL, SUBEXP);
2469 if (BE (tree == NULL, 0))
2470 {
2471 *err = REG_ESPACE;
2472 return NULL;
2473 }
2474 tree->token.opr.idx = cur_nsub;
2475 return tree;
2476}
2477
2478/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2479
2480static bin_tree_t *
2481parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2482 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2483{
2484 bin_tree_t *tree = NULL, *old_tree = NULL;
2485 int i, start, end, start_idx = re_string_cur_idx (regexp);
2486#ifndef RE_TOKEN_INIT_BUG
2487 re_token_t start_token = *token;
2488#else
2489 re_token_t start_token;
2490
2491 memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2492#endif
2493
2494 if (token->type == OP_OPEN_DUP_NUM)
2495 {
2496 end = 0;
2497 start = fetch_number (regexp, token, syntax);
2498 if (start == -1)
2499 {
2500 if (token->type == CHARACTER && token->opr.c == ',')
2501 start = 0; /* We treat "{,m}" as "{0,m}". */
2502 else
2503 {
2504 *err = REG_BADBR; /* <re>{} is invalid. */
2505 return NULL;
2506 }
2507 }
2508 if (BE (start != -2, 1))
2509 {
2510 /* We treat "{n}" as "{n,n}". */
2511 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2512 : ((token->type == CHARACTER && token->opr.c == ',')
2513 ? fetch_number (regexp, token, syntax) : -2));
2514 }
2515 if (BE (start == -2 || end == -2, 0))
2516 {
2517 /* Invalid sequence. */
2518 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2519 {
2520 if (token->type == END_OF_RE)
2521 *err = REG_EBRACE;
2522 else
2523 *err = REG_BADBR;
2524
2525 return NULL;
2526 }
2527
2528 /* If the syntax bit is set, rollback. */
2529 re_string_set_index (regexp, start_idx);
2530 *token = start_token;
2531 token->type = CHARACTER;
2532 /* mb_partial and word_char bits should be already initialized by
2533 peek_token. */
2534 return elem;
2535 }
2536
2537 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2538 {
2539 /* First number greater than second. */
2540 *err = REG_BADBR;
2541 return NULL;
2542 }
2543 }
2544 else
2545 {
2546 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2547 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2548 }
2549
2550 fetch_token (token, regexp, syntax);
2551
2552 if (BE (elem == NULL, 0))
2553 return NULL;
2554 if (BE (start == 0 && end == 0, 0))
2555 {
2556 postorder (elem, free_tree, NULL);
2557 return NULL;
2558 }
2559
2560 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2561 if (BE (start > 0, 0))
2562 {
2563 tree = elem;
2564 for (i = 2; i <= start; ++i)
2565 {
2566 elem = duplicate_tree (elem, dfa);
2567 tree = create_tree (dfa, tree, elem, CONCAT);
2568 if (BE (elem == NULL || tree == NULL, 0))
2569 goto parse_dup_op_espace;
2570 }
2571
2572 if (start == end)
2573 return tree;
2574
2575 /* Duplicate ELEM before it is marked optional. */
2576 elem = duplicate_tree (elem, dfa);
2577 old_tree = tree;
2578 }
2579 else
2580 old_tree = NULL;
2581
2582 if (elem->token.type == SUBEXP)
2583 postorder (elem, mark_opt_subexp, (void *) (intptr_t) elem->token.opr.idx);
2584
2585 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2586 if (BE (tree == NULL, 0))
2587 goto parse_dup_op_espace;
2588
2589 /* This loop is actually executed only when end != -1,
2590 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2591 already created the start+1-th copy. */
2592 for (i = start + 2; i <= end; ++i)
2593 {
2594 elem = duplicate_tree (elem, dfa);
2595 tree = create_tree (dfa, tree, elem, CONCAT);
2596 if (BE (elem == NULL || tree == NULL, 0))
2597 goto parse_dup_op_espace;
2598
2599 tree = create_tree (dfa, tree, NULL, OP_ALT);
2600 if (BE (tree == NULL, 0))
2601 goto parse_dup_op_espace;
2602 }
2603
2604 if (old_tree)
2605 tree = create_tree (dfa, old_tree, tree, CONCAT);
2606
2607 return tree;
2608
2609 parse_dup_op_espace:
2610 *err = REG_ESPACE;
2611 return NULL;
2612}
2613
2614/* Size of the names for collating symbol/equivalence_class/character_class.
2615 I'm not sure, but maybe enough. */
2616#define BRACKET_NAME_BUF_SIZE 32
2617
2618#ifndef _LIBC
2619 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2620 Build the range expression which starts from START_ELEM, and ends
2621 at END_ELEM. The result are written to MBCSET and SBCSET.
2622 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2623 mbcset->range_ends, is a pointer argument since we may
2624 update it. */
2625
2626static reg_errcode_t
2627internal_function
2628# ifdef RE_ENABLE_I18N
2629build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2630 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2631# else /* not RE_ENABLE_I18N */
2632build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2633 bracket_elem_t *end_elem)
2634# endif /* not RE_ENABLE_I18N */
2635{
2636 unsigned int start_ch, end_ch;
2637 /* Equivalence Classes and Character Classes can't be a range start/end. */
2638 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2639 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2640 0))
2641 return REG_ERANGE;
2642
2643 /* We can handle no multi character collating elements without libc
2644 support. */
2645 if (BE ((start_elem->type == COLL_SYM
2646 && strlen ((char *) start_elem->opr.name) > 1)
2647 || (end_elem->type == COLL_SYM
2648 && strlen ((char *) end_elem->opr.name) > 1), 0))
2649 return REG_ECOLLATE;
2650
2651# ifdef RE_ENABLE_I18N
2652 {
2653 wchar_t wc;
2654 wint_t start_wc;
2655 wint_t end_wc;
2656 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2657
2658 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2659 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2660 : 0));
2661 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2662 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2663 : 0));
2664#ifdef GAWK
2665 /*
2666 * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2667 * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2668 * unsigned, so we don't have sign extension problems.
2669 */
2670 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2671 ? start_ch : start_elem->opr.wch);
2672 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2673 ? end_ch : end_elem->opr.wch);
2674#else
2675 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2676 ? __btowc (start_ch) : start_elem->opr.wch);
2677 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2678 ? __btowc (end_ch) : end_elem->opr.wch);
2679#endif
2680 if (start_wc == WEOF || end_wc == WEOF)
2681 return REG_ECOLLATE;
2682 cmp_buf[0] = start_wc;
2683 cmp_buf[4] = end_wc;
2684 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2685 return REG_ERANGE;
2686
2687 /* Got valid collation sequence values, add them as a new entry.
2688 However, for !_LIBC we have no collation elements: if the
2689 character set is single byte, the single byte character set
2690 that we build below suffices. parse_bracket_exp passes
2691 no MBCSET if dfa->mb_cur_max == 1. */
2692 if (mbcset)
2693 {
2694 /* Check the space of the arrays. */
2695 if (BE (*range_alloc == mbcset->nranges, 0))
2696 {
2697 /* There is not enough space, need realloc. */
2698 wchar_t *new_array_start, *new_array_end;
2699 int new_nranges;
2700
2701 /* +1 in case of mbcset->nranges is 0. */
2702 new_nranges = 2 * mbcset->nranges + 1;
2703 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2704 are NULL if *range_alloc == 0. */
2705 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2706 new_nranges);
2707 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2708 new_nranges);
2709
2710 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2711 return REG_ESPACE;
2712
2713 mbcset->range_starts = new_array_start;
2714 mbcset->range_ends = new_array_end;
2715 *range_alloc = new_nranges;
2716 }
2717
2718 mbcset->range_starts[mbcset->nranges] = start_wc;
2719 mbcset->range_ends[mbcset->nranges++] = end_wc;
2720 }
2721
2722 /* Build the table for single byte characters. */
2723 for (wc = 0; wc < SBC_MAX; ++wc)
2724 {
2725 cmp_buf[2] = wc;
2726 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2727 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2728 bitset_set (sbcset, wc);
2729 }
2730 }
2731# else /* not RE_ENABLE_I18N */
2732 {
2733 unsigned int ch;
2734 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2735 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2736 : 0));
2737 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2738 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2739 : 0));
2740 if (start_ch > end_ch)
2741 return REG_ERANGE;
2742 /* Build the table for single byte characters. */
2743 for (ch = 0; ch < SBC_MAX; ++ch)
2744 if (start_ch <= ch && ch <= end_ch)
2745 bitset_set (sbcset, ch);
2746 }
2747# endif /* not RE_ENABLE_I18N */
2748 return REG_NOERROR;
2749}
2750#endif /* not _LIBC */
2751
2752#ifndef _LIBC
2753/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2754 Build the collating element which is represented by NAME.
2755 The result are written to MBCSET and SBCSET.
2756 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2757 pointer argument since we may update it. */
2758
2759static reg_errcode_t
2760internal_function
2761# ifdef RE_ENABLE_I18N
2762build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2763 int *coll_sym_alloc, const unsigned char *name)
2764# else /* not RE_ENABLE_I18N */
2765build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2766# endif /* not RE_ENABLE_I18N */
2767{
2768 size_t name_len = strlen ((const char *) name);
2769 if (BE (name_len != 1, 0))
2770 return REG_ECOLLATE;
2771 else
2772 {
2773 bitset_set (sbcset, name[0]);
2774 return REG_NOERROR;
2775 }
2776}
2777#endif /* not _LIBC */
2778
2779/* This function parse bracket expression like "[abc]", "[a-c]",
2780 "[[.a-a.]]" etc. */
2781
2782static bin_tree_t *
2783parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2784 reg_syntax_t syntax, reg_errcode_t *err)
2785{
2786#ifdef _LIBC
2787 const unsigned char *collseqmb;
2788 const char *collseqwc;
2789 uint32_t nrules;
2790 int32_t table_size;
2791 const int32_t *symb_table;
2792 const unsigned char *extra;
2793
2794 /* Local function for parse_bracket_exp used in _LIBC environment.
2795 Seek the collating symbol entry correspondings to NAME.
2796 Return the index of the symbol in the SYMB_TABLE. */
2797
2798 auto inline int32_t
2799 __attribute ((always_inline))
2800 seek_collating_symbol_entry (name, name_len)
2801 const unsigned char *name;
2802 size_t name_len;
2803 {
2804 int32_t hash = elem_hash ((const char *) name, name_len);
2805 int32_t elem = hash % table_size;
2806 if (symb_table[2 * elem] != 0)
2807 {
2808 int32_t second = hash % (table_size - 2) + 1;
2809
2810 do
2811 {
2812 /* First compare the hashing value. */
2813 if (symb_table[2 * elem] == hash
2814 /* Compare the length of the name. */
2815 && name_len == extra[symb_table[2 * elem + 1]]
2816 /* Compare the name. */
2817 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2818 name_len) == 0)
2819 {
2820 /* Yep, this is the entry. */
2821 break;
2822 }
2823
2824 /* Next entry. */
2825 elem += second;
2826 }
2827 while (symb_table[2 * elem] != 0);
2828 }
2829 return elem;
2830 }
2831
2832 /* Local function for parse_bracket_exp used in _LIBC environment.
2833 Look up the collation sequence value of BR_ELEM.
2834 Return the value if succeeded, UINT_MAX otherwise. */
2835
2836 auto inline unsigned int
2837 __attribute ((always_inline))
2838 lookup_collation_sequence_value (br_elem)
2839 bracket_elem_t *br_elem;
2840 {
2841 if (br_elem->type == SB_CHAR)
2842 {
2843 /*
2844 if (MB_CUR_MAX == 1)
2845 */
2846 if (nrules == 0)
2847 return collseqmb[br_elem->opr.ch];
2848 else
2849 {
2850 wint_t wc = __btowc (br_elem->opr.ch);
2851 return __collseq_table_lookup (collseqwc, wc);
2852 }
2853 }
2854 else if (br_elem->type == MB_CHAR)
2855 {
2856 if (nrules != 0)
2857 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2858 }
2859 else if (br_elem->type == COLL_SYM)
2860 {
2861 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2862 if (nrules != 0)
2863 {
2864 int32_t elem, idx;
2865 elem = seek_collating_symbol_entry (br_elem->opr.name,
2866 sym_name_len);
2867 if (symb_table[2 * elem] != 0)
2868 {
2869 /* We found the entry. */
2870 idx = symb_table[2 * elem + 1];
2871 /* Skip the name of collating element name. */
2872 idx += 1 + extra[idx];
2873 /* Skip the byte sequence of the collating element. */
2874 idx += 1 + extra[idx];
2875 /* Adjust for the alignment. */
2876 idx = (idx + 3) & ~3;
2877 /* Skip the multibyte collation sequence value. */
2878 idx += sizeof (unsigned int);
2879 /* Skip the wide char sequence of the collating element. */
2880 idx += sizeof (unsigned int) *
2881 (1 + *(unsigned int *) (extra + idx));
2882 /* Return the collation sequence value. */
2883 return *(unsigned int *) (extra + idx);
2884 }
2885 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2886 {
2887 /* No valid character. Match it as a single byte
2888 character. */
2889 return collseqmb[br_elem->opr.name[0]];
2890 }
2891 }
2892 else if (sym_name_len == 1)
2893 return collseqmb[br_elem->opr.name[0]];
2894 }
2895 return UINT_MAX;
2896 }
2897
2898 /* Local function for parse_bracket_exp used in _LIBC environment.
2899 Build the range expression which starts from START_ELEM, and ends
2900 at END_ELEM. The result are written to MBCSET and SBCSET.
2901 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2902 mbcset->range_ends, is a pointer argument since we may
2903 update it. */
2904
2905 auto inline reg_errcode_t
2906 __attribute ((always_inline))
2907 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2908 re_charset_t *mbcset;
2909 int *range_alloc;
2910 bitset_t sbcset;
2911 bracket_elem_t *start_elem, *end_elem;
2912 {
2913 unsigned int ch;
2914 uint32_t start_collseq;
2915 uint32_t end_collseq;
2916
2917 /* Equivalence Classes and Character Classes can't be a range
2918 start/end. */
2919 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2920 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2921 0))
2922 return REG_ERANGE;
2923
2924 start_collseq = lookup_collation_sequence_value (start_elem);
2925 end_collseq = lookup_collation_sequence_value (end_elem);
2926 /* Check start/end collation sequence values. */
2927 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2928 return REG_ECOLLATE;
2929 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2930 return REG_ERANGE;
2931
2932 /* Got valid collation sequence values, add them as a new entry.
2933 However, if we have no collation elements, and the character set
2934 is single byte, the single byte character set that we
2935 build below suffices. */
2936 if (nrules > 0 || dfa->mb_cur_max > 1)
2937 {
2938 /* Check the space of the arrays. */
2939 if (BE (*range_alloc == mbcset->nranges, 0))
2940 {
2941 /* There is not enough space, need realloc. */
2942 uint32_t *new_array_start;
2943 uint32_t *new_array_end;
2944 int new_nranges;
2945
2946 /* +1 in case of mbcset->nranges is 0. */
2947 new_nranges = 2 * mbcset->nranges + 1;
2948 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2949 new_nranges);
2950 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2951 new_nranges);
2952
2953 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2954 return REG_ESPACE;
2955
2956 mbcset->range_starts = new_array_start;
2957 mbcset->range_ends = new_array_end;
2958 *range_alloc = new_nranges;
2959 }
2960
2961 mbcset->range_starts[mbcset->nranges] = start_collseq;
2962 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2963 }
2964
2965 /* Build the table for single byte characters. */
2966 for (ch = 0; ch < SBC_MAX; ch++)
2967 {
2968 uint32_t ch_collseq;
2969 /*
2970 if (MB_CUR_MAX == 1)
2971 */
2972 if (nrules == 0)
2973 ch_collseq = collseqmb[ch];
2974 else
2975 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2976 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2977 bitset_set (sbcset, ch);
2978 }
2979 return REG_NOERROR;
2980 }
2981
2982 /* Local function for parse_bracket_exp used in _LIBC environment.
2983 Build the collating element which is represented by NAME.
2984 The result are written to MBCSET and SBCSET.
2985 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2986 pointer argument since we may update it. */
2987
2988 auto inline reg_errcode_t
2989 __attribute ((always_inline))
2990 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2991 re_charset_t *mbcset;
2992 int *coll_sym_alloc;
2993 bitset_t sbcset;
2994 const unsigned char *name;
2995 {
2996 int32_t elem, idx;
2997 size_t name_len = strlen ((const char *) name);
2998 if (nrules != 0)
2999 {
3000 elem = seek_collating_symbol_entry (name, name_len);
3001 if (symb_table[2 * elem] != 0)
3002 {
3003 /* We found the entry. */
3004 idx = symb_table[2 * elem + 1];
3005 /* Skip the name of collating element name. */
3006 idx += 1 + extra[idx];
3007 }
3008 else if (symb_table[2 * elem] == 0 && name_len == 1)
3009 {
3010 /* No valid character, treat it as a normal
3011 character. */
3012 bitset_set (sbcset, name[0]);
3013 return REG_NOERROR;
3014 }
3015 else
3016 return REG_ECOLLATE;
3017
3018 /* Got valid collation sequence, add it as a new entry. */
3019 /* Check the space of the arrays. */
3020 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3021 {
3022 /* Not enough, realloc it. */
3023 /* +1 in case of mbcset->ncoll_syms is 0. */
3024 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3025 /* Use realloc since mbcset->coll_syms is NULL
3026 if *alloc == 0. */
3027 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3028 new_coll_sym_alloc);
3029 if (BE (new_coll_syms == NULL, 0))
3030 return REG_ESPACE;
3031 mbcset->coll_syms = new_coll_syms;
3032 *coll_sym_alloc = new_coll_sym_alloc;
3033 }
3034 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3035 return REG_NOERROR;
3036 }
3037 else
3038 {
3039 if (BE (name_len != 1, 0))
3040 return REG_ECOLLATE;
3041 else
3042 {
3043 bitset_set (sbcset, name[0]);
3044 return REG_NOERROR;
3045 }
3046 }
3047 }
3048#endif
3049
3050 re_token_t br_token;
3051 re_bitset_ptr_t sbcset;
3052#ifdef RE_ENABLE_I18N
3053 re_charset_t *mbcset;
3054 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3055 int equiv_class_alloc = 0, char_class_alloc = 0;
3056#endif /* not RE_ENABLE_I18N */
3057 int non_match = 0;
3058 bin_tree_t *work_tree;
3059 int token_len;
3060 int first_round = 1;
3061#ifdef _LIBC
3062 collseqmb = (const unsigned char *)
3063 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3064 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3065 if (nrules)
3066 {
3067 /*
3068 if (MB_CUR_MAX > 1)
3069 */
3070 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3071 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3072 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3073 _NL_COLLATE_SYMB_TABLEMB);
3074 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3075 _NL_COLLATE_SYMB_EXTRAMB);
3076 }
3077#endif
3078 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3079#ifdef RE_ENABLE_I18N
3080 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3081#endif /* RE_ENABLE_I18N */
3082#ifdef RE_ENABLE_I18N
3083 if (BE (sbcset == NULL || mbcset == NULL, 0))
3084#else
3085 if (BE (sbcset == NULL, 0))
3086#endif /* RE_ENABLE_I18N */
3087 {
3088 *err = REG_ESPACE;
3089 return NULL;
3090 }
3091
3092 token_len = peek_token_bracket (token, regexp, syntax);
3093 if (BE (token->type == END_OF_RE, 0))
3094 {
3095 *err = REG_BADPAT;
3096 goto parse_bracket_exp_free_return;
3097 }
3098 if (token->type == OP_NON_MATCH_LIST)
3099 {
3100#ifdef RE_ENABLE_I18N
3101 mbcset->non_match = 1;
3102#endif /* not RE_ENABLE_I18N */
3103 non_match = 1;
3104 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3105 bitset_set (sbcset, '\n');
3106 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3107 token_len = peek_token_bracket (token, regexp, syntax);
3108 if (BE (token->type == END_OF_RE, 0))
3109 {
3110 *err = REG_BADPAT;
3111 goto parse_bracket_exp_free_return;
3112 }
3113 }
3114
3115 /* We treat the first ']' as a normal character. */
3116 if (token->type == OP_CLOSE_BRACKET)
3117 token->type = CHARACTER;
3118
3119 while (1)
3120 {
3121 bracket_elem_t start_elem, end_elem;
3122 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3123 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3124 reg_errcode_t ret;
3125 int token_len2 = 0, is_range_exp = 0;
3126 re_token_t token2;
3127
3128 start_elem.opr.name = start_name_buf;
3129 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3130 syntax, first_round);
3131 if (BE (ret != REG_NOERROR, 0))
3132 {
3133 *err = ret;
3134 goto parse_bracket_exp_free_return;
3135 }
3136 first_round = 0;
3137
3138 /* Get information about the next token. We need it in any case. */
3139 token_len = peek_token_bracket (token, regexp, syntax);
3140
3141 /* Do not check for ranges if we know they are not allowed. */
3142 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3143 {
3144 if (BE (token->type == END_OF_RE, 0))
3145 {
3146 *err = REG_EBRACK;
3147 goto parse_bracket_exp_free_return;
3148 }
3149 if (token->type == OP_CHARSET_RANGE)
3150 {
3151 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3152 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3153 if (BE (token2.type == END_OF_RE, 0))
3154 {
3155 *err = REG_EBRACK;
3156 goto parse_bracket_exp_free_return;
3157 }
3158 if (token2.type == OP_CLOSE_BRACKET)
3159 {
3160 /* We treat the last '-' as a normal character. */
3161 re_string_skip_bytes (regexp, -token_len);
3162 token->type = CHARACTER;
3163 }
3164 else
3165 is_range_exp = 1;
3166 }
3167 }
3168
3169 if (is_range_exp == 1)
3170 {
3171 end_elem.opr.name = end_name_buf;
3172 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3173 dfa, syntax, 1);
3174 if (BE (ret != REG_NOERROR, 0))
3175 {
3176 *err = ret;
3177 goto parse_bracket_exp_free_return;
3178 }
3179
3180 token_len = peek_token_bracket (token, regexp, syntax);
3181
3182#ifdef _LIBC
3183 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3184 &start_elem, &end_elem);
3185#else
3186# ifdef RE_ENABLE_I18N
3187 *err = build_range_exp (sbcset,
3188 dfa->mb_cur_max > 1 ? mbcset : NULL,
3189 &range_alloc, &start_elem, &end_elem);
3190# else
3191 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3192# endif
3193#endif /* RE_ENABLE_I18N */
3194 if (BE (*err != REG_NOERROR, 0))
3195 goto parse_bracket_exp_free_return;
3196 }
3197 else
3198 {
3199 switch (start_elem.type)
3200 {
3201 case SB_CHAR:
3202 bitset_set (sbcset, start_elem.opr.ch);
3203 break;
3204#ifdef RE_ENABLE_I18N
3205 case MB_CHAR:
3206 /* Check whether the array has enough space. */
3207 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3208 {
3209 wchar_t *new_mbchars;
3210 /* Not enough, realloc it. */
3211 /* +1 in case of mbcset->nmbchars is 0. */
3212 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3213 /* Use realloc since array is NULL if *alloc == 0. */
3214 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3215 mbchar_alloc);
3216 if (BE (new_mbchars == NULL, 0))
3217 goto parse_bracket_exp_espace;
3218 mbcset->mbchars = new_mbchars;
3219 }
3220 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3221 break;
3222#endif /* RE_ENABLE_I18N */
3223 case EQUIV_CLASS:
3224 *err = build_equiv_class (sbcset,
3225#ifdef RE_ENABLE_I18N
3226 mbcset, &equiv_class_alloc,
3227#endif /* RE_ENABLE_I18N */
3228 start_elem.opr.name);
3229 if (BE (*err != REG_NOERROR, 0))
3230 goto parse_bracket_exp_free_return;
3231 break;
3232 case COLL_SYM:
3233 *err = build_collating_symbol (sbcset,
3234#ifdef RE_ENABLE_I18N
3235 mbcset, &coll_sym_alloc,
3236#endif /* RE_ENABLE_I18N */
3237 start_elem.opr.name);
3238 if (BE (*err != REG_NOERROR, 0))
3239 goto parse_bracket_exp_free_return;
3240 break;
3241 case CHAR_CLASS:
3242 *err = build_charclass (regexp->trans, sbcset,
3243#ifdef RE_ENABLE_I18N
3244 mbcset, &char_class_alloc,
3245#endif /* RE_ENABLE_I18N */
3246 (const char *) start_elem.opr.name, syntax);
3247 if (BE (*err != REG_NOERROR, 0))
3248 goto parse_bracket_exp_free_return;
3249 break;
3250 default:
3251 assert (0);
3252 break;
3253 }
3254 }
3255 if (BE (token->type == END_OF_RE, 0))
3256 {
3257 *err = REG_EBRACK;
3258 goto parse_bracket_exp_free_return;
3259 }
3260 if (token->type == OP_CLOSE_BRACKET)
3261 break;
3262 }
3263
3264 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3265
3266 /* If it is non-matching list. */
3267 if (non_match)
3268 bitset_not (sbcset);
3269
3270#ifdef RE_ENABLE_I18N
3271 /* Ensure only single byte characters are set. */
3272 if (dfa->mb_cur_max > 1)
3273 bitset_mask (sbcset, dfa->sb_char);
3274
3275 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3276 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3277 || mbcset->non_match)))
3278 {
3279 bin_tree_t *mbc_tree;
3280 int sbc_idx;
3281 /* Build a tree for complex bracket. */
3282 dfa->has_mb_node = 1;
3283 br_token.type = COMPLEX_BRACKET;
3284 br_token.opr.mbcset = mbcset;
3285 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3286 if (BE (mbc_tree == NULL, 0))
3287 goto parse_bracket_exp_espace;
3288 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3289 if (sbcset[sbc_idx])
3290 break;
3291 /* If there are no bits set in sbcset, there is no point
3292 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3293 if (sbc_idx < BITSET_WORDS)
3294 {
3295 /* Build a tree for simple bracket. */
3296 br_token.type = SIMPLE_BRACKET;
3297 br_token.opr.sbcset = sbcset;
3298 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3299 if (BE (work_tree == NULL, 0))
3300 goto parse_bracket_exp_espace;
3301
3302 /* Then join them by ALT node. */
3303 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3304 if (BE (work_tree == NULL, 0))
3305 goto parse_bracket_exp_espace;
3306 }
3307 else
3308 {
3309 re_free (sbcset);
3310 work_tree = mbc_tree;
3311 }
3312 }
3313 else
3314#endif /* not RE_ENABLE_I18N */
3315 {
3316#ifdef RE_ENABLE_I18N
3317 free_charset (mbcset);
3318#endif
3319 /* Build a tree for simple bracket. */
3320 br_token.type = SIMPLE_BRACKET;
3321 br_token.opr.sbcset = sbcset;
3322 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3323 if (BE (work_tree == NULL, 0))
3324 goto parse_bracket_exp_espace;
3325 }
3326 return work_tree;
3327
3328 parse_bracket_exp_espace:
3329 *err = REG_ESPACE;
3330 parse_bracket_exp_free_return:
3331 re_free (sbcset);
3332#ifdef RE_ENABLE_I18N
3333 free_charset (mbcset);
3334#endif /* RE_ENABLE_I18N */
3335 return NULL;
3336}
3337
3338/* Parse an element in the bracket expression. */
3339
3340static reg_errcode_t
3341parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3342 re_token_t *token, int token_len,
3343 UNUSED_PARAM re_dfa_t *dfa, reg_syntax_t syntax,
3344 int accept_hyphen)
3345{
3346#ifdef RE_ENABLE_I18N
3347 int cur_char_size;
3348 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3349 if (cur_char_size > 1)
3350 {
3351 elem->type = MB_CHAR;
3352 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3353 re_string_skip_bytes (regexp, cur_char_size);
3354 return REG_NOERROR;
3355 }
3356#endif /* RE_ENABLE_I18N */
3357 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3358 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3359 || token->type == OP_OPEN_EQUIV_CLASS)
3360 return parse_bracket_symbol (elem, regexp, token);
3361 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3362 {
3363 /* A '-' must only appear as anything but a range indicator before
3364 the closing bracket. Everything else is an error. */
3365 re_token_t token2;
3366 (void) peek_token_bracket (&token2, regexp, syntax);
3367 if (token2.type != OP_CLOSE_BRACKET)
3368 /* The actual error value is not standardized since this whole
3369 case is undefined. But ERANGE makes good sense. */
3370 return REG_ERANGE;
3371 }
3372 elem->type = SB_CHAR;
3373 elem->opr.ch = token->opr.c;
3374 return REG_NOERROR;
3375}
3376
3377/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3378 such as [:<character_class>:], [.<collating_element>.], and
3379 [=<equivalent_class>=]. */
3380
3381static reg_errcode_t
3382parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3383 re_token_t *token)
3384{
3385 unsigned char ch, delim = token->opr.c;
3386 int i = 0;
3387 if (re_string_eoi(regexp))
3388 return REG_EBRACK;
3389 for (;; ++i)
3390 {
3391 if (i >= BRACKET_NAME_BUF_SIZE)
3392 return REG_EBRACK;
3393 if (token->type == OP_OPEN_CHAR_CLASS)
3394 ch = re_string_fetch_byte_case (regexp);
3395 else
3396 ch = re_string_fetch_byte (regexp);
3397 if (re_string_eoi(regexp))
3398 return REG_EBRACK;
3399 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3400 break;
3401 elem->opr.name[i] = ch;
3402 }
3403 re_string_skip_bytes (regexp, 1);
3404 elem->opr.name[i] = '\0';
3405 switch (token->type)
3406 {
3407 case OP_OPEN_COLL_ELEM:
3408 elem->type = COLL_SYM;
3409 break;
3410 case OP_OPEN_EQUIV_CLASS:
3411 elem->type = EQUIV_CLASS;
3412 break;
3413 case OP_OPEN_CHAR_CLASS:
3414 elem->type = CHAR_CLASS;
3415 break;
3416 default:
3417 break;
3418 }
3419 return REG_NOERROR;
3420}
3421
3422 /* Helper function for parse_bracket_exp.
3423 Build the equivalence class which is represented by NAME.
3424 The result are written to MBCSET and SBCSET.
3425 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3426 is a pointer argument since we may update it. */
3427
3428static reg_errcode_t
3429#ifdef RE_ENABLE_I18N
3430build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3431 int *equiv_class_alloc, const unsigned char *name)
3432#else /* not RE_ENABLE_I18N */
3433build_equiv_class (bitset_t sbcset, const unsigned char *name)
3434#endif /* not RE_ENABLE_I18N */
3435{
3436#ifdef _LIBC
3437 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3438 if (nrules != 0)
3439 {
3440 const int32_t *table, *indirect;
3441 const unsigned char *weights, *extra, *cp;
3442 unsigned char char_buf[2];
3443 int32_t idx1, idx2;
3444 unsigned int ch;
3445 size_t len;
3446 /* This #include defines a local function! */
3447# include <locale/weight.h>
3448 /* Calculate the index for equivalence class. */
3449 cp = name;
3450 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3451 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3452 _NL_COLLATE_WEIGHTMB);
3453 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3454 _NL_COLLATE_EXTRAMB);
3455 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3456 _NL_COLLATE_INDIRECTMB);
3457 idx1 = findidx (&cp);
3458 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3459 /* This isn't a valid character. */
3460 return REG_ECOLLATE;
3461
3462 /* Build single byte matcing table for this equivalence class. */
3463 char_buf[1] = (unsigned char) '\0';
3464 len = weights[idx1 & 0xffffff];
3465 for (ch = 0; ch < SBC_MAX; ++ch)
3466 {
3467 char_buf[0] = ch;
3468 cp = char_buf;
3469 idx2 = findidx (&cp);
3470/*
3471 idx2 = table[ch];
3472*/
3473 if (idx2 == 0)
3474 /* This isn't a valid character. */
3475 continue;
3476 /* Compare only if the length matches and the collation rule
3477 index is the same. */
3478 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3479 {
3480 int cnt = 0;
3481
3482 while (cnt <= len &&
3483 weights[(idx1 & 0xffffff) + 1 + cnt]
3484 == weights[(idx2 & 0xffffff) + 1 + cnt])
3485 ++cnt;
3486
3487 if (cnt > len)
3488 bitset_set (sbcset, ch);
3489 }
3490 }
3491 /* Check whether the array has enough space. */
3492 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3493 {
3494 /* Not enough, realloc it. */
3495 /* +1 in case of mbcset->nequiv_classes is 0. */
3496 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3497 /* Use realloc since the array is NULL if *alloc == 0. */
3498 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3499 int32_t,
3500 new_equiv_class_alloc);
3501 if (BE (new_equiv_classes == NULL, 0))
3502 return REG_ESPACE;
3503 mbcset->equiv_classes = new_equiv_classes;
3504 *equiv_class_alloc = new_equiv_class_alloc;
3505 }
3506 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3507 }
3508 else
3509#endif /* _LIBC */
3510 {
3511 if (BE (strlen ((const char *) name) != 1, 0))
3512 return REG_ECOLLATE;
3513 bitset_set (sbcset, *name);
3514 }
3515 return REG_NOERROR;
3516}
3517
3518 /* Helper function for parse_bracket_exp.
3519 Build the character class which is represented by NAME.
3520 The result are written to MBCSET and SBCSET.
3521 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3522 is a pointer argument since we may update it. */
3523
3524static reg_errcode_t
3525#ifdef RE_ENABLE_I18N
3526build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3527 re_charset_t *mbcset, int *char_class_alloc,
3528 const char *class_name, reg_syntax_t syntax)
3529#else /* not RE_ENABLE_I18N */
3530build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3531 const char *class_name, reg_syntax_t syntax)
3532#endif /* not RE_ENABLE_I18N */
3533{
3534 int i;
3535
3536 /* In case of REG_ICASE "upper" and "lower" match the both of
3537 upper and lower cases. */
3538 if ((syntax & RE_ICASE)
3539 && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3540 class_name = "alpha";
3541
3542#ifdef RE_ENABLE_I18N
3543 /* Check the space of the arrays. */
3544 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3545 {
3546 /* Not enough, realloc it. */
3547 /* +1 in case of mbcset->nchar_classes is 0. */
3548 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3549 /* Use realloc since array is NULL if *alloc == 0. */
3550 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3551 new_char_class_alloc);
3552 if (BE (new_char_classes == NULL, 0))
3553 return REG_ESPACE;
3554 mbcset->char_classes = new_char_classes;
3555 *char_class_alloc = new_char_class_alloc;
3556 }
3557 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3558#endif /* RE_ENABLE_I18N */
3559
3560#define BUILD_CHARCLASS_LOOP(ctype_func) \
3561 do { \
3562 if (BE (trans != NULL, 0)) \
3563 { \
3564 for (i = 0; i < SBC_MAX; ++i) \
3565 if (ctype_func (i)) \
3566 bitset_set (sbcset, trans[i]); \
3567 } \
3568 else \
3569 { \
3570 for (i = 0; i < SBC_MAX; ++i) \
3571 if (ctype_func (i)) \
3572 bitset_set (sbcset, i); \
3573 } \
3574 } while (0)
3575
3576#if 0
3577 if (strcmp (class_name, "alnum") == 0)
3578 BUILD_CHARCLASS_LOOP (isalnum);
3579 else if (strcmp (class_name, "cntrl") == 0)
3580 BUILD_CHARCLASS_LOOP (iscntrl);
3581 else if (strcmp (class_name, "lower") == 0)
3582 BUILD_CHARCLASS_LOOP (islower);
3583 else if (strcmp (class_name, "space") == 0)
3584 BUILD_CHARCLASS_LOOP (isspace);
3585 else if (strcmp (class_name, "alpha") == 0)
3586 BUILD_CHARCLASS_LOOP (isalpha);
3587 else if (strcmp (class_name, "digit") == 0)
3588 BUILD_CHARCLASS_LOOP (isdigit);
3589 else if (strcmp (class_name, "print") == 0)
3590 BUILD_CHARCLASS_LOOP (isprint);
3591 else if (strcmp (class_name, "upper") == 0)
3592 BUILD_CHARCLASS_LOOP (isupper);
3593 else if (strcmp (class_name, "blank") == 0)
3594#ifndef GAWK
3595 BUILD_CHARCLASS_LOOP (isblank);
3596#else
3597 /* see comments above */
3598 BUILD_CHARCLASS_LOOP (is_blank);
3599#endif
3600 else if (strcmp (class_name, "graph") == 0)
3601 BUILD_CHARCLASS_LOOP (isgraph);
3602 else if (strcmp (class_name, "punct") == 0)
3603 BUILD_CHARCLASS_LOOP (ispunct);
3604 else if (strcmp (class_name, "xdigit") == 0)
3605 BUILD_CHARCLASS_LOOP (isxdigit);
3606 else
3607 return REG_ECTYPE;
3608#else
3609 switch (match_class(class_name)) {
3610 case CCLASS_ALNUM:
3611 BUILD_CHARCLASS_LOOP (isalnum);
3612 break;
3613 case CCLASS_CNTRL:
3614 BUILD_CHARCLASS_LOOP (iscntrl);
3615 break;
3616 case CCLASS_LOWER:
3617 BUILD_CHARCLASS_LOOP (islower);
3618 break;
3619 case CCLASS_SPACE:
3620 BUILD_CHARCLASS_LOOP (isspace);
3621 break;
3622 case CCLASS_ALPHA:
3623 BUILD_CHARCLASS_LOOP (isalpha);
3624 break;
3625 case CCLASS_DIGIT:
3626 BUILD_CHARCLASS_LOOP (isdigit);
3627 break;
3628 case CCLASS_PRINT:
3629 BUILD_CHARCLASS_LOOP (isprint);
3630 break;
3631 case CCLASS_UPPER:
3632 BUILD_CHARCLASS_LOOP (isupper);
3633 break;
3634 case CCLASS_BLANK:
3635#ifndef GAWK
3636 BUILD_CHARCLASS_LOOP (isblank);
3637#else
3638 /* see comments above */
3639 BUILD_CHARCLASS_LOOP (is_blank);
3640#endif
3641 break;
3642 case CCLASS_GRAPH:
3643 BUILD_CHARCLASS_LOOP (isgraph);
3644 break;
3645 case CCLASS_PUNCT:
3646 BUILD_CHARCLASS_LOOP (ispunct);
3647 break;
3648 case CCLASS_XDIGIT:
3649 BUILD_CHARCLASS_LOOP (isxdigit);
3650 break;
3651 default:
3652 return REG_ECTYPE;
3653 }
3654#endif
3655
3656 return REG_NOERROR;
3657}
3658
3659static bin_tree_t *
3660build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3661 const char *class_name,
3662 const char *extra, int non_match,
3663 reg_errcode_t *err)
3664{
3665 re_bitset_ptr_t sbcset;
3666#ifdef RE_ENABLE_I18N
3667 re_charset_t *mbcset;
3668 int alloc = 0;
3669#endif /* not RE_ENABLE_I18N */
3670 reg_errcode_t ret;
3671 re_token_t br_token;
3672 bin_tree_t *tree;
3673
3674 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3675#ifdef RE_ENABLE_I18N
3676 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3677#endif /* RE_ENABLE_I18N */
3678
3679#ifdef RE_ENABLE_I18N
3680 if (BE (sbcset == NULL || mbcset == NULL, 0))
3681#else /* not RE_ENABLE_I18N */
3682 if (BE (sbcset == NULL, 0))
3683#endif /* not RE_ENABLE_I18N */
3684 {
3685 *err = REG_ESPACE;
3686 return NULL;
3687 }
3688
3689 if (non_match)
3690 {
3691#ifdef RE_ENABLE_I18N
3692 mbcset->non_match = 1;
3693#endif /* not RE_ENABLE_I18N */
3694 }
3695
3696 /* We don't care the syntax in this case. */
3697 ret = build_charclass (trans, sbcset,
3698#ifdef RE_ENABLE_I18N
3699 mbcset, &alloc,
3700#endif /* RE_ENABLE_I18N */
3701 class_name, 0);
3702
3703 if (BE (ret != REG_NOERROR, 0))
3704 {
3705 re_free (sbcset);
3706#ifdef RE_ENABLE_I18N
3707 free_charset (mbcset);
3708#endif /* RE_ENABLE_I18N */
3709 *err = ret;
3710 return NULL;
3711 }
3712 /* \w match '_' also. */
3713 for (; *extra; extra++)
3714 bitset_set (sbcset, *extra);
3715
3716 /* If it is non-matching list. */
3717 if (non_match)
3718 bitset_not (sbcset);
3719
3720#ifdef RE_ENABLE_I18N
3721 /* Ensure only single byte characters are set. */
3722 if (dfa->mb_cur_max > 1)
3723 bitset_mask (sbcset, dfa->sb_char);
3724#endif
3725
3726 /* Build a tree for simple bracket. */
3727 br_token.type = SIMPLE_BRACKET;
3728 br_token.opr.sbcset = sbcset;
3729 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3730 if (BE (tree == NULL, 0))
3731 goto build_word_op_espace;
3732
3733#ifdef RE_ENABLE_I18N
3734 if (dfa->mb_cur_max > 1)
3735 {
3736 bin_tree_t *mbc_tree;
3737 /* Build a tree for complex bracket. */
3738 br_token.type = COMPLEX_BRACKET;
3739 br_token.opr.mbcset = mbcset;
3740 dfa->has_mb_node = 1;
3741 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3742 if (BE (mbc_tree == NULL, 0))
3743 goto build_word_op_espace;
3744 /* Then join them by ALT node. */
3745 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3746 if (BE (mbc_tree != NULL, 1))
3747 return tree;
3748 }
3749 else
3750 {
3751 free_charset (mbcset);
3752 return tree;
3753 }
3754#else /* not RE_ENABLE_I18N */
3755 return tree;
3756#endif /* not RE_ENABLE_I18N */
3757
3758 build_word_op_espace:
3759 re_free (sbcset);
3760#ifdef RE_ENABLE_I18N
3761 free_charset (mbcset);
3762#endif /* RE_ENABLE_I18N */
3763 *err = REG_ESPACE;
3764 return NULL;
3765}
3766
3767/* This is intended for the expressions like "a{1,3}".
3768 Fetch a number from `input', and return the number.
3769 Return -1, if the number field is empty like "{,1}".
3770 Return -2, if an error has occurred. */
3771
3772static int
3773fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3774{
3775 int num = -1;
3776 unsigned char c;
3777 while (1)
3778 {
3779 fetch_token (token, input, syntax);
3780 c = token->opr.c;
3781 if (BE (token->type == END_OF_RE, 0))
3782 return -2;
3783 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3784 break;
3785 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3786 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3787 num = (num > RE_DUP_MAX) ? -2 : num;
3788 }
3789 return num;
3790}
3791
3792#ifdef RE_ENABLE_I18N
3793static void
3794free_charset (re_charset_t *cset)
3795{
3796 re_free (cset->mbchars);
3797# ifdef _LIBC
3798 re_free (cset->coll_syms);
3799 re_free (cset->equiv_classes);
3800 re_free (cset->range_starts);
3801 re_free (cset->range_ends);
3802# endif
3803 re_free (cset->char_classes);
3804 re_free (cset);
3805}
3806#endif /* RE_ENABLE_I18N */
3807
3808/* Functions for binary tree operation. */
3809
3810/* Create a tree node. */
3811
3812static bin_tree_t *
3813create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3814 re_token_type_t type)
3815{
3816 re_token_t t;
3817 t.type = type;
3818 return create_token_tree (dfa, left, right, &t);
3819}
3820
3821static bin_tree_t *
3822create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3823 const re_token_t *token)
3824{
3825 bin_tree_t *tree;
3826 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3827 {
3828 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3829
3830 if (storage == NULL)
3831 return NULL;
3832 storage->next = dfa->str_tree_storage;
3833 dfa->str_tree_storage = storage;
3834 dfa->str_tree_storage_idx = 0;
3835 }
3836 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3837
3838 tree->parent = NULL;
3839 tree->left = left;
3840 tree->right = right;
3841 tree->token = *token;
3842 tree->token.duplicated = 0;
3843 tree->token.opt_subexp = 0;
3844 tree->first = NULL;
3845 tree->next = NULL;
3846 tree->node_idx = -1;
3847
3848 if (left != NULL)
3849 left->parent = tree;
3850 if (right != NULL)
3851 right->parent = tree;
3852 return tree;
3853}
3854
3855/* Mark the tree SRC as an optional subexpression.
3856 To be called from preorder or postorder. */
3857
3858static reg_errcode_t
3859mark_opt_subexp (void *extra, bin_tree_t *node)
3860{
3861 int idx = (int) (intptr_t) extra;
3862 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3863 node->token.opt_subexp = 1;
3864
3865 return REG_NOERROR;
3866}
3867
3868/* Free the allocated memory inside NODE. */
3869
3870static void
3871free_token (re_token_t *node)
3872{
3873#ifdef RE_ENABLE_I18N
3874 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3875 free_charset (node->opr.mbcset);
3876 else
3877#endif /* RE_ENABLE_I18N */
3878 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3879 re_free (node->opr.sbcset);
3880}
3881
3882/* Worker function for tree walking. Free the allocated memory inside NODE
3883 and its children. */
3884
3885static reg_errcode_t
3886free_tree (UNUSED_PARAM void *extra, bin_tree_t *node)
3887{
3888 free_token (&node->token);
3889 return REG_NOERROR;
3890}
3891
3892
3893/* Duplicate the node SRC, and return new node. This is a preorder
3894 visit similar to the one implemented by the generic visitor, but
3895 we need more infrastructure to maintain two parallel trees --- so,
3896 it's easier to duplicate. */
3897
3898static bin_tree_t *
3899duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3900{
3901 const bin_tree_t *node;
3902 bin_tree_t *dup_root;
3903 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3904
3905 for (node = root; ; )
3906 {
3907 /* Create a new tree and link it back to the current parent. */
3908 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3909 if (*p_new == NULL)
3910 return NULL;
3911 (*p_new)->parent = dup_node;
3912 (*p_new)->token.duplicated = 1;
3913 dup_node = *p_new;
3914
3915 /* Go to the left node, or up and to the right. */
3916 if (node->left)
3917 {
3918 node = node->left;
3919 p_new = &dup_node->left;
3920 }
3921 else
3922 {
3923 const bin_tree_t *prev = NULL;
3924 while (node->right == prev || node->right == NULL)
3925 {
3926 prev = node;
3927 node = node->parent;
3928 dup_node = dup_node->parent;
3929 if (!node)
3930 return dup_root;
3931 }
3932 node = node->right;
3933 p_new = &dup_node->right;
3934 }
3935 }
3936}