1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
29 #if defined HAVE_WCHAR_H || defined _LIBC
31 #endif /* HAVE_WCHAR_H || _LIBC */
32 #if defined HAVE_WCTYPE_H || defined _LIBC
34 #endif /* HAVE_WCTYPE_H || _LIBC */
36 /* In case that the system doesn't have isblank(). */
37 #if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
38 # define isblank(ch) ((ch) == ' ' || (ch) == '\t')
42 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
43 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
44 # include <locale/localeinfo.h>
45 # include <locale/elem-hash.h>
46 # include <locale/coll-lookup.h>
50 /* This is for other GNU distributions with internationalized messages. */
51 #if HAVE_LIBINTL_H || defined _LIBC
55 # define gettext(msgid) \
56 INTUSE(__dcgettext) (INTUSE(_libc_intl_domainname), msgid, LC_MESSAGES)
59 # define gettext(msgid) (msgid)
63 /* This define is so xgettext can find the internationalizable
65 # define gettext_noop(String) String
69 #include "regex_internal.h"
71 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
72 int length, reg_syntax_t syntax);
73 static void re_compile_fastmap_iter (regex_t *bufp,
74 const re_dfastate_t *init_state,
76 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
77 static reg_errcode_t init_word_char (re_dfa_t *dfa);
79 static void free_charset (re_charset_t *cset);
80 #endif /* RE_ENABLE_I18N */
81 static void free_workarea_compile (regex_t *preg);
82 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
83 static reg_errcode_t analyze (re_dfa_t *dfa);
84 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
85 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
86 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
87 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
88 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
89 int top_clone_node, int root_node,
90 unsigned int constraint);
91 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
92 unsigned int constraint);
93 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
94 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
96 static void calc_inveclosure (re_dfa_t *dfa);
97 static int fetch_number (re_string_t *input, re_token_t *token,
99 static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
100 static int peek_token (re_token_t *token, re_string_t *input,
101 reg_syntax_t syntax);
102 static int peek_token_bracket (re_token_t *token, re_string_t *input,
103 reg_syntax_t syntax);
104 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
105 reg_syntax_t syntax, reg_errcode_t *err);
106 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
107 re_token_t *token, reg_syntax_t syntax,
108 int nest, reg_errcode_t *err);
109 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
110 re_token_t *token, reg_syntax_t syntax,
111 int nest, reg_errcode_t *err);
112 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
113 re_token_t *token, reg_syntax_t syntax,
114 int nest, reg_errcode_t *err);
115 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
116 re_token_t *token, reg_syntax_t syntax,
117 int nest, reg_errcode_t *err);
118 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
119 re_dfa_t *dfa, re_token_t *token,
120 reg_syntax_t syntax, reg_errcode_t *err);
121 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
122 re_token_t *token, reg_syntax_t syntax,
124 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
126 re_token_t *token, int token_len,
128 reg_syntax_t syntax);
129 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
133 # ifdef RE_ENABLE_I18N
134 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
135 re_charset_t *mbcset, int *range_alloc,
136 bracket_elem_t *start_elem,
137 bracket_elem_t *end_elem);
138 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
139 re_charset_t *mbcset,
141 const unsigned char *name);
142 # else /* not RE_ENABLE_I18N */
143 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
144 bracket_elem_t *start_elem,
145 bracket_elem_t *end_elem);
146 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
147 const unsigned char *name);
148 # endif /* not RE_ENABLE_I18N */
149 #endif /* not _LIBC */
150 #ifdef RE_ENABLE_I18N
151 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
152 re_charset_t *mbcset,
153 int *equiv_class_alloc,
154 const unsigned char *name);
155 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
156 re_charset_t *mbcset,
157 int *char_class_alloc,
158 const unsigned char *class_name,
159 reg_syntax_t syntax);
160 #else /* not RE_ENABLE_I18N */
161 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
162 const unsigned char *name);
163 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
164 const unsigned char *class_name,
165 reg_syntax_t syntax);
166 #endif /* not RE_ENABLE_I18N */
167 static bin_tree_t *build_word_op (re_dfa_t *dfa, int not, reg_errcode_t *err);
168 static void free_bin_tree (bin_tree_t *tree);
169 static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
170 re_token_type_t type, int index);
171 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
173 /* This table gives an error message for each of the error codes listed
174 in regex.h. Obviously the order here has to be same as there.
175 POSIX doesn't require that we do anything for REG_NOERROR,
176 but why not be nice? */
178 const char __re_error_msgid[] attribute_hidden =
180 #define REG_NOERROR_IDX 0
181 gettext_noop ("Success") /* REG_NOERROR */
183 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
184 gettext_noop ("No match") /* REG_NOMATCH */
186 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
187 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
189 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
190 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
192 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
193 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
195 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
196 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
198 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
199 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
201 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
202 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
204 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
205 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
207 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
208 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
210 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
211 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
213 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
214 gettext_noop ("Invalid range end") /* REG_ERANGE */
216 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
217 gettext_noop ("Memory exhausted") /* REG_ESPACE */
219 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
220 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
222 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
223 gettext_noop ("Premature end of regular expression") /* REG_EEND */
225 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
226 gettext_noop ("Regular expression too big") /* REG_ESIZE */
228 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
229 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
232 const size_t __re_error_msgid_idx[] attribute_hidden =
253 /* Entry points for GNU code. */
255 /* re_compile_pattern is the GNU regular expression compiler: it
256 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
257 Returns 0 if the pattern was valid, otherwise an error string.
259 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
260 are set in BUFP on entry. */
263 re_compile_pattern (pattern, length, bufp)
266 struct re_pattern_buffer *bufp;
270 /* GNU code is written to assume at least RE_NREGS registers will be set
271 (and at least one extra will be -1). */
272 bufp->regs_allocated = REGS_UNALLOCATED;
274 /* And GNU code determines whether or not to get register information
275 by passing null for the REGS argument to re_match, etc., not by
279 /* Match anchors at newline. */
280 bufp->newline_anchor = 1;
282 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
286 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
289 weak_alias (__re_compile_pattern, re_compile_pattern)
292 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
293 also be assigned to arbitrarily: each pattern buffer stores its own
294 syntax, so it can be changed between regex compilations. */
295 /* This has no initializer because initialized variables in Emacs
296 become read-only after dumping. */
297 reg_syntax_t re_syntax_options;
300 /* Specify the precise syntax of regexps for compilation. This provides
301 for compatibility for various utilities which historically have
302 different, incompatible syntaxes.
304 The argument SYNTAX is a bit mask comprised of the various bits
305 defined in regex.h. We return the old syntax. */
308 re_set_syntax (syntax)
311 reg_syntax_t ret = re_syntax_options;
313 re_syntax_options = syntax;
317 weak_alias (__re_set_syntax, re_set_syntax)
321 re_compile_fastmap (bufp)
322 struct re_pattern_buffer *bufp;
324 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
325 char *fastmap = bufp->fastmap;
327 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
328 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
329 if (dfa->init_state != dfa->init_state_word)
330 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
331 if (dfa->init_state != dfa->init_state_nl)
332 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
333 if (dfa->init_state != dfa->init_state_begbuf)
334 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
335 bufp->fastmap_accurate = 1;
339 weak_alias (__re_compile_fastmap, re_compile_fastmap)
342 /* Helper function for re_compile_fastmap.
343 Compile fastmap for the initial_state INIT_STATE. */
346 re_compile_fastmap_iter (bufp, init_state, fastmap)
348 const re_dfastate_t *init_state;
351 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
353 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
355 int node = init_state->nodes.elems[node_cnt];
356 re_token_type_t type = dfa->nodes[node].type;
358 if (type == CHARACTER)
359 fastmap[dfa->nodes[node].opr.c] = 1;
360 else if (type == SIMPLE_BRACKET)
363 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
364 for (j = 0; j < UINT_BITS; ++j, ++ch)
365 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
368 #ifdef RE_ENABLE_I18N
369 else if (type == COMPLEX_BRACKET)
372 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
373 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
374 || cset->nranges || cset->nchar_classes)
377 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
379 /* In this case we want to catch the bytes which are
380 the first byte of any collation elements.
381 e.g. In da_DK, we want to catch 'a' since "aa"
382 is a valid collation element, and don't catch
383 'b' since 'b' is the only collation element
384 which starts from 'b'. */
386 const int32_t *table = (const int32_t *)
387 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
388 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
389 for (j = 0; j < UINT_BITS; ++j, ++ch)
395 for (i = 0; i < SBC_MAX; ++i)
396 if (__btowc (i) == WEOF)
398 # endif /* not _LIBC */
400 for (i = 0; i < cset->nmbchars; ++i)
403 wctomb (buf, cset->mbchars[i]);
404 fastmap[*(unsigned char *) buf] = 1;
407 #endif /* RE_ENABLE_I18N */
408 else if (type == END_OF_RE || type == OP_PERIOD
409 #ifdef RE_ENABLE_I18N
410 || type == COMPLEX_BRACKET
411 #endif /* RE_ENABLE_I18N */
414 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
415 if (type == END_OF_RE)
416 bufp->can_be_null = 1;
422 /* Entry point for POSIX code. */
423 /* regcomp takes a regular expression as a string and compiles it.
425 PREG is a regex_t *. We do not expect any fields to be initialized,
426 since POSIX says we shouldn't. Thus, we set
428 `buffer' to the compiled pattern;
429 `used' to the length of the compiled pattern;
430 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
431 REG_EXTENDED bit in CFLAGS is set; otherwise, to
432 RE_SYNTAX_POSIX_BASIC;
433 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
434 `fastmap' to an allocated space for the fastmap;
435 `fastmap_accurate' to zero;
436 `re_nsub' to the number of subexpressions in PATTERN.
438 PATTERN is the address of the pattern string.
440 CFLAGS is a series of bits which affect compilation.
442 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
443 use POSIX basic syntax.
445 If REG_NEWLINE is set, then . and [^...] don't match newline.
446 Also, regexec will try a match beginning after every newline.
448 If REG_ICASE is set, then we considers upper- and lowercase
449 versions of letters to be equivalent when matching.
451 If REG_NOSUB is set, then when PREG is passed to regexec, that
452 routine will report only success or failure, and nothing about the
455 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
456 the return codes and their meanings.) */
459 regcomp (preg, pattern, cflags)
460 regex_t *__restrict preg;
461 const char *__restrict pattern;
465 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
466 : RE_SYNTAX_POSIX_BASIC);
471 preg->can_be_null = 0;
473 /* Try to allocate space for the fastmap. */
474 preg->fastmap = re_malloc (char, SBC_MAX);
475 if (BE (preg->fastmap == NULL, 0))
478 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
480 /* If REG_NEWLINE is set, newlines are treated differently. */
481 if (cflags & REG_NEWLINE)
482 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
483 syntax &= ~RE_DOT_NEWLINE;
484 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
485 /* It also changes the matching behavior. */
486 preg->newline_anchor = 1;
489 preg->newline_anchor = 0;
490 preg->no_sub = !!(cflags & REG_NOSUB);
491 preg->translate = NULL;
493 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
495 /* POSIX doesn't distinguish between an unmatched open-group and an
496 unmatched close-group: both are REG_EPAREN. */
497 if (ret == REG_ERPAREN)
500 /* We have already checked preg->fastmap != NULL. */
501 if (BE (ret == REG_NOERROR, 1))
502 /* Compute the fastmap now, since regexec cannot modify the pattern
503 buffer. This function nevers fails in this implementation. */
504 (void) __re_compile_fastmap (preg);
507 /* Some error occurred while compiling the expression. */
508 re_free (preg->fastmap);
509 preg->fastmap = NULL;
515 weak_alias (__regcomp, regcomp)
518 /* Returns a message corresponding to an error code, ERRCODE, returned
519 from either regcomp or regexec. We don't use PREG here. */
522 regerror (errcode, preg, errbuf, errbuf_size)
532 || errcode >= (int) (sizeof (__re_error_msgid_idx)
533 / sizeof (__re_error_msgid_idx[0])), 0))
534 /* Only error codes returned by the rest of the code should be passed
535 to this routine. If we are given anything else, or if other regex
536 code generates an invalid error code, then the program has a bug.
537 Dump core so we can fix it. */
540 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
542 msg_size = strlen (msg) + 1; /* Includes the null. */
544 if (BE (errbuf_size != 0, 1))
546 if (BE (msg_size > errbuf_size, 0))
548 #if defined HAVE_MEMPCPY || defined _LIBC
549 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
551 memcpy (errbuf, msg, errbuf_size - 1);
552 errbuf[errbuf_size - 1] = 0;
556 memcpy (errbuf, msg, msg_size);
562 weak_alias (__regerror, regerror)
567 free_dfa_content (re_dfa_t *dfa)
571 re_free (dfa->subexps);
573 for (i = 0; i < dfa->nodes_len; ++i)
575 re_token_t *node = dfa->nodes + i;
576 #ifdef RE_ENABLE_I18N
577 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
578 free_charset (node->opr.mbcset);
580 #endif /* RE_ENABLE_I18N */
581 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
582 re_free (node->opr.sbcset);
584 re_free (dfa->nexts);
585 for (i = 0; i < dfa->nodes_len; ++i)
587 if (dfa->eclosures != NULL)
588 re_node_set_free (dfa->eclosures + i);
589 if (dfa->inveclosures != NULL)
590 re_node_set_free (dfa->inveclosures + i);
591 if (dfa->edests != NULL)
592 re_node_set_free (dfa->edests + i);
594 re_free (dfa->edests);
595 re_free (dfa->eclosures);
596 re_free (dfa->inveclosures);
597 re_free (dfa->nodes);
599 for (i = 0; i <= dfa->state_hash_mask; ++i)
601 struct re_state_table_entry *entry = dfa->state_table + i;
602 for (j = 0; j < entry->num; ++j)
604 re_dfastate_t *state = entry->array[j];
607 re_free (entry->array);
609 re_free (dfa->state_table);
611 if (dfa->word_char != NULL)
612 re_free (dfa->word_char);
614 re_free (dfa->re_str);
621 /* Free dynamically allocated space used by PREG. */
627 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
628 if (BE (dfa != NULL, 1))
629 free_dfa_content (dfa);
631 re_free (preg->fastmap);
634 weak_alias (__regfree, regfree)
637 /* Entry points compatible with 4.2 BSD regex library. We don't define
638 them unless specifically requested. */
640 #if defined _REGEX_RE_COMP || defined _LIBC
642 /* BSD has one and only one pattern buffer. */
643 static struct re_pattern_buffer re_comp_buf;
647 /* Make these definitions weak in libc, so POSIX programs can redefine
648 these names if they don't use our functions, and still use
649 regcomp/regexec above without link errors. */
660 if (!re_comp_buf.buffer)
661 return gettext ("No previous regular expression");
665 if (re_comp_buf.buffer)
667 fastmap = re_comp_buf.fastmap;
668 re_comp_buf.fastmap = NULL;
669 __regfree (&re_comp_buf);
670 re_comp_buf.buffer = NULL;
671 re_comp_buf.allocated = 0;
672 re_comp_buf.fastmap = fastmap;
675 if (re_comp_buf.fastmap == NULL)
677 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
678 if (re_comp_buf.fastmap == NULL)
679 return (char *) gettext (__re_error_msgid
680 + __re_error_msgid_idx[(int) REG_ESPACE]);
683 /* Since `re_exec' always passes NULL for the `regs' argument, we
684 don't need to initialize the pattern buffer fields which affect it. */
686 /* Match anchors at newlines. */
687 re_comp_buf.newline_anchor = 1;
689 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
694 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
695 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
699 libc_freeres_fn (free_mem)
701 __regfree (&re_comp_buf);
705 #endif /* _REGEX_RE_COMP */
707 /* Internal entry point.
708 Compile the regular expression PATTERN, whose length is LENGTH.
709 SYNTAX indicate regular expression's syntax. */
712 re_compile_internal (preg, pattern, length, syntax)
714 const char * pattern;
718 reg_errcode_t err = REG_NOERROR;
722 /* Initialize the pattern buffer. */
723 preg->fastmap_accurate = 0;
724 preg->syntax = syntax;
725 preg->not_bol = preg->not_eol = 0;
729 /* Initialize the dfa. */
730 dfa = (re_dfa_t *) preg->buffer;
731 if (preg->allocated < sizeof (re_dfa_t))
733 /* If zero allocated, but buffer is non-null, try to realloc
734 enough space. This loses if buffer's address is bogus, but
735 that is the user's responsibility. If ->buffer is NULL this
736 is a simple allocation. */
737 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
740 preg->allocated = sizeof (re_dfa_t);
742 preg->buffer = (unsigned char *) dfa;
743 preg->used = sizeof (re_dfa_t);
745 err = init_dfa (dfa, length);
746 if (BE (err != REG_NOERROR, 0))
753 dfa->re_str = re_malloc (char, length + 1);
754 strncpy (dfa->re_str, pattern, length + 1);
757 err = re_string_construct (®exp, pattern, length, preg->translate,
759 if (BE (err != REG_NOERROR, 0))
766 /* Parse the regular expression, and build a structure tree. */
768 dfa->str_tree = parse (®exp, preg, syntax, &err);
769 if (BE (dfa->str_tree == NULL, 0))
770 goto re_compile_internal_free_return;
772 /* Analyze the tree and collect information which is necessary to
775 if (BE (err != REG_NOERROR, 0))
776 goto re_compile_internal_free_return;
778 /* Then create the initial state of the dfa. */
779 err = create_initial_state (dfa);
781 /* Release work areas. */
782 free_workarea_compile (preg);
783 re_string_destruct (®exp);
785 if (BE (err != REG_NOERROR, 0))
787 re_compile_internal_free_return:
788 free_dfa_content (dfa);
795 /* Initialize DFA. We use the length of the regular expression PAT_LEN
796 as the initial length of some arrays. */
799 init_dfa (dfa, pat_len)
805 memset (dfa, '\0', sizeof (re_dfa_t));
807 dfa->nodes_alloc = pat_len + 1;
808 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
810 dfa->states_alloc = pat_len + 1;
812 /* table_size = 2 ^ ceil(log pat_len) */
813 for (table_size = 1; table_size > 0; table_size <<= 1)
814 if (table_size > pat_len)
817 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
818 dfa->state_hash_mask = table_size - 1;
820 dfa->subexps_alloc = 1;
821 dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
822 dfa->word_char = NULL;
824 if (BE (dfa->nodes == NULL || dfa->state_table == NULL
825 || dfa->subexps == NULL, 0))
827 /* We don't bother to free anything which was allocated. Very
828 soon the process will go down anyway. */
830 dfa->state_table = NULL;
837 /* Initialize WORD_CHAR table, which indicate which character is
838 "word". In this case "word" means that it is the word construction
839 character used by some operators like "\<", "\>", etc. */
846 dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
847 if (BE (dfa->word_char == NULL, 0))
849 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
850 for (j = 0; j < UINT_BITS; ++j, ++ch)
851 if (isalnum (ch) || ch == '_')
852 dfa->word_char[i] |= 1 << j;
856 /* Free the work area which are only used while compiling. */
859 free_workarea_compile (preg)
862 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
863 free_bin_tree (dfa->str_tree);
864 dfa->str_tree = NULL;
867 /* Create initial states for all contexts. */
870 create_initial_state (dfa)
875 re_node_set init_nodes;
877 /* Initial states have the epsilon closure of the node which is
878 the first node of the regular expression. */
879 first = dfa->str_tree->first;
880 dfa->init_node = first;
881 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
882 if (BE (err != REG_NOERROR, 0))
885 /* The back-references which are in initial states can epsilon transit,
886 since in this case all of the subexpressions can be null.
887 Then we add epsilon closures of the nodes which are the next nodes of
888 the back-references. */
889 if (dfa->nbackref > 0)
890 for (i = 0; i < init_nodes.nelem; ++i)
892 int node_idx = init_nodes.elems[i];
893 re_token_type_t type = dfa->nodes[node_idx].type;
896 if (type != OP_BACK_REF)
898 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
900 re_token_t *clexp_node;
901 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
902 if (clexp_node->type == OP_CLOSE_SUBEXP
903 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
906 if (clexp_idx == init_nodes.nelem)
909 if (type == OP_BACK_REF)
911 int dest_idx = dfa->edests[node_idx].elems[0];
912 if (!re_node_set_contains (&init_nodes, dest_idx))
914 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
920 /* It must be the first time to invoke acquire_state. */
921 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
922 /* We don't check ERR here, since the initial state must not be NULL. */
923 if (BE (dfa->init_state == NULL, 0))
925 if (dfa->init_state->has_constraint)
927 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
929 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
931 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
935 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
936 || dfa->init_state_begbuf == NULL, 0))
940 dfa->init_state_word = dfa->init_state_nl
941 = dfa->init_state_begbuf = dfa->init_state;
943 re_node_set_free (&init_nodes);
947 /* Analyze the structure tree, and calculate "first", "next", "edest",
948 "eclosure", and "inveclosure". */
957 /* Allocate arrays. */
958 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
959 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
960 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
961 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
962 if (BE (dfa->nexts == NULL || dfa->edests == NULL
963 || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
965 /* Initialize them. */
966 for (i = 0; i < dfa->nodes_len; ++i)
969 re_node_set_init_empty (dfa->edests + i);
970 re_node_set_init_empty (dfa->eclosures + i);
971 re_node_set_init_empty (dfa->inveclosures + i);
974 ret = analyze_tree (dfa, dfa->str_tree);
975 if (BE (ret == REG_NOERROR, 1))
977 ret = calc_eclosure (dfa);
978 if (ret == REG_NOERROR)
979 calc_inveclosure (dfa);
984 /* Helper functions for analyze.
985 This function calculate "first", "next", and "edest" for the subtree
986 whose root is NODE. */
989 analyze_tree (dfa, node)
994 if (node->first == -1)
995 calc_first (dfa, node);
996 if (node->next == -1)
997 calc_next (dfa, node);
998 if (node->eclosure.nelem == 0)
999 calc_epsdest (dfa, node);
1000 /* Calculate "first" etc. for the left child. */
1001 if (node->left != NULL)
1003 ret = analyze_tree (dfa, node->left);
1004 if (BE (ret != REG_NOERROR, 0))
1007 /* Calculate "first" etc. for the right child. */
1008 if (node->right != NULL)
1010 ret = analyze_tree (dfa, node->right);
1011 if (BE (ret != REG_NOERROR, 0))
1017 /* Calculate "first" for the node NODE. */
1019 calc_first (dfa, node)
1024 idx = node->node_idx;
1025 type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1030 case OP_OPEN_BRACKET:
1031 case OP_CLOSE_BRACKET:
1032 case OP_OPEN_DUP_NUM:
1033 case OP_CLOSE_DUP_NUM:
1034 case OP_NON_MATCH_LIST:
1035 case OP_OPEN_COLL_ELEM:
1036 case OP_CLOSE_COLL_ELEM:
1037 case OP_OPEN_EQUIV_CLASS:
1038 case OP_CLOSE_EQUIV_CLASS:
1039 case OP_OPEN_CHAR_CLASS:
1040 case OP_CLOSE_CHAR_CLASS:
1041 /* These must not be appeared here. */
1047 case OP_DUP_ASTERISK:
1048 case OP_DUP_QUESTION:
1049 #ifdef RE_ENABLE_I18N
1050 case COMPLEX_BRACKET:
1051 #endif /* RE_ENABLE_I18N */
1052 case SIMPLE_BRACKET:
1055 case OP_OPEN_SUBEXP:
1056 case OP_CLOSE_SUBEXP:
1061 assert (node->left != NULL);
1063 if (node->left->first == -1)
1064 calc_first (dfa, node->left);
1065 node->first = node->left->first;
1070 /* else fall through */
1073 assert (node->left != NULL);
1075 if (node->left->first == -1)
1076 calc_first (dfa, node->left);
1077 node->first = node->left->first;
1082 /* Calculate "next" for the node NODE. */
1085 calc_next (dfa, node)
1090 bin_tree_t *parent = node->parent;
1094 idx = node->node_idx;
1095 if (node->type == 0)
1096 dfa->nexts[idx] = node->next;
1100 idx = parent->node_idx;
1101 type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1105 case OP_DUP_ASTERISK:
1110 if (parent->left == node)
1112 if (parent->right->first == -1)
1113 calc_first (dfa, parent->right);
1114 node->next = parent->right->first;
1117 /* else fall through */
1119 if (parent->next == -1)
1120 calc_next (dfa, parent);
1121 node->next = parent->next;
1124 idx = node->node_idx;
1125 if (node->type == 0)
1126 dfa->nexts[idx] = node->next;
1129 /* Calculate "edest" for the node NODE. */
1132 calc_epsdest (dfa, node)
1137 idx = node->node_idx;
1138 if (node->type == 0)
1140 if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1141 || dfa->nodes[idx].type == OP_DUP_PLUS
1142 || dfa->nodes[idx].type == OP_DUP_QUESTION)
1144 if (node->left->first == -1)
1145 calc_first (dfa, node->left);
1146 if (node->next == -1)
1147 calc_next (dfa, node);
1148 re_node_set_init_2 (dfa->edests + idx, node->left->first,
1151 else if (dfa->nodes[idx].type == OP_ALT)
1154 if (node->left != NULL)
1156 if (node->left->first == -1)
1157 calc_first (dfa, node->left);
1158 left = node->left->first;
1162 if (node->next == -1)
1163 calc_next (dfa, node);
1166 if (node->right != NULL)
1168 if (node->right->first == -1)
1169 calc_first (dfa, node->right);
1170 right = node->right->first;
1174 if (node->next == -1)
1175 calc_next (dfa, node);
1178 re_node_set_init_2 (dfa->edests + idx, left, right);
1180 else if (dfa->nodes[idx].type == ANCHOR
1181 || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1182 || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1183 || dfa->nodes[idx].type == OP_BACK_REF)
1184 re_node_set_init_1 (dfa->edests + idx, node->next);
1188 /* Duplicate the epsilon closure of the node ROOT_NODE.
1189 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1190 to their own constraint. */
1192 static reg_errcode_t
1193 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1196 int top_org_node, top_clone_node, root_node;
1197 unsigned int init_constraint;
1200 int org_node, clone_node, ret;
1201 unsigned int constraint = init_constraint;
1202 for (org_node = top_org_node, clone_node = top_clone_node;;)
1204 int org_dest, clone_dest;
1205 if (dfa->nodes[org_node].type == OP_BACK_REF)
1207 /* If the back reference epsilon-transit, its destination must
1208 also have the constraint. Then duplicate the epsilon closure
1209 of the destination of the back reference, and store it in
1210 edests of the back reference. */
1211 org_dest = dfa->nexts[org_node];
1212 re_node_set_empty (dfa->edests + clone_node);
1213 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1214 if (BE (err != REG_NOERROR, 0))
1216 dfa->nexts[clone_node] = dfa->nexts[org_node];
1217 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1218 if (BE (ret < 0, 0))
1221 else if (dfa->edests[org_node].nelem == 0)
1223 /* In case of the node can't epsilon-transit, don't duplicate the
1224 destination and store the original destination as the
1225 destination of the node. */
1226 dfa->nexts[clone_node] = dfa->nexts[org_node];
1229 else if (dfa->edests[org_node].nelem == 1)
1231 /* In case of the node can epsilon-transit, and it has only one
1233 org_dest = dfa->edests[org_node].elems[0];
1234 re_node_set_empty (dfa->edests + clone_node);
1235 if (dfa->nodes[org_node].type == ANCHOR)
1237 /* In case of the node has another constraint, append it. */
1238 if (org_node == root_node && clone_node != org_node)
1240 /* ...but if the node is root_node itself, it means the
1241 epsilon closure have a loop, then tie it to the
1242 destination of the root_node. */
1243 ret = re_node_set_insert (dfa->edests + clone_node,
1245 if (BE (ret < 0, 0))
1249 constraint |= dfa->nodes[org_node].opr.ctx_type;
1251 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1252 if (BE (err != REG_NOERROR, 0))
1254 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1255 if (BE (ret < 0, 0))
1258 else /* dfa->edests[org_node].nelem == 2 */
1260 /* In case of the node can epsilon-transit, and it has two
1262 org_dest = dfa->edests[org_node].elems[0];
1263 re_node_set_empty (dfa->edests + clone_node);
1264 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1265 if (BE (err != REG_NOERROR, 0))
1267 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1268 if (BE (ret < 0, 0))
1271 err = duplicate_node_closure (dfa, org_dest, clone_dest, root_node,
1273 if (BE (err != REG_NOERROR, 0))
1276 org_dest = dfa->edests[org_node].elems[1];
1277 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1278 if (BE (err != REG_NOERROR, 0))
1280 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1281 if (BE (ret < 0, 0))
1284 org_node = org_dest;
1285 clone_node = clone_dest;
1290 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1291 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1292 otherwise return the error code. */
1294 static reg_errcode_t
1295 duplicate_node (new_idx, dfa, org_idx, constraint)
1297 int *new_idx, org_idx;
1298 unsigned int constraint;
1303 dup = dfa->nodes[org_idx];
1304 dup_idx = re_dfa_add_node (dfa, dup, 1);
1305 if (BE (dup_idx == -1, 0))
1307 dfa->nodes[dup_idx].constraint = constraint;
1308 if (dfa->nodes[org_idx].type == ANCHOR)
1309 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1310 dfa->nodes[dup_idx].duplicated = 1;
1311 re_node_set_init_empty (dfa->edests + dup_idx);
1312 re_node_set_init_empty (dfa->eclosures + dup_idx);
1313 re_node_set_init_empty (dfa->inveclosures + dup_idx);
1320 calc_inveclosure (dfa)
1324 for (src = 0; src < dfa->nodes_len; ++src)
1326 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1328 dest = dfa->eclosures[src].elems[idx];
1329 re_node_set_insert (dfa->inveclosures + dest, src);
1334 /* Calculate "eclosure" for all the node in DFA. */
1336 static reg_errcode_t
1340 int node_idx, incomplete;
1342 assert (dfa->nodes_len > 0);
1345 /* For each nodes, calculate epsilon closure. */
1346 for (node_idx = 0; ; ++node_idx)
1349 re_node_set eclosure_elem;
1350 if (node_idx == dfa->nodes_len)
1359 assert (dfa->eclosures[node_idx].nelem != -1);
1361 /* If we have already calculated, skip it. */
1362 if (dfa->eclosures[node_idx].nelem != 0)
1364 /* Calculate epsilon closure of `node_idx'. */
1365 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1366 if (BE (err != REG_NOERROR, 0))
1369 if (dfa->eclosures[node_idx].nelem == 0)
1372 re_node_set_free (&eclosure_elem);
1378 /* Calculate epsilon closure of NODE. */
1380 static reg_errcode_t
1381 calc_eclosure_iter (new_set, dfa, node, root)
1382 re_node_set *new_set;
1387 unsigned int constraint;
1389 re_node_set eclosure;
1391 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1392 if (BE (err != REG_NOERROR, 0))
1395 /* This indicates that we are calculating this node now.
1396 We reference this value to avoid infinite loop. */
1397 dfa->eclosures[node].nelem = -1;
1399 constraint = ((dfa->nodes[node].type == ANCHOR)
1400 ? dfa->nodes[node].opr.ctx_type : 0);
1401 /* If the current node has constraints, duplicate all nodes.
1402 Since they must inherit the constraints. */
1403 if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1405 int org_node, cur_node;
1406 org_node = cur_node = node;
1407 err = duplicate_node_closure (dfa, node, node, node, constraint);
1408 if (BE (err != REG_NOERROR, 0))
1412 /* Expand each epsilon destination nodes. */
1413 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1414 for (i = 0; i < dfa->edests[node].nelem; ++i)
1416 re_node_set eclosure_elem;
1417 int edest = dfa->edests[node].elems[i];
1418 /* If calculating the epsilon closure of `edest' is in progress,
1419 return intermediate result. */
1420 if (dfa->eclosures[edest].nelem == -1)
1425 /* If we haven't calculated the epsilon closure of `edest' yet,
1426 calculate now. Otherwise use calculated epsilon closure. */
1427 if (dfa->eclosures[edest].nelem == 0)
1429 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1430 if (BE (err != REG_NOERROR, 0))
1434 eclosure_elem = dfa->eclosures[edest];
1435 /* Merge the epsilon closure of `edest'. */
1436 re_node_set_merge (&eclosure, &eclosure_elem);
1437 /* If the epsilon closure of `edest' is incomplete,
1438 the epsilon closure of this node is also incomplete. */
1439 if (dfa->eclosures[edest].nelem == 0)
1442 re_node_set_free (&eclosure_elem);
1446 /* Epsilon closures include itself. */
1447 re_node_set_insert (&eclosure, node);
1448 if (incomplete && !root)
1449 dfa->eclosures[node].nelem = 0;
1451 dfa->eclosures[node] = eclosure;
1452 *new_set = eclosure;
1456 /* Functions for token which are used in the parser. */
1458 /* Fetch a token from INPUT.
1459 We must not use this function inside bracket expressions. */
1462 fetch_token (input, syntax)
1464 reg_syntax_t syntax;
1468 consumed_byte = peek_token (&token, input, syntax);
1469 re_string_skip_bytes (input, consumed_byte);
1473 /* Peek a token from INPUT, and return the length of the token.
1474 We must not use this function inside bracket expressions. */
1477 peek_token (token, input, syntax)
1480 reg_syntax_t syntax;
1484 if (re_string_eoi (input))
1486 token->type = END_OF_RE;
1490 c = re_string_peek_byte (input, 0);
1493 #ifdef RE_ENABLE_I18N
1494 token->mb_partial = 0;
1495 if (MB_CUR_MAX > 1 &&
1496 !re_string_first_byte (input, re_string_cur_idx (input)))
1498 token->type = CHARACTER;
1499 token->mb_partial = 1;
1506 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1508 token->type = BACK_SLASH;
1512 c2 = re_string_peek_byte_case (input, 1);
1514 token->type = CHARACTER;
1518 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1519 token->type = OP_ALT;
1521 case '1': case '2': case '3': case '4': case '5':
1522 case '6': case '7': case '8': case '9':
1523 if (!(syntax & RE_NO_BK_REFS))
1525 token->type = OP_BACK_REF;
1526 token->opr.idx = c2 - '0';
1530 if (!(syntax & RE_NO_GNU_OPS))
1532 token->type = ANCHOR;
1533 token->opr.idx = WORD_FIRST;
1537 if (!(syntax & RE_NO_GNU_OPS))
1539 token->type = ANCHOR;
1540 token->opr.idx = WORD_LAST;
1544 if (!(syntax & RE_NO_GNU_OPS))
1546 token->type = ANCHOR;
1547 token->opr.idx = WORD_DELIM;
1551 if (!(syntax & RE_NO_GNU_OPS))
1553 token->type = ANCHOR;
1554 token->opr.idx = INSIDE_WORD;
1558 if (!(syntax & RE_NO_GNU_OPS))
1559 token->type = OP_WORD;
1562 if (!(syntax & RE_NO_GNU_OPS))
1563 token->type = OP_NOTWORD;
1566 if (!(syntax & RE_NO_GNU_OPS))
1568 token->type = ANCHOR;
1569 token->opr.idx = BUF_FIRST;
1573 if (!(syntax & RE_NO_GNU_OPS))
1575 token->type = ANCHOR;
1576 token->opr.idx = BUF_LAST;
1580 if (!(syntax & RE_NO_BK_PARENS))
1581 token->type = OP_OPEN_SUBEXP;
1584 if (!(syntax & RE_NO_BK_PARENS))
1585 token->type = OP_CLOSE_SUBEXP;
1588 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1589 token->type = OP_DUP_PLUS;
1592 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1593 token->type = OP_DUP_QUESTION;
1596 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1597 token->type = OP_OPEN_DUP_NUM;
1600 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1601 token->type = OP_CLOSE_DUP_NUM;
1609 token->type = CHARACTER;
1613 if (syntax & RE_NEWLINE_ALT)
1614 token->type = OP_ALT;
1617 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1618 token->type = OP_ALT;
1621 token->type = OP_DUP_ASTERISK;
1624 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1625 token->type = OP_DUP_PLUS;
1628 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1629 token->type = OP_DUP_QUESTION;
1632 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1633 token->type = OP_OPEN_DUP_NUM;
1636 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1637 token->type = OP_CLOSE_DUP_NUM;
1640 if (syntax & RE_NO_BK_PARENS)
1641 token->type = OP_OPEN_SUBEXP;
1644 if (syntax & RE_NO_BK_PARENS)
1645 token->type = OP_CLOSE_SUBEXP;
1648 token->type = OP_OPEN_BRACKET;
1651 token->type = OP_PERIOD;
1654 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1655 re_string_cur_idx (input) != 0)
1657 char prev = re_string_peek_byte (input, -1);
1658 if (prev != '|' && prev != '(' &&
1659 (!(syntax & RE_NEWLINE_ALT) || prev != '\n'))
1662 token->type = ANCHOR;
1663 token->opr.idx = LINE_FIRST;
1666 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1667 re_string_cur_idx (input) + 1 != re_string_length (input))
1670 re_string_skip_bytes (input, 1);
1671 peek_token (&next, input, syntax);
1672 re_string_skip_bytes (input, -1);
1673 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1676 token->type = ANCHOR;
1677 token->opr.idx = LINE_LAST;
1685 /* Peek a token from INPUT, and return the length of the token.
1686 We must not use this function out of bracket expressions. */
1689 peek_token_bracket (token, input, syntax)
1692 reg_syntax_t syntax;
1695 if (re_string_eoi (input))
1697 token->type = END_OF_RE;
1700 c = re_string_peek_byte (input, 0);
1703 #ifdef RE_ENABLE_I18N
1704 if (MB_CUR_MAX > 1 &&
1705 !re_string_first_byte (input, re_string_cur_idx (input)))
1707 token->type = CHARACTER;
1710 #endif /* RE_ENABLE_I18N */
1712 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1714 /* In this case, '\' escape a character. */
1716 re_string_skip_bytes (input, 1);
1717 c2 = re_string_peek_byte (input, 0);
1719 token->type = CHARACTER;
1722 if (c == '[') /* '[' is a special char in a bracket exps. */
1726 c2 = re_string_peek_byte (input, 1);
1732 token->type = OP_OPEN_COLL_ELEM;
1735 token->type = OP_OPEN_EQUIV_CLASS;
1738 if (syntax & RE_CHAR_CLASSES)
1740 token->type = OP_OPEN_CHAR_CLASS;
1743 /* else fall through. */
1745 token->type = CHARACTER;
1755 token->type = OP_CHARSET_RANGE;
1758 token->type = OP_CLOSE_BRACKET;
1761 token->type = OP_NON_MATCH_LIST;
1764 token->type = CHARACTER;
1769 /* Functions for parser. */
1771 /* Entry point of the parser.
1772 Parse the regular expression REGEXP and return the structure tree.
1773 If an error is occured, ERR is set by error code, and return NULL.
1774 This function build the following tree, from regular expression <reg_exp>:
1780 CAT means concatenation.
1781 EOR means end of regular expression. */
1784 parse (regexp, preg, syntax, err)
1785 re_string_t *regexp;
1787 reg_syntax_t syntax;
1790 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1791 bin_tree_t *tree, *eor, *root;
1792 re_token_t current_token;
1794 current_token = fetch_token (regexp, syntax);
1795 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
1796 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1798 new_idx = re_dfa_add_node (dfa, current_token, 0);
1799 eor = create_tree (NULL, NULL, 0, new_idx);
1801 root = create_tree (tree, eor, CONCAT, 0);
1804 if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
1812 /* This function build the following tree, from regular expression
1813 <branch1>|<branch2>:
1819 ALT means alternative, which represents the operator `|'. */
1822 parse_reg_exp (regexp, preg, token, syntax, nest, err)
1823 re_string_t *regexp;
1826 reg_syntax_t syntax;
1830 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1831 bin_tree_t *tree, *branch = NULL;
1833 tree = parse_branch (regexp, preg, token, syntax, nest, err);
1834 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1837 while (token->type == OP_ALT)
1839 re_token_t alt_token = *token;
1840 new_idx = re_dfa_add_node (dfa, alt_token, 0);
1841 *token = fetch_token (regexp, syntax);
1842 if (token->type != OP_ALT && token->type != END_OF_RE
1843 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1845 branch = parse_branch (regexp, preg, token, syntax, nest, err);
1846 if (BE (*err != REG_NOERROR && branch == NULL, 0))
1848 free_bin_tree (tree);
1854 tree = create_tree (tree, branch, 0, new_idx);
1855 if (BE (new_idx == -1 || tree == NULL, 0))
1860 dfa->has_plural_match = 1;
1865 /* This function build the following tree, from regular expression
1872 CAT means concatenation. */
1875 parse_branch (regexp, preg, token, syntax, nest, err)
1876 re_string_t *regexp;
1879 reg_syntax_t syntax;
1883 bin_tree_t *tree, *exp;
1884 tree = parse_expression (regexp, preg, token, syntax, nest, err);
1885 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1888 while (token->type != OP_ALT && token->type != END_OF_RE
1889 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1891 exp = parse_expression (regexp, preg, token, syntax, nest, err);
1892 if (BE (*err != REG_NOERROR && exp == NULL, 0))
1894 free_bin_tree (tree);
1897 if (tree != NULL && exp != NULL)
1899 tree = create_tree (tree, exp, CONCAT, 0);
1906 else if (tree == NULL)
1908 /* Otherwise exp == NULL, we don't need to create new tree. */
1913 /* This function build the following tree, from regular expression a*:
1920 parse_expression (regexp, preg, token, syntax, nest, err)
1921 re_string_t *regexp;
1924 reg_syntax_t syntax;
1928 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1931 switch (token->type)
1934 new_idx = re_dfa_add_node (dfa, *token, 0);
1935 tree = create_tree (NULL, NULL, 0, new_idx);
1936 if (BE (new_idx == -1 || tree == NULL, 0))
1941 #ifdef RE_ENABLE_I18N
1944 while (!re_string_eoi (regexp)
1945 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
1947 bin_tree_t *mbc_remain;
1948 *token = fetch_token (regexp, syntax);
1949 new_idx = re_dfa_add_node (dfa, *token, 0);
1950 mbc_remain = create_tree (NULL, NULL, 0, new_idx);
1951 tree = create_tree (tree, mbc_remain, CONCAT, 0);
1952 if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
1953 return *err = REG_ESPACE, NULL;
1958 case OP_OPEN_SUBEXP:
1959 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
1960 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1963 case OP_OPEN_BRACKET:
1964 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
1965 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1969 if (BE (preg->re_nsub < token->opr.idx
1970 || dfa->subexps[token->opr.idx - 1].end == -1, 0))
1975 new_idx = re_dfa_add_node (dfa, *token, 0);
1976 tree = create_tree (NULL, NULL, 0, new_idx);
1977 if (BE (new_idx == -1 || tree == NULL, 0))
1983 dfa->has_mb_node = 1;
1985 case OP_DUP_ASTERISK:
1987 case OP_DUP_QUESTION:
1988 case OP_OPEN_DUP_NUM:
1989 if (syntax & RE_CONTEXT_INVALID_OPS)
1994 else if (syntax & RE_CONTEXT_INDEP_OPS)
1996 *token = fetch_token (regexp, syntax);
1997 return parse_expression (regexp, preg, token, syntax, nest, err);
1999 /* else fall through */
2000 case OP_CLOSE_SUBEXP:
2001 if ((token->type == OP_CLOSE_SUBEXP) &&
2002 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2007 /* else fall through */
2008 case OP_CLOSE_DUP_NUM:
2009 /* We treat it as a normal character. */
2011 /* Then we can these characters as normal characters. */
2012 token->type = CHARACTER;
2013 new_idx = re_dfa_add_node (dfa, *token, 0);
2014 tree = create_tree (NULL, NULL, 0, new_idx);
2015 if (BE (new_idx == -1 || tree == NULL, 0))
2022 if (dfa->word_char == NULL)
2024 *err = init_word_char (dfa);
2025 if (BE (*err != REG_NOERROR, 0))
2028 if (token->opr.ctx_type == WORD_DELIM)
2030 bin_tree_t *tree_first, *tree_last;
2031 int idx_first, idx_last;
2032 token->opr.ctx_type = WORD_FIRST;
2033 idx_first = re_dfa_add_node (dfa, *token, 0);
2034 tree_first = create_tree (NULL, NULL, 0, idx_first);
2035 token->opr.ctx_type = WORD_LAST;
2036 idx_last = re_dfa_add_node (dfa, *token, 0);
2037 tree_last = create_tree (NULL, NULL, 0, idx_last);
2038 token->type = OP_ALT;
2039 new_idx = re_dfa_add_node (dfa, *token, 0);
2040 tree = create_tree (tree_first, tree_last, 0, new_idx);
2041 if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
2042 || tree_first == NULL || tree_last == NULL
2043 || tree == NULL, 0))
2051 new_idx = re_dfa_add_node (dfa, *token, 0);
2052 tree = create_tree (NULL, NULL, 0, new_idx);
2053 if (BE (new_idx == -1 || tree == NULL, 0))
2054 return *err = REG_ESPACE, NULL;
2056 /* We must return here, since ANCHORs can't be followed
2057 by repetition operators.
2058 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2059 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2060 *token = fetch_token (regexp, syntax);
2063 new_idx = re_dfa_add_node (dfa, *token, 0);
2064 tree = create_tree (NULL, NULL, 0, new_idx);
2065 if (BE (new_idx == -1 || tree == NULL, 0))
2071 dfa->has_mb_node = 1;
2074 tree = build_word_op (dfa, 0, err);
2075 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2079 tree = build_word_op (dfa, 1, err);
2080 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2090 /* Must not happen? */
2096 *token = fetch_token (regexp, syntax);
2098 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2099 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2101 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2102 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2104 dfa->has_plural_match = 1;
2110 /* This function build the following tree, from regular expression
2118 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2119 re_string_t *regexp;
2122 reg_syntax_t syntax;
2126 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2127 bin_tree_t *tree, *left_par, *right_par;
2130 cur_nsub = preg->re_nsub++;
2131 if (dfa->subexps_alloc < preg->re_nsub)
2133 re_subexp_t *new_array;
2134 dfa->subexps_alloc *= 2;
2135 new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2136 if (BE (new_array == NULL, 0))
2138 dfa->subexps_alloc /= 2;
2142 dfa->subexps = new_array;
2144 dfa->subexps[cur_nsub].start = dfa->nodes_len;
2145 dfa->subexps[cur_nsub].end = -1;
2147 new_idx = re_dfa_add_node (dfa, *token, 0);
2148 left_par = create_tree (NULL, NULL, 0, new_idx);
2149 if (BE (new_idx == -1 || left_par == NULL, 0))
2154 dfa->nodes[new_idx].opr.idx = cur_nsub;
2155 *token = fetch_token (regexp, syntax);
2157 /* The subexpression may be a null string. */
2158 if (token->type == OP_CLOSE_SUBEXP)
2162 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2163 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2166 if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2168 free_bin_tree (tree);
2172 new_idx = re_dfa_add_node (dfa, *token, 0);
2173 dfa->subexps[cur_nsub].end = dfa->nodes_len;
2174 right_par = create_tree (NULL, NULL, 0, new_idx);
2175 tree = ((tree == NULL) ? right_par
2176 : create_tree (tree, right_par, CONCAT, 0));
2177 tree = create_tree (left_par, tree, CONCAT, 0);
2178 if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
2183 dfa->nodes[new_idx].opr.idx = cur_nsub;
2188 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2191 parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2192 bin_tree_t *dup_elem;
2193 re_string_t *regexp;
2196 reg_syntax_t syntax;
2199 re_token_t dup_token;
2200 bin_tree_t *tree = dup_elem, *work_tree;
2201 int new_idx, start_idx = re_string_cur_idx (regexp);
2202 re_token_t start_token = *token;
2203 if (token->type == OP_OPEN_DUP_NUM)
2207 int start = fetch_number (regexp, token, syntax);
2211 if (token->type == CHARACTER && token->opr.c == ',')
2212 start = 0; /* We treat "{,m}" as "{0,m}". */
2215 *err = REG_BADBR; /* <re>{} is invalid. */
2219 if (BE (start != -2, 1))
2221 /* We treat "{n}" as "{n,n}". */
2222 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2223 : ((token->type == CHARACTER && token->opr.c == ',')
2224 ? fetch_number (regexp, token, syntax) : -2));
2226 if (BE (start == -2 || end == -2, 0))
2228 /* Invalid sequence. */
2229 if (token->type == OP_CLOSE_DUP_NUM)
2230 goto parse_dup_op_invalid_interval;
2232 goto parse_dup_op_ebrace;
2234 if (BE (start == 0 && end == 0, 0))
2236 /* We treat "<re>{0}" and "<re>{0,0}" as null string. */
2237 *token = fetch_token (regexp, syntax);
2238 free_bin_tree (dup_elem);
2242 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2244 for (i = 0; i < start; ++i)
2247 work_tree = duplicate_tree (elem, dfa);
2248 tree = create_tree (tree, work_tree, CONCAT, 0);
2249 if (BE (work_tree == NULL || tree == NULL, 0))
2250 goto parse_dup_op_espace;
2255 /* We treat "<re>{0,}" as "<re>*". */
2256 dup_token.type = OP_DUP_ASTERISK;
2259 elem = duplicate_tree (elem, dfa);
2260 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2261 work_tree = create_tree (elem, NULL, 0, new_idx);
2262 tree = create_tree (tree, work_tree, CONCAT, 0);
2263 if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
2264 || tree == NULL, 0))
2265 goto parse_dup_op_espace;
2269 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2270 tree = create_tree (elem, NULL, 0, new_idx);
2271 if (BE (new_idx == -1 || tree == NULL, 0))
2272 goto parse_dup_op_espace;
2275 else if (end - start > 0)
2277 /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
2278 dup_token.type = OP_DUP_QUESTION;
2281 elem = duplicate_tree (elem, dfa);
2282 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2283 elem = create_tree (elem, NULL, 0, new_idx);
2284 tree = create_tree (tree, elem, CONCAT, 0);
2285 if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
2286 goto parse_dup_op_espace;
2290 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2291 tree = elem = create_tree (elem, NULL, 0, new_idx);
2292 if (BE (new_idx == -1 || tree == NULL, 0))
2293 goto parse_dup_op_espace;
2295 for (i = 1; i < end - start; ++i)
2297 work_tree = duplicate_tree (elem, dfa);
2298 tree = create_tree (tree, work_tree, CONCAT, 0);
2299 if (BE (work_tree == NULL || tree == NULL, 0))
2309 new_idx = re_dfa_add_node (dfa, *token, 0);
2310 tree = create_tree (tree, NULL, 0, new_idx);
2311 if (BE (new_idx == -1 || tree == NULL, 0))
2317 *token = fetch_token (regexp, syntax);
2320 parse_dup_op_espace:
2321 free_bin_tree (tree);
2325 parse_dup_op_ebrace:
2326 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2331 goto parse_dup_op_rollback;
2332 parse_dup_op_invalid_interval:
2333 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2338 parse_dup_op_rollback:
2339 re_string_set_index (regexp, start_idx);
2340 *token = start_token;
2341 token->type = CHARACTER;
2345 /* Size of the names for collating symbol/equivalence_class/character_class.
2346 I'm not sure, but maybe enough. */
2347 #define BRACKET_NAME_BUF_SIZE 32
2350 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2351 Build the range expression which starts from START_ELEM, and ends
2352 at END_ELEM. The result are written to MBCSET and SBCSET.
2353 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2354 mbcset->range_ends, is a pointer argument sinse we may
2357 static reg_errcode_t
2358 # ifdef RE_ENABLE_I18N
2359 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2360 re_charset_t *mbcset;
2362 # else /* not RE_ENABLE_I18N */
2363 build_range_exp (sbcset, start_elem, end_elem)
2364 # endif /* not RE_ENABLE_I18N */
2365 re_bitset_ptr_t sbcset;
2366 bracket_elem_t *start_elem, *end_elem;
2368 unsigned int start_ch, end_ch;
2369 /* Equivalence Classes and Character Classes can't be a range start/end. */
2370 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2371 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2375 /* We can handle no multi character collating elements without libc
2377 if (BE ((start_elem->type == COLL_SYM
2378 && strlen ((char *) start_elem->opr.name) > 1)
2379 || (end_elem->type == COLL_SYM
2380 && strlen ((char *) end_elem->opr.name) > 1), 0))
2381 return REG_ECOLLATE;
2383 # ifdef RE_ENABLE_I18N
2385 wchar_t wc, start_wc, end_wc;
2386 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2388 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2389 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2391 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2392 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2394 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2395 ? __btowc (start_ch) : start_elem->opr.wch);
2396 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2397 ? __btowc (end_ch) : end_elem->opr.wch);
2398 cmp_buf[0] = start_wc;
2399 cmp_buf[4] = end_wc;
2400 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2403 /* Check the space of the arrays. */
2404 if (*range_alloc == mbcset->nranges)
2406 /* There are not enough space, need realloc. */
2407 wchar_t *new_array_start, *new_array_end;
2410 /* +1 in case of mbcset->nranges is 0. */
2411 new_nranges = 2 * mbcset->nranges + 1;
2412 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2413 are NULL if *range_alloc == 0. */
2414 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2416 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2419 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2422 mbcset->range_starts = new_array_start;
2423 mbcset->range_ends = new_array_end;
2424 *range_alloc = new_nranges;
2427 mbcset->range_starts[mbcset->nranges] = start_wc;
2428 mbcset->range_ends[mbcset->nranges++] = end_wc;
2430 /* Build the table for single byte characters. */
2431 for (wc = 0; wc <= SBC_MAX; ++wc)
2434 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2435 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2436 bitset_set (sbcset, wc);
2439 # else /* not RE_ENABLE_I18N */
2442 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2443 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2445 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2446 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2448 if (start_ch > end_ch)
2450 /* Build the table for single byte characters. */
2451 for (ch = 0; ch <= SBC_MAX; ++ch)
2452 if (start_ch <= ch && ch <= end_ch)
2453 bitset_set (sbcset, ch);
2455 # endif /* not RE_ENABLE_I18N */
2458 #endif /* not _LIBC */
2461 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2462 Build the collating element which is represented by NAME.
2463 The result are written to MBCSET and SBCSET.
2464 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2465 pointer argument since we may update it. */
2467 static reg_errcode_t
2468 # ifdef RE_ENABLE_I18N
2469 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2470 re_charset_t *mbcset;
2471 int *coll_sym_alloc;
2472 # else /* not RE_ENABLE_I18N */
2473 build_collating_symbol (sbcset, name)
2474 # endif /* not RE_ENABLE_I18N */
2475 re_bitset_ptr_t sbcset;
2476 const unsigned char *name;
2478 size_t name_len = strlen ((const char *) name);
2479 if (BE (name_len != 1, 0))
2480 return REG_ECOLLATE;
2483 bitset_set (sbcset, name[0]);
2487 #endif /* not _LIBC */
2489 /* This function parse bracket expression like "[abc]", "[a-c]",
2493 parse_bracket_exp (regexp, dfa, token, syntax, err)
2494 re_string_t *regexp;
2497 reg_syntax_t syntax;
2501 const unsigned char *collseqmb;
2502 const char *collseqwc;
2505 const int32_t *symb_table;
2506 const unsigned char *extra;
2508 /* Local function for parse_bracket_exp used in _LIBC environement.
2509 Seek the collating symbol entry correspondings to NAME.
2510 Return the index of the symbol in the SYMB_TABLE. */
2512 static inline int32_t
2513 seek_collating_symbol_entry (name, name_len)
2514 const unsigned char *name;
2517 int32_t hash = elem_hash ((const char *) name, name_len);
2518 int32_t elem = hash % table_size;
2519 int32_t second = hash % (table_size - 2);
2520 while (symb_table[2 * elem] != 0)
2522 /* First compare the hashing value. */
2523 if (symb_table[2 * elem] == hash
2524 /* Compare the length of the name. */
2525 && name_len == extra[symb_table[2 * elem + 1]]
2526 /* Compare the name. */
2527 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2530 /* Yep, this is the entry. */
2540 /* Local function for parse_bracket_exp used in _LIBC environement.
2541 Look up the collation sequence value of BR_ELEM.
2542 Return the value if succeeded, UINT_MAX otherwise. */
2544 static inline unsigned int
2545 lookup_collation_sequence_value (br_elem)
2546 bracket_elem_t *br_elem;
2548 if (br_elem->type == SB_CHAR)
2551 if (MB_CUR_MAX == 1)
2554 return collseqmb[br_elem->opr.ch];
2557 wint_t wc = __btowc (br_elem->opr.ch);
2558 return collseq_table_lookup (collseqwc, wc);
2561 else if (br_elem->type == MB_CHAR)
2563 return collseq_table_lookup (collseqwc, br_elem->opr.wch);
2565 else if (br_elem->type == COLL_SYM)
2567 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2571 elem = seek_collating_symbol_entry (br_elem->opr.name,
2573 if (symb_table[2 * elem] != 0)
2575 /* We found the entry. */
2576 idx = symb_table[2 * elem + 1];
2577 /* Skip the name of collating element name. */
2578 idx += 1 + extra[idx];
2579 /* Skip the byte sequence of the collating element. */
2580 idx += 1 + extra[idx];
2581 /* Adjust for the alignment. */
2582 idx = (idx + 3) & ~3;
2583 /* Skip the multibyte collation sequence value. */
2584 idx += sizeof (unsigned int);
2585 /* Skip the wide char sequence of the collating element. */
2586 idx += sizeof (unsigned int) *
2587 (1 + *(unsigned int *) (extra + idx));
2588 /* Return the collation sequence value. */
2589 return *(unsigned int *) (extra + idx);
2591 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2593 /* No valid character. Match it as a single byte
2595 return collseqmb[br_elem->opr.name[0]];
2598 else if (sym_name_len == 1)
2599 return collseqmb[br_elem->opr.name[0]];
2604 /* Local function for parse_bracket_exp used in _LIBC environement.
2605 Build the range expression which starts from START_ELEM, and ends
2606 at END_ELEM. The result are written to MBCSET and SBCSET.
2607 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2608 mbcset->range_ends, is a pointer argument sinse we may
2611 static inline reg_errcode_t
2612 # ifdef RE_ENABLE_I18N
2613 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2614 re_charset_t *mbcset;
2616 # else /* not RE_ENABLE_I18N */
2617 build_range_exp (sbcset, start_elem, end_elem)
2618 # endif /* not RE_ENABLE_I18N */
2619 re_bitset_ptr_t sbcset;
2620 bracket_elem_t *start_elem, *end_elem;
2623 uint32_t start_collseq;
2624 uint32_t end_collseq;
2626 # ifdef RE_ENABLE_I18N
2627 /* Check the space of the arrays. */
2628 if (*range_alloc == mbcset->nranges)
2630 /* There are not enough space, need realloc. */
2631 uint32_t *new_array_start;
2632 uint32_t *new_array_end;
2635 /* +1 in case of mbcset->nranges is 0. */
2636 new_nranges = 2 * mbcset->nranges + 1;
2637 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2638 are NULL if *range_alloc == 0. */
2639 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2641 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2644 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2647 mbcset->range_starts = new_array_start;
2648 mbcset->range_ends = new_array_end;
2649 *range_alloc = new_nranges;
2651 # endif /* RE_ENABLE_I18N */
2653 /* Equivalence Classes and Character Classes can't be a range
2655 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2656 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2660 start_collseq = lookup_collation_sequence_value (start_elem);
2661 end_collseq = lookup_collation_sequence_value (end_elem);
2662 /* Check start/end collation sequence values. */
2663 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2664 return REG_ECOLLATE;
2665 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2668 # ifdef RE_ENABLE_I18N
2669 /* Got valid collation sequence values, add them as a new entry. */
2670 mbcset->range_starts[mbcset->nranges] = start_collseq;
2671 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2672 # endif /* RE_ENABLE_I18N */
2674 /* Build the table for single byte characters. */
2675 for (ch = 0; ch <= SBC_MAX; ch++)
2677 uint32_t ch_collseq;
2679 if (MB_CUR_MAX == 1)
2682 ch_collseq = collseqmb[ch];
2684 ch_collseq = collseq_table_lookup (collseqwc, __btowc (ch));
2685 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2686 bitset_set (sbcset, ch);
2691 /* Local function for parse_bracket_exp used in _LIBC environement.
2692 Build the collating element which is represented by NAME.
2693 The result are written to MBCSET and SBCSET.
2694 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2695 pointer argument sinse we may update it. */
2697 static inline reg_errcode_t
2698 # ifdef RE_ENABLE_I18N
2699 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2700 re_charset_t *mbcset;
2701 int *coll_sym_alloc;
2702 # else /* not RE_ENABLE_I18N */
2703 build_collating_symbol (sbcset, name)
2704 # endif /* not RE_ENABLE_I18N */
2705 re_bitset_ptr_t sbcset;
2706 const unsigned char *name;
2709 size_t name_len = strlen ((const char *) name);
2712 elem = seek_collating_symbol_entry (name, name_len);
2713 if (symb_table[2 * elem] != 0)
2715 /* We found the entry. */
2716 idx = symb_table[2 * elem + 1];
2717 /* Skip the name of collating element name. */
2718 idx += 1 + extra[idx];
2720 else if (symb_table[2 * elem] == 0 && name_len == 1)
2722 /* No valid character, treat it as a normal
2724 bitset_set (sbcset, name[0]);
2728 return REG_ECOLLATE;
2730 # ifdef RE_ENABLE_I18N
2731 /* Got valid collation sequence, add it as a new entry. */
2732 /* Check the space of the arrays. */
2733 if (*coll_sym_alloc == mbcset->ncoll_syms)
2735 /* Not enough, realloc it. */
2736 /* +1 in case of mbcset->ncoll_syms is 0. */
2737 *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2738 /* Use realloc since mbcset->coll_syms is NULL
2740 mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2742 if (BE (mbcset->coll_syms == NULL, 0))
2745 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2746 # endif /* RE_ENABLE_I18N */
2751 if (BE (name_len != 1, 0))
2752 return REG_ECOLLATE;
2755 bitset_set (sbcset, name[0]);
2762 re_token_t br_token;
2763 re_bitset_ptr_t sbcset;
2764 #ifdef RE_ENABLE_I18N
2765 re_charset_t *mbcset;
2766 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2767 int equiv_class_alloc = 0, char_class_alloc = 0;
2768 #else /* not RE_ENABLE_I18N */
2770 #endif /* not RE_ENABLE_I18N */
2771 bin_tree_t *work_tree;
2772 int token_len, new_idx;
2774 collseqmb = (const unsigned char *)
2775 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2776 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2782 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2783 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2784 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2785 _NL_COLLATE_SYMB_TABLEMB);
2786 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2787 _NL_COLLATE_SYMB_EXTRAMB);
2790 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2791 #ifdef RE_ENABLE_I18N
2792 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2793 #endif /* RE_ENABLE_I18N */
2794 #ifdef RE_ENABLE_I18N
2795 if (BE (sbcset == NULL || mbcset == NULL, 0))
2797 if (BE (sbcset == NULL, 0))
2798 #endif /* RE_ENABLE_I18N */
2804 token_len = peek_token_bracket (token, regexp, syntax);
2805 if (BE (token->type == END_OF_RE, 0))
2808 goto parse_bracket_exp_free_return;
2810 if (token->type == OP_NON_MATCH_LIST)
2812 #ifdef RE_ENABLE_I18N
2814 mbcset->non_match = 1;
2815 #else /* not RE_ENABLE_I18N */
2817 #endif /* not RE_ENABLE_I18N */
2818 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2819 bitset_set (sbcset, '\0');
2820 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2821 token_len = peek_token_bracket (token, regexp, syntax);
2822 if (BE (token->type == END_OF_RE, 0))
2825 goto parse_bracket_exp_free_return;
2827 #ifdef RE_ENABLE_I18N
2829 for (i = 0; i < SBC_MAX; ++i)
2830 if (__btowc (i) == WEOF)
2831 bitset_set (sbcset, i);
2832 #endif /* RE_ENABLE_I18N */
2835 /* We treat the first ']' as a normal character. */
2836 if (token->type == OP_CLOSE_BRACKET)
2837 token->type = CHARACTER;
2841 bracket_elem_t start_elem, end_elem;
2842 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
2843 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
2845 int token_len2 = 0, is_range_exp = 0;
2848 start_elem.opr.name = start_name_buf;
2849 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
2851 if (BE (ret != REG_NOERROR, 0))
2854 goto parse_bracket_exp_free_return;
2857 token_len = peek_token_bracket (token, regexp, syntax);
2858 if (BE (token->type == END_OF_RE, 0))
2861 goto parse_bracket_exp_free_return;
2863 if (token->type == OP_CHARSET_RANGE)
2865 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
2866 token_len2 = peek_token_bracket (&token2, regexp, syntax);
2867 if (BE (token->type == END_OF_RE, 0))
2870 goto parse_bracket_exp_free_return;
2872 if (token2.type == OP_CLOSE_BRACKET)
2874 /* We treat the last '-' as a normal character. */
2875 re_string_skip_bytes (regexp, -token_len);
2876 token->type = CHARACTER;
2882 if (is_range_exp == 1)
2884 end_elem.opr.name = end_name_buf;
2885 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
2887 if (BE (ret != REG_NOERROR, 0))
2890 goto parse_bracket_exp_free_return;
2893 token_len = peek_token_bracket (token, regexp, syntax);
2894 if (BE (token->type == END_OF_RE, 0))
2897 goto parse_bracket_exp_free_return;
2899 *err = build_range_exp (sbcset,
2900 #ifdef RE_ENABLE_I18N
2901 mbcset, &range_alloc,
2902 #endif /* RE_ENABLE_I18N */
2903 &start_elem, &end_elem);
2904 if (BE (*err != REG_NOERROR, 0))
2905 goto parse_bracket_exp_free_return;
2909 switch (start_elem.type)
2912 bitset_set (sbcset, start_elem.opr.ch);
2914 #ifdef RE_ENABLE_I18N
2916 /* Check whether the array has enough space. */
2917 if (mbchar_alloc == mbcset->nmbchars)
2919 /* Not enough, realloc it. */
2920 /* +1 in case of mbcset->nmbchars is 0. */
2921 mbchar_alloc = 2 * mbcset->nmbchars + 1;
2922 /* Use realloc since array is NULL if *alloc == 0. */
2923 mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
2925 if (BE (mbcset->mbchars == NULL, 0))
2926 goto parse_bracket_exp_espace;
2928 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
2930 #endif /* RE_ENABLE_I18N */
2932 *err = build_equiv_class (sbcset,
2933 #ifdef RE_ENABLE_I18N
2934 mbcset, &equiv_class_alloc,
2935 #endif /* RE_ENABLE_I18N */
2936 start_elem.opr.name);
2937 if (BE (*err != REG_NOERROR, 0))
2938 goto parse_bracket_exp_free_return;
2941 *err = build_collating_symbol (sbcset,
2942 #ifdef RE_ENABLE_I18N
2943 mbcset, &coll_sym_alloc,
2944 #endif /* RE_ENABLE_I18N */
2945 start_elem.opr.name);
2946 if (BE (*err != REG_NOERROR, 0))
2947 goto parse_bracket_exp_free_return;
2950 ret = build_charclass (sbcset,
2951 #ifdef RE_ENABLE_I18N
2952 mbcset, &char_class_alloc,
2953 #endif /* RE_ENABLE_I18N */
2954 start_elem.opr.name, syntax);
2955 if (BE (ret != REG_NOERROR, 0))
2956 goto parse_bracket_exp_espace;
2963 if (token->type == OP_CLOSE_BRACKET)
2967 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2969 /* If it is non-matching list. */
2970 #ifdef RE_ENABLE_I18N
2971 if (mbcset->non_match)
2972 #else /* not RE_ENABLE_I18N */
2974 #endif /* not RE_ENABLE_I18N */
2975 bitset_not (sbcset);
2977 /* Build a tree for simple bracket. */
2978 br_token.type = SIMPLE_BRACKET;
2979 br_token.opr.sbcset = sbcset;
2980 new_idx = re_dfa_add_node (dfa, br_token, 0);
2981 work_tree = create_tree (NULL, NULL, 0, new_idx);
2982 if (BE (new_idx == -1 || work_tree == NULL, 0))
2983 goto parse_bracket_exp_espace;
2985 #ifdef RE_ENABLE_I18N
2986 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
2987 || mbcset->nranges || (MB_CUR_MAX > 1 && (mbcset->nchar_classes
2988 || mbcset->non_match)))
2990 re_token_t alt_token;
2991 bin_tree_t *mbc_tree;
2992 /* Build a tree for complex bracket. */
2993 br_token.type = COMPLEX_BRACKET;
2994 br_token.opr.mbcset = mbcset;
2995 dfa->has_mb_node = 1;
2996 new_idx = re_dfa_add_node (dfa, br_token, 0);
2997 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
2998 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
2999 goto parse_bracket_exp_espace;
3000 /* Then join them by ALT node. */
3001 dfa->has_plural_match = 1;
3002 alt_token.type = OP_ALT;
3003 new_idx = re_dfa_add_node (dfa, alt_token, 0);
3004 work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
3005 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3010 free_charset (mbcset);
3013 #else /* not RE_ENABLE_I18N */
3015 #endif /* not RE_ENABLE_I18N */
3017 parse_bracket_exp_espace:
3019 parse_bracket_exp_free_return:
3021 #ifdef RE_ENABLE_I18N
3022 free_charset (mbcset);
3023 #endif /* RE_ENABLE_I18N */
3027 /* Parse an element in the bracket expression. */
3029 static reg_errcode_t
3030 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
3031 bracket_elem_t *elem;
3032 re_string_t *regexp;
3036 reg_syntax_t syntax;
3038 #ifdef RE_ENABLE_I18N
3040 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3041 if (cur_char_size > 1)
3043 elem->type = MB_CHAR;
3044 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3045 re_string_skip_bytes (regexp, cur_char_size);
3048 #endif /* RE_ENABLE_I18N */
3049 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3050 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3051 || token->type == OP_OPEN_EQUIV_CLASS)
3052 return parse_bracket_symbol (elem, regexp, token);
3053 elem->type = SB_CHAR;
3054 elem->opr.ch = token->opr.c;
3058 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3059 such as [:<character_class>:], [.<collating_element>.], and
3060 [=<equivalent_class>=]. */
3062 static reg_errcode_t
3063 parse_bracket_symbol (elem, regexp, token)
3064 bracket_elem_t *elem;
3065 re_string_t *regexp;
3068 unsigned char ch, delim = token->opr.c;
3072 if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
3074 if (token->type == OP_OPEN_CHAR_CLASS)
3075 ch = re_string_fetch_byte_case (regexp);
3077 ch = re_string_fetch_byte (regexp);
3078 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3080 elem->opr.name[i] = ch;
3082 re_string_skip_bytes (regexp, 1);
3083 elem->opr.name[i] = '\0';
3084 switch (token->type)
3086 case OP_OPEN_COLL_ELEM:
3087 elem->type = COLL_SYM;
3089 case OP_OPEN_EQUIV_CLASS:
3090 elem->type = EQUIV_CLASS;
3092 case OP_OPEN_CHAR_CLASS:
3093 elem->type = CHAR_CLASS;
3101 /* Helper function for parse_bracket_exp.
3102 Build the equivalence class which is represented by NAME.
3103 The result are written to MBCSET and SBCSET.
3104 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3105 is a pointer argument sinse we may update it. */
3107 static reg_errcode_t
3108 #ifdef RE_ENABLE_I18N
3109 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3110 re_charset_t *mbcset;
3111 int *equiv_class_alloc;
3112 #else /* not RE_ENABLE_I18N */
3113 build_equiv_class (sbcset, name)
3114 #endif /* not RE_ENABLE_I18N */
3115 re_bitset_ptr_t sbcset;
3116 const unsigned char *name;
3118 #if defined _LIBC && defined RE_ENABLE_I18N
3119 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3122 const int32_t *table, *indirect;
3123 const unsigned char *weights, *extra, *cp;
3124 unsigned char char_buf[2];
3128 /* This #include defines a local function! */
3129 # include <locale/weight.h>
3130 /* Calculate the index for equivalence class. */
3132 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3133 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3134 _NL_COLLATE_WEIGHTMB);
3135 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3136 _NL_COLLATE_EXTRAMB);
3137 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3138 _NL_COLLATE_INDIRECTMB);
3139 idx1 = findidx (&cp);
3140 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3141 /* This isn't a valid character. */
3142 return REG_ECOLLATE;
3144 /* Build single byte matcing table for this equivalence class. */
3145 char_buf[1] = (unsigned char) '\0';
3146 len = weights[idx1];
3147 for (ch = 0; ch < SBC_MAX; ++ch)
3151 idx2 = findidx (&cp);
3156 /* This isn't a valid character. */
3158 if (len == weights[idx2])
3161 while (cnt <= len &&
3162 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3166 bitset_set (sbcset, ch);
3169 /* Check whether the array has enough space. */
3170 if (*equiv_class_alloc == mbcset->nequiv_classes)
3172 /* Not enough, realloc it. */
3173 /* +1 in case of mbcset->nequiv_classes is 0. */
3174 *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3175 /* Use realloc since the array is NULL if *alloc == 0. */
3176 mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
3177 *equiv_class_alloc);
3178 if (BE (mbcset->equiv_classes == NULL, 0))
3181 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3184 #endif /* _LIBC && RE_ENABLE_I18N */
3186 if (BE (strlen ((const char *) name) != 1, 0))
3187 return REG_ECOLLATE;
3188 bitset_set (sbcset, *name);
3193 /* Helper function for parse_bracket_exp.
3194 Build the character class which is represented by NAME.
3195 The result are written to MBCSET and SBCSET.
3196 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3197 is a pointer argument sinse we may update it. */
3199 static reg_errcode_t
3200 #ifdef RE_ENABLE_I18N
3201 build_charclass (sbcset, mbcset, char_class_alloc, class_name, syntax)
3202 re_charset_t *mbcset;
3203 int *char_class_alloc;
3204 #else /* not RE_ENABLE_I18N */
3205 build_charclass (sbcset, class_name, syntax)
3206 #endif /* not RE_ENABLE_I18N */
3207 re_bitset_ptr_t sbcset;
3208 const unsigned char *class_name;
3209 reg_syntax_t syntax;
3212 const char *name = (const char *) class_name;
3214 /* In case of REG_ICASE "upper" and "lower" match the both of
3215 upper and lower cases. */
3216 if ((syntax & RE_ICASE)
3217 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3220 #ifdef RE_ENABLE_I18N
3221 /* Check the space of the arrays. */
3222 if (*char_class_alloc == mbcset->nchar_classes)
3224 /* Not enough, realloc it. */
3225 /* +1 in case of mbcset->nchar_classes is 0. */
3226 *char_class_alloc = 2 * mbcset->nchar_classes + 1;
3227 /* Use realloc since array is NULL if *alloc == 0. */
3228 mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
3230 if (BE (mbcset->char_classes == NULL, 0))
3233 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3234 #endif /* RE_ENABLE_I18N */
3236 #define BUILD_CHARCLASS_LOOP(ctype_func)\
3237 for (i = 0; i < SBC_MAX; ++i) \
3239 if (ctype_func (i)) \
3240 bitset_set (sbcset, i); \
3243 if (strcmp (name, "alnum") == 0)
3244 BUILD_CHARCLASS_LOOP (isalnum)
3245 else if (strcmp (name, "cntrl") == 0)
3246 BUILD_CHARCLASS_LOOP (iscntrl)
3247 else if (strcmp (name, "lower") == 0)
3248 BUILD_CHARCLASS_LOOP (islower)
3249 else if (strcmp (name, "space") == 0)
3250 BUILD_CHARCLASS_LOOP (isspace)
3251 else if (strcmp (name, "alpha") == 0)
3252 BUILD_CHARCLASS_LOOP (isalpha)
3253 else if (strcmp (name, "digit") == 0)
3254 BUILD_CHARCLASS_LOOP (isdigit)
3255 else if (strcmp (name, "print") == 0)
3256 BUILD_CHARCLASS_LOOP (isprint)
3257 else if (strcmp (name, "upper") == 0)
3258 BUILD_CHARCLASS_LOOP (isupper)
3259 else if (strcmp (name, "blank") == 0)
3260 BUILD_CHARCLASS_LOOP (isblank)
3261 else if (strcmp (name, "graph") == 0)
3262 BUILD_CHARCLASS_LOOP (isgraph)
3263 else if (strcmp (name, "punct") == 0)
3264 BUILD_CHARCLASS_LOOP (ispunct)
3265 else if (strcmp (name, "xdigit") == 0)
3266 BUILD_CHARCLASS_LOOP (isxdigit)
3274 build_word_op (dfa, not, err)
3279 re_bitset_ptr_t sbcset;
3280 #ifdef RE_ENABLE_I18N
3281 re_charset_t *mbcset;
3283 #else /* not RE_ENABLE_I18N */
3285 #endif /* not RE_ENABLE_I18N */
3287 re_token_t br_token;
3291 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3292 #ifdef RE_ENABLE_I18N
3293 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3294 #endif /* RE_ENABLE_I18N */
3296 #ifdef RE_ENABLE_I18N
3297 if (BE (sbcset == NULL || mbcset == NULL, 0))
3298 #else /* not RE_ENABLE_I18N */
3299 if (BE (sbcset == NULL, 0))
3300 #endif /* not RE_ENABLE_I18N */
3308 #ifdef RE_ENABLE_I18N
3311 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3312 bitset_set(cset->sbcset, '\0');
3314 mbcset->non_match = 1;
3316 for (i = 0; i < SBC_MAX; ++i)
3317 if (__btowc (i) == WEOF)
3318 bitset_set (sbcset, i);
3319 #else /* not RE_ENABLE_I18N */
3321 #endif /* not RE_ENABLE_I18N */
3324 /* We don't care the syntax in this case. */
3325 ret = build_charclass (sbcset,
3326 #ifdef RE_ENABLE_I18N
3328 #endif /* RE_ENABLE_I18N */
3329 (const unsigned char *) "alpha", 0);
3331 if (BE (ret != REG_NOERROR, 0))
3334 #ifdef RE_ENABLE_I18N
3335 free_charset (mbcset);
3336 #endif /* RE_ENABLE_I18N */
3340 /* \w match '_' also. */
3341 bitset_set (sbcset, '_');
3343 /* If it is non-matching list. */
3344 #ifdef RE_ENABLE_I18N
3345 if (mbcset->non_match)
3346 #else /* not RE_ENABLE_I18N */
3348 #endif /* not RE_ENABLE_I18N */
3349 bitset_not (sbcset);
3351 /* Build a tree for simple bracket. */
3352 br_token.type = SIMPLE_BRACKET;
3353 br_token.opr.sbcset = sbcset;
3354 new_idx = re_dfa_add_node (dfa, br_token, 0);
3355 tree = create_tree (NULL, NULL, 0, new_idx);
3356 if (BE (new_idx == -1 || tree == NULL, 0))
3357 goto build_word_op_espace;
3359 #ifdef RE_ENABLE_I18N
3362 re_token_t alt_token;
3363 bin_tree_t *mbc_tree;
3364 /* Build a tree for complex bracket. */
3365 br_token.type = COMPLEX_BRACKET;
3366 br_token.opr.mbcset = mbcset;
3367 dfa->has_mb_node = 1;
3368 new_idx = re_dfa_add_node (dfa, br_token, 0);
3369 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3370 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3371 goto build_word_op_espace;
3372 /* Then join them by ALT node. */
3373 alt_token.type = OP_ALT;
3374 new_idx = re_dfa_add_node (dfa, alt_token, 0);
3375 tree = create_tree (tree, mbc_tree, 0, new_idx);
3376 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3381 free_charset (mbcset);
3384 #else /* not RE_ENABLE_I18N */
3386 #endif /* not RE_ENABLE_I18N */
3388 build_word_op_espace:
3390 #ifdef RE_ENABLE_I18N
3391 free_charset (mbcset);
3392 #endif /* RE_ENABLE_I18N */
3397 /* This is intended for the expressions like "a{1,3}".
3398 Fetch a number from `input', and return the number.
3399 Return -1, if the number field is empty like "{,1}".
3400 Return -2, If an error is occured. */
3403 fetch_number (input, token, syntax)
3406 reg_syntax_t syntax;
3412 *token = fetch_token (input, syntax);
3414 if (BE (token->type == END_OF_RE, 0))
3416 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3418 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3419 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3420 num = (num > RE_DUP_MAX) ? -2 : num;
3425 #ifdef RE_ENABLE_I18N
3427 free_charset (re_charset_t *cset)
3429 re_free (cset->mbchars);
3431 re_free (cset->coll_syms);
3432 re_free (cset->equiv_classes);
3433 re_free (cset->range_starts);
3434 re_free (cset->range_ends);
3436 re_free (cset->char_classes);
3439 #endif /* RE_ENABLE_I18N */
3441 /* Functions for binary tree operation. */
3443 /* Create a node of tree.
3444 Note: This function automatically free left and right if malloc fails. */
3447 create_tree (left, right, type, index)
3450 re_token_type_t type;
3454 tree = re_malloc (bin_tree_t, 1);
3455 if (BE (tree == NULL, 0))
3457 free_bin_tree (left);
3458 free_bin_tree (right);
3461 tree->parent = NULL;
3463 tree->right = right;
3465 tree->node_idx = index;
3468 re_node_set_init_empty (&tree->eclosure);
3471 left->parent = tree;
3473 right->parent = tree;
3477 /* Free the sub tree pointed by TREE. */
3480 free_bin_tree (tree)
3485 /*re_node_set_free (&tree->eclosure);*/
3486 free_bin_tree (tree->left);
3487 free_bin_tree (tree->right);
3491 /* Duplicate the node SRC, and return new node. */
3494 duplicate_tree (src, dfa)
3495 const bin_tree_t *src;
3498 bin_tree_t *left = NULL, *right = NULL, *new_tree;
3500 /* Since node indies must be according to Post-order of the tree,
3501 we must duplicate the left at first. */
3502 if (src->left != NULL)
3504 left = duplicate_tree (src->left, dfa);
3509 /* Secondaly, duplicate the right. */
3510 if (src->right != NULL)
3512 right = duplicate_tree (src->right, dfa);
3515 free_bin_tree (left);
3520 /* At last, duplicate itself. */
3521 if (src->type == NON_TYPE)
3523 new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3524 dfa->nodes[new_node_idx].duplicated = 1;
3525 if (BE (new_node_idx == -1, 0))
3527 free_bin_tree (left);
3528 free_bin_tree (right);
3533 new_node_idx = src->type;
3535 new_tree = create_tree (left, right, src->type, new_node_idx);
3536 if (BE (new_tree == NULL, 0))
3538 free_bin_tree (left);
3539 free_bin_tree (right);