1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
29 #if defined HAVE_WCHAR_H || defined _LIBC
31 #endif /* HAVE_WCHAR_H || _LIBC */
32 #if defined HAVE_WCTYPE_H || defined _LIBC
34 #endif /* HAVE_WCTYPE_H || _LIBC */
36 /* In case that the system doesn't have isblank(). */
37 #if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
38 # define isblank(ch) ((ch) == ' ' || (ch) == '\t')
42 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
43 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
44 # include <locale/localeinfo.h>
45 # include <locale/elem-hash.h>
46 # include <locale/coll-lookup.h>
50 /* This is for other GNU distributions with internationalized messages. */
51 #if HAVE_LIBINTL_H || defined _LIBC
55 # define gettext(msgid) \
56 INTUSE(__dcgettext) (INTUSE(_libc_intl_domainname), msgid, LC_MESSAGES)
59 # define gettext(msgid) (msgid)
63 /* This define is so xgettext can find the internationalizable
65 # define gettext_noop(String) String
69 #include "regex_internal.h"
71 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
72 int length, reg_syntax_t syntax);
73 static void re_compile_fastmap_iter (regex_t *bufp,
74 const re_dfastate_t *init_state,
76 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
77 static reg_errcode_t init_word_char (re_dfa_t *dfa);
79 static void free_charset (re_charset_t *cset);
80 #endif /* RE_ENABLE_I18N */
81 static void free_workarea_compile (regex_t *preg);
82 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
83 static reg_errcode_t analyze (re_dfa_t *dfa);
84 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
85 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
86 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
87 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
88 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
89 int top_clone_node, int root_node,
90 unsigned int constraint);
91 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
92 unsigned int constraint);
93 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
94 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
96 static void calc_inveclosure (re_dfa_t *dfa);
97 static int fetch_number (re_string_t *input, re_token_t *token,
99 static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
100 static int peek_token (re_token_t *token, re_string_t *input,
101 reg_syntax_t syntax);
102 static int peek_token_bracket (re_token_t *token, re_string_t *input,
103 reg_syntax_t syntax);
104 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
105 reg_syntax_t syntax, reg_errcode_t *err);
106 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
107 re_token_t *token, reg_syntax_t syntax,
108 int nest, reg_errcode_t *err);
109 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
110 re_token_t *token, reg_syntax_t syntax,
111 int nest, reg_errcode_t *err);
112 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
113 re_token_t *token, reg_syntax_t syntax,
114 int nest, reg_errcode_t *err);
115 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
116 re_token_t *token, reg_syntax_t syntax,
117 int nest, reg_errcode_t *err);
118 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
119 re_dfa_t *dfa, re_token_t *token,
120 reg_syntax_t syntax, reg_errcode_t *err);
121 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
122 re_token_t *token, reg_syntax_t syntax,
124 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
126 re_token_t *token, int token_len,
128 reg_syntax_t syntax);
129 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
133 # ifdef RE_ENABLE_I18N
134 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
135 re_charset_t *mbcset, int *range_alloc,
136 bracket_elem_t *start_elem,
137 bracket_elem_t *end_elem);
138 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
139 re_charset_t *mbcset,
141 const unsigned char *name);
142 # else /* not RE_ENABLE_I18N */
143 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
144 bracket_elem_t *start_elem,
145 bracket_elem_t *end_elem);
146 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
147 const unsigned char *name);
148 # endif /* not RE_ENABLE_I18N */
149 #endif /* not _LIBC */
150 #ifdef RE_ENABLE_I18N
151 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
152 re_charset_t *mbcset,
153 int *equiv_class_alloc,
154 const unsigned char *name);
155 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
156 re_charset_t *mbcset,
157 int *char_class_alloc,
158 const unsigned char *class_name,
159 reg_syntax_t syntax);
160 #else /* not RE_ENABLE_I18N */
161 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
162 const unsigned char *name);
163 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
164 const unsigned char *class_name,
165 reg_syntax_t syntax);
166 #endif /* not RE_ENABLE_I18N */
167 static bin_tree_t *build_word_op (re_dfa_t *dfa, int not, reg_errcode_t *err);
168 static void free_bin_tree (bin_tree_t *tree);
169 static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
170 re_token_type_t type, int index);
171 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
173 /* This table gives an error message for each of the error codes listed
174 in regex.h. Obviously the order here has to be same as there.
175 POSIX doesn't require that we do anything for REG_NOERROR,
176 but why not be nice? */
178 const char __re_error_msgid[] attribute_hidden =
180 #define REG_NOERROR_IDX 0
181 gettext_noop ("Success") /* REG_NOERROR */
183 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
184 gettext_noop ("No match") /* REG_NOMATCH */
186 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
187 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
189 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
190 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
192 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
193 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
195 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
196 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
198 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
199 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
201 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
202 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
204 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
205 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
207 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
208 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
210 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
211 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
213 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
214 gettext_noop ("Invalid range end") /* REG_ERANGE */
216 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
217 gettext_noop ("Memory exhausted") /* REG_ESPACE */
219 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
220 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
222 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
223 gettext_noop ("Premature end of regular expression") /* REG_EEND */
225 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
226 gettext_noop ("Regular expression too big") /* REG_ESIZE */
228 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
229 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
232 const size_t __re_error_msgid_idx[] attribute_hidden =
253 /* Entry points for GNU code. */
255 /* re_compile_pattern is the GNU regular expression compiler: it
256 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
257 Returns 0 if the pattern was valid, otherwise an error string.
259 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
260 are set in BUFP on entry. */
263 re_compile_pattern (pattern, length, bufp)
266 struct re_pattern_buffer *bufp;
270 /* And GNU code determines whether or not to get register information
271 by passing null for the REGS argument to re_match, etc., not by
275 /* Match anchors at newline. */
276 bufp->newline_anchor = 1;
278 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
282 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
285 weak_alias (__re_compile_pattern, re_compile_pattern)
288 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
289 also be assigned to arbitrarily: each pattern buffer stores its own
290 syntax, so it can be changed between regex compilations. */
291 /* This has no initializer because initialized variables in Emacs
292 become read-only after dumping. */
293 reg_syntax_t re_syntax_options;
296 /* Specify the precise syntax of regexps for compilation. This provides
297 for compatibility for various utilities which historically have
298 different, incompatible syntaxes.
300 The argument SYNTAX is a bit mask comprised of the various bits
301 defined in regex.h. We return the old syntax. */
304 re_set_syntax (syntax)
307 reg_syntax_t ret = re_syntax_options;
309 re_syntax_options = syntax;
313 weak_alias (__re_set_syntax, re_set_syntax)
317 re_compile_fastmap (bufp)
318 struct re_pattern_buffer *bufp;
320 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
321 char *fastmap = bufp->fastmap;
323 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
324 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
325 if (dfa->init_state != dfa->init_state_word)
326 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
327 if (dfa->init_state != dfa->init_state_nl)
328 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
329 if (dfa->init_state != dfa->init_state_begbuf)
330 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
331 bufp->fastmap_accurate = 1;
335 weak_alias (__re_compile_fastmap, re_compile_fastmap)
339 re_set_fastmap (char *fastmap, int icase, int ch)
343 fastmap[tolower (ch)] = 1;
346 /* Helper function for re_compile_fastmap.
347 Compile fastmap for the initial_state INIT_STATE. */
350 re_compile_fastmap_iter (bufp, init_state, fastmap)
352 const re_dfastate_t *init_state;
355 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
357 int icase = (MB_CUR_MAX == 1 && (bufp->syntax & RE_ICASE));
358 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
360 int node = init_state->nodes.elems[node_cnt];
361 re_token_type_t type = dfa->nodes[node].type;
363 if (type == CHARACTER)
364 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
365 else if (type == SIMPLE_BRACKET)
368 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
369 for (j = 0; j < UINT_BITS; ++j, ++ch)
370 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
371 re_set_fastmap (fastmap, icase, ch);
373 #ifdef RE_ENABLE_I18N
374 else if (type == COMPLEX_BRACKET)
377 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
378 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
379 || cset->nranges || cset->nchar_classes)
382 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
384 /* In this case we want to catch the bytes which are
385 the first byte of any collation elements.
386 e.g. In da_DK, we want to catch 'a' since "aa"
387 is a valid collation element, and don't catch
388 'b' since 'b' is the only collation element
389 which starts from 'b'. */
391 const int32_t *table = (const int32_t *)
392 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
393 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
394 for (j = 0; j < UINT_BITS; ++j, ++ch)
396 re_set_fastmap (fastmap, icase, ch);
400 for (i = 0; i < SBC_MAX; ++i)
401 if (__btowc (i) == WEOF)
402 re_set_fastmap (fastmap, icase, i);
403 # endif /* not _LIBC */
405 for (i = 0; i < cset->nmbchars; ++i)
408 wctomb (buf, cset->mbchars[i]);
409 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
412 #endif /* RE_ENABLE_I18N */
413 else if (type == END_OF_RE || type == OP_PERIOD)
415 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
416 if (type == END_OF_RE)
417 bufp->can_be_null = 1;
423 /* Entry point for POSIX code. */
424 /* regcomp takes a regular expression as a string and compiles it.
426 PREG is a regex_t *. We do not expect any fields to be initialized,
427 since POSIX says we shouldn't. Thus, we set
429 `buffer' to the compiled pattern;
430 `used' to the length of the compiled pattern;
431 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
432 REG_EXTENDED bit in CFLAGS is set; otherwise, to
433 RE_SYNTAX_POSIX_BASIC;
434 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
435 `fastmap' to an allocated space for the fastmap;
436 `fastmap_accurate' to zero;
437 `re_nsub' to the number of subexpressions in PATTERN.
439 PATTERN is the address of the pattern string.
441 CFLAGS is a series of bits which affect compilation.
443 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
444 use POSIX basic syntax.
446 If REG_NEWLINE is set, then . and [^...] don't match newline.
447 Also, regexec will try a match beginning after every newline.
449 If REG_ICASE is set, then we considers upper- and lowercase
450 versions of letters to be equivalent when matching.
452 If REG_NOSUB is set, then when PREG is passed to regexec, that
453 routine will report only success or failure, and nothing about the
456 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
457 the return codes and their meanings.) */
460 regcomp (preg, pattern, cflags)
461 regex_t *__restrict preg;
462 const char *__restrict pattern;
466 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
467 : RE_SYNTAX_POSIX_BASIC);
473 /* Try to allocate space for the fastmap. */
474 preg->fastmap = re_malloc (char, SBC_MAX);
475 if (BE (preg->fastmap == NULL, 0))
478 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
480 /* If REG_NEWLINE is set, newlines are treated differently. */
481 if (cflags & REG_NEWLINE)
482 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
483 syntax &= ~RE_DOT_NEWLINE;
484 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
485 /* It also changes the matching behavior. */
486 preg->newline_anchor = 1;
489 preg->newline_anchor = 0;
490 preg->no_sub = !!(cflags & REG_NOSUB);
491 preg->translate = NULL;
493 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
495 /* POSIX doesn't distinguish between an unmatched open-group and an
496 unmatched close-group: both are REG_EPAREN. */
497 if (ret == REG_ERPAREN)
500 /* We have already checked preg->fastmap != NULL. */
501 if (BE (ret == REG_NOERROR, 1))
502 /* Compute the fastmap now, since regexec cannot modify the pattern
503 buffer. This function nevers fails in this implementation. */
504 (void) __re_compile_fastmap (preg);
507 /* Some error occurred while compiling the expression. */
508 re_free (preg->fastmap);
509 preg->fastmap = NULL;
515 weak_alias (__regcomp, regcomp)
518 /* Returns a message corresponding to an error code, ERRCODE, returned
519 from either regcomp or regexec. We don't use PREG here. */
522 regerror (errcode, preg, errbuf, errbuf_size)
532 || errcode >= (int) (sizeof (__re_error_msgid_idx)
533 / sizeof (__re_error_msgid_idx[0])), 0))
534 /* Only error codes returned by the rest of the code should be passed
535 to this routine. If we are given anything else, or if other regex
536 code generates an invalid error code, then the program has a bug.
537 Dump core so we can fix it. */
540 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
542 msg_size = strlen (msg) + 1; /* Includes the null. */
544 if (BE (errbuf_size != 0, 1))
546 if (BE (msg_size > errbuf_size, 0))
548 #if defined HAVE_MEMPCPY || defined _LIBC
549 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
551 memcpy (errbuf, msg, errbuf_size - 1);
552 errbuf[errbuf_size - 1] = 0;
556 memcpy (errbuf, msg, msg_size);
562 weak_alias (__regerror, regerror)
567 free_dfa_content (re_dfa_t *dfa)
571 re_free (dfa->subexps);
573 for (i = 0; i < dfa->nodes_len; ++i)
575 re_token_t *node = dfa->nodes + i;
576 #ifdef RE_ENABLE_I18N
577 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
578 free_charset (node->opr.mbcset);
580 #endif /* RE_ENABLE_I18N */
581 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
582 re_free (node->opr.sbcset);
584 re_free (dfa->nexts);
585 for (i = 0; i < dfa->nodes_len; ++i)
587 if (dfa->eclosures != NULL)
588 re_node_set_free (dfa->eclosures + i);
589 if (dfa->inveclosures != NULL)
590 re_node_set_free (dfa->inveclosures + i);
591 if (dfa->edests != NULL)
592 re_node_set_free (dfa->edests + i);
594 re_free (dfa->edests);
595 re_free (dfa->eclosures);
596 re_free (dfa->inveclosures);
597 re_free (dfa->nodes);
599 for (i = 0; i <= dfa->state_hash_mask; ++i)
601 struct re_state_table_entry *entry = dfa->state_table + i;
602 for (j = 0; j < entry->num; ++j)
604 re_dfastate_t *state = entry->array[j];
607 re_free (entry->array);
609 re_free (dfa->state_table);
611 if (dfa->word_char != NULL)
612 re_free (dfa->word_char);
614 re_free (dfa->re_str);
621 /* Free dynamically allocated space used by PREG. */
627 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
628 if (BE (dfa != NULL, 1))
629 free_dfa_content (dfa);
631 re_free (preg->fastmap);
634 weak_alias (__regfree, regfree)
637 /* Entry points compatible with 4.2 BSD regex library. We don't define
638 them unless specifically requested. */
640 #if defined _REGEX_RE_COMP || defined _LIBC
642 /* BSD has one and only one pattern buffer. */
643 static struct re_pattern_buffer re_comp_buf;
647 /* Make these definitions weak in libc, so POSIX programs can redefine
648 these names if they don't use our functions, and still use
649 regcomp/regexec above without link errors. */
660 if (!re_comp_buf.buffer)
661 return gettext ("No previous regular expression");
665 if (re_comp_buf.buffer)
667 fastmap = re_comp_buf.fastmap;
668 re_comp_buf.fastmap = NULL;
669 __regfree (&re_comp_buf);
670 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
671 re_comp_buf.fastmap = fastmap;
674 if (re_comp_buf.fastmap == NULL)
676 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
677 if (re_comp_buf.fastmap == NULL)
678 return (char *) gettext (__re_error_msgid
679 + __re_error_msgid_idx[(int) REG_ESPACE]);
682 /* Since `re_exec' always passes NULL for the `regs' argument, we
683 don't need to initialize the pattern buffer fields which affect it. */
685 /* Match anchors at newlines. */
686 re_comp_buf.newline_anchor = 1;
688 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
693 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
694 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
698 libc_freeres_fn (free_mem)
700 __regfree (&re_comp_buf);
704 #endif /* _REGEX_RE_COMP */
706 /* Internal entry point.
707 Compile the regular expression PATTERN, whose length is LENGTH.
708 SYNTAX indicate regular expression's syntax. */
711 re_compile_internal (preg, pattern, length, syntax)
713 const char * pattern;
717 reg_errcode_t err = REG_NOERROR;
721 /* Initialize the pattern buffer. */
722 preg->fastmap_accurate = 0;
723 preg->syntax = syntax;
724 preg->not_bol = preg->not_eol = 0;
727 preg->can_be_null = 0;
728 preg->regs_allocated = REGS_UNALLOCATED;
730 /* Initialize the dfa. */
731 dfa = (re_dfa_t *) preg->buffer;
732 if (preg->allocated < sizeof (re_dfa_t))
734 /* If zero allocated, but buffer is non-null, try to realloc
735 enough space. This loses if buffer's address is bogus, but
736 that is the user's responsibility. If ->buffer is NULL this
737 is a simple allocation. */
738 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
741 preg->allocated = sizeof (re_dfa_t);
743 preg->buffer = (unsigned char *) dfa;
744 preg->used = sizeof (re_dfa_t);
746 err = init_dfa (dfa, length);
747 if (BE (err != REG_NOERROR, 0))
754 dfa->re_str = re_malloc (char, length + 1);
755 strncpy (dfa->re_str, pattern, length + 1);
758 err = re_string_construct (®exp, pattern, length, preg->translate,
760 if (BE (err != REG_NOERROR, 0))
767 /* Parse the regular expression, and build a structure tree. */
769 dfa->str_tree = parse (®exp, preg, syntax, &err);
770 if (BE (dfa->str_tree == NULL, 0))
771 goto re_compile_internal_free_return;
773 /* Analyze the tree and collect information which is necessary to
776 if (BE (err != REG_NOERROR, 0))
777 goto re_compile_internal_free_return;
779 /* Then create the initial state of the dfa. */
780 err = create_initial_state (dfa);
782 /* Release work areas. */
783 free_workarea_compile (preg);
784 re_string_destruct (®exp);
786 if (BE (err != REG_NOERROR, 0))
788 re_compile_internal_free_return:
789 free_dfa_content (dfa);
796 /* Initialize DFA. We use the length of the regular expression PAT_LEN
797 as the initial length of some arrays. */
800 init_dfa (dfa, pat_len)
806 memset (dfa, '\0', sizeof (re_dfa_t));
808 dfa->nodes_alloc = pat_len + 1;
809 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
811 dfa->states_alloc = pat_len + 1;
813 /* table_size = 2 ^ ceil(log pat_len) */
814 for (table_size = 1; table_size > 0; table_size <<= 1)
815 if (table_size > pat_len)
818 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
819 dfa->state_hash_mask = table_size - 1;
821 dfa->subexps_alloc = 1;
822 dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
823 dfa->word_char = NULL;
825 if (BE (dfa->nodes == NULL || dfa->state_table == NULL
826 || dfa->subexps == NULL, 0))
828 /* We don't bother to free anything which was allocated. Very
829 soon the process will go down anyway. */
831 dfa->state_table = NULL;
838 /* Initialize WORD_CHAR table, which indicate which character is
839 "word". In this case "word" means that it is the word construction
840 character used by some operators like "\<", "\>", etc. */
847 dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
848 if (BE (dfa->word_char == NULL, 0))
850 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
851 for (j = 0; j < UINT_BITS; ++j, ++ch)
852 if (isalnum (ch) || ch == '_')
853 dfa->word_char[i] |= 1 << j;
857 /* Free the work area which are only used while compiling. */
860 free_workarea_compile (preg)
863 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
864 free_bin_tree (dfa->str_tree);
865 dfa->str_tree = NULL;
868 /* Create initial states for all contexts. */
871 create_initial_state (dfa)
876 re_node_set init_nodes;
878 /* Initial states have the epsilon closure of the node which is
879 the first node of the regular expression. */
880 first = dfa->str_tree->first;
881 dfa->init_node = first;
882 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
883 if (BE (err != REG_NOERROR, 0))
886 /* The back-references which are in initial states can epsilon transit,
887 since in this case all of the subexpressions can be null.
888 Then we add epsilon closures of the nodes which are the next nodes of
889 the back-references. */
890 if (dfa->nbackref > 0)
891 for (i = 0; i < init_nodes.nelem; ++i)
893 int node_idx = init_nodes.elems[i];
894 re_token_type_t type = dfa->nodes[node_idx].type;
897 if (type != OP_BACK_REF)
899 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
901 re_token_t *clexp_node;
902 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
903 if (clexp_node->type == OP_CLOSE_SUBEXP
904 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
907 if (clexp_idx == init_nodes.nelem)
910 if (type == OP_BACK_REF)
912 int dest_idx = dfa->edests[node_idx].elems[0];
913 if (!re_node_set_contains (&init_nodes, dest_idx))
915 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
921 /* It must be the first time to invoke acquire_state. */
922 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
923 /* We don't check ERR here, since the initial state must not be NULL. */
924 if (BE (dfa->init_state == NULL, 0))
926 if (dfa->init_state->has_constraint)
928 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
930 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
932 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
936 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
937 || dfa->init_state_begbuf == NULL, 0))
941 dfa->init_state_word = dfa->init_state_nl
942 = dfa->init_state_begbuf = dfa->init_state;
944 re_node_set_free (&init_nodes);
948 /* Analyze the structure tree, and calculate "first", "next", "edest",
949 "eclosure", and "inveclosure". */
958 /* Allocate arrays. */
959 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
960 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
961 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
962 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
963 if (BE (dfa->nexts == NULL || dfa->edests == NULL
964 || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
966 /* Initialize them. */
967 for (i = 0; i < dfa->nodes_len; ++i)
970 re_node_set_init_empty (dfa->edests + i);
971 re_node_set_init_empty (dfa->eclosures + i);
972 re_node_set_init_empty (dfa->inveclosures + i);
975 ret = analyze_tree (dfa, dfa->str_tree);
976 if (BE (ret == REG_NOERROR, 1))
978 ret = calc_eclosure (dfa);
979 if (ret == REG_NOERROR)
980 calc_inveclosure (dfa);
985 /* Helper functions for analyze.
986 This function calculate "first", "next", and "edest" for the subtree
987 whose root is NODE. */
990 analyze_tree (dfa, node)
995 if (node->first == -1)
996 calc_first (dfa, node);
997 if (node->next == -1)
998 calc_next (dfa, node);
999 if (node->eclosure.nelem == 0)
1000 calc_epsdest (dfa, node);
1001 /* Calculate "first" etc. for the left child. */
1002 if (node->left != NULL)
1004 ret = analyze_tree (dfa, node->left);
1005 if (BE (ret != REG_NOERROR, 0))
1008 /* Calculate "first" etc. for the right child. */
1009 if (node->right != NULL)
1011 ret = analyze_tree (dfa, node->right);
1012 if (BE (ret != REG_NOERROR, 0))
1018 /* Calculate "first" for the node NODE. */
1020 calc_first (dfa, node)
1025 idx = node->node_idx;
1026 type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1031 case OP_OPEN_BRACKET:
1032 case OP_CLOSE_BRACKET:
1033 case OP_OPEN_DUP_NUM:
1034 case OP_CLOSE_DUP_NUM:
1035 case OP_NON_MATCH_LIST:
1036 case OP_OPEN_COLL_ELEM:
1037 case OP_CLOSE_COLL_ELEM:
1038 case OP_OPEN_EQUIV_CLASS:
1039 case OP_CLOSE_EQUIV_CLASS:
1040 case OP_OPEN_CHAR_CLASS:
1041 case OP_CLOSE_CHAR_CLASS:
1042 /* These must not be appeared here. */
1048 case OP_DUP_ASTERISK:
1049 case OP_DUP_QUESTION:
1050 #ifdef RE_ENABLE_I18N
1051 case COMPLEX_BRACKET:
1052 #endif /* RE_ENABLE_I18N */
1053 case SIMPLE_BRACKET:
1056 case OP_OPEN_SUBEXP:
1057 case OP_CLOSE_SUBEXP:
1062 assert (node->left != NULL);
1064 if (node->left->first == -1)
1065 calc_first (dfa, node->left);
1066 node->first = node->left->first;
1071 /* else fall through */
1074 assert (node->left != NULL);
1076 if (node->left->first == -1)
1077 calc_first (dfa, node->left);
1078 node->first = node->left->first;
1083 /* Calculate "next" for the node NODE. */
1086 calc_next (dfa, node)
1091 bin_tree_t *parent = node->parent;
1095 idx = node->node_idx;
1096 if (node->type == 0)
1097 dfa->nexts[idx] = node->next;
1101 idx = parent->node_idx;
1102 type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1106 case OP_DUP_ASTERISK:
1111 if (parent->left == node)
1113 if (parent->right->first == -1)
1114 calc_first (dfa, parent->right);
1115 node->next = parent->right->first;
1118 /* else fall through */
1120 if (parent->next == -1)
1121 calc_next (dfa, parent);
1122 node->next = parent->next;
1125 idx = node->node_idx;
1126 if (node->type == 0)
1127 dfa->nexts[idx] = node->next;
1130 /* Calculate "edest" for the node NODE. */
1133 calc_epsdest (dfa, node)
1138 idx = node->node_idx;
1139 if (node->type == 0)
1141 if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1142 || dfa->nodes[idx].type == OP_DUP_PLUS
1143 || dfa->nodes[idx].type == OP_DUP_QUESTION)
1145 if (node->left->first == -1)
1146 calc_first (dfa, node->left);
1147 if (node->next == -1)
1148 calc_next (dfa, node);
1149 re_node_set_init_2 (dfa->edests + idx, node->left->first,
1152 else if (dfa->nodes[idx].type == OP_ALT)
1155 if (node->left != NULL)
1157 if (node->left->first == -1)
1158 calc_first (dfa, node->left);
1159 left = node->left->first;
1163 if (node->next == -1)
1164 calc_next (dfa, node);
1167 if (node->right != NULL)
1169 if (node->right->first == -1)
1170 calc_first (dfa, node->right);
1171 right = node->right->first;
1175 if (node->next == -1)
1176 calc_next (dfa, node);
1179 re_node_set_init_2 (dfa->edests + idx, left, right);
1181 else if (dfa->nodes[idx].type == ANCHOR
1182 || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1183 || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1184 || dfa->nodes[idx].type == OP_BACK_REF)
1185 re_node_set_init_1 (dfa->edests + idx, node->next);
1189 /* Duplicate the epsilon closure of the node ROOT_NODE.
1190 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1191 to their own constraint. */
1193 static reg_errcode_t
1194 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1197 int top_org_node, top_clone_node, root_node;
1198 unsigned int init_constraint;
1201 int org_node, clone_node, ret;
1202 unsigned int constraint = init_constraint;
1203 for (org_node = top_org_node, clone_node = top_clone_node;;)
1205 int org_dest, clone_dest;
1206 if (dfa->nodes[org_node].type == OP_BACK_REF)
1208 /* If the back reference epsilon-transit, its destination must
1209 also have the constraint. Then duplicate the epsilon closure
1210 of the destination of the back reference, and store it in
1211 edests of the back reference. */
1212 org_dest = dfa->nexts[org_node];
1213 re_node_set_empty (dfa->edests + clone_node);
1214 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1215 if (BE (err != REG_NOERROR, 0))
1217 dfa->nexts[clone_node] = dfa->nexts[org_node];
1218 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1219 if (BE (ret < 0, 0))
1222 else if (dfa->edests[org_node].nelem == 0)
1224 /* In case of the node can't epsilon-transit, don't duplicate the
1225 destination and store the original destination as the
1226 destination of the node. */
1227 dfa->nexts[clone_node] = dfa->nexts[org_node];
1230 else if (dfa->edests[org_node].nelem == 1)
1232 /* In case of the node can epsilon-transit, and it has only one
1234 org_dest = dfa->edests[org_node].elems[0];
1235 re_node_set_empty (dfa->edests + clone_node);
1236 if (dfa->nodes[org_node].type == ANCHOR)
1238 /* In case of the node has another constraint, append it. */
1239 if (org_node == root_node && clone_node != org_node)
1241 /* ...but if the node is root_node itself, it means the
1242 epsilon closure have a loop, then tie it to the
1243 destination of the root_node. */
1244 ret = re_node_set_insert (dfa->edests + clone_node,
1246 if (BE (ret < 0, 0))
1250 constraint |= dfa->nodes[org_node].opr.ctx_type;
1252 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1253 if (BE (err != REG_NOERROR, 0))
1255 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1256 if (BE (ret < 0, 0))
1259 else /* dfa->edests[org_node].nelem == 2 */
1261 /* In case of the node can epsilon-transit, and it has two
1263 org_dest = dfa->edests[org_node].elems[0];
1264 re_node_set_empty (dfa->edests + clone_node);
1265 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1266 if (BE (err != REG_NOERROR, 0))
1268 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1269 if (BE (ret < 0, 0))
1272 err = duplicate_node_closure (dfa, org_dest, clone_dest, root_node,
1274 if (BE (err != REG_NOERROR, 0))
1277 org_dest = dfa->edests[org_node].elems[1];
1278 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1279 if (BE (err != REG_NOERROR, 0))
1281 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1282 if (BE (ret < 0, 0))
1285 org_node = org_dest;
1286 clone_node = clone_dest;
1291 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1292 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1293 otherwise return the error code. */
1295 static reg_errcode_t
1296 duplicate_node (new_idx, dfa, org_idx, constraint)
1298 int *new_idx, org_idx;
1299 unsigned int constraint;
1304 dup = dfa->nodes[org_idx];
1305 dup_idx = re_dfa_add_node (dfa, dup, 1);
1306 if (BE (dup_idx == -1, 0))
1308 dfa->nodes[dup_idx].constraint = constraint;
1309 if (dfa->nodes[org_idx].type == ANCHOR)
1310 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1311 dfa->nodes[dup_idx].duplicated = 1;
1312 re_node_set_init_empty (dfa->edests + dup_idx);
1313 re_node_set_init_empty (dfa->eclosures + dup_idx);
1314 re_node_set_init_empty (dfa->inveclosures + dup_idx);
1321 calc_inveclosure (dfa)
1325 for (src = 0; src < dfa->nodes_len; ++src)
1327 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1329 dest = dfa->eclosures[src].elems[idx];
1330 re_node_set_insert (dfa->inveclosures + dest, src);
1335 /* Calculate "eclosure" for all the node in DFA. */
1337 static reg_errcode_t
1341 int node_idx, incomplete;
1343 assert (dfa->nodes_len > 0);
1346 /* For each nodes, calculate epsilon closure. */
1347 for (node_idx = 0; ; ++node_idx)
1350 re_node_set eclosure_elem;
1351 if (node_idx == dfa->nodes_len)
1360 assert (dfa->eclosures[node_idx].nelem != -1);
1362 /* If we have already calculated, skip it. */
1363 if (dfa->eclosures[node_idx].nelem != 0)
1365 /* Calculate epsilon closure of `node_idx'. */
1366 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1367 if (BE (err != REG_NOERROR, 0))
1370 if (dfa->eclosures[node_idx].nelem == 0)
1373 re_node_set_free (&eclosure_elem);
1379 /* Calculate epsilon closure of NODE. */
1381 static reg_errcode_t
1382 calc_eclosure_iter (new_set, dfa, node, root)
1383 re_node_set *new_set;
1388 unsigned int constraint;
1390 re_node_set eclosure;
1392 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1393 if (BE (err != REG_NOERROR, 0))
1396 /* This indicates that we are calculating this node now.
1397 We reference this value to avoid infinite loop. */
1398 dfa->eclosures[node].nelem = -1;
1400 constraint = ((dfa->nodes[node].type == ANCHOR)
1401 ? dfa->nodes[node].opr.ctx_type : 0);
1402 /* If the current node has constraints, duplicate all nodes.
1403 Since they must inherit the constraints. */
1404 if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1406 int org_node, cur_node;
1407 org_node = cur_node = node;
1408 err = duplicate_node_closure (dfa, node, node, node, constraint);
1409 if (BE (err != REG_NOERROR, 0))
1413 /* Expand each epsilon destination nodes. */
1414 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1415 for (i = 0; i < dfa->edests[node].nelem; ++i)
1417 re_node_set eclosure_elem;
1418 int edest = dfa->edests[node].elems[i];
1419 /* If calculating the epsilon closure of `edest' is in progress,
1420 return intermediate result. */
1421 if (dfa->eclosures[edest].nelem == -1)
1426 /* If we haven't calculated the epsilon closure of `edest' yet,
1427 calculate now. Otherwise use calculated epsilon closure. */
1428 if (dfa->eclosures[edest].nelem == 0)
1430 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1431 if (BE (err != REG_NOERROR, 0))
1435 eclosure_elem = dfa->eclosures[edest];
1436 /* Merge the epsilon closure of `edest'. */
1437 re_node_set_merge (&eclosure, &eclosure_elem);
1438 /* If the epsilon closure of `edest' is incomplete,
1439 the epsilon closure of this node is also incomplete. */
1440 if (dfa->eclosures[edest].nelem == 0)
1443 re_node_set_free (&eclosure_elem);
1447 /* Epsilon closures include itself. */
1448 re_node_set_insert (&eclosure, node);
1449 if (incomplete && !root)
1450 dfa->eclosures[node].nelem = 0;
1452 dfa->eclosures[node] = eclosure;
1453 *new_set = eclosure;
1457 /* Functions for token which are used in the parser. */
1459 /* Fetch a token from INPUT.
1460 We must not use this function inside bracket expressions. */
1463 fetch_token (input, syntax)
1465 reg_syntax_t syntax;
1469 consumed_byte = peek_token (&token, input, syntax);
1470 re_string_skip_bytes (input, consumed_byte);
1474 /* Peek a token from INPUT, and return the length of the token.
1475 We must not use this function inside bracket expressions. */
1478 peek_token (token, input, syntax)
1481 reg_syntax_t syntax;
1485 if (re_string_eoi (input))
1487 token->type = END_OF_RE;
1491 c = re_string_peek_byte (input, 0);
1494 #ifdef RE_ENABLE_I18N
1495 token->mb_partial = 0;
1496 if (MB_CUR_MAX > 1 &&
1497 !re_string_first_byte (input, re_string_cur_idx (input)))
1499 token->type = CHARACTER;
1500 token->mb_partial = 1;
1507 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1509 token->type = BACK_SLASH;
1513 c2 = re_string_peek_byte_case (input, 1);
1515 token->type = CHARACTER;
1519 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1520 token->type = OP_ALT;
1522 case '1': case '2': case '3': case '4': case '5':
1523 case '6': case '7': case '8': case '9':
1524 if (!(syntax & RE_NO_BK_REFS))
1526 token->type = OP_BACK_REF;
1527 token->opr.idx = c2 - '0';
1531 if (!(syntax & RE_NO_GNU_OPS))
1533 token->type = ANCHOR;
1534 token->opr.idx = WORD_FIRST;
1538 if (!(syntax & RE_NO_GNU_OPS))
1540 token->type = ANCHOR;
1541 token->opr.idx = WORD_LAST;
1545 if (!(syntax & RE_NO_GNU_OPS))
1547 token->type = ANCHOR;
1548 token->opr.idx = WORD_DELIM;
1552 if (!(syntax & RE_NO_GNU_OPS))
1554 token->type = ANCHOR;
1555 token->opr.idx = INSIDE_WORD;
1559 if (!(syntax & RE_NO_GNU_OPS))
1560 token->type = OP_WORD;
1563 if (!(syntax & RE_NO_GNU_OPS))
1564 token->type = OP_NOTWORD;
1567 if (!(syntax & RE_NO_GNU_OPS))
1569 token->type = ANCHOR;
1570 token->opr.idx = BUF_FIRST;
1574 if (!(syntax & RE_NO_GNU_OPS))
1576 token->type = ANCHOR;
1577 token->opr.idx = BUF_LAST;
1581 if (!(syntax & RE_NO_BK_PARENS))
1582 token->type = OP_OPEN_SUBEXP;
1585 if (!(syntax & RE_NO_BK_PARENS))
1586 token->type = OP_CLOSE_SUBEXP;
1589 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1590 token->type = OP_DUP_PLUS;
1593 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1594 token->type = OP_DUP_QUESTION;
1597 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1598 token->type = OP_OPEN_DUP_NUM;
1601 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1602 token->type = OP_CLOSE_DUP_NUM;
1610 token->type = CHARACTER;
1614 if (syntax & RE_NEWLINE_ALT)
1615 token->type = OP_ALT;
1618 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1619 token->type = OP_ALT;
1622 token->type = OP_DUP_ASTERISK;
1625 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1626 token->type = OP_DUP_PLUS;
1629 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1630 token->type = OP_DUP_QUESTION;
1633 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1634 token->type = OP_OPEN_DUP_NUM;
1637 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1638 token->type = OP_CLOSE_DUP_NUM;
1641 if (syntax & RE_NO_BK_PARENS)
1642 token->type = OP_OPEN_SUBEXP;
1645 if (syntax & RE_NO_BK_PARENS)
1646 token->type = OP_CLOSE_SUBEXP;
1649 token->type = OP_OPEN_BRACKET;
1652 token->type = OP_PERIOD;
1655 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1656 re_string_cur_idx (input) != 0)
1658 char prev = re_string_peek_byte (input, -1);
1659 if (prev != '|' && prev != '(' &&
1660 (!(syntax & RE_NEWLINE_ALT) || prev != '\n'))
1663 token->type = ANCHOR;
1664 token->opr.idx = LINE_FIRST;
1667 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1668 re_string_cur_idx (input) + 1 != re_string_length (input))
1671 re_string_skip_bytes (input, 1);
1672 peek_token (&next, input, syntax);
1673 re_string_skip_bytes (input, -1);
1674 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1677 token->type = ANCHOR;
1678 token->opr.idx = LINE_LAST;
1686 /* Peek a token from INPUT, and return the length of the token.
1687 We must not use this function out of bracket expressions. */
1690 peek_token_bracket (token, input, syntax)
1693 reg_syntax_t syntax;
1696 if (re_string_eoi (input))
1698 token->type = END_OF_RE;
1701 c = re_string_peek_byte (input, 0);
1704 #ifdef RE_ENABLE_I18N
1705 if (MB_CUR_MAX > 1 &&
1706 !re_string_first_byte (input, re_string_cur_idx (input)))
1708 token->type = CHARACTER;
1711 #endif /* RE_ENABLE_I18N */
1713 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1715 /* In this case, '\' escape a character. */
1717 re_string_skip_bytes (input, 1);
1718 c2 = re_string_peek_byte (input, 0);
1720 token->type = CHARACTER;
1723 if (c == '[') /* '[' is a special char in a bracket exps. */
1727 c2 = re_string_peek_byte (input, 1);
1733 token->type = OP_OPEN_COLL_ELEM;
1736 token->type = OP_OPEN_EQUIV_CLASS;
1739 if (syntax & RE_CHAR_CLASSES)
1741 token->type = OP_OPEN_CHAR_CLASS;
1744 /* else fall through. */
1746 token->type = CHARACTER;
1756 token->type = OP_CHARSET_RANGE;
1759 token->type = OP_CLOSE_BRACKET;
1762 token->type = OP_NON_MATCH_LIST;
1765 token->type = CHARACTER;
1770 /* Functions for parser. */
1772 /* Entry point of the parser.
1773 Parse the regular expression REGEXP and return the structure tree.
1774 If an error is occured, ERR is set by error code, and return NULL.
1775 This function build the following tree, from regular expression <reg_exp>:
1781 CAT means concatenation.
1782 EOR means end of regular expression. */
1785 parse (regexp, preg, syntax, err)
1786 re_string_t *regexp;
1788 reg_syntax_t syntax;
1791 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1792 bin_tree_t *tree, *eor, *root;
1793 re_token_t current_token;
1795 current_token = fetch_token (regexp, syntax);
1796 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
1797 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1799 new_idx = re_dfa_add_node (dfa, current_token, 0);
1800 eor = create_tree (NULL, NULL, 0, new_idx);
1802 root = create_tree (tree, eor, CONCAT, 0);
1805 if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
1813 /* This function build the following tree, from regular expression
1814 <branch1>|<branch2>:
1820 ALT means alternative, which represents the operator `|'. */
1823 parse_reg_exp (regexp, preg, token, syntax, nest, err)
1824 re_string_t *regexp;
1827 reg_syntax_t syntax;
1831 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1832 bin_tree_t *tree, *branch = NULL;
1834 tree = parse_branch (regexp, preg, token, syntax, nest, err);
1835 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1838 while (token->type == OP_ALT)
1840 re_token_t alt_token = *token;
1841 new_idx = re_dfa_add_node (dfa, alt_token, 0);
1842 *token = fetch_token (regexp, syntax);
1843 if (token->type != OP_ALT && token->type != END_OF_RE
1844 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1846 branch = parse_branch (regexp, preg, token, syntax, nest, err);
1847 if (BE (*err != REG_NOERROR && branch == NULL, 0))
1849 free_bin_tree (tree);
1855 tree = create_tree (tree, branch, 0, new_idx);
1856 if (BE (new_idx == -1 || tree == NULL, 0))
1861 dfa->has_plural_match = 1;
1866 /* This function build the following tree, from regular expression
1873 CAT means concatenation. */
1876 parse_branch (regexp, preg, token, syntax, nest, err)
1877 re_string_t *regexp;
1880 reg_syntax_t syntax;
1884 bin_tree_t *tree, *exp;
1885 tree = parse_expression (regexp, preg, token, syntax, nest, err);
1886 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1889 while (token->type != OP_ALT && token->type != END_OF_RE
1890 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1892 exp = parse_expression (regexp, preg, token, syntax, nest, err);
1893 if (BE (*err != REG_NOERROR && exp == NULL, 0))
1895 free_bin_tree (tree);
1898 if (tree != NULL && exp != NULL)
1900 tree = create_tree (tree, exp, CONCAT, 0);
1907 else if (tree == NULL)
1909 /* Otherwise exp == NULL, we don't need to create new tree. */
1914 /* This function build the following tree, from regular expression a*:
1921 parse_expression (regexp, preg, token, syntax, nest, err)
1922 re_string_t *regexp;
1925 reg_syntax_t syntax;
1929 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1932 switch (token->type)
1935 new_idx = re_dfa_add_node (dfa, *token, 0);
1936 tree = create_tree (NULL, NULL, 0, new_idx);
1937 if (BE (new_idx == -1 || tree == NULL, 0))
1942 #ifdef RE_ENABLE_I18N
1945 while (!re_string_eoi (regexp)
1946 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
1948 bin_tree_t *mbc_remain;
1949 *token = fetch_token (regexp, syntax);
1950 new_idx = re_dfa_add_node (dfa, *token, 0);
1951 mbc_remain = create_tree (NULL, NULL, 0, new_idx);
1952 tree = create_tree (tree, mbc_remain, CONCAT, 0);
1953 if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
1954 return *err = REG_ESPACE, NULL;
1959 case OP_OPEN_SUBEXP:
1960 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
1961 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1964 case OP_OPEN_BRACKET:
1965 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
1966 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1970 if (BE (preg->re_nsub < token->opr.idx
1971 || dfa->subexps[token->opr.idx - 1].end == -1, 0))
1976 new_idx = re_dfa_add_node (dfa, *token, 0);
1977 tree = create_tree (NULL, NULL, 0, new_idx);
1978 if (BE (new_idx == -1 || tree == NULL, 0))
1984 dfa->has_mb_node = 1;
1986 case OP_DUP_ASTERISK:
1988 case OP_DUP_QUESTION:
1989 case OP_OPEN_DUP_NUM:
1990 if (syntax & RE_CONTEXT_INVALID_OPS)
1995 else if (syntax & RE_CONTEXT_INDEP_OPS)
1997 *token = fetch_token (regexp, syntax);
1998 return parse_expression (regexp, preg, token, syntax, nest, err);
2000 /* else fall through */
2001 case OP_CLOSE_SUBEXP:
2002 if ((token->type == OP_CLOSE_SUBEXP) &&
2003 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2008 /* else fall through */
2009 case OP_CLOSE_DUP_NUM:
2010 /* We treat it as a normal character. */
2012 /* Then we can these characters as normal characters. */
2013 token->type = CHARACTER;
2014 new_idx = re_dfa_add_node (dfa, *token, 0);
2015 tree = create_tree (NULL, NULL, 0, new_idx);
2016 if (BE (new_idx == -1 || tree == NULL, 0))
2023 if (dfa->word_char == NULL)
2025 *err = init_word_char (dfa);
2026 if (BE (*err != REG_NOERROR, 0))
2029 if (token->opr.ctx_type == WORD_DELIM)
2031 bin_tree_t *tree_first, *tree_last;
2032 int idx_first, idx_last;
2033 token->opr.ctx_type = WORD_FIRST;
2034 idx_first = re_dfa_add_node (dfa, *token, 0);
2035 tree_first = create_tree (NULL, NULL, 0, idx_first);
2036 token->opr.ctx_type = WORD_LAST;
2037 idx_last = re_dfa_add_node (dfa, *token, 0);
2038 tree_last = create_tree (NULL, NULL, 0, idx_last);
2039 token->type = OP_ALT;
2040 new_idx = re_dfa_add_node (dfa, *token, 0);
2041 tree = create_tree (tree_first, tree_last, 0, new_idx);
2042 if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
2043 || tree_first == NULL || tree_last == NULL
2044 || tree == NULL, 0))
2052 new_idx = re_dfa_add_node (dfa, *token, 0);
2053 tree = create_tree (NULL, NULL, 0, new_idx);
2054 if (BE (new_idx == -1 || tree == NULL, 0))
2055 return *err = REG_ESPACE, NULL;
2057 /* We must return here, since ANCHORs can't be followed
2058 by repetition operators.
2059 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2060 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2061 *token = fetch_token (regexp, syntax);
2064 new_idx = re_dfa_add_node (dfa, *token, 0);
2065 tree = create_tree (NULL, NULL, 0, new_idx);
2066 if (BE (new_idx == -1 || tree == NULL, 0))
2072 dfa->has_mb_node = 1;
2075 tree = build_word_op (dfa, 0, err);
2076 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2080 tree = build_word_op (dfa, 1, err);
2081 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2091 /* Must not happen? */
2097 *token = fetch_token (regexp, syntax);
2099 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2100 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2102 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2103 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2105 dfa->has_plural_match = 1;
2111 /* This function build the following tree, from regular expression
2119 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2120 re_string_t *regexp;
2123 reg_syntax_t syntax;
2127 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2128 bin_tree_t *tree, *left_par, *right_par;
2131 cur_nsub = preg->re_nsub++;
2132 if (dfa->subexps_alloc < preg->re_nsub)
2134 re_subexp_t *new_array;
2135 dfa->subexps_alloc *= 2;
2136 new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2137 if (BE (new_array == NULL, 0))
2139 dfa->subexps_alloc /= 2;
2143 dfa->subexps = new_array;
2145 dfa->subexps[cur_nsub].start = dfa->nodes_len;
2146 dfa->subexps[cur_nsub].end = -1;
2148 new_idx = re_dfa_add_node (dfa, *token, 0);
2149 left_par = create_tree (NULL, NULL, 0, new_idx);
2150 if (BE (new_idx == -1 || left_par == NULL, 0))
2155 dfa->nodes[new_idx].opr.idx = cur_nsub;
2156 *token = fetch_token (regexp, syntax);
2158 /* The subexpression may be a null string. */
2159 if (token->type == OP_CLOSE_SUBEXP)
2163 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2164 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2167 if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2169 free_bin_tree (tree);
2173 new_idx = re_dfa_add_node (dfa, *token, 0);
2174 dfa->subexps[cur_nsub].end = dfa->nodes_len;
2175 right_par = create_tree (NULL, NULL, 0, new_idx);
2176 tree = ((tree == NULL) ? right_par
2177 : create_tree (tree, right_par, CONCAT, 0));
2178 tree = create_tree (left_par, tree, CONCAT, 0);
2179 if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
2184 dfa->nodes[new_idx].opr.idx = cur_nsub;
2189 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2192 parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2193 bin_tree_t *dup_elem;
2194 re_string_t *regexp;
2197 reg_syntax_t syntax;
2200 re_token_t dup_token;
2201 bin_tree_t *tree = dup_elem, *work_tree;
2202 int new_idx, start_idx = re_string_cur_idx (regexp);
2203 re_token_t start_token = *token;
2204 if (token->type == OP_OPEN_DUP_NUM)
2208 int start = fetch_number (regexp, token, syntax);
2212 if (token->type == CHARACTER && token->opr.c == ',')
2213 start = 0; /* We treat "{,m}" as "{0,m}". */
2216 *err = REG_BADBR; /* <re>{} is invalid. */
2220 if (BE (start != -2, 1))
2222 /* We treat "{n}" as "{n,n}". */
2223 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2224 : ((token->type == CHARACTER && token->opr.c == ',')
2225 ? fetch_number (regexp, token, syntax) : -2));
2227 if (BE (start == -2 || end == -2, 0))
2229 /* Invalid sequence. */
2230 if (token->type == OP_CLOSE_DUP_NUM)
2231 goto parse_dup_op_invalid_interval;
2233 goto parse_dup_op_ebrace;
2235 if (BE (start == 0 && end == 0, 0))
2237 /* We treat "<re>{0}" and "<re>{0,0}" as null string. */
2238 *token = fetch_token (regexp, syntax);
2239 free_bin_tree (dup_elem);
2243 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2245 for (i = 0; i < start; ++i)
2248 work_tree = duplicate_tree (elem, dfa);
2249 tree = create_tree (tree, work_tree, CONCAT, 0);
2250 if (BE (work_tree == NULL || tree == NULL, 0))
2251 goto parse_dup_op_espace;
2256 /* We treat "<re>{0,}" as "<re>*". */
2257 dup_token.type = OP_DUP_ASTERISK;
2260 elem = duplicate_tree (elem, dfa);
2261 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2262 work_tree = create_tree (elem, NULL, 0, new_idx);
2263 tree = create_tree (tree, work_tree, CONCAT, 0);
2264 if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
2265 || tree == NULL, 0))
2266 goto parse_dup_op_espace;
2270 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2271 tree = create_tree (elem, NULL, 0, new_idx);
2272 if (BE (new_idx == -1 || tree == NULL, 0))
2273 goto parse_dup_op_espace;
2276 else if (end - start > 0)
2278 /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
2279 dup_token.type = OP_DUP_QUESTION;
2282 elem = duplicate_tree (elem, dfa);
2283 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2284 elem = create_tree (elem, NULL, 0, new_idx);
2285 tree = create_tree (tree, elem, CONCAT, 0);
2286 if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
2287 goto parse_dup_op_espace;
2291 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2292 tree = elem = create_tree (elem, NULL, 0, new_idx);
2293 if (BE (new_idx == -1 || tree == NULL, 0))
2294 goto parse_dup_op_espace;
2296 for (i = 1; i < end - start; ++i)
2298 work_tree = duplicate_tree (elem, dfa);
2299 tree = create_tree (tree, work_tree, CONCAT, 0);
2300 if (BE (work_tree == NULL || tree == NULL, 0))
2310 new_idx = re_dfa_add_node (dfa, *token, 0);
2311 tree = create_tree (tree, NULL, 0, new_idx);
2312 if (BE (new_idx == -1 || tree == NULL, 0))
2318 *token = fetch_token (regexp, syntax);
2321 parse_dup_op_espace:
2322 free_bin_tree (tree);
2326 parse_dup_op_ebrace:
2327 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2332 goto parse_dup_op_rollback;
2333 parse_dup_op_invalid_interval:
2334 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2339 parse_dup_op_rollback:
2340 re_string_set_index (regexp, start_idx);
2341 *token = start_token;
2342 token->type = CHARACTER;
2346 /* Size of the names for collating symbol/equivalence_class/character_class.
2347 I'm not sure, but maybe enough. */
2348 #define BRACKET_NAME_BUF_SIZE 32
2351 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2352 Build the range expression which starts from START_ELEM, and ends
2353 at END_ELEM. The result are written to MBCSET and SBCSET.
2354 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2355 mbcset->range_ends, is a pointer argument sinse we may
2358 static reg_errcode_t
2359 # ifdef RE_ENABLE_I18N
2360 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2361 re_charset_t *mbcset;
2363 # else /* not RE_ENABLE_I18N */
2364 build_range_exp (sbcset, start_elem, end_elem)
2365 # endif /* not RE_ENABLE_I18N */
2366 re_bitset_ptr_t sbcset;
2367 bracket_elem_t *start_elem, *end_elem;
2369 unsigned int start_ch, end_ch;
2370 /* Equivalence Classes and Character Classes can't be a range start/end. */
2371 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2372 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2376 /* We can handle no multi character collating elements without libc
2378 if (BE ((start_elem->type == COLL_SYM
2379 && strlen ((char *) start_elem->opr.name) > 1)
2380 || (end_elem->type == COLL_SYM
2381 && strlen ((char *) end_elem->opr.name) > 1), 0))
2382 return REG_ECOLLATE;
2384 # ifdef RE_ENABLE_I18N
2386 wchar_t wc, start_wc, end_wc;
2387 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2389 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2390 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2392 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2393 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2395 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2396 ? __btowc (start_ch) : start_elem->opr.wch);
2397 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2398 ? __btowc (end_ch) : end_elem->opr.wch);
2399 cmp_buf[0] = start_wc;
2400 cmp_buf[4] = end_wc;
2401 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2404 /* Check the space of the arrays. */
2405 if (*range_alloc == mbcset->nranges)
2407 /* There are not enough space, need realloc. */
2408 wchar_t *new_array_start, *new_array_end;
2411 /* +1 in case of mbcset->nranges is 0. */
2412 new_nranges = 2 * mbcset->nranges + 1;
2413 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2414 are NULL if *range_alloc == 0. */
2415 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2417 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2420 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2423 mbcset->range_starts = new_array_start;
2424 mbcset->range_ends = new_array_end;
2425 *range_alloc = new_nranges;
2428 mbcset->range_starts[mbcset->nranges] = start_wc;
2429 mbcset->range_ends[mbcset->nranges++] = end_wc;
2431 /* Build the table for single byte characters. */
2432 for (wc = 0; wc <= SBC_MAX; ++wc)
2435 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2436 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2437 bitset_set (sbcset, wc);
2440 # else /* not RE_ENABLE_I18N */
2443 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2444 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2446 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2447 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2449 if (start_ch > end_ch)
2451 /* Build the table for single byte characters. */
2452 for (ch = 0; ch <= SBC_MAX; ++ch)
2453 if (start_ch <= ch && ch <= end_ch)
2454 bitset_set (sbcset, ch);
2456 # endif /* not RE_ENABLE_I18N */
2459 #endif /* not _LIBC */
2462 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2463 Build the collating element which is represented by NAME.
2464 The result are written to MBCSET and SBCSET.
2465 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2466 pointer argument since we may update it. */
2468 static reg_errcode_t
2469 # ifdef RE_ENABLE_I18N
2470 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2471 re_charset_t *mbcset;
2472 int *coll_sym_alloc;
2473 # else /* not RE_ENABLE_I18N */
2474 build_collating_symbol (sbcset, name)
2475 # endif /* not RE_ENABLE_I18N */
2476 re_bitset_ptr_t sbcset;
2477 const unsigned char *name;
2479 size_t name_len = strlen ((const char *) name);
2480 if (BE (name_len != 1, 0))
2481 return REG_ECOLLATE;
2484 bitset_set (sbcset, name[0]);
2488 #endif /* not _LIBC */
2490 /* This function parse bracket expression like "[abc]", "[a-c]",
2494 parse_bracket_exp (regexp, dfa, token, syntax, err)
2495 re_string_t *regexp;
2498 reg_syntax_t syntax;
2502 const unsigned char *collseqmb;
2503 const char *collseqwc;
2506 const int32_t *symb_table;
2507 const unsigned char *extra;
2509 /* Local function for parse_bracket_exp used in _LIBC environement.
2510 Seek the collating symbol entry correspondings to NAME.
2511 Return the index of the symbol in the SYMB_TABLE. */
2513 static inline int32_t
2514 seek_collating_symbol_entry (name, name_len)
2515 const unsigned char *name;
2518 int32_t hash = elem_hash ((const char *) name, name_len);
2519 int32_t elem = hash % table_size;
2520 int32_t second = hash % (table_size - 2);
2521 while (symb_table[2 * elem] != 0)
2523 /* First compare the hashing value. */
2524 if (symb_table[2 * elem] == hash
2525 /* Compare the length of the name. */
2526 && name_len == extra[symb_table[2 * elem + 1]]
2527 /* Compare the name. */
2528 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2531 /* Yep, this is the entry. */
2541 /* Local function for parse_bracket_exp used in _LIBC environement.
2542 Look up the collation sequence value of BR_ELEM.
2543 Return the value if succeeded, UINT_MAX otherwise. */
2545 static inline unsigned int
2546 lookup_collation_sequence_value (br_elem)
2547 bracket_elem_t *br_elem;
2549 if (br_elem->type == SB_CHAR)
2552 if (MB_CUR_MAX == 1)
2555 return collseqmb[br_elem->opr.ch];
2558 wint_t wc = __btowc (br_elem->opr.ch);
2559 return collseq_table_lookup (collseqwc, wc);
2562 else if (br_elem->type == MB_CHAR)
2564 return collseq_table_lookup (collseqwc, br_elem->opr.wch);
2566 else if (br_elem->type == COLL_SYM)
2568 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2572 elem = seek_collating_symbol_entry (br_elem->opr.name,
2574 if (symb_table[2 * elem] != 0)
2576 /* We found the entry. */
2577 idx = symb_table[2 * elem + 1];
2578 /* Skip the name of collating element name. */
2579 idx += 1 + extra[idx];
2580 /* Skip the byte sequence of the collating element. */
2581 idx += 1 + extra[idx];
2582 /* Adjust for the alignment. */
2583 idx = (idx + 3) & ~3;
2584 /* Skip the multibyte collation sequence value. */
2585 idx += sizeof (unsigned int);
2586 /* Skip the wide char sequence of the collating element. */
2587 idx += sizeof (unsigned int) *
2588 (1 + *(unsigned int *) (extra + idx));
2589 /* Return the collation sequence value. */
2590 return *(unsigned int *) (extra + idx);
2592 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2594 /* No valid character. Match it as a single byte
2596 return collseqmb[br_elem->opr.name[0]];
2599 else if (sym_name_len == 1)
2600 return collseqmb[br_elem->opr.name[0]];
2605 /* Local function for parse_bracket_exp used in _LIBC environement.
2606 Build the range expression which starts from START_ELEM, and ends
2607 at END_ELEM. The result are written to MBCSET and SBCSET.
2608 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2609 mbcset->range_ends, is a pointer argument sinse we may
2612 static inline reg_errcode_t
2613 # ifdef RE_ENABLE_I18N
2614 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2615 re_charset_t *mbcset;
2617 # else /* not RE_ENABLE_I18N */
2618 build_range_exp (sbcset, start_elem, end_elem)
2619 # endif /* not RE_ENABLE_I18N */
2620 re_bitset_ptr_t sbcset;
2621 bracket_elem_t *start_elem, *end_elem;
2624 uint32_t start_collseq;
2625 uint32_t end_collseq;
2627 # ifdef RE_ENABLE_I18N
2628 /* Check the space of the arrays. */
2629 if (*range_alloc == mbcset->nranges)
2631 /* There are not enough space, need realloc. */
2632 uint32_t *new_array_start;
2633 uint32_t *new_array_end;
2636 /* +1 in case of mbcset->nranges is 0. */
2637 new_nranges = 2 * mbcset->nranges + 1;
2638 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2639 are NULL if *range_alloc == 0. */
2640 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2642 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2645 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2648 mbcset->range_starts = new_array_start;
2649 mbcset->range_ends = new_array_end;
2650 *range_alloc = new_nranges;
2652 # endif /* RE_ENABLE_I18N */
2654 /* Equivalence Classes and Character Classes can't be a range
2656 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2657 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2661 start_collseq = lookup_collation_sequence_value (start_elem);
2662 end_collseq = lookup_collation_sequence_value (end_elem);
2663 /* Check start/end collation sequence values. */
2664 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2665 return REG_ECOLLATE;
2666 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2669 # ifdef RE_ENABLE_I18N
2670 /* Got valid collation sequence values, add them as a new entry. */
2671 mbcset->range_starts[mbcset->nranges] = start_collseq;
2672 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2673 # endif /* RE_ENABLE_I18N */
2675 /* Build the table for single byte characters. */
2676 for (ch = 0; ch <= SBC_MAX; ch++)
2678 uint32_t ch_collseq;
2680 if (MB_CUR_MAX == 1)
2683 ch_collseq = collseqmb[ch];
2685 ch_collseq = collseq_table_lookup (collseqwc, __btowc (ch));
2686 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2687 bitset_set (sbcset, ch);
2692 /* Local function for parse_bracket_exp used in _LIBC environement.
2693 Build the collating element which is represented by NAME.
2694 The result are written to MBCSET and SBCSET.
2695 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2696 pointer argument sinse we may update it. */
2698 static inline reg_errcode_t
2699 # ifdef RE_ENABLE_I18N
2700 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2701 re_charset_t *mbcset;
2702 int *coll_sym_alloc;
2703 # else /* not RE_ENABLE_I18N */
2704 build_collating_symbol (sbcset, name)
2705 # endif /* not RE_ENABLE_I18N */
2706 re_bitset_ptr_t sbcset;
2707 const unsigned char *name;
2710 size_t name_len = strlen ((const char *) name);
2713 elem = seek_collating_symbol_entry (name, name_len);
2714 if (symb_table[2 * elem] != 0)
2716 /* We found the entry. */
2717 idx = symb_table[2 * elem + 1];
2718 /* Skip the name of collating element name. */
2719 idx += 1 + extra[idx];
2721 else if (symb_table[2 * elem] == 0 && name_len == 1)
2723 /* No valid character, treat it as a normal
2725 bitset_set (sbcset, name[0]);
2729 return REG_ECOLLATE;
2731 # ifdef RE_ENABLE_I18N
2732 /* Got valid collation sequence, add it as a new entry. */
2733 /* Check the space of the arrays. */
2734 if (*coll_sym_alloc == mbcset->ncoll_syms)
2736 /* Not enough, realloc it. */
2737 /* +1 in case of mbcset->ncoll_syms is 0. */
2738 *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2739 /* Use realloc since mbcset->coll_syms is NULL
2741 mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2743 if (BE (mbcset->coll_syms == NULL, 0))
2746 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2747 # endif /* RE_ENABLE_I18N */
2752 if (BE (name_len != 1, 0))
2753 return REG_ECOLLATE;
2756 bitset_set (sbcset, name[0]);
2763 re_token_t br_token;
2764 re_bitset_ptr_t sbcset;
2765 #ifdef RE_ENABLE_I18N
2766 re_charset_t *mbcset;
2767 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2768 int equiv_class_alloc = 0, char_class_alloc = 0;
2769 #else /* not RE_ENABLE_I18N */
2771 #endif /* not RE_ENABLE_I18N */
2772 bin_tree_t *work_tree;
2773 int token_len, new_idx;
2775 collseqmb = (const unsigned char *)
2776 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2777 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2783 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2784 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2785 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2786 _NL_COLLATE_SYMB_TABLEMB);
2787 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2788 _NL_COLLATE_SYMB_EXTRAMB);
2791 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2792 #ifdef RE_ENABLE_I18N
2793 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2794 #endif /* RE_ENABLE_I18N */
2795 #ifdef RE_ENABLE_I18N
2796 if (BE (sbcset == NULL || mbcset == NULL, 0))
2798 if (BE (sbcset == NULL, 0))
2799 #endif /* RE_ENABLE_I18N */
2805 token_len = peek_token_bracket (token, regexp, syntax);
2806 if (BE (token->type == END_OF_RE, 0))
2809 goto parse_bracket_exp_free_return;
2811 if (token->type == OP_NON_MATCH_LIST)
2813 #ifdef RE_ENABLE_I18N
2815 mbcset->non_match = 1;
2816 #else /* not RE_ENABLE_I18N */
2818 #endif /* not RE_ENABLE_I18N */
2819 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2820 bitset_set (sbcset, '\0');
2821 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2822 token_len = peek_token_bracket (token, regexp, syntax);
2823 if (BE (token->type == END_OF_RE, 0))
2826 goto parse_bracket_exp_free_return;
2828 #ifdef RE_ENABLE_I18N
2830 for (i = 0; i < SBC_MAX; ++i)
2831 if (__btowc (i) == WEOF)
2832 bitset_set (sbcset, i);
2833 #endif /* RE_ENABLE_I18N */
2836 /* We treat the first ']' as a normal character. */
2837 if (token->type == OP_CLOSE_BRACKET)
2838 token->type = CHARACTER;
2842 bracket_elem_t start_elem, end_elem;
2843 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
2844 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
2846 int token_len2 = 0, is_range_exp = 0;
2849 start_elem.opr.name = start_name_buf;
2850 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
2852 if (BE (ret != REG_NOERROR, 0))
2855 goto parse_bracket_exp_free_return;
2858 token_len = peek_token_bracket (token, regexp, syntax);
2859 if (BE (token->type == END_OF_RE, 0))
2862 goto parse_bracket_exp_free_return;
2864 if (token->type == OP_CHARSET_RANGE)
2866 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
2867 token_len2 = peek_token_bracket (&token2, regexp, syntax);
2868 if (BE (token->type == END_OF_RE, 0))
2871 goto parse_bracket_exp_free_return;
2873 if (token2.type == OP_CLOSE_BRACKET)
2875 /* We treat the last '-' as a normal character. */
2876 re_string_skip_bytes (regexp, -token_len);
2877 token->type = CHARACTER;
2883 if (is_range_exp == 1)
2885 end_elem.opr.name = end_name_buf;
2886 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
2888 if (BE (ret != REG_NOERROR, 0))
2891 goto parse_bracket_exp_free_return;
2894 token_len = peek_token_bracket (token, regexp, syntax);
2895 if (BE (token->type == END_OF_RE, 0))
2898 goto parse_bracket_exp_free_return;
2900 *err = build_range_exp (sbcset,
2901 #ifdef RE_ENABLE_I18N
2902 mbcset, &range_alloc,
2903 #endif /* RE_ENABLE_I18N */
2904 &start_elem, &end_elem);
2905 if (BE (*err != REG_NOERROR, 0))
2906 goto parse_bracket_exp_free_return;
2910 switch (start_elem.type)
2913 bitset_set (sbcset, start_elem.opr.ch);
2915 #ifdef RE_ENABLE_I18N
2917 /* Check whether the array has enough space. */
2918 if (mbchar_alloc == mbcset->nmbchars)
2920 /* Not enough, realloc it. */
2921 /* +1 in case of mbcset->nmbchars is 0. */
2922 mbchar_alloc = 2 * mbcset->nmbchars + 1;
2923 /* Use realloc since array is NULL if *alloc == 0. */
2924 mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
2926 if (BE (mbcset->mbchars == NULL, 0))
2927 goto parse_bracket_exp_espace;
2929 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
2931 #endif /* RE_ENABLE_I18N */
2933 *err = build_equiv_class (sbcset,
2934 #ifdef RE_ENABLE_I18N
2935 mbcset, &equiv_class_alloc,
2936 #endif /* RE_ENABLE_I18N */
2937 start_elem.opr.name);
2938 if (BE (*err != REG_NOERROR, 0))
2939 goto parse_bracket_exp_free_return;
2942 *err = build_collating_symbol (sbcset,
2943 #ifdef RE_ENABLE_I18N
2944 mbcset, &coll_sym_alloc,
2945 #endif /* RE_ENABLE_I18N */
2946 start_elem.opr.name);
2947 if (BE (*err != REG_NOERROR, 0))
2948 goto parse_bracket_exp_free_return;
2951 ret = build_charclass (sbcset,
2952 #ifdef RE_ENABLE_I18N
2953 mbcset, &char_class_alloc,
2954 #endif /* RE_ENABLE_I18N */
2955 start_elem.opr.name, syntax);
2956 if (BE (ret != REG_NOERROR, 0))
2957 goto parse_bracket_exp_espace;
2964 if (token->type == OP_CLOSE_BRACKET)
2968 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2970 /* If it is non-matching list. */
2971 #ifdef RE_ENABLE_I18N
2972 if (mbcset->non_match)
2973 #else /* not RE_ENABLE_I18N */
2975 #endif /* not RE_ENABLE_I18N */
2976 bitset_not (sbcset);
2978 /* Build a tree for simple bracket. */
2979 br_token.type = SIMPLE_BRACKET;
2980 br_token.opr.sbcset = sbcset;
2981 new_idx = re_dfa_add_node (dfa, br_token, 0);
2982 work_tree = create_tree (NULL, NULL, 0, new_idx);
2983 if (BE (new_idx == -1 || work_tree == NULL, 0))
2984 goto parse_bracket_exp_espace;
2986 #ifdef RE_ENABLE_I18N
2987 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
2988 || mbcset->nranges || (MB_CUR_MAX > 1 && (mbcset->nchar_classes
2989 || mbcset->non_match)))
2991 re_token_t alt_token;
2992 bin_tree_t *mbc_tree;
2993 /* Build a tree for complex bracket. */
2994 br_token.type = COMPLEX_BRACKET;
2995 br_token.opr.mbcset = mbcset;
2996 dfa->has_mb_node = 1;
2997 new_idx = re_dfa_add_node (dfa, br_token, 0);
2998 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
2999 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3000 goto parse_bracket_exp_espace;
3001 /* Then join them by ALT node. */
3002 dfa->has_plural_match = 1;
3003 alt_token.type = OP_ALT;
3004 new_idx = re_dfa_add_node (dfa, alt_token, 0);
3005 work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
3006 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3011 free_charset (mbcset);
3014 #else /* not RE_ENABLE_I18N */
3016 #endif /* not RE_ENABLE_I18N */
3018 parse_bracket_exp_espace:
3020 parse_bracket_exp_free_return:
3022 #ifdef RE_ENABLE_I18N
3023 free_charset (mbcset);
3024 #endif /* RE_ENABLE_I18N */
3028 /* Parse an element in the bracket expression. */
3030 static reg_errcode_t
3031 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
3032 bracket_elem_t *elem;
3033 re_string_t *regexp;
3037 reg_syntax_t syntax;
3039 #ifdef RE_ENABLE_I18N
3041 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3042 if (cur_char_size > 1)
3044 elem->type = MB_CHAR;
3045 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3046 re_string_skip_bytes (regexp, cur_char_size);
3049 #endif /* RE_ENABLE_I18N */
3050 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3051 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3052 || token->type == OP_OPEN_EQUIV_CLASS)
3053 return parse_bracket_symbol (elem, regexp, token);
3054 elem->type = SB_CHAR;
3055 elem->opr.ch = token->opr.c;
3059 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3060 such as [:<character_class>:], [.<collating_element>.], and
3061 [=<equivalent_class>=]. */
3063 static reg_errcode_t
3064 parse_bracket_symbol (elem, regexp, token)
3065 bracket_elem_t *elem;
3066 re_string_t *regexp;
3069 unsigned char ch, delim = token->opr.c;
3073 if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
3075 if (token->type == OP_OPEN_CHAR_CLASS)
3076 ch = re_string_fetch_byte_case (regexp);
3078 ch = re_string_fetch_byte (regexp);
3079 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3081 elem->opr.name[i] = ch;
3083 re_string_skip_bytes (regexp, 1);
3084 elem->opr.name[i] = '\0';
3085 switch (token->type)
3087 case OP_OPEN_COLL_ELEM:
3088 elem->type = COLL_SYM;
3090 case OP_OPEN_EQUIV_CLASS:
3091 elem->type = EQUIV_CLASS;
3093 case OP_OPEN_CHAR_CLASS:
3094 elem->type = CHAR_CLASS;
3102 /* Helper function for parse_bracket_exp.
3103 Build the equivalence class which is represented by NAME.
3104 The result are written to MBCSET and SBCSET.
3105 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3106 is a pointer argument sinse we may update it. */
3108 static reg_errcode_t
3109 #ifdef RE_ENABLE_I18N
3110 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3111 re_charset_t *mbcset;
3112 int *equiv_class_alloc;
3113 #else /* not RE_ENABLE_I18N */
3114 build_equiv_class (sbcset, name)
3115 #endif /* not RE_ENABLE_I18N */
3116 re_bitset_ptr_t sbcset;
3117 const unsigned char *name;
3119 #if defined _LIBC && defined RE_ENABLE_I18N
3120 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3123 const int32_t *table, *indirect;
3124 const unsigned char *weights, *extra, *cp;
3125 unsigned char char_buf[2];
3129 /* This #include defines a local function! */
3130 # include <locale/weight.h>
3131 /* Calculate the index for equivalence class. */
3133 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3134 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3135 _NL_COLLATE_WEIGHTMB);
3136 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3137 _NL_COLLATE_EXTRAMB);
3138 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3139 _NL_COLLATE_INDIRECTMB);
3140 idx1 = findidx (&cp);
3141 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3142 /* This isn't a valid character. */
3143 return REG_ECOLLATE;
3145 /* Build single byte matcing table for this equivalence class. */
3146 char_buf[1] = (unsigned char) '\0';
3147 len = weights[idx1];
3148 for (ch = 0; ch < SBC_MAX; ++ch)
3152 idx2 = findidx (&cp);
3157 /* This isn't a valid character. */
3159 if (len == weights[idx2])
3162 while (cnt <= len &&
3163 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3167 bitset_set (sbcset, ch);
3170 /* Check whether the array has enough space. */
3171 if (*equiv_class_alloc == mbcset->nequiv_classes)
3173 /* Not enough, realloc it. */
3174 /* +1 in case of mbcset->nequiv_classes is 0. */
3175 *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3176 /* Use realloc since the array is NULL if *alloc == 0. */
3177 mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
3178 *equiv_class_alloc);
3179 if (BE (mbcset->equiv_classes == NULL, 0))
3182 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3185 #endif /* _LIBC && RE_ENABLE_I18N */
3187 if (BE (strlen ((const char *) name) != 1, 0))
3188 return REG_ECOLLATE;
3189 bitset_set (sbcset, *name);
3194 /* Helper function for parse_bracket_exp.
3195 Build the character class which is represented by NAME.
3196 The result are written to MBCSET and SBCSET.
3197 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3198 is a pointer argument sinse we may update it. */
3200 static reg_errcode_t
3201 #ifdef RE_ENABLE_I18N
3202 build_charclass (sbcset, mbcset, char_class_alloc, class_name, syntax)
3203 re_charset_t *mbcset;
3204 int *char_class_alloc;
3205 #else /* not RE_ENABLE_I18N */
3206 build_charclass (sbcset, class_name, syntax)
3207 #endif /* not RE_ENABLE_I18N */
3208 re_bitset_ptr_t sbcset;
3209 const unsigned char *class_name;
3210 reg_syntax_t syntax;
3213 const char *name = (const char *) class_name;
3215 /* In case of REG_ICASE "upper" and "lower" match the both of
3216 upper and lower cases. */
3217 if ((syntax & RE_ICASE)
3218 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3221 #ifdef RE_ENABLE_I18N
3222 /* Check the space of the arrays. */
3223 if (*char_class_alloc == mbcset->nchar_classes)
3225 /* Not enough, realloc it. */
3226 /* +1 in case of mbcset->nchar_classes is 0. */
3227 *char_class_alloc = 2 * mbcset->nchar_classes + 1;
3228 /* Use realloc since array is NULL if *alloc == 0. */
3229 mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
3231 if (BE (mbcset->char_classes == NULL, 0))
3234 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3235 #endif /* RE_ENABLE_I18N */
3237 #define BUILD_CHARCLASS_LOOP(ctype_func)\
3238 for (i = 0; i < SBC_MAX; ++i) \
3240 if (ctype_func (i)) \
3241 bitset_set (sbcset, i); \
3244 if (strcmp (name, "alnum") == 0)
3245 BUILD_CHARCLASS_LOOP (isalnum)
3246 else if (strcmp (name, "cntrl") == 0)
3247 BUILD_CHARCLASS_LOOP (iscntrl)
3248 else if (strcmp (name, "lower") == 0)
3249 BUILD_CHARCLASS_LOOP (islower)
3250 else if (strcmp (name, "space") == 0)
3251 BUILD_CHARCLASS_LOOP (isspace)
3252 else if (strcmp (name, "alpha") == 0)
3253 BUILD_CHARCLASS_LOOP (isalpha)
3254 else if (strcmp (name, "digit") == 0)
3255 BUILD_CHARCLASS_LOOP (isdigit)
3256 else if (strcmp (name, "print") == 0)
3257 BUILD_CHARCLASS_LOOP (isprint)
3258 else if (strcmp (name, "upper") == 0)
3259 BUILD_CHARCLASS_LOOP (isupper)
3260 else if (strcmp (name, "blank") == 0)
3261 BUILD_CHARCLASS_LOOP (isblank)
3262 else if (strcmp (name, "graph") == 0)
3263 BUILD_CHARCLASS_LOOP (isgraph)
3264 else if (strcmp (name, "punct") == 0)
3265 BUILD_CHARCLASS_LOOP (ispunct)
3266 else if (strcmp (name, "xdigit") == 0)
3267 BUILD_CHARCLASS_LOOP (isxdigit)
3275 build_word_op (dfa, not, err)
3280 re_bitset_ptr_t sbcset;
3281 #ifdef RE_ENABLE_I18N
3282 re_charset_t *mbcset;
3284 #else /* not RE_ENABLE_I18N */
3286 #endif /* not RE_ENABLE_I18N */
3288 re_token_t br_token;
3292 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3293 #ifdef RE_ENABLE_I18N
3294 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3295 #endif /* RE_ENABLE_I18N */
3297 #ifdef RE_ENABLE_I18N
3298 if (BE (sbcset == NULL || mbcset == NULL, 0))
3299 #else /* not RE_ENABLE_I18N */
3300 if (BE (sbcset == NULL, 0))
3301 #endif /* not RE_ENABLE_I18N */
3309 #ifdef RE_ENABLE_I18N
3312 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3313 bitset_set(cset->sbcset, '\0');
3315 mbcset->non_match = 1;
3317 for (i = 0; i < SBC_MAX; ++i)
3318 if (__btowc (i) == WEOF)
3319 bitset_set (sbcset, i);
3320 #else /* not RE_ENABLE_I18N */
3322 #endif /* not RE_ENABLE_I18N */
3325 /* We don't care the syntax in this case. */
3326 ret = build_charclass (sbcset,
3327 #ifdef RE_ENABLE_I18N
3329 #endif /* RE_ENABLE_I18N */
3330 (const unsigned char *) "alpha", 0);
3332 if (BE (ret != REG_NOERROR, 0))
3335 #ifdef RE_ENABLE_I18N
3336 free_charset (mbcset);
3337 #endif /* RE_ENABLE_I18N */
3341 /* \w match '_' also. */
3342 bitset_set (sbcset, '_');
3344 /* If it is non-matching list. */
3345 #ifdef RE_ENABLE_I18N
3346 if (mbcset->non_match)
3347 #else /* not RE_ENABLE_I18N */
3349 #endif /* not RE_ENABLE_I18N */
3350 bitset_not (sbcset);
3352 /* Build a tree for simple bracket. */
3353 br_token.type = SIMPLE_BRACKET;
3354 br_token.opr.sbcset = sbcset;
3355 new_idx = re_dfa_add_node (dfa, br_token, 0);
3356 tree = create_tree (NULL, NULL, 0, new_idx);
3357 if (BE (new_idx == -1 || tree == NULL, 0))
3358 goto build_word_op_espace;
3360 #ifdef RE_ENABLE_I18N
3363 re_token_t alt_token;
3364 bin_tree_t *mbc_tree;
3365 /* Build a tree for complex bracket. */
3366 br_token.type = COMPLEX_BRACKET;
3367 br_token.opr.mbcset = mbcset;
3368 dfa->has_mb_node = 1;
3369 new_idx = re_dfa_add_node (dfa, br_token, 0);
3370 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3371 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3372 goto build_word_op_espace;
3373 /* Then join them by ALT node. */
3374 alt_token.type = OP_ALT;
3375 new_idx = re_dfa_add_node (dfa, alt_token, 0);
3376 tree = create_tree (tree, mbc_tree, 0, new_idx);
3377 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3382 free_charset (mbcset);
3385 #else /* not RE_ENABLE_I18N */
3387 #endif /* not RE_ENABLE_I18N */
3389 build_word_op_espace:
3391 #ifdef RE_ENABLE_I18N
3392 free_charset (mbcset);
3393 #endif /* RE_ENABLE_I18N */
3398 /* This is intended for the expressions like "a{1,3}".
3399 Fetch a number from `input', and return the number.
3400 Return -1, if the number field is empty like "{,1}".
3401 Return -2, If an error is occured. */
3404 fetch_number (input, token, syntax)
3407 reg_syntax_t syntax;
3413 *token = fetch_token (input, syntax);
3415 if (BE (token->type == END_OF_RE, 0))
3417 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3419 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3420 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3421 num = (num > RE_DUP_MAX) ? -2 : num;
3426 #ifdef RE_ENABLE_I18N
3428 free_charset (re_charset_t *cset)
3430 re_free (cset->mbchars);
3432 re_free (cset->coll_syms);
3433 re_free (cset->equiv_classes);
3434 re_free (cset->range_starts);
3435 re_free (cset->range_ends);
3437 re_free (cset->char_classes);
3440 #endif /* RE_ENABLE_I18N */
3442 /* Functions for binary tree operation. */
3444 /* Create a node of tree.
3445 Note: This function automatically free left and right if malloc fails. */
3448 create_tree (left, right, type, index)
3451 re_token_type_t type;
3455 tree = re_malloc (bin_tree_t, 1);
3456 if (BE (tree == NULL, 0))
3458 free_bin_tree (left);
3459 free_bin_tree (right);
3462 tree->parent = NULL;
3464 tree->right = right;
3466 tree->node_idx = index;
3469 re_node_set_init_empty (&tree->eclosure);
3472 left->parent = tree;
3474 right->parent = tree;
3478 /* Free the sub tree pointed by TREE. */
3481 free_bin_tree (tree)
3486 /*re_node_set_free (&tree->eclosure);*/
3487 free_bin_tree (tree->left);
3488 free_bin_tree (tree->right);
3492 /* Duplicate the node SRC, and return new node. */
3495 duplicate_tree (src, dfa)
3496 const bin_tree_t *src;
3499 bin_tree_t *left = NULL, *right = NULL, *new_tree;
3501 /* Since node indies must be according to Post-order of the tree,
3502 we must duplicate the left at first. */
3503 if (src->left != NULL)
3505 left = duplicate_tree (src->left, dfa);
3510 /* Secondaly, duplicate the right. */
3511 if (src->right != NULL)
3513 right = duplicate_tree (src->right, dfa);
3516 free_bin_tree (left);
3521 /* At last, duplicate itself. */
3522 if (src->type == NON_TYPE)
3524 new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3525 dfa->nodes[new_node_idx].duplicated = 1;
3526 if (BE (new_node_idx == -1, 0))
3528 free_bin_tree (left);
3529 free_bin_tree (right);
3534 new_node_idx = src->type;
3536 new_tree = create_tree (left, right, src->type, new_node_idx);
3537 if (BE (new_tree == NULL, 0))
3539 free_bin_tree (left);
3540 free_bin_tree (right);