summaryrefslogtreecommitdiff
path: root/src/regex/regcomp.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/regex/regcomp.c')
-rw-r--r--src/regex/regcomp.c1715
1 files changed, 634 insertions, 1081 deletions
diff --git a/src/regex/regcomp.c b/src/regex/regcomp.c
index d9076275..4cdaa1ea 100644
--- a/src/regex/regcomp.c
+++ b/src/regex/regcomp.c
@@ -34,6 +34,7 @@
#include <regex.h>
#include <limits.h>
#include <stdint.h>
+#include <ctype.h>
#include "tre.h"
@@ -135,108 +136,88 @@ typedef struct {
tre_ast_node_t *right;
} tre_union_t;
-static tre_ast_node_t *
-tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size);
-
-static tre_ast_node_t *
-tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position);
-
-static tre_ast_node_t *
-tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
- int minimal);
-
-static tre_ast_node_t *
-tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right);
-
-static tre_ast_node_t *
-tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
- tre_ast_node_t *right);
-
static tre_ast_node_t *
-tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size)
+tre_ast_new_node(tre_mem_t mem, int type, void *obj)
{
- tre_ast_node_t *node;
-
- node = tre_mem_calloc(mem, sizeof(*node));
- if (!node)
- return NULL;
- node->obj = tre_mem_calloc(mem, size);
- if (!node->obj)
- return NULL;
- node->type = type;
- node->nullable = -1;
- node->submatch_id = -1;
-
- return node;
+ tre_ast_node_t *node = tre_mem_calloc(mem, sizeof *node);
+ if (!node || !obj)
+ return 0;
+ node->obj = obj;
+ node->type = type;
+ node->nullable = -1;
+ node->submatch_id = -1;
+ return node;
}
static tre_ast_node_t *
tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
{
- tre_ast_node_t *node;
- tre_literal_t *lit;
-
- node = tre_ast_new_node(mem, LITERAL, sizeof(tre_literal_t));
- if (!node)
- return NULL;
- lit = node->obj;
- lit->code_min = code_min;
- lit->code_max = code_max;
- lit->position = position;
-
- return node;
+ tre_ast_node_t *node;
+ tre_literal_t *lit;
+
+ lit = tre_mem_calloc(mem, sizeof *lit);
+ node = tre_ast_new_node(mem, LITERAL, lit);
+ if (!node)
+ return 0;
+ lit->code_min = code_min;
+ lit->code_max = code_max;
+ lit->position = position;
+ return node;
}
static tre_ast_node_t *
-tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
- int minimal)
+tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, int minimal)
{
- tre_ast_node_t *node;
- tre_iteration_t *iter;
-
- node = tre_ast_new_node(mem, ITERATION, sizeof(tre_iteration_t));
- if (!node)
- return NULL;
- iter = node->obj;
- iter->arg = arg;
- iter->min = min;
- iter->max = max;
- iter->minimal = minimal;
- node->num_submatches = arg->num_submatches;
-
- return node;
+ tre_ast_node_t *node;
+ tre_iteration_t *iter;
+
+ iter = tre_mem_calloc(mem, sizeof *iter);
+ node = tre_ast_new_node(mem, ITERATION, iter);
+ if (!node)
+ return 0;
+ iter->arg = arg;
+ iter->min = min;
+ iter->max = max;
+ iter->minimal = minimal;
+ node->num_submatches = arg->num_submatches;
+ return node;
}
static tre_ast_node_t *
tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
{
- tre_ast_node_t *node;
-
- node = tre_ast_new_node(mem, UNION, sizeof(tre_union_t));
- if (node == NULL)
- return NULL;
- ((tre_union_t *)node->obj)->left = left;
- ((tre_union_t *)node->obj)->right = right;
- node->num_submatches = left->num_submatches + right->num_submatches;
-
- return node;
+ tre_ast_node_t *node;
+ tre_union_t *un;
+
+ if (!left)
+ return right;
+ un = tre_mem_calloc(mem, sizeof *un);
+ node = tre_ast_new_node(mem, UNION, un);
+ if (!node || !right)
+ return 0;
+ un->left = left;
+ un->right = right;
+ node->num_submatches = left->num_submatches + right->num_submatches;
+ return node;
}
static tre_ast_node_t *
-tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
- tre_ast_node_t *right)
+tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
{
- tre_ast_node_t *node;
-
- node = tre_ast_new_node(mem, CATENATION, sizeof(tre_catenation_t));
- if (node == NULL)
- return NULL;
- ((tre_catenation_t *)node->obj)->left = left;
- ((tre_catenation_t *)node->obj)->right = right;
- node->num_submatches = left->num_submatches + right->num_submatches;
-
- return node;
+ tre_ast_node_t *node;
+ tre_catenation_t *cat;
+
+ if (!left)
+ return right;
+ cat = tre_mem_calloc(mem, sizeof *cat);
+ node = tre_ast_new_node(mem, CATENATION, cat);
+ if (!node)
+ return 0;
+ cat->left = left;
+ cat->right = right;
+ node->num_submatches = left->num_submatches + right->num_submatches;
+ return node;
}
@@ -412,1077 +393,654 @@ define_popf(voidptr, void *)
/* Parse context. */
typedef struct {
- /* Memory allocator. The AST is allocated using this. */
- tre_mem_t mem;
- /* Stack used for keeping track of regexp syntax. */
- tre_stack_t *stack;
- /* The parse result. */
- tre_ast_node_t *result;
- /* The regexp to parse and its length. */
- const char *re;
- /* The first character of the entire regexp. */
- const char *re_start;
- /* Current submatch ID. */
- int submatch_id;
- /* Current position (number of literal). */
- int position;
- /* The highest back reference or -1 if none seen so far. */
- int max_backref;
- /* This flag is set if the regexp uses approximate matching. */
- int have_approx;
- /* Compilation flags. */
- int cflags;
- /* If this flag is set the top-level submatch is not captured. */
- int nofirstsub;
+ /* Memory allocator. The AST is allocated using this. */
+ tre_mem_t mem;
+ /* Stack used for keeping track of regexp syntax. */
+ tre_stack_t *stack;
+ /* The parsed node after a parse function returns. */
+ tre_ast_node_t *n;
+ /* Position in the regexp pattern after a parse function returns. */
+ const char *s;
+ /* The first character of the regexp. */
+ const char *re;
+ /* Current submatch ID. */
+ int submatch_id;
+ /* Current position (number of literal). */
+ int position;
+ /* The highest back reference or -1 if none seen so far. */
+ int max_backref;
+ /* Compilation flags. */
+ int cflags;
} tre_parse_ctx_t;
-/* Parses a wide character regexp pattern into a syntax tree. This parser
- handles both syntaxes (BRE and ERE), including the TRE extensions. */
-static reg_errcode_t
-tre_parse(tre_parse_ctx_t *ctx);
-
-
-/*
- This parser is just a simple recursive descent parser for POSIX.2
- regexps. The parser supports both the obsolete default syntax and
- the "extended" syntax, and some nonstandard extensions.
-*/
-
-/* Characters with special meanings in regexp syntax. */
-#define CHAR_PIPE '|'
-#define CHAR_LPAREN '('
-#define CHAR_RPAREN ')'
-#define CHAR_LBRACE '{'
-#define CHAR_RBRACE '}'
-#define CHAR_LBRACKET '['
-#define CHAR_RBRACKET ']'
-#define CHAR_MINUS '-'
-#define CHAR_STAR '*'
-#define CHAR_QUESTIONMARK '?'
-#define CHAR_PLUS '+'
-#define CHAR_PERIOD '.'
-#define CHAR_COLON ':'
-#define CHAR_EQUAL '='
-#define CHAR_COMMA ','
-#define CHAR_CARET '^'
-#define CHAR_DOLLAR '$'
-#define CHAR_BACKSLASH '\\'
-#define CHAR_HASH '#'
-#define CHAR_TILDE '~'
-
-
/* Some macros for expanding \w, \s, etc. */
-static const struct tre_macro_struct {
- const char c;
- const char *expansion;
-} tre_macros[] =
- { {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
- {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
- {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
- {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
- { 0, NULL }
- };
-
+static const struct {
+ char c;
+ const char *expansion;
+} tre_macros[] = {
+ {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
+ {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
+ {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
+ {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
+ { 0, 0 }
+};
/* Expands a macro delimited by `regex' and `regex_end' to `buf', which
must have at least `len' items. Sets buf[0] to zero if the there
is no match in `tre_macros'. */
-static const char *
-tre_expand_macro(const char *regex)
+static const char *tre_expand_macro(const char *s)
{
- int i;
-
- if (!*regex)
- return 0;
-
- for (i = 0; tre_macros[i].expansion && tre_macros[i].c != *regex; i++);
- return tre_macros[i].expansion;
+ int i;
+ for (i = 0; tre_macros[i].c && tre_macros[i].c != *s; i++);
+ return tre_macros[i].expansion;
}
-static reg_errcode_t
-tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i,
- tre_ast_node_t ***items)
+static int
+tre_compare_lit(const void *a, const void *b)
{
- reg_errcode_t status;
- tre_ast_node_t **array = *items;
- /* Allocate more space if necessary. */
- if (*i >= *max_i)
- {
- tre_ast_node_t **new_items;
- /* If the array is already 1024 items large, give up -- there's
- probably an error in the regexp (e.g. not a '\0' terminated
- string and missing ']') */
- if (*max_i > 1024)
- return REG_ESPACE;
- *max_i *= 2;
- new_items = xrealloc(array, sizeof(*array) * *max_i);
- if (new_items == NULL)
- return REG_ESPACE;
- *items = array = new_items;
- }
- array[*i] = tre_ast_new_literal(mem, min, max, -1);
- status = array[*i] == NULL ? REG_ESPACE : REG_OK;
- (*i)++;
- return status;
+ const tre_literal_t *const *la = a;
+ const tre_literal_t *const *lb = b;
+ /* assumes the range of valid code_min is < INT_MAX */
+ return la[0]->code_min - lb[0]->code_min;
}
+struct literals {
+ tre_mem_t mem;
+ tre_literal_t **a;
+ int len;
+ int cap;
+};
-static int
-tre_compare_items(const void *a, const void *b)
+static tre_literal_t *tre_new_lit(struct literals *p)
{
- const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a;
- const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b;
- tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj;
- int a_min = l_a->code_min, b_min = l_b->code_min;
-
- if (a_min < b_min)
- return -1;
- else if (a_min > b_min)
- return 1;
- else
- return 0;
+ tre_literal_t **a;
+ if (p->len >= p->cap) {
+ if (p->cap >= 1<<15)
+ return 0;
+ p->cap *= 2;
+ a = xrealloc(p->a, p->cap * sizeof *p->a);
+ if (!a)
+ return 0;
+ p->a = a;
+ }
+ a = p->a + p->len++;
+ *a = tre_mem_calloc(p->mem, sizeof **a);
+ return *a;
}
-/* Maximum number of character classes that can occur in a negated bracket
- expression. */
-#define MAX_NEG_CLASSES 64
-
-/* Maximum length of character class names. */
-#define MAX_CLASS_NAME
-
-static reg_errcode_t
-tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
- tre_ctype_t neg_classes[], int *num_neg_classes,
- tre_ast_node_t ***items, int *num_items,
- int *items_size)
+static int add_icase_literals(struct literals *ls, int min, int max)
{
- const char *re = ctx->re;
- reg_errcode_t status = REG_OK;
- tre_ctype_t class = (tre_ctype_t)0;
- int i = *num_items;
- int max_i = *items_size;
- int skip;
-
- /* Build an array of the items in the bracket expression. */
- while (status == REG_OK)
- {
- skip = 0;
- if (!*re)
- {
- status = REG_EBRACK;
- }
- else if (*re == CHAR_RBRACKET && re > ctx->re)
- {
- re++;
- break;
+ tre_literal_t *lit;
+ int b, e, c;
+ for (c=min; c<=max; ) {
+ /* assumes islower(c) and isupper(c) are exclusive
+ and toupper(c)!=c if islower(c).
+ multiple opposite case characters are not supported */
+ if (tre_islower(c)) {
+ b = e = tre_toupper(c);
+ for (c++, e++; c<=max; c++, e++)
+ if (tre_toupper(c) != e) break;
+ } else if (tre_isupper(c)) {
+ b = e = tre_tolower(c);
+ for (c++, e++; c<=max; c++, e++)
+ if (tre_tolower(c) != e) break;
+ } else {
+ c++;
+ continue;
+ }
+ lit = tre_new_lit(ls);
+ if (!lit)
+ return -1;
+ lit->code_min = b;
+ lit->code_max = e-1;
+ lit->position = -1;
}
- else
- {
- tre_cint_t min = 0, max = 0;
- wchar_t wc;
- int clen = mbtowc(&wc, re, -1);
+ return 0;
+}
- if (clen<0) clen=1, wc=WEOF;
- class = (tre_ctype_t)0;
- if (*(re + clen) == CHAR_MINUS && *(re + clen + 1) != CHAR_RBRACKET)
- {
- min = wc;
- re += clen+1;
- clen = mbtowc(&wc, re, -1);
- if (clen<0) clen=1, wc=WEOF;
- max = wc;
- re += clen;
- /* XXX - Should use collation order instead of encoding values
- in character ranges. */
- if (min > max)
- status = REG_ERANGE;
- }
- else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD)
- status = REG_ECOLLATE;
- else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL)
- status = REG_ECOLLATE;
- else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
- {
- char tmp_str[64];
- const char *endptr = re + 2;
- int len;
- while (*endptr && *endptr != CHAR_COLON)
- endptr++;
- if (*endptr)
- {
- len = MIN(endptr - re - 2, 63);
- strncpy(tmp_str, re + 2, len);
- tmp_str[len] = '\0';
- class = tre_ctype(tmp_str);
- if (!class)
- status = REG_ECTYPE;
- re = endptr + 2;
- }
- else
- status = REG_ECTYPE;
- min = 0;
- max = TRE_CHAR_MAX;
- }
- else
- {
- if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET
- && ctx->re != re)
- /* Two ranges are not allowed to share and endpoint. */
- status = REG_ERANGE;
- min = max = wc;
- re += clen;
- }
+/* Maximum number of character classes in a negated bracket expression. */
+#define MAX_NEG_CLASSES 64
- if (status != REG_OK)
- break;
+struct neg {
+ int negate;
+ int len;
+ tre_ctype_t a[MAX_NEG_CLASSES];
+};
- if (class && negate)
- if (*num_neg_classes >= MAX_NEG_CLASSES)
- status = REG_ESPACE;
- else
- neg_classes[(*num_neg_classes)++] = class;
- else if (!skip)
- {
- status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
- if (status != REG_OK)
- break;
- ((tre_literal_t*)((*items)[i-1])->obj)->class = class;
- }
+// TODO: parse bracket into a set of non-overlapping [lo,hi] ranges
- /* Add opposite-case counterpoints if REG_ICASE is present.
- This is broken if there are more than two "same" characters. */
- if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip)
- {
- tre_cint_t cmin, ccurr;
+/*
+bracket grammar:
+Bracket = '[' List ']' | '[^' List ']'
+List = Term | List Term
+Term = Char | Range | Chclass | Eqclass
+Range = Char '-' Char | Char '-' '-'
+Char = Coll | coll_single
+Meta = ']' | '-'
+Coll = '[.' coll_single '.]' | '[.' coll_multi '.]' | '[.' Meta '.]'
+Eqclass = '[=' coll_single '=]' | '[=' coll_multi '=]'
+Chclass = '[:' class ':]'
+
+coll_single is a single char collating element but it can be
+ '-' only at the beginning or end of a List and
+ ']' only at the beginning of a List and
+ '^' anywhere except after the openning '['
+*/
- while (min <= max)
- {
- if (tre_islower(min))
- {
- cmin = ccurr = tre_toupper(min++);
- while (tre_islower(min) && tre_toupper(min) == ccurr + 1
- && min <= max)
- ccurr = tre_toupper(min++);
- status = tre_new_item(ctx->mem, cmin, ccurr,
- &i, &max_i, items);
- }
- else if (tre_isupper(min))
- {
- cmin = ccurr = tre_tolower(min++);
- while (tre_isupper(min) && tre_tolower(min) == ccurr + 1
- && min <= max)
- ccurr = tre_tolower(min++);
- status = tre_new_item(ctx->mem, cmin, ccurr,
- &i, &max_i, items);
- }
- else min++;
- if (status != REG_OK)
- break;
+static reg_errcode_t parse_bracket_terms(tre_parse_ctx_t *ctx, const char *s, struct literals *ls, struct neg *neg)
+{
+ const char *start = s;
+ tre_ctype_t class;
+ int min, max;
+ wchar_t wc;
+ int len;
+
+ for (;;) {
+ class = 0;
+ len = mbtowc(&wc, s, -1);
+ if (len <= 0)
+ return *s ? REG_BADPAT : REG_EBRACK;
+ if (*s == ']' && s != start) {
+ ctx->s = s+1;
+ return REG_OK;
+ }
+ if (*s == '-' && s != start && s[1] != ']' &&
+ /* extension: [a-z--@] is accepted as [a-z]|[--@] */
+ (s[1] != '-' || s[2] == ']'))
+ return REG_ERANGE;
+ if (*s == '[' && (s[1] == '.' || s[1] == '='))
+ /* collating symbols and equivalence classes are not supported */
+ return REG_ECOLLATE;
+ if (*s == '[' && s[1] == ':') {
+ char tmp[CHARCLASS_NAME_MAX+1];
+ s += 2;
+ for (len=0; len < CHARCLASS_NAME_MAX && s[len]; len++) {
+ if (s[len] == ':') {
+ memcpy(tmp, s, len);
+ tmp[len] = 0;
+ class = tre_ctype(tmp);
+ break;
+ }
+ }
+ if (!class || s[len+1] != ']')
+ return REG_ECTYPE;
+ min = 0;
+ max = TRE_CHAR_MAX;
+ s += len+2;
+ } else {
+ min = max = wc;
+ s += len;
+ if (*s == '-' && s[1] != ']') {
+ s++;
+ len = mbtowc(&wc, s, -1);
+ max = wc;
+ /* XXX - Should use collation order instead of
+ encoding values in character ranges. */
+ if (len <= 0 || min > max)
+ return REG_ERANGE;
+ s += len;
+ }
+ }
+
+ if (class && neg->negate) {
+ if (neg->len >= MAX_NEG_CLASSES)
+ return REG_ESPACE;
+ neg->a[neg->len++] = class;
+ } else {
+ tre_literal_t *lit = tre_new_lit(ls);
+ if (!lit)
+ return REG_ESPACE;
+ lit->code_min = min;
+ lit->code_max = max;
+ lit->class = class;
+ lit->position = -1;
+
+ /* Add opposite-case codepoints if REG_ICASE is present.
+ It seems that POSIX requires that bracket negation
+ should happen before case-folding, but most practical
+ implementations do it the other way around. Changing
+ the order would need efficient representation of
+ case-fold ranges and bracket range sets even with
+ simple patterns so this is ok for now. */
+ if (ctx->cflags & REG_ICASE && !class)
+ if (add_icase_literals(ls, min, max))
+ return REG_ESPACE;
}
- if (status != REG_OK)
- break;
- }
}
- }
- *num_items = i;
- *items_size = max_i;
- ctx->re = re;
- return status;
}
-static reg_errcode_t
-tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
+static reg_errcode_t parse_bracket(tre_parse_ctx_t *ctx, const char *s)
{
- tre_ast_node_t *node = NULL;
- int negate = 0;
- reg_errcode_t status = REG_OK;
- tre_ast_node_t **items, *u, *n;
- int i = 0, j, max_i = 32, curr_max, curr_min;
- tre_ctype_t neg_classes[MAX_NEG_CLASSES];
- int num_neg_classes = 0;
-
- /* Start off with an array of `max_i' elements. */
- items = xmalloc(sizeof(*items) * max_i);
- if (items == NULL)
- return REG_ESPACE;
-
- if (*ctx->re == CHAR_CARET)
- {
- negate = 1;
- ctx->re++;
- }
-
- status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes,
- &items, &i, &max_i);
-
- if (status != REG_OK)
- goto parse_bracket_done;
-
- /* Sort the array if we need to negate it. */
- if (negate)
- qsort(items, (unsigned)i, sizeof(*items), tre_compare_items);
-
- curr_max = curr_min = 0;
- /* Build a union of the items in the array, negated if necessary. */
- for (j = 0; j < i && status == REG_OK; j++)
- {
- int min, max;
- tre_literal_t *l = items[j]->obj;
- min = l->code_min;
- max = l->code_max;
-
- if (negate)
- {
- if (min < curr_max)
- {
- /* Overlap. */
- curr_max = MAX(max + 1, curr_max);
- l = NULL;
- }
- else
- {
- /* No overlap. */
- curr_max = min - 1;
- if (curr_max >= curr_min)
- {
- l->code_min = curr_min;
- l->code_max = curr_max;
+ int i, max, min, negmax, negmin;
+ tre_ast_node_t *node = 0, *n;
+ tre_ctype_t *nc = 0;
+ tre_literal_t *lit;
+ struct literals ls;
+ struct neg neg;
+ reg_errcode_t err;
+
+ ls.mem = ctx->mem;
+ ls.len = 0;
+ ls.cap = 32;
+ ls.a = xmalloc(ls.cap * sizeof *ls.a);
+ if (!ls.a)
+ return REG_ESPACE;
+ neg.len = 0;
+ neg.negate = *s == '^';
+ if (neg.negate)
+ s++;
+
+ err = parse_bracket_terms(ctx, s, &ls, &neg);
+ if (err != REG_OK)
+ goto parse_bracket_done;
+
+ if (neg.negate) {
+ /* Sort the array if we need to negate it. */
+ qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit);
+ /* extra lit for the last negated range */
+ lit = tre_new_lit(&ls);
+ if (!lit) {
+ err = REG_ESPACE;
+ goto parse_bracket_done;
}
- else
- {
- l = NULL;
+ lit->code_min = TRE_CHAR_MAX+1;
+ lit->code_max = TRE_CHAR_MAX+1;
+ lit->position = -1;
+ /* negated classes */
+ if (neg.len) {
+ nc = tre_mem_alloc(ctx->mem, (neg.len+1)*sizeof *neg.a);
+ if (!nc) {
+ err = REG_ESPACE;
+ goto parse_bracket_done;
+ }
+ memcpy(nc, neg.a, neg.len*sizeof *neg.a);
+ nc[neg.len] = 0;
}
- curr_min = curr_max = max + 1;
- }
}
- if (l != NULL)
- {
- int k;
- l->position = ctx->position;
- if (num_neg_classes > 0)
- {
- l->neg_classes = tre_mem_alloc(ctx->mem,
- (sizeof(*l->neg_classes)
- * (num_neg_classes + 1)));
- if (l->neg_classes == NULL)
- {
- status = REG_ESPACE;
- break;
+ /* Build a union of the items in the array, negated if necessary. */
+ negmax = negmin = 0;
+ for (i = 0; i < ls.len; i++) {
+ lit = ls.a[i];
+ min = lit->code_min;
+ max = lit->code_max;
+ if (neg.negate) {
+ if (min <= negmin) {
+ /* Overlap. */
+ negmin = MAX(max + 1, negmin);
+ continue;
+ }
+ negmax = min - 1;
+ lit->code_min = negmin;
+ lit->code_max = negmax;
+ negmin = max + 1;
}
- for (k = 0; k < num_neg_classes; k++)
- l->neg_classes[k] = neg_classes[k];
- l->neg_classes[k] = (tre_ctype_t)0;
- }
- else
- l->neg_classes = NULL;
- if (node == NULL)
- node = items[j];
- else
- {
- u = tre_ast_new_union(ctx->mem, node, items[j]);
- if (u == NULL)
- status = REG_ESPACE;
- node = u;
- }
- }
- }
-
- if (status != REG_OK)
- goto parse_bracket_done;
-
- if (negate)
- {
- int k;
- n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position);
- if (n == NULL)
- status = REG_ESPACE;
- else
- {
- tre_literal_t *l = n->obj;
- if (num_neg_classes > 0)
- {
- l->neg_classes = tre_mem_alloc(ctx->mem,
- (sizeof(*l->neg_classes)
- * (num_neg_classes + 1)));
- if (l->neg_classes == NULL)
- {
- status = REG_ESPACE;
- goto parse_bracket_done;
+ lit->position = ctx->position;
+ lit->neg_classes = nc;
+ n = tre_ast_new_node(ctx->mem, LITERAL, lit);
+ node = tre_ast_new_union(ctx->mem, node, n);
+ if (!node) {
+ err = REG_ESPACE;
+ break;
}
- for (k = 0; k < num_neg_classes; k++)
- l->neg_classes[k] = neg_classes[k];
- l->neg_classes[k] = (tre_ctype_t)0;
- }
- else
- l->neg_classes = NULL;
- if (node == NULL)
- node = n;
- else
- {
- u = tre_ast_new_union(ctx->mem, node, n);
- if (u == NULL)
- status = REG_ESPACE;
- node = u;
- }
}
- }
-
- if (status != REG_OK)
- goto parse_bracket_done;
-#ifdef TRE_DEBUG
- tre_ast_print(node);
-#endif /* TRE_DEBUG */
-
- parse_bracket_done:
- xfree(items);
- ctx->position++;
- *result = node;
- return status;
+parse_bracket_done:
+ xfree(ls.a);
+ ctx->position++;
+ ctx->n = node;
+ return err;
}
-
-/* Parses a positive decimal integer. Returns -1 if the string does not
- contain a valid number. */
-static int
-tre_parse_int(const char **regex)
+static const char *parse_dup_count(const char *s, int *n)
{
- int num = -1;
- const char *r = *regex;
- while (*r-'0'<10U)
- {
- if (num < 0)
- num = 0;
- num = num * 10 + *r - '0';
- r++;
- }
- *regex = r;
- return num;
+ *n = -1;
+ if (!isdigit(*s))
+ return s;
+ *n = 0;
+ for (;;) {
+ *n = 10 * *n + (*s - '0');
+ s++;
+ if (!isdigit(*s) || *n > RE_DUP_MAX)
+ break;
+ }
+ return s;
}
-
-static reg_errcode_t
-tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
+static reg_errcode_t parse_dup(tre_parse_ctx_t *ctx, const char *s)
{
- int min, max;
- const char *r = ctx->re;
- int minimal = 0;
-
- /* Parse number (minimum repetition count). */
- min = -1;
- if (*r >= '0' && *r <= '9') {
- min = tre_parse_int(&r);
- }
-
- /* Parse comma and second number (maximum repetition count). */
- max = min;
- if (*r == CHAR_COMMA)
- {
- r++;
- max = tre_parse_int(&r);
- }
-
- /* Check that the repeat counts are sane. */
- if ((max >= 0 && min > max) || max > RE_DUP_MAX)
- return REG_BADBR;
-
- /* Missing }. */
- if (!*r)
- return REG_EBRACE;
-
- /* Empty contents of {}. */
- if (r == ctx->re)
- return REG_BADBR;
-
- /* Parse the ending '}' or '\}'.*/
- if (ctx->cflags & REG_EXTENDED)
- {
- if (*r != CHAR_RBRACE)
- return REG_BADBR;
- r++;
- }
- else
- {
- if (*r != CHAR_BACKSLASH || *(r + 1) != CHAR_RBRACE)
- return REG_BADBR;
- r += 2;
- }
-
- /* Create the AST node(s). */
- if (min == 0 && max == 0)
- {
- *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
- if (*result == NULL)
- return REG_ESPACE;
- }
- else
- {
- if (min < 0 && max < 0)
- /* Only approximate parameters set, no repetitions. */
- min = max = 1;
-
- *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
- if (!*result)
- return REG_ESPACE;
- }
-
- ctx->re = r;
- return REG_OK;
+ int min, max;
+
+ s = parse_dup_count(s, &min);
+ if (*s == ',')
+ s = parse_dup_count(s+1, &max);
+ else
+ max = min;
+
+ if (
+ (max < min && max >= 0) ||
+ max > RE_DUP_MAX ||
+ min > RE_DUP_MAX ||
+ min < 0 ||
+ (!(ctx->cflags & REG_EXTENDED) && *s++ != '\\') ||
+ *s++ != '}'
+ )
+ return REG_BADBR;
+
+ if (min == 0 && max == 0)
+ ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
+ else
+ ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
+ if (!ctx->n)
+ return REG_ESPACE;
+ ctx->s = s;
+ return REG_OK;
}
-typedef enum {
- PARSE_RE = 0,
- PARSE_ATOM,
- PARSE_MARK_FOR_SUBMATCH,
- PARSE_BRANCH,
- PARSE_PIECE,
- PARSE_CATENATION,
- PARSE_POST_CATENATION,
- PARSE_UNION,
- PARSE_POST_UNION,
- PARSE_POSTFIX,
- PARSE_RESTORE_CFLAGS
-} tre_parse_re_stack_symbol_t;
-
-
-static reg_errcode_t
-tre_parse(tre_parse_ctx_t *ctx)
+static int hexval(unsigned c)
{
- tre_ast_node_t *result = NULL;
- tre_parse_re_stack_symbol_t symbol;
- reg_errcode_t status = REG_OK;
- tre_stack_t *stack = ctx->stack;
- int bottom = tre_stack_num_objects(stack);
- int depth = 0;
- wchar_t wc;
- int clen;
-
- if (!ctx->nofirstsub)
- {
- STACK_PUSH(stack, int, ctx->submatch_id);
- STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
- ctx->submatch_id++;
- }
- STACK_PUSH(stack, int, PARSE_RE);
- ctx->re_start = ctx->re;
-
-
- /* The following is basically just a recursive descent parser. I use
- an explicit stack instead of recursive functions mostly because of
- two reasons: compatibility with systems which have an overflowable
- call stack, and efficiency (both in lines of code and speed). */
- while (tre_stack_num_objects(stack) > bottom && status == REG_OK)
- {
- if (status != REG_OK)
- break;
- symbol = tre_stack_pop_int(stack);
- switch (symbol)
- {
- case PARSE_RE:
- /* Parse a full regexp. A regexp is one or more branches,
- separated by the union operator `|'. */
- if (ctx->cflags & REG_EXTENDED)
- STACK_PUSHX(stack, int, PARSE_UNION);
- STACK_PUSHX(stack, int, PARSE_BRANCH);
- break;
-
- case PARSE_BRANCH:
- /* Parse a branch. A branch is one or more pieces, concatenated.
- A piece is an atom possibly followed by a postfix operator. */
- STACK_PUSHX(stack, int, PARSE_CATENATION);
- STACK_PUSHX(stack, int, PARSE_PIECE);
- break;
-
- case PARSE_PIECE:
- /* Parse a piece. A piece is an atom possibly followed by one
- or more postfix operators. */
- STACK_PUSHX(stack, int, PARSE_POSTFIX);
- STACK_PUSHX(stack, int, PARSE_ATOM);
- break;
-
- case PARSE_CATENATION:
- /* If the expression has not ended, parse another piece. */
- {
- tre_char_t c;
- if (!*ctx->re)
- break;
- c = *ctx->re;
- if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE)
- break;
- if ((ctx->cflags & REG_EXTENDED
- && c == CHAR_RPAREN && depth > 0)
- || (!(ctx->cflags & REG_EXTENDED)
- && (c == CHAR_BACKSLASH
- && *(ctx->re + 1) == CHAR_RPAREN)))
- {
- if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
- status = REG_EPAREN;
- depth--;
- if (!(ctx->cflags & REG_EXTENDED))
- ctx->re += 2;
- break;
- }
-
- {
- /* Default case, left associative concatenation. */
- STACK_PUSHX(stack, int, PARSE_CATENATION);
- STACK_PUSHX(stack, voidptr, result);
- STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
- STACK_PUSHX(stack, int, PARSE_PIECE);
- }
- break;
- }
-
- case PARSE_POST_CATENATION:
- {
- tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
- tre_ast_node_t *tmp_node;
- tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
- if (!tmp_node)
- return REG_ESPACE;
- result = tmp_node;
- break;
- }
-
- case PARSE_UNION:
- switch (*ctx->re)
- {
- case CHAR_PIPE:
- STACK_PUSHX(stack, int, PARSE_UNION);
- STACK_PUSHX(stack, voidptr, result);
- STACK_PUSHX(stack, int, PARSE_POST_UNION);
- STACK_PUSHX(stack, int, PARSE_BRANCH);
- ctx->re++;
- break;
-
- case CHAR_RPAREN:
- ctx->re++;
- break;
-
- default:
- break;
- }
- break;
-
- case PARSE_POST_UNION:
- {
- tre_ast_node_t *tmp_node;
- tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
- tmp_node = tre_ast_new_union(ctx->mem, tree, result);
- if (!tmp_node)
- return REG_ESPACE;
- result = tmp_node;
- break;
- }
-
- case PARSE_POSTFIX:
- /* Parse postfix operators. */
- switch (*ctx->re)
- {
- case CHAR_PLUS:
- case CHAR_QUESTIONMARK:
- if (!(ctx->cflags & REG_EXTENDED))
- break;
- /*FALLTHROUGH*/
- case CHAR_STAR:
- {
- tre_ast_node_t *tmp_node;
- int minimal = 0;
- int rep_min = 0;
- int rep_max = -1;
-
- if (*ctx->re == CHAR_PLUS)
- rep_min = 1;
- if (*ctx->re == CHAR_QUESTIONMARK)
- rep_max = 1;
-
- ctx->re++;
- tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
- minimal);
- if (tmp_node == NULL)
- return REG_ESPACE;
- result = tmp_node;
- STACK_PUSHX(stack, int, PARSE_POSTFIX);
- }
- break;
-
- case CHAR_BACKSLASH:
- /* "\{" is special without REG_EXTENDED */
- if (!(ctx->cflags & REG_EXTENDED)
- && *(ctx->re + 1) == CHAR_LBRACE)
- {
- ctx->re++;
- goto parse_brace;
- }
- else
- break;
-
- case CHAR_LBRACE:
- /* "{" is literal without REG_EXTENDED */
- if (!(ctx->cflags & REG_EXTENDED))
- break;
-
- parse_brace:
- ctx->re++;
+ if (c-'0'<10) return c-'0';
+ c |= 32;
+ if (c-'a'<6) return c-'a'+10;
+ return -1;
+}
- status = tre_parse_bound(ctx, &result);
- if (status != REG_OK)
- return status;
- STACK_PUSHX(stack, int, PARSE_POSTFIX);
- break;
- }
- break;
+static reg_errcode_t marksub(tre_parse_ctx_t *ctx, tre_ast_node_t *node, int subid)
+{
+ if (node->submatch_id >= 0) {
+ tre_ast_node_t *n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
+ if (!n)
+ return REG_ESPACE;
+ n = tre_ast_new_catenation(ctx->mem, n, node);
+ if (!n)
+ return REG_ESPACE;
+ n->num_submatches = node->num_submatches;
+ node = n;
+ }
+ node->submatch_id = subid;
+ node->num_submatches++;
+ ctx->n = node;
+ return REG_OK;
+}
- case PARSE_ATOM:
- /* Parse an atom. An atom is a regular expression enclosed in `()',
- an empty set of `()', a bracket expression, `.', `^', `$',
- a `\' followed by a character, or a single character. */
+/*
+BRE grammar:
+Regex = Branch | '^' | '$' | '^$' | '^' Branch | Branch '$' | '^' Branch '$'
+Branch = Atom | Branch Atom
+Atom = char | quoted_char | '.' | Bracket | Atom Dup | '\(' Branch '\)' | back_ref
+Dup = '*' | '\{' Count '\}' | '\{' Count ',\}' | '\{' Count ',' Count '\}'
- switch (*ctx->re)
- {
- case CHAR_LPAREN: /* parenthesized subexpression */
+(leading ^ and trailing $ in a sub expr may be an anchor or literal as well)
- if (ctx->cflags & REG_EXTENDED)
- {
- lparen:
- depth++;
- {
- ctx->re++;
- /* First parse a whole RE, then mark the resulting tree
- for submatching. */
- STACK_PUSHX(stack, int, ctx->submatch_id);
- STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
- STACK_PUSHX(stack, int, PARSE_RE);
- ctx->submatch_id++;
- }
- }
- else
- goto parse_literal;
- break;
+ERE grammar:
+Regex = Branch | Regex '|' Branch
+Branch = Atom | Branch Atom
+Atom = char | quoted_char | '.' | Bracket | Atom Dup | '(' Regex ')' | '^' | '$'
+Dup = '*' | '+' | '?' | '{' Count '}' | '{' Count ',}' | '{' Count ',' Count '}'
- case CHAR_LBRACKET: /* bracket expression */
- ctx->re++;
- status = tre_parse_bracket(ctx, &result);
- if (status != REG_OK)
- return status;
- break;
+(a*+?, ^*, $+, \X, {, (|a) are unspecified)
+*/
- case CHAR_BACKSLASH:
- /* If this is "\(" or "\)" chew off the backslash and
- try again. */
- if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN)
- {
- ctx->re++;
- goto lparen;
- }
- if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_RPAREN)
- {
- goto empty_atom;
+static reg_errcode_t parse_atom(tre_parse_ctx_t *ctx, const char *s)
+{
+ int len, ere = ctx->cflags & REG_EXTENDED;
+ const char *p;
+ tre_ast_node_t *node;
+ wchar_t wc;
+ switch (*s) {
+ case '[':
+ return parse_bracket(ctx, s+1);
+ case '\\':
+ p = tre_expand_macro(s+1);
+ if (p) {
+ /* assume \X expansion is a single atom */
+ reg_errcode_t err = parse_atom(ctx, p);
+ ctx->s = s+2;
+ return err;
}
-
- /* If a macro is used, parse the expanded macro recursively. */
- {
- const char *buf = tre_expand_macro(ctx->re + 1);
- if (buf)
- {
- tre_parse_ctx_t subctx;
- memcpy(&subctx, ctx, sizeof(subctx));
- subctx.re = buf;
- subctx.nofirstsub = 1;
- status = tre_parse(&subctx);
- if (status != REG_OK)
- return status;
- ctx->re += 2;
- ctx->position = subctx.position;
- result = subctx.result;
- break;
- }
- }
-
- if (!ctx->re[1])
- /* Trailing backslash. */
- return REG_EESCAPE;
-
- ctx->re++;
- switch (*ctx->re)
- {
+ /* extensions: \b, \B, \<, \>, \xHH \x{HHHH} */
+ switch (*++s) {
+ case 0:
+ return REG_EESCAPE;
case 'b':
- result = tre_ast_new_literal(ctx->mem, ASSERTION,
- ASSERT_AT_WB, -1);
- ctx->re++;
- break;
+ node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB, -1);
+ break;
case 'B':
- result = tre_ast_new_literal(ctx->mem, ASSERTION,
- ASSERT_AT_WB_NEG, -1);
- ctx->re++;
- break;
+ node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB_NEG, -1);
+ break;
case '<':
- result = tre_ast_new_literal(ctx->mem, ASSERTION,
- ASSERT_AT_BOW, -1);
- ctx->re++;
- break;
+ node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOW, -1);
+ break;
case '>':
- result = tre_ast_new_literal(ctx->mem, ASSERTION,
- ASSERT_AT_EOW, -1);
- ctx->re++;
- break;
+ node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOW, -1);
+ break;
case 'x':
- ctx->re++;
- if (ctx->re[0] != CHAR_LBRACE)
- {
- /* 8 bit hex char. */
- char tmp[3] = {0, 0, 0};
- long val;
-
- if (tre_isxdigit(ctx->re[0]))
- {
- tmp[0] = (char)ctx->re[0];
- ctx->re++;
+ s++;
+ int i, v = 0, c;
+ len = 2;
+ if (*s == '{') {
+ len = 8;
+ s++;
}
- if (tre_isxdigit(ctx->re[0]))
- {
- tmp[1] = (char)ctx->re[0];
- ctx->re++;
+ for (i=0; i<len && v<0x110000; i++) {
+ c = hexval(s[i]);
+ if (c < 0) break;
+ v = 16*v + c;
}
- val = strtol(tmp, NULL, 16);
- result = tre_ast_new_literal(ctx->mem, (int)val,
- (int)val, ctx->position);
- ctx->position++;
- break;
- }
- else if (*ctx->re)
- {
- /* Wide char. */
- char tmp[32];
- long val;
- int i = 0;
- ctx->re++;
- while (*ctx->re && i < sizeof tmp)
- {
- if (ctx->re[0] == CHAR_RBRACE)
- break;
- if (tre_isxdigit(ctx->re[0]))
- {
- tmp[i] = (char)ctx->re[0];
- i++;
- ctx->re++;
- continue;
- }
- return REG_EBRACE;
+ s += i;
+ if (len == 8) {
+ if (*s != '}')
+ return REG_EBRACE;
+ s++;
}
- ctx->re++;
- tmp[i] = 0;
- val = strtol(tmp, NULL, 16);
- result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
- ctx->position);
- ctx->position++;
- break;
- }
- /*FALLTHROUGH*/
-
+ node = tre_ast_new_literal(ctx->mem, v, v, ctx->position);
+ ctx->position++;
+ s--;
+ break;
default:
- if (tre_isdigit(*ctx->re))
- {
- /* Back reference. */
- int val = *ctx->re - '0';
- result = tre_ast_new_literal(ctx->mem, BACKREF, val,
- ctx->position);
- if (result == NULL)
- return REG_ESPACE;
- ctx->position++;
- ctx->max_backref = MAX(val, ctx->max_backref);
- ctx->re++;
- }
- else
- {
- /* Escaped character. */
- result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
- ctx->position);
- ctx->position++;
- ctx->re++;
- }
- break;
- }
- if (result == NULL)
- return REG_ESPACE;
- break;
-
- case CHAR_PERIOD: /* the any-symbol */
- if (ctx->cflags & REG_NEWLINE)
- {
- tre_ast_node_t *tmp1;
- tre_ast_node_t *tmp2;
- tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n' - 1,
- ctx->position);
- if (!tmp1)
- return REG_ESPACE;
- tmp2 = tre_ast_new_literal(ctx->mem, '\n' + 1, TRE_CHAR_MAX,
- ctx->position + 1);
- if (!tmp2)
- return REG_ESPACE;
- result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
- if (!result)
- return REG_ESPACE;
- ctx->position += 2;
- }
- else
- {
- result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
- ctx->position);
- if (!result)
- return REG_ESPACE;
- ctx->position++;
+ if (isdigit(*s)) {
+ /* back reference */
+ int val = *s - '0';
+ node = tre_ast_new_literal(ctx->mem, BACKREF, val, ctx->position);
+ ctx->max_backref = MAX(val, ctx->max_backref);
+ } else {
+ /* extension: accept unknown escaped char
+ as a literal */
+ node = tre_ast_new_literal(ctx->mem, *s, *s, ctx->position);
+ }
+ ctx->position++;
}
- ctx->re++;
- break;
-
- case CHAR_CARET: /* beginning of line assertion */
- /* '^' has a special meaning everywhere in EREs, and at
- beginning of BRE. */
- if (ctx->cflags & REG_EXTENDED
- || ctx->re == ctx->re_start)
- {
- if (!(ctx->cflags & REG_EXTENDED))
- STACK_PUSHX(stack, int, PARSE_CATENATION);
- result = tre_ast_new_literal(ctx->mem, ASSERTION,
- ASSERT_AT_BOL, -1);
- if (result == NULL)
- return REG_ESPACE;
- ctx->re++;
+ s++;
+ break;
+ case '.':
+ if (ctx->cflags & REG_NEWLINE) {
+ tre_ast_node_t *tmp1, *tmp2;
+ tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n'-1, ctx->position++);
+ tmp2 = tre_ast_new_literal(ctx->mem, '\n'+1, TRE_CHAR_MAX, ctx->position++);
+ if (tmp1 && tmp2)
+ node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
+ else
+ node = 0;
+ } else {
+ node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position++);
}
- else
- goto parse_literal;
- break;
-
- case CHAR_DOLLAR: /* end of line assertion. */
- /* '$' is special everywhere in EREs, and in the end of the
- string in BREs. */
- if (ctx->cflags & REG_EXTENDED
- || !*(ctx->re + 1))
- {
- result = tre_ast_new_literal(ctx->mem, ASSERTION,
- ASSERT_AT_EOL, -1);
- if (result == NULL)
- return REG_ESPACE;
- ctx->re++;
+ s++;
+ break;
+ case '^':
+ /* '^' has a special meaning everywhere in EREs, and at beginning of BRE. */
+ if (!ere && s != ctx->re)
+ goto parse_literal;
+ node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1);
+ s++;
+ break;
+ case '$':
+ /* '$' is special everywhere in EREs, and in the end of the string in BREs. */
+ if (!ere && s[1])
+ goto parse_literal;
+ node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1);
+ s++;
+ break;
+ case '*':
+ case '|':
+ case '{':
+ case '+':
+ case '?':
+ if (!ere)
+ goto parse_literal;
+ case 0:
+ node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
+ break;
+ default:
+parse_literal:
+ len = mbtowc(&wc, s, -1);
+ if (len < 0)
+ return REG_BADPAT;
+ if (ctx->cflags & REG_ICASE && (tre_isupper(wc) || tre_islower(wc))) {
+ tre_ast_node_t *tmp1, *tmp2;
+ /* multiple opposite case characters are not supported */
+ tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), tre_toupper(wc), ctx->position);
+ tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), tre_tolower(wc), ctx->position);
+ if (tmp1 && tmp2)
+ node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
+ else
+ node = 0;
+ } else {
+ node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->position);
}
- else
- goto parse_literal;
- break;
-
- case CHAR_RPAREN:
- if (!depth)
- goto parse_literal;
- case CHAR_STAR:
- case CHAR_PIPE:
- case CHAR_LBRACE:
- case CHAR_PLUS:
- case CHAR_QUESTIONMARK:
- if (!(ctx->cflags & REG_EXTENDED))
- goto parse_literal;
-
- case 0:
- empty_atom:
- result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
- if (!result)
+ ctx->position++;
+ s += len;
+ break;
+ }
+ if (!node)
return REG_ESPACE;
- break;
+ ctx->n = node;
+ ctx->s = s;
+ return REG_OK;
+}
- default:
- parse_literal:
+#define PUSHPTR(err, s, v) do { \
+ if ((err = tre_stack_push_voidptr(s, v)) != REG_OK) \
+ return err; \
+} while(0)
- clen = mbtowc(&wc, ctx->re, -1);
- if (clen<0) clen=1, wc=WEOF;
+#define PUSHINT(err, s, v) do { \
+ if ((err = tre_stack_push_int(s, v)) != REG_OK) \
+ return err; \
+} while(0)
- /* Note that we can't use an tre_isalpha() test here, since there
- may be characters which are alphabetic but neither upper or
- lower case. */
- if (ctx->cflags & REG_ICASE
- && (tre_isupper(wc) || tre_islower(wc)))
- {
- tre_ast_node_t *tmp1;
- tre_ast_node_t *tmp2;
-
- /* XXX - Can there be more than one opposite-case
- counterpoints for some character in some locale? Or
- more than two characters which all should be regarded
- the same character if case is ignored? If yes, there
- does not seem to be a portable way to detect it. I guess
- that at least for multi-character collating elements there
- could be several opposite-case counterpoints, but they
- cannot be supported portably anyway. */
- tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc),
- tre_toupper(wc),
- ctx->position);
- if (!tmp1)
- return REG_ESPACE;
- tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc),
- tre_tolower(wc),
- ctx->position);
- if (!tmp2)
- return REG_ESPACE;
- result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
- if (!result)
- return REG_ESPACE;
+static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx)
+{
+ tre_ast_node_t *nbranch=0, *nunion=0;
+ int ere = ctx->cflags & REG_EXTENDED;
+ const char *s = ctx->re;
+ int subid = 0;
+ int depth = 0;
+ reg_errcode_t err;
+ tre_stack_t *stack = ctx->stack;
+
+ PUSHINT(err, stack, subid++);
+ for (;;) {
+ if ((!ere && *s == '\\' && s[1] == '(') ||
+ (ere && *s == '(')) {
+ PUSHPTR(err, stack, nunion);
+ PUSHPTR(err, stack, nbranch);
+ PUSHINT(err, stack, subid++);
+ s++;
+ if (!ere)
+ s++;
+ depth++;
+ nbranch = nunion = 0;
+ continue;
}
- else
- {
- result = tre_ast_new_literal(ctx->mem, wc, wc,
- ctx->position);
- if (!result)
- return REG_ESPACE;
+ if ((!ere && *s == '\\' && s[1] == ')') ||
+ (ere && *s == ')' && depth)) {
+ ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
+ if (!ctx->n)
+ return REG_ESPACE;
+ } else {
+ err = parse_atom(ctx, s);
+ if (err != REG_OK)
+ return err;
+ s = ctx->s;
}
- ctx->position++;
- ctx->re += clen;
- break;
- }
- break;
-
- case PARSE_MARK_FOR_SUBMATCH:
- {
- int submatch_id = tre_stack_pop_int(stack);
- if (result->submatch_id >= 0)
- {
- tre_ast_node_t *n, *tmp_node;
- n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
- if (n == NULL)
- return REG_ESPACE;
- tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
- if (tmp_node == NULL)
- return REG_ESPACE;
- tmp_node->num_submatches = result->num_submatches;
- result = tmp_node;
- }
- result->submatch_id = submatch_id;
- result->num_submatches++;
- break;
- }
-
- case PARSE_RESTORE_CFLAGS:
- ctx->cflags = tre_stack_pop_int(stack);
- break;
+ parse_iter:
+ /* extension: repetitions are accepted after an empty node
+ eg. (+), ^*, a$?, a|{2} */
+ switch (*s) {
+ case '+':
+ case '?':
+ if (!ere)
+ break;
+ /* fallthrough */
+ case '*':;
+ int min=0, max=-1;
+ if (*s == '+')
+ min = 1;
+ if (*s == '?')
+ max = 1;
+ s++;
+ ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
+ if (!ctx->n)
+ return REG_ESPACE;
+ /* extension: multiple consecutive *+?{,} is unspecified,
+ but (a+)+ has to be supported so accepting a++ makes
+ sense, note however that the RE_DUP_MAX limit can be
+ circumvented: (a{255}){255} uses a lot of memory.. */
+ goto parse_iter;
+ case '\\':
+ if (ere || s[1] != '{')
+ break;
+ s++;
+ goto parse_brace;
+ case '{':
+ if (!ere)
+ break;
+ parse_brace:
+ err = parse_dup(ctx, s+1);
+ if (err != REG_OK)
+ return err;
+ s = ctx->s;
+ goto parse_iter;
+ }
- default:
- assert(0);
- break;
+ nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n);
+ if ((ere && *s == '|') ||
+ (ere && *s == ')' && depth) ||
+ (!ere && *s == '\\' && s[1] == ')') ||
+ !*s) {
+ /* extension: empty branch is unspecified (), (|a), (a|)
+ here they are not rejected but match on empty string */
+ int c = *s;
+ nunion = tre_ast_new_union(ctx->mem, nunion, nbranch);
+ nbranch = 0;
+ if (c != '|') {
+ if (c == '\\') {
+ if (!depth) return REG_EPAREN;
+ s+=2;
+ } else if (c == ')')
+ s++;
+ depth--;
+ err = marksub(ctx, nunion, tre_stack_pop_int(stack));
+ if (err != REG_OK)
+ return err;
+ if (!c && depth<0) {
+ ctx->submatch_id = subid;
+ return REG_OK;
+ }
+ if (!c || depth<0)
+ return REG_EPAREN;
+ nbranch = tre_stack_pop_voidptr(stack);
+ nunion = tre_stack_pop_voidptr(stack);
+ goto parse_iter;
+ }
+ s++;
+ }
}
- }
-
- /* Check for missing closing parentheses. */
- if (depth > 0)
- return REG_EPAREN;
-
- if (status == REG_OK)
- ctx->result = result;
-
- return status;
}
-
/***********************************************************************
from tre-compile.c
***********************************************************************/
@@ -3122,12 +2680,7 @@ regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
if (errcode != REG_OK)
ERROR_EXIT(errcode);
preg->re_nsub = parse_ctx.submatch_id - 1;
- tree = parse_ctx.result;
-
- /* Back references and approximate matching cannot currently be used
- in the same regexp. */
- if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx)
- ERROR_EXIT(REG_BADPAT);
+ tree = parse_ctx.n;
#ifdef TRE_DEBUG
tre_ast_print(tree);
@@ -3142,7 +2695,7 @@ regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
if (tnfa == NULL)
ERROR_EXIT(REG_ESPACE);
tnfa->have_backrefs = parse_ctx.max_backref >= 0;
- tnfa->have_approx = parse_ctx.have_approx;
+ tnfa->have_approx = 0;
tnfa->num_submatches = parse_ctx.submatch_id;
/* Set up tags for submatch addressing. If REG_NOSUB is set and the