summaryrefslogtreecommitdiff
path: root/src/regex/regcomp.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/regex/regcomp.c')
-rw-r--r--src/regex/regcomp.c3362
1 files changed, 3362 insertions, 0 deletions
diff --git a/src/regex/regcomp.c b/src/regex/regcomp.c
new file mode 100644
index 00000000..3307942e
--- /dev/null
+++ b/src/regex/regcomp.c
@@ -0,0 +1,3362 @@
+/*
+ regcomp.c - TRE POSIX compatible regex compilation functions.
+
+ Copyright (c) 2001-2006 Ville Laurikari <vl@iki.fi>
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*/
+
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <regex.h>
+#include <limits.h>
+#include <stdint.h>
+
+#include "tre.h"
+
+#include <assert.h>
+
+/***********************************************************************
+ from tre-ast.c and tre-ast.h
+***********************************************************************/
+
+/* The different AST node types. */
+typedef enum {
+ LITERAL,
+ CATENATION,
+ ITERATION,
+ UNION
+} tre_ast_type_t;
+
+/* Special subtypes of TRE_LITERAL. */
+#define EMPTY -1 /* Empty leaf (denotes empty string). */
+#define ASSERTION -2 /* Assertion leaf. */
+#define TAG -3 /* Tag leaf. */
+#define BACKREF -4 /* Back reference leaf. */
+
+#define IS_SPECIAL(x) ((x)->code_min < 0)
+#define IS_EMPTY(x) ((x)->code_min == EMPTY)
+#define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
+#define IS_TAG(x) ((x)->code_min == TAG)
+#define IS_BACKREF(x) ((x)->code_min == BACKREF)
+
+/* Taken from tre-compile.h */
+typedef struct {
+ int position;
+ int code_min;
+ int code_max;
+ int *tags;
+ int assertions;
+ tre_ctype_t class;
+ tre_ctype_t *neg_classes;
+ int backref;
+} tre_pos_and_tags_t;
+
+/* A generic AST node. All AST nodes consist of this node on the top
+ level with `obj' pointing to the actual content. */
+typedef struct {
+ tre_ast_type_t type; /* Type of the node. */
+ void *obj; /* Pointer to actual node. */
+ int nullable;
+ int submatch_id;
+ int num_submatches;
+ int num_tags;
+ tre_pos_and_tags_t *firstpos;
+ tre_pos_and_tags_t *lastpos;
+} tre_ast_node_t;
+
+
+/* A "literal" node. These are created for assertions, back references,
+ tags, matching parameter settings, and all expressions that match one
+ character. */
+typedef struct {
+ long code_min;
+ long code_max;
+ int position;
+ tre_ctype_t class;
+ tre_ctype_t *neg_classes;
+} tre_literal_t;
+
+/* A "catenation" node. These are created when two regexps are concatenated.
+ If there are more than one subexpressions in sequence, the `left' part
+ holds all but the last, and `right' part holds the last subexpression
+ (catenation is left associative). */
+typedef struct {
+ tre_ast_node_t *left;
+ tre_ast_node_t *right;
+} tre_catenation_t;
+
+/* An "iteration" node. These are created for the "*", "+", "?", and "{m,n}"
+ operators. */
+typedef struct {
+ /* Subexpression to match. */
+ tre_ast_node_t *arg;
+ /* Minimum number of consecutive matches. */
+ int min;
+ /* Maximum number of consecutive matches. */
+ int max;
+} tre_iteration_t;
+
+/* An "union" node. These are created for the "|" operator. */
+typedef struct {
+ tre_ast_node_t *left;
+ tre_ast_node_t *right;
+} tre_union_t;
+
+static tre_ast_node_t *
+tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size)
+{
+ tre_ast_node_t *node;
+
+ node = tre_mem_calloc(mem, sizeof(*node));
+ if (!node)
+ return NULL;
+ node->obj = tre_mem_calloc(mem, size);
+ if (!node->obj)
+ return NULL;
+ node->type = type;
+ node->nullable = -1;
+ node->submatch_id = -1;
+
+ return node;
+}
+
+static tre_ast_node_t *
+tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
+{
+ tre_ast_node_t *node;
+ tre_literal_t *lit;
+
+ node = tre_ast_new_node(mem, LITERAL, sizeof(tre_literal_t));
+ if (!node)
+ return NULL;
+ lit = node->obj;
+ lit->code_min = code_min;
+ lit->code_max = code_max;
+ lit->position = position;
+
+ return node;
+}
+
+static tre_ast_node_t *
+tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max)
+{
+ tre_ast_node_t *node;
+ tre_iteration_t *iter;
+
+ node = tre_ast_new_node(mem, ITERATION, sizeof(tre_iteration_t));
+ if (!node)
+ return NULL;
+ iter = node->obj;
+ iter->arg = arg;
+ iter->min = min;
+ iter->max = max;
+ node->num_submatches = arg->num_submatches;
+
+ return node;
+}
+
+static tre_ast_node_t *
+tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
+{
+ tre_ast_node_t *node;
+
+ node = tre_ast_new_node(mem, UNION, sizeof(tre_union_t));
+ if (node == NULL)
+ return NULL;
+ ((tre_union_t *)node->obj)->left = left;
+ ((tre_union_t *)node->obj)->right = right;
+ node->num_submatches = left->num_submatches + right->num_submatches;
+
+ return node;
+}
+
+static tre_ast_node_t *
+tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
+ tre_ast_node_t *right)
+{
+ tre_ast_node_t *node;
+
+ node = tre_ast_new_node(mem, CATENATION, sizeof(tre_catenation_t));
+ if (node == NULL)
+ return NULL;
+ ((tre_catenation_t *)node->obj)->left = left;
+ ((tre_catenation_t *)node->obj)->right = right;
+ node->num_submatches = left->num_submatches + right->num_submatches;
+
+ return node;
+}
+
+/***********************************************************************
+ from tre-stack.c and tre-stack.h
+***********************************************************************/
+
+/* Just to save some typing. */
+#define STACK_PUSH(s, value) \
+ do \
+ { \
+ status = tre_stack_push(s, (void *)(value)); \
+ } \
+ while (0)
+
+#define STACK_PUSHX(s, value) \
+ { \
+ status = tre_stack_push(s, (void *)(value)); \
+ if (status != REG_OK) \
+ break; \
+ }
+
+#define STACK_PUSHR(s, value) \
+ { \
+ reg_errcode_t status; \
+ status = tre_stack_push(s, (void *)(value)); \
+ if (status != REG_OK) \
+ return status; \
+ }
+
+typedef struct tre_stack_rec {
+ int size;
+ int max_size;
+ int increment;
+ int ptr;
+ void **stack;
+} tre_stack_t;
+
+
+static tre_stack_t *
+tre_stack_new(int size, int max_size, int increment)
+{
+ tre_stack_t *s;
+
+ s = xmalloc(sizeof(*s));
+ if (s != NULL)
+ {
+ s->stack = xmalloc(sizeof(*s->stack) * size);
+ if (s->stack == NULL)
+ {
+ xfree(s);
+ return NULL;
+ }
+ s->size = size;
+ s->max_size = max_size;
+ s->increment = increment;
+ s->ptr = 0;
+ }
+ return s;
+}
+
+static void
+tre_stack_destroy(tre_stack_t *s)
+{
+ xfree(s->stack);
+ xfree(s);
+}
+
+static int
+tre_stack_num_objects(tre_stack_t *s)
+{
+ return s->ptr;
+}
+
+static reg_errcode_t
+tre_stack_push(tre_stack_t *s, void *value)
+{
+ if (s->ptr < s->size)
+ {
+ s->stack[s->ptr] = value;
+ s->ptr++;
+ }
+ else
+ {
+ if (s->size >= s->max_size)
+ {
+ DPRINT(("tre_stack_push: stack full\n"));
+ return REG_ESPACE;
+ }
+ else
+ {
+ void **new_buffer;
+ int new_size;
+ DPRINT(("tre_stack_push: trying to realloc more space\n"));
+ new_size = s->size + s->increment;
+ if (new_size > s->max_size)
+ new_size = s->max_size;
+ new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);
+ if (new_buffer == NULL)
+ {
+ DPRINT(("tre_stack_push: realloc failed.\n"));
+ return REG_ESPACE;
+ }
+ DPRINT(("tre_stack_push: realloc succeeded.\n"));
+ assert(new_size > s->size);
+ s->size = new_size;
+ s->stack = new_buffer;
+ tre_stack_push(s, value);
+ }
+ }
+ return REG_OK;
+}
+
+static void *
+tre_stack_pop(tre_stack_t *s)
+{
+ return s->stack[--s->ptr];
+}
+
+
+/***********************************************************************
+ from tre-parse.c and tre-parse.h
+***********************************************************************/
+
+/* Parse context. */
+typedef struct {
+ /* Memory allocator. The AST is allocated using this. */
+ tre_mem_t mem;
+ /* Stack used for keeping track of regexp syntax. */
+ tre_stack_t *stack;
+ /* The parse result. */
+ tre_ast_node_t *result;
+ /* The regexp to parse and its length. */
+ const tre_char_t *re;
+ /* The first character of the entire regexp. */
+ const tre_char_t *re_start;
+ /* The first character after the end of the regexp. */
+ const tre_char_t *re_end;
+ int len;
+ /* Current submatch ID. */
+ int submatch_id;
+ /* Current position (number of literal). */
+ int position;
+ /* The highest back reference or -1 if none seen so far. */
+ int max_backref;
+ /* Compilation flags. */
+ int cflags;
+ /* If this flag is set the top-level submatch is not captured. */
+ int nofirstsub;
+} tre_parse_ctx_t;
+
+static reg_errcode_t
+tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i,
+ tre_ast_node_t ***items)
+{
+ reg_errcode_t status;
+ tre_ast_node_t **array = *items;
+ /* Allocate more space if necessary. */
+ if (*i >= *max_i)
+ {
+ tre_ast_node_t **new_items;
+ DPRINT(("out of array space, i = %d\n", *i));
+ /* If the array is already 1024 items large, give up -- there's
+ probably an error in the regexp (e.g. not a '\0' terminated
+ string and missing ']') */
+ if (*max_i > 1024)
+ return REG_ESPACE;
+ *max_i *= 2;
+ new_items = xrealloc(array, sizeof(*items) * *max_i);
+ if (new_items == NULL)
+ return REG_ESPACE;
+ *items = array = new_items;
+ }
+ array[*i] = tre_ast_new_literal(mem, min, max, -1);
+ status = array[*i] == NULL ? REG_ESPACE : REG_OK;
+ (*i)++;
+ return status;
+}
+
+
+/* Expands a character class to character ranges. */
+static reg_errcode_t
+tre_expand_ctype(tre_mem_t mem, tre_ctype_t class, tre_ast_node_t ***items,
+ int *i, int *max_i, int cflags)
+{
+ reg_errcode_t status = REG_OK;
+ tre_cint_t c;
+ int j, min = -1, max = 0;
+ assert(TRE_MB_CUR_MAX == 1);
+
+ DPRINT((" expanding class to character ranges\n"));
+ for (j = 0; (j < 256) && (status == REG_OK); j++)
+ {
+ c = j;
+ if (tre_isctype(c, class)
+ || ((cflags & REG_ICASE)
+ && (tre_isctype(tre_tolower(c), class)
+ || tre_isctype(tre_toupper(c), class))))
+{
+ if (min < 0)
+ min = c;
+ max = c;
+ }
+ else if (min >= 0)
+ {
+ DPRINT((" range %c (%d) to %c (%d)\n", min, min, max, max));
+ status = tre_new_item(mem, min, max, i, max_i, items);
+ min = -1;
+ }
+ }
+ if (min >= 0 && status == REG_OK)
+ status = tre_new_item(mem, min, max, i, max_i, items);
+ return status;
+}
+
+
+static int
+tre_compare_items(const void *a, const void *b)
+{
+ tre_ast_node_t *node_a = *(tre_ast_node_t **)a;
+ tre_ast_node_t *node_b = *(tre_ast_node_t **)b;
+ tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj;
+ int a_min = l_a->code_min, b_min = l_b->code_min;
+
+ if (a_min < b_min)
+ return -1;
+ else if (a_min > b_min)
+ return 1;
+ else
+ return 0;
+}
+
+/* Maximum number of character classes that can occur in a negated bracket
+ expression. */
+#define MAX_NEG_CLASSES 64
+
+/* Maximum length of character class names. */
+#define MAX_CLASS_NAME
+
+static reg_errcode_t
+tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
+ tre_ctype_t neg_classes[], int *num_neg_classes,
+ tre_ast_node_t ***items, int *num_items,
+ int *items_size)
+{
+ const tre_char_t *re = ctx->re;
+ reg_errcode_t status = REG_OK;
+ tre_ctype_t class = (tre_ctype_t)0;
+ int i = *num_items;
+ int max_i = *items_size;
+ int skip;
+
+ /* Build an array of the items in the bracket expression. */
+ while (status == REG_OK)
+ {
+ skip = 0;
+ if (re == ctx->re_end)
+ {
+ status = REG_EBRACK;
+ }
+ else if (*re == ']' && re > ctx->re)
+ {
+ DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n",
+ ctx->re_end - re, re));
+ re++;
+ break;
+ }
+ else
+ {
+ tre_cint_t min = 0, max = 0;
+
+ class = (tre_ctype_t)0;
+ if (re + 2 < ctx->re_end
+ && *(re + 1) == '-' && *(re + 2) != ']')
+ {
+ DPRINT(("tre_parse_bracket: range: '%.*" STRF "'\n",
+ ctx->re_end - re, re));
+ min = *re;
+ max = *(re + 2);
+ re += 3;
+ /* XXX - Should use collation order instead of encoding values
+ in character ranges. */
+ if (min > max)
+ status = REG_ERANGE;
+ }
+ else if (re + 1 < ctx->re_end
+ && *re == '[' && *(re + 1) == '.')
+ status = REG_ECOLLATE;
+ else if (re + 1 < ctx->re_end
+ && *re == '[' && *(re + 1) == '=')
+ status = REG_ECOLLATE;
+ else if (re + 1 < ctx->re_end
+ && *re == '[' && *(re + 1) == ':')
+ {
+ char tmp_str[64];
+ const tre_char_t *endptr = re + 2;
+ int len;
+ DPRINT(("tre_parse_bracket: class: '%.*" STRF "'\n",
+ ctx->re_end - re, re));
+ while (endptr < ctx->re_end && *endptr != ':')
+ endptr++;
+ if (endptr != ctx->re_end)
+ {
+ len = MIN(endptr - re - 2, 63);
+#ifdef TRE_WCHAR
+ {
+ tre_char_t tmp_wcs[64];
+ wcsncpy(tmp_wcs, re + 2, len);
+ tmp_wcs[len] = '\0';
+#if defined HAVE_WCSRTOMBS
+ {
+ mbstate_t state;
+ const tre_char_t *src = tmp_wcs;
+ memset(&state, '\0', sizeof(state));
+ len = wcsrtombs(tmp_str, &src, sizeof(tmp_str), &state);
+ }
+#elif defined HAVE_WCSTOMBS
+ len = wcstombs(tmp_str, tmp_wcs, 63);
+#endif /* defined HAVE_WCSTOMBS */
+ }
+#else /* !TRE_WCHAR */
+ strncpy(tmp_str, re + 2, len);
+#endif /* !TRE_WCHAR */
+ tmp_str[len] = '\0';
+ DPRINT((" class name: %s\n", tmp_str));
+ class = tre_ctype(tmp_str);
+ if (!class)
+ status = REG_ECTYPE;
+ /* Optimize character classes for 8 bit character sets. */
+ if (status == REG_OK && TRE_MB_CUR_MAX == 1)
+ {
+ status = tre_expand_ctype(ctx->mem, class, items,
+ &i, &max_i, ctx->cflags);
+ class = (tre_ctype_t)0;
+ skip = 1;
+ }
+ re = endptr + 2;
+ }
+ else
+ status = REG_ECTYPE;
+ min = 0;
+ max = TRE_CHAR_MAX;
+ }
+ else
+ {
+ DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n",
+ ctx->re_end - re, re));
+ if (*re == '-' && *(re + 1) != ']'
+ && ctx->re != re)
+ /* Two ranges are not allowed to share and endpoint. */
+ status = REG_ERANGE;
+ min = max = *re++;
+ }
+
+ if (status != REG_OK)
+ break;
+
+ if (class && negate)
+ if (*num_neg_classes >= MAX_NEG_CLASSES)
+ status = REG_ESPACE;
+ else
+ neg_classes[(*num_neg_classes)++] = class;
+ else if (!skip)
+ {
+ status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
+ if (status != REG_OK)
+ break;
+ ((tre_literal_t*)((*items)[i-1])->obj)->class = class;
+ }
+
+ /* Add opposite-case counterpoints if REG_ICASE is present.
+ This is broken if there are more than two "same" characters. */
+ if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip)
+ {
+ int cmin, ccurr;
+
+ DPRINT(("adding opposite-case counterpoints\n"));
+ while (min <= max)
+ {
+ if (tre_islower(min))
+ {
+ cmin = ccurr = tre_toupper(min++);
+ while (tre_islower(min) && tre_toupper(min) == ccurr + 1
+ && min <= max)
+ ccurr = tre_toupper(min++);
+ status = tre_new_item(ctx->mem, cmin, ccurr,
+ &i, &max_i, items);
+ }
+ else if (tre_isupper(min))
+ {
+ cmin = ccurr = tre_tolower(min++);
+ while (tre_isupper(min) && tre_tolower(min) == ccurr + 1
+ && min <= max)
+ ccurr = tre_tolower(min++);
+ status = tre_new_item(ctx->mem, cmin, ccurr,
+ &i, &max_i, items);
+ }
+ else min++;
+ if (status != REG_OK)
+ break;
+ }
+ if (status != REG_OK)
+ break;
+ }
+ }
+ }
+ *num_items = i;
+ *items_size = max_i;
+ ctx->re = re;
+ return status;
+}
+
+static reg_errcode_t
+tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
+{
+ tre_ast_node_t *node = NULL;
+ int negate = 0;
+ reg_errcode_t status = REG_OK;
+ tre_ast_node_t **items, *u, *n;
+ int i = 0, j, max_i = 32, curr_max, curr_min;
+ tre_ctype_t neg_classes[MAX_NEG_CLASSES];
+ int num_neg_classes = 0;
+
+ /* Start off with an array of `max_i' elements. */
+ items = xmalloc(sizeof(*items) * max_i);
+ if (items == NULL)
+ return REG_ESPACE;
+
+ if (*ctx->re == '^')
+ {
+ DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n",
+ ctx->re_end - ctx->re, ctx->re));
+ negate = 1;
+ ctx->re++;
+ }
+
+ status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes,
+ &items, &i, &max_i);
+
+ if (status != REG_OK)
+ goto parse_bracket_done;
+
+ /* Sort the array if we need to negate it. */
+ if (negate)
+ qsort(items, i, sizeof(*items), tre_compare_items);
+
+ curr_max = curr_min = 0;
+ /* Build a union of the items in the array, negated if necessary. */
+ for (j = 0; j < i && status == REG_OK; j++)
+ {
+ int min, max;
+ tre_literal_t *l = items[j]->obj;
+ min = l->code_min;
+ max = l->code_max;
+
+ DPRINT(("item: %d - %d, class %ld, curr_max = %d\n",
+ (int)l->code_min, (int)l->code_max, (long)l->class, curr_max));
+
+ if (negate)
+ {
+ if (min < curr_max)
+ {
+ /* Overlap. */
+ curr_max = MAX(max + 1, curr_max);
+ DPRINT(("overlap, curr_max = %d\n", curr_max));
+ l = NULL;
+ }
+ else
+ {
+ /* No overlap. */
+ curr_max = min - 1;
+ if (curr_max >= curr_min)
+ {
+ DPRINT(("no overlap\n"));
+ l->code_min = curr_min;
+ l->code_max = curr_max;
+ }
+ else
+ {
+ DPRINT(("no overlap, zero room\n"));
+ l = NULL;
+ }
+ curr_min = curr_max = max + 1;
+ }
+ }
+
+ if (l != NULL)
+ {
+ int k;
+ DPRINT(("creating %d - %d\n", (int)l->code_min, (int)l->code_max));
+ l->position = ctx->position;
+ if (num_neg_classes > 0)
+ {
+ l->neg_classes = tre_mem_alloc(ctx->mem,
+ (sizeof(l->neg_classes)
+ * (num_neg_classes + 1)));
+ if (l->neg_classes == NULL)
+ {
+ status = REG_ESPACE;
+ break;
+ }
+ for (k = 0; k < num_neg_classes; k++)
+ l->neg_classes[k] = neg_classes[k];
+ l->neg_classes[k] = (tre_ctype_t)0;
+ }
+ else
+ l->neg_classes = NULL;
+ if (node == NULL)
+ node = items[j];
+ else
+ {
+ u = tre_ast_new_union(ctx->mem, node, items[j]);
+ if (u == NULL)
+ status = REG_ESPACE;
+ node = u;
+ }
+ }
+ }
+
+ if (status != REG_OK)
+ goto parse_bracket_done;
+
+ if (negate)
+ {
+ int k;
+ DPRINT(("final: creating %d - %d\n", curr_min, (int)TRE_CHAR_MAX));
+ n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position);
+ if (n == NULL)
+ status = REG_ESPACE;
+ else
+ {
+ tre_literal_t *l = n->obj;
+ if (num_neg_classes > 0)
+ {
+ l->neg_classes = tre_mem_alloc(ctx->mem,
+ (sizeof(l->neg_classes)
+ * (num_neg_classes + 1)));
+ if (l->neg_classes == NULL)
+ {
+ status = REG_ESPACE;
+ goto parse_bracket_done;
+ }
+ for (k = 0; k < num_neg_classes; k++)
+ l->neg_classes[k] = neg_classes[k];
+ l->neg_classes[k] = (tre_ctype_t)0;
+ }
+ else
+ l->neg_classes = NULL;
+ if (node == NULL)
+ node = n;
+ else
+ {
+ u = tre_ast_new_union(ctx->mem, node, n);
+ if (u == NULL)
+ status = REG_ESPACE;
+ node = u;
+ }
+ }
+ }
+
+ if (status != REG_OK)
+ goto parse_bracket_done;
+
+#ifdef TRE_DEBUG
+ tre_ast_print(node);
+#endif /* TRE_DEBUG */
+
+ parse_bracket_done:
+ xfree(items);
+ ctx->position++;
+ *result = node;
+ return status;
+}
+
+
+/* Parses a positive decimal integer. Returns -1 if the string does not
+ contain a valid number. */
+static int
+tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end)
+{
+ int num = -1;
+ const tre_char_t *r = *regex;
+ while (r < regex_end && *r >= '0' && *r <= '9')
+ {
+ if (num < 0)
+ num = 0;
+ num = num * 10 + *r - '0';
+ r++;
+ }
+ *regex = r;
+ return num;
+}
+
+
+static reg_errcode_t
+tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
+{
+ int min, max;
+ const tre_char_t *r = ctx->re;
+ const tre_char_t *start;
+ int counts_set = 0;
+
+ /* Parse number (minimum repetition count). */
+ min = -1;
+ if (r < ctx->re_end && *r >= '0' && *r <= '9') {
+ DPRINT(("tre_parse: min count: '%.*" STRF "'\n", ctx->re_end - r, r));
+ min = tre_parse_int(&r, ctx->re_end);
+ }
+
+ /* Parse comma and second number (maximum repetition count). */
+ max = min;
+ if (r < ctx->re_end && *r == ',')
+ {
+ r++;
+ DPRINT(("tre_parse: max count: '%.*" STRF "'\n", ctx->re_end - r, r));
+ max = tre_parse_int(&r, ctx->re_end);
+ }
+
+ /* Check that the repeat counts are sane. */
+ if ((max >= 0 && min > max) || max > RE_DUP_MAX)
+ return REG_BADBR;
+
+
+ /*
+ '{'
+ optionally followed immediately by a number == minimum repcount
+ optionally followed by , then a number == maximum repcount
+ */
+
+
+ do {
+ int done;
+ start = r;
+
+ /* Parse count limit settings */
+ done = 0;
+ if (!counts_set)
+ while (r + 1 < ctx->re_end && !done)
+ {
+ switch (*r)
+ {
+ case ',':
+ r++;
+ break;
+ case ' ':
+ r++;
+ break;
+ case '}':
+ done = 1;
+ break;
+ default:
+ done = 1;
+ break;
+ }
+ }
+ } while (start != r);
+
+ /* Missing }. */
+ if (r >= ctx->re_end)
+ return REG_EBRACE;
+
+ /* Empty contents of {}. */
+ if (r == ctx->re)
+ return REG_BADBR;
+
+ /* Parse the ending '}' or '\}'.*/
+ if (ctx->cflags & REG_EXTENDED)
+ {
+ if (r >= ctx->re_end || *r != '}')
+ return REG_BADBR;
+ r++;
+ }
+ else
+ {
+ if (r + 1 >= ctx->re_end
+ || *r != '\\'
+ || *(r + 1) != '}')
+ return REG_BADBR;
+ r += 2;
+ }
+
+
+ /* Create the AST node(s). */
+ if (min == 0 && max == 0)
+ {
+ *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
+ if (*result == NULL)
+ return REG_ESPACE;
+ }
+ else
+ {
+ if (min < 0 && max < 0)
+ /* Only approximate parameters set, no repetitions. */
+ min = max = 1;
+
+ *result = tre_ast_new_iter(ctx->mem, *result, min, max);
+ if (!*result)
+ return REG_ESPACE;
+ }
+
+ ctx->re = r;
+ return REG_OK;
+}
+
+typedef enum {
+ PARSE_RE = 0,
+ PARSE_ATOM,
+ PARSE_MARK_FOR_SUBMATCH,
+ PARSE_BRANCH,
+ PARSE_PIECE,
+ PARSE_CATENATION,
+ PARSE_POST_CATENATION,
+ PARSE_UNION,
+ PARSE_POST_UNION,
+ PARSE_POSTFIX,
+ PARSE_RESTORE_CFLAGS
+} tre_parse_re_stack_symbol_t;
+
+
+static reg_errcode_t
+tre_parse(tre_parse_ctx_t *ctx)
+{
+ tre_ast_node_t *result = NULL;
+ tre_parse_re_stack_symbol_t symbol;
+ reg_errcode_t status = REG_OK;
+ tre_stack_t *stack = ctx->stack;
+ int bottom = tre_stack_num_objects(stack);
+ int depth = 0;
+
+ DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d\n",
+ ctx->len, ctx->re, ctx->len));
+
+ if (!ctx->nofirstsub)
+ {
+ STACK_PUSH(stack, ctx->re);
+ STACK_PUSH(stack, ctx->submatch_id);
+ STACK_PUSH(stack, PARSE_MARK_FOR_SUBMATCH);
+ ctx->submatch_id++;
+ }
+ STACK_PUSH(stack, PARSE_RE);
+ ctx->re_start = ctx->re;
+ ctx->re_end = ctx->re + ctx->len;
+
+
+ /* The following is basically just a recursive descent parser. I use
+ an explicit stack instead of recursive functions mostly because of
+ two reasons: compatibility with systems which have an overflowable
+ call stack, and efficiency (both in lines of code and speed). */
+ while (tre_stack_num_objects(stack) > bottom && status == REG_OK)
+ {
+ if (status != REG_OK)
+ break;
+ symbol = (tre_parse_re_stack_symbol_t)tre_stack_pop(stack);
+ switch (symbol)
+ {
+ case PARSE_RE:
+ /* Parse a full regexp. A regexp is one or more branches,
+ separated by the union operator `|'. */
+ if (ctx->cflags & REG_EXTENDED)
+ STACK_PUSHX(stack, PARSE_UNION);
+ STACK_PUSHX(stack, PARSE_BRANCH);
+ break;
+
+ case PARSE_BRANCH:
+ /* Parse a branch. A branch is one or more pieces, concatenated.
+ A piece is an atom possibly followed by a postfix operator. */
+ STACK_PUSHX(stack, PARSE_CATENATION);
+ STACK_PUSHX(stack, PARSE_PIECE);
+ break;
+
+ case PARSE_PIECE:
+ /* Parse a piece. A piece is an atom possibly followed by one
+ or more postfix operators. */
+ STACK_PUSHX(stack, PARSE_POSTFIX);
+ STACK_PUSHX(stack, PARSE_ATOM);
+ break;
+
+ case PARSE_CATENATION:
+ /* If the expression has not ended, parse another piece. */
+ {
+ tre_char_t c;
+ if (ctx->re >= ctx->re_end)
+ break;
+ c = *ctx->re;
+ if (ctx->cflags & REG_EXTENDED && c == '|')
+ break;
+ if ((ctx->cflags & REG_EXTENDED
+ && c == ')' && depth > 0)
+ || (!(ctx->cflags & REG_EXTENDED)
+ && (c == '\\'
+ && *(ctx->re + 1) == ')')))
+ {
+ if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
+ status = REG_EPAREN;
+ DPRINT(("tre_parse: group end: '%.*" STRF "'\n",
+ ctx->re_end - ctx->re, ctx->re));
+ depth--;
+ if (!(ctx->cflags & REG_EXTENDED))
+ ctx->re += 2;
+ break;
+ }
+
+ /* Left associative concatenation. */
+ STACK_PUSHX(stack, PARSE_CATENATION);
+ STACK_PUSHX(stack, result);
+ STACK_PUSHX(stack, PARSE_POST_CATENATION);
+ STACK_PUSHX(stack, PARSE_PIECE);
+ break;
+ }
+
+ case PARSE_POST_CATENATION:
+ {
+ tre_ast_node_t *tree = tre_stack_pop(stack);
+ tre_ast_node_t *tmp_node;
+ tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
+ if (!tmp_node)
+ return REG_ESPACE;
+ result = tmp_node;
+ break;
+ }
+
+ case PARSE_UNION:
+ if (ctx->re >= ctx->re_end)
+ break;
+ switch (*ctx->re)
+ {
+ case '|':
+ DPRINT(("tre_parse: union: '%.*" STRF "'\n",
+ ctx->re_end - ctx->re, ctx->re));
+ STACK_PUSHX(stack, PARSE_UNION);
+ STACK_PUSHX(stack, result);
+ STACK_PUSHX(stack, PARSE_POST_UNION);
+ STACK_PUSHX(stack, PARSE_BRANCH);
+ ctx->re++;
+ break;
+
+ case ')':
+ ctx->re++;
+ break;
+
+ default:
+ break;
+ }
+ break;
+
+ case PARSE_POST_UNION:
+ {
+ tre_ast_node_t *tmp_node;
+ tre_ast_node_t *tree = tre_stack_pop(stack);
+ tmp_node = tre_ast_new_union(ctx->mem, tree, result);
+ if (!tmp_node)
+ return REG_ESPACE;
+ result = tmp_node;
+ break;
+ }
+
+ case PARSE_POSTFIX:
+ /* Parse postfix operators. */
+ if (ctx->re >= ctx->re_end)
+ break;
+ switch (*ctx->re)
+ {
+ case '+':
+ case '?':
+ if (!(ctx->cflags & REG_EXTENDED))
+ break;
+ case '*':
+ {
+ tre_ast_node_t *tmp_node;
+ int rep_min = 0;
+ int rep_max = -1;
+ if (*ctx->re == '+')
+ rep_min = 1;
+ if (*ctx->re == '?')
+ rep_max = 1;
+
+ ctx->re++;
+ tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max);
+ if (tmp_node == NULL)
+ return REG_ESPACE;
+ result = tmp_node;
+ STACK_PUSHX(stack, PARSE_POSTFIX);
+ break;
+ }
+
+ case '\\':
+ /* "\{" is special without REG_EXTENDED */
+ if (!(ctx->cflags & REG_EXTENDED)
+ && ctx->re + 1 < ctx->re_end
+ && *(ctx->re + 1) == '{')
+ {
+ ctx->re++;
+ goto parse_brace;
+ }
+ else
+ break;
+
+ case '{':
+ /* "{" is literal without REG_EXTENDED */
+ if (!(ctx->cflags & REG_EXTENDED))
+ break;
+
+ parse_brace:
+ DPRINT(("tre_parse: bound: '%.*" STRF "'\n",
+ ctx->re_end - ctx->re, ctx->re));
+ ctx->re++;
+
+ status = tre_parse_bound(ctx, &result);
+ if (status != REG_OK)
+ return status;
+ STACK_PUSHX(stack, PARSE_POSTFIX);
+ break;
+ }
+ break;
+
+ case PARSE_ATOM:
+ /* Parse an atom. An atom is a regular expression enclosed in `()',
+ an empty set of `()', a bracket expression, `.', `^', `$',
+ a `\' followed by a character, or a single character. */
+
+ /* End of regexp? (empty string). */
+ if (ctx->re >= ctx->re_end)
+ goto parse_literal;
+
+ switch (*ctx->re)
+ {
+ case '(': /* parenthesized subexpression */
+
+ if (ctx->cflags & REG_EXTENDED
+ || (ctx->re > ctx->re_start
+ && *(ctx->re - 1) == '\\'))
+ {
+ depth++;
+ {
+ DPRINT(("tre_parse: group begin: '%.*" STRF
+ "', submatch %d\n",
+ ctx->re_end - ctx->re, ctx->re,
+ ctx->submatch_id));
+ ctx->re++;
+ /* First parse a whole RE, then mark the resulting tree
+ for submatching. */
+ STACK_PUSHX(stack, ctx->submatch_id);
+ STACK_PUSHX(stack, PARSE_MARK_FOR_SUBMATCH);
+ STACK_PUSHX(stack, PARSE_RE);
+ ctx->submatch_id++;
+ }
+ }
+ else
+ goto parse_literal;
+ break;
+
+ case ')': /* end of current subexpression */
+ if ((ctx->cflags & REG_EXTENDED && depth > 0)
+ || (ctx->re > ctx->re_start
+ && *(ctx->re - 1) == '\\'))
+ {
+ DPRINT(("tre_parse: empty: '%.*" STRF "'\n",
+ ctx->re_end - ctx->re, ctx->re));
+ /* We were expecting an atom, but instead the current
+ subexpression was closed. POSIX leaves the meaning of
+ this to be implementation-defined. We interpret this as
+ an empty expression (which matches an empty string). */
+ result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
+ if (result == NULL)
+ return REG_ESPACE;
+ if (!(ctx->cflags & REG_EXTENDED))
+ ctx->re--;
+ }
+ else
+ goto parse_literal;
+ break;
+
+ case '[': /* bracket expression */
+ DPRINT(("tre_parse: bracket: '%.*" STRF "'\n",
+ ctx->re_end - ctx->re, ctx->re));
+ ctx->re++;
+ status = tre_parse_bracket(ctx, &result);
+ if (status != REG_OK)
+ return status;
+ break;
+
+ case '\\':
+ /* If this is "\(" or "\)" chew off the backslash and
+ try again. */
+ if (!(ctx->cflags & REG_EXTENDED)
+ && ctx->re + 1 < ctx->re_end
+ && (*(ctx->re + 1) == '('
+ || *(ctx->re + 1) == ')'))
+ {
+ ctx->re++;
+ STACK_PUSHX(stack, PARSE_ATOM);
+ break;
+ }
+
+ if (ctx->re + 1 >= ctx->re_end)
+ /* Trailing backslash. */
+ return REG_EESCAPE;
+
+ DPRINT(("tre_parse: bleep: '%.*" STRF "'\n",
+ ctx->re_end - ctx->re, ctx->re));
+ ctx->re++;
+ switch (*ctx->re)
+ {
+ default:
+ if (!(ctx->cflags & REG_EXTENDED) && tre_isdigit(*ctx->re))
+ {
+ /* Back reference. */
+ int val = *ctx->re - '0';
+ DPRINT(("tre_parse: backref: '%.*" STRF "'\n",
+ ctx->re_end - ctx->re + 1, ctx->re - 1));
+ result = tre_ast_new_literal(ctx->mem, BACKREF, val,
+ ctx->position);
+ if (result == NULL)
+ return REG_ESPACE;
+ ctx->position++;
+ ctx->max_backref = MAX(val, ctx->max_backref);
+ ctx->re++;
+ }
+ else
+ {
+ /* Escaped character. */
+ DPRINT(("tre_parse: escaped: '%.*" STRF "'\n",
+ ctx->re_end - ctx->re + 1, ctx->re - 1));
+ result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
+ ctx->position);
+ ctx->position++;
+ ctx->re++;
+ }
+ break;
+ }
+ if (result == NULL)
+ return REG_ESPACE;
+ break;
+
+ case '.': /* the any-symbol */
+ DPRINT(("tre_parse: any: '%.*" STRF "'\n",
+ ctx->re_end - ctx->re, ctx->re));
+ if (ctx->cflags & REG_NEWLINE)
+ {
+ tre_ast_node_t *tmp1;
+ tre_ast_node_t *tmp2;
+ tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n' - 1,
+ ctx->position);
+ if (!tmp1)
+ return REG_ESPACE;
+ tmp2 = tre_ast_new_literal(ctx->mem, '\n' + 1, TRE_CHAR_MAX,
+ ctx->position + 1);
+ if (!tmp2)
+ return REG_ESPACE;
+ result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
+ if (!result)
+ return REG_ESPACE;
+ ctx->position += 2;
+ }
+ else
+ {
+ result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
+ ctx->position);
+ if (!result)
+ return REG_ESPACE;
+ ctx->position++;
+ }
+ ctx->re++;
+ break;
+
+ case '^': /* beginning of line assertion */
+ /* '^' has a special meaning everywhere in EREs, and in the
+ beginning of the RE and after \( is BREs. */
+ if (ctx->cflags & REG_EXTENDED
+ || (ctx->re - 2 >= ctx->re_start
+ && *(ctx->re - 2) == '\\'
+ && *(ctx->re - 1) == '(')
+ || ctx->re == ctx->re_start)
+ {
+ DPRINT(("tre_parse: BOL: '%.*" STRF "'\n",
+ ctx->re_end - ctx->re, ctx->re));
+ result = tre_ast_new_literal(ctx->mem, ASSERTION,
+ ASSERT_AT_BOL, -1);
+ if (result == NULL)
+ return REG_ESPACE;
+ ctx->re++;
+ }
+ else
+ goto parse_literal;
+ break;
+
+ case '$': /* end of line assertion. */
+ /* '$' is special everywhere in EREs, and in the end of the
+ string and before \) is BREs. */
+ if (ctx->cflags & REG_EXTENDED
+ || (ctx->re + 2 < ctx->re_end
+ && *(ctx->re + 1) == '\\'
+ && *(ctx->re + 2) == ')')
+ || ctx->re + 1 == ctx->re_end)
+ {
+ DPRINT(("tre_parse: EOL: '%.*" STRF "'\n",
+ ctx->re_end - ctx->re, ctx->re));
+ result = tre_ast_new_literal(ctx->mem, ASSERTION,
+ ASSERT_AT_EOL, -1);
+ if (result == NULL)
+ return REG_ESPACE;
+ ctx->re++;
+ }
+ else
+ goto parse_literal;
+ break;
+
+ default:
+ parse_literal:
+
+ /* We are expecting an atom. If the subexpression (or the whole
+ regexp ends here, we interpret it as an empty expression
+ (which matches an empty string). */
+ if (
+ (ctx->re >= ctx->re_end
+ || *ctx->re == '*'
+ || (ctx->cflags & REG_EXTENDED
+ && (*ctx->re == '|'
+ || *ctx->re == '{'
+ || *ctx->re == '+'
+ || *ctx->re == '?'))
+ /* Test for "\)" in BRE mode. */
+ || (!(ctx->cflags & REG_EXTENDED)
+ && ctx->re + 1 < ctx->re_end
+ && *ctx->re == '\\'
+ && *(ctx->re + 1) == '{')))
+ {
+ DPRINT(("tre_parse: empty: '%.*" STRF "'\n",
+ ctx->re_end - ctx->re, ctx->re));
+ result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
+ if (!result)
+ return REG_ESPACE;
+ break;
+ }
+
+ DPRINT(("tre_parse: literal: '%.*" STRF "'\n",
+ ctx->re_end - ctx->re, ctx->re));
+ /* Note that we can't use an tre_isalpha() test here, since there
+ may be characters which are alphabetic but neither upper or
+ lower case. */
+ if (ctx->cflags & REG_ICASE
+ && (tre_isupper(*ctx->re) || tre_islower(*ctx->re)))
+ {
+ tre_ast_node_t *tmp1;
+ tre_ast_node_t *tmp2;
+
+ /* XXX - Can there be more than one opposite-case
+ counterpoints for some character in some locale? Or
+ more than two characters which all should be regarded
+ the same character if case is ignored? If yes, there
+ does not seem to be a portable way to detect it. I guess
+ that at least for multi-character collating elements there
+ could be several opposite-case counterpoints, but they
+ cannot be supported portably anyway. */
+ tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(*ctx->re),
+ tre_toupper(*ctx->re),
+ ctx->position);
+ if (!tmp1)
+ return REG_ESPACE;
+ tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(*ctx->re),
+ tre_tolower(*ctx->re),
+ ctx->position);
+ if (!tmp2)
+ return REG_ESPACE;
+ result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
+ if (!result)
+ return REG_ESPACE;
+ }
+ else
+ {
+ result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
+ ctx->position);
+ if (!result)
+ return REG_ESPACE;
+ }
+ ctx->position++;
+ ctx->re++;
+ break;
+ }
+ break;
+
+ case PARSE_MARK_FOR_SUBMATCH:
+ {
+ int submatch_id = (int)tre_stack_pop(stack);
+
+ if (result->submatch_id >= 0)
+ {
+ tre_ast_node_t *n, *tmp_node;
+ n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
+ if (n == NULL)
+ return REG_ESPACE;
+ tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
+ if (tmp_node == NULL)
+ return REG_ESPACE;
+ tmp_node->num_submatches = result->num_submatches;
+ result = tmp_node;
+ }
+ result->submatch_id = submatch_id;
+ result->num_submatches++;
+ break;
+ }
+
+ case PARSE_RESTORE_CFLAGS:
+ ctx->cflags = (int)tre_stack_pop(stack);
+ break;
+ }
+ }
+
+ /* Check for missing closing parentheses. */
+ if (depth > 0)
+ return REG_EPAREN;
+
+ if (status == REG_OK)
+ ctx->result = result;
+
+ return status;
+}
+
+
+/***********************************************************************
+ from tre-compile.c
+***********************************************************************/
+
+/*
+ Algorithms to setup tags so that submatch addressing can be done.
+*/
+
+
+/* Inserts a catenation node to the root of the tree given in `node'.
+ As the left child a new tag with number `tag_id' to `node' is added,
+ and the right child is the old root. */
+/* OR */
+/* Inserts a catenation node to the root of the tree given in `node'.
+ As the right child a new tag with number `tag_id' to `node' is added,
+ and the left child is the old root. */
+static reg_errcode_t
+tre_add_tag(tre_mem_t mem, tre_ast_node_t *node, int tag_id, int right)
+{
+ tre_catenation_t *c;
+ tre_ast_node_t *child_tag, *child_old;
+
+ DPRINT(("add_tag_%s: tag %d\n", right ? "right" : "left", tag_id));
+
+ c = tre_mem_alloc(mem, sizeof(*c));
+ if (c == NULL)
+ return REG_ESPACE;
+ child_tag = tre_ast_new_literal(mem, TAG, tag_id, -1);
+ if (child_tag == NULL)
+ return REG_ESPACE;
+ child_old = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
+ if (child_old == NULL)
+ return REG_ESPACE;
+
+ child_old->obj = node->obj;
+ child_old->type = node->type;
+ child_old->nullable = -1;
+ child_old->submatch_id = -1;
+ child_old->firstpos = NULL;
+ child_old->lastpos = NULL;
+ child_old->num_tags = 0;
+ node->obj = c;
+ node->type = CATENATION;
+
+ c->right = c->left = child_old;
+ if (right) c->right = child_tag;
+ else c->left = child_tag;
+
+ return REG_OK;
+}
+
+typedef enum {
+ ADDTAGS_RECURSE,
+ ADDTAGS_AFTER_ITERATION,
+ ADDTAGS_AFTER_UNION_LEFT,
+ ADDTAGS_AFTER_UNION_RIGHT,
+ ADDTAGS_AFTER_CAT_LEFT,
+ ADDTAGS_AFTER_CAT_RIGHT,
+ ADDTAGS_SET_SUBMATCH_END
+} tre_addtags_symbol_t;
+
+
+typedef struct {
+ int tag;
+ int next_tag;
+} tre_tag_states_t;
+
+/* Adds tags to appropriate locations in the parse tree in `tree', so that
+ subexpressions marked for submatch addressing can be traced. */
+static reg_errcode_t
+tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
+ tre_tnfa_t *tnfa)
+{
+ reg_errcode_t status = REG_OK;
+ tre_addtags_symbol_t symbol;
+ tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
+ int bottom = tre_stack_num_objects(stack);
+ /* True for first pass (counting number of needed tags) */
+ int first_pass = (mem == NULL || tnfa == NULL);
+ int *regset, *orig_regset;
+ int num_tags = 0; /* Total number of tags. */
+ int tag = 0; /* The tag that is to be added next. */
+ int next_tag = 1; /* Next tag to use after this one. */
+ int *parents; /* Stack of submatches the current submatch is
+ contained in. */
+ tre_tag_states_t *saved_states;
+
+ tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
+ if (!first_pass)
+ tnfa->end_tag = 0;
+
+ regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
+ if (regset == NULL)
+ return REG_ESPACE;
+ regset[0] = -1;
+ orig_regset = regset;
+
+ parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
+ if (parents == NULL)
+ {
+ xfree(regset);
+ return REG_ESPACE;
+ }
+ parents[0] = -1;
+
+ saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
+ if (saved_states == NULL)
+ {
+ xfree(regset);
+ xfree(parents);
+ return REG_ESPACE;
+ }
+ else
+ {
+ unsigned int i;
+ for (i = 0; i <= tnfa->num_submatches; i++)
+ saved_states[i].tag = -1;
+ }
+
+ STACK_PUSH(stack, node);
+ STACK_PUSH(stack, ADDTAGS_RECURSE);
+
+ while (tre_stack_num_objects(stack) > bottom)
+ {
+ if (status != REG_OK)
+ break;
+
+ symbol = (tre_addtags_symbol_t)tre_stack_pop(stack);
+ switch (symbol)
+ {
+
+ case ADDTAGS_SET_SUBMATCH_END:
+ {
+ int id = (int)tre_stack_pop(stack);
+ int i;
+
+ /* Add end of this submatch to regset. */
+ for (i = 0; regset[i] >= 0; i++);
+ regset[i] = id * 2 + 1;
+ regset[i + 1] = -1;
+
+ /* Pop this submatch from the parents stack. */
+ for (i = 0; parents[i] >= 0; i++);
+ parents[i - 1] = -1;
+ break;
+ }
+
+ case ADDTAGS_RECURSE:
+ node = tre_stack_pop(stack);
+
+ if (node->submatch_id >= 0)
+ {
+ int id = node->submatch_id;
+ int i;
+
+
+ /* Add start of this submatch to regset. */
+ for (i = 0; regset[i] >= 0; i++);
+ regset[i] = id * 2;
+ regset[i + 1] = -1;
+
+ if (!first_pass)
+ {
+ for (i = 0; parents[i] >= 0; i++);
+ tnfa->submatch_data[id].parents = NULL;
+ if (i > 0)
+ {
+ int *p = xmalloc(sizeof(*p) * (i + 1));
+ if (p == NULL)
+ {
+ status = REG_ESPACE;
+ break;
+ }
+ assert(tnfa->submatch_data[id].parents == NULL);
+ tnfa->submatch_data[id].parents = p;
+ for (i = 0; parents[i] >= 0; i++)
+ p[i] = parents[i];
+ p[i] = -1;
+ }
+ }
+
+ /* Add end of this submatch to regset after processing this
+ node. */
+ STACK_PUSHX(stack, node->submatch_id);
+ STACK_PUSHX(stack, ADDTAGS_SET_SUBMATCH_END);
+ }
+
+ switch (node->type)
+ {
+ case LITERAL:
+ {
+ tre_literal_t *lit = node->obj;
+
+ if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
+ {
+ int i;
+ DPRINT(("Literal %d-%d\n",
+ (int)lit->code_min, (int)lit->code_max));
+ if (regset[0] >= 0)
+ {
+ /* Regset is not empty, so add a tag before the
+ literal or backref. */
+ if (!first_pass)
+ {
+ status = tre_add_tag(mem, node, tag, 0 /*left*/);
+ tnfa->tag_directions[tag] = direction;
+ /* Go through the regset and set submatch data for
+ submatches that are using this tag. */
+ for (i = 0; regset[i] >= 0; i++)
+ {
+ int id = regset[i] >> 1;
+ int start = !(regset[i] & 1);
+ DPRINT((" Using tag %d for %s offset of "
+ "submatch %d\n", tag,
+ start ? "start" : "end", id));
+ if (start)
+ tnfa->submatch_data[id].so_tag = tag;
+ else
+ tnfa->submatch_data[id].eo_tag = tag;
+ }
+ }
+ else
+ {
+ DPRINT((" num_tags = 1\n"));
+ node->num_tags = 1;
+ }
+
+ DPRINT((" num_tags++\n"));
+ regset[0] = -1;
+ tag = next_tag;
+ num_tags++;
+ next_tag++;
+ }
+ }
+ else
+ {
+ assert(!IS_TAG(lit));
+ }
+ break;
+ }
+ case CATENATION:
+ {
+ tre_catenation_t *cat = node->obj;
+ tre_ast_node_t *left = cat->left;
+ tre_ast_node_t *right = cat->right;
+ int reserved_tag = -1;
+ DPRINT(("Catenation, next_tag = %d\n", next_tag));
+
+
+ /* After processing right child. */
+ STACK_PUSHX(stack, node);
+ STACK_PUSHX(stack, ADDTAGS_AFTER_CAT_RIGHT);
+
+ /* Process right child. */
+ STACK_PUSHX(stack, right);
+ STACK_PUSHX(stack, ADDTAGS_RECURSE);
+
+ /* After processing left child. */
+ STACK_PUSHX(stack, next_tag + left->num_tags);
+ DPRINT((" Pushing %d for after left\n",
+ next_tag + left->num_tags));
+ if (left->num_tags > 0 && right->num_tags > 0)
+ {
+ /* Reserve the next tag to the right child. */
+ DPRINT((" Reserving next_tag %d to right child\n",
+ next_tag));
+ reserved_tag = next_tag;
+ next_tag++;
+ }
+ STACK_PUSHX(stack, reserved_tag);
+ STACK_PUSHX(stack, ADDTAGS_AFTER_CAT_LEFT);
+
+ /* Process left child. */
+ STACK_PUSHX(stack, left);
+ STACK_PUSHX(stack, ADDTAGS_RECURSE);
+
+ }
+ break;
+ case ITERATION:
+ {
+ tre_iteration_t *iter = node->obj;
+ DPRINT(("Iteration\n"));
+
+ if (first_pass)
+ {
+ STACK_PUSHX(stack, regset[0] >= 0);
+ }
+ else
+ {
+ STACK_PUSHX(stack, tag);
+ }
+ STACK_PUSHX(stack, node);
+ STACK_PUSHX(stack, ADDTAGS_AFTER_ITERATION);
+
+ STACK_PUSHX(stack, iter->arg);
+ STACK_PUSHX(stack, ADDTAGS_RECURSE);
+
+ /* Regset is not empty, so add a tag here. */
+ if (regset[0] >= 0)
+ {
+ if (!first_pass)
+ {
+ int i;
+ status = tre_add_tag(mem, node, tag, 0 /*left*/);
+ tnfa->tag_directions[tag] = direction;
+ /* Go through the regset and set submatch data for
+ submatches that are using this tag. */
+ for (i = 0; regset[i] >= 0; i++)
+ {
+ int id = regset[i] >> 1;
+ int start = !(regset[i] & 1);
+ DPRINT((" Using tag %d for %s offset of "
+ "submatch %d\n", tag,
+ start ? "start" : "end", id));
+ if (start)
+ tnfa->submatch_data[id].so_tag = tag;
+ else
+ tnfa->submatch_data[id].eo_tag = tag;
+ }
+ }
+
+ DPRINT((" num_tags++\n"));
+ regset[0] = -1;
+ tag = next_tag;
+ num_tags++;
+ next_tag++;
+ }
+ direction = TRE_TAG_MINIMIZE;
+ }
+ break;
+ case UNION:
+ {
+ tre_union_t *uni = node->obj;
+ tre_ast_node_t *left = uni->left;
+ tre_ast_node_t *right = uni->right;
+ int left_tag;
+ int right_tag;
+
+ if (regset[0] >= 0)
+ {
+ left_tag = next_tag;
+ right_tag = next_tag + 1;
+ }
+ else
+ {
+ left_tag = tag;
+ right_tag = next_tag;
+ }
+
+ DPRINT(("Union\n"));
+
+ /* After processing right child. */
+ STACK_PUSHX(stack, right_tag);
+ STACK_PUSHX(stack, left_tag);
+ STACK_PUSHX(stack, regset);
+ STACK_PUSHX(stack, regset[0] >= 0);
+ STACK_PUSHX(stack, node);
+ STACK_PUSHX(stack, right);
+ STACK_PUSHX(stack, left);
+ STACK_PUSHX(stack, ADDTAGS_AFTER_UNION_RIGHT);
+
+ /* Process right child. */
+ STACK_PUSHX(stack, right);
+ STACK_PUSHX(stack, ADDTAGS_RECURSE);
+
+ /* After processing left child. */
+ STACK_PUSHX(stack, ADDTAGS_AFTER_UNION_LEFT);
+
+ /* Process left child. */
+ STACK_PUSHX(stack, left);
+ STACK_PUSHX(stack, ADDTAGS_RECURSE);
+
+ /* Regset is not empty, so add a tag here. */
+ if (regset[0] >= 0)
+ {
+ if (!first_pass)
+ {
+ int i;
+ status = tre_add_tag(mem, node, tag, 0 /*left*/);
+ tnfa->tag_directions[tag] = direction;
+ /* Go through the regset and set submatch data for
+ submatches that are using this tag. */
+ for (i = 0; regset[i] >= 0; i++)
+ {
+ int id = regset[i] >> 1;
+ int start = !(regset[i] & 1);
+ DPRINT((" Using tag %d for %s offset of "
+ "submatch %d\n", tag,
+ start ? "start" : "end", id));
+ if (start)
+ tnfa->submatch_data[id].so_tag = tag;
+ else
+ tnfa->submatch_data[id].eo_tag = tag;
+ }
+ }
+
+ DPRINT((" num_tags++\n"));
+ regset[0] = -1;
+ tag = next_tag;
+ num_tags++;
+ next_tag++;
+ }
+
+ if (node->num_submatches > 0)
+ {
+ /* The next two tags are reserved for markers. */
+ next_tag++;
+ tag = next_tag;
+ next_tag++;
+ }
+
+ break;
+ }
+ }
+
+ if (node->submatch_id >= 0)
+ {
+ int i;
+ /* Push this submatch on the parents stack. */
+ for (i = 0; parents[i] >= 0; i++);
+ parents[i] = node->submatch_id;
+ parents[i + 1] = -1;
+ }
+
+ break; /* end case: ADDTAGS_RECURSE */
+
+ case ADDTAGS_AFTER_ITERATION:
+ {
+ int enter_tag;
+ node = tre_stack_pop(stack);
+ if (first_pass)
+ node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
+ + (int)tre_stack_pop(stack);
+ else
+ enter_tag = (int)tre_stack_pop(stack);
+
+ DPRINT(("After iteration\n"));
+ direction = TRE_TAG_MAXIMIZE;
+ break;
+ }
+
+ case ADDTAGS_AFTER_CAT_LEFT:
+ {
+ int new_tag = (int)tre_stack_pop(stack);
+ next_tag = (int)tre_stack_pop(stack);
+ DPRINT(("After cat left, tag = %d, next_tag = %d\n",
+ tag, next_tag));
+ if (new_tag >= 0)
+ {
+ DPRINT((" Setting tag to %d\n", new_tag));
+ tag = new_tag;
+ }
+ break;
+ }
+
+ case ADDTAGS_AFTER_CAT_RIGHT:
+ DPRINT(("After cat right\n"));
+ node = tre_stack_pop(stack);
+ if (first_pass)
+ node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
+ + ((tre_catenation_t *)node->obj)->right->num_tags;
+ break;
+
+ case ADDTAGS_AFTER_UNION_LEFT:
+ DPRINT(("After union left\n"));
+ /* Lift the bottom of the `regset' array so that when processing
+ the right operand the items currently in the array are
+ invisible. The original bottom was saved at ADDTAGS_UNION and
+ will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
+ while (*regset >= 0)
+ regset++;
+ break;
+
+ case ADDTAGS_AFTER_UNION_RIGHT:
+ {
+ int added_tags, tag_left, tag_right;
+ tre_ast_node_t *left = tre_stack_pop(stack);
+ tre_ast_node_t *right = tre_stack_pop(stack);
+ DPRINT(("After union right\n"));
+ node = tre_stack_pop(stack);
+ added_tags = (int)tre_stack_pop(stack);
+ if (first_pass)
+ {
+ node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
+ + ((tre_union_t *)node->obj)->right->num_tags + added_tags
+ + ((node->num_submatches > 0) ? 2 : 0);
+ }
+ regset = tre_stack_pop(stack);
+ tag_left = (int)tre_stack_pop(stack);
+ tag_right = (int)tre_stack_pop(stack);
+
+ /* Add tags after both children, the left child gets a smaller
+ tag than the right child. This guarantees that we prefer
+ the left child over the right child. */
+ /* XXX - This is not always necessary (if the children have
+ tags which must be seen for every match of that child). */
+ /* XXX - Check if this is the only place where tre_add_tag_right
+ is used. If so, use tre_add_tag_left (putting the tag before
+ the child as opposed after the child) and throw away
+ tre_add_tag_right. */
+ if (node->num_submatches > 0)
+ {
+ if (!first_pass)
+ {
+ status = tre_add_tag(mem, left, tag_left, 1 /*right*/);
+ tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
+ status = tre_add_tag(mem, right, tag_right, 1 /*right*/);
+ tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
+ }
+ DPRINT((" num_tags += 2\n"));
+ num_tags += 2;
+ }
+ direction = TRE_TAG_MAXIMIZE;
+ break;
+ }
+
+ default:
+ assert(0);
+ break;
+
+ } /* end switch(symbol) */
+ } /* end while(tre_stack_num_objects(stack) > bottom) */
+
+ if (!first_pass)
+ {
+ int i;
+ /* Go through the regset and set submatch data for
+ submatches that are using this tag. */
+ for (i = 0; regset[i] >= 0; i++)
+ {
+ int id = regset[i] >> 1;
+ int start = !(regset[i] & 1);
+ DPRINT((" Using tag %d for %s offset of "
+ "submatch %d\n", num_tags,
+ start ? "start" : "end", id));
+ if (start)
+ tnfa->submatch_data[id].so_tag = num_tags;
+ else
+ tnfa->submatch_data[id].eo_tag = num_tags;
+ }
+ }
+
+ DPRINT(("tre_add_tags: %s complete. Number of tags %d.\n",
+ first_pass? "First pass" : "Second pass", num_tags));
+
+ assert(tree->num_tags == num_tags);
+ tnfa->end_tag = num_tags;
+ tnfa->num_tags = num_tags;
+ xfree(orig_regset);
+ xfree(parents);
+ xfree(saved_states);
+ return status;
+}
+
+
+
+/*
+ AST to TNFA compilation routines.
+*/
+
+typedef enum {
+ COPY_RECURSE,
+ COPY_SET_RESULT_PTR
+} tre_copyast_symbol_t;
+
+/* Flags for tre_copy_ast(). */
+#define COPY_REMOVE_TAGS 1
+#define COPY_MAXIMIZE_FIRST_TAG 2
+
+static reg_errcode_t
+tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
+ int flags, int *pos_add, tre_tag_direction_t *tag_directions,
+ tre_ast_node_t **copy, int *max_pos)
+{
+ reg_errcode_t status = REG_OK;
+ int bottom = tre_stack_num_objects(stack);
+ int num_copied = 0;
+ int first_tag = 1;
+ tre_ast_node_t **result = copy;
+ tre_copyast_symbol_t symbol;
+
+ STACK_PUSH(stack, ast);
+ STACK_PUSH(stack, COPY_RECURSE);
+
+ while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
+ {
+ tre_ast_node_t *node;
+ if (status != REG_OK)
+ break;
+
+ symbol = (tre_copyast_symbol_t)tre_stack_pop(stack);
+ switch (symbol)
+ {
+ case COPY_SET_RESULT_PTR:
+ result = tre_stack_pop(stack);
+ break;
+ case COPY_RECURSE:
+ node = tre_stack_pop(stack);
+ switch (node->type)
+ {
+ case LITERAL:
+ {
+ tre_literal_t *lit = node->obj;
+ int pos = lit->position;
+ int min = lit->code_min;
+ int max = lit->code_max;
+ if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
+ {
+ /* XXX - e.g. [ab] has only one position but two
+ nodes, so we are creating holes in the state space
+ here. Not fatal, just wastes memory. */
+ pos += *pos_add;
+ num_copied++;
+ }
+ else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
+ {
+ /* Change this tag to empty. */
+ min = EMPTY;
+ max = pos = -1;
+ }
+ else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
+ && first_tag)
+ {
+ /* Maximize the first tag. */
+ tag_directions[max] = TRE_TAG_MAXIMIZE;
+ first_tag = 0;
+ }
+ *result = tre_ast_new_literal(mem, min, max, pos);
+ if (*result == NULL)
+ status = REG_ESPACE;
+
+ if (pos > *max_pos)
+ *max_pos = pos;
+ break;
+ }
+ case UNION:
+ {
+ tre_union_t *uni = node->obj;
+ tre_union_t *copy;
+ *result = tre_ast_new_union(mem, uni->left, uni->right);
+ if (*result == NULL)
+ {
+ status = REG_ESPACE;
+ break;
+ }
+ copy = (*result)->obj;
+ result = &copy->left;
+ STACK_PUSHX(stack, uni->right);
+ STACK_PUSHX(stack, COPY_RECURSE);
+ STACK_PUSHX(stack, &copy->right);
+ STACK_PUSHX(stack, COPY_SET_RESULT_PTR);
+ STACK_PUSHX(stack, uni->left);
+ STACK_PUSHX(stack, COPY_RECURSE);
+ break;
+ }
+ case CATENATION:
+ {
+ tre_catenation_t *cat = node->obj;
+ tre_catenation_t *copy;
+ *result = tre_ast_new_catenation(mem, cat->left, cat->right);
+ if (*result == NULL)
+ {
+ status = REG_ESPACE;
+ break;
+ }
+ copy = (*result)->obj;
+ copy->left = NULL;
+ copy->right = NULL;
+ result = &copy->left;
+
+ STACK_PUSHX(stack, cat->right);
+ STACK_PUSHX(stack, COPY_RECURSE);
+ STACK_PUSHX(stack, &copy->right);
+ STACK_PUSHX(stack, COPY_SET_RESULT_PTR);
+ STACK_PUSHX(stack, cat->left);
+ STACK_PUSHX(stack, COPY_RECURSE);
+ break;
+ }
+ case ITERATION:
+ {
+ tre_iteration_t *iter = node->obj;
+ STACK_PUSHX(stack, iter->arg);
+ STACK_PUSHX(stack, COPY_RECURSE);
+ *result = tre_ast_new_iter(mem, iter->arg, iter->min, iter->max);
+ if (*result == NULL)
+ {
+ status = REG_ESPACE;
+ break;
+ }
+ iter = (*result)->obj;
+ result = &iter->arg;
+ break;
+ }
+ default:
+ assert(0);
+ break;
+ }
+ break;
+ }
+ }
+ *pos_add += num_copied;
+ return status;
+}
+
+typedef enum {
+ EXPAND_RECURSE,
+ EXPAND_AFTER_ITER
+} tre_expand_ast_symbol_t;
+
+/* Expands each iteration node that has a finite nonzero minimum or maximum
+ iteration count to a catenated sequence of copies of the node. */
+static reg_errcode_t
+tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
+ int *position, tre_tag_direction_t *tag_directions,
+ int *max_depth)
+{
+ reg_errcode_t status = REG_OK;
+ int bottom = tre_stack_num_objects(stack);
+ int pos_add = 0;
+ int pos_add_total = 0;
+ int max_pos = 0;
+ /* Approximate parameter nesting level. */
+ int iter_depth = 0;
+
+ STACK_PUSHR(stack, ast);
+ STACK_PUSHR(stack, EXPAND_RECURSE);
+ while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
+ {
+ tre_ast_node_t *node;
+ tre_expand_ast_symbol_t symbol;
+
+ if (status != REG_OK)
+ break;
+
+ DPRINT(("pos_add %d\n", pos_add));
+
+ symbol = (tre_expand_ast_symbol_t)tre_stack_pop(stack);
+ node = tre_stack_pop(stack);
+ switch (symbol)
+ {
+ case EXPAND_RECURSE:
+ switch (node->type)
+ {
+ case LITERAL:
+ {
+ tre_literal_t *lit= node->obj;
+ if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
+ {
+ lit->position += pos_add;
+ if (lit->position > max_pos)
+ max_pos = lit->position;
+ }
+ break;
+ }
+ case UNION:
+ {
+ tre_union_t *uni = node->obj;
+ STACK_PUSHX(stack, uni->right);
+ STACK_PUSHX(stack, EXPAND_RECURSE);
+ STACK_PUSHX(stack, uni->left);
+ STACK_PUSHX(stack, EXPAND_RECURSE);
+ break;
+ }
+ case CATENATION:
+ {
+ tre_catenation_t *cat = node->obj;
+ STACK_PUSHX(stack, cat->right);
+ STACK_PUSHX(stack, EXPAND_RECURSE);
+ STACK_PUSHX(stack, cat->left);
+ STACK_PUSHX(stack, EXPAND_RECURSE);
+ break;
+ }
+ case ITERATION:
+ {
+ tre_iteration_t *iter = node->obj;
+ STACK_PUSHX(stack, pos_add);
+ STACK_PUSHX(stack, node);
+ STACK_PUSHX(stack, EXPAND_AFTER_ITER);
+ STACK_PUSHX(stack, iter->arg);
+ STACK_PUSHX(stack, EXPAND_RECURSE);
+ /* If we are going to expand this node at EXPAND_AFTER_ITER
+ then don't increase the `pos' fields of the nodes now, it
+ will get done when expanding. */
+ if (iter->min > 1 || iter->max > 1)
+ pos_add = 0;
+ iter_depth++;
+ DPRINT(("iter\n"));
+ break;
+ }
+ default:
+ assert(0);
+ break;
+ }
+ break;
+ case EXPAND_AFTER_ITER:
+ {
+ tre_iteration_t *iter = node->obj;
+ int pos_add_last;
+ pos_add = (int)tre_stack_pop(stack);
+ pos_add_last = pos_add;
+ if (iter->min > 1 || iter->max > 1)
+ {
+ tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
+ int i;
+ int pos_add_save = pos_add;
+
+ /* Create a catenated sequence of copies of the node. */
+ for (i = 0; i < iter->min; i++)
+ {
+ tre_ast_node_t *copy;
+ /* Remove tags from all but the last copy. */
+ int flags = ((i + 1 < iter->min)
+ ? COPY_REMOVE_TAGS
+ : COPY_MAXIMIZE_FIRST_TAG);
+ DPRINT((" pos_add %d\n", pos_add));
+ pos_add_save = pos_add;
+ status = tre_copy_ast(mem, stack, iter->arg, flags,
+ &pos_add, tag_directions, &copy,
+ &max_pos);
+ if (status != REG_OK)
+ return status;
+ if (seq1 != NULL)
+ seq1 = tre_ast_new_catenation(mem, seq1, copy);
+ else
+ seq1 = copy;
+ if (seq1 == NULL)
+ return REG_ESPACE;
+ }
+
+ if (iter->max == -1)
+ {
+ /* No upper limit. */
+ pos_add_save = pos_add;
+ status = tre_copy_ast(mem, stack, iter->arg, 0,
+ &pos_add, NULL, &seq2, &max_pos);
+ if (status != REG_OK)
+ return status;
+ seq2 = tre_ast_new_iter(mem, seq2, 0, -1);
+ if (seq2 == NULL)
+ return REG_ESPACE;
+ }
+ else
+ {
+ for (i = iter->min; i < iter->max; i++)
+ {
+ tre_ast_node_t *tmp, *copy;
+ pos_add_save = pos_add;
+ status = tre_copy_ast(mem, stack, iter->arg, 0,
+ &pos_add, NULL, &copy, &max_pos);
+ if (status != REG_OK)
+ return status;
+ if (seq2 != NULL)
+ seq2 = tre_ast_new_catenation(mem, copy, seq2);
+ else
+ seq2 = copy;
+ if (seq2 == NULL)
+ return REG_ESPACE;
+ tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
+ if (tmp == NULL)
+ return REG_ESPACE;
+ seq2 = tre_ast_new_union(mem, tmp, seq2);
+ if (seq2 == NULL)
+ return REG_ESPACE;
+ }
+ }
+
+ pos_add = pos_add_save;
+ if (seq1 == NULL)
+ seq1 = seq2;
+ else if (seq2 != NULL)
+ seq1 = tre_ast_new_catenation(mem, seq1, seq2);
+ if (seq1 == NULL)
+ return REG_ESPACE;
+ node->obj = seq1->obj;
+ node->type = seq1->type;
+ }
+
+ iter_depth--;
+ pos_add_total += pos_add - pos_add_last;
+ if (iter_depth == 0)
+ pos_add = pos_add_total;
+
+ break;
+ }
+ default:
+ assert(0);
+ break;
+ }
+ }
+
+ *position += pos_add_total;
+
+ /* `max_pos' should never be larger than `*position' if the above
+ code works, but just an extra safeguard let's make sure
+ `*position' is set large enough so enough memory will be
+ allocated for the transition table. */
+ if (max_pos > *position)
+ *position = max_pos;
+
+#ifdef TRE_DEBUG
+ DPRINT(("Expanded AST:\n"));
+ tre_ast_print(ast);
+ DPRINT(("*position %d, max_pos %d\n", *position, max_pos));
+#endif
+
+ return status;
+}
+
+static tre_pos_and_tags_t *
+tre_set_empty(tre_mem_t mem)
+{
+ tre_pos_and_tags_t *new_set;
+
+ new_set = tre_mem_calloc(mem, sizeof(*new_set));
+ if (new_set == NULL)
+ return NULL;
+
+ new_set[0].position = -1;
+ new_set[0].code_min = -1;
+ new_set[0].code_max = -1;
+
+ return new_set;
+}
+
+static tre_pos_and_tags_t *
+tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
+ tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
+{
+ tre_pos_and_tags_t *new_set;
+
+ new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
+ if (new_set == NULL)
+ return NULL;
+
+ new_set[0].position = position;
+ new_set[0].code_min = code_min;
+ new_set[0].code_max = code_max;
+ new_set[0].class = class;
+ new_set[0].neg_classes = neg_classes;
+ new_set[0].backref = backref;
+ new_set[1].position = -1;
+ new_set[1].code_min = -1;
+ new_set[1].code_max = -1;
+
+ return new_set;
+}
+
+static tre_pos_and_tags_t *
+tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
+ int *tags, int assertions)
+{
+ int s1, s2, i, j;
+ tre_pos_and_tags_t *new_set;
+ int *new_tags;
+ int num_tags;
+
+ for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
+ for (s1 = 0; set1[s1].position >= 0; s1++);
+ for (s2 = 0; set2[s2].position >= 0; s2++);
+ new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
+ if (!new_set )
+ return NULL;
+
+ for (s1 = 0; set1[s1].position >= 0; s1++)
+ {
+ new_set[s1].position = set1[s1].position;
+ new_set[s1].code_min = set1[s1].code_min;
+ new_set[s1].code_max = set1[s1].code_max;
+ new_set[s1].assertions = set1[s1].assertions | assertions;
+ new_set[s1].class = set1[s1].class;
+ new_set[s1].neg_classes = set1[s1].neg_classes;
+ new_set[s1].backref = set1[s1].backref;
+ if (set1[s1].tags == NULL && tags == NULL)
+ new_set[s1].tags = NULL;
+ else
+ {
+ for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
+ new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
+ * (i + num_tags + 1)));
+ if (new_tags == NULL)
+ return NULL;
+ for (j = 0; j < i; j++)
+ new_tags[j] = set1[s1].tags[j];
+ for (i = 0; i < num_tags; i++)
+ new_tags[j + i] = tags[i];
+ new_tags[j + i] = -1;
+ new_set[s1].tags = new_tags;
+ }
+ }
+
+ for (s2 = 0; set2[s2].position >= 0; s2++)
+ {
+ new_set[s1 + s2].position = set2[s2].position;
+ new_set[s1 + s2].code_min = set2[s2].code_min;
+ new_set[s1 + s2].code_max = set2[s2].code_max;
+ /* XXX - why not | assertions here as well? */
+ new_set[s1 + s2].assertions = set2[s2].assertions;
+ new_set[s1 + s2].class = set2[s2].class;
+ new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
+ new_set[s1 + s2].backref = set2[s2].backref;
+ if (set2[s2].tags == NULL)
+ new_set[s1 + s2].tags = NULL;
+ else
+ {
+ for (i = 0; set2[s2].tags[i] >= 0; i++);
+ new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
+ if (new_tags == NULL)
+ return NULL;
+ for (j = 0; j < i; j++)
+ new_tags[j] = set2[s2].tags[j];
+ new_tags[j] = -1;
+ new_set[s1 + s2].tags = new_tags;
+ }
+ }
+ new_set[s1 + s2].position = -1;
+ return new_set;
+}
+
+/* Finds the empty path through `node' which is the one that should be
+ taken according to POSIX.2 rules, and adds the tags on that path to
+ `tags'. `tags' may be NULL. If `num_tags_seen' is not NULL, it is
+ set to the number of tags seen on the path. */
+static reg_errcode_t
+tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
+ int *assertions, int *num_tags_seen)
+{
+ tre_literal_t *lit;
+ tre_union_t *uni;
+ tre_catenation_t *cat;
+ tre_iteration_t *iter;
+ int i;
+ int bottom = tre_stack_num_objects(stack);
+ reg_errcode_t status = REG_OK;
+ if (num_tags_seen)
+ *num_tags_seen = 0;
+
+ status = tre_stack_push(stack, node);
+
+ /* Walk through the tree recursively. */
+ while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
+ {
+ node = tre_stack_pop(stack);
+
+ switch (node->type)
+ {
+ case LITERAL:
+ lit = (tre_literal_t *)node->obj;
+ switch (lit->code_min)
+ {
+ case TAG:
+ if (lit->code_max >= 0)
+ {
+ if (tags != NULL)
+ {
+ /* Add the tag to `tags'. */
+ for (i = 0; tags[i] >= 0; i++)
+ if (tags[i] == lit->code_max)
+ break;
+ if (tags[i] < 0)
+ {
+ tags[i] = lit->code_max;
+ tags[i + 1] = -1;
+ }
+ }
+ if (num_tags_seen)
+ (*num_tags_seen)++;
+ }
+ break;
+ case ASSERTION:
+ assert(lit->code_max >= 1
+ || lit->code_max <= ASSERT_LAST);
+ if (assertions != NULL)
+ *assertions |= lit->code_max;
+ break;
+ case EMPTY:
+ break;
+ default:
+ assert(0);
+ break;
+ }
+ break;
+
+ case UNION:
+ /* Subexpressions starting earlier take priority over ones
+ starting later, so we prefer the left subexpression over the
+ right subexpression. */
+ uni = (tre_union_t *)node->obj;
+ if (uni->left->nullable)
+ STACK_PUSHX(stack, uni->left)
+ else if (uni->right->nullable)
+ STACK_PUSHX(stack, uni->right)
+ else
+ assert(0);
+ break;
+
+ case CATENATION:
+ /* The path must go through both children. */
+ cat = (tre_catenation_t *)node->obj;
+ assert(cat->left->nullable);
+ assert(cat->right->nullable);
+ STACK_PUSHX(stack, cat->left);
+ STACK_PUSHX(stack, cat->right);
+ break;
+
+ case ITERATION:
+ /* A match with an empty string is preferred over no match at
+ all, so we go through the argument if possible. */
+ iter = (tre_iteration_t *)node->obj;
+ if (iter->arg->nullable)
+ STACK_PUSHX(stack, iter->arg);
+ break;
+
+ default:
+ assert(0);
+ break;
+ }
+ }
+
+ return status;
+}
+
+
+typedef enum {
+ NFL_RECURSE,
+ NFL_POST_UNION,
+ NFL_POST_CATENATION,
+ NFL_POST_ITERATION
+} tre_nfl_stack_symbol_t;
+
+
+/* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
+ the nodes of the AST `tree'. */
+static reg_errcode_t
+tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
+{
+ int bottom = tre_stack_num_objects(stack);
+
+ STACK_PUSHR(stack, tree);
+ STACK_PUSHR(stack, NFL_RECURSE);
+
+ while (tre_stack_num_objects(stack) > bottom)
+ {
+ tre_nfl_stack_symbol_t symbol;
+ tre_ast_node_t *node;
+
+ symbol = (tre_nfl_stack_symbol_t) tre_stack_pop(stack);
+ node = tre_stack_pop(stack);
+ switch (symbol)
+ {
+ case NFL_RECURSE:
+ switch (node->type)
+ {
+ case LITERAL:
+ {
+ tre_literal_t *lit = (tre_literal_t *)node->obj;
+ if (IS_BACKREF(lit))
+ {
+ /* Back references: nullable = false, firstpos = {i},
+ lastpos = {i}. */
+ node->nullable = 0;
+ node->firstpos = tre_set_one(mem, lit->position, 0,
+ TRE_CHAR_MAX, 0, NULL, -1);
+ if (!node->firstpos)
+ return REG_ESPACE;
+ node->lastpos = tre_set_one(mem, lit->position, 0,
+ TRE_CHAR_MAX, 0, NULL,
+ lit->code_max);
+ if (!node->lastpos)
+ return REG_ESPACE;
+ }
+ else if (lit->code_min < 0)
+ {
+ /* Tags, empty strings and zero width assertions:
+ nullable = true, firstpos = {}, and lastpos = {}. */
+ node->nullable = 1;
+ node->firstpos = tre_set_empty(mem);
+ if (!node->firstpos)
+ return REG_ESPACE;
+ node->lastpos = tre_set_empty(mem);
+ if (!node->lastpos)
+ return REG_ESPACE;
+ }
+ else
+ {
+ /* Literal at position i: nullable = false, firstpos = {i},
+ lastpos = {i}. */
+ node->nullable = 0;
+ node->firstpos =
+ tre_set_one(mem, lit->position, lit->code_min,
+ lit->code_max, 0, NULL, -1);
+ if (!node->firstpos)
+ return REG_ESPACE;
+ node->lastpos = tre_set_one(mem, lit->position,
+ lit->code_min, lit->code_max,
+ lit->class, lit->neg_classes,
+ -1);
+ if (!node->lastpos)
+ return REG_ESPACE;
+ }
+ break;
+ }
+
+ case UNION:
+ /* Compute the attributes for the two subtrees, and after that
+ for this node. */
+ STACK_PUSHR(stack, node);
+ STACK_PUSHR(stack, NFL_POST_UNION);
+ STACK_PUSHR(stack, ((tre_union_t *)node->obj)->right);
+ STACK_PUSHR(stack, NFL_RECURSE);
+ STACK_PUSHR(stack, ((tre_union_t *)node->obj)->left);
+ STACK_PUSHR(stack, NFL_RECURSE);
+ break;
+
+ case CATENATION:
+ /* Compute the attributes for the two subtrees, and after that
+ for this node. */
+ STACK_PUSHR(stack, node);
+ STACK_PUSHR(stack, NFL_POST_CATENATION);
+ STACK_PUSHR(stack, ((tre_catenation_t *)node->obj)->right);
+ STACK_PUSHR(stack, NFL_RECURSE);
+ STACK_PUSHR(stack, ((tre_catenation_t *)node->obj)->left);
+ STACK_PUSHR(stack, NFL_RECURSE);
+ break;
+
+ case ITERATION:
+ /* Compute the attributes for the subtree, and after that for
+ this node. */
+ STACK_PUSHR(stack, node);
+ STACK_PUSHR(stack, NFL_POST_ITERATION);
+ STACK_PUSHR(stack, ((tre_iteration_t *)node->obj)->arg);
+ STACK_PUSHR(stack, NFL_RECURSE);
+ break;
+ }
+ break; /* end case: NFL_RECURSE */
+
+ case NFL_POST_UNION:
+ {
+ tre_union_t *uni = (tre_union_t *)node->obj;
+ node->nullable = uni->left->nullable || uni->right->nullable;
+ node->firstpos = tre_set_union(mem, uni->left->firstpos,
+ uni->right->firstpos, NULL, 0);
+ if (!node->firstpos)
+ return REG_ESPACE;
+ node->lastpos = tre_set_union(mem, uni->left->lastpos,
+ uni->right->lastpos, NULL, 0);
+ if (!node->lastpos)
+ return REG_ESPACE;
+ break;
+ }
+
+ case NFL_POST_ITERATION:
+ {
+ tre_iteration_t *iter = (tre_iteration_t *)node->obj;
+
+ if (iter->min == 0 || iter->arg->nullable)
+ node->nullable = 1;
+ else
+ node->nullable = 0;
+ node->firstpos = iter->arg->firstpos;
+ node->lastpos = iter->arg->lastpos;
+ break;
+ }
+
+ case NFL_POST_CATENATION:
+ {
+ int num_tags, *tags, assertions;
+ reg_errcode_t status;
+ tre_catenation_t *cat = node->obj;
+ node->nullable = cat->left->nullable && cat->right->nullable;
+
+ /* Compute firstpos. */
+ if (cat->left->nullable)
+ {
+ /* The left side matches the empty string. Make a first pass
+ with tre_match_empty() to get the number of tags. */
+ status = tre_match_empty(stack, cat->left,
+ NULL, NULL, &num_tags);
+ if (status != REG_OK)
+ return status;
+ /* Allocate arrays for the tags and parameters. */
+ tags = xmalloc(sizeof(*tags) * (num_tags + 1));
+ if (!tags)
+ return REG_ESPACE;
+ tags[0] = -1;
+ assertions = 0;
+ /* Second pass with tre_mach_empty() to get the list of
+ tags. */
+ status = tre_match_empty(stack, cat->left, tags,
+ &assertions, NULL);
+ if (status != REG_OK)
+ {
+ xfree(tags);
+ return status;
+ }
+ node->firstpos =
+ tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
+ tags, assertions);
+ xfree(tags);
+ if (!node->firstpos)
+ return REG_ESPACE;
+ }
+ else
+ {
+ node->firstpos = cat->left->firstpos;
+ }
+
+ /* Compute lastpos. */
+ if (cat->right->nullable)
+ {
+ /* The right side matches the empty string. Make a first pass
+ with tre_match_empty() to get the number of tags. */
+ status = tre_match_empty(stack, cat->right,
+ NULL, NULL, &num_tags);
+ if (status != REG_OK)
+ return status;
+ /* Allocate arrays for the tags and parameters. */
+ tags = xmalloc(sizeof(int) * (num_tags + 1));
+ if (!tags)
+ return REG_ESPACE;
+ tags[0] = -1;
+ assertions = 0;
+ /* Second pass with tre_mach_empty() to get the list of
+ tags. */
+ status = tre_match_empty(stack, cat->right, tags,
+ &assertions, NULL);
+ if (status != REG_OK)
+ {
+ xfree(tags);
+ return status;
+ }
+ node->lastpos =
+ tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
+ tags, assertions);
+ xfree(tags);
+ if (!node->lastpos)
+ return REG_ESPACE;
+ }
+ else
+ {
+ node->lastpos = cat->right->lastpos;
+ }
+ break;
+ }
+
+ default:
+ assert(0);
+ break;
+ }
+ }
+
+ return REG_OK;
+}
+
+
+/* Adds a transition from each position in `p1' to each position in `p2'. */
+static reg_errcode_t
+tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
+ tre_tnfa_transition_t *transitions,
+ int *counts, int *offs)
+{
+ tre_pos_and_tags_t *orig_p2 = p2;
+ tre_tnfa_transition_t *trans;
+ int i, j, k, l, dup, prev_p2_pos;
+
+ if (transitions != NULL)
+ while (p1->position >= 0)
+ {
+ p2 = orig_p2;
+ prev_p2_pos = -1;
+ while (p2->position >= 0)
+ {
+ /* Optimization: if this position was already handled, skip it. */
+ if (p2->position == prev_p2_pos)
+ {
+ p2++;
+ continue;
+ }
+ prev_p2_pos = p2->position;
+ /* Set `trans' to point to the next unused transition from
+ position `p1->position'. */
+ trans = transitions + offs[p1->position];
+ while (trans->state != NULL)
+ {
+#if 0
+ /* If we find a previous transition from `p1->position' to
+ `p2->position', it is overwritten. This can happen only
+ if there are nested loops in the regexp, like in "((a)*)*".
+ In POSIX.2 repetition using the outer loop is always
+ preferred over using the inner loop. Therefore the
+ transition for the inner loop is useless and can be thrown
+ away. */
+ /* XXX - The same position is used for all nodes in a bracket
+ expression, so this optimization cannot be used (it will
+ break bracket expressions) unless I figure out a way to
+ detect it here. */
+ if (trans->state_id == p2->position)
+ {
+ DPRINT(("*"));
+ break;
+ }
+#endif
+ trans++;
+ }
+
+ if (trans->state == NULL)
+ (trans + 1)->state = NULL;
+ /* Use the character ranges, assertions, etc. from `p1' for
+ the transition from `p1' to `p2'. */
+ trans->code_min = p1->code_min;
+ trans->code_max = p1->code_max;
+ trans->state = transitions + offs[p2->position];
+ trans->state_id = p2->position;
+ trans->assertions = p1->assertions | p2->assertions
+ | (p1->class ? ASSERT_CHAR_CLASS : 0)
+ | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
+ if (p1->backref >= 0)
+ {
+ assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
+ assert(p2->backref < 0);
+ trans->u.backref = p1->backref;
+ trans->assertions |= ASSERT_BACKREF;
+ }
+ else
+ trans->u.class = p1->class;
+ if (p1->neg_classes != NULL)
+ {
+ for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
+ trans->neg_classes =
+ xmalloc(sizeof(*trans->neg_classes) * (i + 1));
+ if (trans->neg_classes == NULL)
+ return REG_ESPACE;
+ for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
+ trans->neg_classes[i] = p1->neg_classes[i];
+ trans->neg_classes[i] = (tre_ctype_t)0;
+ }
+ else
+ trans->neg_classes = NULL;
+
+ /* Find out how many tags this transition has. */
+ i = 0;
+ if (p1->tags != NULL)
+ while(p1->tags[i] >= 0)
+ i++;
+ j = 0;
+ if (p2->tags != NULL)
+ while(p2->tags[j] >= 0)
+ j++;
+
+ /* If we are overwriting a transition, free the old tag array. */
+ if (trans->tags != NULL)
+ xfree(trans->tags);
+ trans->tags = NULL;
+
+ /* If there were any tags, allocate an array and fill it. */
+ if (i + j > 0)
+ {
+ trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
+ if (!trans->tags)
+ return REG_ESPACE;
+ i = 0;
+ if (p1->tags != NULL)
+ while(p1->tags[i] >= 0)
+ {
+ trans->tags[i] = p1->tags[i];
+ i++;
+ }
+ l = i;
+ j = 0;
+ if (p2->tags != NULL)
+ while (p2->tags[j] >= 0)
+ {
+ /* Don't add duplicates. */
+ dup = 0;
+ for (k = 0; k < i; k++)
+ if (trans->tags[k] == p2->tags[j])
+ {
+ dup = 1;
+ break;
+ }
+ if (!dup)
+ trans->tags[l++] = p2->tags[j];
+ j++;
+ }
+ trans->tags[l] = -1;
+ }
+
+
+#ifdef TRE_DEBUG
+ {
+ int *tags;
+
+ DPRINT((" %2d -> %2d on %3d", p1->position, p2->position,
+ p1->code_min));
+ if (p1->code_max != p1->code_min)
+ DPRINT(("-%3d", p1->code_max));
+ tags = trans->tags;
+ if (tags)
+ {
+ DPRINT((", tags ["));
+ while (*tags >= 0)
+ {
+ DPRINT(("%d", *tags));
+ tags++;
+ if (*tags >= 0)
+ DPRINT((","));
+ }
+ DPRINT(("]"));
+ }
+ if (trans->assertions)
+ DPRINT((", assert %d", trans->assertions));
+ if (trans->assertions & ASSERT_BACKREF)
+ DPRINT((", backref %d", trans->u.backref));
+ else if (trans->class)
+ DPRINT((", class %ld", (long)trans->class));
+ if (trans->neg_classes)
+ DPRINT((", neg_classes %p", trans->neg_classes));
+ DPRINT(("\n"));
+ }
+#endif /* TRE_DEBUG */
+ p2++;
+ }
+ p1++;
+ }
+ else
+ /* Compute a maximum limit for the number of transitions leaving
+ from each state. */
+ while (p1->position >= 0)
+ {
+ p2 = orig_p2;
+ while (p2->position >= 0)
+ {
+ counts[p1->position]++;
+ p2++;
+ }
+ p1++;
+ }
+ return REG_OK;
+}
+
+/* Converts the syntax tree to a TNFA. All the transitions in the TNFA are
+ labelled with one character range (there are no transitions on empty
+ strings). The TNFA takes O(n^2) space in the worst case, `n' is size of
+ the regexp. */
+static reg_errcode_t
+tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
+ int *counts, int *offs)
+{
+ tre_union_t *uni;
+ tre_catenation_t *cat;
+ tre_iteration_t *iter;
+ reg_errcode_t errcode = REG_OK;
+
+ /* XXX - recurse using a stack!. */
+ switch (node->type)
+ {
+ case LITERAL:
+ break;
+ case UNION:
+ uni = (tre_union_t *)node->obj;
+ errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
+ if (errcode != REG_OK)
+ return errcode;
+ errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
+ break;
+
+ case CATENATION:
+ cat = (tre_catenation_t *)node->obj;
+ /* Add a transition from each position in cat->left->lastpos
+ to each position in cat->right->firstpos. */
+ errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
+ transitions, counts, offs);
+ if (errcode != REG_OK)
+ return errcode;
+ errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
+ if (errcode != REG_OK)
+ return errcode;
+ errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
+ break;
+
+ case ITERATION:
+ iter = (tre_iteration_t *)node->obj;
+ assert(iter->max == -1 || iter->max == 1);
+
+ if (iter->max == -1)
+ {
+ assert(iter->min == 0 || iter->min == 1);
+ /* Add a transition from each last position in the iterated
+ expression to each first position. */
+ errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
+ transitions, counts, offs);
+ if (errcode != REG_OK)
+ return errcode;
+ }
+ errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
+ break;
+ }
+ return errcode;
+}
+
+
+static void
+tre_free(regex_t *preg)
+{
+ tre_tnfa_t *tnfa;
+ unsigned int i;
+ tre_tnfa_transition_t *trans;
+
+ tnfa = (void *)preg->TRE_REGEX_T_FIELD;
+ if (!tnfa)
+ return;
+
+ for (i = 0; i < tnfa->num_transitions; i++)
+ if (tnfa->transitions[i].state)
+ {
+ if (tnfa->transitions[i].tags)
+ xfree(tnfa->transitions[i].tags);
+ if (tnfa->transitions[i].neg_classes)
+ xfree(tnfa->transitions[i].neg_classes);
+ }
+ if (tnfa->transitions)
+ xfree(tnfa->transitions);
+
+ if (tnfa->initial)
+ {
+ for (trans = tnfa->initial; trans->state; trans++)
+ {
+ if (trans->tags)
+ xfree(trans->tags);
+ }
+ xfree(tnfa->initial);
+ }
+
+ if (tnfa->submatch_data)
+ {
+ for (i = 0; i < tnfa->num_submatches; i++)
+ if (tnfa->submatch_data[i].parents)
+ xfree(tnfa->submatch_data[i].parents);
+ xfree(tnfa->submatch_data);
+ }
+
+ if (tnfa->tag_directions)
+ xfree(tnfa->tag_directions);
+ xfree(tnfa);
+}
+
+
+#define ERROR_EXIT(err) \
+ do \
+ { \
+ errcode = err; \
+ if (1) goto error_exit; \
+ } \
+ while (0)
+
+
+static int
+tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
+{
+ tre_stack_t *stack;
+ tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
+ tre_pos_and_tags_t *p;
+ int *counts = NULL, *offs = NULL;
+ int i, add = 0;
+ tre_tnfa_transition_t *transitions, *initial;
+ tre_tnfa_t *tnfa = NULL;
+ tre_submatch_data_t *submatch_data;
+ tre_tag_direction_t *tag_directions = NULL;
+ reg_errcode_t errcode;
+ tre_mem_t mem;
+
+ /* Parse context. */
+ tre_parse_ctx_t parse_ctx;
+
+ /* Allocate a stack used throughout the compilation process for various
+ purposes. */
+ stack = tre_stack_new(512, 10240, 128);
+ if (!stack)
+ return REG_ESPACE;
+ /* Allocate a fast memory allocator. */
+ mem = tre_mem_new();
+ if (!mem)
+ {
+ tre_stack_destroy(stack);
+ return REG_ESPACE;
+ }
+
+ /* Parse the regexp. */
+ memset(&parse_ctx, 0, sizeof(parse_ctx));
+ parse_ctx.mem = mem;
+ parse_ctx.stack = stack;
+ parse_ctx.re = regex;
+ parse_ctx.len = n;
+ parse_ctx.cflags = cflags;
+ parse_ctx.max_backref = -1;
+ DPRINT(("tre_compile: parsing '%.*" STRF "'\n", n, regex));
+ errcode = tre_parse(&parse_ctx);
+ if (errcode != REG_OK)
+ ERROR_EXIT(errcode);
+ preg->re_nsub = parse_ctx.submatch_id - 1;
+ tree = parse_ctx.result;
+
+#ifdef TRE_DEBUG
+ tre_ast_print(tree);
+#endif /* TRE_DEBUG */
+
+ /* Referring to nonexistent subexpressions is illegal. */
+ if (parse_ctx.max_backref > (int)preg->re_nsub)
+ ERROR_EXIT(REG_ESUBREG);
+
+ /* Allocate the TNFA struct. */
+ tnfa = xcalloc(1, sizeof(tre_tnfa_t));
+ if (tnfa == NULL)
+ ERROR_EXIT(REG_ESPACE);
+ tnfa->have_backrefs = parse_ctx.max_backref >= 0;
+ tnfa->num_submatches = parse_ctx.submatch_id;
+
+ /* Set up tags for submatch addressing. If REG_NOSUB is set and the
+ regexp does not have back references, this can be skipped. */
+ if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
+ {
+ DPRINT(("tre_compile: setting up tags\n"));
+
+ /* Figure out how many tags we will need. */
+ errcode = tre_add_tags(NULL, stack, tree, tnfa);
+ if (errcode != REG_OK)
+ ERROR_EXIT(errcode);
+#ifdef TRE_DEBUG
+ tre_ast_print(tree);
+#endif /* TRE_DEBUG */
+
+ if (tnfa->num_tags > 0)
+ {
+ tag_directions = xmalloc(sizeof(*tag_directions)
+ * (tnfa->num_tags + 1));
+ if (tag_directions == NULL)
+ ERROR_EXIT(REG_ESPACE);
+ tnfa->tag_directions = tag_directions;
+ memset(tag_directions, -1,
+ sizeof(*tag_directions) * (tnfa->num_tags + 1));
+ }
+
+ submatch_data = xcalloc(parse_ctx.submatch_id, sizeof(*submatch_data));
+ if (submatch_data == NULL)
+ ERROR_EXIT(REG_ESPACE);
+ tnfa->submatch_data = submatch_data;
+
+ errcode = tre_add_tags(mem, stack, tree, tnfa);
+ if (errcode != REG_OK)
+ ERROR_EXIT(errcode);
+
+#ifdef TRE_DEBUG
+ for (i = 0; i < parse_ctx.submatch_id; i++)
+ DPRINT(("pmatch[%d] = {t%d, t%d}\n",
+ i, submatch_data[i].so_tag, submatch_data[i].eo_tag));
+ for (i = 0; i < tnfa->num_tags; i++)
+ DPRINT(("t%d is %s\n", i,
+ tag_directions[i] == TRE_TAG_MINIMIZE ?
+ "minimized" : "maximized"));
+#endif /* TRE_DEBUG */
+ }
+
+ /* Expand iteration nodes. */
+ errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
+ tag_directions, NULL);
+ if (errcode != REG_OK)
+ ERROR_EXIT(errcode);
+
+ /* Add a dummy node for the final state.
+ XXX - For certain patterns this dummy node can be optimized away,
+ for example "a*" or "ab*". Figure out a simple way to detect
+ this possibility. */
+ tmp_ast_l = tree;
+ tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
+ if (tmp_ast_r == NULL)
+ ERROR_EXIT(REG_ESPACE);
+
+ tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
+ if (tree == NULL)
+ ERROR_EXIT(REG_ESPACE);
+
+#ifdef TRE_DEBUG
+ tre_ast_print(tree);
+ DPRINT(("Number of states: %d\n", parse_ctx.position));
+#endif /* TRE_DEBUG */
+
+ errcode = tre_compute_nfl(mem, stack, tree);
+ if (errcode != REG_OK)
+ ERROR_EXIT(errcode);
+
+ counts = xmalloc(sizeof(int) * parse_ctx.position);
+ if (counts == NULL)
+ ERROR_EXIT(REG_ESPACE);
+
+ offs = xmalloc(sizeof(int) * parse_ctx.position);
+ if (offs == NULL)
+ ERROR_EXIT(REG_ESPACE);
+
+ for (i = 0; i < parse_ctx.position; i++)
+ counts[i] = 0;
+ tre_ast_to_tnfa(tree, NULL, counts, NULL);
+
+ add = 0;
+ for (i = 0; i < parse_ctx.position; i++)
+ {
+ offs[i] = add;
+ add += counts[i] + 1;
+ counts[i] = 0;
+ }
+ transitions = xcalloc(add + 1, sizeof(*transitions));
+ if (transitions == NULL)
+ ERROR_EXIT(REG_ESPACE);
+ tnfa->transitions = transitions;
+ tnfa->num_transitions = add;
+
+ DPRINT(("Converting to TNFA:\n"));
+ errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
+ if (errcode != REG_OK)
+ ERROR_EXIT(errcode);
+
+ p = tree->firstpos;
+ i = 0;
+ while (p->position >= 0)
+ {
+ i++;
+
+#ifdef TRE_DEBUG
+ {
+ int *tags;
+ DPRINT(("initial: %d", p->position));
+ tags = p->tags;
+ if (tags != NULL)
+ {
+ if (*tags >= 0)
+ DPRINT(("/"));
+ while (*tags >= 0)
+ {
+ DPRINT(("%d", *tags));
+ tags++;
+ if (*tags >= 0)
+ DPRINT((","));
+ }
+ }
+ DPRINT((", assert %d", p->assertions));
+ DPRINT(("\n"));
+ }
+#endif /* TRE_DEBUG */
+
+ p++;
+ }
+
+ initial = xcalloc(i + 1, sizeof(tre_tnfa_transition_t));
+ if (initial == NULL)
+ ERROR_EXIT(REG_ESPACE);
+ tnfa->initial = initial;
+
+ i = 0;
+ for (p = tree->firstpos; p->position >= 0; p++)
+ {
+ initial[i].state = transitions + offs[p->position];
+ initial[i].state_id = p->position;
+ initial[i].tags = NULL;
+ /* Copy the arrays p->tags, they are allocated
+ from a tre_mem object. */
+ if (p->tags)
+ {
+ int j;
+ for (j = 0; p->tags[j] >= 0; j++);
+ initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
+ if (!initial[i].tags)
+ ERROR_EXIT(REG_ESPACE);
+ memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
+ }
+ initial[i].assertions = p->assertions;
+ i++;
+ }
+ initial[i].state = NULL;
+
+ tnfa->num_transitions = add;
+ tnfa->final = transitions + offs[tree->lastpos[0].position];
+ tnfa->num_states = parse_ctx.position;
+ tnfa->cflags = cflags;
+
+ DPRINT(("final state %p\n", (void *)tnfa->final));
+
+ tre_mem_destroy(mem);
+ tre_stack_destroy(stack);
+ xfree(counts);
+ xfree(offs);
+
+ preg->TRE_REGEX_T_FIELD = (void *)tnfa;
+ return REG_OK;
+
+ error_exit:
+ /* Free everything that was allocated and return the error code. */
+ tre_mem_destroy(mem);
+ if (stack != NULL)
+ tre_stack_destroy(stack);
+ if (counts != NULL)
+ xfree(counts);
+ if (offs != NULL)
+ xfree(offs);
+ preg->TRE_REGEX_T_FIELD = (void *)tnfa;
+ tre_free(preg);
+ return errcode;
+}
+
+
+/***********************************************************************
+ from regcomp.c
+***********************************************************************/
+
+int
+regcomp(regex_t *preg, const char *regex, int cflags)
+{
+ int ret;
+ tre_char_t *wregex;
+ size_t n = strlen(regex);
+
+ if (n+1 > SIZE_MAX/sizeof(tre_char_t))
+ return REG_ESPACE;
+ wregex = xmalloc(sizeof(tre_char_t) * (n + 1));
+ if (wregex == NULL)
+ return REG_ESPACE;
+
+ n = mbstowcs(wregex, regex, n+1);
+ if (n == (size_t)-1) {
+ xfree(wregex);
+ return REG_BADPAT;
+ }
+
+ ret = tre_compile(preg, wregex, n, cflags);
+ xfree(wregex);
+
+ return ret;
+}
+
+void
+regfree(regex_t *preg)
+{
+ tre_free(preg);
+}
+
+/* EOF */