diff options
| author | Rich Felker <dalias@aerifal.cx> | 2011-02-12 00:22:29 -0500 | 
|---|---|---|
| committer | Rich Felker <dalias@aerifal.cx> | 2011-02-12 00:22:29 -0500 | 
| commit | 0b44a0315b47dd8eced9f3b7f31580cf14bbfc01 (patch) | |
| tree | 6eaef0d8a720fa3da580de87b647fff796fe80b3 /src/regex | |
| download | musl-0b44a0315b47dd8eced9f3b7f31580cf14bbfc01.tar.gz | |
initial check-in, version 0.5.0v0.5.0
Diffstat (limited to 'src/regex')
| -rw-r--r-- | src/regex/fnmatch.c | 150 | ||||
| -rw-r--r-- | src/regex/glob.c | 238 | ||||
| -rw-r--r-- | src/regex/regcomp.c | 3362 | ||||
| -rw-r--r-- | src/regex/regerror.c | 75 | ||||
| -rw-r--r-- | src/regex/regexec.c | 1107 | ||||
| -rw-r--r-- | src/regex/tre-mem.c | 163 | ||||
| -rw-r--r-- | src/regex/tre.h | 269 | 
7 files changed, 5364 insertions, 0 deletions
| diff --git a/src/regex/fnmatch.c b/src/regex/fnmatch.c new file mode 100644 index 00000000..5f2fccdb --- /dev/null +++ b/src/regex/fnmatch.c @@ -0,0 +1,150 @@ +#include <fnmatch.h> +#include <wctype.h> +#include <string.h> +#include <wchar.h> +#include <stdlib.h> +#include <limits.h> + +static int next(const char **s) +{ +	wchar_t c; +	int l = mbtowc(&c, *s, MB_LEN_MAX); +	/* hack to allow literal matches of invalid byte sequences */ +	if (l < 0) return (unsigned char)*(*s)++ - 0x100; +	*s += l; +	return c; +} + +#define BRACKET_ERROR  -0x100 +#define BRACKET_NOCHAR -0x101 + +static int bracket_next(const char **s) +{ +	int c; +	int type; +	if (**s == '[') { +		type = *(*s+1); +		if (type == '.' || type == '=') { +			*s += 2; +			c = next(s); +			if (c <= 0) return BRACKET_ERROR; +			if (**s == type && *(*s+1) == ']') { +				*s += 2; +				return c; +			} +			for (; **s && (**s != type || *(*s+1) != ']'); (*s)++); +			if (!**s) return BRACKET_ERROR; +			*s += 2; +			return BRACKET_NOCHAR; +		} +	} +	c = next(s); +	if (c <= 0) return BRACKET_ERROR; +	return c; +} + +#define __FNM_CONT 0x8000 + +int fnmatch(const char *p, const char *s, int flags) +{ +	int c, d, k; +	int not; +	int match; +	int first; +	int no_slash = (flags & FNM_PATHNAME) ? '/' : 0; +	int no_period = (flags & FNM_PERIOD) && !(flags & __FNM_CONT) ? '.' : 0x100; + +	flags |= __FNM_CONT; + +	while ((c = *p++)) { +		switch (c) { +		case '?': +			k = next(&s); +			if (!k || k == no_period || k == no_slash) +				return FNM_NOMATCH; +			break; +		case '\\': +			if (!(flags & FNM_NOESCAPE)) { +				c = *p++; +				goto literal; +			} +			if (*s++ != c) return FNM_NOMATCH; +			break; +		case '*': +			for (; *p == '*'; p++); +			if (*p && !*s) return FNM_NOMATCH; +			if (*s == no_period) +				return FNM_NOMATCH; +			if (!*p && (!no_slash || !strchr(s, no_slash))) +				return 0; +			for (; *s; s++) +				if (!fnmatch(p, s, flags)) +					return 0; +				else if (*s == no_slash) +					break; +			return FNM_NOMATCH; +		case '[': +			not = (*p == '!' || *p == '^'); +			if (not) p++; +			k = next(&s); +			if (!k || k == no_slash || k == no_period) +				return FNM_NOMATCH; +			match = 0; +			first = 1; +			for (;;) { +				if (!*p) return FNM_NOMATCH; +				if (*p == ']' && !first) break; +				first = 0; +				if (*p == '[' && *(p+1) == ':') { +					const char *z; +					p += 2; +					for (z=p; *z && (*z != ':' || *(z+1) != ']'); z++); +					if (!*z || z-p > 32) { /* FIXME: symbolic const? */ +						return FNM_NOMATCH; +					} else { +						char class[z-p+1]; +						memcpy(class, p, z-p); +						class[z-p] = 0; +						if (iswctype(k, wctype(class))) +							match = 1; +					} +					p = z+2; +					continue; +				} +				c = bracket_next(&p); +				if (c == BRACKET_ERROR) +					return FNM_NOMATCH; +				if (c == BRACKET_NOCHAR) +					continue; +				if (*p == '-' && *(p+1) != ']') { +					p++; +					d = bracket_next(&p); +					if (d == BRACKET_ERROR) +						return FNM_NOMATCH; +					if (d == BRACKET_NOCHAR) +						continue; +					if (k >= c && k <= d) +						match = 1; +					continue; +				} +				if (k == c) match = 1; +			} +			p++; +			if (not == match) +				return FNM_NOMATCH; +			break; +		default: +		literal: +			if (*s++ != c) +				return FNM_NOMATCH; +			if (c == no_slash && (flags & FNM_PERIOD)) { +				no_period = '.'; +				continue; +			} +			break; +		} +		no_period = 0x100; +	} +	if (*s) return FNM_NOMATCH; +	return 0; +} diff --git a/src/regex/glob.c b/src/regex/glob.c new file mode 100644 index 00000000..9a70f0bc --- /dev/null +++ b/src/regex/glob.c @@ -0,0 +1,238 @@ +#include <glob.h> +#include <fnmatch.h> +#include <sys/stat.h> +#include <dirent.h> +#include <limits.h> +#include <string.h> +#include <stdlib.h> +#include <errno.h> +#include <stddef.h> +#include <unistd.h> +#include <stdio.h> +#include "libc.h" + +struct match +{ +	struct match *next; +	char name[1]; +}; + +static int is_literal(const char *p, int useesc) +{ +	int bracket = 0; +	for (; *p; p++) { +		switch (*p) { +		case '\\': +			if (!useesc) break; +		case '?': +		case '*': +			return 0; +		case '[': +			bracket = 1; +			break; +		case ']': +			if (bracket) return 0; +			break; +		} +	} +	return 1; +} + +static int append(struct match **tail, const char *name, size_t len, int mark) +{ +	struct match *new = malloc(sizeof(struct match) + len + 1); +	if (!new) return -1; +	(*tail)->next = new; +	new->next = NULL; +	strcpy(new->name, name); +	if (mark) strcat(new->name, "/"); +	*tail = new; +	return 0; +} + +static int match_in_dir(const char *d, const char *p, int flags, int (*errfunc)(const char *path, int err), struct match **tail) +{ +	DIR *dir; +	long long de_buf[(sizeof(struct dirent) + NAME_MAX + sizeof(long long))/sizeof(long long)]; +	struct dirent *de; +	char pat[strlen(p)+1]; +	char *p2; +	size_t l = strlen(d); +	int literal; +	int fnm_flags= ((flags & GLOB_NOESCAPE) ? FNM_NOESCAPE : 0) | FNM_PERIOD; +	int error; + +	if ((p2 = strchr(p, '/'))) { +		strcpy(pat, p); +		pat[p2-p] = 0; +		for (; *p2 == '/'; p2++); +		p = pat; +	} +	literal = is_literal(p, !(flags & GLOB_NOESCAPE)); +	if (*d == '/' && !*(d+1)) l = 0; + +	/* rely on opendir failing for nondirectory objects */ +	dir = opendir(*d ? d : "."); +	error = errno; +	if (!dir) { +		/* this is not an error -- we let opendir call stat for us */ +		if (error == ENOTDIR) return 0; +		if (error == EACCES && !*p) { +			struct stat st; +			if (!stat(d, &st) && S_ISDIR(st.st_mode)) { +				if (append(tail, d, l, l)) +					return GLOB_NOSPACE; +				return 0; +			} +		} +		if (errfunc(d, error) || (flags & GLOB_ERR)) +			return GLOB_ABORTED; +		return 0; +	} +	if (!*p) { +		error = append(tail, d, l, l) ? GLOB_NOSPACE : 0; +		closedir(dir); +		return error; +	} +	while (!(error = readdir_r(dir, (void *)de_buf, &de)) && de) { +		char namebuf[l+de->d_reclen+2], *name = namebuf; +		if (!literal && fnmatch(p, de->d_name, fnm_flags)) +			continue; +		if (literal && strcmp(p, de->d_name)) +			continue; +		if (p2 && de->d_type && !S_ISDIR(de->d_type<<12) && !S_ISLNK(de->d_type<<12)) +			continue; +		if (*d) { +			memcpy(name, d, l); +			name[l] = '/'; +			strcpy(name+l+1, de->d_name); +		} else { +			name = de->d_name; +		} +		if (p2) { +			if ((error = match_in_dir(name, p2, flags, errfunc, tail))) { +				closedir(dir); +				return error; +			} +		} else { +			int mark = 0; +			if (flags & GLOB_MARK) { +				if (de->d_type) +					mark = S_ISDIR(de->d_type<<12); +				else { +					struct stat st; +					stat(name, &st); +					mark = S_ISDIR(st.st_mode); +				} +			} +			if (append(tail, name, l+de->d_reclen+1, mark)) { +				closedir(dir); +				return GLOB_NOSPACE; +			} +		} +	} +	closedir(dir); +	if (error && (errfunc(d, error) || (flags & GLOB_ERR))) +		return GLOB_ABORTED; +	return 0; +} + +static int ignore_err(const char *path, int err) +{ +	return 0; +} + +static void freelist(struct match *head) +{ +	struct match *match, *next; +	for (match=head->next; match; match=next) { +		next = match->next; +		free(match); +	} +} + +static int sort(const void *a, const void *b) +{ +	return strcmp(*(const char **)a, *(const char **)b); +} + +int glob(const char *pat, int flags, int (*errfunc)(const char *path, int err), glob_t *g) +{ +	const char *p=pat, *d; +	struct match head = { .next = NULL }, *tail = &head; +	size_t cnt, i; +	size_t offs = (flags & GLOB_DOOFFS) ? g->gl_offs : 0; +	int error = 0; +	 +	if (*p == '/') { +		for (; *p == '/'; p++); +		d = "/"; +	} else { +		d = ""; +	} + +	if (!errfunc) errfunc = ignore_err; + +	if (!(flags & GLOB_APPEND)) { +		g->gl_offs = offs; +		g->gl_pathc = 0; +		g->gl_pathv = NULL; +	} + +	if (*p) error = match_in_dir(d, p, flags, errfunc, &tail); +	if (error == GLOB_NOSPACE) { +		freelist(&head); +		return error; +	} +	 +	for (cnt=0, tail=head.next; tail; tail=tail->next, cnt++); +	if (!cnt) { +		if (flags & GLOB_NOCHECK) { +			tail = &head; +			if (append(&tail, pat, strlen(pat), 0)) +				return GLOB_NOSPACE; +			cnt++; +		} else +			return GLOB_NOMATCH; +	} + +	if (flags & GLOB_APPEND) { +		char **pathv = realloc(g->gl_pathv, (offs + g->gl_pathc + cnt + 1) * sizeof(char *)); +		if (!pathv) { +			freelist(&head); +			return GLOB_NOSPACE; +		} +		g->gl_pathv = pathv; +		offs += g->gl_pathc; +	} else { +		g->gl_pathv = malloc((offs + cnt + 1) * sizeof(char *)); +		if (!g->gl_pathv) { +			freelist(&head); +			return GLOB_NOSPACE; +		} +		for (i=0; i<offs; i++) +			g->gl_pathv[i] = NULL; +	} +	for (i=0, tail=head.next; i<cnt; tail=tail->next, i++) +		g->gl_pathv[offs + i] = tail->name; +	g->gl_pathv[offs + i] = NULL; +	g->gl_pathc += cnt; + +	if (!(flags & GLOB_NOSORT)) +		qsort(g->gl_pathv+offs, cnt, sizeof(char *), sort); +	 +	return error; +} + +void globfree(glob_t *g) +{ +	size_t i; +	for (i=0; i<g->gl_pathc; i++) +		free(g->gl_pathv[g->gl_offs + i] - offsetof(struct match, name)); +	free(g->gl_pathv); +	g->gl_pathc = 0; +	g->gl_pathv = NULL; +} + +LFS64(glob); +LFS64(globfree); diff --git a/src/regex/regcomp.c b/src/regex/regcomp.c new file mode 100644 index 00000000..3307942e --- /dev/null +++ b/src/regex/regcomp.c @@ -0,0 +1,3362 @@ +/* +  regcomp.c - TRE POSIX compatible regex compilation functions. + +  Copyright (c) 2001-2006 Ville Laurikari <vl@iki.fi> + +  This library is free software; you can redistribute it and/or +  modify it under the terms of the GNU Lesser General Public +  License as published by the Free Software Foundation; either +  version 2.1 of the License, or (at your option) any later version. + +  This library is distributed in the hope that it will be useful, +  but WITHOUT ANY WARRANTY; without even the implied warranty of +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +  Lesser General Public License for more details. + +  You should have received a copy of the GNU Lesser General Public +  License along with this library; if not, write to the Free Software +  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA + +*/ + +#include <string.h> +#include <errno.h> +#include <stdlib.h> +#include <regex.h> +#include <limits.h> +#include <stdint.h> + +#include "tre.h" + +#include <assert.h> + +/*********************************************************************** + from tre-ast.c and tre-ast.h +***********************************************************************/ + +/* The different AST node types. */ +typedef enum { +  LITERAL, +  CATENATION, +  ITERATION, +  UNION +} tre_ast_type_t; + +/* Special subtypes of TRE_LITERAL. */ +#define EMPTY	  -1   /* Empty leaf (denotes empty string). */ +#define ASSERTION -2   /* Assertion leaf. */ +#define TAG	  -3   /* Tag leaf. */ +#define BACKREF	  -4   /* Back reference leaf. */ + +#define IS_SPECIAL(x)	((x)->code_min < 0) +#define IS_EMPTY(x)	((x)->code_min == EMPTY) +#define IS_ASSERTION(x) ((x)->code_min == ASSERTION) +#define IS_TAG(x)	((x)->code_min == TAG) +#define IS_BACKREF(x)	((x)->code_min == BACKREF) + +/* Taken from tre-compile.h */ +typedef struct { +  int position; +  int code_min; +  int code_max; +  int *tags; +  int assertions; +  tre_ctype_t class; +  tre_ctype_t *neg_classes; +  int backref; +} tre_pos_and_tags_t; + +/* A generic AST node.  All AST nodes consist of this node on the top +   level with `obj' pointing to the actual content. */ +typedef struct { +  tre_ast_type_t type;   /* Type of the node. */ +  void *obj;             /* Pointer to actual node. */ +  int nullable; +  int submatch_id; +  int num_submatches; +  int num_tags; +  tre_pos_and_tags_t *firstpos; +  tre_pos_and_tags_t *lastpos; +} tre_ast_node_t; + + +/* A "literal" node.  These are created for assertions, back references, +   tags, matching parameter settings, and all expressions that match one +   character. */ +typedef struct { +  long code_min; +  long code_max; +  int position; +  tre_ctype_t class; +  tre_ctype_t *neg_classes; +} tre_literal_t; + +/* A "catenation" node.	 These are created when two regexps are concatenated. +   If there are more than one subexpressions in sequence, the `left' part +   holds all but the last, and `right' part holds the last subexpression +   (catenation is left associative). */ +typedef struct { +  tre_ast_node_t *left; +  tre_ast_node_t *right; +} tre_catenation_t; + +/* An "iteration" node.	 These are created for the "*", "+", "?", and "{m,n}" +   operators. */ +typedef struct { +  /* Subexpression to match. */ +  tre_ast_node_t *arg; +  /* Minimum number of consecutive matches. */ +  int min; +  /* Maximum number of consecutive matches. */ +  int max; +} tre_iteration_t; + +/* An "union" node.  These are created for the "|" operator. */ +typedef struct { +  tre_ast_node_t *left; +  tre_ast_node_t *right; +} tre_union_t; + +static tre_ast_node_t * +tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size) +{ +  tre_ast_node_t *node; + +  node = tre_mem_calloc(mem, sizeof(*node)); +  if (!node) +    return NULL; +  node->obj = tre_mem_calloc(mem, size); +  if (!node->obj) +    return NULL; +  node->type = type; +  node->nullable = -1; +  node->submatch_id = -1; + +  return node; +} + +static tre_ast_node_t * +tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position) +{ +  tre_ast_node_t *node; +  tre_literal_t *lit; + +  node = tre_ast_new_node(mem, LITERAL, sizeof(tre_literal_t)); +  if (!node) +    return NULL; +  lit = node->obj; +  lit->code_min = code_min; +  lit->code_max = code_max; +  lit->position = position; + +  return node; +} + +static tre_ast_node_t * +tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max) +{ +  tre_ast_node_t *node; +  tre_iteration_t *iter; + +  node = tre_ast_new_node(mem, ITERATION, sizeof(tre_iteration_t)); +  if (!node) +    return NULL; +  iter = node->obj; +  iter->arg = arg; +  iter->min = min; +  iter->max = max; +  node->num_submatches = arg->num_submatches; + +  return node; +} + +static tre_ast_node_t * +tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right) +{ +  tre_ast_node_t *node; + +  node = tre_ast_new_node(mem, UNION, sizeof(tre_union_t)); +  if (node == NULL) +    return NULL; +  ((tre_union_t *)node->obj)->left = left; +  ((tre_union_t *)node->obj)->right = right; +  node->num_submatches = left->num_submatches + right->num_submatches; + +  return node; +} + +static tre_ast_node_t * +tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, +		       tre_ast_node_t *right) +{ +  tre_ast_node_t *node; + +  node = tre_ast_new_node(mem, CATENATION, sizeof(tre_catenation_t)); +  if (node == NULL) +    return NULL; +  ((tre_catenation_t *)node->obj)->left = left; +  ((tre_catenation_t *)node->obj)->right = right; +  node->num_submatches = left->num_submatches + right->num_submatches; + +  return node; +} + +/*********************************************************************** + from tre-stack.c and tre-stack.h +***********************************************************************/ + +/* Just to save some typing. */ +#define STACK_PUSH(s, value)						      \ +  do									      \ +    {									      \ +      status = tre_stack_push(s, (void *)(value));			      \ +    }									      \ +  while (0) + +#define STACK_PUSHX(s, value)						      \ +  {									      \ +    status = tre_stack_push(s, (void *)(value));			      \ +    if (status != REG_OK)						      \ +      break;								      \ +  } + +#define STACK_PUSHR(s, value)						      \ +  {									      \ +    reg_errcode_t status;						      \ +    status = tre_stack_push(s, (void *)(value));			      \ +    if (status != REG_OK)						      \ +      return status;							      \ +  } + +typedef struct tre_stack_rec { +  int size; +  int max_size; +  int increment; +  int ptr; +  void **stack; +} tre_stack_t; + + +static tre_stack_t * +tre_stack_new(int size, int max_size, int increment) +{ +  tre_stack_t *s; + +  s = xmalloc(sizeof(*s)); +  if (s != NULL) +    { +      s->stack = xmalloc(sizeof(*s->stack) * size); +      if (s->stack == NULL) +	{ +	  xfree(s); +	  return NULL; +	} +      s->size = size; +      s->max_size = max_size; +      s->increment = increment; +      s->ptr = 0; +    } +  return s; +} + +static void +tre_stack_destroy(tre_stack_t *s) +{ +  xfree(s->stack); +  xfree(s); +} + +static int +tre_stack_num_objects(tre_stack_t *s) +{ +  return s->ptr; +} + +static reg_errcode_t +tre_stack_push(tre_stack_t *s, void *value) +{ +  if (s->ptr < s->size) +    { +      s->stack[s->ptr] = value; +      s->ptr++; +    } +  else +    { +      if (s->size >= s->max_size) +	{ +	  DPRINT(("tre_stack_push: stack full\n")); +	  return REG_ESPACE; +	} +      else +	{ +	  void **new_buffer; +	  int new_size; +	  DPRINT(("tre_stack_push: trying to realloc more space\n")); +	  new_size = s->size + s->increment; +	  if (new_size > s->max_size) +	    new_size = s->max_size; +	  new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size); +	  if (new_buffer == NULL) +	    { +	      DPRINT(("tre_stack_push: realloc failed.\n")); +	      return REG_ESPACE; +	    } +	  DPRINT(("tre_stack_push: realloc succeeded.\n")); +	  assert(new_size > s->size); +	  s->size = new_size; +	  s->stack = new_buffer; +	  tre_stack_push(s, value); +	} +    } +  return REG_OK; +} + +static void * +tre_stack_pop(tre_stack_t *s) +{ +  return s->stack[--s->ptr]; +} + + +/*********************************************************************** + from tre-parse.c and tre-parse.h +***********************************************************************/ + +/* Parse context. */ +typedef struct { +  /* Memory allocator.	The AST is allocated using this. */ +  tre_mem_t mem; +  /* Stack used for keeping track of regexp syntax. */ +  tre_stack_t *stack; +  /* The parse result. */ +  tre_ast_node_t *result; +  /* The regexp to parse and its length. */ +  const tre_char_t *re; +  /* The first character of the entire regexp. */ +  const tre_char_t *re_start; +  /* The first character after the end of the regexp. */ +  const tre_char_t *re_end; +  int len; +  /* Current submatch ID. */ +  int submatch_id; +  /* Current position (number of literal). */ +  int position; +  /* The highest back reference or -1 if none seen so far. */ +  int max_backref; +  /* Compilation flags. */ +  int cflags; +  /* If this flag is set the top-level submatch is not captured. */ +  int nofirstsub; +} tre_parse_ctx_t; + +static reg_errcode_t +tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i, +	 tre_ast_node_t ***items) +{ +  reg_errcode_t status; +  tre_ast_node_t **array = *items; +  /* Allocate more space if necessary. */ +  if (*i >= *max_i) +    { +      tre_ast_node_t **new_items; +      DPRINT(("out of array space, i = %d\n", *i)); +      /* If the array is already 1024 items large, give up -- there's +	 probably an error in the regexp (e.g. not a '\0' terminated +	 string and missing ']') */ +      if (*max_i > 1024) +	return REG_ESPACE; +      *max_i *= 2; +      new_items = xrealloc(array, sizeof(*items) * *max_i); +      if (new_items == NULL) +	return REG_ESPACE; +      *items = array = new_items; +    } +  array[*i] = tre_ast_new_literal(mem, min, max, -1); +  status = array[*i] == NULL ? REG_ESPACE : REG_OK; +  (*i)++; +  return status; +} + + +/* Expands a character class to character ranges. */ +static reg_errcode_t +tre_expand_ctype(tre_mem_t mem, tre_ctype_t class, tre_ast_node_t ***items, +		 int *i, int *max_i, int cflags) +{ +  reg_errcode_t status = REG_OK; +  tre_cint_t c; +  int j, min = -1, max = 0; +  assert(TRE_MB_CUR_MAX == 1); + +  DPRINT(("  expanding class to character ranges\n")); +  for (j = 0; (j < 256) && (status == REG_OK); j++) +    { +      c = j; +      if (tre_isctype(c, class) +	  || ((cflags & REG_ICASE) +	      && (tre_isctype(tre_tolower(c), class) +		  || tre_isctype(tre_toupper(c), class)))) +{ +	  if (min < 0) +	    min = c; +	  max = c; +	} +      else if (min >= 0) +	{ +	  DPRINT(("  range %c (%d) to %c (%d)\n", min, min, max, max)); +	  status = tre_new_item(mem, min, max, i, max_i, items); +	  min = -1; +	} +    } +  if (min >= 0 && status == REG_OK) +    status = tre_new_item(mem, min, max, i, max_i, items); +  return status; +} + + +static int +tre_compare_items(const void *a, const void *b) +{ +  tre_ast_node_t *node_a = *(tre_ast_node_t **)a; +  tre_ast_node_t *node_b = *(tre_ast_node_t **)b; +  tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj; +  int a_min = l_a->code_min, b_min = l_b->code_min; + +  if (a_min < b_min) +    return -1; +  else if (a_min > b_min) +    return 1; +  else +    return 0; +} + +/* Maximum number of character classes that can occur in a negated bracket +   expression.	*/ +#define MAX_NEG_CLASSES 64 + +/* Maximum length of character class names. */ +#define MAX_CLASS_NAME + +static reg_errcode_t +tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate, +			tre_ctype_t neg_classes[], int *num_neg_classes, +			tre_ast_node_t ***items, int *num_items, +			int *items_size) +{ +  const tre_char_t *re = ctx->re; +  reg_errcode_t status = REG_OK; +  tre_ctype_t class = (tre_ctype_t)0; +  int i = *num_items; +  int max_i = *items_size; +  int skip; + +  /* Build an array of the items in the bracket expression. */ +  while (status == REG_OK) +    { +      skip = 0; +      if (re == ctx->re_end) +	{ +	  status = REG_EBRACK; +	} +      else if (*re == ']' && re > ctx->re) +	{ +	  DPRINT(("tre_parse_bracket:	done: '%.*" STRF "'\n", +		  ctx->re_end - re, re)); +	  re++; +	  break; +	} +      else +	{ +	  tre_cint_t min = 0, max = 0; + +	  class = (tre_ctype_t)0; +	  if (re + 2 < ctx->re_end +	      && *(re + 1) == '-' && *(re + 2) != ']') +	    { +	      DPRINT(("tre_parse_bracket:  range: '%.*" STRF "'\n", +		      ctx->re_end - re, re)); +	      min = *re; +	      max = *(re + 2); +	      re += 3; +	      /* XXX - Should use collation order instead of encoding values +		 in character ranges. */ +	      if (min > max) +		status = REG_ERANGE; +	    } +	  else if (re + 1 < ctx->re_end +		   && *re == '[' && *(re + 1) == '.') +	    status = REG_ECOLLATE; +	  else if (re + 1 < ctx->re_end +		   && *re == '[' && *(re + 1) == '=') +	    status = REG_ECOLLATE; +	  else if (re + 1 < ctx->re_end +		   && *re == '[' && *(re + 1) == ':') +	    { +	      char tmp_str[64]; +	      const tre_char_t *endptr = re + 2; +	      int len; +	      DPRINT(("tre_parse_bracket:  class: '%.*" STRF "'\n", +		      ctx->re_end - re, re)); +	      while (endptr < ctx->re_end && *endptr != ':') +		endptr++; +	      if (endptr != ctx->re_end) +		{ +		  len = MIN(endptr - re - 2, 63); +#ifdef TRE_WCHAR +		  { +		    tre_char_t tmp_wcs[64]; +		    wcsncpy(tmp_wcs, re + 2, len); +		    tmp_wcs[len] = '\0'; +#if defined HAVE_WCSRTOMBS +		    { +		      mbstate_t state; +		      const tre_char_t *src = tmp_wcs; +		      memset(&state, '\0', sizeof(state)); +		      len = wcsrtombs(tmp_str, &src, sizeof(tmp_str), &state); +		    } +#elif defined HAVE_WCSTOMBS +		    len = wcstombs(tmp_str, tmp_wcs, 63); +#endif /* defined HAVE_WCSTOMBS */ +		  } +#else /* !TRE_WCHAR */ +		  strncpy(tmp_str, re + 2, len); +#endif /* !TRE_WCHAR */ +		  tmp_str[len] = '\0'; +		  DPRINT(("  class name: %s\n", tmp_str)); +		  class = tre_ctype(tmp_str); +		  if (!class) +		    status = REG_ECTYPE; +		  /* Optimize character classes for 8 bit character sets. */ +		  if (status == REG_OK && TRE_MB_CUR_MAX == 1) +		    { +		      status = tre_expand_ctype(ctx->mem, class, items, +						&i, &max_i, ctx->cflags); +		      class = (tre_ctype_t)0; +		      skip = 1; +		    } +		  re = endptr + 2; +		} +	      else +		status = REG_ECTYPE; +	      min = 0; +	      max = TRE_CHAR_MAX; +	    } +	  else +	    { +	      DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", +		      ctx->re_end - re, re)); +	      if (*re == '-' && *(re + 1) != ']' +		  && ctx->re != re) +		/* Two ranges are not allowed to share and endpoint. */ +		status = REG_ERANGE; +	      min = max = *re++; +	    } + +	  if (status != REG_OK) +	    break; + +	  if (class && negate) +	    if (*num_neg_classes >= MAX_NEG_CLASSES) +	      status = REG_ESPACE; +	    else +	      neg_classes[(*num_neg_classes)++] = class; +	  else if (!skip) +	    { +	      status = tre_new_item(ctx->mem, min, max, &i, &max_i, items); +	      if (status != REG_OK) +		break; +	      ((tre_literal_t*)((*items)[i-1])->obj)->class = class; +	    } + +	  /* Add opposite-case counterpoints if REG_ICASE is present. +	     This is broken if there are more than two "same" characters. */ +	  if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip) +	    { +	      int cmin, ccurr; + +	      DPRINT(("adding opposite-case counterpoints\n")); +	      while (min <= max) +		{ +		  if (tre_islower(min)) +		    { +		      cmin = ccurr = tre_toupper(min++); +		      while (tre_islower(min) && tre_toupper(min) == ccurr + 1 +			     && min <= max) +			ccurr = tre_toupper(min++); +		      status = tre_new_item(ctx->mem, cmin, ccurr, +					    &i, &max_i, items); +		    } +		  else if (tre_isupper(min)) +		    { +		      cmin = ccurr = tre_tolower(min++); +		      while (tre_isupper(min) && tre_tolower(min) == ccurr + 1 +			     && min <= max) +			ccurr = tre_tolower(min++); +		      status = tre_new_item(ctx->mem, cmin, ccurr, +					    &i, &max_i, items); +		    } +		  else min++; +		  if (status != REG_OK) +		    break; +		} +	      if (status != REG_OK) +		break; +	    } +	} +    } +  *num_items = i; +  *items_size = max_i; +  ctx->re = re; +  return status; +} + +static reg_errcode_t +tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result) +{ +  tre_ast_node_t *node = NULL; +  int negate = 0; +  reg_errcode_t status = REG_OK; +  tre_ast_node_t **items, *u, *n; +  int i = 0, j, max_i = 32, curr_max, curr_min; +  tre_ctype_t neg_classes[MAX_NEG_CLASSES]; +  int num_neg_classes = 0; + +  /* Start off with an array of `max_i' elements. */ +  items = xmalloc(sizeof(*items) * max_i); +  if (items == NULL) +    return REG_ESPACE; + +  if (*ctx->re == '^') +    { +      DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", +	      ctx->re_end - ctx->re, ctx->re)); +      negate = 1; +      ctx->re++; +    } + +  status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes, +				   &items, &i, &max_i); + +  if (status != REG_OK) +    goto parse_bracket_done; + +  /* Sort the array if we need to negate it. */ +  if (negate) +    qsort(items, i, sizeof(*items), tre_compare_items); + +  curr_max = curr_min = 0; +  /* Build a union of the items in the array, negated if necessary. */ +  for (j = 0; j < i && status == REG_OK; j++) +    { +      int min, max; +      tre_literal_t *l = items[j]->obj; +      min = l->code_min; +      max = l->code_max; + +      DPRINT(("item: %d - %d, class %ld, curr_max = %d\n", +	      (int)l->code_min, (int)l->code_max, (long)l->class, curr_max)); + +      if (negate) +	{ +	  if (min < curr_max) +	    { +	      /* Overlap. */ +	      curr_max = MAX(max + 1, curr_max); +	      DPRINT(("overlap, curr_max = %d\n", curr_max)); +	      l = NULL; +	    } +	  else +	    { +	      /* No overlap. */ +	      curr_max = min - 1; +	      if (curr_max >= curr_min) +		{ +		  DPRINT(("no overlap\n")); +		  l->code_min = curr_min; +		  l->code_max = curr_max; +		} +	      else +		{ +		  DPRINT(("no overlap, zero room\n")); +		  l = NULL; +		} +	      curr_min = curr_max = max + 1; +	    } +	} + +      if (l != NULL) +	{ +	  int k; +	  DPRINT(("creating %d - %d\n", (int)l->code_min, (int)l->code_max)); +	  l->position = ctx->position; +	  if (num_neg_classes > 0) +	    { +	      l->neg_classes = tre_mem_alloc(ctx->mem, +					     (sizeof(l->neg_classes) +					      * (num_neg_classes + 1))); +	      if (l->neg_classes == NULL) +		{ +		  status = REG_ESPACE; +		  break; +		} +	      for (k = 0; k < num_neg_classes; k++) +		l->neg_classes[k] = neg_classes[k]; +	      l->neg_classes[k] = (tre_ctype_t)0; +	    } +	  else +	    l->neg_classes = NULL; +	  if (node == NULL) +	    node = items[j]; +	  else +	    { +	      u = tre_ast_new_union(ctx->mem, node, items[j]); +	      if (u == NULL) +		status = REG_ESPACE; +	      node = u; +	    } +	} +    } + +  if (status != REG_OK) +    goto parse_bracket_done; + +  if (negate) +    { +      int k; +      DPRINT(("final: creating %d - %d\n", curr_min, (int)TRE_CHAR_MAX)); +      n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position); +      if (n == NULL) +	status = REG_ESPACE; +      else +	{ +	  tre_literal_t *l = n->obj; +	  if (num_neg_classes > 0) +	    { +	      l->neg_classes = tre_mem_alloc(ctx->mem, +					     (sizeof(l->neg_classes) +					      * (num_neg_classes + 1))); +	      if (l->neg_classes == NULL) +		{ +		  status = REG_ESPACE; +		  goto parse_bracket_done; +		} +	      for (k = 0; k < num_neg_classes; k++) +		l->neg_classes[k] = neg_classes[k]; +	      l->neg_classes[k] = (tre_ctype_t)0; +	    } +	  else +	    l->neg_classes = NULL; +	  if (node == NULL) +	    node = n; +	  else +	    { +	      u = tre_ast_new_union(ctx->mem, node, n); +	      if (u == NULL) +		status = REG_ESPACE; +	      node = u; +	    } +	} +    } + +  if (status != REG_OK) +    goto parse_bracket_done; + +#ifdef TRE_DEBUG +  tre_ast_print(node); +#endif /* TRE_DEBUG */ + + parse_bracket_done: +  xfree(items); +  ctx->position++; +  *result = node; +  return status; +} + + +/* Parses a positive decimal integer.  Returns -1 if the string does not +   contain a valid number. */ +static int +tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end) +{ +  int num = -1; +  const tre_char_t *r = *regex; +  while (r < regex_end && *r >= '0' && *r <= '9') +    { +      if (num < 0) +	num = 0; +      num = num * 10 + *r - '0'; +      r++; +    } +  *regex = r; +  return num; +} + + +static reg_errcode_t +tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result) +{ +  int min, max; +  const tre_char_t *r = ctx->re; +  const tre_char_t *start; +  int counts_set = 0; + +  /* Parse number (minimum repetition count). */ +  min = -1; +  if (r < ctx->re_end && *r >= '0' && *r <= '9') { +    DPRINT(("tre_parse:	  min count: '%.*" STRF "'\n", ctx->re_end - r, r)); +    min = tre_parse_int(&r, ctx->re_end); +  } + +  /* Parse comma and second number (maximum repetition count). */ +  max = min; +  if (r < ctx->re_end && *r == ',') +    { +      r++; +      DPRINT(("tre_parse:   max count: '%.*" STRF "'\n", ctx->re_end - r, r)); +      max = tre_parse_int(&r, ctx->re_end); +    } + +  /* Check that the repeat counts are sane. */ +  if ((max >= 0 && min > max) || max > RE_DUP_MAX) +    return REG_BADBR; + + +  /* +   '{' +     optionally followed immediately by a number == minimum repcount +     optionally followed by , then a number == maximum repcount +  */ + + +  do { +    int done; +    start = r; + +    /* Parse count limit settings */ +    done = 0; +    if (!counts_set) +      while (r + 1 < ctx->re_end && !done) +	{ +	  switch (*r) +	    { +	    case ',': +	      r++; +	      break; +	    case ' ': +	      r++; +	      break; +	    case '}': +	      done = 1; +	      break; +	    default: +	      done = 1; +	      break; +	    } +	} +  } while (start != r); + +  /* Missing }. */ +  if (r >= ctx->re_end) +    return REG_EBRACE; + +  /* Empty contents of {}. */ +  if (r == ctx->re) +    return REG_BADBR; + +  /* Parse the ending '}' or '\}'.*/ +  if (ctx->cflags & REG_EXTENDED) +    { +      if (r >= ctx->re_end || *r != '}') +	return REG_BADBR; +      r++; +    } +  else +    { +      if (r + 1 >= ctx->re_end +	  || *r != '\\' +	  || *(r + 1) != '}') +	return REG_BADBR; +      r += 2; +    } + + +  /* Create the AST node(s). */ +  if (min == 0 && max == 0) +    { +      *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); +      if (*result == NULL) +	return REG_ESPACE; +    } +  else +    { +      if (min < 0 && max < 0) +	/* Only approximate parameters set, no repetitions. */ +	min = max = 1; + +      *result = tre_ast_new_iter(ctx->mem, *result, min, max); +      if (!*result) +	return REG_ESPACE; +    } + +  ctx->re = r; +  return REG_OK; +} + +typedef enum { +  PARSE_RE = 0, +  PARSE_ATOM, +  PARSE_MARK_FOR_SUBMATCH, +  PARSE_BRANCH, +  PARSE_PIECE, +  PARSE_CATENATION, +  PARSE_POST_CATENATION, +  PARSE_UNION, +  PARSE_POST_UNION, +  PARSE_POSTFIX, +  PARSE_RESTORE_CFLAGS +} tre_parse_re_stack_symbol_t; + + +static reg_errcode_t +tre_parse(tre_parse_ctx_t *ctx) +{ +  tre_ast_node_t *result = NULL; +  tre_parse_re_stack_symbol_t symbol; +  reg_errcode_t status = REG_OK; +  tre_stack_t *stack = ctx->stack; +  int bottom = tre_stack_num_objects(stack); +  int depth = 0; + +  DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d\n", +	  ctx->len, ctx->re, ctx->len)); + +  if (!ctx->nofirstsub) +    { +      STACK_PUSH(stack, ctx->re); +      STACK_PUSH(stack, ctx->submatch_id); +      STACK_PUSH(stack, PARSE_MARK_FOR_SUBMATCH); +      ctx->submatch_id++; +    } +  STACK_PUSH(stack, PARSE_RE); +  ctx->re_start = ctx->re; +  ctx->re_end = ctx->re + ctx->len; + + +  /* The following is basically just a recursive descent parser.  I use +     an explicit stack instead of recursive functions mostly because of +     two reasons: compatibility with systems which have an overflowable +     call stack, and efficiency (both in lines of code and speed).  */ +  while (tre_stack_num_objects(stack) > bottom && status == REG_OK) +    { +      if (status != REG_OK) +	break; +      symbol = (tre_parse_re_stack_symbol_t)tre_stack_pop(stack); +      switch (symbol) +	{ +	case PARSE_RE: +	  /* Parse a full regexp.  A regexp is one or more branches, +	     separated by the union operator `|'. */ +	  if (ctx->cflags & REG_EXTENDED) +	    STACK_PUSHX(stack, PARSE_UNION); +	  STACK_PUSHX(stack, PARSE_BRANCH); +	  break; + +	case PARSE_BRANCH: +	  /* Parse a branch.  A branch is one or more pieces, concatenated. +	     A piece is an atom possibly followed by a postfix operator. */ +	  STACK_PUSHX(stack, PARSE_CATENATION); +	  STACK_PUSHX(stack, PARSE_PIECE); +	  break; + +	case PARSE_PIECE: +	  /* Parse a piece.  A piece is an atom possibly followed by one +	     or more postfix operators. */ +	  STACK_PUSHX(stack, PARSE_POSTFIX); +	  STACK_PUSHX(stack, PARSE_ATOM); +	  break; + +	case PARSE_CATENATION: +	  /* If the expression has not ended, parse another piece. */ +	  { +	    tre_char_t c; +	    if (ctx->re >= ctx->re_end) +	      break; +	    c = *ctx->re; +	    if (ctx->cflags & REG_EXTENDED && c == '|') +	      break; +	    if ((ctx->cflags & REG_EXTENDED +	         && c == ')' && depth > 0) +	        || (!(ctx->cflags & REG_EXTENDED) +		    && (c == '\\' +		        && *(ctx->re + 1) == ')'))) +	      { +	        if (!(ctx->cflags & REG_EXTENDED) && depth == 0) +	          status = REG_EPAREN; +	        DPRINT(("tre_parse:	  group end: '%.*" STRF "'\n", +		        ctx->re_end - ctx->re, ctx->re)); +	        depth--; +	        if (!(ctx->cflags & REG_EXTENDED)) +	          ctx->re += 2; +	        break; +	      } + +	    /* Left associative concatenation. */ +	    STACK_PUSHX(stack, PARSE_CATENATION); +	    STACK_PUSHX(stack, result); +	    STACK_PUSHX(stack, PARSE_POST_CATENATION); +	    STACK_PUSHX(stack, PARSE_PIECE); +	    break; +	  } + +	case PARSE_POST_CATENATION: +	  { +	    tre_ast_node_t *tree = tre_stack_pop(stack); +	    tre_ast_node_t *tmp_node; +	    tmp_node = tre_ast_new_catenation(ctx->mem, tree, result); +	    if (!tmp_node) +	      return REG_ESPACE; +	    result = tmp_node; +	    break; +	  } + +	case PARSE_UNION: +	  if (ctx->re >= ctx->re_end) +	    break; +	  switch (*ctx->re) +	    { +	    case '|': +	      DPRINT(("tre_parse:	union: '%.*" STRF "'\n", +		      ctx->re_end - ctx->re, ctx->re)); +	      STACK_PUSHX(stack, PARSE_UNION); +	      STACK_PUSHX(stack, result); +	      STACK_PUSHX(stack, PARSE_POST_UNION); +	      STACK_PUSHX(stack, PARSE_BRANCH); +	      ctx->re++; +	      break; + +	    case ')': +	      ctx->re++; +	      break; + +	    default: +	      break; +	    } +	  break; + +	case PARSE_POST_UNION: +	  { +	    tre_ast_node_t *tmp_node; +	    tre_ast_node_t *tree = tre_stack_pop(stack); +	    tmp_node = tre_ast_new_union(ctx->mem, tree, result); +	    if (!tmp_node) +	      return REG_ESPACE; +	    result = tmp_node; +	    break; +	  } + +	case PARSE_POSTFIX: +	  /* Parse postfix operators. */ +	  if (ctx->re >= ctx->re_end) +	    break; +	  switch (*ctx->re) +	    { +	    case '+': +	    case '?': +	      if (!(ctx->cflags & REG_EXTENDED)) +	        break; +	    case '*': +	      { +		tre_ast_node_t *tmp_node; +		int rep_min = 0; +		int rep_max = -1; +		if (*ctx->re == '+') +		  rep_min = 1; +		if (*ctx->re == '?') +		  rep_max = 1; + +		ctx->re++; +		tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max); +		if (tmp_node == NULL) +		  return REG_ESPACE; +		result = tmp_node; +		STACK_PUSHX(stack, PARSE_POSTFIX); +		break; +	      } + +	    case '\\': +	      /* "\{" is special without REG_EXTENDED */ +	      if (!(ctx->cflags & REG_EXTENDED) +		  && ctx->re + 1 < ctx->re_end +		  && *(ctx->re + 1) == '{') +		{ +		  ctx->re++; +		  goto parse_brace; +		} +	      else +		break; + +	    case '{': +	      /* "{" is literal without REG_EXTENDED */ +	      if (!(ctx->cflags & REG_EXTENDED)) +		break; + +	    parse_brace: +	      DPRINT(("tre_parse:	bound: '%.*" STRF "'\n", +		      ctx->re_end - ctx->re, ctx->re)); +	      ctx->re++; + +	      status = tre_parse_bound(ctx, &result); +	      if (status != REG_OK) +		return status; +	      STACK_PUSHX(stack, PARSE_POSTFIX); +	      break; +	    } +	  break; + +	case PARSE_ATOM: +	  /* Parse an atom.  An atom is a regular expression enclosed in `()', +	     an empty set of `()', a bracket expression, `.', `^', `$', +	     a `\' followed by a character, or a single character. */ + +	  /* End of regexp? (empty string). */ +	  if (ctx->re >= ctx->re_end) +	    goto parse_literal; + +	  switch (*ctx->re) +	    { +	    case '(':  /* parenthesized subexpression */ + +	      if (ctx->cflags & REG_EXTENDED +		  || (ctx->re > ctx->re_start +		      && *(ctx->re - 1) == '\\')) +		{ +		  depth++; +		    { +		      DPRINT(("tre_parse: group begin: '%.*" STRF +			      "', submatch %d\n", +			      ctx->re_end - ctx->re, ctx->re, +			      ctx->submatch_id)); +		      ctx->re++; +		      /* First parse a whole RE, then mark the resulting tree +			 for submatching. */ +		      STACK_PUSHX(stack, ctx->submatch_id); +		      STACK_PUSHX(stack, PARSE_MARK_FOR_SUBMATCH); +		      STACK_PUSHX(stack, PARSE_RE); +		      ctx->submatch_id++; +		    } +		} +	      else +		goto parse_literal; +	      break; + +	    case ')':  /* end of current subexpression */ +	      if ((ctx->cflags & REG_EXTENDED && depth > 0) +		  || (ctx->re > ctx->re_start +		      && *(ctx->re - 1) == '\\')) +		{ +		  DPRINT(("tre_parse:	    empty: '%.*" STRF "'\n", +			  ctx->re_end - ctx->re, ctx->re)); +		  /* We were expecting an atom, but instead the current +		     subexpression was closed.	POSIX leaves the meaning of +		     this to be implementation-defined.	 We interpret this as +		     an empty expression (which matches an empty string).  */ +		  result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); +		  if (result == NULL) +		    return REG_ESPACE; +		  if (!(ctx->cflags & REG_EXTENDED)) +		    ctx->re--; +		} +	      else +		goto parse_literal; +	      break; + +	    case '[': /* bracket expression */ +	      DPRINT(("tre_parse:     bracket: '%.*" STRF "'\n", +		      ctx->re_end - ctx->re, ctx->re)); +	      ctx->re++; +	      status = tre_parse_bracket(ctx, &result); +	      if (status != REG_OK) +		return status; +	      break; + +	    case '\\': +	      /* If this is "\(" or "\)" chew off the backslash and +		 try again. */ +	      if (!(ctx->cflags & REG_EXTENDED) +		  && ctx->re + 1 < ctx->re_end +		  && (*(ctx->re + 1) == '(' +		      || *(ctx->re + 1) == ')')) +		{ +		  ctx->re++; +		  STACK_PUSHX(stack, PARSE_ATOM); +		  break; +		} + +	      if (ctx->re + 1 >= ctx->re_end) +		/* Trailing backslash. */ +		return REG_EESCAPE; + +	      DPRINT(("tre_parse:  bleep: '%.*" STRF "'\n", +		      ctx->re_end - ctx->re, ctx->re)); +	      ctx->re++; +	      switch (*ctx->re) +		{ +		default: +		  if (!(ctx->cflags & REG_EXTENDED) && tre_isdigit(*ctx->re)) +		    { +		      /* Back reference. */ +		      int val = *ctx->re - '0'; +		      DPRINT(("tre_parse:     backref: '%.*" STRF "'\n", +			      ctx->re_end - ctx->re + 1, ctx->re - 1)); +		      result = tre_ast_new_literal(ctx->mem, BACKREF, val, +						   ctx->position); +		      if (result == NULL) +			return REG_ESPACE; +		      ctx->position++; +		      ctx->max_backref = MAX(val, ctx->max_backref); +		      ctx->re++; +		    } +		  else +		    { +		      /* Escaped character. */ +		      DPRINT(("tre_parse:     escaped: '%.*" STRF "'\n", +			      ctx->re_end - ctx->re + 1, ctx->re - 1)); +		      result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, +						   ctx->position); +		      ctx->position++; +		      ctx->re++; +		    } +		  break; +		} +	      if (result == NULL) +		return REG_ESPACE; +	      break; + +	    case '.':	 /* the any-symbol */ +	      DPRINT(("tre_parse:	  any: '%.*" STRF "'\n", +		      ctx->re_end - ctx->re, ctx->re)); +	      if (ctx->cflags & REG_NEWLINE) +		{ +		  tre_ast_node_t *tmp1; +		  tre_ast_node_t *tmp2; +		  tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n' - 1, +					     ctx->position); +		  if (!tmp1) +		    return REG_ESPACE; +		  tmp2 = tre_ast_new_literal(ctx->mem, '\n' + 1, TRE_CHAR_MAX, +					     ctx->position + 1); +		  if (!tmp2) +		    return REG_ESPACE; +		  result = tre_ast_new_union(ctx->mem, tmp1, tmp2); +		  if (!result) +		    return REG_ESPACE; +		  ctx->position += 2; +		} +	      else +		{ +		  result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, +					       ctx->position); +		  if (!result) +		    return REG_ESPACE; +		  ctx->position++; +		} +	      ctx->re++; +	      break; + +	    case '^':	 /* beginning of line assertion */ +	      /* '^' has a special meaning everywhere in EREs, and in the +		 beginning of the RE and after \( is BREs. */ +	      if (ctx->cflags & REG_EXTENDED +		  || (ctx->re - 2 >= ctx->re_start +		      && *(ctx->re - 2) == '\\' +		      && *(ctx->re - 1) == '(') +		  || ctx->re == ctx->re_start) +		{ +		  DPRINT(("tre_parse:	      BOL: '%.*" STRF "'\n", +			  ctx->re_end - ctx->re, ctx->re)); +		  result = tre_ast_new_literal(ctx->mem, ASSERTION, +					       ASSERT_AT_BOL, -1); +		  if (result == NULL) +		    return REG_ESPACE; +		  ctx->re++; +		} +	      else +		goto parse_literal; +	      break; + +	    case '$':	 /* end of line assertion. */ +	      /* '$' is special everywhere in EREs, and in the end of the +		 string and before \) is BREs. */ +	      if (ctx->cflags & REG_EXTENDED +		  || (ctx->re + 2 < ctx->re_end +		      && *(ctx->re + 1) == '\\' +		      && *(ctx->re + 2) == ')') +		  || ctx->re + 1 == ctx->re_end) +		{ +		  DPRINT(("tre_parse:	      EOL: '%.*" STRF "'\n", +			  ctx->re_end - ctx->re, ctx->re)); +		  result = tre_ast_new_literal(ctx->mem, ASSERTION, +					       ASSERT_AT_EOL, -1); +		  if (result == NULL) +		    return REG_ESPACE; +		  ctx->re++; +		} +	      else +		goto parse_literal; +	      break; + +	    default: +	    parse_literal: + +	      /* We are expecting an atom.  If the subexpression (or the whole +		 regexp ends here, we interpret it as an empty expression +		 (which matches an empty string).  */ +	      if ( +		  (ctx->re >= ctx->re_end +		   || *ctx->re == '*' +		   || (ctx->cflags & REG_EXTENDED +		       && (*ctx->re == '|' +			   || *ctx->re == '{' +			   || *ctx->re == '+' +			   || *ctx->re == '?')) +		   /* Test for "\)" in BRE mode. */ +		   || (!(ctx->cflags & REG_EXTENDED) +		       && ctx->re + 1 < ctx->re_end +		       && *ctx->re == '\\' +		       && *(ctx->re + 1) == '{'))) +		{ +		  DPRINT(("tre_parse:	    empty: '%.*" STRF "'\n", +			  ctx->re_end - ctx->re, ctx->re)); +		  result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); +		  if (!result) +		    return REG_ESPACE; +		  break; +		} + +	      DPRINT(("tre_parse:     literal: '%.*" STRF "'\n", +		      ctx->re_end - ctx->re, ctx->re)); +	      /* Note that we can't use an tre_isalpha() test here, since there +		 may be characters which are alphabetic but neither upper or +		 lower case. */ +	      if (ctx->cflags & REG_ICASE +		  && (tre_isupper(*ctx->re) || tre_islower(*ctx->re))) +		{ +		  tre_ast_node_t *tmp1; +		  tre_ast_node_t *tmp2; + +		  /* XXX - Can there be more than one opposite-case +		     counterpoints for some character in some locale?  Or +		     more than two characters which all should be regarded +		     the same character if case is ignored?  If yes, there +		     does not seem to be a portable way to detect it.  I guess +		     that at least for multi-character collating elements there +		     could be several opposite-case counterpoints, but they +		     cannot be supported portably anyway. */ +		  tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(*ctx->re), +					     tre_toupper(*ctx->re), +					     ctx->position); +		  if (!tmp1) +		    return REG_ESPACE; +		  tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(*ctx->re), +					     tre_tolower(*ctx->re), +					     ctx->position); +		  if (!tmp2) +		    return REG_ESPACE; +		  result = tre_ast_new_union(ctx->mem, tmp1, tmp2); +		  if (!result) +		    return REG_ESPACE; +		} +	      else +		{ +		  result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, +					       ctx->position); +		  if (!result) +		    return REG_ESPACE; +		} +	      ctx->position++; +	      ctx->re++; +	      break; +	    } +	  break; + +	case PARSE_MARK_FOR_SUBMATCH: +	  { +	    int submatch_id = (int)tre_stack_pop(stack); + +	    if (result->submatch_id >= 0) +	      { +		tre_ast_node_t *n, *tmp_node; +		n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); +		if (n == NULL) +		  return REG_ESPACE; +		tmp_node = tre_ast_new_catenation(ctx->mem, n, result); +		if (tmp_node == NULL) +		  return REG_ESPACE; +		tmp_node->num_submatches = result->num_submatches; +		result = tmp_node; +	      } +	    result->submatch_id = submatch_id; +	    result->num_submatches++; +	    break; +	  } + +	case PARSE_RESTORE_CFLAGS: +	  ctx->cflags = (int)tre_stack_pop(stack); +	  break; +	} +    } + +  /* Check for missing closing parentheses. */ +  if (depth > 0) +    return REG_EPAREN; + +  if (status == REG_OK) +    ctx->result = result; + +  return status; +} + + +/*********************************************************************** + from tre-compile.c +***********************************************************************/ + +/* +  Algorithms to setup tags so that submatch addressing can be done. +*/ + + +/* Inserts a catenation node to the root of the tree given in `node'. +   As the left child a new tag with number `tag_id' to `node' is added, +   and the right child is the old root. */ +/*              OR                  */ +/* Inserts a catenation node to the root of the tree given in `node'. +   As the right child a new tag with number `tag_id' to `node' is added, +   and the left child is the old root. */ +static reg_errcode_t +tre_add_tag(tre_mem_t mem, tre_ast_node_t *node, int tag_id, int right) +{ +  tre_catenation_t *c; +  tre_ast_node_t *child_tag, *child_old; + +  DPRINT(("add_tag_%s: tag %d\n", right ? "right" : "left", tag_id)); + +  c = tre_mem_alloc(mem, sizeof(*c)); +  if (c == NULL) +    return REG_ESPACE; +  child_tag = tre_ast_new_literal(mem, TAG, tag_id, -1); +  if (child_tag == NULL) +    return REG_ESPACE; +  child_old = tre_mem_alloc(mem, sizeof(tre_ast_node_t)); +  if (child_old == NULL) +    return REG_ESPACE; + +  child_old->obj = node->obj; +  child_old->type = node->type; +  child_old->nullable = -1; +  child_old->submatch_id = -1; +  child_old->firstpos = NULL; +  child_old->lastpos = NULL; +  child_old->num_tags = 0; +  node->obj = c; +  node->type = CATENATION; + +  c->right = c->left = child_old; +  if (right) c->right = child_tag; +  else c->left = child_tag; + +  return REG_OK; +} + +typedef enum { +  ADDTAGS_RECURSE, +  ADDTAGS_AFTER_ITERATION, +  ADDTAGS_AFTER_UNION_LEFT, +  ADDTAGS_AFTER_UNION_RIGHT, +  ADDTAGS_AFTER_CAT_LEFT, +  ADDTAGS_AFTER_CAT_RIGHT, +  ADDTAGS_SET_SUBMATCH_END +} tre_addtags_symbol_t; + + +typedef struct { +  int tag; +  int next_tag; +} tre_tag_states_t; + +/* Adds tags to appropriate locations in the parse tree in `tree', so that +   subexpressions marked for submatch addressing can be traced. */ +static reg_errcode_t +tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, +	     tre_tnfa_t *tnfa) +{ +  reg_errcode_t status = REG_OK; +  tre_addtags_symbol_t symbol; +  tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */ +  int bottom = tre_stack_num_objects(stack); +  /* True for first pass (counting number of needed tags) */ +  int first_pass = (mem == NULL || tnfa == NULL); +  int *regset, *orig_regset; +  int num_tags = 0; /* Total number of tags. */ +  int tag = 0;	    /* The tag that is to be added next. */ +  int next_tag = 1; /* Next tag to use after this one. */ +  int *parents;	    /* Stack of submatches the current submatch is +		       contained in. */ +  tre_tag_states_t *saved_states; + +  tre_tag_direction_t direction = TRE_TAG_MINIMIZE; +  if (!first_pass) +      tnfa->end_tag = 0; + +  regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2)); +  if (regset == NULL) +    return REG_ESPACE; +  regset[0] = -1; +  orig_regset = regset; + +  parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1)); +  if (parents == NULL) +    { +      xfree(regset); +      return REG_ESPACE; +    } +  parents[0] = -1; + +  saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1)); +  if (saved_states == NULL) +    { +      xfree(regset); +      xfree(parents); +      return REG_ESPACE; +    } +  else +    { +      unsigned int i; +      for (i = 0; i <= tnfa->num_submatches; i++) +	saved_states[i].tag = -1; +    } + +  STACK_PUSH(stack, node); +  STACK_PUSH(stack, ADDTAGS_RECURSE); + +  while (tre_stack_num_objects(stack) > bottom) +    { +      if (status != REG_OK) +	break; + +      symbol = (tre_addtags_symbol_t)tre_stack_pop(stack); +      switch (symbol) +	{ + +	case ADDTAGS_SET_SUBMATCH_END: +	  { +	    int id = (int)tre_stack_pop(stack); +	    int i; + +	    /* Add end of this submatch to regset. */ +	    for (i = 0; regset[i] >= 0; i++); +	    regset[i] = id * 2 + 1; +	    regset[i + 1] = -1; + +	    /* Pop this submatch from the parents stack. */ +	    for (i = 0; parents[i] >= 0; i++); +	    parents[i - 1] = -1; +	    break; +	  } + +	case ADDTAGS_RECURSE: +	  node = tre_stack_pop(stack); + +	  if (node->submatch_id >= 0) +	    { +	      int id = node->submatch_id; +	      int i; + + +	      /* Add start of this submatch to regset. */ +	      for (i = 0; regset[i] >= 0; i++); +	      regset[i] = id * 2; +	      regset[i + 1] = -1; + +	      if (!first_pass) +		{ +		  for (i = 0; parents[i] >= 0; i++); +		  tnfa->submatch_data[id].parents = NULL; +		  if (i > 0) +		    { +		      int *p = xmalloc(sizeof(*p) * (i + 1)); +		      if (p == NULL) +			{ +			  status = REG_ESPACE; +			  break; +			} +		      assert(tnfa->submatch_data[id].parents == NULL); +		      tnfa->submatch_data[id].parents = p; +		      for (i = 0; parents[i] >= 0; i++) +			p[i] = parents[i]; +		      p[i] = -1; +		    } +		} + +	      /* Add end of this submatch to regset after processing this +		 node. */ +	      STACK_PUSHX(stack, node->submatch_id); +	      STACK_PUSHX(stack, ADDTAGS_SET_SUBMATCH_END); +	    } + +	  switch (node->type) +	    { +	    case LITERAL: +	      { +		tre_literal_t *lit = node->obj; + +		if (!IS_SPECIAL(lit) || IS_BACKREF(lit)) +		  { +		    int i; +		    DPRINT(("Literal %d-%d\n", +			    (int)lit->code_min, (int)lit->code_max)); +		    if (regset[0] >= 0) +		      { +			/* Regset is not empty, so add a tag before the +			   literal or backref. */ +			if (!first_pass) +			  { +			    status = tre_add_tag(mem, node, tag, 0 /*left*/); +			    tnfa->tag_directions[tag] = direction; +			    /* Go through the regset and set submatch data for +			       submatches that are using this tag. */ +			    for (i = 0; regset[i] >= 0; i++) +			      { +				int id = regset[i] >> 1; +				int start = !(regset[i] & 1); +				DPRINT(("  Using tag %d for %s offset of " +					"submatch %d\n", tag, +					start ? "start" : "end", id)); +				if (start) +				  tnfa->submatch_data[id].so_tag = tag; +				else +				  tnfa->submatch_data[id].eo_tag = tag; +			      } +			  } +			else +			  { +			    DPRINT(("  num_tags = 1\n")); +			    node->num_tags = 1; +			  } + +			DPRINT(("  num_tags++\n")); +			regset[0] = -1; +			tag = next_tag; +			num_tags++; +			next_tag++; +		      } +		  } +		else +		  { +		    assert(!IS_TAG(lit)); +		  } +		break; +	      } +	    case CATENATION: +	      { +		tre_catenation_t *cat = node->obj; +		tre_ast_node_t *left = cat->left; +		tre_ast_node_t *right = cat->right; +		int reserved_tag = -1; +		DPRINT(("Catenation, next_tag = %d\n", next_tag)); + + +		/* After processing right child. */ +		STACK_PUSHX(stack, node); +		STACK_PUSHX(stack, ADDTAGS_AFTER_CAT_RIGHT); + +		/* Process right child. */ +		STACK_PUSHX(stack, right); +		STACK_PUSHX(stack, ADDTAGS_RECURSE); + +		/* After processing left child. */ +		STACK_PUSHX(stack, next_tag + left->num_tags); +		DPRINT(("  Pushing %d for after left\n", +			next_tag + left->num_tags)); +		if (left->num_tags > 0 && right->num_tags > 0) +		  { +		    /* Reserve the next tag to the right child. */ +		    DPRINT(("  Reserving next_tag %d to right child\n", +			    next_tag)); +		    reserved_tag = next_tag; +		    next_tag++; +		  } +		STACK_PUSHX(stack, reserved_tag); +		STACK_PUSHX(stack, ADDTAGS_AFTER_CAT_LEFT); + +		/* Process left child. */ +		STACK_PUSHX(stack, left); +		STACK_PUSHX(stack, ADDTAGS_RECURSE); + +		} +	      break; +	    case ITERATION: +	      { +		tre_iteration_t *iter = node->obj; +		DPRINT(("Iteration\n")); + +		if (first_pass) +		  { +		    STACK_PUSHX(stack, regset[0] >= 0); +		  } +		else +		  { +		    STACK_PUSHX(stack, tag); +		  } +		STACK_PUSHX(stack, node); +		STACK_PUSHX(stack, ADDTAGS_AFTER_ITERATION); + +		STACK_PUSHX(stack, iter->arg); +		STACK_PUSHX(stack, ADDTAGS_RECURSE); + +		/* Regset is not empty, so add a tag here. */ +		if (regset[0] >= 0) +		  { +		    if (!first_pass) +		      { +			int i; +			status = tre_add_tag(mem, node, tag, 0 /*left*/); +			tnfa->tag_directions[tag] = direction; +			/* Go through the regset and set submatch data for +			   submatches that are using this tag. */ +			for (i = 0; regset[i] >= 0; i++) +			  { +			    int id = regset[i] >> 1; +			    int start = !(regset[i] & 1); +			    DPRINT(("  Using tag %d for %s offset of " +				    "submatch %d\n", tag, +				    start ? "start" : "end", id)); +			    if (start) +			      tnfa->submatch_data[id].so_tag = tag; +			    else +			      tnfa->submatch_data[id].eo_tag = tag; +			  } +		      } + +		    DPRINT(("  num_tags++\n")); +		    regset[0] = -1; +		    tag = next_tag; +		    num_tags++; +		    next_tag++; +		  } +		direction = TRE_TAG_MINIMIZE; +	      } +	      break; +	    case UNION: +	      { +		tre_union_t *uni = node->obj; +		tre_ast_node_t *left = uni->left; +		tre_ast_node_t *right = uni->right; +		int left_tag; +		int right_tag; + +		if (regset[0] >= 0) +		  { +		    left_tag = next_tag; +		    right_tag = next_tag + 1; +		  } +		else +		  { +		    left_tag = tag; +		    right_tag = next_tag; +		  } + +		DPRINT(("Union\n")); + +		/* After processing right child. */ +		STACK_PUSHX(stack, right_tag); +		STACK_PUSHX(stack, left_tag); +		STACK_PUSHX(stack, regset); +		STACK_PUSHX(stack, regset[0] >= 0); +		STACK_PUSHX(stack, node); +		STACK_PUSHX(stack, right); +		STACK_PUSHX(stack, left); +		STACK_PUSHX(stack, ADDTAGS_AFTER_UNION_RIGHT); + +		/* Process right child. */ +		STACK_PUSHX(stack, right); +		STACK_PUSHX(stack, ADDTAGS_RECURSE); + +		/* After processing left child. */ +		STACK_PUSHX(stack, ADDTAGS_AFTER_UNION_LEFT); + +		/* Process left child. */ +		STACK_PUSHX(stack, left); +		STACK_PUSHX(stack, ADDTAGS_RECURSE); + +		/* Regset is not empty, so add a tag here. */ +		if (regset[0] >= 0) +		  { +		    if (!first_pass) +		      { +			int i; +			status = tre_add_tag(mem, node, tag, 0 /*left*/); +			tnfa->tag_directions[tag] = direction; +			/* Go through the regset and set submatch data for +			   submatches that are using this tag. */ +			for (i = 0; regset[i] >= 0; i++) +			  { +			    int id = regset[i] >> 1; +			    int start = !(regset[i] & 1); +			    DPRINT(("  Using tag %d for %s offset of " +				    "submatch %d\n", tag, +				    start ? "start" : "end", id)); +			    if (start) +			      tnfa->submatch_data[id].so_tag = tag; +			    else +			      tnfa->submatch_data[id].eo_tag = tag; +			  } +		      } + +		    DPRINT(("  num_tags++\n")); +		    regset[0] = -1; +		    tag = next_tag; +		    num_tags++; +		    next_tag++; +		  } + +		if (node->num_submatches > 0) +		  { +		    /* The next two tags are reserved for markers. */ +		    next_tag++; +		    tag = next_tag; +		    next_tag++; +		  } + +		break; +	      } +	    } + +	  if (node->submatch_id >= 0) +	    { +	      int i; +	      /* Push this submatch on the parents stack. */ +	      for (i = 0; parents[i] >= 0; i++); +	      parents[i] = node->submatch_id; +	      parents[i + 1] = -1; +	    } + +	  break; /* end case: ADDTAGS_RECURSE */ + +	case ADDTAGS_AFTER_ITERATION: +	  { +	    int enter_tag; +	    node = tre_stack_pop(stack); +	    if (first_pass) +		node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags +		  + (int)tre_stack_pop(stack); +	    else +		enter_tag = (int)tre_stack_pop(stack); + +	    DPRINT(("After iteration\n")); +	    direction = TRE_TAG_MAXIMIZE; +	    break; +	  } + +	case ADDTAGS_AFTER_CAT_LEFT: +	  { +	    int new_tag = (int)tre_stack_pop(stack); +	    next_tag = (int)tre_stack_pop(stack); +	    DPRINT(("After cat left, tag = %d, next_tag = %d\n", +		    tag, next_tag)); +	    if (new_tag >= 0) +	      { +		DPRINT(("  Setting tag to %d\n", new_tag)); +		tag = new_tag; +	      } +	    break; +	  } + +	case ADDTAGS_AFTER_CAT_RIGHT: +	  DPRINT(("After cat right\n")); +	  node = tre_stack_pop(stack); +	  if (first_pass) +	    node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags +	      + ((tre_catenation_t *)node->obj)->right->num_tags; +	  break; + +	case ADDTAGS_AFTER_UNION_LEFT: +	  DPRINT(("After union left\n")); +	  /* Lift the bottom of the `regset' array so that when processing +	     the right operand the items currently in the array are +	     invisible.	 The original bottom was saved at ADDTAGS_UNION and +	     will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */ +	  while (*regset >= 0) +	    regset++; +	  break; + +	case ADDTAGS_AFTER_UNION_RIGHT: +	  { +	    int added_tags, tag_left, tag_right; +	    tre_ast_node_t *left = tre_stack_pop(stack); +	    tre_ast_node_t *right = tre_stack_pop(stack); +	    DPRINT(("After union right\n")); +	    node = tre_stack_pop(stack); +	    added_tags = (int)tre_stack_pop(stack); +	    if (first_pass) +	      { +		node->num_tags = ((tre_union_t *)node->obj)->left->num_tags +		  + ((tre_union_t *)node->obj)->right->num_tags + added_tags +		  + ((node->num_submatches > 0) ? 2 : 0); +	      } +	    regset = tre_stack_pop(stack); +	    tag_left = (int)tre_stack_pop(stack); +	    tag_right = (int)tre_stack_pop(stack); + +	    /* Add tags after both children, the left child gets a smaller +	       tag than the right child.  This guarantees that we prefer +	       the left child over the right child. */ +	    /* XXX - This is not always necessary (if the children have +	       tags which must be seen for every match of that child). */ +	    /* XXX - Check if this is the only place where tre_add_tag_right +	       is used.	 If so, use tre_add_tag_left (putting the tag before +	       the child as opposed after the child) and throw away +	       tre_add_tag_right. */ +	    if (node->num_submatches > 0) +	      { +		if (!first_pass) +		  { +		    status = tre_add_tag(mem, left, tag_left, 1 /*right*/); +		    tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE; +		    status = tre_add_tag(mem, right, tag_right, 1 /*right*/); +		    tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE; +		  } +		DPRINT(("  num_tags += 2\n")); +		num_tags += 2; +	      } +	    direction = TRE_TAG_MAXIMIZE; +	    break; +	  } + +	default: +	  assert(0); +	  break; + +	} /* end switch(symbol) */ +    } /* end while(tre_stack_num_objects(stack) > bottom) */ + +  if (!first_pass) +    { +      int i; +      /* Go through the regset and set submatch data for +	 submatches that are using this tag. */ +      for (i = 0; regset[i] >= 0; i++) +	{ +	  int id = regset[i] >> 1; +	  int start = !(regset[i] & 1); +	  DPRINT(("  Using tag %d for %s offset of " +		  "submatch %d\n", num_tags, +		  start ? "start" : "end", id)); +	  if (start) +	    tnfa->submatch_data[id].so_tag = num_tags; +	  else +	    tnfa->submatch_data[id].eo_tag = num_tags; +	} +    } + +  DPRINT(("tre_add_tags: %s complete.  Number of tags %d.\n", +	  first_pass? "First pass" : "Second pass", num_tags)); + +  assert(tree->num_tags == num_tags); +  tnfa->end_tag = num_tags; +  tnfa->num_tags = num_tags; +  xfree(orig_regset); +  xfree(parents); +  xfree(saved_states); +  return status; +} + + + +/* +  AST to TNFA compilation routines. +*/ + +typedef enum { +  COPY_RECURSE, +  COPY_SET_RESULT_PTR +} tre_copyast_symbol_t; + +/* Flags for tre_copy_ast(). */ +#define COPY_REMOVE_TAGS	 1 +#define COPY_MAXIMIZE_FIRST_TAG	 2 + +static reg_errcode_t +tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, +	     int flags, int *pos_add, tre_tag_direction_t *tag_directions, +	     tre_ast_node_t **copy, int *max_pos) +{ +  reg_errcode_t status = REG_OK; +  int bottom = tre_stack_num_objects(stack); +  int num_copied = 0; +  int first_tag = 1; +  tre_ast_node_t **result = copy; +  tre_copyast_symbol_t symbol; + +  STACK_PUSH(stack, ast); +  STACK_PUSH(stack, COPY_RECURSE); + +  while (status == REG_OK && tre_stack_num_objects(stack) > bottom) +    { +      tre_ast_node_t *node; +      if (status != REG_OK) +	break; + +      symbol = (tre_copyast_symbol_t)tre_stack_pop(stack); +      switch (symbol) +	{ +	case COPY_SET_RESULT_PTR: +	  result = tre_stack_pop(stack); +	  break; +	case COPY_RECURSE: +	  node = tre_stack_pop(stack); +	  switch (node->type) +	    { +	    case LITERAL: +	      { +		tre_literal_t *lit = node->obj; +		int pos = lit->position; +		int min = lit->code_min; +		int max = lit->code_max; +		if (!IS_SPECIAL(lit) || IS_BACKREF(lit)) +		  { +		    /* XXX - e.g. [ab] has only one position but two +		       nodes, so we are creating holes in the state space +		       here.  Not fatal, just wastes memory. */ +		    pos += *pos_add; +		    num_copied++; +		  } +		else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS)) +		  { +		    /* Change this tag to empty. */ +		    min = EMPTY; +		    max = pos = -1; +		  } +		else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG) +			 && first_tag) +		  { +		    /* Maximize the first tag. */ +		    tag_directions[max] = TRE_TAG_MAXIMIZE; +		    first_tag = 0; +		  } +		*result = tre_ast_new_literal(mem, min, max, pos); +		if (*result == NULL) +		  status = REG_ESPACE; + +		if (pos > *max_pos) +		  *max_pos = pos; +		break; +	      } +	    case UNION: +	      { +		tre_union_t *uni = node->obj; +		tre_union_t *copy; +		*result = tre_ast_new_union(mem, uni->left, uni->right); +		if (*result == NULL) +		  { +		    status = REG_ESPACE; +		    break; +		  } +		copy = (*result)->obj; +		result = ©->left; +		STACK_PUSHX(stack, uni->right); +		STACK_PUSHX(stack, COPY_RECURSE); +		STACK_PUSHX(stack, ©->right); +		STACK_PUSHX(stack, COPY_SET_RESULT_PTR); +		STACK_PUSHX(stack, uni->left); +		STACK_PUSHX(stack, COPY_RECURSE); +		break; +	      } +	    case CATENATION: +	      { +		tre_catenation_t *cat = node->obj; +		tre_catenation_t *copy; +		*result = tre_ast_new_catenation(mem, cat->left, cat->right); +		if (*result == NULL) +		  { +		    status = REG_ESPACE; +		    break; +		  } +		copy = (*result)->obj; +		copy->left = NULL; +		copy->right = NULL; +		result = ©->left; + +		STACK_PUSHX(stack, cat->right); +		STACK_PUSHX(stack, COPY_RECURSE); +		STACK_PUSHX(stack, ©->right); +		STACK_PUSHX(stack, COPY_SET_RESULT_PTR); +		STACK_PUSHX(stack, cat->left); +		STACK_PUSHX(stack, COPY_RECURSE); +		break; +	      } +	    case ITERATION: +	      { +		tre_iteration_t *iter = node->obj; +		STACK_PUSHX(stack, iter->arg); +		STACK_PUSHX(stack, COPY_RECURSE); +		*result = tre_ast_new_iter(mem, iter->arg, iter->min, iter->max); +		if (*result == NULL) +		  { +		    status = REG_ESPACE; +		    break; +		  } +		iter = (*result)->obj; +		result = &iter->arg; +		break; +	      } +	    default: +	      assert(0); +	      break; +	    } +	  break; +	} +    } +  *pos_add += num_copied; +  return status; +} + +typedef enum { +  EXPAND_RECURSE, +  EXPAND_AFTER_ITER +} tre_expand_ast_symbol_t; + +/* Expands each iteration node that has a finite nonzero minimum or maximum +   iteration count to a catenated sequence of copies of the node. */ +static reg_errcode_t +tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, +	       int *position, tre_tag_direction_t *tag_directions, +	       int *max_depth) +{ +  reg_errcode_t status = REG_OK; +  int bottom = tre_stack_num_objects(stack); +  int pos_add = 0; +  int pos_add_total = 0; +  int max_pos = 0; +  /* Approximate parameter nesting level. */ +  int iter_depth = 0; + +  STACK_PUSHR(stack, ast); +  STACK_PUSHR(stack, EXPAND_RECURSE); +  while (status == REG_OK && tre_stack_num_objects(stack) > bottom) +    { +      tre_ast_node_t *node; +      tre_expand_ast_symbol_t symbol; + +      if (status != REG_OK) +	break; + +      DPRINT(("pos_add %d\n", pos_add)); + +      symbol = (tre_expand_ast_symbol_t)tre_stack_pop(stack); +      node = tre_stack_pop(stack); +      switch (symbol) +	{ +	case EXPAND_RECURSE: +	  switch (node->type) +	    { +	    case LITERAL: +	      { +		tre_literal_t *lit= node->obj; +		if (!IS_SPECIAL(lit) || IS_BACKREF(lit)) +		  { +		    lit->position += pos_add; +		    if (lit->position > max_pos) +		      max_pos = lit->position; +		  } +		break; +	      } +	    case UNION: +	      { +		tre_union_t *uni = node->obj; +		STACK_PUSHX(stack, uni->right); +		STACK_PUSHX(stack, EXPAND_RECURSE); +		STACK_PUSHX(stack, uni->left); +		STACK_PUSHX(stack, EXPAND_RECURSE); +		break; +	      } +	    case CATENATION: +	      { +		tre_catenation_t *cat = node->obj; +		STACK_PUSHX(stack, cat->right); +		STACK_PUSHX(stack, EXPAND_RECURSE); +		STACK_PUSHX(stack, cat->left); +		STACK_PUSHX(stack, EXPAND_RECURSE); +		break; +	      } +	    case ITERATION: +	      { +		tre_iteration_t *iter = node->obj; +		STACK_PUSHX(stack, pos_add); +		STACK_PUSHX(stack, node); +		STACK_PUSHX(stack, EXPAND_AFTER_ITER); +		STACK_PUSHX(stack, iter->arg); +		STACK_PUSHX(stack, EXPAND_RECURSE); +		/* If we are going to expand this node at EXPAND_AFTER_ITER +		   then don't increase the `pos' fields of the nodes now, it +		   will get done when expanding. */ +		if (iter->min > 1 || iter->max > 1) +		  pos_add = 0; +		iter_depth++; +		DPRINT(("iter\n")); +		break; +	      } +	    default: +	      assert(0); +	      break; +	    } +	  break; +	case EXPAND_AFTER_ITER: +	  { +	    tre_iteration_t *iter = node->obj; +	    int pos_add_last; +	    pos_add = (int)tre_stack_pop(stack); +	    pos_add_last = pos_add; +	    if (iter->min > 1 || iter->max > 1) +	      { +		tre_ast_node_t *seq1 = NULL, *seq2 = NULL; +		int i; +		int pos_add_save = pos_add; + +		/* Create a catenated sequence of copies of the node. */ +		for (i = 0; i < iter->min; i++) +		  { +		    tre_ast_node_t *copy; +		    /* Remove tags from all but the last copy. */ +		    int flags = ((i + 1 < iter->min) +				 ? COPY_REMOVE_TAGS +				 : COPY_MAXIMIZE_FIRST_TAG); +		    DPRINT(("  pos_add %d\n", pos_add)); +		    pos_add_save = pos_add; +		    status = tre_copy_ast(mem, stack, iter->arg, flags, +					  &pos_add, tag_directions, ©, +					  &max_pos); +		    if (status != REG_OK) +		      return status; +		    if (seq1 != NULL) +		      seq1 = tre_ast_new_catenation(mem, seq1, copy); +		    else +		      seq1 = copy; +		    if (seq1 == NULL) +		      return REG_ESPACE; +		  } + +		if (iter->max == -1) +		  { +		    /* No upper limit. */ +		    pos_add_save = pos_add; +		    status = tre_copy_ast(mem, stack, iter->arg, 0, +					  &pos_add, NULL, &seq2, &max_pos); +		    if (status != REG_OK) +		      return status; +		    seq2 = tre_ast_new_iter(mem, seq2, 0, -1); +		    if (seq2 == NULL) +		      return REG_ESPACE; +		  } +		else +		  { +		    for (i = iter->min; i < iter->max; i++) +		      { +			tre_ast_node_t *tmp, *copy; +			pos_add_save = pos_add; +			status = tre_copy_ast(mem, stack, iter->arg, 0, +					      &pos_add, NULL, ©, &max_pos); +			if (status != REG_OK) +			  return status; +			if (seq2 != NULL) +			  seq2 = tre_ast_new_catenation(mem, copy, seq2); +			else +			  seq2 = copy; +			if (seq2 == NULL) +			  return REG_ESPACE; +			tmp = tre_ast_new_literal(mem, EMPTY, -1, -1); +			if (tmp == NULL) +			  return REG_ESPACE; +			seq2 = tre_ast_new_union(mem, tmp, seq2); +			if (seq2 == NULL) +			  return REG_ESPACE; +		      } +		  } + +		pos_add = pos_add_save; +		if (seq1 == NULL) +		  seq1 = seq2; +		else if (seq2 != NULL) +		  seq1 = tre_ast_new_catenation(mem, seq1, seq2); +		if (seq1 == NULL) +		  return REG_ESPACE; +		node->obj = seq1->obj; +		node->type = seq1->type; +	      } + +	    iter_depth--; +	    pos_add_total += pos_add - pos_add_last; +	    if (iter_depth == 0) +	      pos_add = pos_add_total; + +	    break; +	  } +	default: +	  assert(0); +	  break; +	} +    } + +  *position += pos_add_total; + +  /* `max_pos' should never be larger than `*position' if the above +     code works, but just an extra safeguard let's make sure +     `*position' is set large enough so enough memory will be +     allocated for the transition table. */ +  if (max_pos > *position) +    *position = max_pos; + +#ifdef TRE_DEBUG +  DPRINT(("Expanded AST:\n")); +  tre_ast_print(ast); +  DPRINT(("*position %d, max_pos %d\n", *position, max_pos)); +#endif + +  return status; +} + +static tre_pos_and_tags_t * +tre_set_empty(tre_mem_t mem) +{ +  tre_pos_and_tags_t *new_set; + +  new_set = tre_mem_calloc(mem, sizeof(*new_set)); +  if (new_set == NULL) +    return NULL; + +  new_set[0].position = -1; +  new_set[0].code_min = -1; +  new_set[0].code_max = -1; + +  return new_set; +} + +static tre_pos_and_tags_t * +tre_set_one(tre_mem_t mem, int position, int code_min, int code_max, +	    tre_ctype_t class, tre_ctype_t *neg_classes, int backref) +{ +  tre_pos_and_tags_t *new_set; + +  new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2); +  if (new_set == NULL) +    return NULL; + +  new_set[0].position = position; +  new_set[0].code_min = code_min; +  new_set[0].code_max = code_max; +  new_set[0].class = class; +  new_set[0].neg_classes = neg_classes; +  new_set[0].backref = backref; +  new_set[1].position = -1; +  new_set[1].code_min = -1; +  new_set[1].code_max = -1; + +  return new_set; +} + +static tre_pos_and_tags_t * +tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2, +	      int *tags, int assertions) +{ +  int s1, s2, i, j; +  tre_pos_and_tags_t *new_set; +  int *new_tags; +  int num_tags; + +  for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++); +  for (s1 = 0; set1[s1].position >= 0; s1++); +  for (s2 = 0; set2[s2].position >= 0; s2++); +  new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1)); +  if (!new_set ) +    return NULL; + +  for (s1 = 0; set1[s1].position >= 0; s1++) +    { +      new_set[s1].position = set1[s1].position; +      new_set[s1].code_min = set1[s1].code_min; +      new_set[s1].code_max = set1[s1].code_max; +      new_set[s1].assertions = set1[s1].assertions | assertions; +      new_set[s1].class = set1[s1].class; +      new_set[s1].neg_classes = set1[s1].neg_classes; +      new_set[s1].backref = set1[s1].backref; +      if (set1[s1].tags == NULL && tags == NULL) +	new_set[s1].tags = NULL; +      else +	{ +	  for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++); +	  new_tags = tre_mem_alloc(mem, (sizeof(*new_tags) +					 * (i + num_tags + 1))); +	  if (new_tags == NULL) +	    return NULL; +	  for (j = 0; j < i; j++) +	    new_tags[j] = set1[s1].tags[j]; +	  for (i = 0; i < num_tags; i++) +	    new_tags[j + i] = tags[i]; +	  new_tags[j + i] = -1; +	  new_set[s1].tags = new_tags; +	} +    } + +  for (s2 = 0; set2[s2].position >= 0; s2++) +    { +      new_set[s1 + s2].position = set2[s2].position; +      new_set[s1 + s2].code_min = set2[s2].code_min; +      new_set[s1 + s2].code_max = set2[s2].code_max; +      /* XXX - why not | assertions here as well? */ +      new_set[s1 + s2].assertions = set2[s2].assertions; +      new_set[s1 + s2].class = set2[s2].class; +      new_set[s1 + s2].neg_classes = set2[s2].neg_classes; +      new_set[s1 + s2].backref = set2[s2].backref; +      if (set2[s2].tags == NULL) +	new_set[s1 + s2].tags = NULL; +      else +	{ +	  for (i = 0; set2[s2].tags[i] >= 0; i++); +	  new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1)); +	  if (new_tags == NULL) +	    return NULL; +	  for (j = 0; j < i; j++) +	    new_tags[j] = set2[s2].tags[j]; +	  new_tags[j] = -1; +	  new_set[s1 + s2].tags = new_tags; +	} +    } +  new_set[s1 + s2].position = -1; +  return new_set; +} + +/* Finds the empty path through `node' which is the one that should be +   taken according to POSIX.2 rules, and adds the tags on that path to +   `tags'.   `tags' may be NULL.  If `num_tags_seen' is not NULL, it is +   set to the number of tags seen on the path. */ +static reg_errcode_t +tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags, +		int *assertions, int *num_tags_seen) +{ +  tre_literal_t *lit; +  tre_union_t *uni; +  tre_catenation_t *cat; +  tre_iteration_t *iter; +  int i; +  int bottom = tre_stack_num_objects(stack); +  reg_errcode_t status = REG_OK; +  if (num_tags_seen) +    *num_tags_seen = 0; + +  status = tre_stack_push(stack, node); + +  /* Walk through the tree recursively. */ +  while (status == REG_OK && tre_stack_num_objects(stack) > bottom) +    { +      node = tre_stack_pop(stack); + +      switch (node->type) +	{ +	case LITERAL: +	  lit = (tre_literal_t *)node->obj; +	  switch (lit->code_min) +	    { +	    case TAG: +	      if (lit->code_max >= 0) +		{ +		  if (tags != NULL) +		    { +		      /* Add the tag to `tags'. */ +		      for (i = 0; tags[i] >= 0; i++) +			if (tags[i] == lit->code_max) +			  break; +		      if (tags[i] < 0) +			{ +			  tags[i] = lit->code_max; +			  tags[i + 1] = -1; +			} +		    } +		  if (num_tags_seen) +		    (*num_tags_seen)++; +		} +	      break; +	    case ASSERTION: +	      assert(lit->code_max >= 1 +		     || lit->code_max <= ASSERT_LAST); +	      if (assertions != NULL) +		*assertions |= lit->code_max; +	      break; +	    case EMPTY: +	      break; +	    default: +	      assert(0); +	      break; +	    } +	  break; + +	case UNION: +	  /* Subexpressions starting earlier take priority over ones +	     starting later, so we prefer the left subexpression over the +	     right subexpression. */ +	  uni = (tre_union_t *)node->obj; +	  if (uni->left->nullable) +	    STACK_PUSHX(stack, uni->left) +	  else if (uni->right->nullable) +	    STACK_PUSHX(stack, uni->right) +	  else +	    assert(0); +	  break; + +	case CATENATION: +	  /* The path must go through both children. */ +	  cat = (tre_catenation_t *)node->obj; +	  assert(cat->left->nullable); +	  assert(cat->right->nullable); +	  STACK_PUSHX(stack, cat->left); +	  STACK_PUSHX(stack, cat->right); +	  break; + +	case ITERATION: +	  /* A match with an empty string is preferred over no match at +	     all, so we go through the argument if possible. */ +	  iter = (tre_iteration_t *)node->obj; +	  if (iter->arg->nullable) +	    STACK_PUSHX(stack, iter->arg); +	  break; + +	default: +	  assert(0); +	  break; +	} +    } + +  return status; +} + + +typedef enum { +  NFL_RECURSE, +  NFL_POST_UNION, +  NFL_POST_CATENATION, +  NFL_POST_ITERATION +} tre_nfl_stack_symbol_t; + + +/* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for +   the nodes of the AST `tree'. */ +static reg_errcode_t +tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) +{ +  int bottom = tre_stack_num_objects(stack); + +  STACK_PUSHR(stack, tree); +  STACK_PUSHR(stack, NFL_RECURSE); + +  while (tre_stack_num_objects(stack) > bottom) +    { +      tre_nfl_stack_symbol_t symbol; +      tre_ast_node_t *node; + +      symbol = (tre_nfl_stack_symbol_t) tre_stack_pop(stack); +      node = tre_stack_pop(stack); +      switch (symbol) +	{ +	case NFL_RECURSE: +	  switch (node->type) +	    { +	    case LITERAL: +	      { +		tre_literal_t *lit = (tre_literal_t *)node->obj; +		if (IS_BACKREF(lit)) +		  { +		    /* Back references: nullable = false, firstpos = {i}, +		       lastpos = {i}. */ +		    node->nullable = 0; +		    node->firstpos = tre_set_one(mem, lit->position, 0, +					     TRE_CHAR_MAX, 0, NULL, -1); +		    if (!node->firstpos) +		      return REG_ESPACE; +		    node->lastpos = tre_set_one(mem, lit->position, 0, +						TRE_CHAR_MAX, 0, NULL, +						lit->code_max); +		    if (!node->lastpos) +		      return REG_ESPACE; +		  } +		else if (lit->code_min < 0) +		  { +		    /* Tags, empty strings and zero width assertions: +		       nullable = true, firstpos = {}, and lastpos = {}. */ +		    node->nullable = 1; +		    node->firstpos = tre_set_empty(mem); +		    if (!node->firstpos) +		      return REG_ESPACE; +		    node->lastpos = tre_set_empty(mem); +		    if (!node->lastpos) +		      return REG_ESPACE; +		  } +		else +		  { +		    /* Literal at position i: nullable = false, firstpos = {i}, +		       lastpos = {i}. */ +		    node->nullable = 0; +		    node->firstpos = +		      tre_set_one(mem, lit->position, lit->code_min, +				  lit->code_max, 0, NULL, -1); +		    if (!node->firstpos) +		      return REG_ESPACE; +		    node->lastpos = tre_set_one(mem, lit->position, +						lit->code_min, lit->code_max, +						lit->class, lit->neg_classes, +						-1); +		    if (!node->lastpos) +		      return REG_ESPACE; +		  } +		break; +	      } + +	    case UNION: +	      /* Compute the attributes for the two subtrees, and after that +		 for this node. */ +	      STACK_PUSHR(stack, node); +	      STACK_PUSHR(stack, NFL_POST_UNION); +	      STACK_PUSHR(stack, ((tre_union_t *)node->obj)->right); +	      STACK_PUSHR(stack, NFL_RECURSE); +	      STACK_PUSHR(stack, ((tre_union_t *)node->obj)->left); +	      STACK_PUSHR(stack, NFL_RECURSE); +	      break; + +	    case CATENATION: +	      /* Compute the attributes for the two subtrees, and after that +		 for this node. */ +	      STACK_PUSHR(stack, node); +	      STACK_PUSHR(stack, NFL_POST_CATENATION); +	      STACK_PUSHR(stack, ((tre_catenation_t *)node->obj)->right); +	      STACK_PUSHR(stack, NFL_RECURSE); +	      STACK_PUSHR(stack, ((tre_catenation_t *)node->obj)->left); +	      STACK_PUSHR(stack, NFL_RECURSE); +	      break; + +	    case ITERATION: +	      /* Compute the attributes for the subtree, and after that for +		 this node. */ +	      STACK_PUSHR(stack, node); +	      STACK_PUSHR(stack, NFL_POST_ITERATION); +	      STACK_PUSHR(stack, ((tre_iteration_t *)node->obj)->arg); +	      STACK_PUSHR(stack, NFL_RECURSE); +	      break; +	    } +	  break; /* end case: NFL_RECURSE */ + +	case NFL_POST_UNION: +	  { +	    tre_union_t *uni = (tre_union_t *)node->obj; +	    node->nullable = uni->left->nullable || uni->right->nullable; +	    node->firstpos = tre_set_union(mem, uni->left->firstpos, +					   uni->right->firstpos, NULL, 0); +	    if (!node->firstpos) +	      return REG_ESPACE; +	    node->lastpos = tre_set_union(mem, uni->left->lastpos, +					  uni->right->lastpos, NULL, 0); +	    if (!node->lastpos) +	      return REG_ESPACE; +	    break; +	  } + +	case NFL_POST_ITERATION: +	  { +	    tre_iteration_t *iter = (tre_iteration_t *)node->obj; + +	    if (iter->min == 0 || iter->arg->nullable) +	      node->nullable = 1; +	    else +	      node->nullable = 0; +	    node->firstpos = iter->arg->firstpos; +	    node->lastpos = iter->arg->lastpos; +	    break; +	  } + +	case NFL_POST_CATENATION: +	  { +	    int num_tags, *tags, assertions; +	    reg_errcode_t status; +	    tre_catenation_t *cat = node->obj; +	    node->nullable = cat->left->nullable && cat->right->nullable; + +	    /* Compute firstpos. */ +	    if (cat->left->nullable) +	      { +		/* The left side matches the empty string.  Make a first pass +		   with tre_match_empty() to get the number of tags. */ +		status = tre_match_empty(stack, cat->left, +					 NULL, NULL, &num_tags); +		if (status != REG_OK) +		  return status; +		/* Allocate arrays for the tags and parameters. */ +		tags = xmalloc(sizeof(*tags) * (num_tags + 1)); +		if (!tags) +		  return REG_ESPACE; +		tags[0] = -1; +		assertions = 0; +		/* Second pass with tre_mach_empty() to get the list of +		   tags. */ +		status = tre_match_empty(stack, cat->left, tags, +					 &assertions, NULL); +		if (status != REG_OK) +		  { +		    xfree(tags); +		    return status; +		  } +		node->firstpos = +		  tre_set_union(mem, cat->right->firstpos, cat->left->firstpos, +				tags, assertions); +		xfree(tags); +		if (!node->firstpos) +		  return REG_ESPACE; +	      } +	    else +	      { +		node->firstpos = cat->left->firstpos; +	      } + +	    /* Compute lastpos. */ +	    if (cat->right->nullable) +	      { +		/* The right side matches the empty string.  Make a first pass +		   with tre_match_empty() to get the number of tags. */ +		status = tre_match_empty(stack, cat->right, +					 NULL, NULL, &num_tags); +		if (status != REG_OK) +		  return status; +		/* Allocate arrays for the tags and parameters. */ +		tags = xmalloc(sizeof(int) * (num_tags + 1)); +		if (!tags) +		  return REG_ESPACE; +		tags[0] = -1; +		assertions = 0; +		/* Second pass with tre_mach_empty() to get the list of +		   tags. */ +		status = tre_match_empty(stack, cat->right, tags, +					 &assertions, NULL); +		if (status != REG_OK) +		  { +		    xfree(tags); +		    return status; +		  } +		node->lastpos = +		  tre_set_union(mem, cat->left->lastpos, cat->right->lastpos, +				tags, assertions); +		xfree(tags); +		if (!node->lastpos) +		  return REG_ESPACE; +	      } +	    else +	      { +		node->lastpos = cat->right->lastpos; +	      } +	    break; +	  } + +	default: +	  assert(0); +	  break; +	} +    } + +  return REG_OK; +} + + +/* Adds a transition from each position in `p1' to each position in `p2'. */ +static reg_errcode_t +tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2, +	       tre_tnfa_transition_t *transitions, +	       int *counts, int *offs) +{ +  tre_pos_and_tags_t *orig_p2 = p2; +  tre_tnfa_transition_t *trans; +  int i, j, k, l, dup, prev_p2_pos; + +  if (transitions != NULL) +    while (p1->position >= 0) +      { +	p2 = orig_p2; +	prev_p2_pos = -1; +	while (p2->position >= 0) +	  { +	    /* Optimization: if this position was already handled, skip it. */ +	    if (p2->position == prev_p2_pos) +	      { +		p2++; +		continue; +	      } +	    prev_p2_pos = p2->position; +	    /* Set `trans' to point to the next unused transition from +	       position `p1->position'. */ +	    trans = transitions + offs[p1->position]; +	    while (trans->state != NULL) +	      { +#if 0 +		/* If we find a previous transition from `p1->position' to +		   `p2->position', it is overwritten.  This can happen only +		   if there are nested loops in the regexp, like in "((a)*)*". +		   In POSIX.2 repetition using the outer loop is always +		   preferred over using the inner loop.	 Therefore the +		   transition for the inner loop is useless and can be thrown +		   away. */ +		/* XXX - The same position is used for all nodes in a bracket +		   expression, so this optimization cannot be used (it will +		   break bracket expressions) unless I figure out a way to +		   detect it here. */ +		if (trans->state_id == p2->position) +		  { +		    DPRINT(("*")); +		    break; +		  } +#endif +		trans++; +	      } + +	    if (trans->state == NULL) +	      (trans + 1)->state = NULL; +	    /* Use the character ranges, assertions, etc. from `p1' for +	       the transition from `p1' to `p2'. */ +	    trans->code_min = p1->code_min; +	    trans->code_max = p1->code_max; +	    trans->state = transitions + offs[p2->position]; +	    trans->state_id = p2->position; +	    trans->assertions = p1->assertions | p2->assertions +	      | (p1->class ? ASSERT_CHAR_CLASS : 0) +	      | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0); +	    if (p1->backref >= 0) +	      { +		assert((trans->assertions & ASSERT_CHAR_CLASS) == 0); +		assert(p2->backref < 0); +		trans->u.backref = p1->backref; +		trans->assertions |= ASSERT_BACKREF; +	      } +	    else +	      trans->u.class = p1->class; +	    if (p1->neg_classes != NULL) +	      { +		for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++); +		trans->neg_classes = +		  xmalloc(sizeof(*trans->neg_classes) * (i + 1)); +		if (trans->neg_classes == NULL) +		  return REG_ESPACE; +		for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++) +		  trans->neg_classes[i] = p1->neg_classes[i]; +		trans->neg_classes[i] = (tre_ctype_t)0; +	      } +	    else +	      trans->neg_classes = NULL; + +	    /* Find out how many tags this transition has. */ +	    i = 0; +	    if (p1->tags != NULL) +	      while(p1->tags[i] >= 0) +		i++; +	    j = 0; +	    if (p2->tags != NULL) +	      while(p2->tags[j] >= 0) +		j++; + +	    /* If we are overwriting a transition, free the old tag array. */ +	    if (trans->tags != NULL) +	      xfree(trans->tags); +	    trans->tags = NULL; + +	    /* If there were any tags, allocate an array and fill it. */ +	    if (i + j > 0) +	      { +		trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1)); +		if (!trans->tags) +		  return REG_ESPACE; +		i = 0; +		if (p1->tags != NULL) +		  while(p1->tags[i] >= 0) +		    { +		      trans->tags[i] = p1->tags[i]; +		      i++; +		    } +		l = i; +		j = 0; +		if (p2->tags != NULL) +		  while (p2->tags[j] >= 0) +		    { +		      /* Don't add duplicates. */ +		      dup = 0; +		      for (k = 0; k < i; k++) +			if (trans->tags[k] == p2->tags[j]) +			  { +			    dup = 1; +			    break; +			  } +		      if (!dup) +			trans->tags[l++] = p2->tags[j]; +		      j++; +		    } +		trans->tags[l] = -1; +	      } + + +#ifdef TRE_DEBUG +	    { +	      int *tags; + +	      DPRINT(("	 %2d -> %2d on %3d", p1->position, p2->position, +		      p1->code_min)); +	      if (p1->code_max != p1->code_min) +		DPRINT(("-%3d", p1->code_max)); +	      tags = trans->tags; +	      if (tags) +		{ +		  DPRINT((", tags [")); +		  while (*tags >= 0) +		    { +		      DPRINT(("%d", *tags)); +		      tags++; +		      if (*tags >= 0) +			DPRINT((",")); +		    } +		  DPRINT(("]")); +		} +	      if (trans->assertions) +		DPRINT((", assert %d", trans->assertions)); +	      if (trans->assertions & ASSERT_BACKREF) +		DPRINT((", backref %d", trans->u.backref)); +	      else if (trans->class) +		DPRINT((", class %ld", (long)trans->class)); +	      if (trans->neg_classes) +		DPRINT((", neg_classes %p", trans->neg_classes)); +	      DPRINT(("\n")); +	    } +#endif /* TRE_DEBUG */ +	    p2++; +	  } +	p1++; +      } +  else +    /* Compute a maximum limit for the number of transitions leaving +       from each state. */ +    while (p1->position >= 0) +      { +	p2 = orig_p2; +	while (p2->position >= 0) +	  { +	    counts[p1->position]++; +	    p2++; +	  } +	p1++; +      } +  return REG_OK; +} + +/* Converts the syntax tree to a TNFA.	All the transitions in the TNFA are +   labelled with one character range (there are no transitions on empty +   strings).  The TNFA takes O(n^2) space in the worst case, `n' is size of +   the regexp. */ +static reg_errcode_t +tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions, +		int *counts, int *offs) +{ +  tre_union_t *uni; +  tre_catenation_t *cat; +  tre_iteration_t *iter; +  reg_errcode_t errcode = REG_OK; + +  /* XXX - recurse using a stack!. */ +  switch (node->type) +    { +    case LITERAL: +      break; +    case UNION: +      uni = (tre_union_t *)node->obj; +      errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs); +      if (errcode != REG_OK) +	return errcode; +      errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs); +      break; + +    case CATENATION: +      cat = (tre_catenation_t *)node->obj; +      /* Add a transition from each position in cat->left->lastpos +	 to each position in cat->right->firstpos. */ +      errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos, +			       transitions, counts, offs); +      if (errcode != REG_OK) +	return errcode; +      errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs); +      if (errcode != REG_OK) +	return errcode; +      errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs); +      break; + +    case ITERATION: +      iter = (tre_iteration_t *)node->obj; +      assert(iter->max == -1 || iter->max == 1); + +      if (iter->max == -1) +	{ +	  assert(iter->min == 0 || iter->min == 1); +	  /* Add a transition from each last position in the iterated +	     expression to each first position. */ +	  errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos, +				   transitions, counts, offs); +	  if (errcode != REG_OK) +	    return errcode; +	} +      errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs); +      break; +    } +  return errcode; +} + + +static void +tre_free(regex_t *preg) +{ +  tre_tnfa_t *tnfa; +  unsigned int i; +  tre_tnfa_transition_t *trans; + +  tnfa = (void *)preg->TRE_REGEX_T_FIELD; +  if (!tnfa) +    return; + +  for (i = 0; i < tnfa->num_transitions; i++) +    if (tnfa->transitions[i].state) +      { +	if (tnfa->transitions[i].tags) +	  xfree(tnfa->transitions[i].tags); +	if (tnfa->transitions[i].neg_classes) +	  xfree(tnfa->transitions[i].neg_classes); +      } +  if (tnfa->transitions) +    xfree(tnfa->transitions); + +  if (tnfa->initial) +    { +      for (trans = tnfa->initial; trans->state; trans++) +	{ +	  if (trans->tags) +	    xfree(trans->tags); +	} +      xfree(tnfa->initial); +    } + +  if (tnfa->submatch_data) +    { +      for (i = 0; i < tnfa->num_submatches; i++) +	if (tnfa->submatch_data[i].parents) +	  xfree(tnfa->submatch_data[i].parents); +      xfree(tnfa->submatch_data); +    } + +  if (tnfa->tag_directions) +    xfree(tnfa->tag_directions); +  xfree(tnfa); +} + + +#define ERROR_EXIT(err)		  \ +  do				  \ +    {				  \ +      errcode = err;		  \ +      if (1) goto error_exit;	  \ +    }				  \ + while (0) + + +static int +tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) +{ +  tre_stack_t *stack; +  tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r; +  tre_pos_and_tags_t *p; +  int *counts = NULL, *offs = NULL; +  int i, add = 0; +  tre_tnfa_transition_t *transitions, *initial; +  tre_tnfa_t *tnfa = NULL; +  tre_submatch_data_t *submatch_data; +  tre_tag_direction_t *tag_directions = NULL; +  reg_errcode_t errcode; +  tre_mem_t mem; + +  /* Parse context. */ +  tre_parse_ctx_t parse_ctx; + +  /* Allocate a stack used throughout the compilation process for various +     purposes. */ +  stack = tre_stack_new(512, 10240, 128); +  if (!stack) +    return REG_ESPACE; +  /* Allocate a fast memory allocator. */ +  mem = tre_mem_new(); +  if (!mem) +    { +      tre_stack_destroy(stack); +      return REG_ESPACE; +    } + +  /* Parse the regexp. */ +  memset(&parse_ctx, 0, sizeof(parse_ctx)); +  parse_ctx.mem = mem; +  parse_ctx.stack = stack; +  parse_ctx.re = regex; +  parse_ctx.len = n; +  parse_ctx.cflags = cflags; +  parse_ctx.max_backref = -1; +  DPRINT(("tre_compile: parsing '%.*" STRF "'\n", n, regex)); +  errcode = tre_parse(&parse_ctx); +  if (errcode != REG_OK) +    ERROR_EXIT(errcode); +  preg->re_nsub = parse_ctx.submatch_id - 1; +  tree = parse_ctx.result; + +#ifdef TRE_DEBUG +  tre_ast_print(tree); +#endif /* TRE_DEBUG */ + +  /* Referring to nonexistent subexpressions is illegal. */ +  if (parse_ctx.max_backref > (int)preg->re_nsub) +    ERROR_EXIT(REG_ESUBREG); + +  /* Allocate the TNFA struct. */ +  tnfa = xcalloc(1, sizeof(tre_tnfa_t)); +  if (tnfa == NULL) +    ERROR_EXIT(REG_ESPACE); +  tnfa->have_backrefs = parse_ctx.max_backref >= 0; +  tnfa->num_submatches = parse_ctx.submatch_id; + +  /* Set up tags for submatch addressing.  If REG_NOSUB is set and the +     regexp does not have back references, this can be skipped. */ +  if (tnfa->have_backrefs || !(cflags & REG_NOSUB)) +    { +      DPRINT(("tre_compile: setting up tags\n")); + +      /* Figure out how many tags we will need. */ +      errcode = tre_add_tags(NULL, stack, tree, tnfa); +      if (errcode != REG_OK) +	ERROR_EXIT(errcode); +#ifdef TRE_DEBUG +      tre_ast_print(tree); +#endif /* TRE_DEBUG */ + +      if (tnfa->num_tags > 0) +	{ +	  tag_directions = xmalloc(sizeof(*tag_directions) +				   * (tnfa->num_tags + 1)); +	  if (tag_directions == NULL) +	    ERROR_EXIT(REG_ESPACE); +	  tnfa->tag_directions = tag_directions; +	  memset(tag_directions, -1, +		 sizeof(*tag_directions) * (tnfa->num_tags + 1)); +	} + +      submatch_data = xcalloc(parse_ctx.submatch_id, sizeof(*submatch_data)); +      if (submatch_data == NULL) +	ERROR_EXIT(REG_ESPACE); +      tnfa->submatch_data = submatch_data; + +      errcode = tre_add_tags(mem, stack, tree, tnfa); +      if (errcode != REG_OK) +	ERROR_EXIT(errcode); + +#ifdef TRE_DEBUG +      for (i = 0; i < parse_ctx.submatch_id; i++) +	DPRINT(("pmatch[%d] = {t%d, t%d}\n", +		i, submatch_data[i].so_tag, submatch_data[i].eo_tag)); +      for (i = 0; i < tnfa->num_tags; i++) +	DPRINT(("t%d is %s\n", i, +		tag_directions[i] == TRE_TAG_MINIMIZE ? +		"minimized" : "maximized")); +#endif /* TRE_DEBUG */ +    } + +  /* Expand iteration nodes. */ +  errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position, +			   tag_directions, NULL); +  if (errcode != REG_OK) +    ERROR_EXIT(errcode); + +  /* Add a dummy node for the final state. +     XXX - For certain patterns this dummy node can be optimized away, +	   for example "a*" or "ab*".	Figure out a simple way to detect +	   this possibility. */ +  tmp_ast_l = tree; +  tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++); +  if (tmp_ast_r == NULL) +    ERROR_EXIT(REG_ESPACE); + +  tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r); +  if (tree == NULL) +    ERROR_EXIT(REG_ESPACE); + +#ifdef TRE_DEBUG +  tre_ast_print(tree); +  DPRINT(("Number of states: %d\n", parse_ctx.position)); +#endif /* TRE_DEBUG */ + +  errcode = tre_compute_nfl(mem, stack, tree); +  if (errcode != REG_OK) +    ERROR_EXIT(errcode); + +  counts = xmalloc(sizeof(int) * parse_ctx.position); +  if (counts == NULL) +    ERROR_EXIT(REG_ESPACE); + +  offs = xmalloc(sizeof(int) * parse_ctx.position); +  if (offs == NULL) +    ERROR_EXIT(REG_ESPACE); + +  for (i = 0; i < parse_ctx.position; i++) +    counts[i] = 0; +  tre_ast_to_tnfa(tree, NULL, counts, NULL); + +  add = 0; +  for (i = 0; i < parse_ctx.position; i++) +    { +      offs[i] = add; +      add += counts[i] + 1; +      counts[i] = 0; +    } +  transitions = xcalloc(add + 1, sizeof(*transitions)); +  if (transitions == NULL) +    ERROR_EXIT(REG_ESPACE); +  tnfa->transitions = transitions; +  tnfa->num_transitions = add; + +  DPRINT(("Converting to TNFA:\n")); +  errcode = tre_ast_to_tnfa(tree, transitions, counts, offs); +  if (errcode != REG_OK) +    ERROR_EXIT(errcode); + +  p = tree->firstpos; +  i = 0; +  while (p->position >= 0) +    { +      i++; + +#ifdef TRE_DEBUG +      { +	int *tags; +	DPRINT(("initial: %d", p->position)); +	tags = p->tags; +	if (tags != NULL) +	  { +	    if (*tags >= 0) +	      DPRINT(("/")); +	    while (*tags >= 0) +	      { +		DPRINT(("%d", *tags)); +		tags++; +		if (*tags >= 0) +		  DPRINT((",")); +	      } +	  } +	DPRINT((", assert %d", p->assertions)); +	DPRINT(("\n")); +      } +#endif /* TRE_DEBUG */ + +      p++; +    } + +  initial = xcalloc(i + 1, sizeof(tre_tnfa_transition_t)); +  if (initial == NULL) +    ERROR_EXIT(REG_ESPACE); +  tnfa->initial = initial; + +  i = 0; +  for (p = tree->firstpos; p->position >= 0; p++) +    { +      initial[i].state = transitions + offs[p->position]; +      initial[i].state_id = p->position; +      initial[i].tags = NULL; +      /* Copy the arrays p->tags, they are allocated +	 from a tre_mem object. */ +      if (p->tags) +	{ +	  int j; +	  for (j = 0; p->tags[j] >= 0; j++); +	  initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1)); +	  if (!initial[i].tags) +	    ERROR_EXIT(REG_ESPACE); +	  memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1)); +	} +      initial[i].assertions = p->assertions; +      i++; +    } +  initial[i].state = NULL; + +  tnfa->num_transitions = add; +  tnfa->final = transitions + offs[tree->lastpos[0].position]; +  tnfa->num_states = parse_ctx.position; +  tnfa->cflags = cflags; + +  DPRINT(("final state %p\n", (void *)tnfa->final)); + +  tre_mem_destroy(mem); +  tre_stack_destroy(stack); +  xfree(counts); +  xfree(offs); + +  preg->TRE_REGEX_T_FIELD = (void *)tnfa; +  return REG_OK; + + error_exit: +  /* Free everything that was allocated and return the error code. */ +  tre_mem_destroy(mem); +  if (stack != NULL) +    tre_stack_destroy(stack); +  if (counts != NULL) +    xfree(counts); +  if (offs != NULL) +    xfree(offs); +  preg->TRE_REGEX_T_FIELD = (void *)tnfa; +  tre_free(preg); +  return errcode; +} + + +/*********************************************************************** + from regcomp.c +***********************************************************************/ + +int +regcomp(regex_t *preg, const char *regex, int cflags) +{ +  int ret; +  tre_char_t *wregex; +  size_t n = strlen(regex); + +  if (n+1 > SIZE_MAX/sizeof(tre_char_t)) +    return REG_ESPACE; +  wregex = xmalloc(sizeof(tre_char_t) * (n + 1)); +  if (wregex == NULL) +    return REG_ESPACE; + +  n = mbstowcs(wregex, regex, n+1); +  if (n == (size_t)-1) { +    xfree(wregex); +    return REG_BADPAT; +  } + +  ret = tre_compile(preg, wregex, n, cflags); +  xfree(wregex); + +  return ret; +} + +void +regfree(regex_t *preg) +{ +  tre_free(preg); +} + +/* EOF */ diff --git a/src/regex/regerror.c b/src/regex/regerror.c new file mode 100644 index 00000000..39d70b2a --- /dev/null +++ b/src/regex/regerror.c @@ -0,0 +1,75 @@ +/* +  regerror.c - POSIX regerror() implementation for TRE. + +  Copyright (c) 2001-2006 Ville Laurikari <vl@iki.fi>. + +  This library is free software; you can redistribute it and/or +  modify it under the terms of the GNU Lesser General Public +  License as published by the Free Software Foundation; either +  version 2.1 of the License, or (at your option) any later version. + +  This library is distributed in the hope that it will be useful, +  but WITHOUT ANY WARRANTY; without even the implied warranty of +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +  Lesser General Public License for more details. + +  You should have received a copy of the GNU Lesser General Public +  License along with this library; if not, write to the Free Software +  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA + +*/ + +#include <string.h> +#include <regex.h> + +/* Error message strings for error codes listed in `regex.h'.  This list +   needs to be in sync with the codes listed there, naturally. */ + +/* Converted to single string by Rich Felker to remove the need for + * data relocations at runtime, 27 Feb 2006. */ + +static const char tre_error_messages[] = { +  "No error\0" +  "No match\0" +  "Invalid regexp\0" +  "Unknown collating element\0" +  "Unknown character class name\0" +  "Trailing backslash\0" +  "Invalid back reference\0" +  "Missing ']'\0" +  "Missing ')'\0" +  "Missing '}'\0" +  "Invalid contents of {}\0" +  "Invalid character range\0" +  "Out of memory\0" +  "XXX\0" +}; + +size_t +regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size) +{ +  const char *err; +  size_t err_len; + +  if (errcode >= 0 && errcode <= REG_BADRPT) +    for (err=tre_error_messages; errcode; errcode--, err+=strlen(err)+1); +  else +    err = "Unknown error"; + +  err_len = strlen(err) + 1; +  if (errbuf_size > 0 && errbuf != NULL) +    { +      if (err_len > errbuf_size) +	{ +	  memcpy(errbuf, err, errbuf_size - 1); +	  errbuf[errbuf_size - 1] = '\0'; +	} +      else +	{ +	  strcpy(errbuf, err); +	} +    } +  return err_len; +} + +/* EOF */ diff --git a/src/regex/regexec.c b/src/regex/regexec.c new file mode 100644 index 00000000..0c3d2834 --- /dev/null +++ b/src/regex/regexec.c @@ -0,0 +1,1107 @@ +/* +  regexec.c - TRE POSIX compatible matching functions (and more). + +  Copyright (c) 2001-2006 Ville Laurikari <vl@iki.fi>. + +  This library is free software; you can redistribute it and/or +  modify it under the terms of the GNU Lesser General Public +  License as published by the Free Software Foundation; either +  version 2.1 of the License, or (at your option) any later version. + +  This library is distributed in the hope that it will be useful, +  but WITHOUT ANY WARRANTY; without even the implied warranty of +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +  Lesser General Public License for more details. + +  You should have received a copy of the GNU Lesser General Public +  License along with this library; if not, write to the Free Software +  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA + +*/ + +#include <stdlib.h> +#include <string.h> +#include <wchar.h> +#include <wctype.h> +#include <limits.h> + +#include <regex.h> + +#include "tre.h" + +#include <assert.h> + +static void +tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, +		const tre_tnfa_t *tnfa, int *tags, int match_eo); + +/*********************************************************************** + from tre-match-utils.h +***********************************************************************/ + +#define GET_NEXT_WCHAR() do {                                                 \ +    prev_c = next_c; pos += pos_add_next;                                     \ +    if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) {        \ +        if (pos_add_next < 0) return REG_NOMATCH;                             \ +        else pos_add_next++;                                                  \ +    }                                                                         \ +    str_byte += pos_add_next;                                                 \ +  } while (0) + +#define CHECK_ASSERTIONS(assertions)					      \ +  (((assertions & ASSERT_AT_BOL)					      \ +    && (pos > 0 || reg_notbol)						      \ +    && (prev_c != L'\n' || !reg_newline))				      \ +   || ((assertions & ASSERT_AT_EOL)					      \ +       && (next_c != L'\0' || reg_noteol)				      \ +       && (next_c != L'\n' || !reg_newline))) + +/* Returns 1 if `t1' wins `t2', 0 otherwise. */ +static int +tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions, +	      int *t1, int *t2) +{ +  int i; +  for (i = 0; i < num_tags; i++) +    { +      if (tag_directions[i] == TRE_TAG_MINIMIZE) +	{ +	  if (t1[i] < t2[i]) +	    return 1; +	  if (t1[i] > t2[i]) +	    return 0; +	} +      else +	{ +	  if (t1[i] > t2[i]) +	    return 1; +	  if (t1[i] < t2[i]) +	    return 0; +	} +    } +  /*  assert(0);*/ +  return 0; +} + +static int +tre_neg_char_classes_match(tre_ctype_t *classes, tre_cint_t wc, int icase) +{ +  DPRINT(("neg_char_classes_test: %p, %d, %d\n", classes, wc, icase)); +  while (*classes != (tre_ctype_t)0) +    if ((!icase && tre_isctype(wc, *classes)) +	|| (icase && (tre_isctype(tre_toupper(wc), *classes) +		      || tre_isctype(tre_tolower(wc), *classes)))) +      return 1; /* Match. */ +    else +      classes++; +  return 0; /* No match. */ +} + + +/*********************************************************************** + from tre-match-parallel.c +***********************************************************************/ + +/* +  This algorithm searches for matches basically by reading characters +  in the searched string one by one, starting at the beginning.	 All +  matching paths in the TNFA are traversed in parallel.	 When two or +  more paths reach the same state, exactly one is chosen according to +  tag ordering rules; if returning submatches is not required it does +  not matter which path is chosen. + +  The worst case time required for finding the leftmost and longest +  match, or determining that there is no match, is always linearly +  dependent on the length of the text being searched. + +  This algorithm cannot handle TNFAs with back referencing nodes. +  See `tre-match-backtrack.c'. +*/ + + +typedef struct { +  tre_tnfa_transition_t *state; +  int *tags; +} tre_tnfa_reach_t; + +typedef struct { +  int pos; +  int **tags; +} tre_reach_pos_t; + + +#ifdef TRE_DEBUG +static void +tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags) +{ +  int i; + +  while (reach->state != NULL) +    { +      DPRINT((" %p", (void *)reach->state)); +      if (num_tags > 0) +	{ +	  DPRINT(("/")); +	  for (i = 0; i < num_tags; i++) +	    { +	      DPRINT(("%d:%d", i, reach->tags[i])); +	      if (i < (num_tags-1)) +		DPRINT((",")); +	    } +	} +      reach++; +    } +  DPRINT(("\n")); + +} +#endif /* TRE_DEBUG */ + +static reg_errcode_t +tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, +		      int *match_tags, int eflags, int *match_end_ofs) +{ +  /* State variables required by GET_NEXT_WCHAR. */ +  tre_char_t prev_c = 0, next_c = 0; +  const char *str_byte = string; +  int pos = -1; +  int pos_add_next = 1; +#ifdef TRE_MBSTATE +  mbstate_t mbstate; +#endif /* TRE_MBSTATE */ +  int reg_notbol = eflags & REG_NOTBOL; +  int reg_noteol = eflags & REG_NOTEOL; +  int reg_newline = tnfa->cflags & REG_NEWLINE; + +  char *buf; +  tre_tnfa_transition_t *trans_i; +  tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i; +  tre_reach_pos_t *reach_pos; +  int *tag_i; +  int num_tags, i; + +  int match_eo = -1;	   /* end offset of match (-1 if no match found yet) */ +  int new_match = 0; +  int *tmp_tags = NULL; +  int *tmp_iptr; + +#ifdef TRE_MBSTATE +  memset(&mbstate, '\0', sizeof(mbstate)); +#endif /* TRE_MBSTATE */ + +  if (!match_tags) +    num_tags = 0; +  else +    num_tags = tnfa->num_tags; + +  /* Allocate memory for temporary data required for matching.	This needs to +     be done for every matching operation to be thread safe.  This allocates +     everything in a single large block from the stack frame using alloca() +     or with malloc() if alloca is unavailable. */ +  { +    int tbytes, rbytes, pbytes, xbytes, total_bytes; +    char *tmp_buf; +    /* Compute the length of the block we need. */ +    tbytes = sizeof(*tmp_tags) * num_tags; +    rbytes = sizeof(*reach_next) * (tnfa->num_states + 1); +    pbytes = sizeof(*reach_pos) * tnfa->num_states; +    xbytes = sizeof(int) * num_tags; +    total_bytes = +      (sizeof(long) - 1) * 4 /* for alignment paddings */ +      + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes; + +    /* Allocate the memory. */ +#ifdef TRE_USE_ALLOCA +    buf = alloca(total_bytes); +#else /* !TRE_USE_ALLOCA */ +    buf = xmalloc(total_bytes); +#endif /* !TRE_USE_ALLOCA */ +    if (buf == NULL) +      return REG_ESPACE; +    memset(buf, 0, total_bytes); + +    /* Get the various pointers within tmp_buf (properly aligned). */ +    tmp_tags = (void *)buf; +    tmp_buf = buf + tbytes; +    tmp_buf += ALIGN(tmp_buf, long); +    reach_next = (void *)tmp_buf; +    tmp_buf += rbytes; +    tmp_buf += ALIGN(tmp_buf, long); +    reach = (void *)tmp_buf; +    tmp_buf += rbytes; +    tmp_buf += ALIGN(tmp_buf, long); +    reach_pos = (void *)tmp_buf; +    tmp_buf += pbytes; +    tmp_buf += ALIGN(tmp_buf, long); +    for (i = 0; i < tnfa->num_states; i++) +      { +	reach[i].tags = (void *)tmp_buf; +	tmp_buf += xbytes; +	reach_next[i].tags = (void *)tmp_buf; +	tmp_buf += xbytes; +      } +  } + +  for (i = 0; i < tnfa->num_states; i++) +    reach_pos[i].pos = -1; + +  GET_NEXT_WCHAR(); +  pos = 0; + +  DPRINT(("length: %d\n", len)); +  DPRINT(("pos:chr/code | states and tags\n")); +  DPRINT(("-------------+------------------------------------------------\n")); + +  reach_next_i = reach_next; +  while (1) +    { +      /* If no match found yet, add the initial states to `reach_next'. */ +      if (match_eo < 0) +	{ +	  DPRINT((" init >")); +	  trans_i = tnfa->initial; +	  while (trans_i->state != NULL) +	    { +	      if (reach_pos[trans_i->state_id].pos < pos) +		{ +		  if (trans_i->assertions +		      && CHECK_ASSERTIONS(trans_i->assertions)) +		    { +		      DPRINT(("assertion failed\n")); +		      trans_i++; +		      continue; +		    } + +		  DPRINT((" %p", (void *)trans_i->state)); +		  reach_next_i->state = trans_i->state; +		  for (i = 0; i < num_tags; i++) +		    reach_next_i->tags[i] = -1; +		  tag_i = trans_i->tags; +		  if (tag_i) +		    while (*tag_i >= 0) +		      { +			if (*tag_i < num_tags) +			  reach_next_i->tags[*tag_i] = pos; +			tag_i++; +		      } +		  if (reach_next_i->state == tnfa->final) +		    { +		      DPRINT(("	 found empty match\n")); +		      match_eo = pos; +		      new_match = 1; +		      for (i = 0; i < num_tags; i++) +			match_tags[i] = reach_next_i->tags[i]; +		    } +		  reach_pos[trans_i->state_id].pos = pos; +		  reach_pos[trans_i->state_id].tags = &reach_next_i->tags; +		  reach_next_i++; +		} +	      trans_i++; +	    } +	  DPRINT(("\n")); +	  reach_next_i->state = NULL; +	} +      else +	{ +	  if (num_tags == 0 || reach_next_i == reach_next) +	    /* We have found a match. */ +	    break; +	} + +      /* Check for end of string. */ +      if (!next_c) break; + +      GET_NEXT_WCHAR(); + +#ifdef TRE_DEBUG +      DPRINT(("%3d:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c)); +      tre_print_reach(tnfa, reach_next, num_tags); +      DPRINT(("%3d:%2lc/%05d |", pos, (tre_cint_t)next_c, (int)next_c)); +      tre_print_reach(tnfa, reach_next, num_tags); +#endif /* TRE_DEBUG */ + +      /* Swap `reach' and `reach_next'. */ +      reach_i = reach; +      reach = reach_next; +      reach_next = reach_i; + +      /* For each state in `reach' see if there is a transition leaving with +	 the current input symbol to a state not yet in `reach_next', and +	 add the destination states to `reach_next'. */ +      reach_next_i = reach_next; +      for (reach_i = reach; reach_i->state; reach_i++) +	{ +	  for (trans_i = reach_i->state; trans_i->state; trans_i++) +	    { +	      /* Does this transition match the input symbol? */ +	      if (trans_i->code_min <= prev_c && +		  trans_i->code_max >= prev_c) +		{ +		  if (trans_i->assertions +		      && (CHECK_ASSERTIONS(trans_i->assertions) +			  /* Handle character class transitions. */ +			  || ((trans_i->assertions & ASSERT_CHAR_CLASS) +			      && !(tnfa->cflags & REG_ICASE) +			      && !tre_isctype((tre_cint_t)prev_c, +					      trans_i->u.class)) +			  || ((trans_i->assertions & ASSERT_CHAR_CLASS) +			      && (tnfa->cflags & REG_ICASE) +			      && (!tre_isctype(tre_tolower((tre_cint_t)prev_c), +					       trans_i->u.class) +				  && !tre_isctype(tre_toupper((tre_cint_t)prev_c), +						  trans_i->u.class))) +			  || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG) +			      && tre_neg_char_classes_match(trans_i->neg_classes, +							    (tre_cint_t)prev_c, +							    tnfa->cflags & REG_ICASE)))) +		    { +		      DPRINT(("assertion failed\n")); +		      continue; +		    } + +		  /* Compute the tags after this transition. */ +		  for (i = 0; i < num_tags; i++) +		    tmp_tags[i] = reach_i->tags[i]; +		  tag_i = trans_i->tags; +		  if (tag_i != NULL) +		    while (*tag_i >= 0) +		      { +			if (*tag_i < num_tags) +			  tmp_tags[*tag_i] = pos; +			tag_i++; +		      } + +		  if (reach_pos[trans_i->state_id].pos < pos) +		    { +		      /* Found an unvisited node. */ +		      reach_next_i->state = trans_i->state; +		      tmp_iptr = reach_next_i->tags; +		      reach_next_i->tags = tmp_tags; +		      tmp_tags = tmp_iptr; +		      reach_pos[trans_i->state_id].pos = pos; +		      reach_pos[trans_i->state_id].tags = &reach_next_i->tags; + +		      if (reach_next_i->state == tnfa->final +			  && (match_eo == -1 +			      || (num_tags > 0 +				  && reach_next_i->tags[0] <= match_tags[0]))) +			{ +			  DPRINT(("  found match %p\n", trans_i->state)); +			  match_eo = pos; +			  new_match = 1; +			  for (i = 0; i < num_tags; i++) +			    match_tags[i] = reach_next_i->tags[i]; +			} +		      reach_next_i++; + +		    } +		  else +		    { +		      assert(reach_pos[trans_i->state_id].pos == pos); +		      /* Another path has also reached this state.  We choose +			 the winner by examining the tag values for both +			 paths. */ +		      if (tre_tag_order(num_tags, tnfa->tag_directions, +					tmp_tags, +					*reach_pos[trans_i->state_id].tags)) +			{ +			  /* The new path wins. */ +			  tmp_iptr = *reach_pos[trans_i->state_id].tags; +			  *reach_pos[trans_i->state_id].tags = tmp_tags; +			  if (trans_i->state == tnfa->final) +			    { +			      DPRINT(("	 found better match\n")); +			      match_eo = pos; +			      new_match = 1; +			      for (i = 0; i < num_tags; i++) +				match_tags[i] = tmp_tags[i]; +			    } +			  tmp_tags = tmp_iptr; +			} +		    } +		} +	    } +	} +      reach_next_i->state = NULL; +    } + +  DPRINT(("match end offset = %d\n", match_eo)); + +#ifndef TRE_USE_ALLOCA +  if (buf) +    xfree(buf); +#endif /* !TRE_USE_ALLOCA */ + +  *match_end_ofs = match_eo; +  return match_eo >= 0 ? REG_OK : REG_NOMATCH; +} + + +/*********************************************************************** + from tre-match-backtrack.c +***********************************************************************/ + +/* +  This matcher is for regexps that use back referencing.  Regexp matching +  with back referencing is an NP-complete problem on the number of back +  references.  The easiest way to match them is to use a backtracking +  routine which basically goes through all possible paths in the TNFA +  and chooses the one which results in the best (leftmost and longest) +  match.  This can be spectacularly expensive and may run out of stack +  space, but there really is no better known generic algorithm.	 Quoting +  Henry Spencer from comp.compilers: +  <URL: http://compilers.iecc.com/comparch/article/93-03-102> + +    POSIX.2 REs require longest match, which is really exciting to +    implement since the obsolete ("basic") variant also includes +    \<digit>.  I haven't found a better way of tackling this than doing +    a preliminary match using a DFA (or simulation) on a modified RE +    that just replicates subREs for \<digit>, and then doing a +    backtracking match to determine whether the subRE matches were +    right.  This can be rather slow, but I console myself with the +    thought that people who use \<digit> deserve very slow execution. +    (Pun unintentional but very appropriate.) + +*/ + +typedef struct { +  int pos; +  const char *str_byte; +  tre_tnfa_transition_t *state; +  int state_id; +  int next_c; +  int *tags; +#ifdef TRE_MBSTATE +  mbstate_t mbstate; +#endif /* TRE_MBSTATE */ +} tre_backtrack_item_t; + +typedef struct tre_backtrack_struct { +  tre_backtrack_item_t item; +  struct tre_backtrack_struct *prev; +  struct tre_backtrack_struct *next; +} *tre_backtrack_t; + +#ifdef TRE_MBSTATE +#define BT_STACK_MBSTATE_IN  stack->item.mbstate = (mbstate) +#define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate +#else /* !TRE_MBSTATE */ +#define BT_STACK_MBSTATE_IN +#define BT_STACK_MBSTATE_OUT +#endif /* !TRE_MBSTATE */ + + +#ifdef TRE_USE_ALLOCA +#define tre_bt_mem_new		  tre_mem_newa +#define tre_bt_mem_alloc	  tre_mem_alloca +#define tre_bt_mem_destroy(obj)	  do { } while (0) +#else /* !TRE_USE_ALLOCA */ +#define tre_bt_mem_new		  tre_mem_new +#define tre_bt_mem_alloc	  tre_mem_alloc +#define tre_bt_mem_destroy	  tre_mem_destroy +#endif /* !TRE_USE_ALLOCA */ + + +#define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \ +  do									      \ +    {									      \ +      int i;								      \ +      if (!stack->next)							      \ +	{								      \ +	  tre_backtrack_t s;						      \ +	  s = tre_bt_mem_alloc(mem, sizeof(*s));			      \ +	  if (!s)							      \ +	    {								      \ +	      tre_bt_mem_destroy(mem);					      \ +	      if (tags)							      \ +		xfree(tags);						      \ +	      if (pmatch)						      \ +		xfree(pmatch);						      \ +	      if (states_seen)						      \ +		xfree(states_seen);					      \ +	      return REG_ESPACE;					      \ +	    }								      \ +	  s->prev = stack;						      \ +	  s->next = NULL;						      \ +	  s->item.tags = tre_bt_mem_alloc(mem,				      \ +					  sizeof(*tags) * tnfa->num_tags);    \ +	  if (!s->item.tags)						      \ +	    {								      \ +	      tre_bt_mem_destroy(mem);					      \ +	      if (tags)							      \ +		xfree(tags);						      \ +	      if (pmatch)						      \ +		xfree(pmatch);						      \ +	      if (states_seen)						      \ +		xfree(states_seen);					      \ +	      return REG_ESPACE;					      \ +	    }								      \ +	  stack->next = s;						      \ +	  stack = s;							      \ +	}								      \ +      else								      \ +	stack = stack->next;						      \ +      stack->item.pos = (_pos);						      \ +      stack->item.str_byte = (_str_byte);				      \ +      stack->item.state = (_state);					      \ +      stack->item.state_id = (_state_id);				      \ +      stack->item.next_c = (_next_c);					      \ +      for (i = 0; i < tnfa->num_tags; i++)				      \ +	stack->item.tags[i] = (_tags)[i];				      \ +      BT_STACK_MBSTATE_IN;						      \ +    }									      \ +  while (0) + +#define BT_STACK_POP()							      \ +  do									      \ +    {									      \ +      int i;								      \ +      assert(stack->prev);						      \ +      pos = stack->item.pos;						      \ +      str_byte = stack->item.str_byte;					      \ +      state = stack->item.state;					      \ +      next_c = stack->item.next_c;					      \ +      for (i = 0; i < tnfa->num_tags; i++)				      \ +	tags[i] = stack->item.tags[i];					      \ +      BT_STACK_MBSTATE_OUT;						      \ +      stack = stack->prev;						      \ +    }									      \ +  while (0) + +#undef MIN +#define MIN(a, b) ((a) <= (b) ? (a) : (b)) + +static reg_errcode_t +tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, +		       int len, int *match_tags, +		       int eflags, int *match_end_ofs) +{ +  /* State variables required by GET_NEXT_WCHAR. */ +  tre_char_t prev_c = 0, next_c = 0; +  const char *str_byte = string; +  int pos = 0; +  int pos_add_next = 1; +#ifdef TRE_MBSTATE +  mbstate_t mbstate; +#endif /* TRE_MBSTATE */ +  int reg_notbol = eflags & REG_NOTBOL; +  int reg_noteol = eflags & REG_NOTEOL; +  int reg_newline = tnfa->cflags & REG_NEWLINE; + +  /* These are used to remember the necessary values of the above +     variables to return to the position where the current search +     started from. */ +  int next_c_start; +  const char *str_byte_start; +  int pos_start = -1; +#ifdef TRE_MBSTATE +  mbstate_t mbstate_start; +#endif /* TRE_MBSTATE */ + +  /* Compilation flags for this regexp. */ +  int cflags = tnfa->cflags; + +  /* End offset of best match so far, or -1 if no match found yet. */ +  int match_eo = -1; +  /* Tag arrays. */ +  int *next_tags, *tags = NULL; +  /* Current TNFA state. */ +  tre_tnfa_transition_t *state; +  int *states_seen = NULL; + +  /* Memory allocator to for allocating the backtracking stack. */ +  tre_mem_t mem = tre_bt_mem_new(); + +  /* The backtracking stack. */ +  tre_backtrack_t stack; + +  tre_tnfa_transition_t *trans_i; +  regmatch_t *pmatch = NULL; +  int ret; + +#ifdef TRE_MBSTATE +  memset(&mbstate, '\0', sizeof(mbstate)); +#endif /* TRE_MBSTATE */ + +  if (!mem) +    return REG_ESPACE; +  stack = tre_bt_mem_alloc(mem, sizeof(*stack)); +  if (!stack) +    { +      ret = REG_ESPACE; +      goto error_exit; +    } +  stack->prev = NULL; +  stack->next = NULL; + +#ifdef TRE_USE_ALLOCA +  tags = alloca(sizeof(*tags) * tnfa->num_tags); +  pmatch = alloca(sizeof(*pmatch) * tnfa->num_submatches); +  states_seen = alloca(sizeof(*states_seen) * tnfa->num_states); +#else /* !TRE_USE_ALLOCA */ +  tags = xmalloc(sizeof(*tags) * tnfa->num_tags); +  if (!tags) +    { +      ret = REG_ESPACE; +      goto error_exit; +    } +  pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches); +  if (!pmatch) +    { +      ret = REG_ESPACE; +      goto error_exit; +    } +  states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states); +  if (!states_seen) +    { +      ret = REG_ESPACE; +      goto error_exit; +    } +#endif /* !TRE_USE_ALLOCA */ + + retry: +  { +    int i; +    for (i = 0; i < tnfa->num_tags; i++) +      { +	tags[i] = -1; +	if (match_tags) +	  match_tags[i] = -1; +      } +    for (i = 0; i < tnfa->num_states; i++) +      states_seen[i] = 0; +  } + +  state = NULL; +  pos = pos_start; +  GET_NEXT_WCHAR(); +  pos_start = pos; +  next_c_start = next_c; +  str_byte_start = str_byte; +#ifdef TRE_MBSTATE +  mbstate_start = mbstate; +#endif /* TRE_MBSTATE */ + +  /* Handle initial states. */ +  next_tags = NULL; +  for (trans_i = tnfa->initial; trans_i->state; trans_i++) +    { +      DPRINT(("> init %p, prev_c %lc\n", trans_i->state, (tre_cint_t)prev_c)); +      if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions)) +	{ +	  DPRINT(("assert failed\n")); +	  continue; +	} +      if (state == NULL) +	{ +	  /* Start from this state. */ +	  state = trans_i->state; +	  next_tags = trans_i->tags; +	} +      else +	{ +	  /* Backtrack to this state. */ +	  DPRINT(("saving state %d for backtracking\n", trans_i->state_id)); +	  BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state, +			trans_i->state_id, next_c, tags, mbstate); +	  { +	    int *tmp = trans_i->tags; +	    if (tmp) +	      while (*tmp >= 0) +		stack->item.tags[*tmp++] = pos; +	  } +	} +    } + +  if (next_tags) +    for (; *next_tags >= 0; next_tags++) +      tags[*next_tags] = pos; + + +  DPRINT(("entering match loop, pos %d, str_byte %p\n", pos, str_byte)); +  DPRINT(("pos:chr/code | state and tags\n")); +  DPRINT(("-------------+------------------------------------------------\n")); + +  if (state == NULL) +    goto backtrack; + +  while (1) +    { +      tre_tnfa_transition_t *trans_i, *next_state; +      int empty_br_match; + +      DPRINT(("start loop\n")); +      if (state == tnfa->final) +	{ +	  DPRINT(("  match found, %d %d\n", match_eo, pos)); +	  if (match_eo < pos +	      || (match_eo == pos +		  && match_tags +		  && tre_tag_order(tnfa->num_tags, tnfa->tag_directions, +				   tags, match_tags))) +	    { +	      int i; +	      /* This match wins the previous match. */ +	      DPRINT(("	 win previous\n")); +	      match_eo = pos; +	      if (match_tags) +		for (i = 0; i < tnfa->num_tags; i++) +		  match_tags[i] = tags[i]; +	    } +	  /* Our TNFAs never have transitions leaving from the final state, +	     so we jump right to backtracking. */ +	  goto backtrack; +	} + +#ifdef TRE_DEBUG +      DPRINT(("%3d:%2lc/%05d | %p ", pos, (tre_cint_t)next_c, (int)next_c, +	      state)); +      { +	int i; +	for (i = 0; i < tnfa->num_tags; i++) +	  DPRINT(("%d%s", tags[i], i < tnfa->num_tags - 1 ? ", " : "")); +	DPRINT(("\n")); +      } +#endif /* TRE_DEBUG */ + +      /* Go to the next character in the input string. */ +      empty_br_match = 0; +      trans_i = state; +      if (trans_i->state && trans_i->assertions & ASSERT_BACKREF) +	{ +	  /* This is a back reference state.  All transitions leaving from +	     this state have the same back reference "assertion".  Instead +	     of reading the next character, we match the back reference. */ +	  int so, eo, bt = trans_i->u.backref; +	  int bt_len; +	  int result; + +	  DPRINT(("  should match back reference %d\n", bt)); +	  /* Get the substring we need to match against.  Remember to +	     turn off REG_NOSUB temporarily. */ +	  tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & !REG_NOSUB, +			  tnfa, tags, pos); +	  so = pmatch[bt].rm_so; +	  eo = pmatch[bt].rm_eo; +	  bt_len = eo - so; + +	  if (len < 0) +	    { +	      result = strncmp((char*)string + so, str_byte - 1, bt_len); +	    } +	  else if (len - pos < bt_len) +	    result = 1; +	  else +	    result = memcmp((char*)string + so, str_byte - 1, bt_len); + +	  /* We can ignore multibyte characters here because the backref +	     string is already aligned at character boundaries. */ +	  if (result == 0) +	    { +	      /* Back reference matched.  Check for infinite loop. */ +	      if (bt_len == 0) +		empty_br_match = 1; +	      if (empty_br_match && states_seen[trans_i->state_id]) +		{ +		  DPRINT(("  avoid loop\n")); +		  goto backtrack; +		} + +	      states_seen[trans_i->state_id] = empty_br_match; + +	      /* Advance in input string and resync `prev_c', `next_c' +		 and pos. */ +	      DPRINT(("	 back reference matched\n")); +	      str_byte += bt_len - 1; +	      pos += bt_len - 1; +	      GET_NEXT_WCHAR(); +	      DPRINT(("	 pos now %d\n", pos)); +	    } +	  else +	    { +	      DPRINT(("	 back reference did not match\n")); +	      goto backtrack; +	    } +	} +      else +	{ +	  /* Check for end of string. */ +	  if (len < 0) +	    { +	      if (next_c == L'\0') +		goto backtrack; +	    } +	  else +	    { +	      if (pos >= len) +		goto backtrack; +	    } + +	  /* Read the next character. */ +	  GET_NEXT_WCHAR(); +	} + +      next_state = NULL; +      for (trans_i = state; trans_i->state; trans_i++) +	{ +	  DPRINT(("  transition %d-%d (%c-%c) %d to %d\n", +		  trans_i->code_min, trans_i->code_max, +		  trans_i->code_min, trans_i->code_max, +		  trans_i->assertions, trans_i->state_id)); +	  if (trans_i->code_min <= prev_c && trans_i->code_max >= prev_c) +	    { +	      if (trans_i->assertions +		  && (CHECK_ASSERTIONS(trans_i->assertions) +		      /* Handle character class transitions. */ +		      || ((trans_i->assertions & ASSERT_CHAR_CLASS) +			  && !(cflags & REG_ICASE) +			  && !tre_isctype((tre_cint_t)prev_c, trans_i->u.class)) +		      || ((trans_i->assertions & ASSERT_CHAR_CLASS) +			  && (cflags & REG_ICASE) +			  && (!tre_isctype(tre_tolower((tre_cint_t)prev_c), +					   trans_i->u.class) +			      && !tre_isctype(tre_toupper((tre_cint_t)prev_c), +					      trans_i->u.class))) +		      || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG) +			  && tre_neg_char_classes_match(trans_i->neg_classes, +							(tre_cint_t)prev_c, +							cflags & REG_ICASE)))) +		{ +		  DPRINT(("  assertion failed\n")); +		  continue; +		} + +	      if (next_state == NULL) +		{ +		  /* First matching transition. */ +		  DPRINT(("  Next state is %d\n", trans_i->state_id)); +		  next_state = trans_i->state; +		  next_tags = trans_i->tags; +		} +	      else +		{ +		  /* Second mathing transition.	 We may need to backtrack here +		     to take this transition instead of the first one, so we +		     push this transition in the backtracking stack so we can +		     jump back here if needed. */ +		  DPRINT(("  saving state %d for backtracking\n", +			  trans_i->state_id)); +		  BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state, +				trans_i->state_id, next_c, tags, mbstate); +		  { +		    int *tmp; +		    for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++) +		      stack->item.tags[*tmp] = pos; +		  } +#if 0 /* XXX - it's important not to look at all transitions here to keep +	 the stack small! */ +		  break; +#endif +		} +	    } +	} + +      if (next_state != NULL) +	{ +	  /* Matching transitions were found.  Take the first one. */ +	  state = next_state; + +	  /* Update the tag values. */ +	  if (next_tags) +	    while (*next_tags >= 0) +	      tags[*next_tags++] = pos; +	} +      else +	{ +	backtrack: +	  /* A matching transition was not found.  Try to backtrack. */ +	  if (stack->prev) +	    { +	      DPRINT(("	 backtracking\n")); +	      if (stack->item.state->assertions && ASSERT_BACKREF) +		{ +		  DPRINT(("  states_seen[%d] = 0\n", +			  stack->item.state_id)); +		  states_seen[stack->item.state_id] = 0; +		} + +	      BT_STACK_POP(); +	    } +	  else if (match_eo < 0) +	    { +	      /* Try starting from a later position in the input string. */ +	      /* Check for end of string. */ +	      if (len < 0) +		{ +		  if (next_c == L'\0') +		    { +		      DPRINT(("end of string.\n")); +		      break; +		    } +		} +	      else +		{ +		  if (pos >= len) +		    { +		      DPRINT(("end of string.\n")); +		      break; +		    } +		} +	      DPRINT(("restarting from next start position\n")); +	      next_c = next_c_start; +#ifdef TRE_MBSTATE +	      mbstate = mbstate_start; +#endif /* TRE_MBSTATE */ +	      str_byte = str_byte_start; +	      goto retry; +	    } +	  else +	    { +	      DPRINT(("finished\n")); +	      break; +	    } +	} +    } + +  ret = match_eo >= 0 ? REG_OK : REG_NOMATCH; +  *match_end_ofs = match_eo; + + error_exit: +  tre_bt_mem_destroy(mem); +#ifndef TRE_USE_ALLOCA +  if (tags) +    xfree(tags); +  if (pmatch) +    xfree(pmatch); +  if (states_seen) +    xfree(states_seen); +#endif /* !TRE_USE_ALLOCA */ + +  return ret; +} + + +/*********************************************************************** + from regexec.c +***********************************************************************/ + +/* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match +   endpoint values. */ +static void +tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, +		const tre_tnfa_t *tnfa, int *tags, int match_eo) +{ +  tre_submatch_data_t *submatch_data; +  unsigned int i, j; +  int *parents; + +  i = 0; +  if (match_eo >= 0 && !(cflags & REG_NOSUB)) +    { +      /* Construct submatch offsets from the tags. */ +      DPRINT(("end tag = t%d = %d\n", tnfa->end_tag, match_eo)); +      submatch_data = tnfa->submatch_data; +      while (i < tnfa->num_submatches && i < nmatch) +	{ +	  if (submatch_data[i].so_tag == tnfa->end_tag) +	    pmatch[i].rm_so = match_eo; +	  else +	    pmatch[i].rm_so = tags[submatch_data[i].so_tag]; + +	  if (submatch_data[i].eo_tag == tnfa->end_tag) +	    pmatch[i].rm_eo = match_eo; +	  else +	    pmatch[i].rm_eo = tags[submatch_data[i].eo_tag]; + +	  /* If either of the endpoints were not used, this submatch +	     was not part of the match. */ +	  if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1) +	    pmatch[i].rm_so = pmatch[i].rm_eo = -1; + +	  DPRINT(("pmatch[%d] = {t%d = %d, t%d = %d}\n", i, +		  submatch_data[i].so_tag, pmatch[i].rm_so, +		  submatch_data[i].eo_tag, pmatch[i].rm_eo)); +	  i++; +	} +      /* Reset all submatches that are not within all of their parent +	 submatches. */ +      i = 0; +      while (i < tnfa->num_submatches && i < nmatch) +	{ +	  if (pmatch[i].rm_eo == -1) +	    assert(pmatch[i].rm_so == -1); +	  assert(pmatch[i].rm_so <= pmatch[i].rm_eo); + +	  parents = submatch_data[i].parents; +	  if (parents != NULL) +	    for (j = 0; parents[j] >= 0; j++) +	      { +		DPRINT(("pmatch[%d] parent %d\n", i, parents[j])); +		if (pmatch[i].rm_so < pmatch[parents[j]].rm_so +		    || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo) +		  pmatch[i].rm_so = pmatch[i].rm_eo = -1; +	      } +	  i++; +	} +    } + +  while (i < nmatch) +    { +      pmatch[i].rm_so = -1; +      pmatch[i].rm_eo = -1; +      i++; +    } +} + + +/* +  Wrapper functions for POSIX compatible regexp matching. +*/ + +static int +tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len, +	  size_t nmatch, regmatch_t pmatch[], int eflags) +{ +  reg_errcode_t status; +  int *tags = NULL, eo; +  if (tnfa->num_tags > 0 && nmatch > 0) +    { +#ifdef TRE_USE_ALLOCA +      tags = alloca(sizeof(*tags) * tnfa->num_tags); +#else /* !TRE_USE_ALLOCA */ +      tags = xmalloc(sizeof(*tags) * tnfa->num_tags); +#endif /* !TRE_USE_ALLOCA */ +      if (tags == NULL) +	return REG_ESPACE; +    } + +  /* Dispatch to the appropriate matcher. */ +  if (tnfa->have_backrefs) +    { +      /* The regex has back references, use the backtracking matcher. */ +      status = tre_tnfa_run_backtrack(tnfa, string, len, tags, eflags, &eo); +    } +  else +    { +      /* Exact matching, no back references, use the parallel matcher. */ +      status = tre_tnfa_run_parallel(tnfa, string, len, tags, eflags, &eo); +    } + +  if (status == REG_OK) +    /* A match was found, so fill the submatch registers. */ +    tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo); +#ifndef TRE_USE_ALLOCA +  if (tags) +    xfree(tags); +#endif /* !TRE_USE_ALLOCA */ +  return status; +} + +int +regexec(const regex_t *preg, const char *str, +	size_t nmatch, regmatch_t pmatch[], int eflags) +{ +  return tre_match((void *)preg->TRE_REGEX_T_FIELD, str, -1, +                   nmatch, pmatch, eflags); +} + +/* EOF */ diff --git a/src/regex/tre-mem.c b/src/regex/tre-mem.c new file mode 100644 index 00000000..d7bdd3db --- /dev/null +++ b/src/regex/tre-mem.c @@ -0,0 +1,163 @@ +/* +  tre-mem.c - TRE memory allocator + +  Copyright (c) 2001-2006 Ville Laurikari <vl@iki.fi> + +  This library is free software; you can redistribute it and/or +  modify it under the terms of the GNU Lesser General Public +  License as published by the Free Software Foundation; either +  version 2.1 of the License, or (at your option) any later version. + +  This library is distributed in the hope that it will be useful, +  but WITHOUT ANY WARRANTY; without even the implied warranty of +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +  Lesser General Public License for more details. + +  You should have received a copy of the GNU Lesser General Public +  License along with this library; if not, write to the Free Software +  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA + +*/ + +/* +  This memory allocator is for allocating small memory blocks efficiently +  in terms of memory overhead and execution speed.  The allocated blocks +  cannot be freed individually, only all at once.  There can be multiple +  allocators, though. +*/ + +#include <stdlib.h> +#include <string.h> + +#include "tre.h" + + +/* Returns a new memory allocator or NULL if out of memory. */ +tre_mem_t +tre_mem_new_impl(int provided, void *provided_block) +{ +  tre_mem_t mem; +  if (provided) +    { +      mem = provided_block; +      memset(mem, 0, sizeof(*mem)); +    } +  else +    mem = xcalloc(1, sizeof(*mem)); +  if (mem == NULL) +    return NULL; +  return mem; +} + + +/* Frees the memory allocator and all memory allocated with it. */ +void +tre_mem_destroy(tre_mem_t mem) +{ +  tre_list_t *tmp, *l = mem->blocks; + +  while (l != NULL) +    { +      xfree(l->data); +      tmp = l->next; +      xfree(l); +      l = tmp; +    } +  xfree(mem); +} + + +/* Allocates a block of `size' bytes from `mem'.  Returns a pointer to the +   allocated block or NULL if an underlying malloc() failed. */ +void * +tre_mem_alloc_impl(tre_mem_t mem, int provided, void *provided_block, +		   int zero, size_t size) +{ +  void *ptr; + +  if (mem->failed) +    { +      DPRINT(("tre_mem_alloc: oops, called after failure?!\n")); +      return NULL; +    } + +#ifdef MALLOC_DEBUGGING +  if (!provided) +    { +      ptr = xmalloc(1); +      if (ptr == NULL) +	{ +	  DPRINT(("tre_mem_alloc: xmalloc forced failure\n")); +	  mem->failed = 1; +	  return NULL; +	} +      xfree(ptr); +    } +#endif /* MALLOC_DEBUGGING */ + +  if (mem->n < size) +    { +      /* We need more memory than is available in the current block. +	 Allocate a new block. */ +      tre_list_t *l; +      if (provided) +	{ +	  DPRINT(("tre_mem_alloc: using provided block\n")); +	  if (provided_block == NULL) +	    { +	      DPRINT(("tre_mem_alloc: provided block was NULL\n")); +	      mem->failed = 1; +	      return NULL; +	    } +	  mem->ptr = provided_block; +	  mem->n = TRE_MEM_BLOCK_SIZE; +	} +      else +	{ +	  int block_size; +	  if (size * 8 > TRE_MEM_BLOCK_SIZE) +	    block_size = size * 8; +	  else +	    block_size = TRE_MEM_BLOCK_SIZE; +	  DPRINT(("tre_mem_alloc: allocating new %d byte block\n", +		  block_size)); +	  l = xmalloc(sizeof(*l)); +	  if (l == NULL) +	    { +	      mem->failed = 1; +	      return NULL; +	    } +	  l->data = xmalloc(block_size); +	  if (l->data == NULL) +	    { +	      xfree(l); +	      mem->failed = 1; +	      return NULL; +	    } +	  l->next = NULL; +	  if (mem->current != NULL) +	    mem->current->next = l; +	  if (mem->blocks == NULL) +	    mem->blocks = l; +	  mem->current = l; +	  mem->ptr = l->data; +	  mem->n = block_size; +	} +    } + +  /* Make sure the next pointer will be aligned. */ +  size += ALIGN(mem->ptr + size, long); + +  /* Allocate from current block. */ +  ptr = mem->ptr; +  mem->ptr += size; +  mem->n -= size; + +  /* Set to zero if needed. */ +  if (zero) +    memset(ptr, 0, size); + +  return ptr; +} + +/* EOF */ diff --git a/src/regex/tre.h b/src/regex/tre.h new file mode 100644 index 00000000..bfd171f4 --- /dev/null +++ b/src/regex/tre.h @@ -0,0 +1,269 @@ +/* +  tre-internal.h - TRE internal definitions + +  Copyright (c) 2001-2006 Ville Laurikari <vl@iki.fi>. + +  This library is free software; you can redistribute it and/or +  modify it under the terms of the GNU Lesser General Public +  License as published by the Free Software Foundation; either +  version 2.1 of the License, or (at your option) any later version. + +  This library is distributed in the hope that it will be useful, +  but WITHOUT ANY WARRANTY; without even the implied warranty of +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +  Lesser General Public License for more details. + +  You should have received a copy of the GNU Lesser General Public +  License along with this library; if not, write to the Free Software +  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA + +*/ + +#include <regex.h> +#include <wchar.h> +#include <wctype.h> + +#define TRE_MULTIBYTE 1 +#undef  TRE_MBSTATE +#define TRE_WCHAR 1 +#define TRE_USE_SYSTEM_WCTYPE 1 +#define HAVE_WCSTOMBS 1 +#define TRE_MB_CUR_MAX MB_CUR_MAX + +#define NDEBUG + +#define TRE_REGEX_T_FIELD __opaque +typedef int reg_errcode_t; + +typedef wchar_t tre_char_t; + + +#ifdef TRE_DEBUG +#include <stdio.h> +#define DPRINT(msg) do {printf msg; fflush(stdout);} while(0) +#else /* !TRE_DEBUG */ +#define DPRINT(msg) do { } while(0) +#endif /* !TRE_DEBUG */ + +#define elementsof(x)	( sizeof(x) / sizeof(x[0]) ) + +#if 1 +int __mbtowc(wchar_t *, const char *); +#define tre_mbrtowc(pwc, s, n, ps) (__mbtowc((pwc), (s))) +#else +#define tre_mbrtowc(pwc, s, n, ps) (mbtowc((pwc), (s), (n))) +#endif + +/* Wide characters. */ +typedef wint_t tre_cint_t; +#define TRE_CHAR_MAX WCHAR_MAX + +#ifdef TRE_MULTIBYTE +#define TRE_MB_CUR_MAX MB_CUR_MAX +#else /* !TRE_MULTIBYTE */ +#define TRE_MB_CUR_MAX 1 +#endif /* !TRE_MULTIBYTE */ + +#define tre_isalnum iswalnum +#define tre_isalpha iswalpha +#define tre_isblank iswblank +#define tre_iscntrl iswcntrl +#define tre_isdigit iswdigit +#define tre_isgraph iswgraph +#define tre_islower iswlower +#define tre_isprint iswprint +#define tre_ispunct iswpunct +#define tre_isspace iswspace +#define tre_isupper iswupper +#define tre_isxdigit iswxdigit + +#define tre_tolower towlower +#define tre_toupper towupper +#define tre_strlen  wcslen + +/* Use system provided iswctype() and wctype(). */ +typedef wctype_t tre_ctype_t; +#define tre_isctype iswctype +#define tre_ctype   wctype + +/* Returns number of bytes to add to (char *)ptr to make it +   properly aligned for the type. */ +#define ALIGN(ptr, type) \ +  ((((long)ptr) % sizeof(type)) \ +   ? (sizeof(type) - (((long)ptr) % sizeof(type))) \ +   : 0) + +#undef MAX +#undef MIN +#define MAX(a, b) (((a) >= (b)) ? (a) : (b)) +#define MIN(a, b) (((a) <= (b)) ? (a) : (b)) + +/* Define STRF to the correct printf formatter for strings. */ +#define STRF "ls" + +/* TNFA transition type. A TNFA state is an array of transitions, +   the terminator is a transition with NULL `state'. */ +typedef struct tnfa_transition tre_tnfa_transition_t; + +struct tnfa_transition { +  /* Range of accepted characters. */ +  tre_cint_t code_min; +  tre_cint_t code_max; +  /* Pointer to the destination state. */ +  tre_tnfa_transition_t *state; +  /* ID number of the destination state. */ +  int state_id; +  /* -1 terminated array of tags (or NULL). */ +  int *tags; +  /* Assertion bitmap. */ +  int assertions; +  /* Assertion parameters. */ +  union { +    /* Character class assertion. */ +    tre_ctype_t class; +    /* Back reference assertion. */ +    int backref; +  } u; +  /* Negative character class assertions. */ +  tre_ctype_t *neg_classes; +}; + + +/* Assertions. */ +#define ASSERT_AT_BOL		  1   /* Beginning of line. */ +#define ASSERT_AT_EOL		  2   /* End of line. */ +#define ASSERT_CHAR_CLASS	  4   /* Character class in `class'. */ +#define ASSERT_CHAR_CLASS_NEG	  8   /* Character classes in `neg_classes'. */ +#define ASSERT_AT_BOW		 16   /* Beginning of word. */ +#define ASSERT_AT_EOW		 32   /* End of word. */ +#define ASSERT_AT_WB		 64   /* Word boundary. */ +#define ASSERT_AT_WB_NEG	128   /* Not a word boundary. */ +#define ASSERT_BACKREF		256   /* A back reference in `backref'. */ +#define ASSERT_LAST		256 + +/* Tag directions. */ +typedef enum { +  TRE_TAG_MINIMIZE = 0, +  TRE_TAG_MAXIMIZE = 1 +} tre_tag_direction_t; + +/* Instructions to compute submatch register values from tag values +   after a successful match.  */ +struct tre_submatch_data { +  /* Tag that gives the value for rm_so (submatch start offset). */ +  int so_tag; +  /* Tag that gives the value for rm_eo (submatch end offset). */ +  int eo_tag; +  /* List of submatches this submatch is contained in. */ +  int *parents; +}; + +typedef struct tre_submatch_data tre_submatch_data_t; + + +/* TNFA definition. */ +typedef struct tnfa tre_tnfa_t; + +struct tnfa { +  tre_tnfa_transition_t *transitions; +  unsigned int num_transitions; +  tre_tnfa_transition_t *initial; +  tre_tnfa_transition_t *final; +  tre_submatch_data_t *submatch_data; +  unsigned int num_submatches; +  tre_tag_direction_t *tag_directions; +  int num_tags; +  int end_tag; +  int num_states; +  int cflags; +  int have_backrefs; +}; + +#if 0 +static int +tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags); + +static void +tre_free(regex_t *preg); + +static void +tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, +		const tre_tnfa_t *tnfa, int *tags, int match_eo); + +static reg_errcode_t +tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, +		      tre_str_type_t type, int *match_tags, int eflags, +		      int *match_end_ofs); + +static reg_errcode_t +tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, +		      tre_str_type_t type, int *match_tags, int eflags, +		      int *match_end_ofs); + +static reg_errcode_t +tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, +		       int len, tre_str_type_t type, int *match_tags, +		       int eflags, int *match_end_ofs); +#endif + +/* from tre-mem.h: */ + +#define TRE_MEM_BLOCK_SIZE 1024 + +typedef struct tre_list { +  void *data; +  struct tre_list *next; +} tre_list_t; + +typedef struct tre_mem_struct { +  tre_list_t *blocks; +  tre_list_t *current; +  char *ptr; +  size_t n; +  int failed; +  void **provided; +} *tre_mem_t; + +#define tre_mem_new_impl   __tre_mem_new_impl +#define tre_mem_alloc_impl __tre_mem_alloc_impl +#define tre_mem_destroy    __tre_mem_destroy + +tre_mem_t tre_mem_new_impl(int provided, void *provided_block); +void *tre_mem_alloc_impl(tre_mem_t mem, int provided, void *provided_block, +			 int zero, size_t size); + +/* Returns a new memory allocator or NULL if out of memory. */ +#define tre_mem_new()  tre_mem_new_impl(0, NULL) + +/* Allocates a block of `size' bytes from `mem'.  Returns a pointer to the +   allocated block or NULL if an underlying malloc() failed. */ +#define tre_mem_alloc(mem, size) tre_mem_alloc_impl(mem, 0, NULL, 0, size) + +/* Allocates a block of `size' bytes from `mem'.  Returns a pointer to the +   allocated block or NULL if an underlying malloc() failed.  The memory +   is set to zero. */ +#define tre_mem_calloc(mem, size) tre_mem_alloc_impl(mem, 0, NULL, 1, size) + +#ifdef TRE_USE_ALLOCA +/* alloca() versions.  Like above, but memory is allocated with alloca() +   instead of malloc(). */ + +#define tre_mem_newa() \ +  tre_mem_new_impl(1, alloca(sizeof(struct tre_mem_struct))) + +#define tre_mem_alloca(mem, size)					      \ +  ((mem)->n >= (size)							      \ +   ? tre_mem_alloc_impl((mem), 1, NULL, 0, (size))			      \ +   : tre_mem_alloc_impl((mem), 1, alloca(TRE_MEM_BLOCK_SIZE), 0, (size))) +#endif /* TRE_USE_ALLOCA */ + + +/* Frees the memory allocator and all memory allocated with it. */ +void tre_mem_destroy(tre_mem_t mem); + +#define xmalloc malloc +#define xcalloc calloc +#define xfree free +#define xrealloc realloc + +/* EOF */ | 
