diff options
Diffstat (limited to 'src/regex')
| -rw-r--r-- | src/regex/regcomp.c | 1715 | 
1 files changed, 634 insertions, 1081 deletions
| diff --git a/src/regex/regcomp.c b/src/regex/regcomp.c index d9076275..4cdaa1ea 100644 --- a/src/regex/regcomp.c +++ b/src/regex/regcomp.c @@ -34,6 +34,7 @@  #include <regex.h>  #include <limits.h>  #include <stdint.h> +#include <ctype.h>  #include "tre.h" @@ -135,108 +136,88 @@ typedef struct {    tre_ast_node_t *right;  } tre_union_t; -static tre_ast_node_t * -tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size); - -static tre_ast_node_t * -tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position); - -static tre_ast_node_t * -tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, -		 int minimal); - -static tre_ast_node_t * -tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right); - -static tre_ast_node_t * -tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, -		       tre_ast_node_t *right); -  static tre_ast_node_t * -tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size) +tre_ast_new_node(tre_mem_t mem, int type, void *obj)  { -  tre_ast_node_t *node; - -  node = tre_mem_calloc(mem, sizeof(*node)); -  if (!node) -    return NULL; -  node->obj = tre_mem_calloc(mem, size); -  if (!node->obj) -    return NULL; -  node->type = type; -  node->nullable = -1; -  node->submatch_id = -1; - -  return node; +	tre_ast_node_t *node = tre_mem_calloc(mem, sizeof *node); +	if (!node || !obj) +		return 0; +	node->obj = obj; +	node->type = type; +	node->nullable = -1; +	node->submatch_id = -1; +	return node;  }  static tre_ast_node_t *  tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)  { -  tre_ast_node_t *node; -  tre_literal_t *lit; - -  node = tre_ast_new_node(mem, LITERAL, sizeof(tre_literal_t)); -  if (!node) -    return NULL; -  lit = node->obj; -  lit->code_min = code_min; -  lit->code_max = code_max; -  lit->position = position; - -  return node; +	tre_ast_node_t *node; +	tre_literal_t *lit; + +	lit = tre_mem_calloc(mem, sizeof *lit); +	node = tre_ast_new_node(mem, LITERAL, lit); +	if (!node) +		return 0; +	lit->code_min = code_min; +	lit->code_max = code_max; +	lit->position = position; +	return node;  }  static tre_ast_node_t * -tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, -		 int minimal) +tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, int minimal)  { -  tre_ast_node_t *node; -  tre_iteration_t *iter; - -  node = tre_ast_new_node(mem, ITERATION, sizeof(tre_iteration_t)); -  if (!node) -    return NULL; -  iter = node->obj; -  iter->arg = arg; -  iter->min = min; -  iter->max = max; -  iter->minimal = minimal; -  node->num_submatches = arg->num_submatches; - -  return node; +	tre_ast_node_t *node; +	tre_iteration_t *iter; + +	iter = tre_mem_calloc(mem, sizeof *iter); +	node = tre_ast_new_node(mem, ITERATION, iter); +	if (!node) +		return 0; +	iter->arg = arg; +	iter->min = min; +	iter->max = max; +	iter->minimal = minimal; +	node->num_submatches = arg->num_submatches; +	return node;  }  static tre_ast_node_t *  tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)  { -  tre_ast_node_t *node; - -  node = tre_ast_new_node(mem, UNION, sizeof(tre_union_t)); -  if (node == NULL) -    return NULL; -  ((tre_union_t *)node->obj)->left = left; -  ((tre_union_t *)node->obj)->right = right; -  node->num_submatches = left->num_submatches + right->num_submatches; - -  return node; +	tre_ast_node_t *node; +	tre_union_t *un; + +	if (!left) +		return right; +	un = tre_mem_calloc(mem, sizeof *un); +	node = tre_ast_new_node(mem, UNION, un); +	if (!node || !right) +		return 0; +	un->left = left; +	un->right = right; +	node->num_submatches = left->num_submatches + right->num_submatches; +	return node;  }  static tre_ast_node_t * -tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, -		       tre_ast_node_t *right) +tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)  { -  tre_ast_node_t *node; - -  node = tre_ast_new_node(mem, CATENATION, sizeof(tre_catenation_t)); -  if (node == NULL) -    return NULL; -  ((tre_catenation_t *)node->obj)->left = left; -  ((tre_catenation_t *)node->obj)->right = right; -  node->num_submatches = left->num_submatches + right->num_submatches; - -  return node; +	tre_ast_node_t *node; +	tre_catenation_t *cat; + +	if (!left) +		return right; +	cat = tre_mem_calloc(mem, sizeof *cat); +	node = tre_ast_new_node(mem, CATENATION, cat); +	if (!node) +		return 0; +	cat->left = left; +	cat->right = right; +	node->num_submatches = left->num_submatches + right->num_submatches; +	return node;  } @@ -412,1077 +393,654 @@ define_popf(voidptr, void *)  /* Parse context. */  typedef struct { -  /* Memory allocator.	The AST is allocated using this. */ -  tre_mem_t mem; -  /* Stack used for keeping track of regexp syntax. */ -  tre_stack_t *stack; -  /* The parse result. */ -  tre_ast_node_t *result; -  /* The regexp to parse and its length. */ -  const char *re; -  /* The first character of the entire regexp. */ -  const char *re_start; -  /* Current submatch ID. */ -  int submatch_id; -  /* Current position (number of literal). */ -  int position; -  /* The highest back reference or -1 if none seen so far. */ -  int max_backref; -  /* This flag is set if the regexp uses approximate matching. */ -  int have_approx; -  /* Compilation flags. */ -  int cflags; -  /* If this flag is set the top-level submatch is not captured. */ -  int nofirstsub; +	/* Memory allocator. The AST is allocated using this. */ +	tre_mem_t mem; +	/* Stack used for keeping track of regexp syntax. */ +	tre_stack_t *stack; +	/* The parsed node after a parse function returns. */ +	tre_ast_node_t *n; +	/* Position in the regexp pattern after a parse function returns. */ +	const char *s; +	/* The first character of the regexp. */ +	const char *re; +	/* Current submatch ID. */ +	int submatch_id; +	/* Current position (number of literal). */ +	int position; +	/* The highest back reference or -1 if none seen so far. */ +	int max_backref; +	/* Compilation flags. */ +	int cflags;  } tre_parse_ctx_t; -/* Parses a wide character regexp pattern into a syntax tree.  This parser -   handles both syntaxes (BRE and ERE), including the TRE extensions. */ -static reg_errcode_t -tre_parse(tre_parse_ctx_t *ctx); - - -/* -  This parser is just a simple recursive descent parser for POSIX.2 -  regexps.  The parser supports both the obsolete default syntax and -  the "extended" syntax, and some nonstandard extensions. -*/ - -/* Characters with special meanings in regexp syntax. */ -#define CHAR_PIPE	   '|' -#define CHAR_LPAREN	   '(' -#define CHAR_RPAREN	   ')' -#define CHAR_LBRACE	   '{' -#define CHAR_RBRACE	   '}' -#define CHAR_LBRACKET	   '[' -#define CHAR_RBRACKET	   ']' -#define CHAR_MINUS	   '-' -#define CHAR_STAR	   '*' -#define CHAR_QUESTIONMARK  '?' -#define CHAR_PLUS	   '+' -#define CHAR_PERIOD	   '.' -#define CHAR_COLON	   ':' -#define CHAR_EQUAL	   '=' -#define CHAR_COMMA	   ',' -#define CHAR_CARET	   '^' -#define CHAR_DOLLAR	   '$' -#define CHAR_BACKSLASH	   '\\' -#define CHAR_HASH	   '#' -#define CHAR_TILDE	   '~' - -  /* Some macros for expanding \w, \s, etc. */ -static const struct tre_macro_struct { -  const char c; -  const char *expansion; -} tre_macros[] = -  { {'t', "\t"},	   {'n', "\n"},		   {'r', "\r"}, -    {'f', "\f"},	   {'a', "\a"},		   {'e', "\033"}, -    {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"}, -    {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"},   {'D', "[^[:digit:]]"}, -    { 0, NULL } -  }; - +static const struct { +	char c; +	const char *expansion; +} tre_macros[] = { +	{'t', "\t"}, {'n', "\n"}, {'r', "\r"}, +	{'f', "\f"}, {'a', "\a"}, {'e', "\033"}, +	{'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"}, +	{'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"}, +	{ 0, 0 } +};  /* Expands a macro delimited by `regex' and `regex_end' to `buf', which     must have at least `len' items.  Sets buf[0] to zero if the there     is no match in `tre_macros'. */ -static const char * -tre_expand_macro(const char *regex) +static const char *tre_expand_macro(const char *s)  { -  int i; - -  if (!*regex) -    return 0; - -  for (i = 0; tre_macros[i].expansion && tre_macros[i].c != *regex; i++); -  return tre_macros[i].expansion; +	int i; +	for (i = 0; tre_macros[i].c && tre_macros[i].c != *s; i++); +	return tre_macros[i].expansion;  } -static reg_errcode_t -tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i, -	 tre_ast_node_t ***items) +static int +tre_compare_lit(const void *a, const void *b)  { -  reg_errcode_t status; -  tre_ast_node_t **array = *items; -  /* Allocate more space if necessary. */ -  if (*i >= *max_i) -    { -      tre_ast_node_t **new_items; -      /* If the array is already 1024 items large, give up -- there's -	 probably an error in the regexp (e.g. not a '\0' terminated -	 string and missing ']') */ -      if (*max_i > 1024) -	return REG_ESPACE; -      *max_i *= 2; -      new_items = xrealloc(array, sizeof(*array) * *max_i); -      if (new_items == NULL) -	return REG_ESPACE; -      *items = array = new_items; -    } -  array[*i] = tre_ast_new_literal(mem, min, max, -1); -  status = array[*i] == NULL ? REG_ESPACE : REG_OK; -  (*i)++; -  return status; +	const tre_literal_t *const *la = a; +	const tre_literal_t *const *lb = b; +	/* assumes the range of valid code_min is < INT_MAX */ +	return la[0]->code_min - lb[0]->code_min;  } +struct literals { +	tre_mem_t mem; +	tre_literal_t **a; +	int len; +	int cap; +}; -static int -tre_compare_items(const void *a, const void *b) +static tre_literal_t *tre_new_lit(struct literals *p)  { -  const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a; -  const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b; -  tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj; -  int a_min = l_a->code_min, b_min = l_b->code_min; - -  if (a_min < b_min) -    return -1; -  else if (a_min > b_min) -    return 1; -  else -    return 0; +	tre_literal_t **a; +	if (p->len >= p->cap) { +		if (p->cap >= 1<<15) +			return 0; +		p->cap *= 2; +		a = xrealloc(p->a, p->cap * sizeof *p->a); +		if (!a) +			return 0; +		p->a = a; +	} +	a = p->a + p->len++; +	*a = tre_mem_calloc(p->mem, sizeof **a); +	return *a;  } -/* Maximum number of character classes that can occur in a negated bracket -   expression.	*/ -#define MAX_NEG_CLASSES 64 - -/* Maximum length of character class names. */ -#define MAX_CLASS_NAME - -static reg_errcode_t -tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate, -			tre_ctype_t neg_classes[], int *num_neg_classes, -			tre_ast_node_t ***items, int *num_items, -			int *items_size) +static int add_icase_literals(struct literals *ls, int min, int max)  { -  const char *re = ctx->re; -  reg_errcode_t status = REG_OK; -  tre_ctype_t class = (tre_ctype_t)0; -  int i = *num_items; -  int max_i = *items_size; -  int skip; - -  /* Build an array of the items in the bracket expression. */ -  while (status == REG_OK) -    { -      skip = 0; -      if (!*re) -	{ -	  status = REG_EBRACK; -	} -      else if (*re == CHAR_RBRACKET && re > ctx->re) -	{ -	  re++; -	  break; +	tre_literal_t *lit; +	int b, e, c; +	for (c=min; c<=max; ) { +		/* assumes islower(c) and isupper(c) are exclusive +		   and toupper(c)!=c if islower(c). +		   multiple opposite case characters are not supported */ +		if (tre_islower(c)) { +			b = e = tre_toupper(c); +			for (c++, e++; c<=max; c++, e++) +				if (tre_toupper(c) != e) break; +		} else if (tre_isupper(c)) { +			b = e = tre_tolower(c); +			for (c++, e++; c<=max; c++, e++) +				if (tre_tolower(c) != e) break; +		} else { +			c++; +			continue; +		} +		lit = tre_new_lit(ls); +		if (!lit) +			return -1; +		lit->code_min = b; +		lit->code_max = e-1; +		lit->position = -1;  	} -      else -	{ -	  tre_cint_t min = 0, max = 0; -	  wchar_t wc; -	  int clen = mbtowc(&wc, re, -1); +	return 0; +} -	  if (clen<0) clen=1, wc=WEOF; -	  class = (tre_ctype_t)0; -	  if (*(re + clen) == CHAR_MINUS && *(re + clen + 1) != CHAR_RBRACKET) -	    { -	      min = wc; -	      re += clen+1; -	      clen = mbtowc(&wc, re, -1); -	      if (clen<0) clen=1, wc=WEOF; -	      max = wc; -	      re += clen; -	      /* XXX - Should use collation order instead of encoding values -		 in character ranges. */ -	      if (min > max) -		status = REG_ERANGE; -	    } -	  else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD) -	    status = REG_ECOLLATE; -	  else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL) -	    status = REG_ECOLLATE; -	  else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON) -	    { -	      char tmp_str[64]; -	      const char *endptr = re + 2; -	      int len; -	      while (*endptr && *endptr != CHAR_COLON) -		endptr++; -	      if (*endptr) -		{ -		  len = MIN(endptr - re - 2, 63); -		  strncpy(tmp_str, re + 2, len); -		  tmp_str[len] = '\0'; -		  class = tre_ctype(tmp_str); -		  if (!class) -		    status = REG_ECTYPE; -		  re = endptr + 2; -		} -	      else -		status = REG_ECTYPE; -	      min = 0; -	      max = TRE_CHAR_MAX; -	    } -	  else -	    { -	      if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET -		  && ctx->re != re) -		/* Two ranges are not allowed to share and endpoint. */ -		status = REG_ERANGE; -	      min = max = wc; -	      re += clen; -	    } +/* Maximum number of character classes in a negated bracket expression. */ +#define MAX_NEG_CLASSES 64 -	  if (status != REG_OK) -	    break; +struct neg { +	int negate; +	int len; +	tre_ctype_t a[MAX_NEG_CLASSES]; +}; -	  if (class && negate) -	    if (*num_neg_classes >= MAX_NEG_CLASSES) -	      status = REG_ESPACE; -	    else -	      neg_classes[(*num_neg_classes)++] = class; -	  else if (!skip) -	    { -	      status = tre_new_item(ctx->mem, min, max, &i, &max_i, items); -	      if (status != REG_OK) -		break; -	      ((tre_literal_t*)((*items)[i-1])->obj)->class = class; -	    } +// TODO: parse bracket into a set of non-overlapping [lo,hi] ranges -	  /* Add opposite-case counterpoints if REG_ICASE is present. -	     This is broken if there are more than two "same" characters. */ -	  if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip) -	    { -	      tre_cint_t cmin, ccurr; +/* +bracket grammar: +Bracket  =  '[' List ']'  |  '[^' List ']' +List     =  Term  |  List Term +Term     =  Char  |  Range  |  Chclass  |  Eqclass +Range    =  Char '-' Char  |  Char '-' '-' +Char     =  Coll  |  coll_single +Meta     =  ']'  |  '-' +Coll     =  '[.' coll_single '.]'  |  '[.' coll_multi '.]'  |  '[.' Meta '.]' +Eqclass  =  '[=' coll_single '=]'  |  '[=' coll_multi '=]' +Chclass  =  '[:' class ':]' + +coll_single is a single char collating element but it can be + '-' only at the beginning or end of a List and + ']' only at the beginning of a List and + '^' anywhere except after the openning '[' +*/ -	      while (min <= max) -		{ -		  if (tre_islower(min)) -		    { -		      cmin = ccurr = tre_toupper(min++); -		      while (tre_islower(min) && tre_toupper(min) == ccurr + 1 -			     && min <= max) -			ccurr = tre_toupper(min++); -		      status = tre_new_item(ctx->mem, cmin, ccurr, -					    &i, &max_i, items); -		    } -		  else if (tre_isupper(min)) -		    { -		      cmin = ccurr = tre_tolower(min++); -		      while (tre_isupper(min) && tre_tolower(min) == ccurr + 1 -			     && min <= max) -			ccurr = tre_tolower(min++); -		      status = tre_new_item(ctx->mem, cmin, ccurr, -					    &i, &max_i, items); -		    } -		  else min++; -		  if (status != REG_OK) -		    break; +static reg_errcode_t parse_bracket_terms(tre_parse_ctx_t *ctx, const char *s, struct literals *ls, struct neg *neg) +{ +	const char *start = s; +	tre_ctype_t class; +	int min, max; +	wchar_t wc; +	int len; + +	for (;;) { +		class = 0; +		len = mbtowc(&wc, s, -1); +		if (len <= 0) +			return *s ? REG_BADPAT : REG_EBRACK; +		if (*s == ']' && s != start) { +			ctx->s = s+1; +			return REG_OK; +		} +		if (*s == '-' && s != start && s[1] != ']' && +		    /* extension: [a-z--@] is accepted as [a-z]|[--@] */ +		    (s[1] != '-' || s[2] == ']')) +			return REG_ERANGE; +		if (*s == '[' && (s[1] == '.' || s[1] == '=')) +			/* collating symbols and equivalence classes are not supported */ +			return REG_ECOLLATE; +		if (*s == '[' && s[1] == ':') { +			char tmp[CHARCLASS_NAME_MAX+1]; +			s += 2; +			for (len=0; len < CHARCLASS_NAME_MAX && s[len]; len++) { +				if (s[len] == ':') { +					memcpy(tmp, s, len); +					tmp[len] = 0; +					class = tre_ctype(tmp); +					break; +				} +			} +			if (!class || s[len+1] != ']') +				return REG_ECTYPE; +			min = 0; +			max = TRE_CHAR_MAX; +			s += len+2; +		} else { +			min = max = wc; +			s += len; +			if (*s == '-' && s[1] != ']') { +				s++; +				len = mbtowc(&wc, s, -1); +				max = wc; +				/* XXX - Should use collation order instead of +				   encoding values in character ranges. */ +				if (len <= 0 || min > max) +					return REG_ERANGE; +				s += len; +			} +		} + +		if (class && neg->negate) { +			if (neg->len >= MAX_NEG_CLASSES) +				return REG_ESPACE; +			neg->a[neg->len++] = class; +		} else  { +			tre_literal_t *lit = tre_new_lit(ls); +			if (!lit) +				return REG_ESPACE; +			lit->code_min = min; +			lit->code_max = max; +			lit->class = class; +			lit->position = -1; + +			/* Add opposite-case codepoints if REG_ICASE is present. +			   It seems that POSIX requires that bracket negation +			   should happen before case-folding, but most practical +			   implementations do it the other way around. Changing +			   the order would need efficient representation of +			   case-fold ranges and bracket range sets even with +			   simple patterns so this is ok for now. */ +			if (ctx->cflags & REG_ICASE && !class) +				if (add_icase_literals(ls, min, max)) +					return REG_ESPACE;  		} -	      if (status != REG_OK) -		break; -	    }  	} -    } -  *num_items = i; -  *items_size = max_i; -  ctx->re = re; -  return status;  } -static reg_errcode_t -tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result) +static reg_errcode_t parse_bracket(tre_parse_ctx_t *ctx, const char *s)  { -  tre_ast_node_t *node = NULL; -  int negate = 0; -  reg_errcode_t status = REG_OK; -  tre_ast_node_t **items, *u, *n; -  int i = 0, j, max_i = 32, curr_max, curr_min; -  tre_ctype_t neg_classes[MAX_NEG_CLASSES]; -  int num_neg_classes = 0; - -  /* Start off with an array of `max_i' elements. */ -  items = xmalloc(sizeof(*items) * max_i); -  if (items == NULL) -    return REG_ESPACE; - -  if (*ctx->re == CHAR_CARET) -    { -      negate = 1; -      ctx->re++; -    } - -  status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes, -				   &items, &i, &max_i); - -  if (status != REG_OK) -    goto parse_bracket_done; - -  /* Sort the array if we need to negate it. */ -  if (negate) -    qsort(items, (unsigned)i, sizeof(*items), tre_compare_items); - -  curr_max = curr_min = 0; -  /* Build a union of the items in the array, negated if necessary. */ -  for (j = 0; j < i && status == REG_OK; j++) -    { -      int min, max; -      tre_literal_t *l = items[j]->obj; -      min = l->code_min; -      max = l->code_max; - -      if (negate) -	{ -	  if (min < curr_max) -	    { -	      /* Overlap. */ -	      curr_max = MAX(max + 1, curr_max); -	      l = NULL; -	    } -	  else -	    { -	      /* No overlap. */ -	      curr_max = min - 1; -	      if (curr_max >= curr_min) -		{ -		  l->code_min = curr_min; -		  l->code_max = curr_max; +	int i, max, min, negmax, negmin; +	tre_ast_node_t *node = 0, *n; +	tre_ctype_t *nc = 0; +	tre_literal_t *lit; +	struct literals ls; +	struct neg neg; +	reg_errcode_t err; + +	ls.mem = ctx->mem; +	ls.len = 0; +	ls.cap = 32; +	ls.a = xmalloc(ls.cap * sizeof *ls.a); +	if (!ls.a) +		return REG_ESPACE; +	neg.len = 0; +	neg.negate = *s == '^'; +	if (neg.negate) +		s++; + +	err = parse_bracket_terms(ctx, s, &ls, &neg); +	if (err != REG_OK) +		goto parse_bracket_done; + +	if (neg.negate) { +		/* Sort the array if we need to negate it. */ +		qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit); +		/* extra lit for the last negated range */ +		lit = tre_new_lit(&ls); +		if (!lit) { +			err = REG_ESPACE; +			goto parse_bracket_done;  		} -	      else -		{ -		  l = NULL; +		lit->code_min = TRE_CHAR_MAX+1; +		lit->code_max = TRE_CHAR_MAX+1; +		lit->position = -1; +		/* negated classes */ +		if (neg.len) { +			nc = tre_mem_alloc(ctx->mem, (neg.len+1)*sizeof *neg.a); +			if (!nc) { +				err = REG_ESPACE; +				goto parse_bracket_done; +			} +			memcpy(nc, neg.a, neg.len*sizeof *neg.a); +			nc[neg.len] = 0;  		} -	      curr_min = curr_max = max + 1; -	    }  	} -      if (l != NULL) -	{ -	  int k; -	  l->position = ctx->position; -	  if (num_neg_classes > 0) -	    { -	      l->neg_classes = tre_mem_alloc(ctx->mem, -					     (sizeof(*l->neg_classes) -					      * (num_neg_classes + 1))); -	      if (l->neg_classes == NULL) -		{ -		  status = REG_ESPACE; -		  break; +	/* Build a union of the items in the array, negated if necessary. */ +	negmax = negmin = 0; +	for (i = 0; i < ls.len; i++) { +		lit = ls.a[i]; +		min = lit->code_min; +		max = lit->code_max; +		if (neg.negate) { +			if (min <= negmin) { +				/* Overlap. */ +				negmin = MAX(max + 1, negmin); +				continue; +			} +			negmax = min - 1; +			lit->code_min = negmin; +			lit->code_max = negmax; +			negmin = max + 1;  		} -	      for (k = 0; k < num_neg_classes; k++) -		l->neg_classes[k] = neg_classes[k]; -	      l->neg_classes[k] = (tre_ctype_t)0; -	    } -	  else -	    l->neg_classes = NULL; -	  if (node == NULL) -	    node = items[j]; -	  else -	    { -	      u = tre_ast_new_union(ctx->mem, node, items[j]); -	      if (u == NULL) -		status = REG_ESPACE; -	      node = u; -	    } -	} -    } - -  if (status != REG_OK) -    goto parse_bracket_done; - -  if (negate) -    { -      int k; -      n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position); -      if (n == NULL) -	status = REG_ESPACE; -      else -	{ -	  tre_literal_t *l = n->obj; -	  if (num_neg_classes > 0) -	    { -	      l->neg_classes = tre_mem_alloc(ctx->mem, -					     (sizeof(*l->neg_classes) -					      * (num_neg_classes + 1))); -	      if (l->neg_classes == NULL) -		{ -		  status = REG_ESPACE; -		  goto parse_bracket_done; +		lit->position = ctx->position; +		lit->neg_classes = nc; +		n = tre_ast_new_node(ctx->mem, LITERAL, lit); +		node = tre_ast_new_union(ctx->mem, node, n); +		if (!node) { +			err = REG_ESPACE; +			break;  		} -	      for (k = 0; k < num_neg_classes; k++) -		l->neg_classes[k] = neg_classes[k]; -	      l->neg_classes[k] = (tre_ctype_t)0; -	    } -	  else -	    l->neg_classes = NULL; -	  if (node == NULL) -	    node = n; -	  else -	    { -	      u = tre_ast_new_union(ctx->mem, node, n); -	      if (u == NULL) -		status = REG_ESPACE; -	      node = u; -	    }  	} -    } - -  if (status != REG_OK) -    goto parse_bracket_done; -#ifdef TRE_DEBUG -  tre_ast_print(node); -#endif /* TRE_DEBUG */ - - parse_bracket_done: -  xfree(items); -  ctx->position++; -  *result = node; -  return status; +parse_bracket_done: +	xfree(ls.a); +	ctx->position++; +	ctx->n = node; +	return err;  } - -/* Parses a positive decimal integer.  Returns -1 if the string does not -   contain a valid number. */ -static int -tre_parse_int(const char **regex) +static const char *parse_dup_count(const char *s, int *n)  { -  int num = -1; -  const char *r = *regex; -  while (*r-'0'<10U) -    { -      if (num < 0) -	num = 0; -      num = num * 10 + *r - '0'; -      r++; -    } -  *regex = r; -  return num; +	*n = -1; +	if (!isdigit(*s)) +		return s; +	*n = 0; +	for (;;) { +		*n = 10 * *n + (*s - '0'); +		s++; +		if (!isdigit(*s) || *n > RE_DUP_MAX) +			break; +	} +	return s;  } - -static reg_errcode_t -tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result) +static reg_errcode_t parse_dup(tre_parse_ctx_t *ctx, const char *s)  { -  int min, max; -  const char *r = ctx->re; -  int minimal = 0; - -  /* Parse number (minimum repetition count). */ -  min = -1; -  if (*r >= '0' && *r <= '9') { -    min = tre_parse_int(&r); -  } - -  /* Parse comma and second number (maximum repetition count). */ -  max = min; -  if (*r == CHAR_COMMA) -    { -      r++; -      max = tre_parse_int(&r); -    } - -  /* Check that the repeat counts are sane. */ -  if ((max >= 0 && min > max) || max > RE_DUP_MAX) -    return REG_BADBR; - -  /* Missing }. */ -  if (!*r) -    return REG_EBRACE; - -  /* Empty contents of {}. */ -  if (r == ctx->re) -    return REG_BADBR; - -  /* Parse the ending '}' or '\}'.*/ -  if (ctx->cflags & REG_EXTENDED) -    { -      if (*r != CHAR_RBRACE) -	return REG_BADBR; -      r++; -    } -  else -    { -      if (*r != CHAR_BACKSLASH || *(r + 1) != CHAR_RBRACE) -	return REG_BADBR; -      r += 2; -    } - -  /* Create the AST node(s). */ -  if (min == 0 && max == 0) -    { -      *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); -      if (*result == NULL) -	return REG_ESPACE; -    } -  else -    { -      if (min < 0 && max < 0) -	/* Only approximate parameters set, no repetitions. */ -	min = max = 1; - -      *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal); -      if (!*result) -	return REG_ESPACE; -    } - -  ctx->re = r; -  return REG_OK; +	int min, max; + +	s = parse_dup_count(s, &min); +	if (*s == ',') +		s = parse_dup_count(s+1, &max); +	else +		max = min; + +	if ( +		(max < min && max >= 0) || +		max > RE_DUP_MAX || +		min > RE_DUP_MAX || +		min < 0 || +		(!(ctx->cflags & REG_EXTENDED) && *s++ != '\\') || +		*s++ != '}' +	) +		return REG_BADBR; + +	if (min == 0 && max == 0) +		ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); +	else +		ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0); +	if (!ctx->n) +		return REG_ESPACE; +	ctx->s = s; +	return REG_OK;  } -typedef enum { -  PARSE_RE = 0, -  PARSE_ATOM, -  PARSE_MARK_FOR_SUBMATCH, -  PARSE_BRANCH, -  PARSE_PIECE, -  PARSE_CATENATION, -  PARSE_POST_CATENATION, -  PARSE_UNION, -  PARSE_POST_UNION, -  PARSE_POSTFIX, -  PARSE_RESTORE_CFLAGS -} tre_parse_re_stack_symbol_t; - - -static reg_errcode_t -tre_parse(tre_parse_ctx_t *ctx) +static int hexval(unsigned c)  { -  tre_ast_node_t *result = NULL; -  tre_parse_re_stack_symbol_t symbol; -  reg_errcode_t status = REG_OK; -  tre_stack_t *stack = ctx->stack; -  int bottom = tre_stack_num_objects(stack); -  int depth = 0; -  wchar_t wc; -  int clen; - -  if (!ctx->nofirstsub) -    { -      STACK_PUSH(stack, int, ctx->submatch_id); -      STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH); -      ctx->submatch_id++; -    } -  STACK_PUSH(stack, int, PARSE_RE); -  ctx->re_start = ctx->re; - - -  /* The following is basically just a recursive descent parser.  I use -     an explicit stack instead of recursive functions mostly because of -     two reasons: compatibility with systems which have an overflowable -     call stack, and efficiency (both in lines of code and speed).  */ -  while (tre_stack_num_objects(stack) > bottom && status == REG_OK) -    { -      if (status != REG_OK) -	break; -      symbol = tre_stack_pop_int(stack); -      switch (symbol) -	{ -	case PARSE_RE: -	  /* Parse a full regexp.  A regexp is one or more branches, -	     separated by the union operator `|'. */ -	  if (ctx->cflags & REG_EXTENDED) -	    STACK_PUSHX(stack, int, PARSE_UNION); -	  STACK_PUSHX(stack, int, PARSE_BRANCH); -	  break; - -	case PARSE_BRANCH: -	  /* Parse a branch.  A branch is one or more pieces, concatenated. -	     A piece is an atom possibly followed by a postfix operator. */ -	  STACK_PUSHX(stack, int, PARSE_CATENATION); -	  STACK_PUSHX(stack, int, PARSE_PIECE); -	  break; - -	case PARSE_PIECE: -	  /* Parse a piece.  A piece is an atom possibly followed by one -	     or more postfix operators. */ -	    STACK_PUSHX(stack, int, PARSE_POSTFIX); -	  STACK_PUSHX(stack, int, PARSE_ATOM); -	  break; - -	case PARSE_CATENATION: -	  /* If the expression has not ended, parse another piece. */ -	  { -	    tre_char_t c; -	    if (!*ctx->re) -	      break; -	    c = *ctx->re; -		if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE) -		  break; -		if ((ctx->cflags & REG_EXTENDED -		     && c == CHAR_RPAREN && depth > 0) -		    || (!(ctx->cflags & REG_EXTENDED) -			&& (c == CHAR_BACKSLASH -			    && *(ctx->re + 1) == CHAR_RPAREN))) -		  { -		    if (!(ctx->cflags & REG_EXTENDED) && depth == 0) -		      status = REG_EPAREN; -		    depth--; -		    if (!(ctx->cflags & REG_EXTENDED)) -		      ctx->re += 2; -		    break; -		  } - -	      { -		/* Default case, left associative concatenation. */ -		STACK_PUSHX(stack, int, PARSE_CATENATION); -		STACK_PUSHX(stack, voidptr, result); -		STACK_PUSHX(stack, int, PARSE_POST_CATENATION); -		STACK_PUSHX(stack, int, PARSE_PIECE); -	      } -	    break; -	  } - -	case PARSE_POST_CATENATION: -	  { -	    tre_ast_node_t *tree = tre_stack_pop_voidptr(stack); -	    tre_ast_node_t *tmp_node; -	    tmp_node = tre_ast_new_catenation(ctx->mem, tree, result); -	    if (!tmp_node) -	      return REG_ESPACE; -	    result = tmp_node; -	    break; -	  } - -	case PARSE_UNION: -	  switch (*ctx->re) -	    { -	    case CHAR_PIPE: -	      STACK_PUSHX(stack, int, PARSE_UNION); -	      STACK_PUSHX(stack, voidptr, result); -	      STACK_PUSHX(stack, int, PARSE_POST_UNION); -	      STACK_PUSHX(stack, int, PARSE_BRANCH); -	      ctx->re++; -	      break; - -	    case CHAR_RPAREN: -	      ctx->re++; -	      break; - -	    default: -	      break; -	    } -	  break; - -	case PARSE_POST_UNION: -	  { -	    tre_ast_node_t *tmp_node; -	    tre_ast_node_t *tree = tre_stack_pop_voidptr(stack); -	    tmp_node = tre_ast_new_union(ctx->mem, tree, result); -	    if (!tmp_node) -	      return REG_ESPACE; -	    result = tmp_node; -	    break; -	  } - -	case PARSE_POSTFIX: -	  /* Parse postfix operators. */ -	  switch (*ctx->re) -	    { -	    case CHAR_PLUS: -	    case CHAR_QUESTIONMARK: -	      if (!(ctx->cflags & REG_EXTENDED)) -		break; -		/*FALLTHROUGH*/ -	    case CHAR_STAR: -	      { -		tre_ast_node_t *tmp_node; -		int minimal = 0; -		int rep_min = 0; -		int rep_max = -1; - -		if (*ctx->re == CHAR_PLUS) -		  rep_min = 1; -		if (*ctx->re == CHAR_QUESTIONMARK) -		  rep_max = 1; - -		ctx->re++; -		tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max, -					    minimal); -		if (tmp_node == NULL) -		  return REG_ESPACE; -		result = tmp_node; -		STACK_PUSHX(stack, int, PARSE_POSTFIX); -	      } -	      break; - -	    case CHAR_BACKSLASH: -	      /* "\{" is special without REG_EXTENDED */ -	      if (!(ctx->cflags & REG_EXTENDED) -		  && *(ctx->re + 1) == CHAR_LBRACE) -		{ -		  ctx->re++; -		  goto parse_brace; -		} -	      else -		break; - -	    case CHAR_LBRACE: -	      /* "{" is literal without REG_EXTENDED */ -	      if (!(ctx->cflags & REG_EXTENDED)) -		break; - -	    parse_brace: -	      ctx->re++; +	if (c-'0'<10) return c-'0'; +	c |= 32; +	if (c-'a'<6) return c-'a'+10; +	return -1; +} -	      status = tre_parse_bound(ctx, &result); -	      if (status != REG_OK) -		return status; -	      STACK_PUSHX(stack, int, PARSE_POSTFIX); -	      break; -	    } -	  break; +static reg_errcode_t marksub(tre_parse_ctx_t *ctx, tre_ast_node_t *node, int subid) +{ +	if (node->submatch_id >= 0) { +		tre_ast_node_t *n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); +		if (!n) +			return REG_ESPACE; +		n = tre_ast_new_catenation(ctx->mem, n, node); +		if (!n) +			return REG_ESPACE; +		n->num_submatches = node->num_submatches; +		node = n; +	} +	node->submatch_id = subid; +	node->num_submatches++; +	ctx->n = node; +	return REG_OK; +} -	case PARSE_ATOM: -	  /* Parse an atom.  An atom is a regular expression enclosed in `()', -	     an empty set of `()', a bracket expression, `.', `^', `$', -	     a `\' followed by a character, or a single character. */ +/* +BRE grammar: +Regex  =  Branch  |  '^'  |  '$'  |  '^$'  |  '^' Branch  |  Branch '$'  |  '^' Branch '$' +Branch =  Atom  |  Branch Atom +Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '\(' Branch '\)'  |  back_ref +Dup    =  '*'  |  '\{' Count '\}'  |  '\{' Count ',\}'  |  '\{' Count ',' Count '\}' -	  switch (*ctx->re) -	    { -	    case CHAR_LPAREN:  /* parenthesized subexpression */ +(leading ^ and trailing $ in a sub expr may be an anchor or literal as well) -	      if (ctx->cflags & REG_EXTENDED) -		{ -		lparen: -		  depth++; -		    { -		      ctx->re++; -		      /* First parse a whole RE, then mark the resulting tree -			 for submatching. */ -		      STACK_PUSHX(stack, int, ctx->submatch_id); -		      STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH); -		      STACK_PUSHX(stack, int, PARSE_RE); -		      ctx->submatch_id++; -		    } -		} -	      else -		goto parse_literal; -	      break; +ERE grammar: +Regex  =  Branch  |  Regex '|' Branch +Branch =  Atom  |  Branch Atom +Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '(' Regex ')'  |  '^'  |  '$' +Dup    =  '*'  |  '+'  |  '?'  |  '{' Count '}'  |  '{' Count ',}'  |  '{' Count ',' Count '}' -	    case CHAR_LBRACKET: /* bracket expression */ -	      ctx->re++; -	      status = tre_parse_bracket(ctx, &result); -	      if (status != REG_OK) -		return status; -	      break; +(a*+?, ^*, $+, \X, {, (|a) are unspecified) +*/ -	    case CHAR_BACKSLASH: -	      /* If this is "\(" or "\)" chew off the backslash and -		 try again. */ -	      if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN) -		{ -		  ctx->re++; -		  goto lparen; -		} -	      if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_RPAREN) -		{ -		  goto empty_atom; +static reg_errcode_t parse_atom(tre_parse_ctx_t *ctx, const char *s) +{ +	int len, ere = ctx->cflags & REG_EXTENDED; +	const char *p; +	tre_ast_node_t *node; +	wchar_t wc; +	switch (*s) { +	case '[': +		return parse_bracket(ctx, s+1); +	case '\\': +		p = tre_expand_macro(s+1); +		if (p) { +			/* assume \X expansion is a single atom */ +			reg_errcode_t err = parse_atom(ctx, p); +			ctx->s = s+2; +			return err;  		} - -	      /* If a macro is used, parse the expanded macro recursively. */ -	      { -		const char *buf = tre_expand_macro(ctx->re + 1); -		if (buf) -		  { -		    tre_parse_ctx_t subctx; -		    memcpy(&subctx, ctx, sizeof(subctx)); -		    subctx.re = buf; -		    subctx.nofirstsub = 1; -		    status = tre_parse(&subctx); -		    if (status != REG_OK) -		      return status; -		    ctx->re += 2; -		    ctx->position = subctx.position; -		    result = subctx.result; -		    break; -		  } -	      } - -	      if (!ctx->re[1]) -		/* Trailing backslash. */ -		return REG_EESCAPE; - -	      ctx->re++; -	      switch (*ctx->re) -		{ +		/* extensions: \b, \B, \<, \>, \xHH \x{HHHH} */ +		switch (*++s) { +		case 0: +			return REG_EESCAPE;  		case 'b': -		  result = tre_ast_new_literal(ctx->mem, ASSERTION, -					       ASSERT_AT_WB, -1); -		  ctx->re++; -		  break; +			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB, -1); +			break;  		case 'B': -		  result = tre_ast_new_literal(ctx->mem, ASSERTION, -					       ASSERT_AT_WB_NEG, -1); -		  ctx->re++; -		  break; +			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB_NEG, -1); +			break;  		case '<': -		  result = tre_ast_new_literal(ctx->mem, ASSERTION, -					       ASSERT_AT_BOW, -1); -		  ctx->re++; -		  break; +			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOW, -1); +			break;  		case '>': -		  result = tre_ast_new_literal(ctx->mem, ASSERTION, -					       ASSERT_AT_EOW, -1); -		  ctx->re++; -		  break; +			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOW, -1); +			break;  		case 'x': -		  ctx->re++; -		  if (ctx->re[0] != CHAR_LBRACE) -		    { -		      /* 8 bit hex char. */ -		      char tmp[3] = {0, 0, 0}; -		      long val; - -		      if (tre_isxdigit(ctx->re[0])) -			{ -			  tmp[0] = (char)ctx->re[0]; -			  ctx->re++; +			s++; +			int i, v = 0, c; +			len = 2; +			if (*s == '{') { +				len = 8; +				s++;  			} -		      if (tre_isxdigit(ctx->re[0])) -			{ -			  tmp[1] = (char)ctx->re[0]; -			  ctx->re++; +			for (i=0; i<len && v<0x110000; i++) { +				c = hexval(s[i]); +				if (c < 0) break; +				v = 16*v + c;  			} -		      val = strtol(tmp, NULL, 16); -		      result = tre_ast_new_literal(ctx->mem, (int)val, -						   (int)val, ctx->position); -		      ctx->position++; -		      break; -		    } -		  else if (*ctx->re) -		    { -		      /* Wide char. */ -		      char tmp[32]; -		      long val; -		      int i = 0; -		      ctx->re++; -		      while (*ctx->re && i < sizeof tmp) -			{ -			  if (ctx->re[0] == CHAR_RBRACE) -			    break; -			  if (tre_isxdigit(ctx->re[0])) -			    { -			      tmp[i] = (char)ctx->re[0]; -			      i++; -			      ctx->re++; -			      continue; -			    } -			  return REG_EBRACE; +			s += i; +			if (len == 8) { +				if (*s != '}') +					return REG_EBRACE; +				s++;  			} -		      ctx->re++; -		      tmp[i] = 0; -		      val = strtol(tmp, NULL, 16); -		      result = tre_ast_new_literal(ctx->mem, (int)val, (int)val, -						   ctx->position); -		      ctx->position++; -		      break; -		    } -		  /*FALLTHROUGH*/ - +			node = tre_ast_new_literal(ctx->mem, v, v, ctx->position); +			ctx->position++; +			s--; +			break;  		default: -		  if (tre_isdigit(*ctx->re)) -		    { -		      /* Back reference. */ -		      int val = *ctx->re - '0'; -		      result = tre_ast_new_literal(ctx->mem, BACKREF, val, -						   ctx->position); -		      if (result == NULL) -			return REG_ESPACE; -		      ctx->position++; -		      ctx->max_backref = MAX(val, ctx->max_backref); -		      ctx->re++; -		    } -		  else -		    { -		      /* Escaped character. */ -		      result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, -						   ctx->position); -		      ctx->position++; -		      ctx->re++; -		    } -		  break; -		} -	      if (result == NULL) -		return REG_ESPACE; -	      break; - -	    case CHAR_PERIOD:	 /* the any-symbol */ -	      if (ctx->cflags & REG_NEWLINE) -		{ -		  tre_ast_node_t *tmp1; -		  tre_ast_node_t *tmp2; -		  tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n' - 1, -					     ctx->position); -		  if (!tmp1) -		    return REG_ESPACE; -		  tmp2 = tre_ast_new_literal(ctx->mem, '\n' + 1, TRE_CHAR_MAX, -					     ctx->position + 1); -		  if (!tmp2) -		    return REG_ESPACE; -		  result = tre_ast_new_union(ctx->mem, tmp1, tmp2); -		  if (!result) -		    return REG_ESPACE; -		  ctx->position += 2; -		} -	      else -		{ -		  result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, -					       ctx->position); -		  if (!result) -		    return REG_ESPACE; -		  ctx->position++; +			if (isdigit(*s)) { +				/* back reference */ +				int val = *s - '0'; +				node = tre_ast_new_literal(ctx->mem, BACKREF, val, ctx->position); +				ctx->max_backref = MAX(val, ctx->max_backref); +			} else { +				/* extension: accept unknown escaped char +				   as a literal */ +				node = tre_ast_new_literal(ctx->mem, *s, *s, ctx->position); +			} +			ctx->position++;  		} -	      ctx->re++; -	      break; - -	    case CHAR_CARET:	 /* beginning of line assertion */ -	      /* '^' has a special meaning everywhere in EREs, and at -		 beginning of BRE. */ -	      if (ctx->cflags & REG_EXTENDED -		  || ctx->re == ctx->re_start) -		{ -		  if (!(ctx->cflags & REG_EXTENDED)) -		    STACK_PUSHX(stack, int, PARSE_CATENATION); -		  result = tre_ast_new_literal(ctx->mem, ASSERTION, -					       ASSERT_AT_BOL, -1); -		  if (result == NULL) -		    return REG_ESPACE; -		  ctx->re++; +		s++; +		break; +	case '.': +		if (ctx->cflags & REG_NEWLINE) { +			tre_ast_node_t *tmp1, *tmp2; +			tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n'-1, ctx->position++); +			tmp2 = tre_ast_new_literal(ctx->mem, '\n'+1, TRE_CHAR_MAX, ctx->position++); +			if (tmp1 && tmp2) +				node = tre_ast_new_union(ctx->mem, tmp1, tmp2); +			else +				node = 0; +		} else { +			node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position++);  		} -	      else -		goto parse_literal; -	      break; - -	    case CHAR_DOLLAR:	 /* end of line assertion. */ -	      /* '$' is special everywhere in EREs, and in the end of the -		 string in BREs. */ -	      if (ctx->cflags & REG_EXTENDED -		  || !*(ctx->re + 1)) -		{ -		  result = tre_ast_new_literal(ctx->mem, ASSERTION, -					       ASSERT_AT_EOL, -1); -		  if (result == NULL) -		    return REG_ESPACE; -		  ctx->re++; +		s++; +		break; +	case '^': +		/* '^' has a special meaning everywhere in EREs, and at beginning of BRE. */ +		if (!ere && s != ctx->re) +			goto parse_literal; +		node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1); +		s++; +		break; +	case '$': +		/* '$' is special everywhere in EREs, and in the end of the string in BREs. */ +		if (!ere && s[1]) +			goto parse_literal; +		node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1); +		s++; +		break; +	case '*': +	case '|': +	case '{': +	case '+': +	case '?': +		if (!ere) +			goto parse_literal; +	case 0: +		node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); +		break; +	default: +parse_literal: +		len = mbtowc(&wc, s, -1); +		if (len < 0) +			return REG_BADPAT; +		if (ctx->cflags & REG_ICASE && (tre_isupper(wc) || tre_islower(wc))) { +			tre_ast_node_t *tmp1, *tmp2; +			/* multiple opposite case characters are not supported */ +			tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), tre_toupper(wc), ctx->position); +			tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), tre_tolower(wc), ctx->position); +			if (tmp1 && tmp2) +				node = tre_ast_new_union(ctx->mem, tmp1, tmp2); +			else +				node = 0; +		} else { +			node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->position);  		} -	      else -		goto parse_literal; -	      break; - -	    case CHAR_RPAREN: -	      if (!depth) -	        goto parse_literal; -	    case CHAR_STAR: -	    case CHAR_PIPE: -	    case CHAR_LBRACE: -	    case CHAR_PLUS: -	    case CHAR_QUESTIONMARK: -	      if (!(ctx->cflags & REG_EXTENDED)) -	        goto parse_literal; - -	    case 0: -	    empty_atom: -	      result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); -	      if (!result) +		ctx->position++; +		s += len; +		break; +	} +	if (!node)  		return REG_ESPACE; -	      break; +	ctx->n = node; +	ctx->s = s; +	return REG_OK; +} -	    default: -	    parse_literal: +#define PUSHPTR(err, s, v) do { \ +	if ((err = tre_stack_push_voidptr(s, v)) != REG_OK) \ +		return err; \ +} while(0) -	      clen = mbtowc(&wc, ctx->re, -1); -	      if (clen<0) clen=1, wc=WEOF; +#define PUSHINT(err, s, v) do { \ +	if ((err = tre_stack_push_int(s, v)) != REG_OK) \ +		return err; \ +} while(0) -	      /* Note that we can't use an tre_isalpha() test here, since there -		 may be characters which are alphabetic but neither upper or -		 lower case. */ -	      if (ctx->cflags & REG_ICASE -		  && (tre_isupper(wc) || tre_islower(wc))) -		{ -		  tre_ast_node_t *tmp1; -		  tre_ast_node_t *tmp2; - -		  /* XXX - Can there be more than one opposite-case -		     counterpoints for some character in some locale?  Or -		     more than two characters which all should be regarded -		     the same character if case is ignored?  If yes, there -		     does not seem to be a portable way to detect it.  I guess -		     that at least for multi-character collating elements there -		     could be several opposite-case counterpoints, but they -		     cannot be supported portably anyway. */ -		  tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), -					     tre_toupper(wc), -					     ctx->position); -		  if (!tmp1) -		    return REG_ESPACE; -		  tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), -					     tre_tolower(wc), -					     ctx->position); -		  if (!tmp2) -		    return REG_ESPACE; -		  result = tre_ast_new_union(ctx->mem, tmp1, tmp2); -		  if (!result) -		    return REG_ESPACE; +static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx) +{ +	tre_ast_node_t *nbranch=0, *nunion=0; +	int ere = ctx->cflags & REG_EXTENDED; +	const char *s = ctx->re; +	int subid = 0; +	int depth = 0; +	reg_errcode_t err; +	tre_stack_t *stack = ctx->stack; + +	PUSHINT(err, stack, subid++); +	for (;;) { +		if ((!ere && *s == '\\' && s[1] == '(') || +		    (ere && *s == '(')) { +			PUSHPTR(err, stack, nunion); +			PUSHPTR(err, stack, nbranch); +			PUSHINT(err, stack, subid++); +			s++; +			if (!ere) +				s++; +			depth++; +			nbranch = nunion = 0; +			continue;  		} -	      else -		{ -		  result = tre_ast_new_literal(ctx->mem, wc, wc, -					       ctx->position); -		  if (!result) -		    return REG_ESPACE; +		if ((!ere && *s == '\\' && s[1] == ')') || +		    (ere && *s == ')' && depth)) { +			ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); +			if (!ctx->n) +				return REG_ESPACE; +		} else { +			err = parse_atom(ctx, s); +			if (err != REG_OK) +				return err; +			s = ctx->s;  		} -	      ctx->position++; -	      ctx->re += clen; -	      break; -	    } -	  break; - -	case PARSE_MARK_FOR_SUBMATCH: -	  { -	    int submatch_id = tre_stack_pop_int(stack); -	    if (result->submatch_id >= 0) -	      { -		tre_ast_node_t *n, *tmp_node; -		n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); -		if (n == NULL) -		  return REG_ESPACE; -		tmp_node = tre_ast_new_catenation(ctx->mem, n, result); -		if (tmp_node == NULL) -		  return REG_ESPACE; -		tmp_node->num_submatches = result->num_submatches; -		result = tmp_node; -	      } -	    result->submatch_id = submatch_id; -	    result->num_submatches++; -	    break; -	  } - -	case PARSE_RESTORE_CFLAGS: -	  ctx->cflags = tre_stack_pop_int(stack); -	  break; +	parse_iter: +		/* extension: repetitions are accepted after an empty node +		   eg. (+), ^*, a$?, a|{2} */ +		switch (*s) { +		case '+': +		case '?': +			if (!ere) +				break; +			/* fallthrough */ +		case '*':; +			int min=0, max=-1; +			if (*s == '+') +				min = 1; +			if (*s == '?') +				max = 1; +			s++; +			ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0); +			if (!ctx->n) +				return REG_ESPACE; +			/* extension: multiple consecutive *+?{,} is unspecified, +			   but (a+)+ has to be supported so accepting a++ makes +			   sense, note however that the RE_DUP_MAX limit can be +			   circumvented: (a{255}){255} uses a lot of memory.. */ +			goto parse_iter; +		case '\\': +			if (ere || s[1] != '{') +				break; +			s++; +			goto parse_brace; +		case '{': +			if (!ere) +				break; +		parse_brace: +			err = parse_dup(ctx, s+1); +			if (err != REG_OK) +				return err; +			s = ctx->s; +			goto parse_iter; +		} -	default: -	  assert(0); -	  break; +		nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n); +		if ((ere && *s == '|') || +		    (ere && *s == ')' && depth) || +		    (!ere && *s == '\\' && s[1] == ')') || +		    !*s) { +			/* extension: empty branch is unspecified (), (|a), (a|) +			   here they are not rejected but match on empty string */ +			int c = *s; +			nunion = tre_ast_new_union(ctx->mem, nunion, nbranch); +			nbranch = 0; +			if (c != '|') { +				if (c == '\\') { +					if (!depth) return REG_EPAREN; +					s+=2; +				} else if (c == ')') +					s++; +				depth--; +				err = marksub(ctx, nunion, tre_stack_pop_int(stack)); +				if (err != REG_OK) +					return err; +				if (!c && depth<0) { +					ctx->submatch_id = subid; +					return REG_OK; +				} +				if (!c || depth<0) +					return REG_EPAREN; +				nbranch = tre_stack_pop_voidptr(stack); +				nunion = tre_stack_pop_voidptr(stack); +				goto parse_iter; +			} +			s++; +		}  	} -    } - -  /* Check for missing closing parentheses. */ -  if (depth > 0) -    return REG_EPAREN; - -  if (status == REG_OK) -    ctx->result = result; - -  return status;  } -  /***********************************************************************   from tre-compile.c  ***********************************************************************/ @@ -3122,12 +2680,7 @@ regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)    if (errcode != REG_OK)      ERROR_EXIT(errcode);    preg->re_nsub = parse_ctx.submatch_id - 1; -  tree = parse_ctx.result; - -  /* Back references and approximate matching cannot currently be used -     in the same regexp. */ -  if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx) -    ERROR_EXIT(REG_BADPAT); +  tree = parse_ctx.n;  #ifdef TRE_DEBUG    tre_ast_print(tree); @@ -3142,7 +2695,7 @@ regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)    if (tnfa == NULL)      ERROR_EXIT(REG_ESPACE);    tnfa->have_backrefs = parse_ctx.max_backref >= 0; -  tnfa->have_approx = parse_ctx.have_approx; +  tnfa->have_approx = 0;    tnfa->num_submatches = parse_ctx.submatch_id;    /* Set up tags for submatch addressing.  If REG_NOSUB is set and the | 
