diff options
| author | Rich Felker <dalias@aerifal.cx> | 2012-05-07 14:50:49 -0400 | 
|---|---|---|
| committer | Rich Felker <dalias@aerifal.cx> | 2012-05-07 14:50:49 -0400 | 
| commit | d7a90b35b9b5336a3e730e09d56f1c53f4e946ee (patch) | |
| tree | 9352610f23666ee1e0427d90b7ab12aeb8469592 /src/regex | |
| parent | 2d3e2a7fc13296bb682b9a48b81f1c7130a94bde (diff) | |
| download | musl-d7a90b35b9b5336a3e730e09d56f1c53f4e946ee.tar.gz | |
fix ugly bugs in TRE regex parser
1. * in BRE is not special at the beginning of the regex or a
subexpression. this broke ncurses' build scripts.
2. \\( in BRE is a literal \ followed by a literal (, not a literal \
followed by a subexpression opener.
3. the ^ in \\(^ in BRE is a literal ^; ^ is an anchor only at the
beginning of the entire BRE. POSIX allows treating it as an anchor at
the beginning of a subexpression, but TRE's code for checking if it was
at the beginning of a subexpression was wrong, and fixing it for the
sake of supporting a non-portable usage was too much trouble when just
removing this non-portable behavior was much easier.
this patch also moved lots of the ugly logic for empty atom checking
out of the default/literal case and into new cases for the relevant
characters. this should make parsing faster and make the code smaller.
if nothing else it's a lot more readable/logical.
at some point i'd like to revisit and overhaul lots of this code...
Diffstat (limited to 'src/regex')
| -rw-r--r-- | src/regex/regcomp.c | 91 | 
1 files changed, 31 insertions, 60 deletions
| diff --git a/src/regex/regcomp.c b/src/regex/regcomp.c index f8ebe409..fa79e2e5 100644 --- a/src/regex/regcomp.c +++ b/src/regex/regcomp.c @@ -961,6 +961,8 @@ tre_parse(tre_parse_ctx_t *ctx)    tre_stack_t *stack = ctx->stack;    int bottom = tre_stack_num_objects(stack);    int depth = 0; +  wchar_t wc; +  int clen;    if (!ctx->nofirstsub)      { @@ -1155,10 +1157,9 @@ tre_parse(tre_parse_ctx_t *ctx)  	    {  	    case CHAR_LPAREN:  /* parenthesized subexpression */ -	      if (ctx->cflags & REG_EXTENDED -		  || (ctx->re > ctx->re_start -		      && *(ctx->re - 1) == CHAR_BACKSLASH)) +	      if (ctx->cflags & REG_EXTENDED)  		{ +		lparen:  		  depth++;  		    {  		      ctx->re++; @@ -1174,25 +1175,6 @@ tre_parse(tre_parse_ctx_t *ctx)  		goto parse_literal;  	      break; -	    case CHAR_RPAREN:  /* end of current subexpression */ -	      if ((ctx->cflags & REG_EXTENDED && depth > 0) -		  || (ctx->re > ctx->re_start -		      && *(ctx->re - 1) == CHAR_BACKSLASH)) -		{ -		  /* We were expecting an atom, but instead the current -		     subexpression was closed.	POSIX leaves the meaning of -		     this to be implementation-defined.	 We interpret this as -		     an empty expression (which matches an empty string).  */ -		  result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); -		  if (result == NULL) -		    return REG_ESPACE; -		  if (!(ctx->cflags & REG_EXTENDED)) -		    ctx->re--; -		} -	      else -		goto parse_literal; -	      break; -  	    case CHAR_LBRACKET: /* bracket expression */  	      ctx->re++;  	      status = tre_parse_bracket(ctx, &result); @@ -1203,13 +1185,14 @@ tre_parse(tre_parse_ctx_t *ctx)  	    case CHAR_BACKSLASH:  	      /* If this is "\(" or "\)" chew off the backslash and  		 try again. 
*/ -	      if (!(ctx->cflags & REG_EXTENDED) -		  && (*(ctx->re + 1) == CHAR_LPAREN -		      || *(ctx->re + 1) == CHAR_RPAREN)) +	      if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN)  		{  		  ctx->re++; -		  STACK_PUSHX(stack, int, PARSE_ATOM); -		  break; +		  goto lparen; +		} +	      if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN) +		{ +		  goto empty_atom;  		}  	      /* If a macro is used, parse the expanded macro recursively. */ @@ -1369,12 +1352,9 @@ tre_parse(tre_parse_ctx_t *ctx)  	      break;  	    case CHAR_CARET:	 /* beginning of line assertion */ -	      /* '^' has a special meaning everywhere in EREs, and in the -		 beginning of the RE and after \( is BREs. */ +	      /* '^' has a special meaning everywhere in EREs, and at +		 beginning of BRE. */  	      if (ctx->cflags & REG_EXTENDED -		  || (ctx->re - 2 >= ctx->re_start -		      && *(ctx->re - 2) == CHAR_BACKSLASH -		      && *(ctx->re - 1) == CHAR_LPAREN)  		  || ctx->re == ctx->re_start)  		{  		  result = tre_ast_new_literal(ctx->mem, ASSERTION, @@ -1389,10 +1369,8 @@ tre_parse(tre_parse_ctx_t *ctx)  	    case CHAR_DOLLAR:	 /* end of line assertion. */  	      /* '$' is special everywhere in EREs, and in the end of the -		 string and before \) is BREs. */ +		 string in BREs. 
*/  	      if (ctx->cflags & REG_EXTENDED -		  || (*(ctx->re + 1) == CHAR_BACKSLASH -		      && *(ctx->re + 2) == CHAR_RPAREN)  		  || !*(ctx->re + 1))  		{  		  result = tre_ast_new_literal(ctx->mem, ASSERTION, @@ -1405,34 +1383,27 @@ tre_parse(tre_parse_ctx_t *ctx)  		goto parse_literal;  	      break; +	    case CHAR_RPAREN: +	      if (!depth) +	        goto parse_literal; +	    case CHAR_STAR: +	    case CHAR_PIPE: +	    case CHAR_LBRACE: +	    case CHAR_PLUS: +	    case CHAR_QUESTIONMARK: +	      if (!(ctx->cflags & REG_EXTENDED)) +	        goto parse_literal; + +	    empty_atom: +	      result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); +	      if (!result) +		return REG_ESPACE; +	      break; +  	    default:  	    parse_literal: -	      /* We are expecting an atom.  If the subexpression (or the whole -		 regexp ends here, we interpret it as an empty expression -		 (which matches an empty string).  */ -	      if ( -		  (!*ctx->re -		   || *ctx->re == CHAR_STAR -		   || (ctx->cflags & REG_EXTENDED -		       && (*ctx->re == CHAR_PIPE -			   || *ctx->re == CHAR_LBRACE -			   || *ctx->re == CHAR_PLUS -			   || *ctx->re == CHAR_QUESTIONMARK)) -		   /* Test for "\)" in BRE mode. */ -		   || (!(ctx->cflags & REG_EXTENDED) -		       && !*(ctx->re + 1) -		       && *ctx->re == CHAR_BACKSLASH -		       && *(ctx->re + 1) == CHAR_LBRACE))) -		{ -		  result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); -		  if (!result) -		    return REG_ESPACE; -		  break; -		} - -	      wchar_t wc; -	      int clen = mbtowc(&wc, ctx->re, -1); +	      clen = mbtowc(&wc, ctx->re, -1);  	      if (clen<0) clen=1, wc=WEOF;  	      /* Note that we can't use an tre_isalpha() test here, since there | 
