From d7a90b35b9b5336a3e730e09d56f1c53f4e946ee Mon Sep 17 00:00:00 2001 From: Rich Felker Date: Mon, 7 May 2012 14:50:49 -0400 Subject: fix ugly bugs in TRE regex parser 1. * in BRE is not special at the beginning of the regex or a subexpression. this broke ncurses' build scripts. 2. \\( in BRE is a literal \ followed by a literal (, not a literal \ followed by a subexpression opener. 3. the ^ in \\(^ in BRE is a literal ^ only at the beginning of the entire BRE. POSIX allows treating it as an anchor at the beginning of a subexpression, but TRE's code for checking if it was at the beginning of a subexpression was wrong, and fixing it for the sake of supporting a non-portable usage was too much trouble when just removing this non-portable behavior was much easier. this patch also moved lots of the ugly logic for empty atom checking out of the default/literal case and into new cases for the relevant characters. this should make parsing faster and make the code smaller. if nothing else it's a lot more readable/logical. at some point i'd like to revisit and overhaul lots of this code... --- src/regex/regcomp.c | 91 ++++++++++++++++++----------------------------------- 1 file changed, 31 insertions(+), 60 deletions(-) diff --git a/src/regex/regcomp.c b/src/regex/regcomp.c index f8ebe409..fa79e2e5 100644 --- a/src/regex/regcomp.c +++ b/src/regex/regcomp.c @@ -961,6 +961,8 @@ tre_parse(tre_parse_ctx_t *ctx) tre_stack_t *stack = ctx->stack; int bottom = tre_stack_num_objects(stack); int depth = 0; + wchar_t wc; + int clen; if (!ctx->nofirstsub) { @@ -1155,10 +1157,9 @@ tre_parse(tre_parse_ctx_t *ctx) { case CHAR_LPAREN: /* parenthesized subexpression */ - if (ctx->cflags & REG_EXTENDED - || (ctx->re > ctx->re_start - && *(ctx->re - 1) == CHAR_BACKSLASH)) + if (ctx->cflags & REG_EXTENDED) { + lparen: depth++; { ctx->re++; @@ -1174,25 +1175,6 @@ tre_parse(tre_parse_ctx_t *ctx) goto parse_literal; break; - case CHAR_RPAREN: /* end of current subexpression */ - if ((ctx->cflags & REG_EXTENDED && depth > 0) - || (ctx->re > ctx->re_start - && *(ctx->re - 1) == CHAR_BACKSLASH)) - { - /* We were expecting an atom, but instead the current - subexpression was closed. POSIX leaves the meaning of - this to be implementation-defined. We interpret this as - an empty expression (which matches an empty string). */ - result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); - if (result == NULL) - return REG_ESPACE; - if (!(ctx->cflags & REG_EXTENDED)) - ctx->re--; - } - else - goto parse_literal; - break; - case CHAR_LBRACKET: /* bracket expression */ ctx->re++; status = tre_parse_bracket(ctx, &result); @@ -1203,13 +1185,14 @@ tre_parse(tre_parse_ctx_t *ctx) case CHAR_BACKSLASH: /* If this is "\(" or "\)" chew off the backslash and try again. */ - if (!(ctx->cflags & REG_EXTENDED) - && (*(ctx->re + 1) == CHAR_LPAREN - || *(ctx->re + 1) == CHAR_RPAREN)) + if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN) { ctx->re++; - STACK_PUSHX(stack, int, PARSE_ATOM); - break; + goto lparen; + } + if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN) + { + goto empty_atom; } /* If a macro is used, parse the expanded macro recursively. */ @@ -1369,12 +1352,9 @@ tre_parse(tre_parse_ctx_t *ctx) break; case CHAR_CARET: /* beginning of line assertion */ - /* '^' has a special meaning everywhere in EREs, and in the - beginning of the RE and after \( is BREs. */ + /* '^' has a special meaning everywhere in EREs, and at + beginning of BRE. */ if (ctx->cflags & REG_EXTENDED - || (ctx->re - 2 >= ctx->re_start - && *(ctx->re - 2) == CHAR_BACKSLASH - && *(ctx->re - 1) == CHAR_LPAREN) || ctx->re == ctx->re_start) { result = tre_ast_new_literal(ctx->mem, ASSERTION, @@ -1389,10 +1369,8 @@ tre_parse(tre_parse_ctx_t *ctx) case CHAR_DOLLAR: /* end of line assertion. */ /* '$' is special everywhere in EREs, and in the end of the - string and before \) is BREs. */ + string in BREs. */ if (ctx->cflags & REG_EXTENDED - || (*(ctx->re + 1) == CHAR_BACKSLASH - && *(ctx->re + 2) == CHAR_RPAREN) || !*(ctx->re + 1)) { result = tre_ast_new_literal(ctx->mem, ASSERTION, @@ -1405,34 +1383,27 @@ tre_parse(tre_parse_ctx_t *ctx) goto parse_literal; break; + case CHAR_RPAREN: + if (!depth) + goto parse_literal; + case CHAR_STAR: + case CHAR_PIPE: + case CHAR_LBRACE: + case CHAR_PLUS: + case CHAR_QUESTIONMARK: + if (!(ctx->cflags & REG_EXTENDED)) + goto parse_literal; + + empty_atom: + result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); + if (!result) + return REG_ESPACE; + break; + default: parse_literal: - /* We are expecting an atom. If the subexpression (or the whole - regexp ends here, we interpret it as an empty expression - (which matches an empty string). */ - if ( - (!*ctx->re - || *ctx->re == CHAR_STAR - || (ctx->cflags & REG_EXTENDED - && (*ctx->re == CHAR_PIPE - || *ctx->re == CHAR_LBRACE - || *ctx->re == CHAR_PLUS - || *ctx->re == CHAR_QUESTIONMARK)) - /* Test for "\)" in BRE mode. */ - || (!(ctx->cflags & REG_EXTENDED) - && !*(ctx->re + 1) - && *ctx->re == CHAR_BACKSLASH - && *(ctx->re + 1) == CHAR_LBRACE))) - { - result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); - if (!result) - return REG_ESPACE; - break; - } - - wchar_t wc; - int clen = mbtowc(&wc, ctx->re, -1); + clen = mbtowc(&wc, ctx->re, -1); if (clen<0) clen=1, wc=WEOF; /* Note that we can't use an tre_isalpha() test here, since there -- cgit v1.2.1