diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/regex/fnmatch.c | 404 | 
1 files changed, 273 insertions, 131 deletions
| diff --git a/src/regex/fnmatch.c b/src/regex/fnmatch.c index 81f0c7bd..ffd3ea0d 100644 --- a/src/regex/fnmatch.c +++ b/src/regex/fnmatch.c @@ -1,157 +1,299 @@ -#include <fnmatch.h> -#include <wctype.h> +/* + * An implementation of what I call the "Sea of Stars" algorithm for + * POSIX fnmatch(). The basic idea is that we factor the pattern into + * a head component (which we match first and can reject without ever + * measuring the length of the string), an optional tail component + * (which only exists if the pattern contains at least one star), and + * an optional "sea of stars", a set of star-separated components + * between the head and tail. After the head and tail matches have + * been removed from the input string, the components in the "sea of + * stars" are matched sequentially by searching for their first + * occurrence past the end of the previous match. + * + * - Rich Felker, April 2012 + */ +  #include <string.h> -#include <wchar.h> +#include <fnmatch.h>  #include <stdlib.h> -#include <limits.h> +#include <wchar.h> +#include <wctype.h> -static int next(const char **s) +#define END -1 +#define UNMATCHABLE -2 +#define BRACKET -3 +#define QUESTION -4 +#define STAR -5 + +static int str_next(const char *str, size_t n, size_t *step)  { -	wchar_t c; -	int l = mbtowc(&c, *s, MB_LEN_MAX); -	/* hack to allow literal matches of invalid byte sequences */ -	if (l < 0) return (unsigned char)*(*s)++ - 0x100; -	*s += l; -	return c; +	if (!n) { +		*step = 0; +		return 0; +	} +	if (str[0] >= 128U) { +		wchar_t wc; +		int k = mbtowc(&wc, str, n); +		if (k<0) { +			*step = 1; +			return -1; +		} +		*step = k; +		return wc; +	} +	*step = 1; +	return str[0];  } -#define BRACKET_ERROR  -0x100 -#define BRACKET_NOCHAR -0x101 - -static int bracket_next(const char **s) +static int pat_next(const char *pat, size_t m, size_t *step, int flags)  { -	int c; -	int type; -	if (**s == '[') { -		type = *(*s+1); -		if (type == '.' || type == '=') { -			*s += 2; -			c = next(s); -			if (c <= 0) return BRACKET_ERROR; -			if (**s == type && *(*s+1) == ']') { -				*s += 2; -				return c; +	int esc = 0; +	if (!m || !*pat) { +		*step = 0; +		return END; +	} +	*step = 1; +	if (pat[0]=='\\' && !(flags & FNM_NOESCAPE)) { +		*step = 2; +		pat++; +		esc = 1; +		goto escaped; +	} +	if (pat[0]=='[') { +		size_t k = 1; +		if (k<m) if (pat[k] == '^' || pat[k] == '!') k++; +		if (k<m) if (pat[k] == ']') k++; +		for (; k<m && pat[k] && pat[k]!=']'; k++) { +			if (k+1<m && pat[k+1] && pat[k]=='[' && (pat[k+1]==':' || pat[k+1]=='.' || pat[k+1]=='=')) { +				int z = pat[k+1]; +				k+=2; +				if (k<m && pat[k]) k++; +				while (k<m && pat[k] && (pat[k-1]!=z || pat[k]!=']')) k++; +				if (k==m || !pat[k]) break;  			} -			for (; **s && (**s != type || *(*s+1) != ']'); (*s)++); -			if (!**s) return BRACKET_ERROR; -			*s += 2; -			return BRACKET_NOCHAR;  		} +		if (k==m || !pat[k]) { +			*step = 1; +			return '['; +		} +		*step = k+1; +		return BRACKET; +	} +	if (pat[0] == '*') +		return STAR; +	if (pat[0] == '?') +		return QUESTION; +escaped: +	if (pat[0] >= 128U) { +		wchar_t wc; +		int k = mbtowc(&wc, pat, m); +		if (k<0) { +			*step = 0; +			return UNMATCHABLE; +		} +		*step = k + esc; +		return wc;  	} -	c = next(s); -	if (c <= 0) return BRACKET_ERROR; -	return c; +	return pat[0];  } -#define __FNM_CONT 0x8000 +static int match_bracket(const char *p, int k) +{ +	wchar_t wc; +	int inv = 0; +	p++; +	if (*p=='^' || *p=='!') { +		inv = 1; +		p++; +	} +	if (*p==']') { +		if (k==']') return !inv; +		p++; +	} else if (*p=='-') { +		if (k=='-') return !inv; +		p++; +	} +	wc = p[-1]; +	for (; *p != ']'; p++) { +		if (p[0]=='-' && p[1]!=']') { +			wchar_t wc2; +			int l = mbtowc(&wc2, p+1, 4); +			if (l < 0) return 0; +			if (wc<=wc2 && (unsigned)k-wc <= wc2-wc) return !inv; +			p += l-1; +			continue; +		} +		if (p[0]=='[' && (p[1]==':' || p[1]=='.' || p[1]=='=')) { +			const char *p0 = p+2; +			int z = p[1]; +			p+=3; +			while (p[-1]!=z || p[0]!=']') p++; +			if (z == ':' && p-1-p0 < 16) { +				char buf[16]; +				memcpy(buf, p0, p-1-p0); +				buf[p-1-p0] = 0; +				if (iswctype(k, wctype(buf))) return !inv; +			} +			continue; +		} +		if (*p < 128U) { +			wc = (unsigned char)*p; +		} else { +			int l = mbtowc(&wc, p, 4); +			if (l < 0) return 0; +			p += l-1; +		} +		if (wc==k) return !inv; +	} +	return inv; +} -int fnmatch(const char *p, const char *s, int flags) +static int fnmatch_internal(const char *pat, size_t m, const char *str, size_t n, int flags)  { -	int c, d, k; -	int not; -	int match; -	int first; -	int no_slash = (flags & FNM_PATHNAME) ? '/' : 0; -	int no_period = (flags & FNM_PERIOD) && !(flags & __FNM_CONT) ? '.' : 0x100; -	const char *p1; - -	flags |= __FNM_CONT; - -	while ((c = *p++)) { -		switch (c) { -		case '?': -			k = next(&s); -			if (!k || k == no_period || k == no_slash) -				return FNM_NOMATCH; +	const char *p, *ptail, *endpat; +	const char *s, *stail, *endstr; +	size_t pinc, sinc, tailcnt=0; +	int c, k; + +	if (flags & FNM_PERIOD) { +		if (*str == '.' && *pat != '.') +			return FNM_NOMATCH; +	} +	for (;;) { +		switch ((c = pat_next(pat, m, &pinc, flags))) { +		case UNMATCHABLE: +			return FNM_NOMATCH; +		case STAR: +			pat++; +			m--;  			break; -		case '\\': -			if (!(flags & FNM_NOESCAPE)) { -				c = *p++; -				goto literal; +		default: +			k = str_next(str, n, &sinc); +			if (k <= 0) +				return (c==END) ? 0 : FNM_NOMATCH; +			str += sinc; +			n -= sinc; +			if (c == BRACKET) { +				if (!match_bracket(pat, k)) +					return FNM_NOMATCH; +			} else if (c != QUESTION && k != c) { +				return FNM_NOMATCH;  			} -			if (*s++ != c) return FNM_NOMATCH; +			pat+=pinc; +			m-=pinc; +			continue; +		} +		break; +	} + +	/* Compute real pat length if it was initially unknown/-1 */ +	m = strnlen(pat, m); +	endpat = pat + m; + +	/* Find the last * in pat and count chars needed after it */ +	for (p=ptail=pat; p<endpat; p+=pinc) { +		switch (pat_next(p, endpat-p, &pinc, flags)) { +		case UNMATCHABLE: +			return FNM_NOMATCH; +		case STAR: +			tailcnt=0; +			ptail = p+1;  			break; -		case '*': -			for (; *p == '*'; p++); -			if (*p && !*s) return FNM_NOMATCH; -			if (*s == no_period) +		default: +			tailcnt++; +			break; +		} +	} + +	/* Past this point we need not check for UNMATCHABLE in pat, +	 * because all of pat has already been parsed once. */ + +	/* Compute real str length if it was initially unknown/-1 */ +	n = strnlen(str, n); +	endstr = str + n; +	if (n < tailcnt) return FNM_NOMATCH; + +	/* Find the final tailcnt chars of str, accounting for UTF-8. +	 * On illegal sequences we may get it wrong, but in that case +	 * we necessarily have a matching failure anyway. */ +	for (s=endstr; s>str && tailcnt; tailcnt--) { +		if (s[-1] < 128U) s--; +		else while ((unsigned char)*--s-0x80U<0x40 && s>str); +	} +	if (tailcnt) return FNM_NOMATCH; +	stail = s; + +	/* Check that the pat and str tails match */ +	p = ptail; +	for (;;) { +		c = pat_next(p, endpat-p, &pinc, flags); +		p += pinc; +		if ((k = str_next(s, endstr-s, &sinc)) <= 0) { +			if (c != END) return FNM_NOMATCH; +			break; +		} +		s += sinc; +		if (c == BRACKET) { +			if (!match_bracket(p-pinc, k))  				return FNM_NOMATCH; -			if (!*p && (!no_slash || !strchr(s, no_slash))) -				return 0; -			for (; *s; s++) -				if (!fnmatch(p, s, flags)) -					return 0; -				else if (*s == no_slash) -					break; +		} else if (c != QUESTION && k != c) {  			return FNM_NOMATCH; -		case '[': -			p1 = p-1; -			not = (*p == '!' || *p == '^'); -			if (not) p++; -			k = next(&s); -			if (!k || k == no_slash || k == no_period) -				return FNM_NOMATCH; -			match = 0; -			first = 1; -			for (;;) { -				if (!*p) goto literal_bracket; -				if (*p == ']' && !first) break; -				first = 0; -				if (*p == '[' && *(p+1) == ':') { -					const char *z; -					p += 2; -					for (z=p; *z && (*z != ':' || *(z+1) != ']'); z++); -					if (!*z || z-p > 32) { /* FIXME: symbolic const? */ -						return FNM_NOMATCH; -					} else { -						char class[33]; -						memcpy(class, p, z-p); -						class[z-p] = 0; -						if (iswctype(k, wctype(class))) -							match = 1; -					} -					p = z+2; -					continue; -				} -				c = bracket_next(&p); -				if (c == BRACKET_ERROR) { -literal_bracket: -					match = (k=='['); -					p = p1; -					not = 0; -					break; -				} -				if (c == BRACKET_NOCHAR) -					continue; -				if (*p == '-' && *(p+1) != ']') { -					p++; -					d = bracket_next(&p); -					if (d == BRACKET_ERROR) -						goto literal_bracket; -					if (d == BRACKET_NOCHAR) -						continue; -					if (k >= c && k <= d) -						match = 1; -					continue; -				} -				if (k == c) match = 1; +		} +	} + +	/* We're all done with the tails now, so throw them out */ +	endstr = stail; +	endpat = ptail; + +	/* Match pattern components until there are none left */ +	while (pat<endpat) { +		p = pat; +		s = str; +		for (;;) { +			c = pat_next(p, endpat-p, &pinc, flags); +			p += pinc; +			/* Encountering * completes/commits a component */ +			if (c == STAR) { +				pat = p; +				str = s; +				break;  			} -			p++; -			if (not == match) +			k = str_next(s, endstr-s, &sinc); +			if (!k)  				return FNM_NOMATCH; -			break; -		default: -		literal: -			if (*s++ != c) -				return FNM_NOMATCH; -			if (c == no_slash && (flags & FNM_PERIOD)) { -				no_period = '.'; -				continue; +			if (c == BRACKET) { +				if (!match_bracket(p-pinc, k)) +					break; +			} else if (c != QUESTION && k != c) { +				break;  			} -			break; +			s += sinc;  		} -		no_period = 0x100; +		if (c == STAR) continue; +		/* If we failed, advance str, by 1 char if it's a valid +		 * char, or past all invalid bytes otherwise. */ +		k = str_next(str, endstr-str, &sinc); +		if (k > 0) str += sinc; +		else for (str++; str_next(str, endstr-str, &sinc)<0; str++);  	} -	if (*s) return FNM_NOMATCH; +  	return 0;  } + +int fnmatch(const char *pat, const char *str, int flags) +{ +	const char *s, *p; +	size_t inc; +	int c; +	if (flags & FNM_PATHNAME) for (;;) { +		for (s=str; *s && *s!='/'; s++); +		for (p=pat; (c=pat_next(p, -1, &inc, flags))!=END && c!='/'; p+=inc); +		if (*s && *p!=*s) return FNM_NOMATCH; +		if (fnmatch_internal(pat, p-pat, str, s-str, flags)) +			return FNM_NOMATCH; +		if (!*s && c==END) return 0; +		str = s+1; +		pat = p+1; +	} +	return fnmatch_internal(pat, -1, str, -1, flags); +} | 
