PERFORCE change 144086 for review

Wed Jun 25 12:05:28 UTC 2008

http://perforce.freebsd.org/chv.cgi?CH=144086

Change 144086 by gabor at gabor_server on 2008/06/25 12:05:02

	- Remove the fastgrep code. The regex library can auto-detect the fixed
	  strings and operate with the same speed. If -F is specified, also
	  set the REG_NOSPEC explicitly.
	- As the whole fastgrep code is removed, there is no need to fix
	  the wide char handling there, grep should be localized now.
	
	Obtained from:	NetBSD Project

Affected files ...

.. //depot/projects/soc2008/gabor_textproc/grep/grep.c#46 edit
.. //depot/projects/soc2008/gabor_textproc/grep/grep.h#26 edit
.. //depot/projects/soc2008/gabor_textproc/grep/util.c#40 edit

Differences ...

==== //depot/projects/soc2008/gabor_textproc/grep/grep.c#46 (text+ko) ====

@@ -84,7 +84,6 @@
 int		 patterns, pattern_sz;
 char		**pattern;
 regex_t		*r_pattern;
-struct fastgrep	*fg_pattern;
 
 /* For regex errors  */
 char	 re_error[RE_ERROR_BUF + 1];
@@ -616,24 +615,17 @@
 		++argv;
 	}
 
-	if (Eflag)
+	if (Fflag)
+		cflags |= REG_NOSPEC;
+	else if (Eflag)
 		cflags |= REG_EXTENDED;
-	fg_pattern = grep_calloc(patterns, sizeof(*fg_pattern));
 	r_pattern = grep_calloc(patterns, sizeof(*r_pattern));
 	for (i = 0; i < patterns; ++i) {
-		/* Check if cheating is allowed (always is for fgrep). */
-		if (Fflag) {
-			fgrepcomp(&fg_pattern[i], pattern[i]);
-		} else {
-			if (fastcomp(&fg_pattern[i], pattern[i])) {
-				/* Fall back to full regex library */
-				c = regcomp(&r_pattern[i], pattern[i], cflags);
-				if (c != 0) {
-					regerror(c, &r_pattern[i], re_error,
-					    RE_ERROR_BUF);
-					errx(2, "%s", re_error);
-				}
-			}
+		c = regcomp(&r_pattern[i], pattern[i], cflags);
+		if (c != 0) {
+			regerror(c, &r_pattern[i], re_error,
+			    RE_ERROR_BUF);
+			errx(2, "%s", re_error);
 		}
 	}
 

==== //depot/projects/soc2008/gabor_textproc/grep/grep.h#26 (text+ko) ====

@@ -98,7 +98,6 @@
 
 extern int	 first, prev, matchall, patterns, tail, notfound;
 extern char    **pattern;
-extern struct fastgrep *fg_pattern;
 extern regex_t	*r_pattern;
 
 /* For regex errors  */
@@ -112,8 +111,6 @@
 void	*grep_calloc(size_t nmemb, size_t size);
 void	*grep_realloc(void *ptr, size_t size);
 void	 printline(struct str *line, int sep);
-int	 fastcomp(struct fastgrep *, const char *);
-void	 fgrepcomp(struct fastgrep *, const char *);
 
 /* queue.c */
 void	 initqueue(void);

==== //depot/projects/soc2008/gabor_textproc/grep/util.c#40 (text+ko) ====

@@ -56,9 +56,6 @@
 
 static int	 linesqueued;
 static int	 procline(struct str *l, int);
-static int	 grep_search(struct fastgrep *, unsigned char *, size_t, regmatch_t *pmatch);
-static int	 grep_cmp(const unsigned char *, const unsigned char *, size_t);
-static void	 grep_revstr(unsigned char *, int);
 
 int
 grep_tree(char **argv)
@@ -204,13 +201,6 @@
 	return (c);
 }
 
-
-/*
- * Process an individual line in a file. Return non-zero if it matches.
- */
-
-#define isword(x)	(isalnum(x) || (x) == '_')
-
 static int
 procline(struct str *l, int nottext)
 {
@@ -219,14 +209,9 @@
 
 	if (!matchall) {
 		for (c = i = 0; i < patterns; i++) {
-			if (fg_pattern[i].pattern) {
-				r = grep_search(&fg_pattern[i], (unsigned char *)l->dat,
-				    l->len, &pmatch);
-			} else {
-				pmatch.rm_so = 0;
-				pmatch.rm_eo = l->len;
-				r = regexec(&r_pattern[i], l->dat, 1, &pmatch, eflags);
-			}
+			pmatch.rm_so = 0;
+			pmatch.rm_eo = l->len;
+			r = regexec(&r_pattern[i], l->dat, 1, &pmatch, eflags);
 			if (r == 0 && xflag) {
 				if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len)
 					r = REG_NOMATCH;
@@ -306,283 +291,6 @@
 	return (c);
 }
 
-void
-fgrepcomp(struct fastgrep *fg, const char *pattern)
-{
-	int	 i;
-
-	/* Initialize. */
-	fg->patternLen = strlen(pattern);
-	fg->bol = 0;
-	fg->eol = 0;
-	fg->wmatch = wflag;
-	fg->reversedSearch = 0;
-
-	/*
-	 * Make a copy and upper case it for later if in -i mode,
-	 * else just copy the pointer.
-	 */
-	if (iflag) {
-		fg->pattern = grep_malloc(fg->patternLen + 1);
-		for (i = 0; i < fg->patternLen; i++)
-			fg->pattern[i] = toupper(pattern[i]);
-		fg->pattern[fg->patternLen] = '\0';
-	} else
-		fg->pattern = (unsigned char *)pattern;	/* really const */
-
-	/* Preprocess pattern. */
-	for (i = 0; i <= UCHAR_MAX; i++)
-		fg->qsBc[i] = fg->patternLen;
-	for (i = 1; i < fg->patternLen; i++) {
-		fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
-		/*
-		 * If case is ignored, make the jump apply to both upper and
-		 * lower cased characters.  As the pattern is stored in upper
-		 * case, apply the same to the lower case equivalents.
-		 */
-		if (iflag)
-			fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
-	}
-}
-
-/*
- * Returns: -1 on failure, 0 on success
- */
-int
-fastcomp(struct fastgrep *fg, const char *pattern)
-{
-	int	 i;
-	int	 bol = 0;
-	int	 eol = 0;
-	int	 shiftPatternLen;
-	int	 hasDot = 0;
-	int	 firstHalfDot = -1;
-	int	 firstLastHalfDot = -1;
-	int	 lastHalfDot = 0;
-
-	/* Initialize. */
-	fg->patternLen = strlen(pattern);
-	fg->bol = 0;
-	fg->eol = 0;
-	fg->wmatch = 0;
-	fg->reversedSearch = 0;
-
-	/* Remove end-of-line character ('$'). */
-	if (pattern[fg->patternLen - 1] == '$') {
-		eol++;
-		fg->eol = 1;
-		fg->patternLen--;
-	}
-
-	/* Remove beginning-of-line character ('^'). */
-	if (pattern[0] == '^') {
-		bol++;
-		fg->bol = 1;
-		fg->patternLen--;
-	}
-
-	/* Remove enclosing [[:<:]] and [[:>:]] (word match). */
-	if (wflag) {
-		/* basic re's use \( \), extended re's ( ) */
-		int extra = Eflag ? 1 : 2;
-		fg->patternLen -= 14 + 2 * extra;
-		fg->wmatch = 7 + extra;
-	} else if (fg->patternLen >= 14 &&
-	    strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 &&
-	    strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) {
-		fg->patternLen -= 14;
-		fg->wmatch = 7;
-	}
-
-	/*
-	 * Copy pattern minus '^' and '$' characters as well as word
-	 * match character classes at the beginning and ending of the
-	 * string respectively.
-	 */
-	fg->pattern = grep_malloc(fg->patternLen + 1);
-	memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen);
-	fg->pattern[fg->patternLen] = '\0';
-
-	/* Look for ways to cheat...er...avoid the full regex engine. */
-	for (i = 0; i < fg->patternLen; i++)
-	{
-		/* Can still cheat? */
-		if ((isalnum(fg->pattern[i])) || isspace(fg->pattern[i]) ||
-		    (fg->pattern[i] == '_') || (fg->pattern[i] == ',') ||
-		    (fg->pattern[i] == '=') || (fg->pattern[i] == '-') ||
-		    (fg->pattern[i] == ':') || (fg->pattern[i] == '/')) {
-			/* As long as it is good, upper case it for later. */
-			if (iflag)
-				fg->pattern[i] = toupper(fg->pattern[i]);
-		} else if (fg->pattern[i] == '.') {
-			hasDot = i;
-			if (i < fg->patternLen / 2) {
-				if (firstHalfDot < 0)
-					/* Closest dot to the beginning */
-					firstHalfDot = i;
-			} else {
-				/* Closest dot to the end of the pattern. */
-				lastHalfDot = i;
-				if (firstLastHalfDot < 0)
-					firstLastHalfDot = i;
-			}
-		} else {
-			/* Free memory and let others know this is empty. */
-			free(fg->pattern);
-			fg->pattern = NULL;
-			return (-1);
-		}
-	}
-
-	/*
-	 * Determine if a reverse search would be faster based on the placement
-	 * of the dots.
-	 */
-	if ((!(lflag || cflag)) && ((!(bol || eol)) &&
-	    ((lastHalfDot) && ((firstHalfDot < 0) ||
-	    ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) {
-		fg->reversedSearch = 1;
-		hasDot = fg->patternLen - (firstHalfDot < 0 ?
-		    firstLastHalfDot : firstHalfDot) - 1;
-		grep_revstr(fg->pattern, fg->patternLen);
-	}
-
-	/*
-	 * Normal Quick Search would require a shift based on the position the
-	 * next character after the comparison is within the pattern.  With
-	 * wildcards, the position of the last dot effects the maximum shift
-	 * distance.
-	 * The closer to the end the wild card is the slower the search.  A
-	 * reverse version of this algorithm would be useful for wildcards near
-	 * the end of the string.
-	 *
-	 * Examples:
-	 * Pattern	Max shift
-	 * -------	---------
-	 * this		5
-	 * .his		4
-	 * t.is		3
-	 * th.s		2
-	 * thi.		1
-	 */
-
-	/* Adjust the shift based on location of the last dot ('.'). */
-	shiftPatternLen = fg->patternLen - hasDot;
-
-	/* Preprocess pattern. */
-	for (i = 0; i <= UCHAR_MAX; i++)
-		fg->qsBc[i] = shiftPatternLen;
-	for (i = hasDot + 1; i < fg->patternLen; i++) {
-		fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
-		/*
-		 * If case is ignored, make the jump apply to both upper and
-		 * lower cased characters.  As the pattern is stored in upper
-		 * case, apply the same to the lower case equivalents.
-		 */
-		if (iflag)
-			fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
-	}
-
-	/*
-	 * Put pattern back to normal after pre-processing to allow for easy
-	 * comparisons later.
-	 */
-	if (fg->reversedSearch)
-		grep_revstr(fg->pattern, fg->patternLen);
-
-	return (0);
-}
-
-/*
- * Word boundaries using regular expressions are defined as the point
- * of transition from a non-word char to a word char, or vice versa.
- * This means that grep -w +a and grep -w a+ never match anything,
- * because they lack a starting or ending transition, but grep -w a+b
- * does match a line containing a+b.
- */
-#define wmatch(d, l, s, e)	\
-	((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \
-	  e > s && isword(d[s]) && isword(d[e-1]))
-
-static int
-grep_search(struct fastgrep *fg, unsigned char *data, size_t dataLen, regmatch_t *pmatch)
-{
-	int	 j;
-	int	 rtrnVal = REG_NOMATCH;
-
-	pmatch->rm_so = -1;
-	pmatch->rm_eo = -1;
-
-	/* No point in going farther if we do not have enough data. */
-	if (dataLen < fg->patternLen)
-		return (rtrnVal);
-
-	/* Only try once at the beginning or ending of the line. */
-	if (fg->bol || fg->eol) {
-		/* Simple text comparison. */
-		/* Verify data is >= pattern length before searching on it. */
-		if (dataLen >= fg->patternLen) {
-			/* Determine where in data to start search at. */
-			if (fg->eol)
-				j = dataLen - fg->patternLen;
-			else
-				j = 0;
-			if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen)))
-				if (grep_cmp(fg->pattern, data + j,
-				    fg->patternLen) == -1) {
-					pmatch->rm_so = j;
-					pmatch->rm_eo = j + fg->patternLen;
-					if (!fg->wmatch || wmatch(data, dataLen,
-					    pmatch->rm_so, pmatch->rm_eo))
-						rtrnVal = 0;
-				}
-		}
-	} else if (fg->reversedSearch) {
-		/* Quick Search algorithm. */
-		j = dataLen;
-		do {
-			if (grep_cmp(fg->pattern, data + j - fg->patternLen,
-			    fg->patternLen) == -1) {
-				pmatch->rm_so = j - fg->patternLen;
-				pmatch->rm_eo = j;
-				if (!fg->wmatch || wmatch(data, dataLen,
-				    pmatch->rm_so, pmatch->rm_eo)) {
-					rtrnVal = 0;
-					break;
-				}
-			}
-			/* Shift if within bounds, otherwise, we are done. */
-			if (j == fg->patternLen)
-				break;
-			j -= fg->qsBc[data[j - fg->patternLen - 1]];
-		} while (j >= fg->patternLen);
-	} else {
-		/* Quick Search algorithm. */
-		j = 0;
-		do {
-			if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) {
-				pmatch->rm_so = j;
-				pmatch->rm_eo = j + fg->patternLen;
-				if (fg->patternLen == 0 || !fg->wmatch ||
-				    wmatch(data, dataLen, pmatch->rm_so,
-				    pmatch->rm_eo)) {
-					rtrnVal = 0;
-					break;
-				}
-			}
-
-			/* Shift if within bounds, otherwise, we are done. */
-			if (j + fg->patternLen == dataLen)
-				break;
-			else
-				j += fg->qsBc[data[j + fg->patternLen]];
-		} while (j <= (dataLen - fg->patternLen));
-	}
-
-	return (rtrnVal);
-}
-
-
 void *
 grep_malloc(size_t size)
 {
@@ -612,38 +320,6 @@
 	return (ptr);
 }
 
-/*
- * Returns:	i >= 0 on failure (position that it failed)
- *		-1 on success
- */
-static int
-grep_cmp(const unsigned char *pattern, const unsigned char *data, size_t len)
-{
-	int	 i;
-
-	for (i = 0; i < len; i++) {
-		if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.'))
-		    || (iflag && pattern[i] == toupper(data[i])))
-			continue;
-		return (i);
-	}
-
-	return (-1);
-}
-
-static void
-grep_revstr(unsigned char *str, int len)
-{
-	int	 i;
-	char	 c;
-
-	for (i = 0; i < len / 2; i++) {
-		c = str[i];
-		str[i] = str[len - i - 1];
-		str[len - i - 1] = c;
-	}
-}
-
 void
 printline(struct str *line, int sep)
 {