Official request: Please make GNU grep the default

Dimitry Andric dimitry at andric.com
Mon Aug 16 10:42:47 UTC 2010


On 2010-08-15 21:49, Dimitry Andric wrote:
> ...I
> have attached a more complete patch that:
> 
> - Replaces the horrendously inefficient grep_fgetln() with mostly the
>   same implementation as the libc fgetln() function.
> - Uses plain file descriptors instead of struct FILE, since the
>   buffering is done manually anyway, and it makes it easier to support
>   gzip and bzip2.
> - Let the bzip2 reader just read the file as plain data, when the
>   initial magic number doesn't match, mimicking the behaviour of GNU
>   grep.

Here is a new patch, updated against Gabor's changes in r211364.
-------------- next part --------------
diff --git a/usr.bin/grep/fastgrep.c b/usr.bin/grep/fastgrep.c
index c66f0a7..30a2dc5 100644
--- a/usr.bin/grep/fastgrep.c
+++ b/usr.bin/grep/fastgrep.c
@@ -198,7 +198,7 @@ fastcomp(fastgrep_t *fg, const char *pat)
 }
 
 int
-grep_search(fastgrep_t *fg, unsigned char *data, size_t len, regmatch_t *pmatch)
+grep_search(fastgrep_t *fg, const unsigned char *data, size_t len, regmatch_t *pmatch)
 {
 	unsigned int j;
 	int ret = REG_NOMATCH;
diff --git a/usr.bin/grep/file.c b/usr.bin/grep/file.c
index 1872d0e..185ab1d 100644
--- a/usr.bin/grep/file.c
+++ b/usr.bin/grep/file.c
@@ -37,7 +37,7 @@ __FBSDID("$FreeBSD$");
 #include <bzlib.h>
 #include <err.h>
 #include <errno.h>
-#include <stdio.h>
+#include <fcntl.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
@@ -47,138 +47,160 @@ __FBSDID("$FreeBSD$");
 
 #include "grep.h"
 
-static char	 fname[MAXPATHLEN];	/* file name */
+#define	MAXBUFSIZ	(32 * 1024)
+#define	LNBUFBUMP	80
 
-#define		 MAXBUFSIZ	(16 * 1024)
-#define		 PREREAD_M	0.2
+static gzFile gzbufdesc;
+static BZFILE* bzbufdesc;
 
-/* Some global variables for the buffering and reading. */
-static char	*lnbuf;
-static size_t	 lnbuflen;
-static unsigned char *binbuf;
-static int	 binbufsiz;
-unsigned char	*binbufptr;
-static int	 bzerr;
+static unsigned char buffer[MAXBUFSIZ];
+static unsigned char *bufpos;
+static size_t bufrem;
 
-#define iswbinary(ch)	(!iswspace((ch)) && iswcntrl((ch)) && \
-			    (ch != L'\b') && (ch != L'\0'))
+static unsigned char *lnbuf;
+static size_t lnbuflen;
 
-/*
- * Returns a single character according to the file type.
- * Returns -1 on failure.
- */
-static inline int
-grep_fgetc(struct file *f)
+static int
+grep_refill(struct file *f)
 {
-	unsigned char c;
+	ssize_t nr;
+	int bzerr;
 
-	switch (filebehave) {
-	case FILE_STDIO:
-		return (getc_unlocked(f->f));
-	case FILE_GZIP:
-		return (gzgetc(f->gzf));
-	case FILE_BZIP:
-		BZ2_bzRead(&bzerr, f->bzf, &c, 1);
-		if (bzerr == BZ_STREAM_END)
-			return (-1);
-		else if (bzerr != BZ_SEQUENCE_ERROR && bzerr != BZ_OK)
-			errx(2, "%s", getstr(2));
-		return (c);
+	bufpos = buffer;
+	bufrem = 0;
+
+	if (filebehave == FILE_GZIP) {
+		nr = gzread(gzbufdesc, buffer, MAXBUFSIZ);
+	} else if (filebehave == FILE_BZIP && bzbufdesc != NULL) {
+		nr = BZ2_bzRead(&bzerr, bzbufdesc, buffer, MAXBUFSIZ);
+		switch (bzerr) {
+		case BZ_OK:
+		case BZ_STREAM_END:
+			/* No problem, nr will be okay */
+			break;
+		case BZ_DATA_ERROR_MAGIC:
+			/*
+			 * As opposed to gzread(), which simply returns the
+			 * plain file data, if it is not in the correct
+			 * compressed format, BZ2_bzRead() instead aborts.
+			 *
+			 * So, just restart at the beginning of the file again,
+			 * and use plain reads from now on.
+			 */
+			BZ2_bzReadClose(&bzerr, bzbufdesc);
+			bzbufdesc = NULL;
+			if (lseek(f->fd, 0, SEEK_SET) == -1)
+				return (EOF);
+			nr = read(f->fd, buffer, MAXBUFSIZ);
+			break;
+		default:
+			/* Make sure we exit with an error */
+			nr = -1;
+		}
+	} else {
+		nr = read(f->fd, buffer, MAXBUFSIZ);
 	}
-	return (-1);
+
+	if (nr <= 0)
+		return (EOF);
+
+	bufrem = nr;
+	return (0);
 }
 
-/*
- * Returns true if the file position is a EOF, returns false
- * otherwise.
- */
-static inline int
-grep_feof(struct file *f)
+static int
+grep_lnbufgrow(size_t newlen)
 {
+	unsigned char *p;
 
-	switch (filebehave) {
-	case FILE_STDIO:
-		return (feof_unlocked(f->f));
-	case FILE_GZIP:
-		return (gzeof(f->gzf));
-	case FILE_BZIP:
-		return (bzerr == BZ_STREAM_END);
-	}
-	return (1);
+	if (lnbuflen >= newlen)
+		return (0);
+	if ((p = realloc(lnbuf, newlen)) == NULL)
+		return (EOF);
+	lnbuf = p;
+	lnbuflen = newlen;
+	return (0);
 }
 
-/*
- * At the first call, fills in an internal buffer and checks if the given
- * file is a binary file and sets the binary flag accordingly.  Then returns
- * a single line and sets len to the length of the returned line.
- * At any other call returns a single line either from the internal buffer
- * or from the file if the buffer is exhausted and sets len to the length
- * of the line.
- */
 char *
-grep_fgetln(struct file *f, size_t *len)
+grep_fgetln(struct file *f, size_t *lenp)
 {
-	struct stat st;
-	size_t bufsiz, i = 0;
-	int ch = 0;
-
-	/* Fill in the buffer if it is empty. */
-	if (binbufptr == NULL) {
-
-		/* Only pre-read to the buffer if we need the binary check. */
-		if (binbehave != BINFILE_TEXT) {
-			if (f->stdin)
-				st.st_size = MAXBUFSIZ;
-			else if (stat(fname, &st) != 0)
-				err(2, NULL);
-			/* no need to allocate buffer. */
-			if (st.st_size == 0)
-				return (NULL);
-
-			bufsiz = (MAXBUFSIZ > (st.st_size * PREREAD_M)) ?
-			    (st.st_size / 2) : MAXBUFSIZ;
-
-			binbuf = grep_malloc(sizeof(char) * bufsiz);
-
-			while (i < bufsiz) {
-				ch = grep_fgetc(f);
-				if (ch == EOF)
-					break;
-				binbuf[i++] = ch;
-				if ((ch == '\n') && lbflag)
-					break;
-			}
-
-			f->binary = memchr(binbuf, (filebehave != FILE_GZIP) ?
-			    '\0' : '\200', i - 1) != NULL;
-		}
-		binbufsiz = i;
-		binbufptr = binbuf;
+	unsigned char *p;
+	char *ret;
+	size_t len;
+	size_t diff;
+	size_t off;
+
+	/* Fill the buffer, if necessary */
+	if (bufrem == 0 && grep_refill(f)) {
+error:
+		*lenp = 0;
+		return (NULL);
+	}
+
+	/* Look for a newline in the remaining part of the buffer */
+	if ((p = memchr(bufpos, '\n', bufrem)) != NULL) {
+		++p; /* advance over newline */
+		ret = bufpos;
+		len = p - bufpos;
+		bufrem -= len;
+		bufpos = p;
+		*lenp = len;
+		return (ret);
 	}
 
-	/* Read a line whether from the buffer or from the file itself. */
-	for (i = 0; !(grep_feof(f) &&
-	    (binbufptr == &binbuf[binbufsiz])); i++) {
-		if (binbufptr == &binbuf[binbufsiz]) {
-			ch = grep_fgetc(f);
-		} else {
-			ch = binbufptr[0];
-			binbufptr++;
-		}
-		if (i >= lnbuflen) {
-			lnbuflen *= 2;
-			lnbuf = grep_realloc(lnbuf, ++lnbuflen);
-		}
-		if ((ch == '\n') || (ch == EOF)) {
-			lnbuf[i] = '\0';
-			break;
-		} else
-			lnbuf[i] = ch;
+	/* We have to copy the current buffered data to the line buffer */
+	for (len = bufrem, off = 0; ; len += bufrem) {
+		/* Make sure there is room for more data */
+		if (grep_lnbufgrow(len + LNBUFBUMP))
+			goto error;
+		memcpy(lnbuf + off, bufpos, len - off);
+		off = len;
+		if (grep_refill(f))
+			break; /* EOF or error: return partial line */
+		if ((p = memchr(bufpos, '\n', bufrem)) == NULL)
+			continue;
+		/* got it: finish up the line (like code above) */
+		++p;
+		diff = p - bufpos;
+		len =+ diff;
+		if (grep_lnbufgrow(len))
+		    goto error;
+		memcpy(lnbuf + off, bufpos, diff);
+		bufrem -= diff;
+		bufpos = p;
+		break;
 	}
-	if (grep_feof(f) && (i == 0) && (ch != '\n'))
+	*lenp = len;
+	return lnbuf;
+}
+
+static struct file *
+grep_file_init(struct file *f)
+{
+
+	if (filebehave == FILE_GZIP &&
+	    (gzbufdesc = gzdopen(f->fd, "r")) == NULL) {
+error:
+		if (!f->stdin)
+			close(f->fd);
+		free(f);
 		return (NULL);
-	*len = i;
-	return (lnbuf);
+	}
+
+	if (filebehave == FILE_BZIP &&
+	    (bzbufdesc = BZ2_bzdopen(f->fd, "r")) == NULL)
+		goto error;
+
+	/* Fill read buffer, also catches errors early */
+	if (grep_refill(f))
+		goto error;
+
+	/* Check for binary stuff, if necessary */
+	if (binbehave != BINFILE_TEXT && memchr(bufpos, '\0', bufrem) != NULL)
+		f->binary = true;
+
+	return (f);
 }
 
 /*
@@ -192,77 +214,46 @@ grep_stdin_open(void)
 	/* Processing stdin implies --line-buffered for tail -f to work. */
 	lbflag = true;
 
-	snprintf(fname, sizeof fname, "%s", getstr(1));
-
 	f = grep_malloc(sizeof *f);
+	memset(f, 0, sizeof *f);
+	f->fd = STDIN_FILENO;
+	f->stdin = true;
 
-	binbuf = NULL;
-	if ((f->f = fdopen(STDIN_FILENO, "r")) != NULL) {
-		flockfile(f->f);
-		f->stdin = true;
-		return (f);
-	}
-
-	free(f);
-	return (NULL);
+	return grep_file_init(f);
 }
 
 /*
- * Opens a normal, a gzipped or a bzip2 compressed file for processing.
+ * Opens a file for processing.
  */
 struct file *
 grep_open(const char *path)
 {
 	struct file *f;
 
-	snprintf(fname, sizeof fname, "%s", path);
-
 	f = grep_malloc(sizeof *f);
-
-	binbuf = NULL;
-	f->stdin = false;
-	switch (filebehave) {
-	case FILE_STDIO:
-		if ((f->f = fopen(path, "r")) != NULL) {
-			flockfile(f->f);
-			return (f);
-		}
-		break;
-	case FILE_GZIP:
-		if ((f->gzf = gzopen(fname, "r")) != NULL)
-			return (f);
-		break;
-	case FILE_BZIP:
-		if ((f->bzf = BZ2_bzopen(fname, "r")) != NULL)
-			return (f);
-		break;
+	memset(f, 0, sizeof *f);
+	if ((f->fd = open(path, O_RDONLY)) == -1) {
+		free(f);
+		return (NULL);
 	}
 
-	free(f);
-	return (NULL);
+	return grep_file_init(f);
 }
 
 /*
- * Closes a normal, a gzipped or a bzip2 compressed file.
+ * Closes a file.
  */
 void
 grep_close(struct file *f)
 {
 
-	switch (filebehave) {
-	case FILE_STDIO:
-		funlockfile(f->f);
-		fclose(f->f);
-		break;
-	case FILE_GZIP:
-		gzclose(f->gzf);
-		break;
-	case FILE_BZIP:
-		BZ2_bzclose(f->bzf);
-		break;
-	}
+	close(f->fd);
 
-	/* Reset read buffer for the file we are closing */
-	binbufptr = NULL;
-	free(binbuf);
+	/* Reset read buffer and line buffer */
+	bufpos = buffer;
+	bufrem = 0;
+
+	free(lnbuf);
+	lnbuf = NULL;
+	lnbuflen = 0;
 }
diff --git a/usr.bin/grep/grep.h b/usr.bin/grep/grep.h
index 2f743e4..967d464 100644
--- a/usr.bin/grep/grep.h
+++ b/usr.bin/grep/grep.h
@@ -77,10 +77,7 @@ extern const char		*errstr[];
 #define MAX_LINE_MATCHES	32
 
 struct file {
-	struct mmfile	*mmf;
-	BZFILE		*bzf;
-	FILE		*f;
-	gzFile		*gzf;
+	int		 fd;
 	bool		 binary;
 	bool		 stdin;
 };
@@ -157,4 +154,4 @@ char		*grep_fgetln(struct file *f, size_t *len);
 /* fastgrep.c */
 int		 fastcomp(fastgrep_t *, const char *);
 void		 fgrepcomp(fastgrep_t *, const char *);
-int		 grep_search(fastgrep_t *, unsigned char *, size_t, regmatch_t *);
+int		 grep_search(fastgrep_t *, const unsigned char *, size_t, regmatch_t *);


More information about the freebsd-current mailing list