Official request: Please make GNU grep the default

Dimitry Andric dimitry at andric.com
Sun Aug 15 19:49:40 UTC 2010


On 2010-08-15 21:07, Dag-Erling Smørgrav wrote:
> I built a profiling version of BSD grep and ran it with a regexp that
> matches only the very last line in (my copy of) INDEX-9.  The results
> are pretty clear:

[I also did precisely that, and I just read your mail when I was
composing the following... :) ]

I had a look at Doug Barton's time trial script for GNU and BSD grep,
and decided to profile both greps a bit.  This gave some interesting
results.

I started with GNU grep, doing approximately the same search as in
Doug's trial, e.g. grep -m1 "|/usr/ports/x11-wm/xfce4-wm|", but I made
the testfile 373491 lines instead of the original 21971, with the
only matching line as the last one.

The GNU grep top called functions are (please ignore the .mcount entry,
which is a gprof side effect):

  %   cumulative   self              self     total           
 time   seconds   seconds    calls  ms/call  ms/call  name    
 99.1       0.59     0.59    11497     0.05     0.05  read [5]
  0.6       0.59     0.00    11497     0.00     0.00  kwsexec [8]
  0.1       0.59     0.00        0  100.00%           .mcount (130)
  0.1       0.59     0.00        1     0.62   594.77  grepfile [3]
  0.1       0.60     0.00    11496     0.00     0.00  memmove [9]
  0.0       0.60     0.00        4     0.03     0.03  memchr [10]
  0.0       0.60     0.00    12541     0.00     0.00  memset [16]
  0.0       0.60     0.00    11497     0.00     0.00  EGexecute [7]
  0.0       0.60     0.00    11497     0.00     0.05  fillbuf [4]
  0.0       0.60     0.00    11497     0.00     0.00  grepbuf [6]

The same exercise with BSD grep indeed runs *much* slower, and its top
called functions are:

  %   cumulative   self              self     total           
 time   seconds   seconds    calls  ms/call  ms/call  name    
 67.3       8.60     8.60        0  100.00%           .mcount (105)
 22.2      11.44     2.84    22778     0.12     0.12  __sys_read [9]
  3.6      11.91     0.46 373553797     0.00     0.00  grep_feof [11]
  3.5      12.35     0.45 373180306     0.00     0.00  fgetc [6]
  2.1      12.63     0.27 373180306     0.00     0.00  grep_fgetc [5]
  1.2      12.78     0.15   373491     0.00     0.01  grep_fgetln [4]
  0.0      12.78     0.01      464     0.01     0.01  memset [12]
  0.0      12.79     0.00        1     2.83  4184.69  procfile [3]
  0.0      12.79     0.00        5     0.42     0.69  free [14]
  0.0      12.79     0.00    22778     0.00     0.00  __sread [607]

The testfile has 373180306 bytes, so you can see there is a tremendous
overhead in calling grep_fgetc, grep_feof and fgetc, which is done
separately for EACH byte in the file!

So my first quick fix attempt was to replace the home-grown grep_fgetln
with fgetln(3), which is in libc.  This does not support gzip and bzip2
files, but just to prove the point, it is enough.  It gave the following
profiling result:

  %   cumulative   self              self     total           
 time   seconds   seconds    calls  ms/call  ms/call  name    
 80.0       0.68     0.68     5695     0.12     0.12  read [5]
 15.5       0.81     0.13   379178     0.00     0.00  memchr [8]
  3.2       0.83     0.03        0  100.00%           .mcount (102)
  0.6       0.84     0.00      451     0.01     0.01  memset [9]
  0.4       0.84     0.00        5     0.59     0.85  free [10]
  0.1       0.84     0.00        1     1.23   814.47  procfile [3]
  0.1       0.84     0.00   373491     0.00     0.00  fgetln [4]
  0.0       0.84     0.00   373491     0.00     0.00  grep_fgetln [29]
  0.0       0.84     0.00    11379     0.00     0.00  memcpy [30]
  0.0       0.84     0.00       50     0.00     0.00  arena_run_alloc [36]

You can see there is much less overhead, and it is the regular read()
function that gets the most calls, though not yet as much as with GNU
grep.  However, this already gave MUCH better runtimes with Doug's trial
script:

  GNU grep
  Elapsed time: 57 seconds

  BSD grep (original)
  Elapsed time: 820 seconds  (~14.4x slower than GNU grep)

  BSD grep (quickfixed)
  Elapsed time: 115 seconds  (~2.0x slower than GNU grep)

It proves that getting rid of the fgetc's is certainly worthwhile, and I
have attached a more complete patch that:

- Replaces the horrendously inefficient grep_fgetln() with mostly the
  same implementation as the libc fgetln() function.
- Uses plain file descriptors instead of struct FILE, since the
  buffering is done manually anyway, and it makes it easier to support
  gzip and bzip2.
- Let the bzip2 reader just read the file as plain data, when the
  initial magic number doesn't match, mimicking the behaviour of GNU
  grep.

There is probably more room for optimization, but let's see if this
survives a bunch of tests first. :)
-------------- next part --------------
diff --git a/usr.bin/grep/fastgrep.c b/usr.bin/grep/fastgrep.c
index 5239a0e..2405ed8 100644
--- a/usr.bin/grep/fastgrep.c
+++ b/usr.bin/grep/fastgrep.c
@@ -198,7 +198,7 @@ fastcomp(fastgrep_t *fg, const char *pat)
 }
 
 int
-grep_search(fastgrep_t *fg, unsigned char *data, size_t len, regmatch_t *pmatch)
+grep_search(fastgrep_t *fg, const unsigned char *data, size_t len, regmatch_t *pmatch)
 {
 	unsigned int j;
 	int ret = REG_NOMATCH;
diff --git a/usr.bin/grep/file.c b/usr.bin/grep/file.c
index c4de7db..a7e3bf0 100644
--- a/usr.bin/grep/file.c
+++ b/usr.bin/grep/file.c
@@ -37,7 +37,7 @@ __FBSDID("$FreeBSD$");
 #include <bzlib.h>
 #include <err.h>
 #include <errno.h>
-#include <stdio.h>
+#include <fcntl.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
@@ -47,133 +47,159 @@ __FBSDID("$FreeBSD$");
 
 #include "grep.h"
 
-static char	 fname[MAXPATHLEN];	/* file name */
+#define	MAXBUFSIZ	(32 * 1024)
+#define	LNBUFBUMP	80
 
-#define		 MAXBUFSIZ	(16 * 1024)
-#define		 PREREAD_M	0.2
+static gzFile gzbufdesc;
+static BZFILE* bzbufdesc;
 
-/* Some global variables for the buffering and reading. */
-static char	*lnbuf;
-static size_t	 lnbuflen;
-static unsigned char *binbuf;
-static int	 binbufsiz;
-unsigned char	*binbufptr;
-static int	 bzerr;
+static unsigned char buffer[MAXBUFSIZ];
+static unsigned char *bufpos;
+static size_t bufrem;
 
-#define iswbinary(ch)	(!iswspace((ch)) && iswcntrl((ch)) && \
-			    (ch != L'\b') && (ch != L'\0'))
+static unsigned char *lnbuf;
+static size_t lnbuflen;
 
-/*
- * Returns a single character according to the file type.
- * Returns -1 on failure.
- */
-int
-grep_fgetc(struct file *f)
+static int
+grep_refill(struct file *f)
 {
-	unsigned char c;
-
-	switch (filebehave) {
-	case FILE_STDIO:
-		return (fgetc(f->f));
-	case FILE_GZIP:
-		return (gzgetc(f->gzf));
-	case FILE_BZIP:
-		BZ2_bzRead(&bzerr, f->bzf, &c, 1);
-		if (bzerr == BZ_STREAM_END)
-			return (-1);
-		else if (bzerr != BZ_SEQUENCE_ERROR && bzerr != BZ_OK)
-			errx(2, "%s", getstr(2));
-		return (c);
+	ssize_t nr;
+	int bzerr;
+
+	bufpos = buffer;
+	bufrem = 0;
+
+	if (filebehave == FILE_GZIP) {
+		nr = gzread(gzbufdesc, buffer, MAXBUFSIZ);
+	} else if (filebehave == FILE_BZIP && bzbufdesc != NULL) {
+		nr = BZ2_bzRead(&bzerr, bzbufdesc, buffer, MAXBUFSIZ);
+		switch (bzerr) {
+		case BZ_OK:
+		case BZ_STREAM_END:
+			/* No problem, nr will be okay */
+			break;
+		case BZ_DATA_ERROR_MAGIC:
+			/*
+			 * As opposed to gzread(), which simply returns the
+			 * plain file data, if it is not in the correct
+			 * compressed format, BZ2_bzRead() instead aborts.
+			 *
+			 * So, just restart at the beginning of the file again,
+			 * and use plain reads from now on.
+			 */
+			BZ2_bzReadClose(&bzerr, bzbufdesc);
+			bzbufdesc = NULL;
+			if (lseek(f->fd, 0, SEEK_SET) == -1)
+				return (EOF);
+			nr = read(f->fd, buffer, MAXBUFSIZ);
+			break;
+		default:
+			/* Make sure we exit with an error */
+			nr = -1;
+		}
+	} else {
+		nr = read(f->fd, buffer, MAXBUFSIZ);
 	}
-	return (-1);
+
+	if (nr <= 0)
+		return (EOF);
+
+	bufrem = nr;
+	return (0);
 }
 
-/*
- * Returns true if the file position is a EOF, returns false
- * otherwise.
- */
-int
-grep_feof(struct file *f)
+static int
+grep_lnbufgrow(size_t newlen)
 {
-
-	switch (filebehave) {
-	case FILE_STDIO:
-		return (feof(f->f));
-	case FILE_GZIP:
-		return (gzeof(f->gzf));
-	case FILE_BZIP:
-		return (bzerr == BZ_STREAM_END);
-	}
-	return (1);
+	unsigned char *p;
+
+	if (lnbuflen >= newlen)
+		return (0);
+	if ((p = realloc(lnbuf, newlen)) == NULL)
+		return (EOF);
+	lnbuf = p;
+	lnbuflen = newlen;
+	return (0);
 }
 
-/*
- * At the first call, fills in an internal buffer and checks if the given
- * file is a binary file and sets the binary flag accordingly.  Then returns
- * a single line and sets len to the length of the returned line.
- * At any other call returns a single line either from the internal buffer
- * or from the file if the buffer is exhausted and sets len to the length
- * of the line.
- */
 char *
-grep_fgetln(struct file *f, size_t *len)
+grep_fgetln(struct file *f, size_t *lenp)
 {
-	struct stat st;
-	size_t bufsiz, i = 0;
-	int ch = 0;
-
-	/* Fill in the buffer if it is empty. */
-	if (binbufptr == NULL) {
-
-		/* Only pre-read to the buffer if we need the binary check. */
-		if (binbehave != BINFILE_TEXT) {
-			if (f->stdin)
-				st.st_size = MAXBUFSIZ;
-			else if (stat(fname, &st) != 0)
-				err(2, NULL);
-
-			bufsiz = (MAXBUFSIZ > (st.st_size * PREREAD_M)) ?
-			    (st.st_size / 2) : MAXBUFSIZ;
-
-			binbuf = grep_malloc(sizeof(char) * bufsiz);
-
-			while (i < bufsiz) {
-				ch = grep_fgetc(f);
-				if (ch == EOF)
-					break;
-				binbuf[i++] = ch;
-			}
-
-			f->binary = memchr(binbuf, (filebehave != FILE_GZIP) ?
-			    '\0' : '\200', i - 1) != NULL;
-		}
-		binbufsiz = i;
-		binbufptr = binbuf;
+	unsigned char *p;
+	char *ret;
+	size_t len;
+	size_t diff;
+	size_t off;
+
+	/* Fill the buffer, if necessary */
+	if (bufrem == 0 && grep_refill(f)) {
+error:
+		*lenp = 0;
+		return (NULL);
 	}
 
-	/* Read a line whether from the buffer or from the file itself. */
-	for (i = 0; !(grep_feof(f) &&
-	    (binbufptr == &binbuf[binbufsiz])); i++) {
-		if (binbufptr == &binbuf[binbufsiz]) {
-			ch = grep_fgetc(f);
-		} else {
-			ch = binbufptr[0];
-			binbufptr++;
-		}
-		if (i >= lnbuflen) {
-			lnbuflen *= 2;
-			lnbuf = grep_realloc(lnbuf, ++lnbuflen);
-		}
-		if ((ch == '\n') || (ch == EOF)) {
-			lnbuf[i] = '\0';
-			break;
-		} else
-			lnbuf[i] = ch;
+	/* Look for a newline in the remaining part of the buffer */
+	if ((p = memchr(bufpos, '\n', bufrem)) != NULL) {
+		++p; /* advance over newline */
+		ret = bufpos;
+		len = p - bufpos;
+		bufrem -= len;
+		bufpos = p;
+		*lenp = len;
+		return (ret);
 	}
-	if (grep_feof(f) && (i == 0) && (ch != '\n'))
+
+	/* We have to copy the current buffered data to the line buffer */
+	for (len = bufrem, off = 0; ; len += bufrem) {
+		/* Make sure there is room for more data */
+		if (grep_lnbufgrow(len + LNBUFBUMP))
+			goto error;
+		memcpy(lnbuf + off, bufpos, len - off);
+		off = len;
+		if (grep_refill(f))
+			break; /* EOF or error: return partial line */
+		if ((p = memchr(bufpos, '\n', bufrem)) == NULL)
+			continue;
+		/* got it: finish up the line (like code above) */
+		++p;
+		diff = p - bufpos;
+		len =+ diff;
+		if (grep_lnbufgrow(len))
+		    goto error;
+		memcpy(lnbuf + off, bufpos, diff);
+		bufrem -= diff;
+		bufpos = p;
+		break;
+	}
+	*lenp = len;
+	return lnbuf;
+}
+
+static struct file *
+grep_file_init(struct file *f)
+{
+	if (filebehave == FILE_GZIP &&
+	    (gzbufdesc = gzdopen(f->fd, "r")) == NULL) {
+error:
+		if (!f->stdin)
+			close(f->fd);
+		free(f);
 		return (NULL);
-	*len = i;
-	return (lnbuf);
+	}
+
+	if (filebehave == FILE_BZIP &&
+	    (bzbufdesc = BZ2_bzdopen(f->fd, "r")) == NULL)
+		goto error;
+
+	/* Fill read buffer, also catches errors early */
+	if (grep_refill(f))
+		goto error;
+
+	/* Check for binary stuff, if necessary */
+	if (binbehave != BINFILE_TEXT && memchr(bufpos, '\0', bufrem) != NULL)
+		f->binary = true;
+
+	return (f);
 }
 
 /*
@@ -184,72 +210,45 @@ grep_stdin_open(void)
 {
 	struct file *f;
 
-	snprintf(fname, sizeof fname, "%s", getstr(1));
-
 	f = grep_malloc(sizeof *f);
+	memset(f, 0, sizeof *f);
+	f->fd = fileno(stdin);
+	f->stdin = true;
 
-	if ((f->f = fdopen(STDIN_FILENO, "r")) != NULL) {
-		f->stdin = true;
-		return (f);
-	}
-
-	free(f);
-	return (NULL);
+	return grep_file_init(f);
 }
 
 /*
- * Opens a normal, a gzipped or a bzip2 compressed file for processing.
+ * Opens a file for processing.
  */
 struct file *
 grep_open(const char *path)
 {
 	struct file *f;
 
-	snprintf(fname, sizeof fname, "%s", path);
-
 	f = grep_malloc(sizeof *f);
-
-	f->stdin = false;
-	switch (filebehave) {
-	case FILE_STDIO:
-		if ((f->f = fopen(path, "r")) != NULL)
-			return (f);
-		break;
-	case FILE_GZIP:
-		if ((f->gzf = gzopen(fname, "r")) != NULL)
-			return (f);
-		break;
-	case FILE_BZIP:
-		if ((f->bzf = BZ2_bzopen(fname, "r")) != NULL)
-			return (f);
-		break;
+	memset(f, 0, sizeof *f);
+	if ((f->fd = open(path, O_RDONLY)) == -1) {
+		free(f);
+		return (NULL);
 	}
 
-	free(f);
-	return (NULL);
+	return grep_file_init(f);
 }
 
 /*
- * Closes a normal, a gzipped or a bzip2 compressed file.
+ * Closes a file.
  */
 void
 grep_close(struct file *f)
 {
+	close(f->fd);
 
-	switch (filebehave) {
-	case FILE_STDIO:
-		fclose(f->f);
-		break;
-	case FILE_GZIP:
-		gzclose(f->gzf);
-		break;
-	case FILE_BZIP:
-		BZ2_bzclose(f->bzf);
-		break;
-	}
-
-	/* Reset read buffer for the file we are closing */
-	binbufptr = NULL;
-	free(binbuf);
+	/* Reset read buffer and line buffer */
+	bufpos = buffer;
+	bufrem = 0;
 
+	free(lnbuf);
+	lnbuf = NULL;
+	lnbuflen = 0;
 }
diff --git a/usr.bin/grep/grep.h b/usr.bin/grep/grep.h
index bb5e5f9..3988240 100644
--- a/usr.bin/grep/grep.h
+++ b/usr.bin/grep/grep.h
@@ -77,10 +77,7 @@ extern const char		*errstr[];
 #define MAX_LINE_MATCHES	32
 
 struct file {
-	struct mmfile	*mmf;
-	BZFILE		*bzf;
-	FILE		*f;
-	gzFile		*gzf;
+	int		 fd;
 	bool		 binary;
 	bool		 stdin;
 };
@@ -153,11 +150,9 @@ void	 clearqueue(void);
 void		 grep_close(struct file *f);
 struct file	*grep_stdin_open(void);
 struct file	*grep_open(const char *path);
-int		 grep_feof(struct file *f);
-int		 grep_fgetc(struct file *f);
 char		*grep_fgetln(struct file *f, size_t *len);
 
 /* fastgrep.c */
 int		 fastcomp(fastgrep_t *, const char *);
 void		 fgrepcomp(fastgrep_t *, const char *);
-int		 grep_search(fastgrep_t *, unsigned char *, size_t, regmatch_t *);
+int		 grep_search(fastgrep_t *, const unsigned char *, size_t, regmatch_t *);


More information about the freebsd-current mailing list