Official request: Please make GNU grep the default
Dimitry Andric
dimitry at andric.com
Mon Aug 16 10:42:47 UTC 2010
On 2010-08-15 21:49, Dimitry Andric wrote:
> ...I
> have attached a more complete patch that:
>
> - Replaces the horrendously inefficient grep_fgetln() with mostly the
> same implementation as the libc fgetln() function.
> - Uses plain file descriptors instead of struct FILE, since the
> buffering is done manually anyway, and it makes it easier to support
> gzip and bzip2.
> - Lets the bzip2 reader just read the file as plain data when the
> initial magic number doesn't match, mimicking the behaviour of GNU
> grep.
Here is a new patch, updated against Gabor's changes in r211364.
-------------- next part --------------
diff --git a/usr.bin/grep/fastgrep.c b/usr.bin/grep/fastgrep.c
index c66f0a7..30a2dc5 100644
--- a/usr.bin/grep/fastgrep.c
+++ b/usr.bin/grep/fastgrep.c
@@ -198,7 +198,7 @@ fastcomp(fastgrep_t *fg, const char *pat)
}
int
-grep_search(fastgrep_t *fg, unsigned char *data, size_t len, regmatch_t *pmatch)
+grep_search(fastgrep_t *fg, const unsigned char *data, size_t len, regmatch_t *pmatch)
{
unsigned int j;
int ret = REG_NOMATCH;
diff --git a/usr.bin/grep/file.c b/usr.bin/grep/file.c
index 1872d0e..185ab1d 100644
--- a/usr.bin/grep/file.c
+++ b/usr.bin/grep/file.c
@@ -37,7 +37,7 @@ __FBSDID("$FreeBSD$");
#include <bzlib.h>
#include <err.h>
#include <errno.h>
-#include <stdio.h>
+#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
@@ -47,138 +47,160 @@ __FBSDID("$FreeBSD$");
#include "grep.h"
-static char fname[MAXPATHLEN]; /* file name */
+#define MAXBUFSIZ (32 * 1024)
+#define LNBUFBUMP 80
-#define MAXBUFSIZ (16 * 1024)
-#define PREREAD_M 0.2
+static gzFile gzbufdesc;
+static BZFILE* bzbufdesc;
-/* Some global variables for the buffering and reading. */
-static char *lnbuf;
-static size_t lnbuflen;
-static unsigned char *binbuf;
-static int binbufsiz;
-unsigned char *binbufptr;
-static int bzerr;
+static unsigned char buffer[MAXBUFSIZ];
+static unsigned char *bufpos;
+static size_t bufrem;
-#define iswbinary(ch) (!iswspace((ch)) && iswcntrl((ch)) && \
- (ch != L'\b') && (ch != L'\0'))
+static unsigned char *lnbuf;
+static size_t lnbuflen;
-/*
- * Returns a single character according to the file type.
- * Returns -1 on failure.
- */
-static inline int
-grep_fgetc(struct file *f)
+static int
+grep_refill(struct file *f)
{
- unsigned char c;
+ ssize_t nr;
+ int bzerr;
- switch (filebehave) {
- case FILE_STDIO:
- return (getc_unlocked(f->f));
- case FILE_GZIP:
- return (gzgetc(f->gzf));
- case FILE_BZIP:
- BZ2_bzRead(&bzerr, f->bzf, &c, 1);
- if (bzerr == BZ_STREAM_END)
- return (-1);
- else if (bzerr != BZ_SEQUENCE_ERROR && bzerr != BZ_OK)
- errx(2, "%s", getstr(2));
- return (c);
+ bufpos = buffer;
+ bufrem = 0;
+
+ if (filebehave == FILE_GZIP) {
+ nr = gzread(gzbufdesc, buffer, MAXBUFSIZ);
+ } else if (filebehave == FILE_BZIP && bzbufdesc != NULL) {
+ nr = BZ2_bzRead(&bzerr, bzbufdesc, buffer, MAXBUFSIZ);
+ switch (bzerr) {
+ case BZ_OK:
+ case BZ_STREAM_END:
+ /* No problem, nr will be okay */
+ break;
+ case BZ_DATA_ERROR_MAGIC:
+ /*
+ * As opposed to gzread(), which simply returns the
+ * plain file data, if it is not in the correct
+ * compressed format, BZ2_bzRead() instead aborts.
+ *
+ * So, just restart at the beginning of the file again,
+ * and use plain reads from now on.
+ */
+ BZ2_bzReadClose(&bzerr, bzbufdesc);
+ bzbufdesc = NULL;
+ if (lseek(f->fd, 0, SEEK_SET) == -1)
+ return (EOF);
+ nr = read(f->fd, buffer, MAXBUFSIZ);
+ break;
+ default:
+ /* Make sure we exit with an error */
+ nr = -1;
+ }
+ } else {
+ nr = read(f->fd, buffer, MAXBUFSIZ);
}
- return (-1);
+
+ if (nr <= 0)
+ return (EOF);
+
+ bufrem = nr;
+ return (0);
}
-/*
- * Returns true if the file position is a EOF, returns false
- * otherwise.
- */
-static inline int
-grep_feof(struct file *f)
+static int
+grep_lnbufgrow(size_t newlen)
{
+ unsigned char *p;
- switch (filebehave) {
- case FILE_STDIO:
- return (feof_unlocked(f->f));
- case FILE_GZIP:
- return (gzeof(f->gzf));
- case FILE_BZIP:
- return (bzerr == BZ_STREAM_END);
- }
- return (1);
+ if (lnbuflen >= newlen)
+ return (0);
+ if ((p = realloc(lnbuf, newlen)) == NULL)
+ return (EOF);
+ lnbuf = p;
+ lnbuflen = newlen;
+ return (0);
}
-/*
- * At the first call, fills in an internal buffer and checks if the given
- * file is a binary file and sets the binary flag accordingly. Then returns
- * a single line and sets len to the length of the returned line.
- * At any other call returns a single line either from the internal buffer
- * or from the file if the buffer is exhausted and sets len to the length
- * of the line.
- */
char *
-grep_fgetln(struct file *f, size_t *len)
+grep_fgetln(struct file *f, size_t *lenp)
{
- struct stat st;
- size_t bufsiz, i = 0;
- int ch = 0;
-
- /* Fill in the buffer if it is empty. */
- if (binbufptr == NULL) {
-
- /* Only pre-read to the buffer if we need the binary check. */
- if (binbehave != BINFILE_TEXT) {
- if (f->stdin)
- st.st_size = MAXBUFSIZ;
- else if (stat(fname, &st) != 0)
- err(2, NULL);
- /* no need to allocate buffer. */
- if (st.st_size == 0)
- return (NULL);
-
- bufsiz = (MAXBUFSIZ > (st.st_size * PREREAD_M)) ?
- (st.st_size / 2) : MAXBUFSIZ;
-
- binbuf = grep_malloc(sizeof(char) * bufsiz);
-
- while (i < bufsiz) {
- ch = grep_fgetc(f);
- if (ch == EOF)
- break;
- binbuf[i++] = ch;
- if ((ch == '\n') && lbflag)
- break;
- }
-
- f->binary = memchr(binbuf, (filebehave != FILE_GZIP) ?
- '\0' : '\200', i - 1) != NULL;
- }
- binbufsiz = i;
- binbufptr = binbuf;
+ unsigned char *p;
+ char *ret;
+ size_t len;
+ size_t diff;
+ size_t off;
+
+ /* Fill the buffer, if necessary */
+ if (bufrem == 0 && grep_refill(f)) {
+error:
+ *lenp = 0;
+ return (NULL);
+ }
+
+ /* Look for a newline in the remaining part of the buffer */
+ if ((p = memchr(bufpos, '\n', bufrem)) != NULL) {
+ ++p; /* advance over newline */
+ ret = bufpos;
+ len = p - bufpos;
+ bufrem -= len;
+ bufpos = p;
+ *lenp = len;
+ return (ret);
}
- /* Read a line whether from the buffer or from the file itself. */
- for (i = 0; !(grep_feof(f) &&
- (binbufptr == &binbuf[binbufsiz])); i++) {
- if (binbufptr == &binbuf[binbufsiz]) {
- ch = grep_fgetc(f);
- } else {
- ch = binbufptr[0];
- binbufptr++;
- }
- if (i >= lnbuflen) {
- lnbuflen *= 2;
- lnbuf = grep_realloc(lnbuf, ++lnbuflen);
- }
- if ((ch == '\n') || (ch == EOF)) {
- lnbuf[i] = '\0';
- break;
- } else
- lnbuf[i] = ch;
+ /* No newline buffered: collect chunks in lnbuf until one is found or input ends */
+ for (len = bufrem, off = 0; ; len += bufrem) {
+ /* Make sure there is room for more data */
+ if (grep_lnbufgrow(len + LNBUFBUMP))
+ goto error;
+ memcpy(lnbuf + off, bufpos, len - off);
+ off = len;
+ if (grep_refill(f))
+ break; /* EOF or error: return partial line */
+ if ((p = memchr(bufpos, '\n', bufrem)) == NULL)
+ continue;
+ /* got it: finish up the line (like code above) */
+ ++p;
+ diff = p - bufpos;
+ len += diff; /* len == off here: total is bytes already copied plus the tail */
+ if (grep_lnbufgrow(len))
+ goto error;
+ memcpy(lnbuf + off, bufpos, diff);
+ bufrem -= diff;
+ bufpos = p;
+ break;
+ }
- if (grep_feof(f) && (i == 0) && (ch != '\n'))
+ *lenp = len;
+ return lnbuf;
+}
+
+static struct file *
+grep_file_init(struct file *f)
+{
+
+ if (filebehave == FILE_GZIP &&
+ (gzbufdesc = gzdopen(f->fd, "r")) == NULL) {
+error:
+ if (!f->stdin)
+ close(f->fd);
+ free(f);
return (NULL);
- *len = i;
- return (lnbuf);
+ }
+
+ if (filebehave == FILE_BZIP &&
+ (bzbufdesc = BZ2_bzdopen(f->fd, "r")) == NULL)
+ goto error;
+
+ /* Fill read buffer, also catches errors early */
+ if (grep_refill(f))
+ goto error;
+
+ /* Check for binary stuff, if necessary */
+ if (binbehave != BINFILE_TEXT && memchr(bufpos, '\0', bufrem) != NULL)
+ f->binary = true;
+
+ return (f);
}
/*
@@ -192,77 +214,46 @@ grep_stdin_open(void)
/* Processing stdin implies --line-buffered for tail -f to work. */
lbflag = true;
- snprintf(fname, sizeof fname, "%s", getstr(1));
-
f = grep_malloc(sizeof *f);
+ memset(f, 0, sizeof *f);
+ f->fd = STDIN_FILENO;
+ f->stdin = true;
- binbuf = NULL;
- if ((f->f = fdopen(STDIN_FILENO, "r")) != NULL) {
- flockfile(f->f);
- f->stdin = true;
- return (f);
- }
-
- free(f);
- return (NULL);
+ return grep_file_init(f);
}
/*
- * Opens a normal, a gzipped or a bzip2 compressed file for processing.
+ * Opens a file for processing.
*/
struct file *
grep_open(const char *path)
{
struct file *f;
- snprintf(fname, sizeof fname, "%s", path);
-
f = grep_malloc(sizeof *f);
-
- binbuf = NULL;
- f->stdin = false;
- switch (filebehave) {
- case FILE_STDIO:
- if ((f->f = fopen(path, "r")) != NULL) {
- flockfile(f->f);
- return (f);
- }
- break;
- case FILE_GZIP:
- if ((f->gzf = gzopen(fname, "r")) != NULL)
- return (f);
- break;
- case FILE_BZIP:
- if ((f->bzf = BZ2_bzopen(fname, "r")) != NULL)
- return (f);
- break;
+ memset(f, 0, sizeof *f);
+ if ((f->fd = open(path, O_RDONLY)) == -1) {
+ free(f);
+ return (NULL);
}
- free(f);
- return (NULL);
+ return grep_file_init(f);
}
/*
- * Closes a normal, a gzipped or a bzip2 compressed file.
+ * Closes a file.
*/
void
grep_close(struct file *f)
{
- switch (filebehave) {
- case FILE_STDIO:
- funlockfile(f->f);
- fclose(f->f);
- break;
- case FILE_GZIP:
- gzclose(f->gzf);
- break;
- case FILE_BZIP:
- BZ2_bzclose(f->bzf);
- break;
- }
+ close(f->fd);
- /* Reset read buffer for the file we are closing */
- binbufptr = NULL;
- free(binbuf);
+ /* Reset read buffer and line buffer */
+ bufpos = buffer;
+ bufrem = 0;
+
+ free(lnbuf);
+ lnbuf = NULL;
+ lnbuflen = 0;
}
diff --git a/usr.bin/grep/grep.h b/usr.bin/grep/grep.h
index 2f743e4..967d464 100644
--- a/usr.bin/grep/grep.h
+++ b/usr.bin/grep/grep.h
@@ -77,10 +77,7 @@ extern const char *errstr[];
#define MAX_LINE_MATCHES 32
struct file {
- struct mmfile *mmf;
- BZFILE *bzf;
- FILE *f;
- gzFile *gzf;
+ int fd;
bool binary;
bool stdin;
};
@@ -157,4 +154,4 @@ char *grep_fgetln(struct file *f, size_t *len);
/* fastgrep.c */
int fastcomp(fastgrep_t *, const char *);
void fgrepcomp(fastgrep_t *, const char *);
-int grep_search(fastgrep_t *, unsigned char *, size_t, regmatch_t *);
+int grep_search(fastgrep_t *, const unsigned char *, size_t, regmatch_t *);
More information about the freebsd-current
mailing list