svn commit: r211463 - head/usr.bin/grep

mdf at FreeBSD.org mdf at FreeBSD.org
Wed Aug 18 20:48:43 UTC 2010


On Wed, Aug 18, 2010 at 10:40 AM, Gabor Kovesdan <gabor at freebsd.org> wrote:
> Author: gabor
> Date: Wed Aug 18 17:40:10 2010
> New Revision: 211463
> URL: http://svn.freebsd.org/changeset/base/211463
>
> Log:
>  - Refactor file reading code to use pure syscalls and an internal buffer
>    instead of stdio.  This gives BSD grep a very big performance boost,
>    its speed is now almost comparable to GNU grep.

I didn't read all of the details in the profiling mails in the thread,
but does this mean that work on stdio would give a performance boost
to many apps?  Or is there something specific about how grep(1) is
using its input that makes it a horse of a different color?

Thanks,
matthew


>
>  Submitted by: Dimitry Andric <dimitry at andric.com>
>  Approved by:  delphij (mentor)
>
> Modified:
>  head/usr.bin/grep/fastgrep.c
>  head/usr.bin/grep/file.c
>  head/usr.bin/grep/grep.h
>  head/usr.bin/grep/util.c
>
> Modified: head/usr.bin/grep/fastgrep.c
> ==============================================================================
> --- head/usr.bin/grep/fastgrep.c        Wed Aug 18 17:39:47 2010        (r211462)
> +++ head/usr.bin/grep/fastgrep.c        Wed Aug 18 17:40:10 2010        (r211463)
> @@ -198,7 +198,7 @@ fastcomp(fastgrep_t *fg, const char *pat
>  }
>
>  int
> -grep_search(fastgrep_t *fg, unsigned char *data, size_t len, regmatch_t *pmatch)
> +grep_search(fastgrep_t *fg, const unsigned char *data, size_t len, regmatch_t *pmatch)
>  {
>        unsigned int j;
>        int ret = REG_NOMATCH;
>
> Modified: head/usr.bin/grep/file.c
> ==============================================================================
> --- head/usr.bin/grep/file.c    Wed Aug 18 17:39:47 2010        (r211462)
> +++ head/usr.bin/grep/file.c    Wed Aug 18 17:40:10 2010        (r211463)
> @@ -2,7 +2,8 @@
>
>  /*-
>  * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
> - * Copyright (C) 2008-2009 Gabor Kovesdan <gabor at FreeBSD.org>
> + * Copyright (C) 2008-2010 Gabor Kovesdan <gabor at FreeBSD.org>
> + * Copyright (C) 2010 Dimitry Andric <dimitry at andric.com>
>  * All rights reserved.
>  *
>  * Redistribution and use in source and binary forms, with or without
> @@ -37,7 +38,8 @@ __FBSDID("$FreeBSD$");
>  #include <bzlib.h>
>  #include <err.h>
>  #include <errno.h>
> -#include <stdio.h>
> +#include <fcntl.h>
> +#include <stddef.h>
>  #include <stdlib.h>
>  #include <string.h>
>  #include <unistd.h>
> @@ -47,222 +49,204 @@ __FBSDID("$FreeBSD$");
>
>  #include "grep.h"
>
> -static char     fname[MAXPATHLEN];     /* file name */
> +#define        MAXBUFSIZ       (32 * 1024)
> +#define        LNBUFBUMP       80
>
> -#define                 MAXBUFSIZ      (16 * 1024)
> -#define                 PREREAD_M      0.2
> +static gzFile gzbufdesc;
> +static BZFILE* bzbufdesc;
>
> -/* Some global variables for the buffering and reading. */
> -static char    *lnbuf;
> -static size_t   lnbuflen;
> -static unsigned char *binbuf;
> -static int      binbufsiz;
> -unsigned char  *binbufptr;
> -static int      bzerr;
> +static unsigned char buffer[MAXBUFSIZ];
> +static unsigned char *bufpos;
> +static size_t bufrem;
>
> -#define iswbinary(ch)  (!iswspace((ch)) && iswcntrl((ch)) && \
> -                           (ch != L'\b') && (ch != L'\0'))
> +static unsigned char *lnbuf;
> +static size_t lnbuflen;
>
> -/*
> - * Returns a single character according to the file type.
> - * Returns -1 on failure.
> - */
>  static inline int
> -grep_fgetc(struct file *f)
> +grep_refill(struct file *f)
>  {
> -       unsigned char c;
> +       ssize_t nr;
> +       int bzerr;
>
> -       switch (filebehave) {
> -       case FILE_STDIO:
> -               return (getc_unlocked(f->f));
> -       case FILE_GZIP:
> -               return (gzgetc(f->gzf));
> -       case FILE_BZIP:
> -               BZ2_bzRead(&bzerr, f->bzf, &c, 1);
> -               if (bzerr == BZ_STREAM_END)
> -                       return (-1);
> -               else if (bzerr != BZ_SEQUENCE_ERROR && bzerr != BZ_OK)
> -                       errx(2, "%s", getstr(2));
> -               return (c);
> -       }
> -       return (-1);
> +       bufpos = buffer;
> +       bufrem = 0;
> +
> +       if (filebehave == FILE_GZIP)
> +               nr = gzread(gzbufdesc, buffer, MAXBUFSIZ);
> +       else if (filebehave == FILE_BZIP && bzbufdesc != NULL) {
> +               nr = BZ2_bzRead(&bzerr, bzbufdesc, buffer, MAXBUFSIZ);
> +               switch (bzerr) {
> +               case BZ_OK:
> +               case BZ_STREAM_END:
> +                       /* No problem, nr will be okay */
> +                       break;
> +               case BZ_DATA_ERROR_MAGIC:
> +                       /*
> +                        * As opposed to gzread(), which simply returns the
> +                        * plain file data, if it is not in the correct
> +                        * compressed format, BZ2_bzRead() instead aborts.
> +                        *
> +                        * So, just restart at the beginning of the file again,
> +                        * and use plain reads from now on.
> +                        */
> +                       BZ2_bzReadClose(&bzerr, bzbufdesc);
> +                       bzbufdesc = NULL;
> +                       if (lseek(f->fd, 0, SEEK_SET) == -1)
> +                               return (-1);
> +                       nr = read(f->fd, buffer, MAXBUFSIZ);
> +                       break;
> +               default:
> +                       /* Make sure we exit with an error */
> +                       nr = -1;
> +               }
> +       } else
> +               nr = read(f->fd, buffer, MAXBUFSIZ);
> +
> +       if (nr < 0)
> +               return (-1);
> +
> +       bufrem = nr;
> +       return (0);
>  }
>
> -/*
> - * Returns true if the file position is a EOF, returns false
> - * otherwise.
> - */
>  static inline int
> -grep_feof(struct file *f)
> +grep_lnbufgrow(size_t newlen)
>  {
>
> -       switch (filebehave) {
> -       case FILE_STDIO:
> -               return (feof_unlocked(f->f));
> -       case FILE_GZIP:
> -               return (gzeof(f->gzf));
> -       case FILE_BZIP:
> -               return (bzerr == BZ_STREAM_END);
> +       if (lnbuflen < newlen) {
> +               lnbuf = grep_realloc(lnbuf, newlen);
> +               lnbuflen = newlen;
>        }
> -       return (1);
> +
> +       return (0);
>  }
>
> -/*
> - * At the first call, fills in an internal buffer and checks if the given
> - * file is a binary file and sets the binary flag accordingly.  Then returns
> - * a single line and sets len to the length of the returned line.
> - * At any other call returns a single line either from the internal buffer
> - * or from the file if the buffer is exhausted and sets len to the length
> - * of the line.
> - */
>  char *
> -grep_fgetln(struct file *f, size_t *len)
> +grep_fgetln(struct file *f, size_t *lenp)
>  {
> -       struct stat st;
> -       size_t bufsiz, i = 0;
> -       int ch = 0;
> -
> -       /* Fill in the buffer if it is empty. */
> -       if (binbufptr == NULL) {
> -
> -               /* Only pre-read to the buffer if we need the binary check. */
> -               if (binbehave != BINFILE_TEXT) {
> -                       if (f->stdin)
> -                               st.st_size = MAXBUFSIZ;
> -                       else if (stat(fname, &st) != 0)
> -                               err(2, NULL);
> -                       /* no need to allocate buffer. */
> -                       if (st.st_size == 0)
> -                               return (NULL);
> -
> -                       bufsiz = (MAXBUFSIZ > (st.st_size * PREREAD_M)) ?
> -                           (st.st_size / 2) : MAXBUFSIZ;
> -
> -                       binbuf = grep_malloc(sizeof(char) * bufsiz);
> -
> -                       while (i < bufsiz) {
> -                               ch = grep_fgetc(f);
> -                               if (ch == EOF)
> -                                       break;
> -                               binbuf[i++] = ch;
> -                               if ((ch == '\n') && lbflag)
> -                                       break;
> -                       }
> -
> -                       f->binary = memchr(binbuf, (filebehave != FILE_GZIP) ?
> -                           '\0' : '\200', i - 1) != NULL;
> -               }
> -               binbufsiz = i;
> -               binbufptr = binbuf;
> -       }
> -
> -       /* Read a line whether from the buffer or from the file itself. */
> -       for (i = 0; !(grep_feof(f) &&
> -           (binbufptr == &binbuf[binbufsiz])); i++) {
> -               if (binbufptr == &binbuf[binbufsiz]) {
> -                       ch = grep_fgetc(f);
> -               } else {
> -                       ch = binbufptr[0];
> -                       binbufptr++;
> -               }
> -               if (i >= lnbuflen) {
> -                       lnbuflen *= 2;
> -                       lnbuf = grep_realloc(lnbuf, ++lnbuflen);
> -               }
> -               if ((ch == '\n') || (ch == EOF)) {
> -                       lnbuf[i] = '\0';
> +       unsigned char *p;
> +       char *ret;
> +       size_t len;
> +       size_t off;
> +       ptrdiff_t diff;
> +
> +       /* Fill the buffer, if necessary */
> +       if (bufrem == 0 && grep_refill(f) != 0)
> +               goto error;
> +
> +       if (bufrem == 0) {
> +               /* Return zero length to indicate EOF */
> +               *lenp = 0;
> +               return (bufpos);
> +       }
> +
> +       /* Look for a newline in the remaining part of the buffer */
> +       if ((p = memchr(bufpos, '\n', bufrem)) != NULL) {
> +               ++p; /* advance over newline */
> +               ret = bufpos;
> +               len = p - bufpos;
> +               bufrem -= len;
> +               bufpos = p;
> +               *lenp = len;
> +               return (ret);
> +       }
> +
> +       /* We have to copy the current buffered data to the line buffer */
> +       for (len = bufrem, off = 0; ; len += bufrem) {
> +               /* Make sure there is room for more data */
> +               if (grep_lnbufgrow(len + LNBUFBUMP))
> +                       goto error;
> +               memcpy(lnbuf + off, bufpos, len - off);
> +               off = len;
> +               if (grep_refill(f) != 0)
> +                       goto error;
> +               if (bufrem == 0)
> +                       /* EOF: return partial line */
>                        break;
> -               } else
> -                       lnbuf[i] = ch;
> +               if ((p = memchr(bufpos, '\n', bufrem)) == NULL)
> +                       continue;
> +               /* got it: finish up the line (like code above) */
> +               ++p;
> +               diff = p - bufpos;
> +               len += diff;
> +               if (grep_lnbufgrow(len))
> +                   goto error;
> +               memcpy(lnbuf + off, bufpos, diff);
> +               bufrem -= diff;
> +               bufpos = p;
> +               break;
>        }
> -       if (grep_feof(f) && (i == 0) && (ch != '\n'))
> -               return (NULL);
> -       *len = i;
> +       *lenp = len;
>        return (lnbuf);
> +
> +error:
> +       *lenp = 0;
> +       return (NULL);
>  }
>
> -/*
> - * Opens the standard input for processing.
> - */
> -struct file *
> -grep_stdin_open(void)
> +static inline struct file *
> +grep_file_init(struct file *f)
>  {
> -       struct file *f;
>
> -       /* Processing stdin implies --line-buffered for tail -f to work. */
> -       lbflag = true;
> +       if (filebehave == FILE_GZIP &&
> +           (gzbufdesc = gzdopen(f->fd, "r")) == NULL)
> +               goto error;
>
> -       snprintf(fname, sizeof fname, "%s", getstr(1));
> +       if (filebehave == FILE_BZIP &&
> +           (bzbufdesc = BZ2_bzdopen(f->fd, "r")) == NULL)
> +               goto error;
>
> -       f = grep_malloc(sizeof *f);
> +       /* Fill read buffer, also catches errors early */
> +       if (grep_refill(f) != 0)
> +               goto error;
>
> -       binbuf = NULL;
> -       if ((f->f = fdopen(STDIN_FILENO, "r")) != NULL) {
> -               flockfile(f->f);
> -               f->stdin = true;
> -               return (f);
> -       }
> +       /* Check for binary stuff, if necessary */
> +       if (binbehave != BINFILE_TEXT && memchr(bufpos, '\0', bufrem) != NULL)
> +               f->binary = true;
>
> +       return (f);
> +error:
> +       close(f->fd);
>        free(f);
>        return (NULL);
>  }
>
>  /*
> - * Opens a normal, a gzipped or a bzip2 compressed file for processing.
> + * Opens a file for processing.
>  */
>  struct file *
>  grep_open(const char *path)
>  {
>        struct file *f;
>
> -       snprintf(fname, sizeof fname, "%s", path);
> -
>        f = grep_malloc(sizeof *f);
> -
> -       binbuf = NULL;
> -       f->stdin = false;
> -       switch (filebehave) {
> -       case FILE_STDIO:
> -               if ((f->f = fopen(path, "r")) != NULL) {
> -                       flockfile(f->f);
> -                       return (f);
> -               }
> -               break;
> -       case FILE_GZIP:
> -               if ((f->gzf = gzopen(fname, "r")) != NULL)
> -                       return (f);
> -               break;
> -       case FILE_BZIP:
> -               if ((f->bzf = BZ2_bzopen(fname, "r")) != NULL)
> -                       return (f);
> -               break;
> +       memset(f, 0, sizeof *f);
> +       if (path == NULL) {
> +               /* Processing stdin implies --line-buffered. */
> +               lbflag = true;
> +               f->fd = STDIN_FILENO;
> +       } else if ((f->fd = open(path, O_RDONLY)) == -1) {
> +               free(f);
> +               return (NULL);
>        }
>
> -       free(f);
> -       return (NULL);
> +       return (grep_file_init(f));
>  }
>
>  /*
> - * Closes a normal, a gzipped or a bzip2 compressed file.
> + * Closes a file.
>  */
>  void
>  grep_close(struct file *f)
>  {
>
> -       switch (filebehave) {
> -       case FILE_STDIO:
> -               funlockfile(f->f);
> -               fclose(f->f);
> -               break;
> -       case FILE_GZIP:
> -               gzclose(f->gzf);
> -               break;
> -       case FILE_BZIP:
> -               BZ2_bzclose(f->bzf);
> -               break;
> -       }
> +       close(f->fd);
>
> -       /* Reset read buffer for the file we are closing */
> -       binbufptr = NULL;
> -       free(binbuf);
> +       /* Reset read buffer and line buffer */
> +       bufpos = buffer;
> +       bufrem = 0;
> +
> +       free(lnbuf);
> +       lnbuf = NULL;
> +       lnbuflen = 0;
>  }
>
> Modified: head/usr.bin/grep/grep.h
> ==============================================================================
> --- head/usr.bin/grep/grep.h    Wed Aug 18 17:39:47 2010        (r211462)
> +++ head/usr.bin/grep/grep.h    Wed Aug 18 17:40:10 2010        (r211463)
> @@ -77,12 +77,8 @@ extern const char            *errstr[];
>  #define MAX_LINE_MATCHES       32
>
>  struct file {
> -       struct mmfile   *mmf;
> -       BZFILE          *bzf;
> -       FILE            *f;
> -       gzFile          *gzf;
> +       int              fd;
>        bool             binary;
> -       bool             stdin;
>  };
>
>  struct str {
> @@ -150,11 +146,10 @@ void       clearqueue(void);
>
>  /* file.c */
>  void            grep_close(struct file *f);
> -struct file    *grep_stdin_open(void);
>  struct file    *grep_open(const char *path);
>  char           *grep_fgetln(struct file *f, size_t *len);
>
>  /* fastgrep.c */
>  int             fastcomp(fastgrep_t *, const char *);
>  void            fgrepcomp(fastgrep_t *, const char *);
> -int             grep_search(fastgrep_t *, unsigned char *, size_t, regmatch_t *);
> +int             grep_search(fastgrep_t *, const unsigned char *, size_t, regmatch_t *);
>
> Modified: head/usr.bin/grep/util.c
> ==============================================================================
> --- head/usr.bin/grep/util.c    Wed Aug 18 17:39:47 2010        (r211462)
> +++ head/usr.bin/grep/util.c    Wed Aug 18 17:40:10 2010        (r211463)
> @@ -184,7 +184,7 @@ procfile(const char *fn)
>
>        if (strcmp(fn, "-") == 0) {
>                fn = label != NULL ? label : getstr(1);
> -               f = grep_stdin_open();
> +               f = grep_open(NULL);
>        } else {
>                if (!stat(fn, &sb)) {
>                        /* Check if we need to process the file */
> @@ -215,7 +215,7 @@ procfile(const char *fn)
>
>        for (c = 0;  c == 0 || !(lflag || qflag); ) {
>                ln.off += ln.len + 1;
> -               if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL) {
> +               if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL || ln.len == 0) {
>                        if (ln.line_no == 0 && matchall)
>                                exit(0);
>                        else
>


More information about the svn-src-head mailing list