Author: gabor
Date: Wed Aug 18 17:40:10 2010
New Revision: 211463
URL: http://svn.freebsd.org/changeset/base/211463

Log:
  - Refactor file reading code to use pure syscalls and an internal buffer
    instead of stdio.  This gives BSD grep a very big performance boost,
    its speed is now almost comparable to GNU grep.
  
  Submitted by: Dimitry Andric <dimi...@andric.com>
  Approved by:  delphij (mentor)

Modified:
  head/usr.bin/grep/fastgrep.c
  head/usr.bin/grep/file.c
  head/usr.bin/grep/grep.h
  head/usr.bin/grep/util.c

Modified: head/usr.bin/grep/fastgrep.c
==============================================================================
--- head/usr.bin/grep/fastgrep.c        Wed Aug 18 17:39:47 2010        
(r211462)
+++ head/usr.bin/grep/fastgrep.c        Wed Aug 18 17:40:10 2010        
(r211463)
@@ -198,7 +198,7 @@ fastcomp(fastgrep_t *fg, const char *pat
 }
 
 int
-grep_search(fastgrep_t *fg, unsigned char *data, size_t len, regmatch_t 
*pmatch)
+grep_search(fastgrep_t *fg, const unsigned char *data, size_t len, regmatch_t 
*pmatch)
 {
        unsigned int j;
        int ret = REG_NOMATCH;

Modified: head/usr.bin/grep/file.c
==============================================================================
--- head/usr.bin/grep/file.c    Wed Aug 18 17:39:47 2010        (r211462)
+++ head/usr.bin/grep/file.c    Wed Aug 18 17:40:10 2010        (r211463)
@@ -2,7 +2,8 @@
 
 /*-
  * Copyright (c) 1999 James Howard and Dag-Erling Co�dan Sm�rgrav
- * Copyright (C) 2008-2009 Gabor Kovesdan <ga...@freebsd.org>
+ * Copyright (C) 2008-2010 Gabor Kovesdan <ga...@freebsd.org>
+ * Copyright (C) 2010 Dimitry Andric <dimi...@andric.com>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -37,7 +38,8 @@ __FBSDID("$FreeBSD$");
 #include <bzlib.h>
 #include <err.h>
 #include <errno.h>
-#include <stdio.h>
+#include <fcntl.h>
+#include <stddef.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
@@ -47,222 +49,204 @@ __FBSDID("$FreeBSD$");
 
 #include "grep.h"
 
-static char     fname[MAXPATHLEN];     /* file name */
+#define        MAXBUFSIZ       (32 * 1024)
+#define        LNBUFBUMP       80
 
-#define                 MAXBUFSIZ      (16 * 1024)
-#define                 PREREAD_M      0.2
+static gzFile gzbufdesc;
+static BZFILE* bzbufdesc;
 
-/* Some global variables for the buffering and reading. */
-static char    *lnbuf;
-static size_t   lnbuflen;
-static unsigned char *binbuf;
-static int      binbufsiz;
-unsigned char  *binbufptr;
-static int      bzerr;
+static unsigned char buffer[MAXBUFSIZ];
+static unsigned char *bufpos;
+static size_t bufrem;
 
-#define iswbinary(ch)  (!iswspace((ch)) && iswcntrl((ch)) && \
-                           (ch != L'\b') && (ch != L'\0'))
+static unsigned char *lnbuf;
+static size_t lnbuflen;
 
-/*
- * Returns a single character according to the file type.
- * Returns -1 on failure.
- */
 static inline int
-grep_fgetc(struct file *f)
+grep_refill(struct file *f)
 {
-       unsigned char c;
+       ssize_t nr;
+       int bzerr;
 
-       switch (filebehave) {
-       case FILE_STDIO:
-               return (getc_unlocked(f->f));
-       case FILE_GZIP:
-               return (gzgetc(f->gzf));
-       case FILE_BZIP:
-               BZ2_bzRead(&bzerr, f->bzf, &c, 1);
-               if (bzerr == BZ_STREAM_END)
-                       return (-1);
-               else if (bzerr != BZ_SEQUENCE_ERROR && bzerr != BZ_OK)
-                       errx(2, "%s", getstr(2));
-               return (c);
-       }
-       return (-1);
+       bufpos = buffer;
+       bufrem = 0;
+
+       if (filebehave == FILE_GZIP)
+               nr = gzread(gzbufdesc, buffer, MAXBUFSIZ);
+       else if (filebehave == FILE_BZIP && bzbufdesc != NULL) {
+               nr = BZ2_bzRead(&bzerr, bzbufdesc, buffer, MAXBUFSIZ);
+               switch (bzerr) {
+               case BZ_OK:
+               case BZ_STREAM_END:
+                       /* No problem, nr will be okay */
+                       break;
+               case BZ_DATA_ERROR_MAGIC:
+                       /*
+                        * As opposed to gzread(), which simply returns the
+                        * plain file data, if it is not in the correct
+                        * compressed format, BZ2_bzRead() instead aborts.
+                        *
+                        * So, just restart at the beginning of the file again,
+                        * and use plain reads from now on.
+                        */
+                       BZ2_bzReadClose(&bzerr, bzbufdesc);
+                       bzbufdesc = NULL;
+                       if (lseek(f->fd, 0, SEEK_SET) == -1)
+                               return (-1);
+                       nr = read(f->fd, buffer, MAXBUFSIZ);
+                       break;
+               default:
+                       /* Make sure we exit with an error */
+                       nr = -1;
+               }
+       } else
+               nr = read(f->fd, buffer, MAXBUFSIZ);
+
+       if (nr < 0)
+               return (-1);
+
+       bufrem = nr;
+       return (0);
 }
 
-/*
- * Returns true if the file position is a EOF, returns false
- * otherwise.
- */
 static inline int
-grep_feof(struct file *f)
+grep_lnbufgrow(size_t newlen)
 {
 
-       switch (filebehave) {
-       case FILE_STDIO:
-               return (feof_unlocked(f->f));
-       case FILE_GZIP:
-               return (gzeof(f->gzf));
-       case FILE_BZIP:
-               return (bzerr == BZ_STREAM_END);
+       if (lnbuflen < newlen) {
+               lnbuf = grep_realloc(lnbuf, newlen);
+               lnbuflen = newlen;
        }
-       return (1);
+
+       return (0);
 }
 
-/*
- * At the first call, fills in an internal buffer and checks if the given
- * file is a binary file and sets the binary flag accordingly.  Then returns
- * a single line and sets len to the length of the returned line.
- * At any other call returns a single line either from the internal buffer
- * or from the file if the buffer is exhausted and sets len to the length
- * of the line.
- */
 char *
-grep_fgetln(struct file *f, size_t *len)
+grep_fgetln(struct file *f, size_t *lenp)
 {
-       struct stat st;
-       size_t bufsiz, i = 0;
-       int ch = 0;
-
-       /* Fill in the buffer if it is empty. */
-       if (binbufptr == NULL) {
-
-               /* Only pre-read to the buffer if we need the binary check. */
-               if (binbehave != BINFILE_TEXT) {
-                       if (f->stdin)
-                               st.st_size = MAXBUFSIZ;
-                       else if (stat(fname, &st) != 0)
-                               err(2, NULL);
-                       /* no need to allocate buffer. */
-                       if (st.st_size == 0)
-                               return (NULL);
-
-                       bufsiz = (MAXBUFSIZ > (st.st_size * PREREAD_M)) ?
-                           (st.st_size / 2) : MAXBUFSIZ;
-
-                       binbuf = grep_malloc(sizeof(char) * bufsiz);
-
-                       while (i < bufsiz) {
-                               ch = grep_fgetc(f);
-                               if (ch == EOF)
-                                       break;
-                               binbuf[i++] = ch;
-                               if ((ch == '\n') && lbflag)
-                                       break;
-                       }
-
-                       f->binary = memchr(binbuf, (filebehave != FILE_GZIP) ?
-                           '\0' : '\200', i - 1) != NULL;
-               }
-               binbufsiz = i;
-               binbufptr = binbuf;
-       }
-
-       /* Read a line whether from the buffer or from the file itself. */
-       for (i = 0; !(grep_feof(f) &&
-           (binbufptr == &binbuf[binbufsiz])); i++) {
-               if (binbufptr == &binbuf[binbufsiz]) {
-                       ch = grep_fgetc(f);
-               } else {
-                       ch = binbufptr[0];
-                       binbufptr++;
-               }
-               if (i >= lnbuflen) {
-                       lnbuflen *= 2;
-                       lnbuf = grep_realloc(lnbuf, ++lnbuflen);
-               }
-               if ((ch == '\n') || (ch == EOF)) {
-                       lnbuf[i] = '\0';
+       unsigned char *p;
+       char *ret;
+       size_t len;
+       size_t off;
+       ptrdiff_t diff;
+
+       /* Fill the buffer, if necessary */
+       if (bufrem == 0 && grep_refill(f) != 0)
+               goto error;
+
+       if (bufrem == 0) {
+               /* Return zero length to indicate EOF */
+               *lenp = 0;
+               return (bufpos);
+       }
+
+       /* Look for a newline in the remaining part of the buffer */
+       if ((p = memchr(bufpos, '\n', bufrem)) != NULL) {
+               ++p; /* advance over newline */
+               ret = bufpos;
+               len = p - bufpos;
+               bufrem -= len;
+               bufpos = p;
+               *lenp = len;
+               return (ret);
+       }
+
+       /* We have to copy the current buffered data to the line buffer */
+       for (len = bufrem, off = 0; ; len += bufrem) {
+               /* Make sure there is room for more data */
+               if (grep_lnbufgrow(len + LNBUFBUMP))
+                       goto error;
+               memcpy(lnbuf + off, bufpos, len - off);
+               off = len;
+               if (grep_refill(f) != 0)
+                       goto error;
+               if (bufrem == 0)
+                       /* EOF: return partial line */
                        break;
-               } else
-                       lnbuf[i] = ch;
+               if ((p = memchr(bufpos, '\n', bufrem)) == NULL)
+                       continue;
+               /* got it: finish up the line (like code above) */
+               ++p;
+               diff = p - bufpos;
+               len += diff;
+               if (grep_lnbufgrow(len))
+                   goto error;
+               memcpy(lnbuf + off, bufpos, diff);
+               bufrem -= diff;
+               bufpos = p;
+               break;
        }
-       if (grep_feof(f) && (i == 0) && (ch != '\n'))
-               return (NULL);
-       *len = i;
+       *lenp = len;
        return (lnbuf);
+
+error:
+       *lenp = 0;
+       return (NULL);
 }
 
-/*
- * Opens the standard input for processing.
- */
-struct file *
-grep_stdin_open(void)
+static inline struct file *
+grep_file_init(struct file *f)
 {
-       struct file *f;
 
-       /* Processing stdin implies --line-buffered for tail -f to work. */
-       lbflag = true;
+       if (filebehave == FILE_GZIP &&
+           (gzbufdesc = gzdopen(f->fd, "r")) == NULL)
+               goto error;
 
-       snprintf(fname, sizeof fname, "%s", getstr(1));
+       if (filebehave == FILE_BZIP &&
+           (bzbufdesc = BZ2_bzdopen(f->fd, "r")) == NULL)
+               goto error;
 
-       f = grep_malloc(sizeof *f);
+       /* Fill read buffer, also catches errors early */
+       if (grep_refill(f) != 0)
+               goto error;
 
-       binbuf = NULL;
-       if ((f->f = fdopen(STDIN_FILENO, "r")) != NULL) {
-               flockfile(f->f);
-               f->stdin = true;
-               return (f);
-       }
+       /* Check for binary stuff, if necessary */
+       if (binbehave != BINFILE_TEXT && memchr(bufpos, '\0', bufrem) != NULL)
+               f->binary = true;
 
+       return (f);
+error:
+       close(f->fd);
        free(f);
        return (NULL);
 }
 
 /*
- * Opens a normal, a gzipped or a bzip2 compressed file for processing.
+ * Opens a file for processing.
  */
 struct file *
 grep_open(const char *path)
 {
        struct file *f;
 
-       snprintf(fname, sizeof fname, "%s", path);
-
        f = grep_malloc(sizeof *f);
-
-       binbuf = NULL;
-       f->stdin = false;
-       switch (filebehave) {
-       case FILE_STDIO:
-               if ((f->f = fopen(path, "r")) != NULL) {
-                       flockfile(f->f);
-                       return (f);
-               }
-               break;
-       case FILE_GZIP:
-               if ((f->gzf = gzopen(fname, "r")) != NULL)
-                       return (f);
-               break;
-       case FILE_BZIP:
-               if ((f->bzf = BZ2_bzopen(fname, "r")) != NULL)
-                       return (f);
-               break;
+       memset(f, 0, sizeof *f);
+       if (path == NULL) {
+               /* Processing stdin implies --line-buffered. */
+               lbflag = true;
+               f->fd = STDIN_FILENO;
+       } else if ((f->fd = open(path, O_RDONLY)) == -1) {
+               free(f);
+               return (NULL);
        }
 
-       free(f);
-       return (NULL);
+       return (grep_file_init(f));
 }
 
 /*
- * Closes a normal, a gzipped or a bzip2 compressed file.
+ * Closes a file.
  */
 void
 grep_close(struct file *f)
 {
 
-       switch (filebehave) {
-       case FILE_STDIO:
-               funlockfile(f->f);
-               fclose(f->f);
-               break;
-       case FILE_GZIP:
-               gzclose(f->gzf);
-               break;
-       case FILE_BZIP:
-               BZ2_bzclose(f->bzf);
-               break;
-       }
+       close(f->fd);
 
-       /* Reset read buffer for the file we are closing */
-       binbufptr = NULL;
-       free(binbuf);
+       /* Reset read buffer and line buffer */
+       bufpos = buffer;
+       bufrem = 0;
+
+       free(lnbuf);
+       lnbuf = NULL;
+       lnbuflen = 0;
 }

Modified: head/usr.bin/grep/grep.h
==============================================================================
--- head/usr.bin/grep/grep.h    Wed Aug 18 17:39:47 2010        (r211462)
+++ head/usr.bin/grep/grep.h    Wed Aug 18 17:40:10 2010        (r211463)
@@ -77,12 +77,8 @@ extern const char            *errstr[];
 #define MAX_LINE_MATCHES       32
 
 struct file {
-       struct mmfile   *mmf;
-       BZFILE          *bzf;
-       FILE            *f;
-       gzFile          *gzf;
+       int              fd;
        bool             binary;
-       bool             stdin;
 };
 
 struct str {
@@ -150,11 +146,10 @@ void       clearqueue(void);
 
 /* file.c */
 void            grep_close(struct file *f);
-struct file    *grep_stdin_open(void);
 struct file    *grep_open(const char *path);
 char           *grep_fgetln(struct file *f, size_t *len);
 
 /* fastgrep.c */
 int             fastcomp(fastgrep_t *, const char *);
 void            fgrepcomp(fastgrep_t *, const char *);
-int             grep_search(fastgrep_t *, unsigned char *, size_t, regmatch_t 
*);
+int             grep_search(fastgrep_t *, const unsigned char *, size_t, 
regmatch_t *);

Modified: head/usr.bin/grep/util.c
==============================================================================
--- head/usr.bin/grep/util.c    Wed Aug 18 17:39:47 2010        (r211462)
+++ head/usr.bin/grep/util.c    Wed Aug 18 17:40:10 2010        (r211463)
@@ -184,7 +184,7 @@ procfile(const char *fn)
 
        if (strcmp(fn, "-") == 0) {
                fn = label != NULL ? label : getstr(1);
-               f = grep_stdin_open();
+               f = grep_open(NULL);
        } else {
                if (!stat(fn, &sb)) {
                        /* Check if we need to process the file */
@@ -215,7 +215,7 @@ procfile(const char *fn)
 
        for (c = 0;  c == 0 || !(lflag || qflag); ) {
                ln.off += ln.len + 1;
-               if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL) {
+               if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL || ln.len == 0) {
                        if (ln.line_no == 0 && matchall)
                                exit(0);
                        else
_______________________________________________
svn-src-head@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"

Reply via email to