Here's a proposed patch to greatly speed up 'grep' in some cases when it's dealing with sparse files that happen to start with a block of NUL-free data.
From cadc29e2b8fed7ae807fc451cac821798d0bc4c8 Mon Sep 17 00:00:00 2001 From: Paul Eggert <[email protected]> Date: Sun, 11 Mar 2012 22:41:01 -0700 Subject: [PATCH] grep: sparse files are now considered binary * NEWS: Document this. * doc/grep.texi (File and Directory Selection): Likewise. * bootstrap.conf (gnulib_modules): Add stat-size. * src/main.c: Include stat-size.h. (file_is_binary): New function, which looks for holes too. (grep): Use it. * tests/Makefile.am (TESTS): Add big-hole. * tests/big-hole: New file. --- NEWS | 4 +++ bootstrap.conf | 1 + doc/grep.texi | 7 +++-- src/main.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++- tests/Makefile.am | 1 + tests/big-hole | 25 +++++++++++++++++++ 6 files changed, 101 insertions(+), 4 deletions(-) create mode 100755 tests/big-hole diff --git a/NEWS b/NEWS index d0a63d5..8544287 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,10 @@ GNU grep NEWS -*- outline -*- * Noteworthy changes in release ?.? (????-??-??) [?] +** New features + + 'grep' without -z now treats a sparse file as binary, if it can + easily determine that the file is sparse. * Noteworthy changes in release 2.11 (2012-03-02) [stable] diff --git a/bootstrap.conf b/bootstrap.conf index 45bb33d..b5634cd 100644 --- a/bootstrap.conf +++ b/bootstrap.conf @@ -67,6 +67,7 @@ realloc-gnu regex same-inode ssize_t +stat-size stddef stdlib stpcpy diff --git a/doc/grep.texi b/doc/grep.texi index c014d8f..8189a94 100644 --- a/doc/grep.texi +++ b/doc/grep.texi @@ -580,7 +580,8 @@ this is equivalent to the @samp{--binary-files=text} option. @item --binary-files=@var{type} @opindex --binary-files @cindex binary files -If the first few bytes of a file indicate that the file contains binary data, +If a file's allocation metadata or its first few bytes +indicate that the file contains binary data, assume that the file is of type @var{type}. By default, @var{type} is @samp{binary}, and @command{grep} normally outputs either @@ -703,8 +704,8 @@ better performance. 
@cindex binary files, MS-DOS/MS-Windows Treat the file(s) as binary. By default, under MS-DOS and MS-Windows, -@command{grep} guesses the file type -by looking at the contents of the first 32kB read from the file. +@command{grep} guesses whether a file is text or binary +as described for the @option{--binary-files} option. If @command{grep} decides the file is a text file, it strips the @code{CR} characters from the original file contents (to make regular expressions with @code{^} and @code{$} work correctly). diff --git a/src/main.c b/src/main.c index 2f6c761..cc6427d 100644 --- a/src/main.c +++ b/src/main.c @@ -44,6 +44,7 @@ #include "propername.h" #include "quote.h" #include "savedir.h" +#include "stat-size.h" #include "version-etc.h" #include "xalloc.h" #include "xstrtol.h" @@ -426,6 +427,70 @@ clean_up_stdout (void) close_stdout (); } +/* Return 1 if a file is known to be binary for the purpose of 'grep'. + BUF, of size BUFSIZE, is the initial buffer read from the file with + descriptor FD and status ST. */ +static int +file_is_binary (char const *buf, size_t bufsize, int fd, struct stat const *st) +{ + #ifndef HAVE_STRUCT_STAT_ST_BLOCKS + enum { HAVE_STRUCT_STAT_ST_BLOCKS = 0 }; + #endif + #ifndef SEEK_HOLE + enum { SEEK_HOLE = SEEK_END }; + #endif + + /* If -z, test only whether the initial buffer contains '\200'; + knowing about holes won't help. */ + if (! eolbyte) + return memchr (buf, '\200', bufsize) != 0; + + /* If the initial buffer contains a null byte, guess that the file + is binary. */ + if (memchr (buf, '\0', bufsize)) + return 1; + + /* If the file has holes, it must contain a null byte somewhere. */ + if ((HAVE_STRUCT_STAT_ST_BLOCKS || SEEK_HOLE != SEEK_END) + && S_ISREG (st->st_mode)) + { + off_t cur = bufsize; + if (O_BINARY || fd == STDIN_FILENO) + { + cur = lseek (fd, 0, SEEK_CUR); + if (cur < 0) + return 0; + } + + /* If the file has fewer blocks than would be needed to + represent its data, then it must have at least one hole. 
*/ + if (HAVE_STRUCT_STAT_ST_BLOCKS) + { + off_t nonzeros_needed = st->st_size - cur + bufsize; + off_t full_blocks = nonzeros_needed / ST_NBLOCKSIZE; + int partial_block = nonzeros_needed % ST_NBLOCKSIZE != 0; + if (ST_NBLOCKS (*st) < full_blocks + partial_block) + return 1; + } + + /* Look for a hole after the current location. */ + if (SEEK_HOLE != SEEK_END) + { + off_t hole_start = lseek (fd, cur, SEEK_HOLE); + if (0 <= hole_start) + { + if (lseek (fd, cur, SEEK_SET) < 0) + suppressible_error (filename, errno); + if (hole_start < st->st_size) + return 1; + } + } + } + + /* Guess that the file does not contain binary data. */ + return 0; +} + /* Convert STR to a nonnegative integer, storing the result in *OUT. STR must be a valid context length argument; report an error if it isn't. Silently ceiling *OUT at the maximum value, as that is @@ -1127,7 +1192,7 @@ grep (int fd, char const *file, struct stats *stats) not_text = (((binary_files == BINARY_BINARY_FILES && !out_quiet) || binary_files == WITHOUT_MATCH_BINARY_FILES) - && memchr (bufbeg, eol ? '\0' : '\200', buflim - bufbeg)); + && file_is_binary (bufbeg, buflim - bufbeg, fd, &stats->stat)); if (not_text && binary_files == WITHOUT_MATCH_BINARY_FILES) return 0; done_on_match += not_text; diff --git a/tests/Makefile.am b/tests/Makefile.am index c2cd2f7..0715fda 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -38,6 +38,7 @@ TESTS = \ backref \ backref-multibyte-slow \ backref-word \ + big-hole \ big-match \ bogus-wctob \ bre \ diff --git a/tests/big-hole b/tests/big-hole new file mode 100755 index 0000000..ccc6bf5 --- /dev/null +++ b/tests/big-hole @@ -0,0 +1,25 @@ +#!/bin/sh +# Check that grep --binary-file=without-match quickly skips files with holes. + +. "${srcdir=.}/init.sh"; path_prepend_ ../src + +expensive_ + +# Create a file that starts with at least a buffer's worth of text, +# but has a big hole later. 
+ten='1 2 3 4 5 6 7 8 9 10' +x='xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' +(for i in $ten; do + for j in $ten; do + for k in $ten; do + echo $x + done + done + done + echo x | dd bs=1024k seek=8000000 +) >8T-or-so || skip_ 'cannot create big sparse file' + +grep --binary-file=without-match x 8T-or-so >/dev/null +test $? -eq 1 || fail=1 + +Exit $fail
