[PATCH] grep: sparse files are now considered binary

Paul Eggert Sun, 11 Mar 2012 23:27:39 -0700

Here's a proposed patch to greatly speed up 'grep' in some cases
when it's dealing with sparse files that happen to start
with a block of NUL-free data.


From cadc29e2b8fed7ae807fc451cac821798d0bc4c8 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Sun, 11 Mar 2012 22:41:01 -0700
Subject: [PATCH] grep: sparse files are now considered binary

* NEWS: Document this.
* doc/grep.texi (File and Directory Selection): Likewise.
* bootstrap.conf (gnulib_modules): Add stat-size.
* src/main.c: Include stat-size.h.
(file_is_binary): New function, which looks for holes too.
(grep): Use it.
* tests/Makefile.am (TESTS): Add big-hole.
* tests/big-hole: New file.
---
 NEWS              |    4 +++
 bootstrap.conf    |    1 +
 doc/grep.texi     |    7 +++--
 src/main.c        |   67 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 tests/Makefile.am |    1 +
 tests/big-hole    |   25 +++++++++++++++++++
 6 files changed, 101 insertions(+), 4 deletions(-)
 create mode 100755 tests/big-hole

diff --git a/NEWS b/NEWS
index d0a63d5..8544287 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,10 @@ GNU grep NEWS                                    -*- outline -*-
 
 * Noteworthy changes in release ?.? (????-??-??) [?]
 
+** New features
+
+  'grep' without -z now treats a sparse file as binary, if it can
+  easily determine that the file is sparse.
 
 * Noteworthy changes in release 2.11 (2012-03-02) [stable]
 
diff --git a/bootstrap.conf b/bootstrap.conf
index 45bb33d..b5634cd 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -67,6 +67,7 @@ realloc-gnu
 regex
 same-inode
 ssize_t
+stat-size
 stddef
 stdlib
 stpcpy
diff --git a/doc/grep.texi b/doc/grep.texi
index c014d8f..8189a94 100644
--- a/doc/grep.texi
+++ b/doc/grep.texi
@@ -580,7 +580,8 @@ this is equivalent to the @samp{--binary-files=text} option.
 @item --binary-files=@var{type}
 @opindex --binary-files
 @cindex binary files
-If the first few bytes of a file indicate that the file contains binary data,
+If a file's allocation metadata or its first few bytes
+indicate that the file contains binary data,
 assume that the file is of type @var{type}.
 By default, @var{type} is @samp{binary},
 and @command{grep} normally outputs either
@@ -703,8 +704,8 @@ better performance.
 @cindex binary files, MS-DOS/MS-Windows
 Treat the file(s) as binary.
 By default, under MS-DOS and MS-Windows,
-@command{grep} guesses the file type
-by looking at the contents of the first 32kB read from the file.
+@command{grep} guesses whether a file is text or binary
+as described for the @option{--binary-files} option.
 If @command{grep} decides the file is a text file,
 it strips the @code{CR} characters from the original file contents
 (to make regular expressions with @code{^} and @code{$} work correctly).
diff --git a/src/main.c b/src/main.c
index 2f6c761..cc6427d 100644
--- a/src/main.c
+++ b/src/main.c
@@ -44,6 +44,7 @@
 #include "propername.h"
 #include "quote.h"
 #include "savedir.h"
+#include "stat-size.h"
 #include "version-etc.h"
 #include "xalloc.h"
 #include "xstrtol.h"
@@ -426,6 +427,70 @@ clean_up_stdout (void)
     close_stdout ();
 }
 
+/* Return 1 if a file is known to be binary for the purpose of 'grep',
+   and 0 otherwise.  BUF, of size BUFSIZE, is the initial buffer read
+   from the file with descriptor FD and status ST.  */
+static int
+file_is_binary (char const *buf, size_t bufsize, int fd, struct stat const *st)
+{
+  #ifndef HAVE_STRUCT_STAT_ST_BLOCKS
+  enum { HAVE_STRUCT_STAT_ST_BLOCKS = 0 };  /* Disables the st_blocks test.  */
+  #endif
+  #ifndef SEEK_HOLE
+  enum { SEEK_HOLE = SEEK_END };  /* Sentinel: disables the SEEK_HOLE test.  */
+  #endif
+
+  /* If -z, test only whether the initial buffer contains '\200';
+     knowing about holes won't help.  */
+  if (! eolbyte)
+    return memchr (buf, '\200', bufsize) != 0;
+
+  /* If the initial buffer contains a null byte, guess that the file
+     is binary.  */
+  if (memchr (buf, '\0', bufsize))
+    return 1;
+
+  /* If a regular file has holes, it must contain a null byte somewhere.  */
+  if ((HAVE_STRUCT_STAT_ST_BLOCKS || SEEK_HOLE != SEEK_END)
+      && S_ISREG (st->st_mode))
+    {
+      off_t cur = bufsize;  /* Presumably BUF was read from offset 0.  */
+      if (O_BINARY || fd == STDIN_FILENO)
+        {
+          cur = lseek (fd, 0, SEEK_CUR);
+          if (cur < 0)
+            return 0;  /* Offset unknown; skip the hole checks.  */
+        }
+
+      /* If the file has fewer blocks than would be needed to
+         represent its data, then it must have at least one hole.  */
+      if (HAVE_STRUCT_STAT_ST_BLOCKS)
+        {
+          off_t nonzeros_needed = st->st_size - cur + bufsize;
+          off_t full_blocks = nonzeros_needed / ST_NBLOCKSIZE;
+          int partial_block = nonzeros_needed % ST_NBLOCKSIZE != 0;
+          if (ST_NBLOCKS (*st) < full_blocks + partial_block)
+            return 1;
+        }
+
+      /* Look for a hole after the current location, then seek back.  */
+      if (SEEK_HOLE != SEEK_END)
+        {
+          off_t hole_start = lseek (fd, cur, SEEK_HOLE);
+          if (0 <= hole_start)
+            {
+              if (lseek (fd, cur, SEEK_SET) < 0)
+                suppressible_error (filename, errno);
+              if (hole_start < st->st_size)
+                return 1;  /* The hole starts before EOF.  */
+            }
+        }
+    }
+
+  /* Guess that the file does not contain binary data.  */
+  return 0;
+}
+
 /* Convert STR to a nonnegative integer, storing the result in *OUT.
    STR must be a valid context length argument; report an error if it
    isn't.  Silently ceiling *OUT at the maximum value, as that is
@@ -1127,7 +1192,7 @@ grep (int fd, char const *file, struct stats *stats)
 
   not_text = (((binary_files == BINARY_BINARY_FILES && !out_quiet)
                || binary_files == WITHOUT_MATCH_BINARY_FILES)
-              && memchr (bufbeg, eol ? '\0' : '\200', buflim - bufbeg));
+              && file_is_binary (bufbeg, buflim - bufbeg, fd, &stats->stat));
   if (not_text && binary_files == WITHOUT_MATCH_BINARY_FILES)
     return 0;
   done_on_match += not_text;
diff --git a/tests/Makefile.am b/tests/Makefile.am
index c2cd2f7..0715fda 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -38,6 +38,7 @@ TESTS =                                               \
   backref                                      \
   backref-multibyte-slow                       \
   backref-word                                 \
+  big-hole                                     \
   big-match                                    \
   bogus-wctob                                  \
   bre                                          \
diff --git a/tests/big-hole b/tests/big-hole
new file mode 100755
index 0000000..ccc6bf5
--- /dev/null
+++ b/tests/big-hole
@@ -0,0 +1,25 @@
+#!/bin/sh
+# Check that grep --binary-files=without-match quickly skips files with holes.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+
+expensive_
+
+# Create a file that starts with at least a buffer's worth of text,
+# but has a big hole later.
+ten='1 2 3 4 5 6 7 8 9 10'
+x='xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
+(for i in $ten; do
+   for j in $ten; do
+     for k in $ten; do
+       echo $x
+     done
+   done
+ done
+ echo x | dd bs=1024k seek=8000000
+) >8T-or-so || skip_ 'cannot create big sparse file'
+
+grep --binary-file=without-match x 8T-or-so >/dev/null
+test $? -eq 1 || fail=1
+
+Exit $fail

[PATCH] grep: sparse files are now considered binary

Reply via email to