Re: [PATCH 4/5] grep: optionally recurse into submodules

2016-11-04 Thread Jonathan Tan
On Thu, Oct 27, 2016 at 3:38 PM, Brandon Williams  wrote:
> diff --git a/builtin/grep.c b/builtin/grep.c
> index 8887b6add..f34f16df9 100644
> --- a/builtin/grep.c
> +++ b/builtin/grep.c
> @@ -18,12 +18,20 @@
>  #include "quote.h"
>  #include "dir.h"
>  #include "pathspec.h"
> +#include "submodule.h"
>
>  static char const * const grep_usage[] = {
> N_("git grep [] [-e]  [...] [[--] ...]"),
> NULL
>  };
>
> +static const char *super_prefix;

I think that the super_prefix changes could be in its own patch.

> +static int recurse_submodules;
> +static struct argv_array submodule_options = ARGV_ARRAY_INIT;

I guess this has to be static because it is shared by multiple threads.

> +
> +static int grep_submodule_launch(struct grep_opt *opt,
> +const struct grep_source *gs);
> +
>  #define GREP_NUM_THREADS_DEFAULT 8
>  static int num_threads;
>
> @@ -174,7 +182,10 @@ static void *run(void *arg)
> break;
>
> opt->output_priv = w;
> -   hit |= grep_source(opt, &w->source);
> +   if (w->source.type == GREP_SOURCE_SUBMODULE)
> +   hit |= grep_submodule_launch(opt, &w->source);
> +   else
> +   hit |= grep_source(opt, &w->source);

It seems to me that GREP_SOURCE_SUBMODULE is of a different nature
than the other GREP_SOURCE_.* - in struct work_item, could we instead
have another variable that distinguishes between submodules and
"native" sources? This might also assuage Junio's concerns in
 about the nature of the
sources.

That variable could also be the discriminant for a tagged union, such
that we have "struct grep_source" for the "native" sources and a new
struct (holding only submodule-relevant information) for the
submodule.

> +/*
> + * Prep grep structures for a submodule grep
> + * sha1: the sha1 of the submodule or NULL if using the working tree
> + * filename: name of the submodule including tree name of parent
> + * path: location of the submodule
> + */
> +static int grep_submodule(struct grep_opt *opt, const unsigned char *sha1,
> + const char *filename, const char *path)
> +{
> +   if (!(is_submodule_initialized(path) &&
> + is_submodule_checked_out(path))) {
> +   warning("skiping submodule '%s%s' since it is not initialized 
> and checked out",
> +   super_prefix ? super_prefix: "",
> +   path);
> +   return 0;
> +   }
> +
> +#ifndef NO_PTHREADS
> +   if (num_threads) {
> +   add_work(opt, GREP_SOURCE_SUBMODULE, filename, path, sha1);
> +   return 0;
> +   } else
> +#endif
> +   {
> +   struct work_item w;
> +   int hit;
> +
> +   grep_source_init(&w.source, GREP_SOURCE_SUBMODULE,
> +filename, path, sha1);
> +   strbuf_init(&w.out, 0);
> +   opt->output_priv = &w;
> +   hit = grep_submodule_launch(opt, &w.source);
> +
> +   write_or_die(1, w.out.buf, w.out.len);
> +
> +   grep_source_clear(&w.source);
> +   strbuf_release(&w.out);
> +   return hit;
> +   }

This is at least the third invocation of this "if pthreads, add work,
otherwise do it now" pattern - could this be extracted into its own
function (in another patch)? Ideally, there would also be exactly one
function in which the grep_source.* functions are invoked, and both
"run" and the non-pthread code path can use it.

> +}
> +
> +static int grep_cache(struct grep_opt *opt, const struct pathspec *pathspec,
> + int cached)

This line isn't modified other than the line break, as far as I can
tell, so I wouldn't break it.

> diff --git a/t/t7814-grep-recurse-submodules.sh 
> b/t/t7814-grep-recurse-submodules.sh
> new file mode 100755
> index 0..b670c70cb
> --- /dev/null
> +++ b/t/t7814-grep-recurse-submodules.sh
> @@ -0,0 +1,99 @@
> +#!/bin/sh
> +
> +test_description='Test grep recurse-submodules feature
> +
> +This test verifies the recurse-submodules feature correctly greps across
> +submodules.
> +'
> +
> +. ./test-lib.sh
> +

Would it be possible to also test it while num_threads is zero? (Or,
if num_threads is already zero, to test it while it is not zero?)


[PATCH 4/5] grep: optionally recurse into submodules

2016-10-27 Thread Brandon Williams
Allow grep to recognize submodules and recursively search for patterns in
each submodule.  This is done by forking off a process to recursively
call grep on each submodule.  The top level --super-prefix option is
used to pass a path to the submodule which can in turn be used to
prepend to output or in pathspec matching logic.

Recursion only occurs for submodules which have been initialized and
checked out by the parent project.  If a submodule hasn't been
initialized and checked out it is simply skipped.

In order to support the existing multi-threading infrastructure in grep,
output from each child process is captured in a strbuf so that it can be
later printed to the console in an ordered fashion.

To limit the number of theads that are created, each child process has
half the number of threads as its parents (minimum of 1), otherwise we
potentailly have a fork-bomb.

Signed-off-by: Brandon Williams 
---
 Documentation/git-grep.txt |   5 +
 builtin/grep.c | 301 ++---
 git.c  |   2 +-
 t/t7814-grep-recurse-submodules.sh |  99 
 4 files changed, 386 insertions(+), 21 deletions(-)
 create mode 100755 t/t7814-grep-recurse-submodules.sh

diff --git a/Documentation/git-grep.txt b/Documentation/git-grep.txt
index 0ecea6e49..17aa1ba70 100644
--- a/Documentation/git-grep.txt
+++ b/Documentation/git-grep.txt
@@ -26,6 +26,7 @@ SYNOPSIS
   [--threads ]
   [-f ] [-e] 
   [--and|--or|--not|(|)|-e ...]
+  [--recurse-submodules]
   [ [--[no-]exclude-standard] [--cached | --no-index | --untracked] | 
...]
   [--] [...]
 
@@ -88,6 +89,10 @@ OPTIONS
mechanism.  Only useful when searching files in the current directory
with `--no-index`.
 
+--recurse-submodules::
+   Recursively search in each submodule that has been initialized and
+   checked out in the repository.
+
 -a::
 --text::
Process binary files as if they were text.
diff --git a/builtin/grep.c b/builtin/grep.c
index 8887b6add..f34f16df9 100644
--- a/builtin/grep.c
+++ b/builtin/grep.c
@@ -18,12 +18,20 @@
 #include "quote.h"
 #include "dir.h"
 #include "pathspec.h"
+#include "submodule.h"
 
 static char const * const grep_usage[] = {
N_("git grep [] [-e]  [...] [[--] ...]"),
NULL
 };
 
+static const char *super_prefix;
+static int recurse_submodules;
+static struct argv_array submodule_options = ARGV_ARRAY_INIT;
+
+static int grep_submodule_launch(struct grep_opt *opt,
+const struct grep_source *gs);
+
 #define GREP_NUM_THREADS_DEFAULT 8
 static int num_threads;
 
@@ -174,7 +182,10 @@ static void *run(void *arg)
break;
 
opt->output_priv = w;
-   hit |= grep_source(opt, &w->source);
+   if (w->source.type == GREP_SOURCE_SUBMODULE)
+   hit |= grep_submodule_launch(opt, &w->source);
+   else
+   hit |= grep_source(opt, &w->source);
grep_source_clear_data(&w->source);
work_done(w);
}
@@ -300,6 +311,10 @@ static int grep_sha1(struct grep_opt *opt, const unsigned 
char *sha1,
if (opt->relative && opt->prefix_length) {
quote_path_relative(filename + tree_name_len, opt->prefix, 
&pathbuf);
strbuf_insert(&pathbuf, 0, filename, tree_name_len);
+   } else if (super_prefix) {
+   strbuf_add(&pathbuf, filename, tree_name_len);
+   strbuf_addstr(&pathbuf, super_prefix);
+   strbuf_addstr(&pathbuf, filename + tree_name_len);
} else {
strbuf_addstr(&pathbuf, filename);
}
@@ -328,10 +343,13 @@ static int grep_file(struct grep_opt *opt, const char 
*filename)
 {
struct strbuf buf = STRBUF_INIT;
 
-   if (opt->relative && opt->prefix_length)
+   if (opt->relative && opt->prefix_length) {
quote_path_relative(filename, opt->prefix, &buf);
-   else
+   } else {
+   if (super_prefix)
+   strbuf_addstr(&buf, super_prefix);
strbuf_addstr(&buf, filename);
+   }
 
 #ifndef NO_PTHREADS
if (num_threads) {
@@ -378,31 +396,259 @@ static void run_pager(struct grep_opt *opt, const char 
*prefix)
exit(status);
 }
 
-static int grep_cache(struct grep_opt *opt, const struct pathspec *pathspec, 
int cached)
+static void compile_submodule_options(const struct grep_opt *opt,
+ const struct pathspec *pathspec,
+ int cached, int untracked,
+ int opt_exclude, int use_index,
+ int pattern_type_arg)
+{
+   struct grep_pat *pattern;
+   int i;
+
+   if (recurse_submodules)
+   argv_array_push(&submodule_options, "--recurse-submodules");
+
+