Re: [PATCH 1/2] serialize collection of changed submodules

2016-09-19 Thread Heiko Voigt
On Fri, Sep 16, 2016 at 10:27:04AM -0700, Junio C Hamano wrote:
> Heiko Voigt  writes:
> 
> > +static struct sha1_array *get_sha1s_from_list(struct string_list 
> > *submodules,
> > +   const char *path)
> > +{
> > +   struct string_list_item *item;
> > +   struct sha1_array *hashes;
> > +
> > +   item = string_list_insert(submodules, path);
> > +   if (item->util)
> > +   return (struct sha1_array *) item->util;
> > +
> > +   hashes = (struct sha1_array *) xmalloc(sizeof(struct sha1_array));
> > +   /* NEEDSWORK: should we add an initializer function for
> > +* sha1_array ? */
> > +   memset(hashes, 0, sizeof(struct sha1_array));
> > +   item->util = hashes;
> 
> 
>   /* NEEDSWORK: should we have SHA1_ARRAY_INIT etc.? */
>   item->util = xcalloc(1, sizeof(struct sha1_array));

Ok will do.


Re: [PATCH 1/2] serialize collection of changed submodules

2016-09-16 Thread Junio C Hamano
Heiko Voigt  writes:

> +static struct sha1_array *get_sha1s_from_list(struct string_list *submodules,
> + const char *path)
> +{
> + struct string_list_item *item;
> + struct sha1_array *hashes;
> +
> + item = string_list_insert(submodules, path);
> + if (item->util)
> + return (struct sha1_array *) item->util;
> +
> + hashes = (struct sha1_array *) xmalloc(sizeof(struct sha1_array));
> + /* NEEDSWORK: should we add an initializer function for
> +  * sha1_array ? */
> + memset(hashes, 0, sizeof(struct sha1_array));
> + item->util = hashes;


/* NEEDSWORK: should we have SHA1_ARRAY_INIT etc.? */
item->util = xcalloc(1, sizeof(struct sha1_array));

>  static void collect_submodules_from_diff(struct diff_queue_struct *q,
>struct diff_options *options,
>void *data)
>  {
>   int i;
> - struct string_list *needs_pushing = data;
> + struct string_list *submodules = data;
>  
>   for (i = 0; i < q->nr; i++) {
>   struct diff_filepair *p = q->queue[i];
> + struct sha1_array *hashes;
>   if (!S_ISGITLINK(p->two->mode))
>   continue;
> - if (submodule_needs_pushing(p->two->path, p->two->oid.hash))
> - string_list_insert(needs_pushing, p->two->path);
> + hashes = get_sha1s_from_list(submodules, p->two->path);
> + sha1_array_append(hashes, p->two->oid.hash);
>   }
>  }

So the idea at this step is still let each commit in the top-level
history inspected for any submodule change, but the result is
collected in a mapping (submodule -> [ list of submodule commits ]).
As we do not expect too many "oops, the old commit was better, so
let's revert and rebind the old one from the submodule" in the
history of the top-level, appending and then running for-each-unique
is an efficient way, instead of first checking if we already have
it and then inserting new ones to maintain the uniqueness.

Makes sense.

> @@ -582,14 +601,41 @@ static void find_unpushed_submodule_commits(struct 
> commit *commit,
>   diff_tree_combined_merge(commit, 1, &rev);
>  }
>  
> +struct collect_submodule_from_sha1s_data {
> + char *submodule_path;
> + struct string_list *needs_pushing;
> +};
> +
> +static void collect_submodules_from_sha1s(const unsigned char sha1[20],
> + void *data)
> +{
> + struct collect_submodule_from_sha1s_data *me =
> + (struct collect_submodule_from_sha1s_data *) data;
> +
> + if (submodule_needs_pushing(me->submodule_path, sha1))
> + string_list_insert(me->needs_pushing, me->submodule_path);
> +}

This is called from sha1_array_for_each_unique() that iterates over
the submodule commit object names for one submodule and then ends up
calling submodule_needs_pushing() number of times, which smells less
efficient than it could be.  You can ask

rev-list  --not --remotes

just once in the submodule repository.  I imagine that is what you'll
do in the next patch.

An obvious but much less efficient way to optimize this part would
be to see if me->needs_pushing already has me->submodule_path and
skip the check for submodule_needs_pushing(), but if you drop the
call by find_unpushed_submodule to sha1_array_for_each_unique() to
walk new submodule commits one by one, that would become irrelevant.

> +static void free_submodules_sha1s(struct string_list *submodules)
> +{
> + int i;
> + for (i = 0; i < submodules->nr; i++) {
> + struct string_list_item *item = &submodules->items[i];
> + struct sha1_array *hashes = (struct sha1_array *) item->util;
> + sha1_array_clear(hashes);
> + }
> + string_list_clear(submodules, 1);
> +}
> +
>  int find_unpushed_submodules(unsigned char new_sha1[20],
>   const char *remotes_name, struct string_list *needs_pushing)
>  {
>   struct rev_info rev;
>   struct commit *commit;
>   const char *argv[] = {NULL, NULL, "--not", "NULL", NULL};
> - int argc = ARRAY_SIZE(argv) - 1;
> + int argc = ARRAY_SIZE(argv) - 1, i;
>   char *sha1_copy;
> + struct string_list submodules = STRING_LIST_INIT_DUP;
>  
>   struct strbuf remotes_arg = STRBUF_INIT;
>  
> @@ -603,12 +649,23 @@ int find_unpushed_submodules(unsigned char new_sha1[20],
>   die("revision walk setup failed");
>  
>   while ((commit = get_revision(&rev)) != NULL)
> - find_unpushed_submodule_commits(commit, needs_pushing);
> + find_unpushed_submodule_commits(commit, &submodules);
>  
>   reset_revision_walk();
>   free(sha1_copy);
>   strbuf_release(&remotes_arg);
>  
> + for (i = 0; i < submodules.nr; i++) {
> + struct string_list_item *item = &submodules.items[i];
> + struct collect_submodule_from_sha1s_data data;
> + data.submodul

Re: [PATCH 1/2] serialize collection of changed submodules

2016-09-14 Thread Junio C Hamano
Heiko Voigt  writes:

> Sorry about the late reply. I was not able to process emails until now.
> Here are two patches that should help to improve the situation and batch
> up some processing. This one is for repositories with submodules, so
> that they do not iterate over the same submodule twice with the same
> hash.
>
> The second one will be the one people without submodules are interested
> in.

Thanks.  Will take a look at later as I'm already deep in today's
integration cycle.  Very much appreciated.


[PATCH 1/2] serialize collection of changed submodules

2016-09-14 Thread Heiko Voigt
To check whether a submodule needs to be pushed we need to collect all
changed submodules. Lets collect them first and then execute the
possibly expensive test whether certain revisions are already pushed
only once per submodule.

There is further potential for optimization since we can assemble one
command and only issued that instead of one call for each remote ref in
the submodule.

Signed-off-by: Heiko Voigt 
---
Sorry about the late reply. I was not able to process emails until now.
Here are two patches that should help to improve the situation and batch
up some processing. This one is for repositories with submodules, so
that they do not iterate over the same submodule twice with the same
hash.

The second one will be the one people without submodules are interested
in.

Cheers Heiko

 submodule.c | 67 -
 1 file changed, 62 insertions(+), 5 deletions(-)

diff --git a/submodule.c b/submodule.c
index 0ef2ff4..b04c066 100644
--- a/submodule.c
+++ b/submodule.c
@@ -554,19 +554,38 @@ static int submodule_needs_pushing(const char *path, 
const unsigned char sha1[20
return 0;
 }
 
+static struct sha1_array *get_sha1s_from_list(struct string_list *submodules,
+   const char *path)
+{
+   struct string_list_item *item;
+   struct sha1_array *hashes;
+
+   item = string_list_insert(submodules, path);
+   if (item->util)
+   return (struct sha1_array *) item->util;
+
+   hashes = (struct sha1_array *) xmalloc(sizeof(struct sha1_array));
+   /* NEEDSWORK: should we add an initializer function for
+* sha1_array ? */
+   memset(hashes, 0, sizeof(struct sha1_array));
+   item->util = hashes;
+   return hashes;
+}
+
 static void collect_submodules_from_diff(struct diff_queue_struct *q,
 struct diff_options *options,
 void *data)
 {
int i;
-   struct string_list *needs_pushing = data;
+   struct string_list *submodules = data;
 
for (i = 0; i < q->nr; i++) {
struct diff_filepair *p = q->queue[i];
+   struct sha1_array *hashes;
if (!S_ISGITLINK(p->two->mode))
continue;
-   if (submodule_needs_pushing(p->two->path, p->two->oid.hash))
-   string_list_insert(needs_pushing, p->two->path);
+   hashes = get_sha1s_from_list(submodules, p->two->path);
+   sha1_array_append(hashes, p->two->oid.hash);
}
 }
 
@@ -582,14 +601,41 @@ static void find_unpushed_submodule_commits(struct commit 
*commit,
diff_tree_combined_merge(commit, 1, &rev);
 }
 
+struct collect_submodule_from_sha1s_data {
+   char *submodule_path;
+   struct string_list *needs_pushing;
+};
+
+static void collect_submodules_from_sha1s(const unsigned char sha1[20],
+   void *data)
+{
+   struct collect_submodule_from_sha1s_data *me =
+   (struct collect_submodule_from_sha1s_data *) data;
+
+   if (submodule_needs_pushing(me->submodule_path, sha1))
+   string_list_insert(me->needs_pushing, me->submodule_path);
+}
+
+static void free_submodules_sha1s(struct string_list *submodules)
+{
+   int i;
+   for (i = 0; i < submodules->nr; i++) {
+   struct string_list_item *item = &submodules->items[i];
+   struct sha1_array *hashes = (struct sha1_array *) item->util;
+   sha1_array_clear(hashes);
+   }
+   string_list_clear(submodules, 1);
+}
+
 int find_unpushed_submodules(unsigned char new_sha1[20],
const char *remotes_name, struct string_list *needs_pushing)
 {
struct rev_info rev;
struct commit *commit;
const char *argv[] = {NULL, NULL, "--not", "NULL", NULL};
-   int argc = ARRAY_SIZE(argv) - 1;
+   int argc = ARRAY_SIZE(argv) - 1, i;
char *sha1_copy;
+   struct string_list submodules = STRING_LIST_INIT_DUP;
 
struct strbuf remotes_arg = STRBUF_INIT;
 
@@ -603,12 +649,23 @@ int find_unpushed_submodules(unsigned char new_sha1[20],
die("revision walk setup failed");
 
while ((commit = get_revision(&rev)) != NULL)
-   find_unpushed_submodule_commits(commit, needs_pushing);
+   find_unpushed_submodule_commits(commit, &submodules);
 
reset_revision_walk();
free(sha1_copy);
strbuf_release(&remotes_arg);
 
+   for (i = 0; i < submodules.nr; i++) {
+   struct string_list_item *item = &submodules.items[i];
+   struct collect_submodule_from_sha1s_data data;
+   data.submodule_path = item->string;
+   data.needs_pushing = needs_pushing;
+   sha1_array_for_each_unique((struct sha1_array *) item->util,
+   collect_submodules_from_sha1s,
+   &data);
+