Re: [PATCH v2] merge-recursive: change current file dir string_lists to hashmap

2017-09-06 Thread Junio C Hamano
Kevin Willford  writes:

> The code was using two string_lists, one for the directories and
> one for the files.  The code never checks the lists independently
> so we should be able to only use one list.  The string_list is
> also O(log n) for lookup and insertion.  Switching this to use a
> hashmap will give O(1) which will save some time when there are
> millions of paths that will be checked.
>
> Also cleaned up a memory leak and method where the return was not
> being used.
>
> Signed-off-by: Kevin Willford 
> ---
>  merge-recursive.c | 76 
> ---
>  merge-recursive.h |  3 +--
>  2 files changed, 57 insertions(+), 22 deletions(-)
>
> diff --git a/merge-recursive.c b/merge-recursive.c
> index 1494ffdb82..ebfe01017f 100644
> --- a/merge-recursive.c
> +++ b/merge-recursive.c
> @@ -24,6 +24,31 @@
>  #include "dir.h"
>  #include "submodule.h"
>  
> +struct path_hashmap_entry {
> + struct hashmap_entry;

You seem to have lost the squash you privately agreed to that is
needed in order to make it compile?


[PATCH v2] merge-recursive: change current file dir string_lists to hashmap

2017-09-06 Thread Kevin Willford
The code was using two string_lists, one for the directories and
one for the files.  The code never checks the lists independently
so we should be able to only use one list.  The string_list is
also O(log n) for lookup and insertion.  Switching this to use a
hashmap will give O(1) which will save some time when there are
millions of paths that will be checked.

Also cleaned up a memory leak and method where the return was not
being used.

Signed-off-by: Kevin Willford 
---
 merge-recursive.c | 76 ---
 merge-recursive.h |  3 +--
 2 files changed, 57 insertions(+), 22 deletions(-)

diff --git a/merge-recursive.c b/merge-recursive.c
index 1494ffdb82..ebfe01017f 100644
--- a/merge-recursive.c
+++ b/merge-recursive.c
@@ -24,6 +24,31 @@
 #include "dir.h"
 #include "submodule.h"
 
+struct path_hashmap_entry {
+   struct hashmap_entry;
+   char path[FLEX_ARRAY];
+};
+
+static int path_hashmap_cmp(const void *cmp_data,
+   const void *entry,
+   const void *entry_or_key,
+   const void *keydata)
+{
+   const struct path_hashmap_entry *a = entry;
+   const struct path_hashmap_entry *b = entry_or_key;
+   const char *key = keydata;
+
+   if (ignore_case)
+   return strcasecmp(a->path, key ? key : b->path);
+   else
+   return strcmp(a->path, key ? key : b->path);
+}
+
+static unsigned int path_hash(const char *path)
+{
+   return ignore_case ? strihash(path) : strhash(path);
+}
+
 static void flush_output(struct merge_options *o)
 {
if (o->buffer_output < 2 && o->obuf.len) {
@@ -314,29 +339,25 @@ static int save_files_dirs(const unsigned char *sha1,
struct strbuf *base, const char *path,
unsigned int mode, int stage, void *context)
 {
+   struct path_hashmap_entry *entry;
int baselen = base->len;
struct merge_options *o = context;
 
strbuf_addstr(base, path);
 
-   if (S_ISDIR(mode))
-   string_list_insert(&o->current_directory_set, base->buf);
-   else
-   string_list_insert(&o->current_file_set, base->buf);
+   FLEX_ALLOC_MEM(entry, path, base->buf, base->len);
+   hashmap_entry_init(entry, path_hash(entry->path));
+   hashmap_add(&o->current_file_dir_set, entry);
 
strbuf_setlen(base, baselen);
return (S_ISDIR(mode) ? READ_TREE_RECURSIVE : 0);
 }
 
-static int get_files_dirs(struct merge_options *o, struct tree *tree)
+static void get_files_dirs(struct merge_options *o, struct tree *tree)
 {
-   int n;
struct pathspec match_all;
memset(&match_all, 0, sizeof(match_all));
-   if (read_tree_recursive(tree, "", 0, 0, &match_all, save_files_dirs, o))
-   return 0;
-   n = o->current_file_set.nr + o->current_directory_set.nr;
-   return n;
+   read_tree_recursive(tree, "", 0, 0, &match_all, save_files_dirs, o);
 }
 
 /*
@@ -646,6 +667,7 @@ static void add_flattened_path(struct strbuf *out, const 
char *s)
 
 static char *unique_path(struct merge_options *o, const char *path, const char 
*branch)
 {
+   struct path_hashmap_entry *entry;
struct strbuf newpath = STRBUF_INIT;
int suffix = 0;
size_t base_len;
@@ -654,14 +676,16 @@ static char *unique_path(struct merge_options *o, const 
char *path, const char *
add_flattened_path(&newpath, branch);
 
base_len = newpath.len;
-   while (string_list_has_string(&o->current_file_set, newpath.buf) ||
-  string_list_has_string(&o->current_directory_set, newpath.buf) ||
+   while (hashmap_get_from_hash(&o->current_file_dir_set,
+path_hash(newpath.buf), newpath.buf) ||
   (!o->call_depth && file_exists(newpath.buf))) {
strbuf_setlen(&newpath, base_len);
strbuf_addf(&newpath, "_%d", suffix++);
}
 
-   string_list_insert(&o->current_file_set, newpath.buf);
+   FLEX_ALLOC_MEM(entry, path, newpath.buf, newpath.len);
+   hashmap_entry_init(entry, path_hash(entry->path));
+   hashmap_add(&o->current_file_dir_set, entry);
return strbuf_detach(&newpath, NULL);
 }
 
@@ -1945,8 +1969,14 @@ int merge_trees(struct merge_options *o,
if (unmerged_cache()) {
struct string_list *entries, *re_head, *re_merge;
int i;
-   string_list_clear(&o->current_file_set, 1);
-   string_list_clear(&o->current_directory_set, 1);
+   /*
+* Only need the hashmap while processing entries, so
+* initialize it here and free it when we are done running
+* through the entries. Keeping it in the merge_options as
+* opposed to declaring a local hashmap is for convenience
+* so that we don't have to pass it around.
+*/
+   hashmap_init(&o->current_file_dir_set,