Re: [PATCH 3/5] name-hash: precompute hash values during preload-index

2017-02-19 Thread Junio C Hamano
Jeff Hostetler  writes:

> I looked at doing this, but I didn't think the limited gains warranted the
> complexity and overhead of forward searching for peers at the current
> level.

It seems that I wasn't clear what I meant.  I didn't mean anything
complex like what you said.

Just something simple, like this on top of yours, that passes the
previous entry and compares only against it.  I do not know if that gives
any gain, though ;-).

 cache.h |  2 +-
 name-hash.c | 11 +--
 preload-index.c |  4 +++-
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/cache.h b/cache.h
index 390aa803df..bd2980f6e3 100644
--- a/cache.h
+++ b/cache.h
@@ -233,7 +233,7 @@ struct cache_entry {
 #error "CE_EXTENDED_FLAGS out of range"
 #endif
 
-void precompute_istate_hashes(struct cache_entry *ce);
+void precompute_istate_hashes(struct cache_entry *ce, struct cache_entry *prev);
 
 /* Forward structure decls */
 struct pathspec;
diff --git a/name-hash.c b/name-hash.c
index f95054f44c..5e09b79170 100644
--- a/name-hash.c
+++ b/name-hash.c
@@ -300,7 +300,7 @@ void free_name_hash(struct index_state *istate)
  * non-skip-worktree items (since status should not observe skipped items), but
  * because lazy_init_name_hash() hashes everything, we force it here.
  */
-void precompute_istate_hashes(struct cache_entry *ce)
+void precompute_istate_hashes(struct cache_entry *ce, struct cache_entry *prev)
 {
int namelen = ce_namelen(ce);
 
@@ -312,7 +312,14 @@ void precompute_istate_hashes(struct cache_entry *ce)
ce->precomputed_hash.root_entry = 1;
} else {
namelen--;
-   ce->precomputed_hash.dir = memihash(ce->name, namelen);
+
+   if (prev && 
+   prev->precomputed_hash.initialized &&
+   namelen <= ce_namelen(prev) &&
+   !memcmp(ce->name, prev->name, namelen))
+   ce->precomputed_hash.dir = prev->precomputed_hash.dir;
+   else
+   ce->precomputed_hash.dir = memihash(ce->name, namelen);
ce->precomputed_hash.name = memihash_continue(
ce->precomputed_hash.dir, ce->name + namelen,
ce_namelen(ce) - namelen);
diff --git a/preload-index.c b/preload-index.c
index 602737f9d0..784378ffac 100644
--- a/preload-index.c
+++ b/preload-index.c
@@ -37,6 +37,7 @@ static void *preload_thread(void *_data)
struct thread_data *p = _data;
struct index_state *index = p->index;
struct cache_entry **cep = index->cache + p->offset;
+   struct cache_entry *previous = NULL;
struct cache_def cache = CACHE_DEF_INIT;
 
nr = p->nr;
@@ -47,7 +48,8 @@ static void *preload_thread(void *_data)
struct cache_entry *ce = *cep++;
struct stat st;
 
-   precompute_istate_hashes(ce);
+   precompute_istate_hashes(ce, previous);
+   previous = ce;
 
if (ce_stage(ce))
continue;




> (I was just looking at the complexity of clear_ce_flags_1() in unpack-trees.c
> and how hard it has to look to find the end of the current directory and the
> effect that that has on the recursion and it felt like too much work for the
> potential gain.)
>
> Whereas remembering the previous one was basically free.  Granted, it only
> helps us for adjacent files in the index, so it's not perfect, but gives us
> the best bang for the buck.
>
> Jeff


RE: [PATCH 3/5] name-hash: precompute hash values during preload-index

2017-02-18 Thread Jeff Hostetler


From: Junio C Hamano [mailto:jch2...@gmail.com] On Behalf Of Junio C Hamano
> 
> The fact that each preload_thread() still walks the index in-order
> makes me wonder if it may allow us to further optimize the "dir"
> part of the hash by passing the previous ce for which we already
> precomputed hash values.  While the loop is iterating over the paths
> in the same directory, .dir component from the previous ce can be
> reused and .name component can "continue", no?
> 
> It's possible that you already tried such an optimization and
> rejected it after finding that the cost of comparison of pathnames
> to tell if ce and previous ce are still in the same directory is
> more than unconditionally memihash() the directory part, and I am in
> no way saying that I found a missed optimization opportunity you
> must pursue.  I am just being curious.

I looked at doing this, but I didn't think the limited gains warranted the
complexity and overhead of forward searching for peers at the current level.
(I was just looking at the complexity of clear_ce_flags_1() in unpack-trees.c
and how hard it has to look to find the end of the current directory and the
effect that that has on the recursion and it felt like too much work for the
potential gain.)

Whereas remembering the previous one was basically free.  Granted, it only
helps us for adjacent files in the index, so it's not perfect, but gives us the
best bang for the buck.

Jeff



Re: [PATCH 3/5] name-hash: precompute hash values during preload-index

2017-02-17 Thread Junio C Hamano
Johannes Schindelin  writes:

> +void precompute_istate_hashes(struct cache_entry *ce)
> +{
> + int namelen = ce_namelen(ce);
> +
> + while (namelen > 0 && !is_dir_sep(ce->name[namelen - 1]))
> + namelen--;
> +
> + if (namelen <= 0) {
> + ce->precomputed_hash.name = memihash(ce->name, ce_namelen(ce));
> + ce->precomputed_hash.root_entry = 1;
> + } else {
> + namelen--;
> + ce->precomputed_hash.dir = memihash(ce->name, namelen);
> + ce->precomputed_hash.name = memihash_continue(
> + ce->precomputed_hash.dir, ce->name + namelen,
> + ce_namelen(ce) - namelen);
> + ce->precomputed_hash.root_entry = 0;
> + }
> + ce->precomputed_hash.initialized = 1;
> +}
> diff --git a/preload-index.c b/preload-index.c
> index c1fe3a3ef9c..602737f9d0f 100644
> --- a/preload-index.c
> +++ b/preload-index.c
> @@ -47,6 +47,8 @@ static void *preload_thread(void *_data)
>   struct cache_entry *ce = *cep++;
>   struct stat st;
>  
> + precompute_istate_hashes(ce);
> +

The fact that each preload_thread() still walks the index in-order
makes me wonder if it may allow us to further optimize the "dir"
part of the hash by passing the previous ce for which we already
precomputed hash values.  While the loop is iterating over the paths
in the same directory, .dir component from the previous ce can be
reused and .name component can "continue", no?

It's possible that you already tried such an optimization and
rejected it after finding that the cost of comparison of pathnames
to tell if ce and previous ce are still in the same directory is
more than unconditionally memihash() the directory part, and I am in
no way saying that I found a missed optimization opportunity you
must pursue.  I am just being curious.



[PATCH 3/5] name-hash: precompute hash values during preload-index

2017-02-14 Thread Johannes Schindelin
From: Jeff Hostetler 

Precompute the istate.name_hash and istate.dir_hash values
for each cache-entry during the preload-index phase.

Move the expensive memihash() calculations from lazy_init_name_hash()
to the multi-threaded preload-index phase.

Signed-off-by: Jeff Hostetler 
Signed-off-by: Johannes Schindelin 
---
 cache.h |  6 ++
 name-hash.c | 57 +++--
 preload-index.c |  2 ++
 read-cache.c|  3 +++
 4 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/cache.h b/cache.h
index 61fc86e6d71..56f7c97cdbe 100644
--- a/cache.h
+++ b/cache.h
@@ -173,6 +173,10 @@ struct cache_entry {
unsigned int ce_flags;
unsigned int ce_namelen;
unsigned int index; /* for link extension */
+   struct {
+   unsigned initialized:1, root_entry:1;
+   unsigned int name, dir;
+   } precomputed_hash;
struct object_id oid;
char name[FLEX_ARRAY]; /* more */
 };
@@ -229,6 +233,8 @@ struct cache_entry {
 #error "CE_EXTENDED_FLAGS out of range"
 #endif
 
+void precompute_istate_hashes(struct cache_entry *ce);
+
 /* Forward structure decls */
 struct pathspec;
 struct child_process;
diff --git a/name-hash.c b/name-hash.c
index ad0bc0cef73..49eb84846df 100644
--- a/name-hash.c
+++ b/name-hash.c
@@ -50,6 +50,10 @@ static struct dir_entry *hash_dir_entry(struct index_state *istate,
 */
struct dir_entry *dir;
unsigned int hash;
+   int orig_namelen = namelen;
+
+   if (ce->precomputed_hash.initialized && ce->precomputed_hash.root_entry)
+   return NULL; /* item does not have a parent directory */
 
/* get length of parent directory */
while (namelen > 0 && !is_dir_sep(ce->name[namelen - 1]))
@@ -59,7 +63,10 @@ static struct dir_entry *hash_dir_entry(struct index_state *istate,
namelen--;
 
/* lookup existing entry for that directory */
-   hash = memihash(ce->name, namelen);
+   if (ce->precomputed_hash.initialized && orig_namelen == ce_namelen(ce))
+   hash = ce->precomputed_hash.dir;
+   else
+   hash = memihash(ce->name, namelen);
dir = find_dir_entry_1(istate, ce->name, namelen, hash);
if (!dir) {
/* not found, create it and add to hash table */
@@ -99,10 +106,18 @@ static void remove_dir_entry(struct index_state *istate, struct cache_entry *ce)
 
 static void hash_index_entry(struct index_state *istate, struct cache_entry *ce)
 {
+   unsigned int h;
+
if (ce->ce_flags & CE_HASHED)
return;
ce->ce_flags |= CE_HASHED;
-   hashmap_entry_init(ce, memihash(ce->name, ce_namelen(ce)));
+
+   if (ce->precomputed_hash.initialized)
+   h = ce->precomputed_hash.name;
+   else
+   h = memihash(ce->name, ce_namelen(ce));
+
+   hashmap_entry_init(ce, h);
hashmap_add(&istate->name_hash, ce);
 
if (ignore_case)
@@ -244,3 +259,41 @@ void free_name_hash(struct index_state *istate)
hashmap_free(&istate->name_hash, 0);
hashmap_free(&istate->dir_hash, 1);
 }
+
+/*
+ * Precompute the hash values for this cache_entry
+ * for use in the istate.name_hash and istate.dir_hash.
+ *
+ * If the item is in the root directory, just compute the hash value (for
+ * istate.name_hash) on the full path.
+ *
+ * If the item is in a subdirectory, first compute the hash value for the
+ * immediate parent directory (for istate.dir_hash) and then the hash value for
+ * the full path by continuing the computation.
+ *
+ * Note that these hashes will be used by wt_status_collect_untracked() as it
+ * scans the worktree and maps observed paths back to the index (optionally
+ * ignoring case). Technically, we only *NEED* to precompute this for
+ * non-skip-worktree items (since status should not observe skipped items), but
+ * because lazy_init_name_hash() hashes everything, we force it here.
+ */
+void precompute_istate_hashes(struct cache_entry *ce)
+{
+   int namelen = ce_namelen(ce);
+
+   while (namelen > 0 && !is_dir_sep(ce->name[namelen - 1]))
+   namelen--;
+
+   if (namelen <= 0) {
+   ce->precomputed_hash.name = memihash(ce->name, ce_namelen(ce));
+   ce->precomputed_hash.root_entry = 1;
+   } else {
+   namelen--;
+   ce->precomputed_hash.dir = memihash(ce->name, namelen);
+   ce->precomputed_hash.name = memihash_continue(
+   ce->precomputed_hash.dir, ce->name + namelen,
+   ce_namelen(ce) - namelen);
+   ce->precomputed_hash.root_entry = 0;
+   }
+   ce->precomputed_hash.initialized = 1;
+}
diff --git a/preload-index.c b/preload-index.c
index c1fe3a3ef9c..602737f9d0f 100644
--- a/preload-index.c
+++ b/preload-index.c
@@ -47,6 +47,8 @@ static void *preload_thread(void *_data)
struct cache_entry *ce =