Karsten Blees <karsten.bl...@gmail.com> writes:

> With core.ignorecase=true, name-hash.c builds a case insensitive index of
> all tracked directories. Currently, the existing cache entry structures are
> added multiple times to the same hashtable (with different name lengths and
> hash codes). However, there's only one dir_next pointer, which gets
> completely messed up in case of hash collisions. In the worst case, this
> causes an endless loop if ce == ce->dir_next:
>
> ---8<---
> # "V/", "V/XQANY/" and "WURZAUP/" all have the same hash_name
> mkdir V
> mkdir V/XQANY
> mkdir WURZAUP
> touch V/XQANY/test
> git init
> git config core.ignorecase true
> git add .
> git status
> ---8<---

Instead of using the scissors mark to confuse "am -c", indenting
this block would have been easier to later readers.

Also it is somewhat a shame that we do not use the above sample
collisions in a new test case.

> +static struct dir_entry *hash_dir_entry(struct index_state *istate,
> +             struct cache_entry *ce, int namelen, int add)
>  {
>       /*
>        * Throw each directory component in the hash for quick lookup
>        * during a git status. Directory components are stored with their
> -      * closing slash.  Despite submodules being a directory, they never
> -      * reach this point, because they are stored without a closing slash
> -      * in the cache.

Is the description of submodule no longer relevant?

> -      * Note that the cache_entry stored with the directory does not
> -      * represent the directory itself.  It is a pointer to an existing
> -      * filename, and its only purpose is to represent existence of the
> -      * directory in the cache.  It is very possible multiple directory
> -      * hash entries may point to the same cache_entry.

Is this paragraph no longer relevant?  It seems to me that it still
holds true, given the way how dir->ce points at the given ce.

> +      * closing slash.
>        */
> +     struct dir_entry *dir, *p;
> +
> +     /* get length of parent directory */
> +     while (namelen > 0 && !is_dir_sep(ce->name[namelen - 1]))
> +             namelen--;
> +     if (namelen <= 0)
> +             return NULL;
> +
> +     /* lookup existing entry for that directory */
> +     dir = find_dir_entry(istate, ce->name, namelen);
> +     if (add && !dir) {
> ...
>       }
> +
> +     /* add or release reference to this entry (and parents if 0) */
> +     p = dir;
> +     if (add) {
> +             while (p && !(p->nr++))
> +                     p = p->parent;
> +     } else {
> +             while (p && p->nr && !(--p->nr))
> +                     p = p->parent;
> +     }

Can we free the entry when its refcnt goes down to zero?  If yes, is
it worth doing so?

> +
> +     return dir;
>  }
>  
>  static void hash_index_entry(struct index_state *istate, struct cache_entry 
> *ce)
> @@ -74,7 +111,7 @@ static void hash_index_entry(struct index_state *istate, 
> struct cache_entry *ce)
>       if (ce->ce_flags & CE_HASHED)
>               return;
>       ce->ce_flags |= CE_HASHED;
> -     ce->next = ce->dir_next = NULL;
> +     ce->next = NULL;
>       hash = hash_name(ce->name, ce_namelen(ce));
>       pos = insert_hash(hash, ce, &istate->name_hash);
>       if (pos) {
> @@ -82,8 +119,8 @@ static void hash_index_entry(struct index_state *istate, 
> struct cache_entry *ce)
>               *pos = ce;
>       }
>  
> -     if (ignore_case)
> -             hash_index_entry_directories(istate, ce);
> +     if (ignore_case && !(ce->ce_flags & CE_UNHASHED))
> +             hash_dir_entry(istate, ce, ce_namelen(ce), 1);
>  }
>  
>  static void lazy_init_name_hash(struct index_state *istate)
> @@ -99,11 +136,33 @@ static void lazy_init_name_hash(struct index_state 
> *istate)
>  
>  void add_name_hash(struct index_state *istate, struct cache_entry *ce)
>  {
> +     /* if already hashed, add reference to directory entries */
> +     if (ignore_case && (ce->ce_flags & CE_STATE_MASK) == CE_STATE_MASK)
> +             hash_dir_entry(istate, ce, ce_namelen(ce), 1);

Instead of a single function with "are we adding or removing?"
parameter, it would be a lot easier to read the callers if they are
wrapped in two helpers, add_dir_entry() and del_dir_entry() or
something, especially when the add=[0|1] parameter is constant for
each and every callsite (i.e. the codeflow determines it, not the
data).

>       ce->ce_flags &= ~CE_UNHASHED;
>       if (istate->name_hash_initialized)
>               hash_index_entry(istate, ce);
>  }
>  
> +/*
> + * We don't actually *remove* it, we can just mark it invalid so that
> + * we won't find it in lookups.
> + *
> + * Not only would we have to search the lists (simple enough), but
> + * we'd also have to rehash other hash buckets in case this makes the
> + * hash bucket empty (common). So it's much better to just mark
> + * it.
> + */
> +void remove_name_hash(struct index_state *istate, struct cache_entry *ce)
> +{
> +     /* if already hashed, release reference to directory entries */
> +     if (ignore_case && (ce->ce_flags & CE_STATE_MASK) == CE_HASHED)
> +             hash_dir_entry(istate, ce, ce_namelen(ce), 0);

And here as well.

> +
> +     ce->ce_flags |= CE_UNHASHED;
> +}
> +
>  static int slow_same_name(const char *name1, int len1, const char *name2, 
> int len2)
>  {
>       if (len1 != len2)
> @@ -137,18 +196,7 @@ static int same_name(const struct cache_entry *ce, const 
> char *name, int namelen
>       if (!icase)
>               return 0;
>  
> -     /*
> -      * If the entry we're comparing is a filename (no trailing slash), then 
> compare
> -      * the lengths exactly.
> -      */
> -     if (name[namelen - 1] != '/')
> -             return slow_same_name(name, namelen, ce->name, len);
> -
> -     /*
> -      * For a directory, we point to an arbitrary cache_entry filename.  Just
> -      * make sure the directory portion matches.
> -      */
> -     return slow_same_name(name, namelen, ce->name, namelen < len ? namelen 
> : len);
> +     return slow_same_name(name, namelen, ce->name, len);

Hmph, what is this change about?  Nobody calls same_name() with a
directory name anymore or something?

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to