Jeff King <p...@peff.net> writes:

>> OK, as Linus's "count at the point of use" is already in 'next',
>> could you make it incremental with a log message?
>
> Sure. I wasn't sure if you actually liked my direction or not, so I was
> mostly just showing off what the completed one would look like.

To be quite honest, I am not just unsure if I liked your direction;
rather, I am not sure I actually understood what you perceived as a
difference that matters between the two approaches.  I wanted to hear
you explain the difference in terms of "Linus's does this, but it is
bad in X and Y ways, so let's avoid it and do it like Z instead".  One
effective way to extract that out of you was to force you to justify
the "incremental" update, and it seems that I succeeded ;-)

I am still not sure if I 100% agree with your first paragraph, but at
least now I think I see where you are coming from.

You probably will hear from Ramsay about the extern-ness of msb().

> -- >8 --
> Subject: [PATCH] find_unique_abbrev: move logic out of get_short_sha1()
>
> The get_short_sha1() is only about reading short sha1s; we
> do call it in a loop to check "is this long enough" for each
> object, but otherwise it should not need to know about
> things like our default_abbrev setting.
>
> So instead of asking it to set default_automatic_abbrev as a
> side-effect, let's just have find_unique_abbrev() pick the
> right place to start its loop. This requires a separate
> approximate_object_count() function, but that naturally
> belongs with the rest of sha1_file.c.
>
> Signed-off-by: Jeff King <p...@peff.net>
> ---
>  cache.h     |  7 ++++++-
>  sha1_file.c | 27 +++++++++++++++++++++++++++
>  sha1_name.c | 60 +++++++++++++++++++++++++++++++++++-------------------------
>  3 files changed, 68 insertions(+), 26 deletions(-)
>
> diff --git a/cache.h b/cache.h
> index 0e2a059..f22ace5 100644
> --- a/cache.h
> +++ b/cache.h
> @@ -1204,7 +1204,6 @@ struct object_context {
>  #define GET_SHA1_TREEISH 020
>  #define GET_SHA1_BLOB 040
>  #define GET_SHA1_FOLLOW_SYMLINKS 0100
> -#define GET_SHA1_AUTOMATIC 0200
>  #define GET_SHA1_ONLY_TO_DIE 04000
>
>  #define GET_SHA1_DISAMBIGUATORS \
> @@ -1456,6 +1455,12 @@ extern void prepare_packed_git(void);
>  extern void reprepare_packed_git(void);
>  extern void install_packed_git(struct packed_git *pack);
>
> +/*
> + * Give a rough count of objects in the repository. This sacrifices accuracy
> + * for speed.
> + */
> +unsigned long approximate_object_count(void);
> +
>  extern struct packed_git *find_sha1_pack(const unsigned char *sha1,
>                                           struct packed_git *packs);
>
> diff --git a/sha1_file.c b/sha1_file.c
> index b9c1fa3..4882440 100644
> --- a/sha1_file.c
> +++ b/sha1_file.c
> @@ -1381,6 +1381,32 @@ static void prepare_packed_git_one(char *objdir, int local)
>          strbuf_release(&path);
>  }
>
> +static int approximate_object_count_valid;
> +
> +/*
> + * Give a fast, rough count of the number of objects in the repository. This
> + * ignores loose objects completely. If you have a lot of them, then either
> + * you should repack because your performance will be awful, or they are
> + * all unreachable objects about to be pruned, in which case they're not
> + * really interesting as a measure of repo size in the first place.
> + */
> +unsigned long approximate_object_count(void)
> +{
> +        static unsigned long count;
> +        if (!approximate_object_count_valid) {
> +                struct packed_git *p;
> +
> +                prepare_packed_git();
> +                count = 0;
> +                for (p = packed_git; p; p = p->next) {
> +                        if (open_pack_index(p))
> +                                continue;
> +                        count += p->num_objects;
> +                }
> +        }
> +        return count;
> +}
> +
>  static void *get_next_packed_git(const void *p)
>  {
>          return ((const struct packed_git *)p)->next;
> @@ -1455,6 +1481,7 @@ void prepare_packed_git(void)
>
>  void reprepare_packed_git(void)
>  {
> +        approximate_object_count_valid = 0;
>          prepare_packed_git_run_once = 0;
>          prepare_packed_git();
>  }
> diff --git a/sha1_name.c b/sha1_name.c
> index beb7ab5..76e6885 100644
> --- a/sha1_name.c
> +++ b/sha1_name.c
> @@ -15,7 +15,6 @@ typedef int (*disambiguate_hint_fn)(const unsigned char *, void *);
>
>  struct disambiguate_state {
>          int len; /* length of prefix in hex chars */
> -        unsigned int nrobjects;
>          char hex_pfx[GIT_SHA1_HEXSZ + 1];
>          unsigned char bin_pfx[GIT_SHA1_RAWSZ];
>
> @@ -119,14 +118,6 @@ static void find_short_object_filename(struct disambiguate_state *ds)
>
>                  if (strlen(de->d_name) != 38)
>                          continue;
> -
> -                /*
> -                 * We only look at the one subdirectory, and we assume
> -                 * each subdirectory is roughly similar, so each
> -                 * object we find probably has 255 other objects in
> -                 * the other fan-out directories.
> -                 */
> -                ds->nrobjects += 256;
>                  if (memcmp(de->d_name, ds->hex_pfx + 2, ds->len - 2))
>                          continue;
>                  memcpy(hex + 2, de->d_name, 38);
> @@ -160,7 +151,6 @@ static void unique_in_pack(struct packed_git *p,
>
>          open_pack_index(p);
>          num = p->num_objects;
> -        ds->nrobjects += num;
>          last = num;
>          while (first < last) {
>                  uint32_t mid = (first + last) / 2;
> @@ -390,9 +380,6 @@ static int show_ambiguous_object(const unsigned char *sha1, void *data)
>          return 0;
>  }
>
> -/* start from our historical default before the automatic abbreviation */
> -static int default_automatic_abbrev = FALLBACK_DEFAULT_ABBREV;
> -
>  static int get_short_sha1(const char *name, int len, unsigned char *sha1,
>                            unsigned flags)
>  {
> @@ -439,14 +426,6 @@ static int get_short_sha1(const char *name, int len, unsigned char *sha1,
>                  for_each_abbrev(ds.hex_pfx, show_ambiguous_object, &ds);
>          }
>
> -        if (len < 16 && !status && (flags & GET_SHA1_AUTOMATIC)) {
> -                unsigned int expect_collision = 1 << (len * 2);
> -                if (ds.nrobjects > expect_collision) {
> -                        default_automatic_abbrev = len+1;
> -                        return SHORT_NAME_AMBIGUOUS;
> -                }
> -        }
> -
>          return status;
>  }
>
> @@ -476,22 +455,53 @@ int for_each_abbrev(const char *prefix, each_abbrev_fn fn, void *cb_data)
>          return ret;
>  }
>
> +/*
> + * Return the slot of the most-significant bit set in "val". There are various
> + * ways to do this quickly with fls() or __builtin_clzl(), but speed is
> + * probably not a big deal here.
> + */
> +unsigned msb(unsigned long val)
> +{
> +        unsigned r = 0;
> +        while (val >>= 1)
> +                r++;
> +        return r;
> +}
> +
>  int find_unique_abbrev_r(char *hex, const unsigned char *sha1, int len)
>  {
>          int status, exists;
> -        int flags = GET_SHA1_QUIETLY;
>
>          if (len < 0) {
> -                flags |= GET_SHA1_AUTOMATIC;
> -                len = default_automatic_abbrev;
> +                unsigned long count = approximate_object_count();
> +                /*
> +                 * Add one because the MSB only tells us the highest bit set,
> +                 * not including the value of all the _other_ bits (so "15"
> +                 * is only one off of 2^4, but the MSB is the 3rd bit).
> +                 */
> +                len = msb(count) + 1;
> +                /*
> +                 * We now know we have on the order of 2^len objects, which
> +                 * expects a collision at 2^(len/2). But we also care about hex
> +                 * chars, not bits, and there are 4 bits per hex. So all
> +                 * together we need to divide by 2; but we also want to round
> +                 * odd numbers up, hence adding one before dividing.
> +                 */
> +                len = (len + 1) / 2;
> +                /*
> +                 * For very small repos, we stick with our regular fallback.
> +                 */
> +                if (len < FALLBACK_DEFAULT_ABBREV)
> +                        len = FALLBACK_DEFAULT_ABBREV;
>          }
> +
>          sha1_to_hex_r(hex, sha1);
>          if (len == 40 || !len)
>                  return 40;
>          exists = has_sha1_file(sha1);
>          while (len < 40) {
>                  unsigned char sha1_ret[20];
> -                status = get_short_sha1(hex, len, sha1_ret, flags);
> +                status = get_short_sha1(hex, len, sha1_ret, GET_SHA1_QUIETLY);
>                  if (exists
>                      ? !status
>                      : status == SHORT_NAME_NOT_FOUND) {
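To make the auto-sizing in find_unique_abbrev_r() above easier to
follow, here is a small standalone sketch of the same arithmetic.  It
is not part of the patch; auto_abbrev_len() is a made-up helper for
illustration, and FALLBACK_DEFAULT_ABBREV is assumed to be 7, the
historical default.

    #include <stdio.h>

    #define FALLBACK_DEFAULT_ABBREV 7   /* assumed fallback of 7 hex digits */

    /* same loop as the patch's msb(): index of the highest bit set in "val" */
    static unsigned msb(unsigned long val)
    {
            unsigned r = 0;
            while (val >>= 1)
                    r++;
            return r;
    }

    /* mirrors the "len < 0" branch of find_unique_abbrev_r() in the patch */
    static int auto_abbrev_len(unsigned long count)
    {
            int len = msb(count) + 1;   /* about 2^len objects */
            len = (len + 1) / 2;        /* collisions expected around len/2 hex digits; round up */
            if (len < FALLBACK_DEFAULT_ABBREV)
                    len = FALLBACK_DEFAULT_ABBREV;
            return len;
    }

    int main(void)
    {
            unsigned long counts[] = { 10000, 50000, 500000, 5000000, 50000000 };
            size_t i;

            for (i = 0; i < sizeof(counts) / sizeof(*counts); i++)
                    printf("%10lu objects -> abbrev %d\n",
                           counts[i], auto_abbrev_len(counts[i]));
            return 0;
    }

Running this gives 7, 8, 10, 12 and 13 hex digits for those counts;
anything below 16384 objects stays at the 7-digit fallback.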