[PATCH 16/16] write_sha1_file: freshen existing objects

2014-10-03 Thread Jeff King
When we try to write a loose object file, we first check
whether that object already exists. If so, we skip the
write as an optimization. However, this can interfere with
prune's strategy of using mtimes to mark files in progress.

For example, if a branch contains a particular tree object
and is deleted, that tree object may become unreachable, and
have an old mtime. If a new operation then tries to write
the same tree, this ends up as a noop; we notice we
already have the object and do nothing. A prune running
simultaneously with this operation will see the object as
old, and may delete it.

We can solve this by "freshening" objects that we avoid
writing by updating their mtime. The algorithm for doing so
is essentially the same as that of has_sha1_file. Therefore
we provide a new (static) interface "check_and_freshen",
which finds and optionally freshens the object. It's trivial
to implement freshening and simple checking by tweaking a
single parameter.

Signed-off-by: Jeff King 
---
 sha1_file.c| 51 +++---
 t/t6501-freshen-objects.sh | 27 
 2 files changed, 71 insertions(+), 7 deletions(-)

diff --git a/sha1_file.c b/sha1_file.c
index d017289..d263b38 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -442,27 +442,53 @@ void prepare_alt_odb(void)
read_info_alternates(get_object_directory(), 0);
 }
 
-static int has_loose_object_local(const unsigned char *sha1)
+static int freshen_file(const char *fn)
 {
-   return !access(sha1_file_name(sha1), F_OK);
+   struct utimbuf t;
+   t.actime = t.modtime = time(NULL);
+   return utime(fn, &t);
 }
 
-int has_loose_object_nonlocal(const unsigned char *sha1)
+static int check_and_freshen_file(const char *fn, int freshen)
+{
+   if (access(fn, F_OK))
+   return 0;
+   if (freshen && freshen_file(fn))
+   return 0;
+   return 1;
+}
+
+static int check_and_freshen_local(const unsigned char *sha1, int freshen)
+{
+   return check_and_freshen_file(sha1_file_name(sha1), freshen);
+}
+
+static int check_and_freshen_nonlocal(const unsigned char *sha1, int freshen)
 {
struct alternate_object_database *alt;
prepare_alt_odb();
for (alt = alt_odb_list; alt; alt = alt->next) {
fill_sha1_path(alt->name, sha1);
-   if (!access(alt->base, F_OK))
+   if (check_and_freshen_file(alt->base, freshen))
return 1;
}
return 0;
 }
 
+static int check_and_freshen(const unsigned char *sha1, int freshen)
+{
+   return check_and_freshen_local(sha1, freshen) ||
+  check_and_freshen_nonlocal(sha1, freshen);
+}
+
+int has_loose_object_nonlocal(const unsigned char *sha1)
+{
+   return check_and_freshen_nonlocal(sha1, 0);
+}
+
 static int has_loose_object(const unsigned char *sha1)
 {
-   return has_loose_object_local(sha1) ||
-  has_loose_object_nonlocal(sha1);
+   return check_and_freshen(sha1, 0);
 }
 
 static unsigned int pack_used_ctr;
@@ -2949,6 +2975,17 @@ static int write_loose_object(const unsigned char *sha1, 
char *hdr, int hdrlen,
return move_temp_to_file(tmp_file, filename);
 }
 
+static int freshen_loose_object(const unsigned char *sha1)
+{
+   return check_and_freshen(sha1, 1);
+}
+
+static int freshen_packed_object(const unsigned char *sha1)
+{
+   struct pack_entry e;
+   return find_pack_entry(sha1, &e) && freshen_file(e.p->pack_name);
+}
+
 int write_sha1_file(const void *buf, unsigned long len, const char *type, 
unsigned char *returnsha1)
 {
unsigned char sha1[20];
@@ -2961,7 +2998,7 @@ int write_sha1_file(const void *buf, unsigned long len, 
const char *type, unsign
write_sha1_file_prepare(buf, len, type, sha1, hdr, &hdrlen);
if (returnsha1)
hashcpy(returnsha1, sha1);
-   if (has_sha1_file(sha1))
+   if (freshen_loose_object(sha1) || freshen_packed_object(sha1))
return 0;
return write_loose_object(sha1, hdr, hdrlen, buf, len, 0);
 }
diff --git a/t/t6501-freshen-objects.sh b/t/t6501-freshen-objects.sh
index e25c47d..157f3f9 100755
--- a/t/t6501-freshen-objects.sh
+++ b/t/t6501-freshen-objects.sh
@@ -100,6 +100,33 @@ for repack in '' true; do
test_expect_success "repository passes fsck ($title)" '
git fsck
'
+
+   test_expect_success "abandon objects again ($title)" '
+   git reset --hard HEAD^ &&
+   find .git/objects -type f |
+   xargs test-chmtime -v -86400
+   '
+
+   test_expect_success "start writing new commit with same tree ($title)" '
+   tree=$(
+   GIT_INDEX_FILE=index.tmp &&
+   export GIT_INDEX_FILE &&
+   git read-tree HEAD &&
+   add abandon &&
+   add unrelated &&
+   git write-tree
+ 

Re: [PATCH 16/16] write_sha1_file: freshen existing objects

2014-10-03 Thread Junio C Hamano
Jeff King  writes:

> When we try to write a loose object file, we first check
> whether that object already exists. If so, we skip the
> write as an optimization. However, this can interfere with
> prune's strategy of using mtimes to mark files in progress.
>
> For example, if a branch contains a particular tree object
> and is deleted, that tree object may become unreachable, and
> have an old mtime. If a new operation then tries to write
> the same tree, this ends up as a noop; we notice we
> already have the object and do nothing. A prune running
> simultaneously with this operation will see the object as
> old, and may delete it.
>
> We can solve this by "freshening" objects that we avoid
> writing by updating their mtime. The algorithm for doing so
> is essentially the same as that of has_sha1_file. Therefore
> we provide a new (static) interface "check_and_freshen",
> which finds and optionally freshens the object. It's trivial
> to implement freshening and simple checking by tweaking a
> single parameter.

An old referent by a recent unreachable may be in pack.  Is it
expected that the same pack will have many similar old objects (in
other words, is it worth trying to optimize check-and-freshen by
bypassing access() and utime(), perhaps by keeping a "freshened in
this process already" flag in struct packed_git)?

Could check-and-freshen-nonlocal() ever be called with freshen set
to true?  Should it be?  In other words, should we be mucking with
objects in other people's repositories with utime()?

Other than these two minor questions, I am happy with this patch.

Thanks.

> Signed-off-by: Jeff King 
> ---
>  sha1_file.c| 51 
> +++---
>  t/t6501-freshen-objects.sh | 27 
>  2 files changed, 71 insertions(+), 7 deletions(-)
>
> diff --git a/sha1_file.c b/sha1_file.c
> index d017289..d263b38 100644
> --- a/sha1_file.c
> +++ b/sha1_file.c
> @@ -442,27 +442,53 @@ void prepare_alt_odb(void)
>   read_info_alternates(get_object_directory(), 0);
>  }
>  
> -static int has_loose_object_local(const unsigned char *sha1)
> +static int freshen_file(const char *fn)
>  {
> - return !access(sha1_file_name(sha1), F_OK);
> + struct utimbuf t;
> + t.actime = t.modtime = time(NULL);
> + return utime(fn, &t);
>  }
>  
> -int has_loose_object_nonlocal(const unsigned char *sha1)
> +static int check_and_freshen_file(const char *fn, int freshen)
> +{
> + if (access(fn, F_OK))
> + return 0;
> + if (freshen && freshen_file(fn))
> + return 0;
> + return 1;
> +}
> +
> +static int check_and_freshen_local(const unsigned char *sha1, int freshen)
> +{
> + return check_and_freshen_file(sha1_file_name(sha1), freshen);
> +}
> +
> +static int check_and_freshen_nonlocal(const unsigned char *sha1, int freshen)
>  {
>   struct alternate_object_database *alt;
>   prepare_alt_odb();
>   for (alt = alt_odb_list; alt; alt = alt->next) {
>   fill_sha1_path(alt->name, sha1);
> - if (!access(alt->base, F_OK))
> + if (check_and_freshen_file(alt->base, freshen))
>   return 1;
>   }
>   return 0;
>  }
>  
> +static int check_and_freshen(const unsigned char *sha1, int freshen)
> +{
> + return check_and_freshen_local(sha1, freshen) ||
> +check_and_freshen_nonlocal(sha1, freshen);
> +}
> +
> +int has_loose_object_nonlocal(const unsigned char *sha1)
> +{
> + return check_and_freshen_nonlocal(sha1, 0);
> +}
> +
>  static int has_loose_object(const unsigned char *sha1)
>  {
> - return has_loose_object_local(sha1) ||
> -has_loose_object_nonlocal(sha1);
> + return check_and_freshen(sha1, 0);
>  }
>  
>  static unsigned int pack_used_ctr;
> @@ -2949,6 +2975,17 @@ static int write_loose_object(const unsigned char 
> *sha1, char *hdr, int hdrlen,
>   return move_temp_to_file(tmp_file, filename);
>  }
>  
> +static int freshen_loose_object(const unsigned char *sha1)
> +{
> + return check_and_freshen(sha1, 1);
> +}
> +
> +static int freshen_packed_object(const unsigned char *sha1)
> +{
> + struct pack_entry e;
> + return find_pack_entry(sha1, &e) && freshen_file(e.p->pack_name);
> +}
> +
>  int write_sha1_file(const void *buf, unsigned long len, const char *type, 
> unsigned char *returnsha1)
>  {
>   unsigned char sha1[20];
> @@ -2961,7 +2998,7 @@ int write_sha1_file(const void *buf, unsigned long len, 
> const char *type, unsign
>   write_sha1_file_prepare(buf, len, type, sha1, hdr, &hdrlen);
>   if (returnsha1)
>   hashcpy(returnsha1, sha1);
> - if (has_sha1_file(sha1))
> + if (freshen_loose_object(sha1) || freshen_packed_object(sha1))
>   return 0;
>   return write_loose_object(sha1, hdr, hdrlen, buf, len, 0);
>  }
> diff --git a/t/t6501-freshen-objects.sh b/t/t6501-freshen-objects.sh
> index e25c47d..157f3f9 100755
> --- a/t/t6501-freshen-obje

Re: [PATCH 16/16] write_sha1_file: freshen existing objects

2014-10-03 Thread Jeff King
On Fri, Oct 03, 2014 at 02:29:58PM -0700, Junio C Hamano wrote:

> > We can solve this by "freshening" objects that we avoid
> > writing by updating their mtime. The algorithm for doing so
> > is essentially the same as that of has_sha1_file. Therefore
> > we provide a new (static) interface "check_and_freshen",
> > which finds and optionally freshens the object. It's trivial
> > to implement freshening and simple checking by tweaking a
> > single parameter.
> 
> An old referent by a recent unreachable may be in pack.  Is it
> expected that the same pack will have many similar old objects (in
> other words, is it worth trying to optimize check-and-freshen by
> bypassing access() and utime(), perhaps by keeping a "freshened in
> this process already" flag in struct packed_git)?

Thanks for reminding me. I considered something like that early on and
then completely forgot to revisit it. I do not have numbers either way
on whether it is an optimization worth doing. On the one hand, it is
very easy to do.  On the other, it probably does not make a big
difference; we are literally skipping the write of an entire object, and
have just run a complete sha1 over the contents. A single utime() call
probably is not a big deal.

> Could check-and-freshen-nonlocal() ever be called with freshen set
> to true?  Should it be?  In other words, should we be mucking with
> objects in other people's repositories with utime()?

Yes, it can, and I think the answer to "should" is "yes" for safety,
though I agree it feels a little hacky. I did explicitly write it so
that we fail-safe when freshening doesn't work. That is, if we try to
freshen an object that is in an alternate and we cannot (e.g., because
we don't have write access), we'll fallback to writing out a new loose
object locally.

That's very much the safest thing to do, but obviously it performs less
well. Again, this is the code path where we _would have_ written out the
object anyway, so it might not be that bad. But I don't know to what
degree the current code relies on that optimization for reasonable
performance. E.g., if you clone from a read-only alternate and then try
to `git write-tree` immediately on the index, will we literally make a
full copy of each tree object?

Hmm, that should be easy to test...

  $ su - nobody
  $ git clone -s ~peff/compile/linux /tmp/foo
  $ cd /tmp/foo

  $ git count-objects
  0 objects, 0 kilobytes
  $ git write-tree
  $ git count-objects
  0 objects, 0 kilobytes

So far so good. Let's blow away the cache-tree to make sure...

  $ rm .git/index
  $ git read-tree HEAD
  $ git write-tree
  $ git count-objects
  0 objects, 0 kilobytes

So that's promising. But it's far from a proof that there isn't some
other code path that will be negatively impacted.

-Peff
--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 16/16] write_sha1_file: freshen existing objects

2014-10-05 Thread René Scharfe

Am 03.10.2014 um 22:41 schrieb Jeff King:

When we try to write a loose object file, we first check
whether that object already exists. If so, we skip the
write as an optimization. However, this can interfere with
prune's strategy of using mtimes to mark files in progress.

For example, if a branch contains a particular tree object
and is deleted, that tree object may become unreachable, and
have an old mtime. If a new operation then tries to write
the same tree, this ends up as a noop; we notice we
already have the object and do nothing. A prune running
simultaneously with this operation will see the object as
old, and may delete it.

We can solve this by "freshening" objects that we avoid
writing by updating their mtime. The algorithm for doing so
is essentially the same as that of has_sha1_file. Therefore
we provide a new (static) interface "check_and_freshen",
which finds and optionally freshens the object. It's trivial
to implement freshening and simple checking by tweaking a
single parameter.

Signed-off-by: Jeff King 
---
  sha1_file.c| 51 +++---
  t/t6501-freshen-objects.sh | 27 
  2 files changed, 71 insertions(+), 7 deletions(-)

diff --git a/sha1_file.c b/sha1_file.c
index d017289..d263b38 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -442,27 +442,53 @@ void prepare_alt_odb(void)
read_info_alternates(get_object_directory(), 0);
  }

-static int has_loose_object_local(const unsigned char *sha1)
+static int freshen_file(const char *fn)
  {
-   return !access(sha1_file_name(sha1), F_OK);
+   struct utimbuf t;
+   t.actime = t.modtime = time(NULL);
+   return utime(fn, &t);
  }


Mental note: freshen_file() returns 0 on success.



-int has_loose_object_nonlocal(const unsigned char *sha1)
+static int check_and_freshen_file(const char *fn, int freshen)
+{
+   if (access(fn, F_OK))
+   return 0;
+   if (freshen && freshen_file(fn))
+   return 0;
+   return 1;
+}


Returns 1 on success.


+
+static int check_and_freshen_local(const unsigned char *sha1, int freshen)
+{
+   return check_and_freshen_file(sha1_file_name(sha1), freshen);
+}


Returns 1 on success.


+
+static int check_and_freshen_nonlocal(const unsigned char *sha1, int freshen)
  {
struct alternate_object_database *alt;
prepare_alt_odb();
for (alt = alt_odb_list; alt; alt = alt->next) {
fill_sha1_path(alt->name, sha1);
-   if (!access(alt->base, F_OK))
+   if (check_and_freshen_file(alt->base, freshen))
return 1;
}
return 0;
  }


Returns 1 on success.



+static int check_and_freshen(const unsigned char *sha1, int freshen)
+{
+   return check_and_freshen_local(sha1, freshen) ||
+  check_and_freshen_nonlocal(sha1, freshen);
+}


Returns 1 on success.


+
+int has_loose_object_nonlocal(const unsigned char *sha1)
+{
+   return check_and_freshen_nonlocal(sha1, 0);
+}
+
  static int has_loose_object(const unsigned char *sha1)
  {
-   return has_loose_object_local(sha1) ||
-  has_loose_object_nonlocal(sha1);
+   return check_and_freshen(sha1, 0);
  }

  static unsigned int pack_used_ctr;
@@ -2949,6 +2975,17 @@ static int write_loose_object(const unsigned char *sha1, 
char *hdr, int hdrlen,
return move_temp_to_file(tmp_file, filename);
  }

+static int freshen_loose_object(const unsigned char *sha1)
+{
+   return check_and_freshen(sha1, 1);
+}


Returns 1 on success.


+
+static int freshen_packed_object(const unsigned char *sha1)
+{
+   struct pack_entry e;
+   return find_pack_entry(sha1, &e) && freshen_file(e.p->pack_name);
+}


Returns 1 if a pack entry is found and freshen_file() fails, and 0 if no 
entry is found or freshen_file() succeeds.


It should be "&& !freshen(...)" instead, no?

Or better, let freshen_file() return 1 on success as the other functions 
here.



+
  int write_sha1_file(const void *buf, unsigned long len, const char *type, 
unsigned char *returnsha1)
  {
unsigned char sha1[20];
@@ -2961,7 +2998,7 @@ int write_sha1_file(const void *buf, unsigned long len, 
const char *type, unsign
write_sha1_file_prepare(buf, len, type, sha1, hdr, &hdrlen);
if (returnsha1)
hashcpy(returnsha1, sha1);
-   if (has_sha1_file(sha1))
+   if (freshen_loose_object(sha1) || freshen_packed_object(sha1))
return 0;
return write_loose_object(sha1, hdr, hdrlen, buf, len, 0);
  }
diff --git a/t/t6501-freshen-objects.sh b/t/t6501-freshen-objects.sh
index e25c47d..157f3f9 100755
--- a/t/t6501-freshen-objects.sh
+++ b/t/t6501-freshen-objects.sh
@@ -100,6 +100,33 @@ for repack in '' true; do
test_expect_success "repository passes fsck ($title)" '
git fsck
'
+
+   test_expect_success "abandon objects again ($title)" '
+   git reset -