Re: [PATCH 07/18] sha1_file: support lazily fetching missing objects

2017-10-12 Thread Christian Couder
On Thu, Oct 12, 2017 at 4:42 PM, Christian Couder
 wrote:
>
> Instead of adding labels and gotos, I would suggest adding a new
> function like the following does on top of your changes:

Sorry for the usual Gmail related problems. You can find the patch and
a further refactoring that adds an object_info_from_pack_entry()
function there:

https://github.com/chriscool/git/commits/pu-partial-clone-lazy-fetch-improved


Re: [PATCH 07/18] sha1_file: support lazily fetching missing objects

2017-10-12 Thread Christian Couder
On Fri, Sep 29, 2017 at 10:11 PM, Jonathan Tan  wrote:

> diff --git a/sha1_file.c b/sha1_file.c
> index b4a67bb83..77aa0ffdf 100644
> --- a/sha1_file.c
> +++ b/sha1_file.c
> @@ -29,6 +29,7 @@
>  #include "mergesort.h"
>  #include "quote.h"
>  #include "packfile.h"
> +#include "fetch-object.h"
>
>  const unsigned char null_sha1[GIT_MAX_RAWSZ];
>  const struct object_id null_oid;
> @@ -1149,6 +1150,8 @@ static int sha1_loose_object_info(const unsigned char 
> *sha1,
> return (status < 0) ? status : 0;
>  }
>
> +int fetch_if_missing = 1;
> +
>  int sha1_object_info_extended(const unsigned char *sha1, struct object_info 
> *oi, unsigned flags)
>  {
> static struct object_info blank_oi = OBJECT_INFO_INIT;
> @@ -1157,6 +1160,7 @@ int sha1_object_info_extended(const unsigned char 
> *sha1, struct object_info *oi,
> const unsigned char *real = (flags & OBJECT_INFO_LOOKUP_REPLACE) ?
> lookup_replace_object(sha1) :
> sha1;
> +   int already_retried = 0;
>
> if (!oi)
> oi = _oi;
> @@ -1181,28 +1185,36 @@ int sha1_object_info_extended(const unsigned char 
> *sha1, struct object_info *oi,
> }
> }
>
> -   if (!find_pack_entry(real, )) {
> -   /* Most likely it's a loose object. */
> -   if (!sha1_loose_object_info(real, oi, flags))
> -   return 0;
> +retry:
> +   if (find_pack_entry(real, ))
> +   goto found_packed;
>
> -   /* Not a loose object; someone else may have just packed it. 
> */
> -   if (flags & OBJECT_INFO_QUICK) {
> -   return -1;
> -   } else {
> -   reprepare_packed_git();
> -   if (!find_pack_entry(real, ))
> -   return -1;
> -   }
> +   /* Most likely it's a loose object. */
> +   if (!sha1_loose_object_info(real, oi, flags))
> +   return 0;
> +
> +   /* Not a loose object; someone else may have just packed it. */
> +   reprepare_packed_git();
> +   if (find_pack_entry(real, ))
> +   goto found_packed;
> +
> +   /* Check if it is a missing object */
> +   if (fetch_if_missing && repository_format_partial_clone &&
> +   !already_retried) {
> +   fetch_object(repository_format_partial_clone, real);
> +   already_retried = 1;
> +   goto retry;
> }
>
> +   return -1;
> +
> +found_packed:
> if (oi == _oi)
> /*
>  * We know that the caller doesn't actually need the
>  * information below, so return early.
>  */
> return 0;
> -
> rtype = packed_object_info(e.p, e.offset, oi);
> if (rtype < 0) {
> mark_bad_packed_object(e.p, real);

Instead of adding labels and gotos, I would suggest adding a new
function like the following does on top of your changes:

diff --git a/sha1_file.c b/sha1_file.c
index cc1aa0bd7f..02a6ed1e9b 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -1171,18 +1171,40 @@ static int sha1_loose_object_info(const
unsigned char *sha1,
return (status < 0) ? status : 0;
 }

+int try_find_packed_entry_or_loose_object(const unsigned char *real,
struct object_info *oi,
+ unsigned flags, struct
pack_entry *e, int retry)
+{
+   if (find_pack_entry(real, e))
+   return 1;
+
+   /* Most likely it's a loose object. */
+   if (!sha1_loose_object_info(real, oi, flags))
+   return 0;
+
+   /* Not a loose object; someone else may have just packed it. */
+   reprepare_packed_git();
+   if (find_pack_entry(real, e))
+   return 1;
+
+   /* Check if it is a missing object */
+   if (fetch_if_missing && repository_format_partial_clone && retry) {
+   fetch_object(repository_format_partial_clone, real);
+   return try_find_packed_entry_or_loose_object(real, oi,
flags, e, 0);
+   }
+
+   return -1;
+}
+
 int fetch_if_missing = 1;

 int sha1_object_info_extended(const unsigned char *sha1, struct
object_info *oi, unsigned flags)
 {
static struct object_info blank_oi = OBJECT_INFO_INIT;
struct pack_entry e;
-   int rtype;
+   int rtype, res;
const unsigned char *real = (flags & OBJECT_INFO_LOOKUP_REPLACE) ?
lookup_replace_object(sha1) :
sha1;
-   int already_retried = 0;
-
if (!oi)
oi = _oi;

@@ -1206,30 +1228,10 @@ int sha1_object_info_extended(const unsigned
char *sha1, struct object_info *oi,
}
}

-retry:
-   if (find_pack_entry(real, ))
-   goto found_packed;
-
-   /* Most likely it's a loose object. */
- 

[PATCH 07/18] sha1_file: support lazily fetching missing objects

2017-09-29 Thread Jonathan Tan
Teach sha1_file to fetch objects from the remote configured in
extensions.partialClone whenever an object is requested but missing.

The fetching of objects can be suppressed through a global variable.
This is used by fsck and index-pack.

However, by default, such fetching is not suppressed. This is meant as a
temporary measure to ensure that all Git commands work in such a
situation. Future patches will update some commands to either tolerate
missing objects (without fetching them) or be more efficient in fetching
them.

In order to determine the code changes in sha1_file.c necessary, I
investigated the following:
 (1) functions in sha1_file.c that take in a hash, without the user
 regarding how the object is stored (loose or packed)
 (2) functions in packfile.c (because I need to check callers that know
 about the loose/packed distinction and operate on both differently,
 and ensure that they can handle the concept of objects that are
 neither loose nor packed)

(1) is handled by the modification to sha1_object_info_extended().

For (2), I looked at for_each_packed_object and others.  For
for_each_packed_object, the callers either already work or are fixed in
this patch:
 - reachable - only to find recent objects
 - builtin/fsck - already knows about missing objects
 - builtin/cat-file - warning message added in this commit

Callers of the other functions do not need to be changed:
 - parse_pack_index
   - http - indirectly from http_get_info_packs
   - find_pack_entry_one
 - this searches a single pack that is provided as an argument; the
   caller already knows (through other means) that the sought object
   is in a specific pack
 - find_sha1_pack
   - fast-import - appears to be an optimization to not store a file if
 it is already in a pack
   - http-walker - to search through a struct alt_base
   - http-push - to search through remote packs
 - has_sha1_pack
   - builtin/fsck - already knows about promisor objects
   - builtin/count-objects - informational purposes only (check if loose
 object is also packed)
   - builtin/prune-packed - check if object to be pruned is packed (if
 not, don't prune it)
   - revision - used to exclude packed objects if requested by user
   - diff - just for optimization

Signed-off-by: Jonathan Tan 
---
 builtin/cat-file.c   |  2 ++
 builtin/fetch-pack.c |  2 ++
 builtin/fsck.c   |  3 +++
 builtin/index-pack.c |  6 ++
 cache.h  |  8 
 fetch-object.c   |  3 +++
 sha1_file.c  | 38 
 t/t0410-partial-clone.sh | 51 
 8 files changed, 100 insertions(+), 13 deletions(-)

diff --git a/builtin/cat-file.c b/builtin/cat-file.c
index 4ccbfaac3..7aa1159ce 100644
--- a/builtin/cat-file.c
+++ b/builtin/cat-file.c
@@ -474,6 +474,8 @@ static int batch_objects(struct batch_options *opt)
 
for_each_loose_object(batch_loose_object, , 0);
for_each_packed_object(batch_packed_object, , 0);
+   if (repository_format_partial_clone)
+   warning("This repository has extensions.partialClone 
set. Some objects may not be loaded.");
 
cb.opt = opt;
cb.expand = 
diff --git a/builtin/fetch-pack.c b/builtin/fetch-pack.c
index 9f303cf98..9a7ebf6e9 100644
--- a/builtin/fetch-pack.c
+++ b/builtin/fetch-pack.c
@@ -53,6 +53,8 @@ int cmd_fetch_pack(int argc, const char **argv, const char 
*prefix)
struct oid_array shallow = OID_ARRAY_INIT;
struct string_list deepen_not = STRING_LIST_INIT_DUP;
 
+   fetch_if_missing = 0;
+
packet_trace_identity("fetch-pack");
 
memset(, 0, sizeof(args));
diff --git a/builtin/fsck.c b/builtin/fsck.c
index f6cb4d755..155ca663b 100644
--- a/builtin/fsck.c
+++ b/builtin/fsck.c
@@ -683,6 +683,9 @@ int cmd_fsck(int argc, const char **argv, const char 
*prefix)
int i;
struct alternate_object_database *alt;
 
+   /* fsck knows how to handle missing promisor objects */
+   fetch_if_missing = 0;
+
errors_found = 0;
check_replace_refs = 0;
 
diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 14ebb2f17..440558046 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -1657,6 +1657,12 @@ int cmd_index_pack(int argc, const char **argv, const 
char *prefix)
unsigned foreign_nr = 1;/* zero is a "good" value, assume bad */
int report_end_of_input = 0;
 
+   /*
+* index-pack never needs to fetch missing objects, since it only
+* accesses the repo to do hash collision checks
+*/
+   fetch_if_missing = 0;
+
if (argc == 2 && !strcmp(argv[1], "-h"))
usage(index_pack_usage);
 
diff --git a/cache.h b/cache.h
index 437206d06..58d3f8986 100644
--- a/cache.h
+++ b/cache.h
@@ -1721,6 +1721,14 @@ struct object_info {