[PATCH v2 09/12] fsck: verify commit-graph

2018-05-11 Thread Derrick Stolee
If core.commitGraph is true, verify the contents of the commit-graph
during 'git fsck' using the 'git commit-graph verify' subcommand. Run
this check on all alternates, as well.

We use a new process for two reasons:

1. The subcommand decouples the details of loading and verifying a
   commit-graph file from the other fsck details.

2. The commit-graph verification requires the commits to be loaded
   in a specific order to guarantee we parse from the commit-graph
   file for some objects and from the object database for others.

Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
 Documentation/git-fsck.txt |  3 +++
 builtin/fsck.c | 21 +
 t/t5318-commit-graph.sh| 21 ++---
 3 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/Documentation/git-fsck.txt b/Documentation/git-fsck.txt
index b9f060e3b2..ab9a93fb9b 100644
--- a/Documentation/git-fsck.txt
+++ b/Documentation/git-fsck.txt
@@ -110,6 +110,9 @@ Any corrupt objects you will have to find in backups or 
other archives
 (i.e., you can just remove them and do an 'rsync' with some other site in
 the hopes that somebody else has the object you have corrupted).
 
+If core.commitGraph is true, the commit-graph file will also be inspected
+using 'git commit-graph verify'. See linkgit:git-commit-graph[1].
+
 Extracted Diagnostics
 ---------------------
 
diff --git a/builtin/fsck.c b/builtin/fsck.c
index ef78c6c00c..a6d5045b77 100644
--- a/builtin/fsck.c
+++ b/builtin/fsck.c
@@ -16,6 +16,7 @@
 #include "streaming.h"
 #include "decorate.h"
 #include "packfile.h"
+#include "run-command.h"
 
 #define REACHABLE 0x0001
 #define SEEN  0x0002
@@ -45,6 +46,7 @@ static int name_objects;
 #define ERROR_REACHABLE 02
 #define ERROR_PACK 04
 #define ERROR_REFS 010
+#define ERROR_COMMIT_GRAPH 020
 
 static const char *describe_object(struct object *obj)
 {
@@ -815,5 +817,24 @@ int cmd_fsck(int argc, const char **argv, const char 
*prefix)
}
 
check_connectivity();
+
+   if (core_commit_graph) {
+   struct child_process commit_graph_verify = CHILD_PROCESS_INIT;
+   const char *verify_argv[] = { "commit-graph", "verify", NULL, 
NULL, NULL, NULL };
+   commit_graph_verify.argv = verify_argv;
+   commit_graph_verify.git_cmd = 1;
+
+   if (run_command(&commit_graph_verify))
+   errors_found |= ERROR_COMMIT_GRAPH;
+
+   prepare_alt_odb();
+   for (alt = alt_odb_list; alt; alt = alt->next) {
+   verify_argv[2] = "--object-dir";
+   verify_argv[3] = alt->path;
+   if (run_command(&commit_graph_verify))
+   errors_found |= ERROR_COMMIT_GRAPH;
+   }
+   }
+
return errors_found;
 }
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 5ab268a024..91c8406d97 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -205,6 +205,16 @@ test_expect_success 'build graph from commits with append' 
'
 graph_git_behavior 'append graph, commit 8 vs merge 1' full commits/8 merge/1
 graph_git_behavior 'append graph, commit 8 vs merge 2' full commits/8 merge/2
 
+test_expect_success 'build graph using --reachable' '
+   cd "$TRASH_DIRECTORY/full" &&
+   git commit-graph write --reachable &&
+   test_path_is_file $objdir/info/commit-graph &&
+   graph_read_expect "11" "large_edges"
+'
+
+graph_git_behavior 'append graph, commit 8 vs merge 1' full commits/8 merge/1
+graph_git_behavior 'append graph, commit 8 vs merge 2' full commits/8 merge/2
+
 test_expect_success 'setup bare repo' '
cd "$TRASH_DIRECTORY" &&
git clone --bare --no-local full bare &&
@@ -335,7 +345,7 @@ test_expect_success 'detect OID not in object database' '
cd "$TRASH_DIRECTORY/full" &&
cp $objdir/info/commit-graph commit-graph-backup &&
test_when_finished mv commit-graph-backup $objdir/info/commit-graph &&
-   corrupt_data $objdir/info/commit-graph 1134 "\01" &&
+   corrupt_data $objdir/info/commit-graph 1134 "\00" &&
test_must_fail git commit-graph verify 2>err &&
grep -v "^\+" err > verify-errors &&
test_line_count = 3 verify-errors &&
@@ -348,7 +358,7 @@ test_expect_success 'detect incorrect tree OID' '
cd "$TRASH_DIRECTORY/full" &&
cp $objdir/info/commit-graph commit-graph-backup &&
test_when_finished mv commit-graph-backup $objdir/info/commit-graph &&
-   corrupt_data $objdir/info/commit-graph 1312 "\01" &&
+   corrupt_data $objdir/info/commit-graph 1312 "\00"

[PATCH v2 12/12] commit-graph: update design document

2018-05-11 Thread Derrick Stolee
The commit-graph feature is now integrated with 'fsck' and 'gc',
so remove those items from the "Future Work" section of the
commit-graph design document.

Also remove the section on lazy-loading trees, as that was completed
in an earlier patch series.

Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
 Documentation/technical/commit-graph.txt | 22 --
 1 file changed, 22 deletions(-)

diff --git a/Documentation/technical/commit-graph.txt 
b/Documentation/technical/commit-graph.txt
index e1a883eb46..c664acbd76 100644
--- a/Documentation/technical/commit-graph.txt
+++ b/Documentation/technical/commit-graph.txt
@@ -118,9 +118,6 @@ Future Work
 - The commit graph feature currently does not honor commit grafts. This can
   be remedied by duplicating or refactoring the current graft logic.
 
-- The 'commit-graph' subcommand does not have a "verify" mode that is
-  necessary for integration with fsck.
-
 - After computing and storing generation numbers, we must make graph
   walks aware of generation numbers to gain the performance benefits they
   enable. This will mostly be accomplished by swapping a commit-date-ordered
@@ -130,25 +127,6 @@ Future Work
 - 'log --topo-order'
 - 'tag --merged'
 
-- Currently, parse_commit_gently() requires filling in the root tree
-  object for a commit. This passes through lookup_tree() and consequently
-  lookup_object(). Also, it calls lookup_commit() when loading the parents.
-  These method calls check the ODB for object existence, even if the
-  consumer does not need the content. For example, we do not need the
-  tree contents when computing merge bases. Now that commit parsing is
-  removed from the computation time, these lookup operations are the
-  slowest operations keeping graph walks from being fast. Consider
-  loading these objects without verifying their existence in the ODB and
-  only loading them fully when consumers need them. Consider a method
-  such as "ensure_tree_loaded(commit)" that fully loads a tree before
-  using commit->tree.
-
-- The current design uses the 'commit-graph' subcommand to generate the graph.
-  When this feature stabilizes enough to recommend to most users, we should
-  add automatic graph writes to common operations that create many commits.
-  For example, one could compute a graph on 'clone', 'fetch', or 'repack'
-  commands.
-
 - A server could provide a commit graph file as part of the network protocol
   to avoid extra calculations by clients. This feature is only of benefit if
   the user is willing to trust the file, because verifying the file is correct
-- 
2.16.2.329.gfb62395de6



[PATCH v2 11/12] gc: automatically write commit-graph files

2018-05-11 Thread Derrick Stolee
The commit-graph file is a very helpful feature for speeding up git
operations. In order to make it more useful, write the commit-graph file
by default during standard garbage collection operations.

Add a 'gc.commitGraph' config setting that triggers writing a
commit-graph file after any non-trivial 'git gc' command. Defaults to
false while the commit-graph feature matures. We specifically do not
want to turn this on by default until the commit-graph feature is fully
integrated with history-modifying features like shallow clones.

Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
 Documentation/config.txt | 6 ++
 Documentation/git-gc.txt | 4 
 builtin/gc.c | 8 
 3 files changed, 18 insertions(+)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 11f027194e..9a3abd87e7 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -1553,6 +1553,12 @@ gc.autoDetach::
Make `git gc --auto` return immediately and run in background
if the system supports it. Default is true.
 
+gc.commitGraph::
+   If true, then gc will rewrite the commit-graph file after any
+   change to the object database. If '--auto' is used, then the
+   commit-graph will not be updated unless the threshold is met.
+   See linkgit:git-commit-graph[1] for details.
+
 gc.logExpiry::
If the file gc.log exists, then `git gc --auto` won't run
unless that file is more than 'gc.logExpiry' old.  Default is
diff --git a/Documentation/git-gc.txt b/Documentation/git-gc.txt
index 571b5a7e3c..17dd654a59 100644
--- a/Documentation/git-gc.txt
+++ b/Documentation/git-gc.txt
@@ -119,6 +119,10 @@ The optional configuration variable `gc.packRefs` 
determines if
 it within all non-bare repos or it can be set to a boolean value.
 This defaults to true.
 
+The optional configuration variable 'gc.commitGraph' determines if
+'git gc' runs 'git commit-graph write'. This can be set to a boolean
+value. This defaults to false.
+
 The optional configuration variable `gc.aggressiveWindow` controls how
 much time is spent optimizing the delta compression of the objects in
 the repository when the --aggressive option is specified.  The larger
diff --git a/builtin/gc.c b/builtin/gc.c
index 77fa720bd0..8403445738 100644
--- a/builtin/gc.c
+++ b/builtin/gc.c
@@ -34,6 +34,7 @@ static int aggressive_depth = 50;
 static int aggressive_window = 250;
 static int gc_auto_threshold = 6700;
 static int gc_auto_pack_limit = 50;
+static int gc_commit_graph = 0;
 static int detach_auto = 1;
 static timestamp_t gc_log_expire_time;
 static const char *gc_log_expire = "1.day.ago";
@@ -46,6 +47,7 @@ static struct argv_array repack = ARGV_ARRAY_INIT;
 static struct argv_array prune = ARGV_ARRAY_INIT;
 static struct argv_array prune_worktrees = ARGV_ARRAY_INIT;
 static struct argv_array rerere = ARGV_ARRAY_INIT;
+static struct argv_array commit_graph = ARGV_ARRAY_INIT;
 
 static struct tempfile *pidfile;
 static struct lock_file log_lock;
@@ -121,6 +123,7 @@ static void gc_config(void)
git_config_get_int("gc.aggressivedepth", &aggressive_depth);
git_config_get_int("gc.auto", &gc_auto_threshold);
git_config_get_int("gc.autopacklimit", &gc_auto_pack_limit);
+   git_config_get_bool("gc.commitgraph", &gc_commit_graph);
git_config_get_bool("gc.autodetach", &detach_auto);
git_config_get_expiry("gc.pruneexpire", &prune_expire);
git_config_get_expiry("gc.worktreepruneexpire", 
&prune_worktrees_expire);
@@ -374,6 +377,7 @@ int cmd_gc(int argc, const char **argv, const char *prefix)
argv_array_pushl(&prune, "prune", "--expire", NULL);
argv_array_pushl(&prune_worktrees, "worktree", "prune", "--expire", 
NULL);
argv_array_pushl(&rerere, "rerere", "gc", NULL);
+   argv_array_pushl(&commit_graph, "commit-graph", "write", "--reachable", 
NULL);
 
/* default expiry time, overwritten in gc_config */
gc_config();
@@ -480,6 +484,10 @@ int cmd_gc(int argc, const char **argv, const char *prefix)
if (pack_garbage.nr > 0)
clean_pack_garbage();
 
+   if (gc_commit_graph)
+   if (run_command_v_opt(commit_graph.argv, RUN_GIT_CMD))
+   return error(FAILED_RUN, commit_graph.argv[0]);
+
if (auto_gc && too_many_loose_objects())
warning(_("There are too many unreachable loose objects; "
"run 'git prune' to remove them."));
-- 
2.16.2.329.gfb62395de6



[PATCH v2 10/12] commit-graph: add '--reachable' option

2018-05-11 Thread Derrick Stolee
When writing commit-graph files, it can be convenient to ask for all
reachable commits (starting at the ref set) in the resulting file. This
is particularly helpful when writing to stdin is complicated, such as a
future integration with 'git gc' which will call
'git commit-graph write --reachable' after performing cleanup of the
object database.

Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
 Documentation/git-commit-graph.txt |  8 ++--
 builtin/commit-graph.c | 41 ++
 2 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/Documentation/git-commit-graph.txt 
b/Documentation/git-commit-graph.txt
index a222cfab08..cc1715a823 100644
--- a/Documentation/git-commit-graph.txt
+++ b/Documentation/git-commit-graph.txt
@@ -38,12 +38,16 @@ Write a commit graph file based on the commits found in 
packfiles.
 +
 With the `--stdin-packs` option, generate the new commit graph by
 walking objects only in the specified pack-indexes. (Cannot be combined
-with --stdin-commits.)
+with --stdin-commits or --reachable.)
 +
 With the `--stdin-commits` option, generate the new commit graph by
 walking commits starting at the commits specified in stdin as a list
 of OIDs in hex, one OID per line. (Cannot be combined with
---stdin-packs.)
+--stdin-packs or --reachable.)
++
+With the `--reachable` option, generate the new commit graph by walking
+commits starting at all refs. (Cannot be combined with --stdin-commits
+or --stdin-packs.)
 +
 With the `--append` option, include all commits that are present in the
 existing commit-graph file.
diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
index af3101291f..7cb94a4813 100644
--- a/builtin/commit-graph.c
+++ b/builtin/commit-graph.c
@@ -3,13 +3,14 @@
 #include "dir.h"
 #include "lockfile.h"
 #include "parse-options.h"
+#include "refs.h"
 #include "commit-graph.h"
 
 static char const * const builtin_commit_graph_usage[] = {
N_("git commit-graph [--object-dir <objdir>]"),
N_("git commit-graph read [--object-dir <objdir>]"),
N_("git commit-graph verify [--object-dir <objdir>]"),
-   N_("git commit-graph write [--object-dir <objdir>] [--append] 
[--stdin-packs|--stdin-commits]"),
+   N_("git commit-graph write [--object-dir <objdir>] [--append] 
[--reachable|--stdin-packs|--stdin-commits]"),
NULL
 };
 
@@ -24,12 +25,13 @@ static const char * const builtin_commit_graph_read_usage[] 
= {
 };
 
 static const char * const builtin_commit_graph_write_usage[] = {
-   N_("git commit-graph write [--object-dir <objdir>] [--append] 
[--stdin-packs|--stdin-commits]"),
+   N_("git commit-graph write [--object-dir <objdir>] [--append] 
[--reachable|--stdin-packs|--stdin-commits]"),
NULL
 };
 
 static struct opts_commit_graph {
const char *obj_dir;
+   int reachable;
int stdin_packs;
int stdin_commits;
int append;
@@ -113,6 +115,25 @@ static int graph_read(int argc, const char **argv)
return 0;
 }
 
+struct hex_list {
+   char **hex_strs;
+   int hex_nr;
+   int hex_alloc;
+};
+
+static int add_ref_to_list(const char *refname,
+  const struct object_id *oid,
+  int flags, void *cb_data)
+{
+   struct hex_list *list = (struct hex_list*)cb_data;
+
+   ALLOC_GROW(list->hex_strs, list->hex_nr + 1, list->hex_alloc);
+   list->hex_strs[list->hex_nr] = xcalloc(GIT_MAX_HEXSZ + 1, 1);
+   strcpy(list->hex_strs[list->hex_nr], oid_to_hex(oid));
+   list->hex_nr++;
+   return 0;
+}
+
 static int graph_write(int argc, const char **argv)
 {
const char **pack_indexes = NULL;
@@ -127,6 +148,8 @@ static int graph_write(int argc, const char **argv)
OPT_STRING(0, "object-dir", &opts.obj_dir,
N_("dir"),
N_("The object directory to store the graph")),
+   OPT_BOOL(0, "reachable", &opts.reachable,
+   N_("start walk at all refs")),
OPT_BOOL(0, "stdin-packs", &opts.stdin_packs,
N_("scan pack-indexes listed by stdin for commits")),
OPT_BOOL(0, "stdin-commits", &opts.stdin_commits,
@@ -140,8 +163,8 @@ static int graph_write(int argc, const char **argv)
 builtin_commit_graph_write_options,
 builtin_commit_graph_write_usage, 0);
 
-   if (opts.stdin_packs && opts.stdin_commits)
-   die(_("cannot use both --stdin-commits and --stdin-packs"));
+   if (opts.reachable + opts.stdin_packs + opts.stdin_commits > 1)
+   die(_("use at most one of --reachable, --stdin-commits, or 
--stdin-packs"));
if (!opts.obj_dir)
opts.obj_dir = get_object_directory

[PATCH v2 08/12] commit-graph: verify commit contents against odb

2018-05-11 Thread Derrick Stolee
When running 'git commit-graph verify', compare the contents of the
commits that are loaded from the commit-graph file with commits that are
loaded directly from the object database. This includes checking the
root tree object ID, commit date, and parents.

Parse the commit from the graph during the initial loop through the
object IDs to guarantee we parse from the commit-graph file.

In addition, verify the generation number calculation is correct for all
commits in the commit-graph file.

While testing, we discovered that mutating the integer value for a
parent to be outside the accepted range causes a segmentation fault. Add
a new check in insert_parent_or_die() that prevents this fault. Check
for that error during the test, both in the typical parents and in the
list of parents for octopus merges.

Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
 commit-graph.c  | 100 
 t/t5318-commit-graph.sh |  64 +++
 2 files changed, 164 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index 5bb93e533c..a15ad9710d 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -237,6 +237,10 @@ static struct commit_list **insert_parent_or_die(struct 
commit_graph *g,
 {
struct commit *c;
struct object_id oid;
+
+   if (pos >= g->num_commits)
+   die("invalid parent position %"PRIu64, pos);
+
hashcpy(oid.hash, g->chunk_oid_lookup + g->hash_len * pos);
c = lookup_commit(&oid);
if (!c)
@@ -875,6 +879,8 @@ int verify_commit_graph(struct commit_graph *g)
return 1;
 
for (i = 0; i < g->num_commits; i++) {
+   struct commit *graph_commit;
+
hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
 
if (i && oidcmp(&prev_oid, &cur_oid) >= 0)
@@ -892,6 +898,10 @@ int verify_commit_graph(struct commit_graph *g)
 
cur_fanout_pos++;
}
+
+   graph_commit = lookup_commit(&cur_oid);
+   if (!parse_commit_in_graph_one(g, graph_commit))
+   graph_report("failed to parse %s from commit-graph", 
oid_to_hex(&cur_oid));
}
 
while (cur_fanout_pos < 256) {
@@ -904,5 +914,95 @@ int verify_commit_graph(struct commit_graph *g)
cur_fanout_pos++;
}
 
+   if (verify_commit_graph_error)
+   return 1;
+
+   for (i = 0; i < g->num_commits; i++) {
+   struct commit *graph_commit, *odb_commit;
+   struct commit_list *graph_parents, *odb_parents;
+   int num_parents = 0;
+
+   hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
+
+   graph_commit = lookup_commit(&cur_oid);
+   odb_commit = (struct commit *)create_object(cur_oid.hash, 
alloc_commit_node());
+   if (parse_commit_internal(odb_commit, 0, 0)) {
+   graph_report("failed to parse %s from object database", 
oid_to_hex(&cur_oid));
+   continue;
+   }
+
+   if (oidcmp(&get_commit_tree_in_graph_one(g, 
graph_commit)->object.oid,
+  get_commit_tree_oid(odb_commit)))
+   graph_report("root tree object ID for commit %s in 
commit-graph is %s != %s",
+oid_to_hex(&cur_oid),
+
oid_to_hex(get_commit_tree_oid(graph_commit)),
+
oid_to_hex(get_commit_tree_oid(odb_commit)));
+
+   if (graph_commit->date != odb_commit->date)
+   graph_report("commit date for commit %s in commit-graph 
is %"PRItime" != %"PRItime"",
+oid_to_hex(&cur_oid),
+graph_commit->date,
+odb_commit->date);
+
+
+   graph_parents = graph_commit->parents;
+   odb_parents = odb_commit->parents;
+
+   while (graph_parents) {
+   num_parents++;
+
+   if (odb_parents == NULL)
+   graph_report("commit-graph parent list for 
commit %s is too long (%d)",
+oid_to_hex(&cur_oid),
+num_parents);
+
+   if (oidcmp(&graph_parents->item->object.oid, 
&odb_parents->item->object.oid))
+   graph_report("commit-graph parent for %s is %s 
!= %s",
+oid_to_hex(&cur_oid),
+
oid_to_hex(&graph_parents->item->object.oid),
+
oid_to_hex(&odb_parents->item->ob

[PATCH v2 07/12] commit-graph: load a root tree from specific graph

2018-05-11 Thread Derrick Stolee
When lazy-loading a tree for a commit, it will be important to select
the tree from a specific struct commit_graph. Create a new method that
specifies the commit-graph file and use that in
get_commit_tree_in_graph().

Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
 commit-graph.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index b0fd1d5320..5bb93e533c 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -357,14 +357,20 @@ static struct tree *load_tree_for_commit(struct 
commit_graph *g, struct commit *
return c->maybe_tree;
 }
 
-struct tree *get_commit_tree_in_graph(const struct commit *c)
+static struct tree *get_commit_tree_in_graph_one(struct commit_graph *g,
+const struct commit *c)
 {
if (c->maybe_tree)
return c->maybe_tree;
if (c->graph_pos == COMMIT_NOT_FROM_GRAPH)
BUG("get_commit_tree_in_graph called from non-commit-graph 
commit");
 
-   return load_tree_for_commit(commit_graph, (struct commit *)c);
+   return load_tree_for_commit(g, (struct commit *)c);
+}
+
+struct tree *get_commit_tree_in_graph(const struct commit *c)
+{
+   return get_commit_tree_in_graph_one(commit_graph, c);
 }
 
 static void write_graph_chunk_fanout(struct hashfile *f,
-- 
2.16.2.329.gfb62395de6



Re: What's cooking in git.git (May 2018, #02; Thu, 17)

2018-05-17 Thread Derrick Stolee

On 5/17/2018 2:01 AM, Junio C Hamano wrote:

* ds/generation-numbers (2018-05-02) 11 commits
  - commit-graph.txt: update design document
  - merge: check config before loading commits
  - commit: use generation number in remove_redundant()
  - commit: add short-circuit to paint_down_to_common()
  - commit: use generation numbers for in_merge_bases()
  - ref-filter: use generation number for --contains
  - commit-graph: always load commit-graph information
  - commit: use generations in paint_down_to_common()
  - commit-graph: compute generation numbers
  - commit: add generation number to struct commmit
  - ref-filter: fix outdated comment on in_commit_list
  (this branch is used by ds/commit-graph-lockfile-fix; uses 
ds/lazy-load-trees.)

  A recently added "commit-graph" datafile has learned to store
  pre-computed generation numbers to speed up the decisions to stop
  history traversal.

  Is this ready for 'next' with ds/commit-graph-lockfile-fix?
  A commit with triple 'm' needs its title amended, though.


With the lockfile fix, it should be ready. I've been giving this 
significant testing on my machine and a few other developers here. The 
next version of GVFS is shipping with this code and with GVFS 
controlling the maintenance of the commit-graph file. That code has been 
cooking with our CI builds for a while, with full functional tests 
against the Windows repository. The only bugs we've found are the fix in 
"merge: check config before loading commits" and in 
ds/commit-graph-lockfile-fix.


Sorry for the triple-m.

Thanks,
-Stolee


Re: worktrees vs. alternates

2018-05-16 Thread Derrick Stolee

On 5/16/2018 6:33 AM, Ævar Arnfjörð Bjarmason wrote:
[big snip]


And here's where this isn't at all like "worktree", each of those 100
will have their own "master" branch, and they can all create 100
different branches called "topic" that can be different.


This is the biggest difference. You cannot have the same ref checked out 
in multiple worktrees, as they both may edit that ref. The alternates 
allow you to share data in a "read only" fashion. If you have one repo 
that is the "base" repo that manages that objects dir, then that is 
probably a good way to reduce the duplication. I'm not familiar with 
what happens when a "child" repo does 'git gc' or 'git repack', will it 
delete the local objects that it sees exist in the alternate?


GVFS uses alternates in this same way: we create a drive-wide "shared 
object cache" that GVFS manages. We put our prefetch packs filled with 
commits and trees in there, and any loose objects that are downloaded 
via the object virtualization are placed as loose objects in the 
alternate. We also store the multi-pack-index and commit-graph in that 
alternate. This means that the only objects in each src dir are those 
created by the developer doing their normal work.


Thanks,
-Stolee



Re: Git log range reverse bug

2018-05-16 Thread Derrick Stolee

Hi Mehdi,

On 5/16/2018 2:19 PM, Mehdi Zeinali wrote:

Git Version: Version: 2.14.2

When reversing a range in git log, it does not start from the expected commit:

$ git show 8e11b4a41ec21e47fb0bf8b76e1edba739f57a9b
commit 8e11b4a41ec21e47fb0bf8b76e1edba739f57a9b
Author: Some Name 
Date:   Mon Nov 3 19:01:53 2014 +
.
.
.

$ git show
Author: Some Other Name 
Date:   Wed May 16 16:49:10 2018 +
.
.
.

$ git log --reverse 8e11b4a41ec21e47fb0bf8b76e1edba739f57a9b..HEAD


This command is asking for the commits reachable from HEAD but NOT 
reachable from 8e11b4a41ec21e47fb0bf8b76e1edba739f57a9b. To see 
8e11b4a41ec21e47fb0bf8b76e1edba739f57a9b in the results, you would need 
to add "--boundary" to the command. That may still not show 
8e11b4a41ec21e47fb0bf8b76e1edba739f57a9b as the first commit, as there 
may be multiple, earlier boundary commits.


Thanks,
-Stolee


commit-graph: change in "best" merge-base when ambiguous

2018-05-21 Thread Derrick Stolee

Hello all,

While working on the commit-graph feature, I made a test commit that 
sets core.commitGraph and gc.commitGraph to true by default AND runs 
'git commit-graph write --reachable' after each 'git commit' command. 
This helped me find instances in the test suite where the commit-graph 
feature changes existing functionality. Most of these were in regards to 
grafts, replace-objects, and shallow-clones (as expected) or when trying 
to find a corrupt or hidden commit (the commit-graph hides this 
corrupt/missing data). However, there was one interesting case that I'd 
like to mention on-list.


In t6024-recursive-merge.sh, we have the following commit structure:

    # 1 - A - D - F
    #   \   X   /
    # B   X
    #   X   \
    # 2 - C - E - G

When merging F to G, there are two "best" merge-bases, A and C. With 
core.commitGraph=false, 'git merge-base F G' returns A, while it returns 
C when core.commitGraph=true. This is due to the new walk order when 
using generation numbers, although I have not dug deep into the code to 
point out exactly where the choice between A and C is made. Likely it's 
just whatever order they are inserted into a list.


In the Discussion section of the `git merge-base` docs [1], we have the 
following:


    When the history involves criss-cross merges, there can be more 
than one best common ancestor for two commits. For example, with this 
topology:


    ---1---o---A
        \ /
         X
        / \
    ---2---o---o---B

    both 1 and 2 are merge-bases of A and B. Neither one is better than 
the other (both are best merge bases). When the --all option is not 
given, it is unspecified which best one is output.


This means our official documentation mentions that we do not have a 
concrete way to differentiate between these choices. This makes me think 
that this change in behavior is not a bug, but it _is_ a change in 
behavior. It's worth mentioning, but I don't think there is any value in 
making sure `git merge-base` returns the same output.


Does anyone disagree? Is this something we should solidify so we always 
have a "definitive" merge-base?


The biggest reason I think we should avoid sticking to the existing 
behavior is that the current behavior depends on the walk order. That 
means we would not be able to concretely define a tie-breaker without 
changing the existing behavior anyway.


Thanks,
-Stolee

[1] https://git-scm.com/docs/git-merge-base#_discussion



Re: commit-graph: change in "best" merge-base when ambiguous

2018-05-22 Thread Derrick Stolee

On 5/22/2018 1:39 AM, Michael Haggerty wrote:

On 05/21/2018 08:10 PM, Derrick Stolee wrote:

[...]
In the Discussion section of the `git merge-base` docs [1], we have the
following:

     When the history involves criss-cross merges, there can be more than
one best common ancestor for two commits. For example, with this topology:

     ---1---o---A
         \ /
          X
         / \
     ---2---o---o---B

     both 1 and 2 are merge-bases of A and B. Neither one is better than
the other (both are best merge bases). When the --all option is not
given, it is unspecified which best one is output.

This means our official documentation mentions that we do not have a
concrete way to differentiate between these choices. This makes me think
that this change in behavior is not a bug, but it _is_ a change in
behavior. It's worth mentioning, but I don't think there is any value in
making sure `git merge-base` returns the same output.

Does anyone disagree? Is this something we should solidify so we always
have a "definitive" merge-base?
[...]

This may be beyond the scope of what you are working on, but there are
significant advantages to selecting a "best" merge base from among the
candidates. Long ago [1] I proposed that the "best" merge base is the
merge base candidate that minimizes the number of non-merge commits that
are in

 git rev-list $candidate..$branch

that are already in master:

 git rev-list $master

(assuming merging branch into master), which is equivalent to choosing
the merge base that minimizes

 git rev-list --count $candidate..$branch

In fact, this criterion is symmetric if you exchange branch ↔ master,
which is a nice property, and indeed generalizes pretty simply to
computing the merge base of more than two commits.

In that email I also included some data showing that the "best" merge
base almost always results in either the same or a shorter diff than the
more or less arbitrary algorithm that we currently use. Sometimes the
difference in diff length is dramatic.

To me it feels like the best *deterministic* merge base would be based
on the above criterion, maybe with first-parent reachability, commit
times, and SHA-1s used (in that order) to break ties.


Thanks, everyone, for your perspective on this. I'm walking away with 
these conclusions:


1. While this is a change in behavior, it is not a regression. We do not 
need to act immediately to preserve old behavior in these ambiguous cases.


2. We should (eventually) define tie-breaking conditions. I like 
Michael's suggestion above.


Thanks,
-Stolee


[PATCH v3 00/20] Integrate commit-graph into 'fsck' and 'gc'

2018-05-24 Thread Derrick Stolee
Thanks for all the feedback on v2. I've tried to make this round's
review a bit easier by splitting up the commits into smaller pieces.
Also, the test script now has less boilerplate and uses variables and
clear arithmetic to explain which bytes are being modified.

One other change worth mentioning: in "commit-graph: add '--reachable'
option" I put the ref-iteration into a new external
'write_commit_graph_reachable()' method inside commit-graph.c. This
makes the 'gc: automatically write commit-graph files' a simpler change.

Thanks,
-Stolee

Derrick Stolee (20):
  commit-graph: UNLEAK before die()
  commit-graph: fix GRAPH_MIN_SIZE
  commit-graph: parse commit from chosen graph
  commit: force commit to parse from object database
  commit-graph: load a root tree from specific graph
  commit-graph: add 'verify' subcommand
  commit-graph: verify catches corrupt signature
  commit-graph: verify required chunks are present
  commit-graph: verify corrupt OID fanout and lookup
  commit-graph: verify objects exist
  commit-graph: verify root tree OIDs
  commit-graph: verify parent list
  commit-graph: verify generation number
  commit-graph: verify commit date
  commit-graph: test for corrupted octopus edge
  commit-graph: verify contents match checksum
  fsck: verify commit-graph
  commit-graph: add '--reachable' option
  gc: automatically write commit-graph files
  commit-graph: update design document

 Documentation/config.txt |   6 +
 Documentation/git-commit-graph.txt   |  14 +-
 Documentation/git-fsck.txt   |   3 +
 Documentation/git-gc.txt |   4 +
 Documentation/technical/commit-graph.txt |  22 ---
 builtin/commit-graph.c   |  59 +++-
 builtin/fsck.c   |  21 +++
 builtin/gc.c |   6 +
 commit-graph.c   | 234 +--
 commit-graph.h   |   3 +
 commit.c |   9 +-
 commit.h |   1 +
 t/t5318-commit-graph.sh  | 196 ++
 13 files changed, 539 insertions(+), 39 deletions(-)


base-commit: 34fdd433396ee0e3ef4de02eb2189f8226eafe4e
-- 
2.16.2.329.gfb62395de6



[PATCH v3 06/20] commit-graph: add 'verify' subcommand

2018-05-24 Thread Derrick Stolee
If the commit-graph file becomes corrupt, we need a way to verify
that its contents match the object database. In the manner of
'git fsck' we will implement a 'git commit-graph verify' subcommand
to report all issues with the file.

Add the 'verify' subcommand to the 'commit-graph' builtin and its
documentation. The subcommand is currently a no-op except for
loading the commit-graph into memory, which may trigger run-time
errors that would be caught by normal use. Add a simple test that
ensures the command returns a zero error code.

If no commit-graph file exists, this is an acceptable state. Do
not report any errors.

Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
 Documentation/git-commit-graph.txt |  6 ++
 builtin/commit-graph.c | 38 ++
 commit-graph.c | 26 ++
 commit-graph.h |  2 ++
 t/t5318-commit-graph.sh| 10 ++
 5 files changed, 82 insertions(+)

diff --git a/Documentation/git-commit-graph.txt 
b/Documentation/git-commit-graph.txt
index 4c97b555cc..a222cfab08 100644
--- a/Documentation/git-commit-graph.txt
+++ b/Documentation/git-commit-graph.txt
@@ -10,6 +10,7 @@ SYNOPSIS
 
 [verse]
 'git commit-graph read' [--object-dir <dir>]
+'git commit-graph verify' [--object-dir <dir>]
 'git commit-graph write' <options> [--object-dir <dir>]
 
 
@@ -52,6 +53,11 @@ existing commit-graph file.
 Read a graph file given by the commit-graph file and output basic
 details about the graph file. Used for debugging purposes.
 
+'verify'::
+
+Read the commit-graph file and verify its contents against the object
+database. Used to check for corrupted data.
+
 
 EXAMPLES
 
diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
index f0875b8bf3..0433dd6e20 100644
--- a/builtin/commit-graph.c
+++ b/builtin/commit-graph.c
@@ -8,10 +8,16 @@
 static char const * const builtin_commit_graph_usage[] = {
N_("git commit-graph [--object-dir ]"),
N_("git commit-graph read [--object-dir ]"),
+   N_("git commit-graph verify [--object-dir ]"),
N_("git commit-graph write [--object-dir ] [--append] 
[--stdin-packs|--stdin-commits]"),
NULL
 };
 
+static const char * const builtin_commit_graph_verify_usage[] = {
+   N_("git commit-graph verify [--object-dir ]"),
+   NULL
+};
+
 static const char * const builtin_commit_graph_read_usage[] = {
N_("git commit-graph read [--object-dir ]"),
NULL
@@ -29,6 +35,36 @@ static struct opts_commit_graph {
int append;
 } opts;
 
+
+static int graph_verify(int argc, const char **argv)
+{
+   struct commit_graph *graph = 0;
+   char *graph_name;
+
+   static struct option builtin_commit_graph_verify_options[] = {
+   OPT_STRING(0, "object-dir", _dir,
+  N_("dir"),
+  N_("The object directory to store the graph")),
+   OPT_END(),
+   };
+
+   argc = parse_options(argc, argv, NULL,
+builtin_commit_graph_verify_options,
+builtin_commit_graph_verify_usage, 0);
+
+   if (!opts.obj_dir)
+   opts.obj_dir = get_object_directory();
+
+   graph_name = get_commit_graph_filename(opts.obj_dir);
+   graph = load_commit_graph_one(graph_name);
+   FREE_AND_NULL(graph_name);
+
+   if (!graph)
+   return 0;
+
+   return verify_commit_graph(graph);
+}
+
 static int graph_read(int argc, const char **argv)
 {
struct commit_graph *graph = NULL;
@@ -163,6 +199,8 @@ int cmd_commit_graph(int argc, const char **argv, const 
char *prefix)
 PARSE_OPT_STOP_AT_NON_OPTION);
 
if (argc > 0) {
+   if (!strcmp(argv[0], "verify"))
+   return graph_verify(argc, argv);
if (!strcmp(argv[0], "read"))
return graph_read(argc, argv);
if (!strcmp(argv[0], "write"))
diff --git a/commit-graph.c b/commit-graph.c
index 25893ec096..55b41664ee 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -836,3 +836,29 @@ void write_commit_graph(const char *obj_dir,
oids.alloc = 0;
oids.nr = 0;
 }
+
+static int verify_commit_graph_error;
+
+static void graph_report(const char *fmt, ...)
+{
+   va_list ap;
+   struct strbuf sb = STRBUF_INIT;
+   verify_commit_graph_error = 1;
+
+   va_start(ap, fmt);
+   strbuf_vaddf(, fmt, ap);
+
+   fprintf(stderr, "%s\n", sb.buf);
+   strbuf_release();
+   va_end(ap);
+}
+
+int verify_commit_graph(struct commit_graph *g)
+{
+   if (!g) {
+   graph_report("no commit-graph file loaded");
+   return 1;
+   }
+
+   return verify_commit_graph_error;
+}
diff --git a/co

[PATCH v3 11/20] commit-graph: verify root tree OIDs

2018-05-24 Thread Derrick Stolee
The 'verify' subcommand must compare the commit content parsed from the
commit-graph against the content in the object database.
Use lookup_commit() and parse_commit_in_graph_one() to parse the commits
from the graph and compare against a commit that is loaded separately
and parsed directly from the object database.

Add checks for the root tree OID.

Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
 commit-graph.c  | 17 -
 t/t5318-commit-graph.sh |  7 +++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/commit-graph.c b/commit-graph.c
index 0420ebcd87..19ea369fc6 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -880,6 +880,8 @@ int verify_commit_graph(struct commit_graph *g)
return verify_commit_graph_error;
 
for (i = 0; i < g->num_commits; i++) {
+   struct commit *graph_commit;
+
hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
 
if (i && oidcmp(_oid, _oid) >= 0)
@@ -897,6 +899,11 @@ int verify_commit_graph(struct commit_graph *g)
 
cur_fanout_pos++;
}
+
+   graph_commit = lookup_commit(_oid);
+   if (!parse_commit_in_graph_one(g, graph_commit))
+   graph_report("failed to parse %s from commit-graph",
+oid_to_hex(_oid));
}
 
while (cur_fanout_pos < 256) {
@@ -913,16 +920,24 @@ int verify_commit_graph(struct commit_graph *g)
return verify_commit_graph_error;
 
for (i = 0; i < g->num_commits; i++) {
-   struct commit *odb_commit;
+   struct commit *graph_commit, *odb_commit;
 
hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
 
+   graph_commit = lookup_commit(_oid);
odb_commit = (struct commit *)create_object(cur_oid.hash, 
alloc_commit_node());
if (parse_commit_internal(odb_commit, 0, 0)) {
graph_report("failed to parse %s from object database",
 oid_to_hex(_oid));
continue;
}
+
+   if (oidcmp(_commit_tree_in_graph_one(g, 
graph_commit)->object.oid,
+  get_commit_tree_oid(odb_commit)))
+   graph_report("root tree OID for commit %s in 
commit-graph is %s != %s",
+oid_to_hex(_oid),
+
oid_to_hex(get_commit_tree_oid(graph_commit)),
+
oid_to_hex(get_commit_tree_oid(odb_commit)));
}
 
return verify_commit_graph_error;
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 996a016239..21cc8e82f3 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -267,6 +267,8 @@ GRAPH_BYTE_FANOUT2=`expr $GRAPH_FANOUT_OFFSET + 4 \* 255`
 GRAPH_OID_LOOKUP_OFFSET=`expr $GRAPH_FANOUT_OFFSET + 4 \* 256`
 GRAPH_BYTE_OID_LOOKUP_ORDER=`expr $GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 8`
 GRAPH_BYTE_OID_LOOKUP_MISSING=`expr $GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 4 
+ 10`
+GRAPH_COMMIT_DATA_OFFSET=`expr $GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 
$NUM_COMMITS`
+GRAPH_BYTE_COMMIT_TREE=$GRAPH_COMMIT_DATA_OFFSET
 
 # usage: corrupt_graph_and_verify <position> <data> <string>
 # Manipulates the commit-graph file at the position
@@ -341,4 +343,9 @@ test_expect_success 'detect OID not in object database' '
"from object database"
 '
 
+test_expect_success 'detect incorrect tree OID' '
+   corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_TREE "\01" \
+   "root tree OID for commit"
+'
+
 test_done
-- 
2.16.2.329.gfb62395de6



[PATCH v3 10/20] commit-graph: verify objects exist

2018-05-24 Thread Derrick Stolee
In the 'verify' subcommand, load commits directly from the object
database to ensure they exist. Parse by skipping the commit-graph.

Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
 commit-graph.c  | 20 
 t/t5318-commit-graph.sh |  7 +++
 2 files changed, 27 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index cbd1aae514..0420ebcd87 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -238,6 +238,10 @@ static struct commit_list **insert_parent_or_die(struct 
commit_graph *g,
 {
struct commit *c;
struct object_id oid;
+
+   if (pos >= g->num_commits)
+   die("invalid parent position %"PRIu64, pos);
+
hashcpy(oid.hash, g->chunk_oid_lookup + g->hash_len * pos);
c = lookup_commit();
if (!c)
@@ -905,5 +909,21 @@ int verify_commit_graph(struct commit_graph *g)
cur_fanout_pos++;
}
 
+   if (verify_commit_graph_error)
+   return verify_commit_graph_error;
+
+   for (i = 0; i < g->num_commits; i++) {
+   struct commit *odb_commit;
+
+   hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
+
+   odb_commit = (struct commit *)create_object(cur_oid.hash, 
alloc_commit_node());
+   if (parse_commit_internal(odb_commit, 0, 0)) {
+   graph_report("failed to parse %s from object database",
+oid_to_hex(_oid));
+   continue;
+   }
+   }
+
return verify_commit_graph_error;
 }
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index c050ef980b..996a016239 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -247,6 +247,7 @@ test_expect_success 'git commit-graph verify' '
git commit-graph verify >output
 '
 
+NUM_COMMITS=9
 HASH_LEN=20
 GRAPH_BYTE_VERSION=4
 GRAPH_BYTE_HASH=5
@@ -265,6 +266,7 @@ GRAPH_BYTE_FANOUT1=`expr $GRAPH_FANOUT_OFFSET + 4 \* 4`
 GRAPH_BYTE_FANOUT2=`expr $GRAPH_FANOUT_OFFSET + 4 \* 255`
 GRAPH_OID_LOOKUP_OFFSET=`expr $GRAPH_FANOUT_OFFSET + 4 \* 256`
 GRAPH_BYTE_OID_LOOKUP_ORDER=`expr $GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 8`
+GRAPH_BYTE_OID_LOOKUP_MISSING=`expr $GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 4 
+ 10`
 
 # usage: corrupt_graph_and_verify <position> <data> <string>
 # Manipulates the commit-graph file at the position
@@ -334,4 +336,9 @@ test_expect_success 'detect incorrect OID order' '
"incorrect OID order"
 '
 
+test_expect_success 'detect OID not in object database' '
+   corrupt_graph_and_verify $GRAPH_BYTE_OID_LOOKUP_MISSING "\01" \
+   "from object database"
+'
+
 test_done
-- 
2.16.2.329.gfb62395de6



[PATCH v3 19/20] gc: automatically write commit-graph files

2018-05-24 Thread Derrick Stolee
The commit-graph file is a very helpful feature for speeding up git
operations. In order to make it more useful, write the commit-graph file
by default during standard garbage collection operations.

Add a 'gc.commitGraph' config setting that triggers writing a
commit-graph file after any non-trivial 'git gc' command. Defaults to
false while the commit-graph feature matures. We specifically do not
want to turn this on by default until the commit-graph feature is fully
integrated with history-modifying features like shallow clones.

Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
 Documentation/config.txt |  6 ++
 Documentation/git-gc.txt |  4 
 builtin/gc.c |  6 ++
 t/t5318-commit-graph.sh  | 14 ++
 4 files changed, 30 insertions(+)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 11f027194e..9a3abd87e7 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -1553,6 +1553,12 @@ gc.autoDetach::
Make `git gc --auto` return immediately and run in background
if the system supports it. Default is true.
 
+gc.commitGraph::
+   If true, then gc will rewrite the commit-graph file after any
+   change to the object database. If '--auto' is used, then the
+   commit-graph will not be updated unless the threshold is met.
+   See linkgit:git-commit-graph[1] for details.
+
 gc.logExpiry::
If the file gc.log exists, then `git gc --auto` won't run
unless that file is more than 'gc.logExpiry' old.  Default is
diff --git a/Documentation/git-gc.txt b/Documentation/git-gc.txt
index 571b5a7e3c..17dd654a59 100644
--- a/Documentation/git-gc.txt
+++ b/Documentation/git-gc.txt
@@ -119,6 +119,10 @@ The optional configuration variable `gc.packRefs` 
determines if
 it within all non-bare repos or it can be set to a boolean value.
 This defaults to true.
 
+The optional configuration variable 'gc.commitGraph' determines if
+'git gc' runs 'git commit-graph write'. This can be set to a boolean
+value. This defaults to false.
+
 The optional configuration variable `gc.aggressiveWindow` controls how
 much time is spent optimizing the delta compression of the objects in
 the repository when the --aggressive option is specified.  The larger
diff --git a/builtin/gc.c b/builtin/gc.c
index 77fa720bd0..efd214a59f 100644
--- a/builtin/gc.c
+++ b/builtin/gc.c
@@ -20,6 +20,7 @@
 #include "argv-array.h"
 #include "commit.h"
 #include "packfile.h"
+#include "commit-graph.h"
 
 #define FAILED_RUN "failed to run %s"
 
@@ -34,6 +35,7 @@ static int aggressive_depth = 50;
 static int aggressive_window = 250;
 static int gc_auto_threshold = 6700;
 static int gc_auto_pack_limit = 50;
+static int gc_commit_graph = 0;
 static int detach_auto = 1;
 static timestamp_t gc_log_expire_time;
 static const char *gc_log_expire = "1.day.ago";
@@ -121,6 +123,7 @@ static void gc_config(void)
git_config_get_int("gc.aggressivedepth", _depth);
git_config_get_int("gc.auto", _auto_threshold);
git_config_get_int("gc.autopacklimit", _auto_pack_limit);
+   git_config_get_bool("gc.commitgraph", _commit_graph);
git_config_get_bool("gc.autodetach", _auto);
git_config_get_expiry("gc.pruneexpire", _expire);
git_config_get_expiry("gc.worktreepruneexpire", 
_worktrees_expire);
@@ -480,6 +483,9 @@ int cmd_gc(int argc, const char **argv, const char *prefix)
if (pack_garbage.nr > 0)
clean_pack_garbage();
 
+   if (gc_commit_graph)
+   write_commit_graph_reachable(get_object_directory(), 0);
+
if (auto_gc && too_many_loose_objects())
warning(_("There are too many unreachable loose objects; "
"run 'git prune' to remove them."));
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index a659620332..d20b17586f 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -245,6 +245,20 @@ test_expect_success 'perform fast-forward merge in full 
repo' '
test_cmp expect output
 '
 
+test_expect_success 'check that gc clears commit-graph' '
+   cd "$TRASH_DIRECTORY/full" &&
+   git commit --allow-empty -m "blank" &&
+   git commit-graph write --reachable &&
+   cp $objdir/info/commit-graph commit-graph-before-gc &&
+   git reset --hard HEAD~1 &&
+   git config gc.commitGraph true &&
+   git gc &&
+   cp $objdir/info/commit-graph commit-graph-after-gc &&
+   ! test_cmp commit-graph-before-gc commit-graph-after-gc &&
+   git commit-graph write --reachable &&
+   test_cmp commit-graph-after-gc $objdir/info/commit-graph
+'
+
 # the verify tests below expect the commit-graph to contain
 # exactly the commits reachable from the commits/8 branch.
 # If the file changes the set of commits in the list, then the
-- 
2.16.2.329.gfb62395de6



[PATCH v3 16/20] commit-graph: verify contents match checksum

2018-05-24 Thread Derrick Stolee
The commit-graph file ends with a SHA1 hash of the previous contents. If
a commit-graph file has errors but the checksum hash is correct, then we
know that the problem is a bug in Git and not simply file corruption
after-the-fact.

Compute the checksum right away so it is the first error that appears,
and make the message translatable since this error can be "corrected" by
a user by simply deleting the file and recomputing. The rest of the
errors are useful only to developers.

Be sure to continue checking the rest of the file data if the checksum
is wrong. This is important for our tests, as we break the checksum as
we modify bytes of the commit-graph file.

Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
 commit-graph.c  | 16 ++--
 t/t5318-commit-graph.sh |  6 ++
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index d2b291aca2..a33600c584 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -841,6 +841,7 @@ void write_commit_graph(const char *obj_dir,
oids.nr = 0;
 }
 
+#define VERIFY_COMMIT_GRAPH_ERROR_HASH 2
 static int verify_commit_graph_error;
 
 static void graph_report(const char *fmt, ...)
@@ -860,7 +861,9 @@ static void graph_report(const char *fmt, ...)
 int verify_commit_graph(struct commit_graph *g)
 {
uint32_t i, cur_fanout_pos = 0;
-   struct object_id prev_oid, cur_oid;
+   struct object_id prev_oid, cur_oid, checksum;
+   struct hashfile *f;
+   int devnull;
 
if (!g) {
graph_report("no commit-graph file loaded");
@@ -879,6 +882,15 @@ int verify_commit_graph(struct commit_graph *g)
if (verify_commit_graph_error)
return verify_commit_graph_error;
 
+   devnull = open("/dev/null", O_WRONLY);
+   f = hashfd(devnull, NULL);
+   hashwrite(f, g->data, g->data_len - g->hash_len);
+   finalize_hashfile(f, checksum.hash, CSUM_CLOSE);
+   if (hashcmp(checksum.hash, g->data + g->data_len - g->hash_len)) {
+   graph_report(_("the commit-graph file has incorrect checksum 
and is likely corrupt"));
+   verify_commit_graph_error = VERIFY_COMMIT_GRAPH_ERROR_HASH;
+   }
+
for (i = 0; i < g->num_commits; i++) {
struct commit *graph_commit;
 
@@ -916,7 +928,7 @@ int verify_commit_graph(struct commit_graph *g)
cur_fanout_pos++;
}
 
-   if (verify_commit_graph_error)
+   if (verify_commit_graph_error & ~VERIFY_COMMIT_GRAPH_ERROR_HASH)
return verify_commit_graph_error;
 
for (i = 0; i < g->num_commits; i++) {
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 240aef6add..2680a2ebff 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -279,6 +279,7 @@ GRAPH_COMMIT_DATA_WIDTH=`expr $HASH_LEN + 16`
 GRAPH_OCTOPUS_DATA_OFFSET=`expr $GRAPH_COMMIT_DATA_OFFSET + \
$GRAPH_COMMIT_DATA_WIDTH \* $NUM_COMMITS`
 GRAPH_BYTE_OCTOPUS=`expr $GRAPH_OCTOPUS_DATA_OFFSET + 4`
+GRAPH_BYTE_FOOTER=`expr $GRAPH_OCTOPUS_DATA_OFFSET + 4 \* $NUM_OCTOPUS_EDGES`
 
 # usage: corrupt_graph_and_verify <position> <data> <string>
 # Manipulates the commit-graph file at the position
@@ -388,4 +389,9 @@ test_expect_success 'detect incorrect parent for octopus 
merge' '
"invalid parent"
 '
 
+test_expect_success 'detect invalid checksum hash' '
+   corrupt_graph_and_verify $GRAPH_BYTE_FOOTER "\00" \
+   "incorrect checksum"
+'
+
 test_done
-- 
2.16.2.329.gfb62395de6



[PATCH v3 02/20] commit-graph: fix GRAPH_MIN_SIZE

2018-05-24 Thread Derrick Stolee
The GRAPH_MIN_SIZE macro should be the smallest size of a parsable
commit-graph file. However, the minimum number of chunks was wrong.
It is possible to write a commit-graph file with zero commits, and
that violates this macro's value.

Rewrite the macro, and use extra macros to better explain the magic
constants.

Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
 commit-graph.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index a8c337dd77..82295f0975 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -33,10 +33,11 @@
 
 #define GRAPH_LAST_EDGE 0x8000
 
+#define GRAPH_HEADER_SIZE 8
 #define GRAPH_FANOUT_SIZE (4 * 256)
 #define GRAPH_CHUNKLOOKUP_WIDTH 12
-#define GRAPH_MIN_SIZE (5 * GRAPH_CHUNKLOOKUP_WIDTH + GRAPH_FANOUT_SIZE + \
-   GRAPH_OID_LEN + 8)
+#define GRAPH_MIN_SIZE (GRAPH_HEADER_SIZE + 4 * GRAPH_CHUNKLOOKUP_WIDTH \
+   + GRAPH_FANOUT_SIZE + GRAPH_OID_LEN)
 
 char *get_commit_graph_filename(const char *obj_dir)
 {
-- 
2.16.2.329.gfb62395de6



[PATCH v3 05/20] commit-graph: load a root tree from specific graph

2018-05-24 Thread Derrick Stolee
When lazy-loading a tree for a commit, it will be important to select
the tree from a specific struct commit_graph. Create a new method that
specifies the commit-graph file and use that in
get_commit_tree_in_graph().

Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
 commit-graph.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 78ba0edc80..25893ec096 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -358,14 +358,20 @@ static struct tree *load_tree_for_commit(struct 
commit_graph *g, struct commit *
return c->maybe_tree;
 }
 
-struct tree *get_commit_tree_in_graph(const struct commit *c)
+static struct tree *get_commit_tree_in_graph_one(struct commit_graph *g,
+const struct commit *c)
 {
if (c->maybe_tree)
return c->maybe_tree;
if (c->graph_pos == COMMIT_NOT_FROM_GRAPH)
-   BUG("get_commit_tree_in_graph called from non-commit-graph 
commit");
+   BUG("get_commit_tree_in_graph_one called from non-commit-graph 
commit");
+
+   return load_tree_for_commit(g, (struct commit *)c);
+}
 
-   return load_tree_for_commit(commit_graph, (struct commit *)c);
+struct tree *get_commit_tree_in_graph(const struct commit *c)
+{
+   return get_commit_tree_in_graph_one(commit_graph, c);
 }
 
 static void write_graph_chunk_fanout(struct hashfile *f,
-- 
2.16.2.329.gfb62395de6



[PATCH v3 12/20] commit-graph: verify parent list

2018-05-24 Thread Derrick Stolee
The commit-graph file stores parents in a two-column portion of the
commit data chunk. If there is only one parent, then the second column
stores 0x70000000 to indicate no second parent.

The 'verify' subcommand checks the parent list for the commit loaded
from the commit-graph and the one parsed from the object database. Test
these checks for corrupt parents, too many parents, and wrong parents.

The octopus merge will be tested in a later commit.

Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
 commit-graph.c  | 25 +
 t/t5318-commit-graph.sh | 18 ++
 2 files changed, 43 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index 19ea369fc6..fff22dc0c3 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -921,6 +921,7 @@ int verify_commit_graph(struct commit_graph *g)
 
for (i = 0; i < g->num_commits; i++) {
struct commit *graph_commit, *odb_commit;
+   struct commit_list *graph_parents, *odb_parents;
 
hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
 
@@ -938,6 +939,30 @@ int verify_commit_graph(struct commit_graph *g)
 oid_to_hex(_oid),
 
oid_to_hex(get_commit_tree_oid(graph_commit)),
 
oid_to_hex(get_commit_tree_oid(odb_commit)));
+
+   graph_parents = graph_commit->parents;
+   odb_parents = odb_commit->parents;
+
+   while (graph_parents) {
+   if (odb_parents == NULL) {
+   graph_report("commit-graph parent list for 
commit %s is too long",
+oid_to_hex(_oid));
+   break;
+   }
+
+   if (oidcmp(_parents->item->object.oid, 
_parents->item->object.oid))
+   graph_report("commit-graph parent for %s is %s 
!= %s",
+oid_to_hex(_oid),
+
oid_to_hex(_parents->item->object.oid),
+
oid_to_hex(_parents->item->object.oid));
+
+   graph_parents = graph_parents->next;
+   odb_parents = odb_parents->next;
+   }
+
+   if (odb_parents != NULL)
+   graph_report("commit-graph parent list for commit %s 
terminates early",
+oid_to_hex(_oid));
}
 
return verify_commit_graph_error;
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 21cc8e82f3..12f0d7f54d 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -269,6 +269,9 @@ GRAPH_BYTE_OID_LOOKUP_ORDER=`expr $GRAPH_OID_LOOKUP_OFFSET 
+ $HASH_LEN \* 8`
 GRAPH_BYTE_OID_LOOKUP_MISSING=`expr $GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 4 
+ 10`
 GRAPH_COMMIT_DATA_OFFSET=`expr $GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 
$NUM_COMMITS`
 GRAPH_BYTE_COMMIT_TREE=$GRAPH_COMMIT_DATA_OFFSET
+GRAPH_BYTE_COMMIT_PARENT=`expr $GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN`
+GRAPH_BYTE_COMMIT_EXTRA_PARENT=`expr $GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 4`
+GRAPH_BYTE_COMMIT_WRONG_PARENT=`expr $GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 3`
 
 # usage: corrupt_graph_and_verify <position> <data> <string>
 # Manipulates the commit-graph file at the position
@@ -348,4 +351,19 @@ test_expect_success 'detect incorrect tree OID' '
"root tree OID for commit"
 '
 
+test_expect_success 'detect incorrect parent int-id' '
+   corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_PARENT "\01" \
+   "invalid parent"
+'
+
+test_expect_success 'detect extra parent int-id' '
+   corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_EXTRA_PARENT "\00" \
+   "is too long"
+'
+
+test_expect_success 'detect incorrect tree OID' '
+   corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_WRONG_PARENT "\01" \
+   "commit-graph parent for"
+'
+
 test_done
-- 
2.16.2.329.gfb62395de6



[PATCH v3 01/20] commit-graph: UNLEAK before die()

2018-05-24 Thread Derrick Stolee
Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
 builtin/commit-graph.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
index 37420ae0fd..f0875b8bf3 100644
--- a/builtin/commit-graph.c
+++ b/builtin/commit-graph.c
@@ -51,8 +51,11 @@ static int graph_read(int argc, const char **argv)
graph_name = get_commit_graph_filename(opts.obj_dir);
graph = load_commit_graph_one(graph_name);
 
-   if (!graph)
+   if (!graph) {
+   UNLEAK(graph_name);
die("graph file %s does not exist", graph_name);
+   }
+
FREE_AND_NULL(graph_name);
 
printf("header: %08x %d %d %d %d\n",
-- 
2.16.2.329.gfb62395de6



[PATCH v3 04/20] commit: force commit to parse from object database

2018-05-24 Thread Derrick Stolee
In anticipation of verifying commit-graph file contents against the
object database, create parse_commit_internal() to allow side-stepping
the commit-graph file and parse directly from the object database.

Because generation numbers are only available from the commit-graph
file, this method should not be called unless the caller explicitly
intends to avoid parsing commits from the commit-graph file.

Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
 commit.c | 9 +++--
 commit.h | 1 +
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/commit.c b/commit.c
index 1d28677dfb..6eaed0174c 100644
--- a/commit.c
+++ b/commit.c
@@ -392,7 +392,7 @@ int parse_commit_buffer(struct commit *item, const void 
*buffer, unsigned long s
return 0;
 }
 
-int parse_commit_gently(struct commit *item, int quiet_on_missing)
+int parse_commit_internal(struct commit *item, int quiet_on_missing, int 
use_commit_graph)
 {
enum object_type type;
void *buffer;
@@ -403,7 +403,7 @@ int parse_commit_gently(struct commit *item, int 
quiet_on_missing)
return -1;
if (item->object.parsed)
return 0;
-   if (parse_commit_in_graph(item))
+   if (use_commit_graph && parse_commit_in_graph(item))
return 0;
buffer = read_sha1_file(item->object.oid.hash, , );
if (!buffer)
@@ -424,6 +424,11 @@ int parse_commit_gently(struct commit *item, int 
quiet_on_missing)
return ret;
 }
 
+int parse_commit_gently(struct commit *item, int quiet_on_missing)
+{
+   return parse_commit_internal(item, quiet_on_missing, 1);
+}
+
 void parse_commit_or_die(struct commit *item)
 {
if (parse_commit(item))
diff --git a/commit.h b/commit.h
index b5afde1ae9..5fde74fcd7 100644
--- a/commit.h
+++ b/commit.h
@@ -73,6 +73,7 @@ struct commit *lookup_commit_reference_by_name(const char 
*name);
 struct commit *lookup_commit_or_die(const struct object_id *oid, const char 
*ref_name);
 
 int parse_commit_buffer(struct commit *item, const void *buffer, unsigned long 
size, int check_graph);
+int parse_commit_internal(struct commit *item, int quiet_on_missing, int 
use_commit_graph);
 int parse_commit_gently(struct commit *item, int quiet_on_missing);
 static inline int parse_commit(struct commit *item)
 {
-- 
2.16.2.329.gfb62395de6



[PATCH v3 14/20] commit-graph: verify commit date

2018-05-24 Thread Derrick Stolee
Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
 commit-graph.c  | 6 ++
 t/t5318-commit-graph.sh | 6 ++
 2 files changed, 12 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index ead92460c1..d2b291aca2 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -981,6 +981,12 @@ int verify_commit_graph(struct commit_graph *g)
 oid_to_hex(_oid),
 graph_commit->generation,
 max_generation + 1);
+
+   if (graph_commit->date != odb_commit->date)
+   graph_report("commit date for commit %s in commit-graph 
is %"PRItime" != %"PRItime,
+oid_to_hex(_oid),
+graph_commit->date,
+odb_commit->date);
}
 
return verify_commit_graph_error;
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 673b0d37d5..58adb8246d 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -273,6 +273,7 @@ GRAPH_BYTE_COMMIT_PARENT=`expr $GRAPH_COMMIT_DATA_OFFSET + 
$HASH_LEN`
 GRAPH_BYTE_COMMIT_EXTRA_PARENT=`expr $GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 4`
 GRAPH_BYTE_COMMIT_WRONG_PARENT=`expr $GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 3`
 GRAPH_BYTE_COMMIT_GENERATION=`expr $GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 8`
+GRAPH_BYTE_COMMIT_DATE=`expr $GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 12`
 
 # usage: corrupt_graph_and_verify <position> <data> <string>
 # Manipulates the commit-graph file at the position
@@ -372,4 +373,9 @@ test_expect_success 'detect incorrect generation number' '
"generation"
 '
 
+test_expect_success 'detect incorrect commit date' '
+   corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_DATE "\01" \
+   "commit date"
+'
+
 test_done
-- 
2.16.2.329.gfb62395de6



[PATCH v3 09/20] commit-graph: verify corrupt OID fanout and lookup

2018-05-24 Thread Derrick Stolee
In the commit-graph file, the OID fanout chunk provides an index into
the OID lookup. The 'verify' subcommand should find incorrect values
in the fanout.

Similarly, the 'verify' subcommand should find out-of-order values in
the OID lookup.

Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
 commit-graph.c  | 36 
 t/t5318-commit-graph.sh | 22 ++
 2 files changed, 58 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index 06e3e4f9ba..cbd1aae514 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -855,6 +855,9 @@ static void graph_report(const char *fmt, ...)
 
 int verify_commit_graph(struct commit_graph *g)
 {
+   uint32_t i, cur_fanout_pos = 0;
+   struct object_id prev_oid, cur_oid;
+
if (!g) {
graph_report("no commit-graph file loaded");
return 1;
@@ -869,5 +872,38 @@ int verify_commit_graph(struct commit_graph *g)
if (!g->chunk_commit_data)
graph_report("commit-graph is missing the Commit Data chunk");
 
+   if (verify_commit_graph_error)
+   return verify_commit_graph_error;
+
+   for (i = 0; i < g->num_commits; i++) {
+   hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
+
+   if (i && oidcmp(_oid, _oid) >= 0)
+   graph_report("commit-graph has incorrect OID order: %s 
then %s",
+oid_to_hex(_oid),
+oid_to_hex(_oid));
+
+   oidcpy(_oid, _oid);
+
+   while (cur_oid.hash[0] > cur_fanout_pos) {
+   uint32_t fanout_value = get_be32(g->chunk_oid_fanout + 
cur_fanout_pos);
+   if (i != fanout_value)
+   graph_report("commit-graph has incorrect fanout 
value: fanout[%d] = %u != %u",
+cur_fanout_pos, fanout_value, i);
+
+   cur_fanout_pos++;
+   }
+   }
+
+   while (cur_fanout_pos < 256) {
+   uint32_t fanout_value = get_be32(g->chunk_oid_fanout + 
cur_fanout_pos);
+
+   if (g->num_commits != fanout_value)
+   graph_report("commit-graph has incorrect fanout value: 
fanout[%d] = %u != %u",
+cur_fanout_pos, fanout_value, i);
+
+   cur_fanout_pos++;
+   }
+
return verify_commit_graph_error;
 }
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 4ef3fe3dc2..c050ef980b 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -247,6 +247,7 @@ test_expect_success 'git commit-graph verify' '
git commit-graph verify >output
 '
 
+HASH_LEN=20
 GRAPH_BYTE_VERSION=4
 GRAPH_BYTE_HASH=5
 GRAPH_BYTE_CHUNK_COUNT=6
@@ -258,6 +259,12 @@ GRAPH_BYTE_OID_LOOKUP_ID=`expr $GRAPH_CHUNK_LOOKUP_OFFSET 
+ \
  1 \* $GRAPH_CHUNK_LOOKUP_WIDTH`
 GRAPH_BYTE_COMMIT_DATA_ID=`expr $GRAPH_CHUNK_LOOKUP_OFFSET + \
2 \* $GRAPH_CHUNK_LOOKUP_WIDTH`
+GRAPH_FANOUT_OFFSET=`expr $GRAPH_CHUNK_LOOKUP_OFFSET + \
+ $GRAPH_CHUNK_LOOKUP_WIDTH \* $GRAPH_CHUNK_LOOKUP_ROWS`
+GRAPH_BYTE_FANOUT1=`expr $GRAPH_FANOUT_OFFSET + 4 \* 4`
+GRAPH_BYTE_FANOUT2=`expr $GRAPH_FANOUT_OFFSET + 4 \* 255`
+GRAPH_OID_LOOKUP_OFFSET=`expr $GRAPH_FANOUT_OFFSET + 4 \* 256`
+GRAPH_BYTE_OID_LOOKUP_ORDER=`expr $GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 8`
 
 # usage: corrupt_graph_and_verify <position> <data> <string>
 # Manipulates the commit-graph file at the position
@@ -312,4 +319,19 @@ test_expect_success 'detect missing commit data chunk' '
"missing the Commit Data chunk"
 '
 
+test_expect_success 'detect incorrect fanout' '
+   corrupt_graph_and_verify $GRAPH_BYTE_FANOUT1 "\01" \
+   "fanout value"
+'
+
+test_expect_success 'detect incorrect fanout' '
+   corrupt_graph_and_verify $GRAPH_BYTE_FANOUT2 "\01" \
+   "fanout value"
+'
+
+test_expect_success 'detect incorrect OID order' '
+   corrupt_graph_and_verify $GRAPH_BYTE_OID_LOOKUP_ORDER "\01" \
+   "incorrect OID order"
+'
+
 test_done
-- 
2.16.2.329.gfb62395de6



[PATCH v3 13/20] commit-graph: verify generation number

2018-05-24 Thread Derrick Stolee
While iterating through the commit parents, perform the generation
number calculation and compare against the value stored in the
commit-graph.

The tests demonstrate that having a different set of parents affects
the generation number calculation, and this value propagates to
descendants. Hence, we drop the single-line condition on the output.

Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
 commit-graph.c  | 18 ++
 t/t5318-commit-graph.sh |  6 ++
 2 files changed, 24 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index fff22dc0c3..ead92460c1 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -922,6 +922,7 @@ int verify_commit_graph(struct commit_graph *g)
for (i = 0; i < g->num_commits; i++) {
struct commit *graph_commit, *odb_commit;
struct commit_list *graph_parents, *odb_parents;
+   uint32_t max_generation = 0;
 
hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
 
@@ -956,6 +957,9 @@ int verify_commit_graph(struct commit_graph *g)
 
oid_to_hex(&graph_parents->item->object.oid),
 
oid_to_hex(&odb_parents->item->object.oid));
 
+   if (graph_parents->item->generation > max_generation)
+   max_generation = 
graph_parents->item->generation;
+
graph_parents = graph_parents->next;
odb_parents = odb_parents->next;
}
@@ -963,6 +967,20 @@ int verify_commit_graph(struct commit_graph *g)
if (odb_parents != NULL)
graph_report("commit-graph parent list for commit %s 
terminates early",
 oid_to_hex(&cur_oid));
+
+   /*
+* If one of our parents has generation GENERATION_NUMBER_MAX, 
then
+* our generation is also GENERATION_NUMBER_MAX. Decrement to 
avoid
+* extra logic in the following condition.
+*/
+   if (max_generation == GENERATION_NUMBER_MAX)
+   max_generation--;
+
+   if (graph_commit->generation != max_generation + 1)
+   graph_report("commit-graph generation for commit %s is 
%u != %u",
+oid_to_hex(&cur_oid),
+graph_commit->generation,
+max_generation + 1);
}
 
return verify_commit_graph_error;
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 12f0d7f54d..673b0d37d5 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -272,6 +272,7 @@ GRAPH_BYTE_COMMIT_TREE=$GRAPH_COMMIT_DATA_OFFSET
 GRAPH_BYTE_COMMIT_PARENT=`expr $GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN`
 GRAPH_BYTE_COMMIT_EXTRA_PARENT=`expr $GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 4`
 GRAPH_BYTE_COMMIT_WRONG_PARENT=`expr $GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 3`
+GRAPH_BYTE_COMMIT_GENERATION=`expr $GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 8`
 
 # usage: corrupt_graph_and_verify   
 # Manipulates the commit-graph file at the position
@@ -366,4 +367,9 @@ test_expect_success 'detect incorrect tree OID' '
"commit-graph parent for"
 '
 
+test_expect_success 'detect incorrect generation number' '
+   corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_GENERATION "\01" \
+   "generation"
+'
+
 test_done
-- 
2.16.2.329.gfb62395de6



Re: [PATCH v2 03/12] commit-graph: test that 'verify' finds corruption

2018-05-24 Thread Derrick Stolee

On 5/21/2018 2:53 PM, Jakub Narebski wrote:

+corrupt_data() {
+   file=$1
+   pos=$2
+   data="${3:-\0}"
+   printf "$data" | dd of="$file" bs=1 seek="$pos" conv=notrunc
+}

First, if we do this that way (and not by adding a test helper), the use
of this function should be, I think, protected using appropriate test
prerequisite.  Not everyone has 'dd' tool installed, for example on
MS Windows.


Windows does not, but it is also missing many things this test suite 
needs. 'dd' is included in the Git for Windows SDK. I rebased this 
series onto Git for Windows and the tests passed when run in an SDK shell.


Thanks,
-Stolee


[PATCH v3 15/20] commit-graph: test for corrupted octopus edge

2018-05-24 Thread Derrick Stolee
The commit-graph file has an extra chunk to store the parent int-ids for
parents beyond the first parent for octopus merges. Our test repo has a
single octopus merge that we can manipulate to demonstrate the 'verify'
subcommand detects incorrect values in that chunk.

Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
 t/t5318-commit-graph.sh | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 58adb8246d..240aef6add 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -248,6 +248,7 @@ test_expect_success 'git commit-graph verify' '
 '
 
 NUM_COMMITS=9
+NUM_OCTOPUS_EDGES=2
 HASH_LEN=20
 GRAPH_BYTE_VERSION=4
 GRAPH_BYTE_HASH=5
@@ -274,6 +275,10 @@ GRAPH_BYTE_COMMIT_EXTRA_PARENT=`expr 
$GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 4`
 GRAPH_BYTE_COMMIT_WRONG_PARENT=`expr $GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 3`
 GRAPH_BYTE_COMMIT_GENERATION=`expr $GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 8`
 GRAPH_BYTE_COMMIT_DATE=`expr $GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 12`
+GRAPH_COMMIT_DATA_WIDTH=`expr $HASH_LEN + 16`
+GRAPH_OCTOPUS_DATA_OFFSET=`expr $GRAPH_COMMIT_DATA_OFFSET + \
+   $GRAPH_COMMIT_DATA_WIDTH \* $NUM_COMMITS`
+GRAPH_BYTE_OCTOPUS=`expr $GRAPH_OCTOPUS_DATA_OFFSET + 4`
 
 # usage: corrupt_graph_and_verify   
 # Manipulates the commit-graph file at the position
@@ -378,4 +383,9 @@ test_expect_success 'detect incorrect commit date' '
"commit date"
 '
 
+test_expect_success 'detect incorrect parent for octopus merge' '
+   corrupt_graph_and_verify $GRAPH_BYTE_OCTOPUS "\01" \
+   "invalid parent"
+'
+
 test_done
-- 
2.16.2.329.gfb62395de6



[PATCH v3 17/20] fsck: verify commit-graph

2018-05-24 Thread Derrick Stolee
If core.commitGraph is true, verify the contents of the commit-graph
during 'git fsck' using the 'git commit-graph verify' subcommand. Run
this check on all alternates, as well.

We use a new process for two reasons:

1. The subcommand decouples the details of loading and verifying a
   commit-graph file from the other fsck details.

2. The commit-graph verification requires the commits to be loaded
   in a specific order to guarantee we parse from the commit-graph
   file for some objects and from the object database for others.

Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
 Documentation/git-fsck.txt |  3 +++
 builtin/fsck.c | 21 +
 t/t5318-commit-graph.sh|  8 
 3 files changed, 32 insertions(+)

diff --git a/Documentation/git-fsck.txt b/Documentation/git-fsck.txt
index b9f060e3b2..ab9a93fb9b 100644
--- a/Documentation/git-fsck.txt
+++ b/Documentation/git-fsck.txt
@@ -110,6 +110,9 @@ Any corrupt objects you will have to find in backups or 
other archives
 (i.e., you can just remove them and do an 'rsync' with some other site in
 the hopes that somebody else has the object you have corrupted).
 
+If core.commitGraph is true, the commit-graph file will also be inspected
+using 'git commit-graph verify'. See linkgit:git-commit-graph[1].
+
 Extracted Diagnostics
 -
 
diff --git a/builtin/fsck.c b/builtin/fsck.c
index ef78c6c00c..a6d5045b77 100644
--- a/builtin/fsck.c
+++ b/builtin/fsck.c
@@ -16,6 +16,7 @@
 #include "streaming.h"
 #include "decorate.h"
 #include "packfile.h"
+#include "run-command.h"
 
 #define REACHABLE 0x0001
 #define SEEN  0x0002
@@ -45,6 +46,7 @@ static int name_objects;
 #define ERROR_REACHABLE 02
 #define ERROR_PACK 04
 #define ERROR_REFS 010
+#define ERROR_COMMIT_GRAPH 020
 
 static const char *describe_object(struct object *obj)
 {
@@ -815,5 +817,24 @@ int cmd_fsck(int argc, const char **argv, const char 
*prefix)
}
 
check_connectivity();
+
+   if (core_commit_graph) {
+   struct child_process commit_graph_verify = CHILD_PROCESS_INIT;
+   const char *verify_argv[] = { "commit-graph", "verify", NULL, 
NULL, NULL, NULL };
+   commit_graph_verify.argv = verify_argv;
+   commit_graph_verify.git_cmd = 1;
+
+   if (run_command(&commit_graph_verify))
+   errors_found |= ERROR_COMMIT_GRAPH;
+
+   prepare_alt_odb();
+   for (alt = alt_odb_list; alt; alt = alt->next) {
+   verify_argv[2] = "--object-dir";
+   verify_argv[3] = alt->path;
+   if (run_command(&commit_graph_verify))
+   errors_found |= ERROR_COMMIT_GRAPH;
+   }
+   }
+
return errors_found;
 }
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 2680a2ebff..4941937163 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -394,4 +394,12 @@ test_expect_success 'detect invalid checksum hash' '
"incorrect checksum"
 '
 
+test_expect_success 'git fsck (checks commit-graph)' '
+   cd "$TRASH_DIRECTORY/full" &&
+   git fsck &&
+   corrupt_graph_and_verify $GRAPH_BYTE_FOOTER "\00" \
+   "incorrect checksum" &&
+   test_must_fail git fsck
+'
+
 test_done
-- 
2.16.2.329.gfb62395de6



[PATCH v3 08/20] commit-graph: verify required chunks are present

2018-05-24 Thread Derrick Stolee
The commit-graph file requires the following three chunks:

* OID Fanout
* OID Lookup
* Commit Data

If any of these are missing, then the 'verify' subcommand should
report a failure. This includes cases where the chunk IDs are malformed
or the chunk count is truncated.

Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
 commit-graph.c  |  9 +
 t/t5318-commit-graph.sh | 29 +
 2 files changed, 38 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index 55b41664ee..06e3e4f9ba 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -860,5 +860,14 @@ int verify_commit_graph(struct commit_graph *g)
return 1;
}
 
+   verify_commit_graph_error = 0;
+
+   if (!g->chunk_oid_fanout)
+   graph_report("commit-graph is missing the OID Fanout chunk");
+   if (!g->chunk_oid_lookup)
+   graph_report("commit-graph is missing the OID Lookup chunk");
+   if (!g->chunk_commit_data)
+   graph_report("commit-graph is missing the Commit Data chunk");
+
return verify_commit_graph_error;
 }
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index bd64481c7a..4ef3fe3dc2 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -249,6 +249,15 @@ test_expect_success 'git commit-graph verify' '
 
 GRAPH_BYTE_VERSION=4
 GRAPH_BYTE_HASH=5
+GRAPH_BYTE_CHUNK_COUNT=6
+GRAPH_CHUNK_LOOKUP_OFFSET=8
+GRAPH_CHUNK_LOOKUP_WIDTH=12
+GRAPH_CHUNK_LOOKUP_ROWS=5
+GRAPH_BYTE_OID_FANOUT_ID=$GRAPH_CHUNK_LOOKUP_OFFSET
+GRAPH_BYTE_OID_LOOKUP_ID=`expr $GRAPH_CHUNK_LOOKUP_OFFSET + \
+ 1 \* $GRAPH_CHUNK_LOOKUP_WIDTH`
+GRAPH_BYTE_COMMIT_DATA_ID=`expr $GRAPH_CHUNK_LOOKUP_OFFSET + \
+   2 \* $GRAPH_CHUNK_LOOKUP_WIDTH`
 
 # usage: corrupt_graph_and_verify   
 # Manipulates the commit-graph file at the position
@@ -283,4 +292,24 @@ test_expect_success 'detect bad hash version' '
"hash version"
 '
 
+test_expect_success 'detect bad chunk count' '
+   corrupt_graph_and_verify $GRAPH_BYTE_CHUNK_COUNT "\02" \
+   "missing the Commit Data chunk"
+'
+
+test_expect_success 'detect missing OID fanout chunk' '
+   corrupt_graph_and_verify $GRAPH_BYTE_OID_FANOUT_ID "\0" \
+   "missing the OID Fanout chunk"
+'
+
+test_expect_success 'detect missing OID lookup chunk' '
+   corrupt_graph_and_verify $GRAPH_BYTE_OID_LOOKUP_ID "\0" \
+   "missing the OID Lookup chunk"
+'
+
+test_expect_success 'detect missing commit data chunk' '
+   corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_DATA_ID "\0" \
+   "missing the Commit Data chunk"
+'
+
 test_done
-- 
2.16.2.329.gfb62395de6



[PATCH v3 20/20] commit-graph: update design document

2018-05-24 Thread Derrick Stolee
The commit-graph feature is now integrated with 'fsck' and 'gc',
so remove those items from the "Future Work" section of the
commit-graph design document.

Also remove the section on lazy-loading trees, as that was completed
in an earlier patch series.

Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
 Documentation/technical/commit-graph.txt | 22 --
 1 file changed, 22 deletions(-)

diff --git a/Documentation/technical/commit-graph.txt 
b/Documentation/technical/commit-graph.txt
index e1a883eb46..c664acbd76 100644
--- a/Documentation/technical/commit-graph.txt
+++ b/Documentation/technical/commit-graph.txt
@@ -118,9 +118,6 @@ Future Work
 - The commit graph feature currently does not honor commit grafts. This can
   be remedied by duplicating or refactoring the current graft logic.
 
-- The 'commit-graph' subcommand does not have a "verify" mode that is
-  necessary for integration with fsck.
-
 - After computing and storing generation numbers, we must make graph
   walks aware of generation numbers to gain the performance benefits they
   enable. This will mostly be accomplished by swapping a commit-date-ordered
@@ -130,25 +127,6 @@ Future Work
 - 'log --topo-order'
 - 'tag --merged'
 
-- Currently, parse_commit_gently() requires filling in the root tree
-  object for a commit. This passes through lookup_tree() and consequently
-  lookup_object(). Also, it calls lookup_commit() when loading the parents.
-  These method calls check the ODB for object existence, even if the
-  consumer does not need the content. For example, we do not need the
-  tree contents when computing merge bases. Now that commit parsing is
-  removed from the computation time, these lookup operations are the
-  slowest operations keeping graph walks from being fast. Consider
-  loading these objects without verifying their existence in the ODB and
-  only loading them fully when consumers need them. Consider a method
-  such as "ensure_tree_loaded(commit)" that fully loads a tree before
-  using commit->tree.
-
-- The current design uses the 'commit-graph' subcommand to generate the graph.
-  When this feature stabilizes enough to recommend to most users, we should
-  add automatic graph writes to common operations that create many commits.
-  For example, one could compute a graph on 'clone', 'fetch', or 'repack'
-  commands.
-
 - A server could provide a commit graph file as part of the network protocol
   to avoid extra calculations by clients. This feature is only of benefit if
   the user is willing to trust the file, because verifying the file is correct
-- 
2.16.2.329.gfb62395de6



[PATCH v3 18/20] commit-graph: add '--reachable' option

2018-05-24 Thread Derrick Stolee
When writing commit-graph files, it can be convenient to ask for all
reachable commits (starting at the ref set) in the resulting file. This
is particularly helpful when writing to stdin is complicated, such as a
future integration with 'git gc' which will call
write_commit_graph_reachable() after performing cleanup of the object
database.

Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
 Documentation/git-commit-graph.txt |  8 ++--
 builtin/commit-graph.c | 16 
 commit-graph.c | 32 
 commit-graph.h |  1 +
 t/t5318-commit-graph.sh| 10 ++
 5 files changed, 61 insertions(+), 6 deletions(-)

diff --git a/Documentation/git-commit-graph.txt 
b/Documentation/git-commit-graph.txt
index a222cfab08..dececb79d7 100644
--- a/Documentation/git-commit-graph.txt
+++ b/Documentation/git-commit-graph.txt
@@ -38,12 +38,16 @@ Write a commit graph file based on the commits found in 
packfiles.
 +
 With the `--stdin-packs` option, generate the new commit graph by
 walking objects only in the specified pack-indexes. (Cannot be combined
-with --stdin-commits.)
+with `--stdin-commits` or `--reachable`.)
 +
 With the `--stdin-commits` option, generate the new commit graph by
 walking commits starting at the commits specified in stdin as a list
 of OIDs in hex, one OID per line. (Cannot be combined with
---stdin-packs.)
+`--stdin-packs` or `--reachable`.)
++
+With the `--reachable` option, generate the new commit graph by walking
+commits starting at all refs. (Cannot be combined with `--stdin-commits`
+or `--stdin-packs`.)
 +
 With the `--append` option, include all commits that are present in the
 existing commit-graph file.
diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
index 0433dd6e20..20ce6437ae 100644
--- a/builtin/commit-graph.c
+++ b/builtin/commit-graph.c
@@ -9,7 +9,7 @@ static char const * const builtin_commit_graph_usage[] = {
N_("git commit-graph [--object-dir ]"),
N_("git commit-graph read [--object-dir ]"),
N_("git commit-graph verify [--object-dir ]"),
-   N_("git commit-graph write [--object-dir ] [--append] 
[--stdin-packs|--stdin-commits]"),
+   N_("git commit-graph write [--object-dir ] [--append] 
[--reachable|--stdin-packs|--stdin-commits]"),
NULL
 };
 
@@ -24,12 +24,13 @@ static const char * const builtin_commit_graph_read_usage[] 
= {
 };
 
 static const char * const builtin_commit_graph_write_usage[] = {
-   N_("git commit-graph write [--object-dir ] [--append] 
[--stdin-packs|--stdin-commits]"),
+   N_("git commit-graph write [--object-dir ] [--append] 
[--reachable|--stdin-packs|--stdin-commits]"),
NULL
 };
 
 static struct opts_commit_graph {
const char *obj_dir;
+   int reachable;
int stdin_packs;
int stdin_commits;
int append;
@@ -130,6 +131,8 @@ static int graph_write(int argc, const char **argv)
OPT_STRING(0, "object-dir", _dir,
N_("dir"),
N_("The object directory to store the graph")),
+   OPT_BOOL(0, "reachable", &opts.reachable,
+   N_("start walk at all refs")),
OPT_BOOL(0, "stdin-packs", _packs,
N_("scan pack-indexes listed by stdin for commits")),
OPT_BOOL(0, "stdin-commits", _commits,
@@ -143,11 +146,16 @@ static int graph_write(int argc, const char **argv)
 builtin_commit_graph_write_options,
 builtin_commit_graph_write_usage, 0);
 
-   if (opts.stdin_packs && opts.stdin_commits)
-   die(_("cannot use both --stdin-commits and --stdin-packs"));
+   if (opts.reachable + opts.stdin_packs + opts.stdin_commits > 1)
+   die(_("use at most one of --reachable, --stdin-commits, or 
--stdin-packs"));
if (!opts.obj_dir)
opts.obj_dir = get_object_directory();
 
+   if (opts.reachable) {
+   write_commit_graph_reachable(opts.obj_dir, opts.append);
+   return 0;
+   }
+
if (opts.stdin_packs || opts.stdin_commits) {
struct strbuf buf = STRBUF_INIT;
lines_nr = 0;
diff --git a/commit-graph.c b/commit-graph.c
index a33600c584..057d734926 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -6,6 +6,7 @@
 #include "packfile.h"
 #include "commit.h"
 #include "object.h"
+#include "refs.h"
 #include "revision.h"
 #include "sha1-lookup.h"
 #include "commit-graph.h"
@@ -651,6 +652,37 @@ static void compute_generation_numbers(struct 
packed_commit_list* commits)
}
 }
 
+struct hex_list {
+   char **he

[PATCH v3 07/20] commit-graph: verify catches corrupt signature

2018-05-24 Thread Derrick Stolee
This is the first of several commits that add a test to check that
'git commit-graph verify' catches corruption in the commit-graph
file. The first test checks that the command catches an error in
the file signature. This is a check that exists in the existing
commit-graph reading code.

Add a helper method 'corrupt_graph_and_verify' to the test script
t5318-commit-graph.sh. This helper corrupts the commit-graph file
at a certain location, runs 'git commit-graph verify', and reports
the output to the 'err' file. This data is filtered to remove the
lines added by 'test_must_fail' when the test is run verbosely.
Then, the output is checked to contain a specific error message.

Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
 t/t5318-commit-graph.sh | 43 +++
 1 file changed, 43 insertions(+)

diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 6ca451dfd2..bd64481c7a 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -235,9 +235,52 @@ test_expect_success 'perform fast-forward merge in full 
repo' '
test_cmp expect output
 '
 
+# the verify tests below expect the commit-graph to contain
+# exactly the commits reachable from the commits/8 branch.
+# If the file changes the set of commits in the list, then the
+# offsets into the binary file will result in different edits
+# and the tests will likely break.
+
 test_expect_success 'git commit-graph verify' '
cd "$TRASH_DIRECTORY/full" &&
+   git rev-parse commits/8 | git commit-graph write --stdin-commits &&
git commit-graph verify >output
 '
 
+GRAPH_BYTE_VERSION=4
+GRAPH_BYTE_HASH=5
+
+# usage: corrupt_graph_and_verify <position> <data> <string>
+# Manipulates the commit-graph file at the position
+# by inserting the data, then runs 'git commit-graph verify'
+# and places the output in the file 'err'. Test 'err' for
+# the given string.
+corrupt_graph_and_verify() {
+   pos=$1
+   data="${2:-\0}"
+   grepstr=$3
+   cd "$TRASH_DIRECTORY/full" &&
+   test_when_finished mv commit-graph-backup $objdir/info/commit-graph &&
+   cp $objdir/info/commit-graph commit-graph-backup &&
+   printf "$data" | dd of="$objdir/info/commit-graph" bs=1 seek="$pos" 
conv=notrunc &&
+   test_must_fail git commit-graph verify 2>test_err &&
+   grep -v "^+" test_err >err
+   grep "$grepstr" err
+}
+
+test_expect_success 'detect bad signature' '
+   corrupt_graph_and_verify 0 "\0" \
+   "graph signature"
+'
+
+test_expect_success 'detect bad version' '
+   corrupt_graph_and_verify $GRAPH_BYTE_VERSION "\02" \
+   "graph version"
+'
+
+test_expect_success 'detect bad hash version' '
+   corrupt_graph_and_verify $GRAPH_BYTE_HASH "\02" \
+   "hash version"
+'
+
 test_done
-- 
2.16.2.329.gfb62395de6



[PATCH v3 03/20] commit-graph: parse commit from chosen graph

2018-05-24 Thread Derrick Stolee
Before verifying a commit-graph file against the object database, we
need to parse all commits from the given commit-graph file. Create
parse_commit_in_graph_one() to target a given struct commit_graph.

Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
 commit-graph.c | 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 82295f0975..78ba0edc80 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -310,7 +310,7 @@ static int find_commit_in_graph(struct commit *item, struct 
commit_graph *g, uin
}
 }
 
-int parse_commit_in_graph(struct commit *item)
+static int parse_commit_in_graph_one(struct commit_graph *g, struct commit 
*item)
 {
uint32_t pos;
 
@@ -318,9 +318,21 @@ int parse_commit_in_graph(struct commit *item)
return 0;
if (item->object.parsed)
return 1;
+
+   if (find_commit_in_graph(item, g, &pos))
+   return fill_commit_in_graph(item, g, pos);
+
+   return 0;
+}
+
+int parse_commit_in_graph(struct commit *item)
+{
+   if (!core_commit_graph)
+   return 0;
+
prepare_commit_graph();
-   if (commit_graph && find_commit_in_graph(item, commit_graph, &pos))
-   return fill_commit_in_graph(item, commit_graph, pos);
+   if (commit_graph)
+   return parse_commit_in_graph_one(commit_graph, item);
return 0;
 }
 
-- 
2.16.2.329.gfb62395de6



Re: [PATCH v3 01/20] commit-graph: UNLEAK before die()

2018-05-24 Thread Derrick Stolee

On 5/24/2018 6:47 PM, Stefan Beller wrote:

On Thu, May 24, 2018 at 9:25 AM, Derrick Stolee <dsto...@microsoft.com> wrote:

Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
  builtin/commit-graph.c | 5 -
  1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
index 37420ae0fd..f0875b8bf3 100644
--- a/builtin/commit-graph.c
+++ b/builtin/commit-graph.c
@@ -51,8 +51,11 @@ static int graph_read(int argc, const char **argv)
 graph_name = get_commit_graph_filename(opts.obj_dir);
 graph = load_commit_graph_one(graph_name);

-   if (!graph)
+   if (!graph) {
+   UNLEAK(graph_name);
 die("graph file %s does not exist", graph_name);

Unrelated to this patch: Is the command that ends up die()ing here
a plumbing or porcelain, or: Do we want to translate the message here?

In a lot of commands that show paths we single quote them '%s',
(speaking from experience with a lot of submodule path code)


This is for the 'git commit-graph read' command, which is plumbing (and 
'read' is really only for testing). I don't think this message requires 
translation.


I'll keep the quotes in mind for the future.

Thanks,

-Stolee



Re: [RFC PATCH 00/18] Multi-pack index (MIDX)

2018-06-06 Thread Derrick Stolee

On 6/6/2018 4:13 AM, Ævar Arnfjörð Bjarmason wrote:

On Mon, Jan 08 2018, Derrick Stolee wrote:


On 1/7/2018 5:42 PM, Ævar Arnfjörð Bjarmason wrote:

On Sun, Jan 07 2018, Derrick Stolee jotted:


  git log --oneline --raw --parents

Num Packs | Before MIDX | After MIDX |  Rel % | 1 pack %
--+-+++--
  1 | 35.64 s |35.28 s |  -1.0% |   -1.0%
 24 | 90.81 s |40.06 s | -55.9% |  +12.4%
127 |257.97 s |42.25 s | -83.6% |  +18.6%

The last column is the relative difference between the MIDX-enabled repo
and the single-pack repo. The goal of the MIDX feature is to present the
ODB as if it was fully repacked, so there is still room for improvement.

Changing the command to

  git log --oneline --raw --parents --abbrev=40

has no observable difference (sub 1% change in all cases). This is likely
due to the repack I used putting commits and trees in a small number of
packfiles so the MRU cache works very well. On more naturally-created
lists of packfiles, there can be up to 20% improvement on this command.

We are using a version of this patch with an upcoming release of GVFS.
This feature is particularly important in that space since GVFS performs
a "prefetch" step that downloads a pack of commits and trees on a daily
basis. These packfiles are placed in an alternate that is shared by all
enlistments. Some users have 150+ packfiles and the MRU misses and
abbreviation computations are significant. Now, GVFS manages the MIDX file
after adding new prefetch packfiles using the following command:

  git midx --write --update-head --delete-expired --pack-dir=

(Not a critique of this, just a (stupid) question)

What's the practical use-case for this feature? Since it doesn't help
with --abbrev=40 the speedup is all in the part that ensures we don't
show an ambiguous SHA-1.

The point of including the --abbrev=40 is to point out that object
lookups do not get slower with the MIDX feature. Using these "git log"
options is a good way to balance object lookups and abbreviations with
object parsing and diff machinery.[...]

[snip]


[...]And while the public data shape I shared did not show a
difference, our private testing of the Windows repository did show a
valuable improvement when isolating to object lookups and ignoring
abbreviation calculations.

Replying to this old thread since I see you're preparing the MIDX for
submission again and this seemed like the best venue.


You're really good at tracking new commits in my GitHub page ;)



Your WIP branch (github.com/git/derrickstolee/midx/upstream) still only
references the speedups in abbreviation calculations, but here you
allude to other improvements. It would be very nice to have some summary
of that in docs / commit messages when you submit this.


The new version is essentially a complete rewrite of the feature, since 
we learned a lot about how to add a new data store from the commit-graph 
series. The design document [1] refers to some of the immediate benefits 
and future benefits. Some of these future benefits were discussed at the 
contributor's summit [2].


[1] 
https://github.com/derrickstolee/git/blob/midx/upstream/Documentation/technical/midx.txt


[2] 
https://public-inbox.org/git/alpine.DEB.2.20.1803091557510.23109@alexmv-linux/

    Git Merge 2018 Contributor's summit notes (includes discussion of MIDX)



I've been meaning to get around to submitting something like I mentioned
in https://public-inbox.org/git/87efn0bkls@evledraar.gmail.com/
i.e. a way to expand the abbrev mode to not check disambiguations, which
would look something like:

 core.abbrev = 20
 core.validateAbbrev = false

Or:

 core.abbrev = +2
 core.validateAbbrev = false

So, using the example from the above referenced E-Mail +2 would make
linux.git emit hashes of 14 characters, without any abbreviation
checking (just trusting in statistics to work in your favor).

As noted by JS in this thread that wouldn't be acceptable for your
use-case, but there's plenty of people (including me) who'd appreciate
the speedup without being a 100% sure we're emitting unambiguous hashes,
since that trade-off is better than time spent generating another index
on-disk. So I see it as a complementary & orthogonal feature.

But with that implemented I wouldn't get any benefit from things that
use the MIDX that aren't abbreviations, so what are those?


The MIDX is built for handling many packfiles. As opposed to the 
commit-graph feature, your repo needs to be _really_big_ to need the 
MIDX. Most just repack into one packfile on a regular basis.


One case for vanilla Git: we've heard from lots of customers disabling 
gc.auto in their build machines because they didn't want to wait for a 
repack/gc after a fetch and before a build. Then, they end up in a 
many-pack situation because they never scheduled time for that repack/gc.


For GVFS, we virtualize 

[PATCH v5 12/21] commit-graph: verify parent list

2018-06-06 Thread Derrick Stolee
The commit-graph file stores parents in a two-column portion of the
commit data chunk. If there is only one parent, then the second column
stores 0x70000000 to indicate no second parent.

The 'verify' subcommand checks the parent list for the commit loaded
from the commit-graph and the one parsed from the object database. Test
these checks for corrupt parents, too many parents, and wrong parents.

Add a boundary check to insert_parent_or_die() for when the parent
position value is out of range.

The octopus merge will be tested in a later commit.

Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
 commit-graph.c  | 28 
 t/t5318-commit-graph.sh | 18 ++
 2 files changed, 46 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index d7e408a99d..fcebd0925c 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -243,6 +243,9 @@ static struct commit_list **insert_parent_or_die(struct 
commit_graph *g,
struct commit *c;
struct object_id oid;
 
+   if (pos >= g->num_commits)
+   die("invalid parent position %"PRIu64, pos);
+
hashcpy(oid.hash, g->chunk_oid_lookup + g->hash_len * pos);
c = lookup_commit();
if (!c)
@@ -906,6 +909,7 @@ int verify_commit_graph(struct commit_graph *g)
 
for (i = 0; i < g->num_commits; i++) {
struct commit *graph_commit, *odb_commit;
+   struct commit_list *graph_parents, *odb_parents;
 
hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
 
@@ -923,6 +927,30 @@ int verify_commit_graph(struct commit_graph *g)
 oid_to_hex(&cur_oid),
 
oid_to_hex(get_commit_tree_oid(graph_commit)),
 
oid_to_hex(get_commit_tree_oid(odb_commit)));
+
+   graph_parents = graph_commit->parents;
+   odb_parents = odb_commit->parents;
+
+   while (graph_parents) {
+   if (odb_parents == NULL) {
+   graph_report("commit-graph parent list for 
commit %s is too long",
+oid_to_hex(&cur_oid));
+   break;
+   }
+
+   if (oidcmp(&graph_parents->item->object.oid, 
&odb_parents->item->object.oid))
+   graph_report("commit-graph parent for %s is %s 
!= %s",
+oid_to_hex(&cur_oid),
+
oid_to_hex(&graph_parents->item->object.oid),
+
oid_to_hex(&odb_parents->item->object.oid));
+
+   graph_parents = graph_parents->next;
+   odb_parents = odb_parents->next;
+   }
+
+   if (odb_parents != NULL)
+   graph_report("commit-graph parent list for commit %s 
terminates early",
+oid_to_hex(&cur_oid));
}
 
return verify_commit_graph_error;
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index c0c1248eda..ec0964112a 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -269,6 +269,9 @@ GRAPH_BYTE_OID_LOOKUP_ORDER=$(($GRAPH_OID_LOOKUP_OFFSET + 
$HASH_LEN \* 8))
 GRAPH_BYTE_OID_LOOKUP_MISSING=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 4 + 
10))
 GRAPH_COMMIT_DATA_OFFSET=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 
$NUM_COMMITS))
 GRAPH_BYTE_COMMIT_TREE=$GRAPH_COMMIT_DATA_OFFSET
+GRAPH_BYTE_COMMIT_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN))
+GRAPH_BYTE_COMMIT_EXTRA_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 4))
+GRAPH_BYTE_COMMIT_WRONG_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 3))
 
 # usage: corrupt_graph_and_verify   
 # Manipulates the commit-graph file at the position
@@ -348,4 +351,19 @@ test_expect_success 'detect incorrect tree OID' '
"root tree OID for commit"
 '
 
+test_expect_success 'detect incorrect parent int-id' '
+   corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_PARENT "\01" \
+   "invalid parent"
+'
+
+test_expect_success 'detect extra parent int-id' '
+   corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_EXTRA_PARENT "\00" \
+   "is too long"
+'
+
+test_expect_success 'detect wrong parent' '
+   corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_WRONG_PARENT "\01" \
+   "commit-graph parent for"
+'
+
 test_done
-- 
2.18.0.rc1



[PATCH v5 11/21] commit-graph: verify root tree OIDs

2018-06-06 Thread Derrick Stolee
The 'verify' subcommand must compare the commit content parsed from the
commit-graph against the content in the object database. Use
lookup_commit() and parse_commit_in_graph_one() to parse the commits
from the graph and compare against a commit that is loaded separately
and parsed directly from the object database.

Add checks for the root tree OID.

Signed-off-by: Derrick Stolee <dsto...@microsoft.com>
---
 commit-graph.c  | 17 -
 t/t5318-commit-graph.sh |  7 +++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/commit-graph.c b/commit-graph.c
index 893cc2f346..d7e408a99d 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -865,6 +865,8 @@ int verify_commit_graph(struct commit_graph *g)
return verify_commit_graph_error;
 
for (i = 0; i < g->num_commits; i++) {
+   struct commit *graph_commit;
+
hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
 
if (i && oidcmp(_oid, _oid) >= 0)
@@ -882,6 +884,11 @@ int verify_commit_graph(struct commit_graph *g)
 
cur_fanout_pos++;
}
+
+   graph_commit = lookup_commit(_oid);
+   if (!parse_commit_in_graph_one(g, graph_commit))
+   graph_report("failed to parse %s from commit-graph",
+oid_to_hex(_oid));
}
 
while (cur_fanout_pos < 256) {
@@ -898,16 +905,24 @@ int verify_commit_graph(struct commit_graph *g)
return verify_commit_graph_error;
 
for (i = 0; i < g->num_commits; i++) {
-   struct commit *odb_commit;
+   struct commit *graph_commit, *odb_commit;
 
hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
 
+   graph_commit = lookup_commit(_oid);
odb_commit = (struct commit *)create_object(cur_oid.hash, 
alloc_commit_node());
if (parse_commit_internal(odb_commit, 0, 0)) {
graph_report("failed to parse %s from object database",
 oid_to_hex(_oid));
continue;
}
+
+   if (oidcmp(_commit_tree_in_graph_one(g, 
graph_commit)->object.oid,
+  get_commit_tree_oid(odb_commit)))
+   graph_report("root tree OID for commit %s in 
commit-graph is %s != %s",
+oid_to_hex(_oid),
+
oid_to_hex(get_commit_tree_oid(graph_commit)),
+
oid_to_hex(get_commit_tree_oid(odb_commit)));
}
 
return verify_commit_graph_error;
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index cf60e48496..c0c1248eda 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -267,6 +267,8 @@ GRAPH_BYTE_FANOUT2=$(($GRAPH_FANOUT_OFFSET + 4 \* 255))
 GRAPH_OID_LOOKUP_OFFSET=$(($GRAPH_FANOUT_OFFSET + 4 \* 256))
 GRAPH_BYTE_OID_LOOKUP_ORDER=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 8))
 GRAPH_BYTE_OID_LOOKUP_MISSING=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 4 + 
10))
+GRAPH_COMMIT_DATA_OFFSET=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 
$NUM_COMMITS))
+GRAPH_BYTE_COMMIT_TREE=$GRAPH_COMMIT_DATA_OFFSET
 
 # usage: corrupt_graph_and_verify <position> <data> <string>
 # Manipulates the commit-graph file at the position
@@ -341,4 +343,9 @@ test_expect_success 'detect OID not in object database' '
"from object database"
 '
 
+test_expect_success 'detect incorrect tree OID' '
+   corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_TREE "\01" \
+   "root tree OID for commit"
+'
+
 test_done
-- 
2.18.0.rc1



[PATCH v5 02/21] commit-graph: fix GRAPH_MIN_SIZE

2018-06-06 Thread Derrick Stolee
The GRAPH_MIN_SIZE macro should be the smallest size of a parsable
commit-graph file. However, the minimum number of chunks was wrong.
It is possible to write a commit-graph file with zero commits, and
that violates this macro's value.

Rewrite the macro, and use extra macros to better explain the magic
constants.

Signed-off-by: Derrick Stolee 
---
 commit-graph.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index b63a1fc85e..f83f6d2373 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -35,10 +35,11 @@
 
 #define GRAPH_LAST_EDGE 0x8000
 
+#define GRAPH_HEADER_SIZE 8
 #define GRAPH_FANOUT_SIZE (4 * 256)
 #define GRAPH_CHUNKLOOKUP_WIDTH 12
-#define GRAPH_MIN_SIZE (5 * GRAPH_CHUNKLOOKUP_WIDTH + GRAPH_FANOUT_SIZE + \
-   GRAPH_OID_LEN + 8)
+#define GRAPH_MIN_SIZE (GRAPH_HEADER_SIZE + 4 * GRAPH_CHUNKLOOKUP_WIDTH \
+   + GRAPH_FANOUT_SIZE + GRAPH_OID_LEN)
 
 char *get_commit_graph_filename(const char *obj_dir)
 {
-- 
2.18.0.rc1



[PATCH v5 08/21] commit-graph: verify required chunks are present

2018-06-06 Thread Derrick Stolee
The commit-graph file requires the following three chunks:

* OID Fanout
* OID Lookup
* Commit Data

If any of these are missing, then the 'verify' subcommand should
report a failure. This includes the cases where a chunk ID is
malformed or the chunk count is truncated.

Signed-off-by: Derrick Stolee 
---
 commit-graph.c  |  9 +
 t/t5318-commit-graph.sh | 29 +
 2 files changed, 38 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index 432920ad2a..f41d5a0504 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -848,5 +848,14 @@ int verify_commit_graph(struct commit_graph *g)
return 1;
}
 
+   verify_commit_graph_error = 0;
+
+   if (!g->chunk_oid_fanout)
+   graph_report("commit-graph is missing the OID Fanout chunk");
+   if (!g->chunk_oid_lookup)
+   graph_report("commit-graph is missing the OID Lookup chunk");
+   if (!g->chunk_commit_data)
+   graph_report("commit-graph is missing the Commit Data chunk");
+
return verify_commit_graph_error;
 }
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index c0c1ff09b9..846396665e 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -249,6 +249,15 @@ test_expect_success 'git commit-graph verify' '
 
 GRAPH_BYTE_VERSION=4
 GRAPH_BYTE_HASH=5
+GRAPH_BYTE_CHUNK_COUNT=6
+GRAPH_CHUNK_LOOKUP_OFFSET=8
+GRAPH_CHUNK_LOOKUP_WIDTH=12
+GRAPH_CHUNK_LOOKUP_ROWS=5
+GRAPH_BYTE_OID_FANOUT_ID=$GRAPH_CHUNK_LOOKUP_OFFSET
+GRAPH_BYTE_OID_LOOKUP_ID=$(($GRAPH_CHUNK_LOOKUP_OFFSET + \
+   1 \* $GRAPH_CHUNK_LOOKUP_WIDTH))
+GRAPH_BYTE_COMMIT_DATA_ID=$(($GRAPH_CHUNK_LOOKUP_OFFSET + \
+2 \* $GRAPH_CHUNK_LOOKUP_WIDTH))
 
 # usage: corrupt_graph_and_verify <position> <data> <string>
 # Manipulates the commit-graph file at the position
@@ -283,4 +292,24 @@ test_expect_success 'detect bad hash version' '
"hash version"
 '
 
+test_expect_success 'detect low chunk count' '
+   corrupt_graph_and_verify $GRAPH_BYTE_CHUNK_COUNT "\02" \
+   "missing the .* chunk"
+'
+
+test_expect_success 'detect missing OID fanout chunk' '
+   corrupt_graph_and_verify $GRAPH_BYTE_OID_FANOUT_ID "\0" \
+   "missing the OID Fanout chunk"
+'
+
+test_expect_success 'detect missing OID lookup chunk' '
+   corrupt_graph_and_verify $GRAPH_BYTE_OID_LOOKUP_ID "\0" \
+   "missing the OID Lookup chunk"
+'
+
+test_expect_success 'detect missing commit data chunk' '
+   corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_DATA_ID "\0" \
+   "missing the Commit Data chunk"
+'
+
 test_done
-- 
2.18.0.rc1



[PATCH v5 01/21] commit-graph: UNLEAK before die()

2018-06-06 Thread Derrick Stolee
Signed-off-by: Derrick Stolee 
---
 builtin/commit-graph.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
index 37420ae0fd..f0875b8bf3 100644
--- a/builtin/commit-graph.c
+++ b/builtin/commit-graph.c
@@ -51,8 +51,11 @@ static int graph_read(int argc, const char **argv)
graph_name = get_commit_graph_filename(opts.obj_dir);
graph = load_commit_graph_one(graph_name);
 
-   if (!graph)
+   if (!graph) {
+   UNLEAK(graph_name);
die("graph file %s does not exist", graph_name);
+   }
+
FREE_AND_NULL(graph_name);
 
printf("header: %08x %d %d %d %d\n",
-- 
2.18.0.rc1



[PATCH v5 06/21] commit-graph: add 'verify' subcommand

2018-06-06 Thread Derrick Stolee
If the commit-graph file becomes corrupt, we need a way to verify
that its contents match the object database. In the manner of
'git fsck' we will implement a 'git commit-graph verify' subcommand
to report all issues with the file.

Add the 'verify' subcommand to the 'commit-graph' builtin and its
documentation. The subcommand is currently a no-op except for
loading the commit-graph into memory, which may trigger run-time
errors that would be caught by normal use. Add a simple test that
ensures the command returns a zero error code.

If no commit-graph file exists, this is an acceptable state. Do
not report any errors.

Helped-by: Ramsay Jones 
Signed-off-by: Derrick Stolee 
---
 Documentation/git-commit-graph.txt |  6 +
 builtin/commit-graph.c | 38 ++
 commit-graph.c | 23 ++
 commit-graph.h |  2 ++
 t/t5318-commit-graph.sh| 10 
 5 files changed, 79 insertions(+)

diff --git a/Documentation/git-commit-graph.txt 
b/Documentation/git-commit-graph.txt
index 4c97b555cc..a222cfab08 100644
--- a/Documentation/git-commit-graph.txt
+++ b/Documentation/git-commit-graph.txt
@@ -10,6 +10,7 @@ SYNOPSIS
 
 [verse]
 'git commit-graph read' [--object-dir <dir>]
+'git commit-graph verify' [--object-dir <dir>]
 'git commit-graph write' <options> [--object-dir <dir>]
 
 
@@ -52,6 +53,11 @@ existing commit-graph file.
 Read a graph file given by the commit-graph file and output basic
 details about the graph file. Used for debugging purposes.
 
+'verify'::
+
+Read the commit-graph file and verify its contents against the object
+database. Used to check for corrupted data.
+
 
 EXAMPLES
 
diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
index f0875b8bf3..3079cde6f9 100644
--- a/builtin/commit-graph.c
+++ b/builtin/commit-graph.c
@@ -8,10 +8,16 @@
 static char const * const builtin_commit_graph_usage[] = {
N_("git commit-graph [--object-dir ]"),
N_("git commit-graph read [--object-dir ]"),
+   N_("git commit-graph verify [--object-dir ]"),
N_("git commit-graph write [--object-dir ] [--append] 
[--stdin-packs|--stdin-commits]"),
NULL
 };
 
+static const char * const builtin_commit_graph_verify_usage[] = {
+   N_("git commit-graph verify [--object-dir ]"),
+   NULL
+};
+
 static const char * const builtin_commit_graph_read_usage[] = {
N_("git commit-graph read [--object-dir ]"),
NULL
@@ -29,6 +35,36 @@ static struct opts_commit_graph {
int append;
 } opts;
 
+
+static int graph_verify(int argc, const char **argv)
+{
+   struct commit_graph *graph = NULL;
+   char *graph_name;
+
+   static struct option builtin_commit_graph_verify_options[] = {
+   OPT_STRING(0, "object-dir", _dir,
+  N_("dir"),
+  N_("The object directory to store the graph")),
+   OPT_END(),
+   };
+
+   argc = parse_options(argc, argv, NULL,
+builtin_commit_graph_verify_options,
+builtin_commit_graph_verify_usage, 0);
+
+   if (!opts.obj_dir)
+   opts.obj_dir = get_object_directory();
+
+   graph_name = get_commit_graph_filename(opts.obj_dir);
+   graph = load_commit_graph_one(graph_name);
+   FREE_AND_NULL(graph_name);
+
+   if (!graph)
+   return 0;
+
+   return verify_commit_graph(graph);
+}
+
 static int graph_read(int argc, const char **argv)
 {
struct commit_graph *graph = NULL;
@@ -165,6 +201,8 @@ int cmd_commit_graph(int argc, const char **argv, const 
char *prefix)
if (argc > 0) {
if (!strcmp(argv[0], "read"))
return graph_read(argc, argv);
+   if (!strcmp(argv[0], "verify"))
+   return graph_verify(argc, argv);
if (!strcmp(argv[0], "write"))
return graph_write(argc, argv);
}
diff --git a/commit-graph.c b/commit-graph.c
index 9e228d3bb5..432920ad2a 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -827,3 +827,26 @@ void write_commit_graph(const char *obj_dir,
oids.alloc = 0;
oids.nr = 0;
 }
+
+static int verify_commit_graph_error;
+
+static void graph_report(const char *fmt, ...)
+{
+   va_list ap;
+   verify_commit_graph_error = 1;
+
+   va_start(ap, fmt);
+   vfprintf(stderr, fmt, ap);
+   fprintf(stderr, "\n");
+   va_end(ap);
+}
+
+int verify_commit_graph(struct commit_graph *g)
+{
+   if (!g) {
+   graph_report("no commit-graph file loaded");
+   return 1;
+   }
+
+   return verify_commit_graph_error;
+}
diff --git a/commit-graph.h b/commit-graph.h
index 96cccb10f3..71a39c5a57 100644
--- a/commit-graph.h
+++ b/co

[PATCH v5 04/21] commit: force commit to parse from object database

2018-06-06 Thread Derrick Stolee
In anticipation of verifying commit-graph file contents against the
object database, create parse_commit_internal() to allow side-stepping
the commit-graph file and parse directly from the object database.

Due to the use of generation numbers, this method should not be called
unless the caller explicitly intends to bypass the commit-graph file
and parse the commit directly from the object database.

Signed-off-by: Derrick Stolee 
---
 commit.c | 9 +++--
 commit.h | 1 +
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/commit.c b/commit.c
index 298ad747c6..922bb68741 100644
--- a/commit.c
+++ b/commit.c
@@ -405,7 +405,7 @@ int parse_commit_buffer(struct commit *item, const void 
*buffer, unsigned long s
return 0;
 }
 
-int parse_commit_gently(struct commit *item, int quiet_on_missing)
+int parse_commit_internal(struct commit *item, int quiet_on_missing, int 
use_commit_graph)
 {
enum object_type type;
void *buffer;
@@ -416,7 +416,7 @@ int parse_commit_gently(struct commit *item, int 
quiet_on_missing)
return -1;
if (item->object.parsed)
return 0;
-   if (parse_commit_in_graph(item))
+   if (use_commit_graph && parse_commit_in_graph(item))
return 0;
buffer = read_object_file(>object.oid, , );
if (!buffer)
@@ -437,6 +437,11 @@ int parse_commit_gently(struct commit *item, int 
quiet_on_missing)
return ret;
 }
 
+int parse_commit_gently(struct commit *item, int quiet_on_missing)
+{
+   return parse_commit_internal(item, quiet_on_missing, 1);
+}
+
 void parse_commit_or_die(struct commit *item)
 {
if (parse_commit(item))
diff --git a/commit.h b/commit.h
index cb943013d0..4065fd12ac 100644
--- a/commit.h
+++ b/commit.h
@@ -77,6 +77,7 @@ struct commit *lookup_commit_reference_by_name(const char 
*name);
 struct commit *lookup_commit_or_die(const struct object_id *oid, const char 
*ref_name);
 
 int parse_commit_buffer(struct commit *item, const void *buffer, unsigned long 
size, int check_graph);
+int parse_commit_internal(struct commit *item, int quiet_on_missing, int 
use_commit_graph);
 int parse_commit_gently(struct commit *item, int quiet_on_missing);
 static inline int parse_commit(struct commit *item)
 {
-- 
2.18.0.rc1



Re: [PATCH v5 18/21] commit-graph: use string-list API for input

2018-06-06 Thread Derrick Stolee

On 6/6/2018 8:11 AM, Ævar Arnfjörð Bjarmason wrote:

On Wed, Jun 06 2018, Derrick Stolee wrote:


Signed-off-by: Derrick Stolee 
---
  builtin/commit-graph.c | 39 +--
  commit-graph.c | 15 +++
  commit-graph.h |  7 +++
  3 files changed, 23 insertions(+), 38 deletions(-)
diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
index 3079cde6f9..d8eb8278b3 100644
--- a/builtin/commit-graph.c
+++ b/builtin/commit-graph.c
@@ -118,13 +118,9 @@ static int graph_read(int argc, const char **argv)

  static int graph_write(int argc, const char **argv)
  {
-   const char **pack_indexes = NULL;
-   int packs_nr = 0;
-   const char **commit_hex = NULL;
-   int commits_nr = 0;
-   const char **lines = NULL;
-   int lines_nr = 0;
-   int lines_alloc = 0;
+   struct string_list *pack_indexes = NULL;
+   struct string_list *commit_hex = NULL;
+   struct string_list lines;

static struct option builtin_commit_graph_write_options[] = {
OPT_STRING(0, "object-dir", _dir,
@@ -150,32 +146,23 @@ static int graph_write(int argc, const char **argv)

if (opts.stdin_packs || opts.stdin_commits) {
struct strbuf buf = STRBUF_INIT;
-   lines_nr = 0;
-   lines_alloc = 128;
-   ALLOC_ARRAY(lines, lines_alloc);
-
-   while (strbuf_getline(, stdin) != EOF) {
-   ALLOC_GROW(lines, lines_nr + 1, lines_alloc);
-   lines[lines_nr++] = strbuf_detach(, NULL);
-   }
-
-   if (opts.stdin_packs) {
-   pack_indexes = lines;
-   packs_nr = lines_nr;
-   }
-   if (opts.stdin_commits) {
-   commit_hex = lines;
-   commits_nr = lines_nr;
-   }
+   string_list_init(, 0);
+
+   while (strbuf_getline(, stdin) != EOF)
+   string_list_append(, strbuf_detach(, NULL));
+
+   if (opts.stdin_packs)
+   pack_indexes = 
+   if (opts.stdin_commits)
+   commit_hex = 
}

write_commit_graph(opts.obj_dir,
   pack_indexes,
-  packs_nr,
   commit_hex,
-  commits_nr,
   opts.append);

+   string_list_clear(, 0);
return 0;
  }

This results in an invalid free() & segfault because you're freeing
&lines which may not have been allocated by string_list_init().


Good point. Did my tests not catch this? (seems it requires calling `git 
commit-graph write` with no `--stdin-packs` or `--stdin-commits`).




Monkeypatch on top which I used to fix it:

 diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
 index 76423b3fa5..c7eb68aa3a 100644
 --- a/builtin/commit-graph.c
 +++ b/builtin/commit-graph.c
 @@ -122,6 +122,7 @@ static int graph_write(int argc, const char **argv)
 struct string_list *pack_indexes = NULL;
 struct string_list *commit_hex = NULL;
 struct string_list lines;
 +   int free_lines = 0;

 static struct option builtin_commit_graph_write_options[] = {
 OPT_STRING(0, "object-dir", _dir,
 @@ -155,6 +156,7 @@ static int graph_write(int argc, const char **argv)
 if (opts.stdin_packs || opts.stdin_commits) {
 struct strbuf buf = STRBUF_INIT;
 string_list_init(, 0);
 +   free_lines = 1;

 while (strbuf_getline(, stdin) != EOF)
 string_list_append(, strbuf_detach(, 
NULL));
 @@ -170,7 +172,8 @@ static int graph_write(int argc, const char **argv)
commit_hex,
opts.append);

 -   string_list_clear(, 0);
 +   if (free_lines)
 +   string_list_clear(, 0);
 return 0;
  }

But probably having a pointer to the struct which is NULL etc. is
better.


Wouldn't the easiest fix be to call `string_list_init(&lines, 0)`
outside of any conditional?


Thanks,
-Stolee


Re: [PATCH v4 17/21] fsck: verify commit-graph

2018-06-06 Thread Derrick Stolee

On 6/6/2018 7:08 AM, Ævar Arnfjörð Bjarmason wrote:

On Mon, Jun 04 2018, Derrick Stolee wrote:


+   prepare_alt_odb();
+   for (alt = alt_odb_list; alt; alt = alt->next) {
+   verify_argv[2] = "--object-dir";
+   verify_argv[3] = alt->path;
+   if (run_command(_graph_verify))
+   errors_found |= ERROR_COMMIT_GRAPH;
+   }
+   }
+

This doesn't compile under clang on master. It needs to account for
0b20903405 ("sha1_file: add repository argument to prepare_alt_odb",
2018-03-23).

 builtin/fsck.c:837:19: error: too few arguments to function call, single 
argument 'r' was not specified
 prepare_alt_odb();
 ~~~ ^

Ditto this error due to a missing resolution with 031dc927f4
("object-store: move alt_odb_list and alt_odb_tail to object store",
2018-03-23):

 builtin/fsck.c:838:14: error: use of undeclared identifier 'alt_odb_list'
 for (alt = alt_odb_list; alt; alt = alt->next) {


Thanks, Ævar. I forgot to rebase onto 'next'. Doing so now and will send 
v5 shortly.


-Stolee


Re: [PATCH v5 18/21] commit-graph: use string-list API for input

2018-06-06 Thread Derrick Stolee

On 6/6/2018 8:26 AM, Ævar Arnfjörð Bjarmason wrote:

On Wed, Jun 06 2018, Derrick Stolee wrote:


On 6/6/2018 8:11 AM, Ævar Arnfjörð Bjarmason wrote:

On Wed, Jun 06 2018, Derrick Stolee wrote:


Signed-off-by: Derrick Stolee 
---
   builtin/commit-graph.c | 39 +--
   commit-graph.c | 15 +++
   commit-graph.h |  7 +++
   3 files changed, 23 insertions(+), 38 deletions(-)
diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
index 3079cde6f9..d8eb8278b3 100644
--- a/builtin/commit-graph.c
+++ b/builtin/commit-graph.c
@@ -118,13 +118,9 @@ static int graph_read(int argc, const char **argv)

   static int graph_write(int argc, const char **argv)
   {
-   const char **pack_indexes = NULL;
-   int packs_nr = 0;
-   const char **commit_hex = NULL;
-   int commits_nr = 0;
-   const char **lines = NULL;
-   int lines_nr = 0;
-   int lines_alloc = 0;
+   struct string_list *pack_indexes = NULL;
+   struct string_list *commit_hex = NULL;
+   struct string_list lines;

static struct option builtin_commit_graph_write_options[] = {
OPT_STRING(0, "object-dir", _dir,
@@ -150,32 +146,23 @@ static int graph_write(int argc, const char **argv)

if (opts.stdin_packs || opts.stdin_commits) {
struct strbuf buf = STRBUF_INIT;
-   lines_nr = 0;
-   lines_alloc = 128;
-   ALLOC_ARRAY(lines, lines_alloc);
-
-   while (strbuf_getline(, stdin) != EOF) {
-   ALLOC_GROW(lines, lines_nr + 1, lines_alloc);
-   lines[lines_nr++] = strbuf_detach(, NULL);
-   }
-
-   if (opts.stdin_packs) {
-   pack_indexes = lines;
-   packs_nr = lines_nr;
-   }
-   if (opts.stdin_commits) {
-   commit_hex = lines;
-   commits_nr = lines_nr;
-   }
+   string_list_init(, 0);
+
+   while (strbuf_getline(, stdin) != EOF)
+   string_list_append(, strbuf_detach(, NULL));
+
+   if (opts.stdin_packs)
+   pack_indexes = 
+   if (opts.stdin_commits)
+   commit_hex = 
}

write_commit_graph(opts.obj_dir,
   pack_indexes,
-  packs_nr,
   commit_hex,
-  commits_nr,
   opts.append);

+   string_list_clear(, 0);
return 0;
   }

This results in an invalid free() & segfault because you're freeing
&lines which may not have been allocated by string_list_init().

Good point. Did my tests not catch this? (seems it requires calling
`git commit-graph write` with no `--stdin-packs` or
`--stdin-commits`).

Most of your tests (t5318-commit-graph.sh) segfaulted, but presumably
you're on a more forgiving compiler/platform/options. I compiled with
-O0 -g on clang 4.0.1-8 + Debian testing.


I appreciate the extra platform testing. I'm using GCC on Ubuntu (gcc 
(Ubuntu 7.3.0-16ubuntu3) 7.3.0).





Monkeypatch on top which I used to fix it:

  diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
  index 76423b3fa5..c7eb68aa3a 100644
  --- a/builtin/commit-graph.c
  +++ b/builtin/commit-graph.c
  @@ -122,6 +122,7 @@ static int graph_write(int argc, const char **argv)
  struct string_list *pack_indexes = NULL;
  struct string_list *commit_hex = NULL;
  struct string_list lines;
  +   int free_lines = 0;

  static struct option builtin_commit_graph_write_options[] = {
  OPT_STRING(0, "object-dir", _dir,
  @@ -155,6 +156,7 @@ static int graph_write(int argc, const char **argv)
  if (opts.stdin_packs || opts.stdin_commits) {
  struct strbuf buf = STRBUF_INIT;
  string_list_init(, 0);
  +   free_lines = 1;

  while (strbuf_getline(, stdin) != EOF)
  string_list_append(, strbuf_detach(, 
NULL));
  @@ -170,7 +172,8 @@ static int graph_write(int argc, const char **argv)
 commit_hex,
 opts.append);

  -   string_list_clear(, 0);
  +   if (free_lines)
  +   string_list_clear(, 0);
  return 0;
   }

But probably having a pointer to the struct which is NULL etc. is
better.

Wouldn't the easiest fix be to call `string_list_init(&lines, 0)`
outside of any conditional?

Sure that works too. We'd be doing the init when we don't need it, but
it's not like this part is performance critical or anything...




[PATCH v5 20/21] gc: automatically write commit-graph files

2018-06-06 Thread Derrick Stolee
The commit-graph file is a very helpful feature for speeding up git
operations. In order to make it more useful, make it possible to
write the commit-graph file during standard garbage collection
operations.

Add a 'gc.commitGraph' config setting that triggers writing a
commit-graph file after any non-trivial 'git gc' command. Defaults to
false while the commit-graph feature matures. We specifically do not
want to have this on by default until the commit-graph feature is fully
integrated with history-modifying features like shallow clones.

Helped-by: Ævar Arnfjörð Bjarmason 
Signed-off-by: Derrick Stolee 
---
 Documentation/config.txt | 10 +-
 Documentation/git-gc.txt |  4 
 builtin/gc.c |  6 ++
 t/t5318-commit-graph.sh  | 14 ++
 4 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index ab641bf5a9..f2b5ed17c8 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -906,7 +906,8 @@ the `GIT_NOTES_REF` environment variable.  See 
linkgit:git-notes[1].
 
 core.commitGraph::
Enable git commit graph feature. Allows reading from the
-   commit-graph file.
+   commit-graph file. See `gc.commitGraph` for automatically
+   maintaining the file.
 
 core.sparseCheckout::
Enable "sparse checkout" feature. See section "Sparse checkout" in
@@ -1647,6 +1648,13 @@ this configuration variable is ignored, all packs except 
the base pack
 will be repacked. After this the number of packs should go below
 gc.autoPackLimit and gc.bigPackThreshold should be respected again.
 
+gc.commitGraph::
+   If true, then gc will rewrite the commit-graph file when
+   linkgit:git-gc[1] is run. When using linkgit:git-gc[1]
+   '--auto' the commit-graph will be updated if housekeeping is
+   required. Default is false. See linkgit:git-commit-graph[1]
+   for details.
+
 gc.logExpiry::
If the file gc.log exists, then `git gc --auto` won't run
unless that file is more than 'gc.logExpiry' old.  Default is
diff --git a/Documentation/git-gc.txt b/Documentation/git-gc.txt
index 24b2dd44fe..f5bc98ccb3 100644
--- a/Documentation/git-gc.txt
+++ b/Documentation/git-gc.txt
@@ -136,6 +136,10 @@ The optional configuration variable `gc.packRefs` 
determines if
 it within all non-bare repos or it can be set to a boolean value.
 This defaults to true.
 
+The optional configuration variable `gc.commitGraph` determines if
+'git gc' should run 'git commit-graph write'. This can be set to a
+boolean value. This defaults to false.
+
 The optional configuration variable `gc.aggressiveWindow` controls how
 much time is spent optimizing the delta compression of the objects in
 the repository when the --aggressive option is specified.  The larger
diff --git a/builtin/gc.c b/builtin/gc.c
index ccfb1ceaeb..4e06e8372d 100644
--- a/builtin/gc.c
+++ b/builtin/gc.c
@@ -20,6 +20,7 @@
 #include "sigchain.h"
 #include "argv-array.h"
 #include "commit.h"
+#include "commit-graph.h"
 #include "packfile.h"
 #include "object-store.h"
 #include "pack.h"
@@ -40,6 +41,7 @@ static int aggressive_depth = 50;
 static int aggressive_window = 250;
 static int gc_auto_threshold = 6700;
 static int gc_auto_pack_limit = 50;
+static int gc_commit_graph = 0;
 static int detach_auto = 1;
 static timestamp_t gc_log_expire_time;
 static const char *gc_log_expire = "1.day.ago";
@@ -129,6 +131,7 @@ static void gc_config(void)
git_config_get_int("gc.aggressivedepth", _depth);
git_config_get_int("gc.auto", _auto_threshold);
git_config_get_int("gc.autopacklimit", _auto_pack_limit);
+   git_config_get_bool("gc.commitgraph", _commit_graph);
git_config_get_bool("gc.autodetach", _auto);
git_config_get_expiry("gc.pruneexpire", _expire);
git_config_get_expiry("gc.worktreepruneexpire", 
_worktrees_expire);
@@ -641,6 +644,9 @@ int cmd_gc(int argc, const char **argv, const char *prefix)
if (pack_garbage.nr > 0)
clean_pack_garbage();
 
+   if (gc_commit_graph)
+   write_commit_graph_reachable(get_object_directory(), 0);
+
if (auto_gc && too_many_loose_objects())
warning(_("There are too many unreachable loose objects; "
"run 'git prune' to remove them."));
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index ffb2ed7c95..b24e8b6689 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -245,6 +245,20 @@ test_expect_success 'perform fast-forward merge in full 
repo' '
test_cmp expect output
 '
 
+test_expect_success 'check that gc computes commit-graph' '
+   cd "$TRASH_DIRECTORY/full" &&
+   git commit --allow-empty -m "

[PATCH v5 13/21] commit-graph: verify generation number

2018-06-06 Thread Derrick Stolee
While iterating through the commit parents, perform the generation
number calculation and compare against the value stored in the
commit-graph.

The tests demonstrate that having a different set of parents affects
the generation number calculation, and this value propagates to
descendants. Hence, we drop the single-line condition on the output.

Since Git will ship with the commit-graph feature without generation
numbers, we need to accept commit-graphs with all generation numbers
equal to zero. In this case, ignore the generation number calculation.

However, verify that we never have a mix of zero and non-zero
generation numbers. Create a test that sets one commit to generation
zero, so that all following commits report a failure because they have
a non-zero generation in a file that contains generation number zero.

Signed-off-by: Derrick Stolee 
---
 commit-graph.c  | 34 ++
 t/t5318-commit-graph.sh | 11 +++
 2 files changed, 45 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index fcebd0925c..b97fa05ec9 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -845,10 +845,14 @@ static void graph_report(const char *fmt, ...)
va_end(ap);
 }
 
+#define GENERATION_ZERO_EXISTS 1
+#define GENERATION_NUMBER_EXISTS 2
+
 int verify_commit_graph(struct commit_graph *g)
 {
uint32_t i, cur_fanout_pos = 0;
struct object_id prev_oid, cur_oid;
+   int generation_zero = 0;
 
if (!g) {
graph_report("no commit-graph file loaded");
@@ -910,6 +914,7 @@ int verify_commit_graph(struct commit_graph *g)
for (i = 0; i < g->num_commits; i++) {
struct commit *graph_commit, *odb_commit;
struct commit_list *graph_parents, *odb_parents;
+   uint32_t max_generation = 0;
 
hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
 
@@ -944,6 +949,9 @@ int verify_commit_graph(struct commit_graph *g)
 
oid_to_hex(_parents->item->object.oid),
 
oid_to_hex(_parents->item->object.oid));
 
+   if (graph_parents->item->generation > max_generation)
+   max_generation = 
graph_parents->item->generation;
+
graph_parents = graph_parents->next;
odb_parents = odb_parents->next;
}
@@ -951,6 +959,32 @@ int verify_commit_graph(struct commit_graph *g)
if (odb_parents != NULL)
graph_report("commit-graph parent list for commit %s 
terminates early",
 oid_to_hex(_oid));
+
+   if (!graph_commit->generation) {
+   if (generation_zero == GENERATION_NUMBER_EXISTS)
+   graph_report("commit-graph has generation 
number zero for commit %s, but non-zero elsewhere",
+oid_to_hex(_oid));
+   generation_zero = GENERATION_ZERO_EXISTS;
+   } else if (generation_zero == GENERATION_ZERO_EXISTS)
+   graph_report("commit-graph has non-zero generation 
number for commit %s, but zero elsewhere",
+oid_to_hex(_oid));
+
+   if (generation_zero == GENERATION_ZERO_EXISTS)
+   continue;
+
+   /*
+* If one of our parents has generation GENERATION_NUMBER_MAX, 
then
+* our generation is also GENERATION_NUMBER_MAX. Decrement to 
avoid
+* extra logic in the following condition.
+*/
+   if (max_generation == GENERATION_NUMBER_MAX)
+   max_generation--;
+
+   if (graph_commit->generation != max_generation + 1)
+   graph_report("commit-graph generation for commit %s is 
%u != %u",
+oid_to_hex(_oid),
+graph_commit->generation,
+max_generation + 1);
}
 
return verify_commit_graph_error;
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index ec0964112a..a6ea1341dc 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -272,6 +272,7 @@ GRAPH_BYTE_COMMIT_TREE=$GRAPH_COMMIT_DATA_OFFSET
 GRAPH_BYTE_COMMIT_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN))
 GRAPH_BYTE_COMMIT_EXTRA_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 4))
 GRAPH_BYTE_COMMIT_WRONG_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 3))
+GRAPH_BYTE_COMMIT_GENERATION=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 11))
 
 # usage: corrupt_graph_and_verify <position> <data> <string>
 # Manipulates the commit-graph file at the position
@@ -366,4 +367,14 @@ test_expect_success 'detect wrong parent' '

[PATCH v5 03/21] commit-graph: parse commit from chosen graph

2018-06-06 Thread Derrick Stolee
Before verifying a commit-graph file against the object database, we
need to parse all commits from the given commit-graph file. Create
parse_commit_in_graph_one() to target a given struct commit_graph.

Signed-off-by: Derrick Stolee 
---
 commit-graph.c | 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index f83f6d2373..e77b19971d 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -314,7 +314,7 @@ static int find_commit_in_graph(struct commit *item, struct 
commit_graph *g, uin
}
 }
 
-int parse_commit_in_graph(struct commit *item)
+static int parse_commit_in_graph_one(struct commit_graph *g, struct commit 
*item)
 {
uint32_t pos;
 
@@ -322,9 +322,21 @@ int parse_commit_in_graph(struct commit *item)
return 0;
if (item->object.parsed)
return 1;
+
+   if (find_commit_in_graph(item, g, ))
+   return fill_commit_in_graph(item, g, pos);
+
+   return 0;
+}
+
+int parse_commit_in_graph(struct commit *item)
+{
+   if (!core_commit_graph)
+   return 0;
+
prepare_commit_graph();
-   if (commit_graph && find_commit_in_graph(item, commit_graph, ))
-   return fill_commit_in_graph(item, commit_graph, pos);
+   if (commit_graph)
+   return parse_commit_in_graph_one(commit_graph, item);
return 0;
 }
 
-- 
2.18.0.rc1



[PATCH v5 10/21] commit-graph: verify objects exist

2018-06-06 Thread Derrick Stolee
In the 'verify' subcommand, load commits directly from the object
database to ensure they exist. Parse by skipping the commit-graph.

Signed-off-by: Derrick Stolee 
---
 commit-graph.c  | 17 +
 t/t5318-commit-graph.sh |  7 +++
 2 files changed, 24 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index d7a5b50a6c..893cc2f346 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -242,6 +242,7 @@ static struct commit_list **insert_parent_or_die(struct 
commit_graph *g,
 {
struct commit *c;
struct object_id oid;
+
hashcpy(oid.hash, g->chunk_oid_lookup + g->hash_len * pos);
c = lookup_commit(&oid);
if (!c)
@@ -893,5 +894,21 @@ int verify_commit_graph(struct commit_graph *g)
cur_fanout_pos++;
}
 
+   if (verify_commit_graph_error)
+   return verify_commit_graph_error;
+
+   for (i = 0; i < g->num_commits; i++) {
+   struct commit *odb_commit;
+
+   hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
+
+   odb_commit = (struct commit *)create_object(cur_oid.hash, 
alloc_commit_node());
+   if (parse_commit_internal(odb_commit, 0, 0)) {
+   graph_report("failed to parse %s from object database",
+oid_to_hex(&cur_oid));
+   continue;
+   }
+   }
+
return verify_commit_graph_error;
 }
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index c29eae47c9..cf60e48496 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -247,6 +247,7 @@ test_expect_success 'git commit-graph verify' '
git commit-graph verify >output
 '
 
+NUM_COMMITS=9
 HASH_LEN=20
 GRAPH_BYTE_VERSION=4
 GRAPH_BYTE_HASH=5
@@ -265,6 +266,7 @@ GRAPH_BYTE_FANOUT1=$(($GRAPH_FANOUT_OFFSET + 4 \* 4))
 GRAPH_BYTE_FANOUT2=$(($GRAPH_FANOUT_OFFSET + 4 \* 255))
 GRAPH_OID_LOOKUP_OFFSET=$(($GRAPH_FANOUT_OFFSET + 4 \* 256))
 GRAPH_BYTE_OID_LOOKUP_ORDER=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 8))
+GRAPH_BYTE_OID_LOOKUP_MISSING=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 4 + 
10))
 
 # usage: corrupt_graph_and_verify <position> <data> <string>
 # Manipulates the commit-graph file at the position
@@ -334,4 +336,9 @@ test_expect_success 'detect incorrect OID order' '
"incorrect OID order"
 '
 
+test_expect_success 'detect OID not in object database' '
+   corrupt_graph_and_verify $GRAPH_BYTE_OID_LOOKUP_MISSING "\01" \
+   "from object database"
+'
+
 test_done
-- 
2.18.0.rc1



[PATCH v5 19/21] commit-graph: add '--reachable' option

2018-06-06 Thread Derrick Stolee
When writing commit-graph files, it can be convenient to ask for all
reachable commits (starting at the ref set) in the resulting file. This
is particularly helpful when writing to stdin is complicated, such as a
future integration with 'git gc'.

Signed-off-by: Derrick Stolee 
---
 Documentation/git-commit-graph.txt |  8 ++--
 builtin/commit-graph.c | 16 
 commit-graph.c | 18 ++
 commit-graph.h |  1 +
 t/t5318-commit-graph.sh| 10 ++
 5 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/Documentation/git-commit-graph.txt 
b/Documentation/git-commit-graph.txt
index a222cfab08..dececb79d7 100644
--- a/Documentation/git-commit-graph.txt
+++ b/Documentation/git-commit-graph.txt
@@ -38,12 +38,16 @@ Write a commit graph file based on the commits found in 
packfiles.
 +
 With the `--stdin-packs` option, generate the new commit graph by
 walking objects only in the specified pack-indexes. (Cannot be combined
-with --stdin-commits.)
+with `--stdin-commits` or `--reachable`.)
 +
 With the `--stdin-commits` option, generate the new commit graph by
 walking commits starting at the commits specified in stdin as a list
 of OIDs in hex, one OID per line. (Cannot be combined with
---stdin-packs.)
+`--stdin-packs` or `--reachable`.)
++
+With the `--reachable` option, generate the new commit graph by walking
+commits starting at all refs. (Cannot be combined with `--stdin-commits`
+or `--stdin-packs`.)
 +
 With the `--append` option, include all commits that are present in the
 existing commit-graph file.
diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
index d8eb8278b3..76423b3fa5 100644
--- a/builtin/commit-graph.c
+++ b/builtin/commit-graph.c
@@ -9,7 +9,7 @@ static char const * const builtin_commit_graph_usage[] = {
N_("git commit-graph [--object-dir <objdir>]"),
N_("git commit-graph read [--object-dir <objdir>]"),
N_("git commit-graph verify [--object-dir <objdir>]"),
-   N_("git commit-graph write [--object-dir <objdir>] [--append] 
[--stdin-packs|--stdin-commits]"),
+   N_("git commit-graph write [--object-dir <objdir>] [--append] 
[--reachable|--stdin-packs|--stdin-commits]"),
NULL
 };
 
@@ -24,12 +24,13 @@ static const char * const builtin_commit_graph_read_usage[] 
= {
 };
 
 static const char * const builtin_commit_graph_write_usage[] = {
-   N_("git commit-graph write [--object-dir <objdir>] [--append] 
[--stdin-packs|--stdin-commits]"),
+   N_("git commit-graph write [--object-dir <objdir>] [--append] 
[--reachable|--stdin-packs|--stdin-commits]"),
NULL
 };
 
 static struct opts_commit_graph {
const char *obj_dir;
+   int reachable;
int stdin_packs;
int stdin_commits;
int append;
@@ -126,6 +127,8 @@ static int graph_write(int argc, const char **argv)
OPT_STRING(0, "object-dir", &opts.obj_dir,
N_("dir"),
N_("The object directory to store the graph")),
+   OPT_BOOL(0, "reachable", &opts.reachable,
+   N_("start walk at all refs")),
OPT_BOOL(0, "stdin-packs", &opts.stdin_packs,
N_("scan pack-indexes listed by stdin for commits")),
OPT_BOOL(0, "stdin-commits", &opts.stdin_commits,
@@ -139,11 +142,16 @@ static int graph_write(int argc, const char **argv)
 builtin_commit_graph_write_options,
 builtin_commit_graph_write_usage, 0);
 
-   if (opts.stdin_packs && opts.stdin_commits)
-   die(_("cannot use both --stdin-commits and --stdin-packs"));
+   if (opts.reachable + opts.stdin_packs + opts.stdin_commits > 1)
+   die(_("use at most one of --reachable, --stdin-commits, or 
--stdin-packs"));
if (!opts.obj_dir)
opts.obj_dir = get_object_directory();
 
+   if (opts.reachable) {
+   write_commit_graph_reachable(opts.obj_dir, opts.append);
+   return 0;
+   }
+
if (opts.stdin_packs || opts.stdin_commits) {
struct strbuf buf = STRBUF_INIT;
string_list_init(&lines, 0);
diff --git a/commit-graph.c b/commit-graph.c
index f23bf4cf50..0d5adc8035 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -7,6 +7,7 @@
 #include "packfile.h"
 #include "commit.h"
 #include "object.h"
+#include "refs.h"
 #include "revision.h"
 #include "sha1-lookup.h"
 #include "commit-graph.h"
@@ -655,6 +656,23 @@ static void compute_generation_numbers(struct 
packed_commit_list* commits)
}
 }
 
+static int add_ref_to_list(const char *refname,
+  const struct object_id *oid,
+  int flags, void *cb

[PATCH v5 18/21] commit-graph: use string-list API for input

2018-06-06 Thread Derrick Stolee
Signed-off-by: Derrick Stolee 
---
 builtin/commit-graph.c | 39 +--
 commit-graph.c | 15 +++
 commit-graph.h |  7 +++
 3 files changed, 23 insertions(+), 38 deletions(-)

diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
index 3079cde6f9..d8eb8278b3 100644
--- a/builtin/commit-graph.c
+++ b/builtin/commit-graph.c
@@ -118,13 +118,9 @@ static int graph_read(int argc, const char **argv)
 
 static int graph_write(int argc, const char **argv)
 {
-   const char **pack_indexes = NULL;
-   int packs_nr = 0;
-   const char **commit_hex = NULL;
-   int commits_nr = 0;
-   const char **lines = NULL;
-   int lines_nr = 0;
-   int lines_alloc = 0;
+   struct string_list *pack_indexes = NULL;
+   struct string_list *commit_hex = NULL;
+   struct string_list lines;
 
static struct option builtin_commit_graph_write_options[] = {
OPT_STRING(0, "object-dir", &opts.obj_dir,
@@ -150,32 +146,23 @@ static int graph_write(int argc, const char **argv)
 
if (opts.stdin_packs || opts.stdin_commits) {
struct strbuf buf = STRBUF_INIT;
-   lines_nr = 0;
-   lines_alloc = 128;
-   ALLOC_ARRAY(lines, lines_alloc);
-
-   while (strbuf_getline(&buf, stdin) != EOF) {
-   ALLOC_GROW(lines, lines_nr + 1, lines_alloc);
-   lines[lines_nr++] = strbuf_detach(&buf, NULL);
-   }
-
-   if (opts.stdin_packs) {
-   pack_indexes = lines;
-   packs_nr = lines_nr;
-   }
-   if (opts.stdin_commits) {
-   commit_hex = lines;
-   commits_nr = lines_nr;
-   }
+   string_list_init(&lines, 0);
+
+   while (strbuf_getline(&buf, stdin) != EOF)
+   string_list_append(&lines, strbuf_detach(&buf, NULL));
+
+   if (opts.stdin_packs)
+   pack_indexes = &lines;
+   if (opts.stdin_commits)
+   commit_hex = &lines;
}
 
write_commit_graph(opts.obj_dir,
   pack_indexes,
-  packs_nr,
   commit_hex,
-  commits_nr,
   opts.append);
 
+   string_list_clear(&lines, 0);
return 0;
 }
 
diff --git a/commit-graph.c b/commit-graph.c
index 0f93d5d864..f23bf4cf50 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -656,10 +656,8 @@ static void compute_generation_numbers(struct 
packed_commit_list* commits)
 }
 
 void write_commit_graph(const char *obj_dir,
-   const char **pack_indexes,
-   int nr_packs,
-   const char **commit_hex,
-   int nr_commits,
+   struct string_list *pack_indexes,
+   struct string_list *commit_hex,
int append)
 {
struct packed_oid_list oids;
@@ -700,10 +698,10 @@ void write_commit_graph(const char *obj_dir,
int dirlen;
strbuf_addf(&packname, "%s/pack/", obj_dir);
dirlen = packname.len;
-   for (i = 0; i < nr_packs; i++) {
+   for (i = 0; i < pack_indexes->nr; i++) {
struct packed_git *p;
strbuf_setlen(&packname, dirlen);
-   strbuf_addstr(&packname, pack_indexes[i]);
+   strbuf_addstr(&packname, pack_indexes->items[i].string);
p = add_packed_git(packname.buf, packname.len, 1);
if (!p)
die("error adding pack %s", packname.buf);
@@ -716,12 +714,13 @@ void write_commit_graph(const char *obj_dir,
}
 
if (commit_hex) {
-   for (i = 0; i < nr_commits; i++) {
+   for (i = 0; i < commit_hex->nr; i++) {
const char *end;
struct object_id oid;
struct commit *result;
 
-   if (commit_hex[i] && parse_oid_hex(commit_hex[i], &oid, 
&end))
+   if (commit_hex->items[i].string &&
+   parse_oid_hex(commit_hex->items[i].string, &oid, 
&end))
continue;
 
result = lookup_commit_reference_gently(&oid, 1);
diff --git a/commit-graph.h b/commit-graph.h
index 71a39c5a57..1e1fc5 100644
--- a/commit-graph.h
+++ b/commit-graph.h
@@ -2,6 +2,7 @@
 #define COMMIT_GRAPH_H
 
 #include "git-compat-util.h"
+#include "string-list.h"
 
 char *get_commit_graph_filename(const char *obj_dir);
 
@@ -47,10 +48,8 @@ struct commit_graph {
 struct commit_graph *load_commit_graph_one(const char *graph_file);
 
 void write_commit_graph(const char *obj

[PATCH v5 14/21] commit-graph: verify commit date

2018-06-06 Thread Derrick Stolee
Signed-off-by: Derrick Stolee 
---
 commit-graph.c  | 6 ++
 t/t5318-commit-graph.sh | 6 ++
 2 files changed, 12 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index b97fa05ec9..d83f0ce5d5 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -985,6 +985,12 @@ int verify_commit_graph(struct commit_graph *g)
 oid_to_hex(&cur_oid),
 graph_commit->generation,
 max_generation + 1);
+
+   if (graph_commit->date != odb_commit->date)
+   graph_report("commit date for commit %s in commit-graph 
is %"PRItime" != %"PRItime,
+oid_to_hex(&cur_oid),
+graph_commit->date,
+odb_commit->date);
}
 
return verify_commit_graph_error;
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index a6ea1341dc..6a873bfda8 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -273,6 +273,7 @@ GRAPH_BYTE_COMMIT_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + 
$HASH_LEN))
 GRAPH_BYTE_COMMIT_EXTRA_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 4))
 GRAPH_BYTE_COMMIT_WRONG_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 3))
 GRAPH_BYTE_COMMIT_GENERATION=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 11))
+GRAPH_BYTE_COMMIT_DATE=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 12))
 
 # usage: corrupt_graph_and_verify <position> <data> <string>
 # Manipulates the commit-graph file at the position
@@ -377,4 +378,9 @@ test_expect_success 'detect incorrect generation number' '
"non-zero generation number"
 '
 
+test_expect_success 'detect incorrect commit date' '
+   corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_DATE "\01" \
+   "commit date"
+'
+
 test_done
-- 
2.18.0.rc1



[PATCH v5 16/21] commit-graph: verify contents match checksum

2018-06-06 Thread Derrick Stolee
The commit-graph file ends with a SHA1 hash of the previous contents. If
a commit-graph file has errors but the checksum hash is correct, then we
know that the problem is a bug in Git and not simply file corruption
after-the-fact.

Compute the checksum right away so it is the first error that appears,
and make the message translatable since this error can be "corrected" by
a user by simply deleting the file and recomputing. The rest of the
errors are useful only to developers.

Be sure to continue checking the rest of the file data if the checksum
is wrong. This is important for our tests, as we break the checksum as
we modify bytes of the commit-graph file.

Signed-off-by: Derrick Stolee 
---
 commit-graph.c  | 16 ++--
 t/t5318-commit-graph.sh |  6 ++
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index d83f0ce5d5..0f93d5d864 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -832,6 +832,7 @@ void write_commit_graph(const char *obj_dir,
oids.nr = 0;
 }
 
+#define VERIFY_COMMIT_GRAPH_ERROR_HASH 2
 static int verify_commit_graph_error;
 
 static void graph_report(const char *fmt, ...)
@@ -851,8 +852,10 @@ static void graph_report(const char *fmt, ...)
 int verify_commit_graph(struct commit_graph *g)
 {
uint32_t i, cur_fanout_pos = 0;
-   struct object_id prev_oid, cur_oid;
+   struct object_id prev_oid, cur_oid, checksum;
int generation_zero = 0;
+   struct hashfile *f;
+   int devnull;
 
if (!g) {
graph_report("no commit-graph file loaded");
@@ -871,6 +874,15 @@ int verify_commit_graph(struct commit_graph *g)
if (verify_commit_graph_error)
return verify_commit_graph_error;
 
+   devnull = open("/dev/null", O_WRONLY);
+   f = hashfd(devnull, NULL);
+   hashwrite(f, g->data, g->data_len - g->hash_len);
+   finalize_hashfile(f, checksum.hash, CSUM_CLOSE);
+   if (hashcmp(checksum.hash, g->data + g->data_len - g->hash_len)) {
+   graph_report(_("the commit-graph file has incorrect checksum 
and is likely corrupt"));
+   verify_commit_graph_error = VERIFY_COMMIT_GRAPH_ERROR_HASH;
+   }
+
for (i = 0; i < g->num_commits; i++) {
struct commit *graph_commit;
 
@@ -908,7 +920,7 @@ int verify_commit_graph(struct commit_graph *g)
cur_fanout_pos++;
}
 
-   if (verify_commit_graph_error)
+   if (verify_commit_graph_error & ~VERIFY_COMMIT_GRAPH_ERROR_HASH)
return verify_commit_graph_error;
 
for (i = 0; i < g->num_commits; i++) {
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index cf67fb391a..2297a44e7d 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -279,6 +279,7 @@ GRAPH_COMMIT_DATA_WIDTH=$(($HASH_LEN + 16))
 GRAPH_OCTOPUS_DATA_OFFSET=$(($GRAPH_COMMIT_DATA_OFFSET + \
 $GRAPH_COMMIT_DATA_WIDTH \* $NUM_COMMITS))
 GRAPH_BYTE_OCTOPUS=$(($GRAPH_OCTOPUS_DATA_OFFSET + 4))
+GRAPH_BYTE_FOOTER=$(($GRAPH_OCTOPUS_DATA_OFFSET + 4 \* $NUM_OCTOPUS_EDGES))
 
 # usage: corrupt_graph_and_verify <position> <data> <string>
 # Manipulates the commit-graph file at the position
@@ -393,4 +394,9 @@ test_expect_success 'detect incorrect parent for octopus 
merge' '
"invalid parent"
 '
 
+test_expect_success 'detect invalid checksum hash' '
+   corrupt_graph_and_verify $GRAPH_BYTE_FOOTER "\00" \
+   "incorrect checksum"
+'
+
 test_done
-- 
2.18.0.rc1



[PATCH v5 07/21] commit-graph: verify catches corrupt signature

2018-06-06 Thread Derrick Stolee
This is the first of several commits that add a test to check that
'git commit-graph verify' catches corruption in the commit-graph
file. The first test checks that the command catches an error in
the file signature. This is a check that exists in the existing
commit-graph reading code.

Add a helper method 'corrupt_graph_and_verify' to the test script
t5318-commit-graph.sh. This helper corrupts the commit-graph file
at a certain location, runs 'git commit-graph verify', and reports
the output to the 'err' file. This data is filtered to remove the
lines added by 'test_must_fail' when the test is run verbosely.
Then, the output is checked to contain a specific error message.

Most messages from 'git commit-graph verify' will not be marked
for translation. There will be one exception: the message that
reports an invalid checksum will be marked for translation, as that
is the only message that is intended for a typical user.

Helped-by: Szeder Gábor 
Signed-off-by: Derrick Stolee 
---
 t/t5318-commit-graph.sh | 43 +
 1 file changed, 43 insertions(+)

diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 0830ef9fdd..c0c1ff09b9 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -235,9 +235,52 @@ test_expect_success 'perform fast-forward merge in full 
repo' '
test_cmp expect output
 '
 
+# the verify tests below expect the commit-graph to contain
+# exactly the commits reachable from the commits/8 branch.
+# If the file changes the set of commits in the list, then the
+# offsets into the binary file will result in different edits
+# and the tests will likely break.
+
 test_expect_success 'git commit-graph verify' '
cd "$TRASH_DIRECTORY/full" &&
+   git rev-parse commits/8 | git commit-graph write --stdin-commits &&
git commit-graph verify >output
 '
 
+GRAPH_BYTE_VERSION=4
+GRAPH_BYTE_HASH=5
+
+# usage: corrupt_graph_and_verify <position> <data> <string>
+# Manipulates the commit-graph file at the position
+# by inserting the data, then runs 'git commit-graph verify'
+# and places the output in the file 'err'. Test 'err' for
+# the given string.
+corrupt_graph_and_verify() {
+   pos=$1
+   data="${2:-\0}"
+   grepstr=$3
+   cd "$TRASH_DIRECTORY/full" &&
+   test_when_finished mv commit-graph-backup $objdir/info/commit-graph &&
+   cp $objdir/info/commit-graph commit-graph-backup &&
+   printf "$data" | dd of="$objdir/info/commit-graph" bs=1 seek="$pos" 
conv=notrunc &&
+   test_must_fail git commit-graph verify 2>test_err &&
+   grep -v "^+" test_err >err
+   test_i18ngrep "$grepstr" err
+}
+
+test_expect_success 'detect bad signature' '
+   corrupt_graph_and_verify 0 "\0" \
+   "graph signature"
+'
+
+test_expect_success 'detect bad version' '
+   corrupt_graph_and_verify $GRAPH_BYTE_VERSION "\02" \
+   "graph version"
+'
+
+test_expect_success 'detect bad hash version' '
+   corrupt_graph_and_verify $GRAPH_BYTE_HASH "\02" \
+   "hash version"
+'
+
 test_done
-- 
2.18.0.rc1



[PATCH v5 09/21] commit-graph: verify corrupt OID fanout and lookup

2018-06-06 Thread Derrick Stolee
In the commit-graph file, the OID fanout chunk provides an index into
the OID lookup. The 'verify' subcommand should find incorrect values
in the fanout.

Similarly, the 'verify' subcommand should find out-of-order values in
the OID lookup.

Signed-off-by: Derrick Stolee 
---
 commit-graph.c  | 36 
 t/t5318-commit-graph.sh | 22 ++
 2 files changed, 58 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index f41d5a0504..d7a5b50a6c 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -843,6 +843,9 @@ static void graph_report(const char *fmt, ...)
 
 int verify_commit_graph(struct commit_graph *g)
 {
+   uint32_t i, cur_fanout_pos = 0;
+   struct object_id prev_oid, cur_oid;
+
if (!g) {
graph_report("no commit-graph file loaded");
return 1;
@@ -857,5 +860,38 @@ int verify_commit_graph(struct commit_graph *g)
if (!g->chunk_commit_data)
graph_report("commit-graph is missing the Commit Data chunk");
 
+   if (verify_commit_graph_error)
+   return verify_commit_graph_error;
+
+   for (i = 0; i < g->num_commits; i++) {
+   hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
+
+   if (i && oidcmp(&prev_oid, &cur_oid) >= 0)
+   graph_report("commit-graph has incorrect OID order: %s 
then %s",
+oid_to_hex(&prev_oid),
+oid_to_hex(&cur_oid));
+
+   oidcpy(&prev_oid, &cur_oid);
+
+   while (cur_oid.hash[0] > cur_fanout_pos) {
+   uint32_t fanout_value = get_be32(g->chunk_oid_fanout + 
cur_fanout_pos);
+   if (i != fanout_value)
+   graph_report("commit-graph has incorrect fanout 
value: fanout[%d] = %u != %u",
+cur_fanout_pos, fanout_value, i);
+
+   cur_fanout_pos++;
+   }
+   }
+
+   while (cur_fanout_pos < 256) {
+   uint32_t fanout_value = get_be32(g->chunk_oid_fanout + 
cur_fanout_pos);
+
+   if (g->num_commits != fanout_value)
+   graph_report("commit-graph has incorrect fanout value: 
fanout[%d] = %u != %u",
+cur_fanout_pos, fanout_value, i);
+
+   cur_fanout_pos++;
+   }
+
return verify_commit_graph_error;
 }
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 846396665e..c29eae47c9 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -247,6 +247,7 @@ test_expect_success 'git commit-graph verify' '
git commit-graph verify >output
 '
 
+HASH_LEN=20
 GRAPH_BYTE_VERSION=4
 GRAPH_BYTE_HASH=5
 GRAPH_BYTE_CHUNK_COUNT=6
@@ -258,6 +259,12 @@ GRAPH_BYTE_OID_LOOKUP_ID=$(($GRAPH_CHUNK_LOOKUP_OFFSET + \
1 \* $GRAPH_CHUNK_LOOKUP_WIDTH))
 GRAPH_BYTE_COMMIT_DATA_ID=$(($GRAPH_CHUNK_LOOKUP_OFFSET + \
 2 \* $GRAPH_CHUNK_LOOKUP_WIDTH))
+GRAPH_FANOUT_OFFSET=$(($GRAPH_CHUNK_LOOKUP_OFFSET + \
+  $GRAPH_CHUNK_LOOKUP_WIDTH \* $GRAPH_CHUNK_LOOKUP_ROWS))
+GRAPH_BYTE_FANOUT1=$(($GRAPH_FANOUT_OFFSET + 4 \* 4))
+GRAPH_BYTE_FANOUT2=$(($GRAPH_FANOUT_OFFSET + 4 \* 255))
+GRAPH_OID_LOOKUP_OFFSET=$(($GRAPH_FANOUT_OFFSET + 4 \* 256))
+GRAPH_BYTE_OID_LOOKUP_ORDER=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 8))
 
 # usage: corrupt_graph_and_verify <position> <data> <string>
 # Manipulates the commit-graph file at the position
@@ -312,4 +319,19 @@ test_expect_success 'detect missing commit data chunk' '
"missing the Commit Data chunk"
 '
 
+test_expect_success 'detect incorrect fanout' '
+   corrupt_graph_and_verify $GRAPH_BYTE_FANOUT1 "\01" \
+   "fanout value"
+'
+
+test_expect_success 'detect incorrect fanout final value' '
+   corrupt_graph_and_verify $GRAPH_BYTE_FANOUT2 "\01" \
+   "fanout value"
+'
+
+test_expect_success 'detect incorrect OID order' '
+   corrupt_graph_and_verify $GRAPH_BYTE_OID_LOOKUP_ORDER "\01" \
+   "incorrect OID order"
+'
+
 test_done
-- 
2.18.0.rc1



[PATCH v5 00/21] Integrate commit-graph into 'fsck' and 'gc'

2018-06-06 Thread Derrick Stolee
Thanks, Ævar, for pointing out that I forgot to rebase onto 'next'.

There were a few collisions with the object-store refactoring. Junio
even pointed them out, so I'm sorry to forget that. I also did a test
merge with 'pu' and it seems the only conflicts are with the earlier
version of this patch.

Thanks,
-Stolee

Derrick Stolee (21):
  commit-graph: UNLEAK before die()
  commit-graph: fix GRAPH_MIN_SIZE
  commit-graph: parse commit from chosen graph
  commit: force commit to parse from object database
  commit-graph: load a root tree from specific graph
  commit-graph: add 'verify' subcommand
  commit-graph: verify catches corrupt signature
  commit-graph: verify required chunks are present
  commit-graph: verify corrupt OID fanout and lookup
  commit-graph: verify objects exist
  commit-graph: verify root tree OIDs
  commit-graph: verify parent list
  commit-graph: verify generation number
  commit-graph: verify commit date
  commit-graph: test for corrupted octopus edge
  commit-graph: verify contents match checksum
  fsck: verify commit-graph
  commit-graph: use string-list API for input
  commit-graph: add '--reachable' option
  gc: automatically write commit-graph files
  commit-graph: update design document

 Documentation/config.txt |  10 +-
 Documentation/git-commit-graph.txt   |  14 +-
 Documentation/git-fsck.txt   |   3 +
 Documentation/git-gc.txt |   4 +
 Documentation/technical/commit-graph.txt |  22 --
 builtin/commit-graph.c   |  98 ++---
 builtin/fsck.c   |  21 ++
 builtin/gc.c |   6 +
 commit-graph.c   | 248 +--
 commit-graph.h   |  10 +-
 commit.c |   9 +-
 commit.h |   1 +
 t/t5318-commit-graph.sh  | 201 ++
 13 files changed, 569 insertions(+), 78 deletions(-)

-- 
2.18.0.rc1



[PATCH v5 05/21] commit-graph: load a root tree from specific graph

2018-06-06 Thread Derrick Stolee
When lazy-loading a tree for a commit, it will be important to select
the tree from a specific struct commit_graph. Create a new method that
specifies the commit-graph file and use that in
get_commit_tree_in_graph().

Signed-off-by: Derrick Stolee 
---
 commit-graph.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index e77b19971d..9e228d3bb5 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -362,14 +362,20 @@ static struct tree *load_tree_for_commit(struct 
commit_graph *g, struct commit *
return c->maybe_tree;
 }
 
-struct tree *get_commit_tree_in_graph(const struct commit *c)
+static struct tree *get_commit_tree_in_graph_one(struct commit_graph *g,
+const struct commit *c)
 {
if (c->maybe_tree)
return c->maybe_tree;
if (c->graph_pos == COMMIT_NOT_FROM_GRAPH)
-   BUG("get_commit_tree_in_graph called from non-commit-graph 
commit");
+   BUG("get_commit_tree_in_graph_one called from non-commit-graph 
commit");
+
+   return load_tree_for_commit(g, (struct commit *)c);
+}
 
-   return load_tree_for_commit(commit_graph, (struct commit *)c);
+struct tree *get_commit_tree_in_graph(const struct commit *c)
+{
+   return get_commit_tree_in_graph_one(commit_graph, c);
 }
 
 static void write_graph_chunk_fanout(struct hashfile *f,
-- 
2.18.0.rc1



[PATCH v5 15/21] commit-graph: test for corrupted octopus edge

2018-06-06 Thread Derrick Stolee
The commit-graph file has an extra chunk to store the parent int-ids for
parents beyond the first parent for octopus merges. Our test repo has a
single octopus merge that we can manipulate to demonstrate the 'verify'
subcommand detects incorrect values in that chunk.

Signed-off-by: Derrick Stolee 
---
 t/t5318-commit-graph.sh | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 6a873bfda8..cf67fb391a 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -248,6 +248,7 @@ test_expect_success 'git commit-graph verify' '
 '
 
 NUM_COMMITS=9
+NUM_OCTOPUS_EDGES=2
 HASH_LEN=20
 GRAPH_BYTE_VERSION=4
 GRAPH_BYTE_HASH=5
@@ -274,6 +275,10 @@ 
GRAPH_BYTE_COMMIT_EXTRA_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 4))
 GRAPH_BYTE_COMMIT_WRONG_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 3))
 GRAPH_BYTE_COMMIT_GENERATION=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 11))
 GRAPH_BYTE_COMMIT_DATE=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 12))
+GRAPH_COMMIT_DATA_WIDTH=$(($HASH_LEN + 16))
+GRAPH_OCTOPUS_DATA_OFFSET=$(($GRAPH_COMMIT_DATA_OFFSET + \
+$GRAPH_COMMIT_DATA_WIDTH \* $NUM_COMMITS))
+GRAPH_BYTE_OCTOPUS=$(($GRAPH_OCTOPUS_DATA_OFFSET + 4))
 
 # usage: corrupt_graph_and_verify <position> <data> <string>
 # Manipulates the commit-graph file at the position
@@ -383,4 +388,9 @@ test_expect_success 'detect incorrect commit date' '
"commit date"
 '
 
+test_expect_success 'detect incorrect parent for octopus merge' '
+   corrupt_graph_and_verify $GRAPH_BYTE_OCTOPUS "\01" \
+   "invalid parent"
+'
+
 test_done
-- 
2.18.0.rc1



[PATCH v5 21/21] commit-graph: update design document

2018-06-06 Thread Derrick Stolee
The commit-graph feature is now integrated with 'fsck' and 'gc',
so remove those items from the "Future Work" section of the
commit-graph design document.

Also remove the section on lazy-loading trees, as that was completed
in an earlier patch series.

Signed-off-by: Derrick Stolee 
---
 Documentation/technical/commit-graph.txt | 22 --
 1 file changed, 22 deletions(-)

diff --git a/Documentation/technical/commit-graph.txt 
b/Documentation/technical/commit-graph.txt
index e1a883eb46..c664acbd76 100644
--- a/Documentation/technical/commit-graph.txt
+++ b/Documentation/technical/commit-graph.txt
@@ -118,9 +118,6 @@ Future Work
 - The commit graph feature currently does not honor commit grafts. This can
   be remedied by duplicating or refactoring the current graft logic.
 
-- The 'commit-graph' subcommand does not have a "verify" mode that is
-  necessary for integration with fsck.
-
 - After computing and storing generation numbers, we must make graph
   walks aware of generation numbers to gain the performance benefits they
   enable. This will mostly be accomplished by swapping a commit-date-ordered
@@ -130,25 +127,6 @@ Future Work
 - 'log --topo-order'
 - 'tag --merged'
 
-- Currently, parse_commit_gently() requires filling in the root tree
-  object for a commit. This passes through lookup_tree() and consequently
-  lookup_object(). Also, it calls lookup_commit() when loading the parents.
-  These method calls check the ODB for object existence, even if the
-  consumer does not need the content. For example, we do not need the
-  tree contents when computing merge bases. Now that commit parsing is
-  removed from the computation time, these lookup operations are the
-  slowest operations keeping graph walks from being fast. Consider
-  loading these objects without verifying their existence in the ODB and
-  only loading them fully when consumers need them. Consider a method
-  such as "ensure_tree_loaded(commit)" that fully loads a tree before
-  using commit->tree.
-
-- The current design uses the 'commit-graph' subcommand to generate the graph.
-  When this feature stabilizes enough to recommend to most users, we should
-  add automatic graph writes to common operations that create many commits.
-  For example, one could compute a graph on 'clone', 'fetch', or 'repack'
-  commands.
-
 - A server could provide a commit graph file as part of the network protocol
   to avoid extra calculations by clients. This feature is only of benefit if
   the user is willing to trust the file, because verifying the file is correct
-- 
2.18.0.rc1



[PATCH v5 17/21] fsck: verify commit-graph

2018-06-06 Thread Derrick Stolee
If core.commitGraph is true, verify the contents of the commit-graph
during 'git fsck' using the 'git commit-graph verify' subcommand. Run
this check on all alternates, as well.

We use a new process for two reasons:

1. The subcommand decouples the details of loading and verifying a
   commit-graph file from the other fsck details.

2. The commit-graph verification requires the commits to be loaded
   in a specific order to guarantee we parse from the commit-graph
   file for some objects and from the object database for others.

Signed-off-by: Derrick Stolee 
---
 Documentation/git-fsck.txt |  3 +++
 builtin/fsck.c | 21 +
 t/t5318-commit-graph.sh|  8 
 3 files changed, 32 insertions(+)

diff --git a/Documentation/git-fsck.txt b/Documentation/git-fsck.txt
index b9f060e3b2..ab9a93fb9b 100644
--- a/Documentation/git-fsck.txt
+++ b/Documentation/git-fsck.txt
@@ -110,6 +110,9 @@ Any corrupt objects you will have to find in backups or 
other archives
 (i.e., you can just remove them and do an 'rsync' with some other site in
 the hopes that somebody else has the object you have corrupted).
 
+If core.commitGraph is true, the commit-graph file will also be inspected
+using 'git commit-graph verify'. See linkgit:git-commit-graph[1].
+
 Extracted Diagnostics
 -
 
diff --git a/builtin/fsck.c b/builtin/fsck.c
index 3ad4f160f9..9fb2edc69f 100644
--- a/builtin/fsck.c
+++ b/builtin/fsck.c
@@ -18,6 +18,7 @@
 #include "decorate.h"
 #include "packfile.h"
 #include "object-store.h"
+#include "run-command.h"
 
 #define REACHABLE 0x0001
 #define SEEN  0x0002
@@ -47,6 +48,7 @@ static int name_objects;
 #define ERROR_REACHABLE 02
 #define ERROR_PACK 04
 #define ERROR_REFS 010
+#define ERROR_COMMIT_GRAPH 020
 
 static const char *describe_object(struct object *obj)
 {
@@ -822,5 +824,24 @@ int cmd_fsck(int argc, const char **argv, const char 
*prefix)
}
 
check_connectivity();
+
+   if (core_commit_graph) {
+   struct child_process commit_graph_verify = CHILD_PROCESS_INIT;
+   const char *verify_argv[] = { "commit-graph", "verify", NULL, 
NULL, NULL };
+   commit_graph_verify.argv = verify_argv;
+   commit_graph_verify.git_cmd = 1;
+
+   if (run_command(&commit_graph_verify))
+   errors_found |= ERROR_COMMIT_GRAPH;
+
+   prepare_alt_odb(the_repository);
+   for (alt =  the_repository->objects->alt_odb_list; alt; alt = 
alt->next) {
+   verify_argv[2] = "--object-dir";
+   verify_argv[3] = alt->path;
+   if (run_command(&commit_graph_verify))
+   errors_found |= ERROR_COMMIT_GRAPH;
+   }
+   }
+
return errors_found;
 }
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 2297a44e7d..44d4c71f0b 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -399,4 +399,12 @@ test_expect_success 'detect invalid checksum hash' '
"incorrect checksum"
 '
 
+test_expect_success 'git fsck (checks commit-graph)' '
+   cd "$TRASH_DIRECTORY/full" &&
+   git fsck &&
+   corrupt_graph_and_verify $GRAPH_BYTE_FOOTER "\00" \
+   "incorrect checksum" &&
+   test_must_fail git fsck
+'
+
 test_done
-- 
2.18.0.rc1



Re: [PATCH v6 00/21] Integrate commit-graph into 'fsck' and 'gc'

2018-06-08 Thread Derrick Stolee

On 6/8/2018 11:05 AM, Jakub Narębski wrote:

On Fri, 8 Jun 2018 at 15:56, Derrick Stolee  wrote:


[..], the following
diff occurs from the previous patch:

[...]

diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index b24e8b6689..9a0661983c 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -33,8 +33,8 @@ test_expect_success 'create commits and repack' '
  '

  graph_git_two_modes() {
-   git -c core.commitGraph=true $1 >output
-   git -c core.commitGraph=false $1 >expect
+   git -c core.graph=true $1 >output
+   git -c core.graph=false $1 >expect
 test_cmp output expect
  }

It seems that you have accidentally removed the fix from previous version.
It needs to be core.commitGraph, not core.graph.




I didn't rebase the fix that I sent as a separate patch [1] (we want 
that change applied to 'master' while this one targets topics in 'next' 
and 'pu'). So this specific diff is unfortunate noise.


Thanks!
-Stolee

[1] 
https://public-inbox.org/git/20180604123906.136417-1-dsto...@microsoft.com/

    [PATCH] t5318-commit-graph.sh: use core.commitGraph


Re: is there a canonical doc about how to deal with whitespace issues?

2018-06-08 Thread Derrick Stolee

On 6/8/2018 9:18 AM, Robert P. J. Day wrote:

   for one of my courses, i wanted to write a section about the various
techniques for dealing with whitespace issues in git, so i started
making a list, things like:

   - running "git diff --check"
   - "git commit --cleanup=" possibilities
   - config options like core.{eol,safecrlf,autocrlf}
   - i'm sure there are client-side hooks that can be mentioned

etc, etc.

   has anyone ever written a doc that collects these things in one
place? if not, i guess i have to write one.

rday



I don't know of a doc for whitespace issues, but the contributing guide 
on GitForWindows [1] recommends `git rebase --whitespace=fix`.


Thanks,
-Stolee

[1] 
https://github.com/git-for-windows/git/blob/master/CONTRIBUTING.md#polish-your-commits


[PATCH 11/23] midx: sort and deduplicate objects from packfiles

2018-06-07 Thread Derrick Stolee
Before writing a list of objects and their offsets to a multi-pack-index
(MIDX), we need to collect the list of objects contained in the
packfiles. There may be multiple copies of some objects, so this list
must be deduplicated.

It is possible to artificially get into a state where there are many
duplicate copies of objects. That can create high memory pressure if we
are to create a list of all objects before de-duplication. To reduce
this memory pressure without a significant performance drop,
automatically group objects by the first byte of their object id. Use
the IDX fanout tables to group the data, copy to a local array, then
sort.

Copy only the de-duplicated entries. Select the duplicate based on the
most-recent modified time of a packfile containing the object.

Signed-off-by: Derrick Stolee 
---
 midx.c | 138 +
 1 file changed, 138 insertions(+)

diff --git a/midx.c b/midx.c
index 923acda72e..b20d52713c 100644
--- a/midx.c
+++ b/midx.c
@@ -4,6 +4,7 @@
 #include "csum-file.h"
 #include "lockfile.h"
 #include "object-store.h"
+#include "packfile.h"
 #include "midx.h"
 
 #define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */
@@ -190,6 +191,140 @@ static void sort_packs_by_name(char **pack_names, 
uint32_t nr_packs, uint32_t *p
}
 }
 
+static uint32_t get_pack_fanout(struct packed_git *p, uint32_t value)
+{
+   const uint32_t *level1_ofs = p->index_data;
+
+   if (!level1_ofs) {
+   if (open_pack_index(p))
+   return 0;
+   level1_ofs = p->index_data;
+   }
+
+   if (p->index_version > 1) {
+   level1_ofs += 2;
+   }
+
+   return ntohl(level1_ofs[value]);
+}
+
+struct pack_midx_entry {
+   struct object_id oid;
+   uint32_t pack_int_id;
+   time_t pack_mtime;
+   uint64_t offset;
+};
+
+static int midx_oid_compare(const void *_a, const void *_b)
+{
+   struct pack_midx_entry *a = (struct pack_midx_entry *)_a;
+   struct pack_midx_entry *b = (struct pack_midx_entry *)_b;
+   int cmp = oidcmp(&a->oid, &b->oid);
+
+   if (cmp)
+   return cmp;
+
+   if (a->pack_mtime > b->pack_mtime)
+   return -1;
+   else if (a->pack_mtime < b->pack_mtime)
+   return 1;
+
+   return a->pack_int_id - b->pack_int_id;
+}
+
+static void fill_pack_entry(uint32_t pack_int_id,
+   struct packed_git *p,
+   uint32_t cur_object,
+   struct pack_midx_entry *entry)
+{
+   if (!nth_packed_object_oid(&entry->oid, p, cur_object))
+   die("failed to located object %d in packfile", cur_object);
+
+   entry->pack_int_id = pack_int_id;
+   entry->pack_mtime = p->mtime;
+
+   entry->offset = nth_packed_object_offset(p, cur_object);
+}
+
+/*
+ * It is possible to artificially get into a state where there are many
+ * duplicate copies of objects. That can create high memory pressure if
+ * we are to create a list of all objects before de-duplication. To reduce
+ * this memory pressure without a significant performance drop, automatically
+ * group objects by the first byte of their object id. Use the IDX fanout
+ * tables to group the data, copy to a local array, then sort.
+ *
+ * Copy only the de-duplicated entries (selected by most-recent modified time
+ * of a packfile containing the object).
+ */
+static struct pack_midx_entry *get_sorted_entries(struct packed_git **p,
+ uint32_t *perm,
+ uint32_t nr_packs,
+ uint32_t *nr_objects)
+{
+   uint32_t cur_fanout, cur_pack, cur_object;
+   uint32_t nr_fanout, alloc_fanout, alloc_objects, total_objects = 0;
+   struct pack_midx_entry *entries_by_fanout = NULL;
+   struct pack_midx_entry *deduplicated_entries = NULL;
+
+   for (cur_pack = 0; cur_pack < nr_packs; cur_pack++) {
+   if (open_pack_index(p[cur_pack]))
+   continue;
+
+   total_objects += p[cur_pack]->num_objects;
+   }
+
+   /*
+* As we de-duplicate by fanout value, we expect the fanout
+* slices to be evenly distributed, with some noise. Hence,
+* allocate slightly more than one 256th.
+*/
+   alloc_objects = alloc_fanout = total_objects > 3200 ? total_objects / 
200 : 16;
+
+   ALLOC_ARRAY(entries_by_fanout, alloc_fanout);
+   ALLOC_ARRAY(deduplicated_entries, alloc_objects);
+   *nr_objects = 0;
+
+   for (cur_fanout = 0; cur_fanout < 256; cur_fanout++) {
+   nr_fanout = 0;
+
+   for (cur_pack = 0; cur_pack < nr_packs; cur_pack++) {
+   uint32_t start = 0, end;
+
+   

[PATCH 18/23] midx: use midx in abbreviation calculations

2018-06-07 Thread Derrick Stolee
Signed-off-by: Derrick Stolee 
---
 midx.c  | 11 
 midx.h  |  3 +++
 packfile.c  |  6 +
 packfile.h  |  1 +
 sha1-name.c | 70 +
 t/t5319-midx.sh |  3 ++-
 6 files changed, 93 insertions(+), 1 deletion(-)

diff --git a/midx.c b/midx.c
index 6eca8f1b12..25d8142c2a 100644
--- a/midx.c
+++ b/midx.c
@@ -203,6 +203,17 @@ int bsearch_midx(const struct object_id *oid, struct 
midxed_git *m, uint32_t *re
MIDX_HASH_LEN, result);
 }
 
+struct object_id *nth_midxed_object_oid(struct object_id *oid,
+   struct midxed_git *m,
+   uint32_t n)
+{
+   if (n >= m->num_objects)
+   return NULL;
+
+   hashcpy(oid->hash, m->chunk_oid_lookup + m->hash_len * n);
+   return oid;
+}
+
 static off_t nth_midxed_offset(struct midxed_git *m, uint32_t pos)
 {
const unsigned char *offset_data;
diff --git a/midx.h b/midx.h
index 0c66812229..497bdcc77c 100644
--- a/midx.h
+++ b/midx.h
@@ -9,6 +9,9 @@
 
 struct midxed_git *load_midxed_git(const char *object_dir);
 int bsearch_midx(const struct object_id *oid, struct midxed_git *m, uint32_t 
*result);
+struct object_id *nth_midxed_object_oid(struct object_id *oid,
+   struct midxed_git *m,
+   uint32_t n);
 int fill_midx_entry(const struct object_id *oid, struct pack_entry *e, struct 
midxed_git *m);
 int prepare_midxed_git_one(struct repository *r, const char *object_dir);
 
diff --git a/packfile.c b/packfile.c
index 73f8cc28ee..638e113972 100644
--- a/packfile.c
+++ b/packfile.c
@@ -919,6 +919,12 @@ struct packed_git *get_packed_git(struct repository *r)
return r->objects->packed_git;
 }
 
+struct midxed_git *get_midxed_git(struct repository *r)
+{
+   prepare_packed_git(r);
+   return r->objects->midxed_git;
+}
+
 struct list_head *get_packed_git_mru(struct repository *r)
 {
prepare_packed_git(r);
diff --git a/packfile.h b/packfile.h
index e0a38aba93..01e14b93fd 100644
--- a/packfile.h
+++ b/packfile.h
@@ -39,6 +39,7 @@ extern void install_packed_git(struct repository *r, struct 
packed_git *pack);
 
 struct packed_git *get_packed_git(struct repository *r);
 struct list_head *get_packed_git_mru(struct repository *r);
+struct midxed_git *get_midxed_git(struct repository *r);
 
 /*
  * Give a rough count of objects in the repository. This sacrifices accuracy
diff --git a/sha1-name.c b/sha1-name.c
index 60d9ef3c7e..d975a186c9 100644
--- a/sha1-name.c
+++ b/sha1-name.c
@@ -12,6 +12,7 @@
 #include "packfile.h"
 #include "object-store.h"
 #include "repository.h"
+#include "midx.h"
 
 static int get_oid_oneline(const char *, struct object_id *, struct 
commit_list *);
 
@@ -149,6 +150,32 @@ static int match_sha(unsigned len, const unsigned char *a, 
const unsigned char *
return 1;
 }
 
+static void unique_in_midx(struct midxed_git *m,
+  struct disambiguate_state *ds)
+{
+   uint32_t num, i, first = 0;
+   const struct object_id *current = NULL;
+   num = m->num_objects;
+
+   if (!num)
+   return;
+
+   bsearch_midx(&ds->bin_pfx, m, &first);
+
+   /*
+* At this point, "first" is the location of the lowest object
+* with an object name that could match "bin_pfx".  See if we have
+* 0, 1 or more objects that actually match(es).
+*/
+   for (i = first; i < num && !ds->ambiguous; i++) {
+   struct object_id oid;
+   current = nth_midxed_object_oid(&oid, m, i);
+   if (!match_sha(ds->len, ds->bin_pfx.hash, current->hash))
+   break;
+   update_candidates(ds, current);
+   }
+}
+
 static void unique_in_pack(struct packed_git *p,
   struct disambiguate_state *ds)
 {
@@ -177,8 +204,12 @@ static void unique_in_pack(struct packed_git *p,
 
 static void find_short_packed_object(struct disambiguate_state *ds)
 {
+   struct midxed_git *m;
struct packed_git *p;
 
+   for (m = get_midxed_git(the_repository); m && !ds->ambiguous;
+m = m->next)
+   unique_in_midx(m, ds);
for (p = get_packed_git(the_repository); p && !ds->ambiguous;
 p = p->next)
unique_in_pack(p, ds);
@@ -527,6 +558,42 @@ static int extend_abbrev_len(const struct object_id *oid, 
void *cb_data)
return 0;
 }
 
+static void find_abbrev_len_for_midx(struct midxed_git *m,
+struct min_abbrev_data *mad)
+{
+   int match = 0;
+   uint32_t num, first = 0;
+   struct object_id oid;
+   const struct object_id *mad_oid;
+
+   if (!m->num_objects)
+   return;
+
+   

[PATCH 22/23] midx: use midx to find ref-deltas

2018-06-07 Thread Derrick Stolee
Signed-off-by: Derrick Stolee 
---
 midx.c |  2 +-
 midx.h |  1 +
 packfile.c | 15 +++
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/midx.c b/midx.c
index 3242646fe0..e46f392fa4 100644
--- a/midx.c
+++ b/midx.c
@@ -214,7 +214,7 @@ struct object_id *nth_midxed_object_oid(struct object_id 
*oid,
return oid;
 }
 
-static off_t nth_midxed_offset(struct midxed_git *m, uint32_t pos)
+off_t nth_midxed_offset(struct midxed_git *m, uint32_t pos)
 {
const unsigned char *offset_data;
uint32_t offset32;
diff --git a/midx.h b/midx.h
index c1db58d8c4..6996b5ff6b 100644
--- a/midx.h
+++ b/midx.h
@@ -9,6 +9,7 @@
 
 struct midxed_git *load_midxed_git(const char *object_dir);
 int bsearch_midx(const struct object_id *oid, struct midxed_git *m, uint32_t 
*result);
+off_t nth_midxed_offset(struct midxed_git *m, uint32_t n);
 struct object_id *nth_midxed_object_oid(struct object_id *oid,
struct midxed_git *m,
uint32_t n);
diff --git a/packfile.c b/packfile.c
index 479cb69b9f..9b814c89c7 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1794,6 +1794,21 @@ off_t find_pack_entry_one(const unsigned char *sha1,
uint32_t result;
 
if (!index) {
+   /*
+* If we have a MIDX, then we want to
+* check the MIDX for the offset instead.
+*/
+   struct midxed_git *m;
+
+   for (m = get_midxed_git(the_repository); m; m = m->next) {
+   if (midx_contains_pack(m, p->pack_name)) {
+   if (bsearch_midx(, m, ))
+   return nth_midxed_offset(m, result);
+
+   break;
+   }
+   }
+
if (open_pack_index(p))
return 0;
}
-- 
2.18.0.rc1



[PATCH 21/23] midx: prevent duplicate packfile loads

2018-06-07 Thread Derrick Stolee
If the multi-pack-index contains a packfile, then we do not need to add
that packfile to the packed_git linked list or the MRU list.

Signed-off-by: Derrick Stolee 
---
 midx.c | 23 +++
 midx.h |  1 +
 packfile.c |  7 +++
 3 files changed, 31 insertions(+)

diff --git a/midx.c b/midx.c
index 388d79b7d9..3242646fe0 100644
--- a/midx.c
+++ b/midx.c
@@ -278,6 +278,29 @@ int fill_midx_entry(const struct object_id *oid, struct 
pack_entry *e, struct mi
return nth_midxed_pack_entry(m, e, pos);
 }
 
+int midx_contains_pack(struct midxed_git *m, const char *idx_name)
+{
+   uint32_t first = 0, last = m->num_packs;
+
+   while (first < last) {
+   uint32_t mid = first + (last - first) / 2;
+   const char *current;
+   int cmp;
+
+   current = m->pack_names[mid];
+   cmp = strcmp(idx_name, current);
+   if (!cmp)
+   return 1;
+   if (cmp > 0) {
+   first = mid + 1;
+   continue;
+   }
+   last = mid;
+   }
+
+   return 0;
+}
+
 int prepare_midxed_git_one(struct repository *r, const char *object_dir)
 {
struct midxed_git *m = r->objects->midxed_git;
diff --git a/midx.h b/midx.h
index 497bdcc77c..c1db58d8c4 100644
--- a/midx.h
+++ b/midx.h
@@ -13,6 +13,7 @@ struct object_id *nth_midxed_object_oid(struct object_id *oid,
struct midxed_git *m,
uint32_t n);
 int fill_midx_entry(const struct object_id *oid, struct pack_entry *e, struct 
midxed_git *m);
+int midx_contains_pack(struct midxed_git *m, const char *idx_name);
 int prepare_midxed_git_one(struct repository *r, const char *object_dir);
 
 int write_midx_file(const char *object_dir);
diff --git a/packfile.c b/packfile.c
index 059b2aa097..479cb69b9f 100644
--- a/packfile.c
+++ b/packfile.c
@@ -746,6 +746,11 @@ static void prepare_packed_git_one(struct repository *r, 
char *objdir, int local
DIR *dir;
struct dirent *de;
struct string_list garbage = STRING_LIST_INIT_DUP;
+   struct midxed_git *m = r->objects->midxed_git;
+
+   /* look for the multi-pack-index for this object directory */
+   while (m && strcmp(m->object_dir, objdir))
+   m = m->next;
 
strbuf_addstr(&path, objdir);
strbuf_addstr(&path, "/pack");
@@ -772,6 +777,8 @@ static void prepare_packed_git_one(struct repository *r, 
char *objdir, int local
base_len = path.len;
if (strip_suffix_mem(path.buf, &base_len, ".idx")) {
/* Don't reopen a pack we already have. */
+   if (m && midx_contains_pack(m, de->d_name))
+   continue;
for (p = r->objects->packed_git; p;
 p = p->next) {
size_t len;
-- 
2.18.0.rc1



[PATCH 15/23] midx: create core.midx config setting

2018-06-07 Thread Derrick Stolee
The core.midx config setting controls the multi-pack-index (MIDX)
feature. If false, the setting will disable all reads from the
multi-pack-index file.

Add comparison commands in t5319-midx.sh to check typical Git behavior
remains the same as the config setting is turned on and off. This
currently includes 'git rev-list' and 'git log' commands to trigger
several object database reads.

Signed-off-by: Derrick Stolee 
---
 Documentation/config.txt |  4 +++
 cache.h  |  1 +
 config.c |  5 
 environment.c|  1 +
 t/t5319-midx.sh  | 57 
 5 files changed, 57 insertions(+), 11 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index ab641bf5a9..e78150e452 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -908,6 +908,10 @@ core.commitGraph::
Enable git commit graph feature. Allows reading from the
commit-graph file.
 
+core.midx::
+   Enable multi-pack-index feature. Allows reading from the multi-
+   pack-index file.
+
 core.sparseCheckout::
Enable "sparse checkout" feature. See section "Sparse checkout" in
linkgit:git-read-tree[1] for more information.
diff --git a/cache.h b/cache.h
index 89a107a7f7..c7967f7643 100644
--- a/cache.h
+++ b/cache.h
@@ -814,6 +814,7 @@ extern char *git_replace_ref_base;
 extern int fsync_object_files;
 extern int core_preload_index;
 extern int core_commit_graph;
+extern int core_midx;
 extern int core_apply_sparse_checkout;
 extern int precomposed_unicode;
 extern int protect_hfs;
diff --git a/config.c b/config.c
index fbbf0f8e9f..0df3dbdf74 100644
--- a/config.c
+++ b/config.c
@@ -1313,6 +1313,11 @@ static int git_default_core_config(const char *var, 
const char *value)
return 0;
}
 
+   if (!strcmp(var, "core.midx")) {
+   core_midx = git_config_bool(var, value);
+   return 0;
+   }
+
if (!strcmp(var, "core.sparsecheckout")) {
core_apply_sparse_checkout = git_config_bool(var, value);
return 0;
diff --git a/environment.c b/environment.c
index 2a6de2330b..dcb4417604 100644
--- a/environment.c
+++ b/environment.c
@@ -67,6 +67,7 @@ enum object_creation_mode object_creation_mode = 
OBJECT_CREATION_MODE;
 char *notes_ref_name;
 int grafts_replace_parents = 1;
 int core_commit_graph;
+int core_midx;
 int core_apply_sparse_checkout;
 int merge_log_config = -1;
 int precomposed_unicode = -1; /* see probe_utf8_pathname_composition() */
diff --git a/t/t5319-midx.sh b/t/t5319-midx.sh
index 709652c635..1a50987778 100755
--- a/t/t5319-midx.sh
+++ b/t/t5319-midx.sh
@@ -3,6 +3,8 @@
 test_description='multi-pack-indexes'
 . ./test-lib.sh
 
+objdir=.git/objects
+
 midx_read_expect() {
NUM_PACKS=$1
NUM_OBJECTS=$2
@@ -62,13 +64,42 @@ test_expect_success 'write midx with one v1 pack' '
midx_read_expect 1 17 5 .
 '
 
+midx_git_two_modes() {
+   git -c core.midx=false $1 >expect &&
+   git -c core.midx=true $1 >actual &&
+   test_cmp expect actual
+}
+
+compare_results_with_midx() {
+   MSG=$1
+   test_expect_success "check normal git operations: $MSG" '
+   midx_git_two_modes "rev-list --objects --all" &&
+   midx_git_two_modes "log --raw"
+   '
+}
+
 test_expect_success 'write midx with one v2 pack' '
-   pack=$(git pack-objects --index-version=2,0x40 pack/test expect &&
+   git -c core.midx=true $1 >actual &&
+   test_cmp expect actual
+}
+
+compare_results_with_midx() {
+   MSG=$1
+   test_expect_success "check normal git operations: $MSG" '
+   midx_git_two_modes "rev-list --objects --all" &&
+   midx_git_two_modes "log --raw"
+   '
+}
+
+compare_results_with_midx "one v2 pack"
+
 test_expect_success 'Add more objects' '
for i in `test_seq 6 10`
do
@@ -94,12 +125,13 @@ test_expect_success 'Add more objects' '
 '
 
 test_expect_success 'write midx with two packs' '
-   pack1=$(git pack-objects --index-version=1 pack/test-1 obj-list &&
git update-ref HEAD $commit &&
-   git pack-objects --index-version=2 pack/test-pack   []
 corrupt_data() {
-- 
2.18.0.rc1



[PATCH 19/23] midx: use existing midx when writing new one

2018-06-07 Thread Derrick Stolee
Signed-off-by: Derrick Stolee 
---
 midx.c | 68 +-
 1 file changed, 63 insertions(+), 5 deletions(-)

diff --git a/midx.c b/midx.c
index 25d8142c2a..388d79b7d9 100644
--- a/midx.c
+++ b/midx.c
@@ -389,6 +389,23 @@ static int midx_oid_compare(const void *_a, const void *_b)
return a->pack_int_id - b->pack_int_id;
 }
 
+static int nth_midxed_pack_midx_entry(struct midxed_git *m,
+ uint32_t *pack_perm,
+ struct pack_midx_entry *e,
+ uint32_t pos)
+{
+   if (pos >= m->num_objects)
+   return 1;
+
+   nth_midxed_object_oid(&e->oid, m, pos);
+   e->pack_int_id = pack_perm[nth_midxed_pack_int_id(m, pos)];
+   e->offset = nth_midxed_offset(m, pos);
+
+   /* consider objects in midx to be from "old" packs */
+   e->pack_mtime = 0;
+   return 0;
+}
+
 static void fill_pack_entry(uint32_t pack_int_id,
struct packed_git *p,
uint32_t cur_object,
@@ -414,7 +431,8 @@ static void fill_pack_entry(uint32_t pack_int_id,
  * Copy only the de-duplicated entries (selected by most-recent modified time
  * of a packfile containing the object).
  */
-static struct pack_midx_entry *get_sorted_entries(struct packed_git **p,
+static struct pack_midx_entry *get_sorted_entries(struct midxed_git *m,
+ struct packed_git **p,
  uint32_t *perm,
  uint32_t nr_packs,
  uint32_t *nr_objects)
@@ -423,8 +441,9 @@ static struct pack_midx_entry *get_sorted_entries(struct 
packed_git **p,
uint32_t nr_fanout, alloc_fanout, alloc_objects, total_objects = 0;
struct pack_midx_entry *entries_by_fanout = NULL;
struct pack_midx_entry *deduplicated_entries = NULL;
+   uint32_t start_pack = m ? m->num_packs : 0;
 
-   for (cur_pack = 0; cur_pack < nr_packs; cur_pack++) {
+   for (cur_pack = start_pack; cur_pack < nr_packs; cur_pack++) {
if (open_pack_index(p[cur_pack]))
continue;
 
@@ -445,7 +464,23 @@ static struct pack_midx_entry *get_sorted_entries(struct 
packed_git **p,
for (cur_fanout = 0; cur_fanout < 256; cur_fanout++) {
nr_fanout = 0;
 
-   for (cur_pack = 0; cur_pack < nr_packs; cur_pack++) {
+   if (m) {
+   uint32_t start = 0, end;
+
+   if (cur_fanout)
+   start = ntohl(m->chunk_oid_fanout[cur_fanout - 
1]);
+   end = ntohl(m->chunk_oid_fanout[cur_fanout]);
+
+   for (cur_object = start; cur_object < end; 
cur_object++) {
+   ALLOC_GROW(entries_by_fanout, nr_fanout + 1, 
alloc_fanout);
+   nth_midxed_pack_midx_entry(m, perm,
+  &entries_by_fanout[nr_fanout],
+  cur_object);
+   nr_fanout++;
+   }
+   }
+
+   for (cur_pack = start_pack; cur_pack < nr_packs; cur_pack++) {
uint32_t start = 0, end;
 
if (cur_fanout)
@@ -654,6 +689,7 @@ int write_midx_file(const char *object_dir)
struct pack_midx_entry *entries;
uint32_t nr_entries, num_large_offsets = 0;
int large_offsets_needed = 0;
+   struct midxed_git *m = NULL;
 
midx_name = get_midx_filename(object_dir);
if (safe_create_leading_directories(midx_name)) {
@@ -662,6 +698,8 @@ int write_midx_file(const char *object_dir)
  midx_name);
}
 
+   m = load_midxed_git(object_dir);
+
strbuf_addf(&pack_dir, "%s/pack", object_dir);
dir = opendir(pack_dir.buf);
 
@@ -676,11 +714,27 @@ int write_midx_file(const char *object_dir)
pack_dir_len = pack_dir.len;
ALLOC_ARRAY(packs, alloc_packs);
ALLOC_ARRAY(pack_names, alloc_pack_names);
+
+   if (m) {
+   for (i = 0; i < m->num_packs; i++) {
+   ALLOC_GROW(packs, nr_packs + 1, alloc_packs);
+   ALLOC_GROW(pack_names, nr_packs + 1, alloc_pack_names);
+
+   packs[nr_packs] = NULL;
+   pack_names[nr_packs] = xstrdup(m->pack_names[i]);
+   pack_name_concat_len += strlen(pack_names[nr_packs]) + 
1;
+   nr_packs++;
+   }
+   }
+
while ((de = readdir(dir)) != NULL) {
if (is_dot_or_dotdot(de->d_name))
continue;
 

[PATCH 14/23] midx: write object offsets

2018-06-07 Thread Derrick Stolee
The final pair of chunks for the multi-pack-index (MIDX) file stores the
object offsets. We default to using 32-bit offsets as in the pack-index
version 1 format, but if there exists an offset larger than 32-bits, we
use a trick similar to the pack-index version 2 format by storing all
offsets at least 2^31 in a 64-bit table; we use the 32-bit table to
point into that 64-bit table as necessary.

We only store these 64-bit offsets if necessary, so create a test that
manipulates a version 2 pack-index to fake a large offset. This allows
us to test that the large offset table is created, but the data does not
match the actual packfile offsets. The MIDX offset does match the
(corrupted) pack-index offset, so a later commit will compare these
offsets during a 'verify' step.

Signed-off-by: Derrick Stolee 
---
 Documentation/technical/pack-format.txt |  15 +++-
 builtin/midx.c  |   4 +
 midx.c  | 100 +++-
 object-store.h  |   2 +
 t/t5319-midx.sh |  45 ---
 5 files changed, 151 insertions(+), 15 deletions(-)

diff --git a/Documentation/technical/pack-format.txt 
b/Documentation/technical/pack-format.txt
index 77e88f85e4..0256cfb5e0 100644
--- a/Documentation/technical/pack-format.txt
+++ b/Documentation/technical/pack-format.txt
@@ -316,7 +316,20 @@ CHUNK DATA:
The OIDs for all objects in the MIDX are stored in lexicographic
order in this chunk.
 
-   (This section intentionally left incomplete.)
+   Object Offsets (ID: {'O', 'O', 'F', 'F'}) (N * 8 bytes)
+   Stores two 4-byte values for every object.
+   1: The pack-int-id for the pack storing this object.
+   2: The offset within the pack.
+   If all offsets are less than 2^31, then the large offset chunk
+   will not exist and offsets are stored as in IDX v1.
+   If there is at least one offset value larger than 2^32-1, then
+   the large offset chunk must exist. If the large offset chunk
+   exists and the 31st bit is on, then removing that bit reveals
+   the row in the large offsets containing the 8-byte offset of
+   this object.
+
+   [Optional] Object Large Offsets (ID: {'L', 'O', 'F', 'F'})
+   8-byte offsets into large packfiles.
 
 TRAILER:
 
diff --git a/builtin/midx.c b/builtin/midx.c
index e1fd0e0de4..607d2b3544 100644
--- a/builtin/midx.c
+++ b/builtin/midx.c
@@ -39,6 +39,10 @@ static int read_midx_file(const char *object_dir)
printf(" oid_fanout");
if (m->chunk_oid_lookup)
printf(" oid_lookup");
+   if (m->chunk_object_offsets)
+   printf(" object_offsets");
+   if (m->chunk_large_offsets)
+   printf(" large_offsets");
 
printf("\nnum_objects: %d\n", m->num_objects);
 
diff --git a/midx.c b/midx.c
index 9458ced208..a49300bf75 100644
--- a/midx.c
+++ b/midx.c
@@ -14,14 +14,19 @@
 #define MIDX_HASH_LEN 20
 #define MIDX_MIN_SIZE (MIDX_HEADER_SIZE + MIDX_HASH_LEN)
 
-#define MIDX_MAX_CHUNKS 4
+#define MIDX_MAX_CHUNKS 6
 #define MIDX_CHUNK_ALIGNMENT 4
 #define MIDX_CHUNKID_PACKLOOKUP 0x504c4f4f /* "PLOO" */
 #define MIDX_CHUNKID_PACKNAMES 0x504e414d /* "PNAM" */
 #define MIDX_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */
 #define MIDX_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */
+#define MIDX_CHUNKID_OBJECTOFFSETS 0x4f4f4646 /* "OOFF" */
+#define MIDX_CHUNKID_LARGEOFFSETS 0x4c4f4646 /* "LOFF" */
 #define MIDX_CHUNKLOOKUP_WIDTH (sizeof(uint32_t) + sizeof(uint64_t))
 #define MIDX_CHUNK_FANOUT_SIZE (sizeof(uint32_t) * 256)
+#define MIDX_CHUNK_OFFSET_WIDTH (2 * sizeof(uint32_t))
+#define MIDX_CHUNK_LARGE_OFFSET_WIDTH (sizeof(uint64_t))
+#define MIDX_LARGE_OFFSET_NEEDED 0x8000
 
 static char *get_midx_filename(const char *object_dir)
 {
@@ -106,6 +111,14 @@ struct midxed_git *load_midxed_git(const char *object_dir)
m->chunk_oid_lookup = m->data + chunk_offset;
break;
 
+   case MIDX_CHUNKID_OBJECTOFFSETS:
+   m->chunk_object_offsets = m->data + 
chunk_offset;
+   break;
+
+   case MIDX_CHUNKID_LARGEOFFSETS:
+   m->chunk_large_offsets = m->data + chunk_offset;
+   break;
+
case 0:
die("terminating MIDX chunk id appears earlier 
than expected");
break;
@@ -127,6 +140,8 @@ struct midxed_git *load_midxed_git(const char *object_dir)
die("MIDX missing required OID fanout chunk");
if (!m->chunk_oid_lookup)

[PATCH 20/23] midx: use midx in approximate_object_count

2018-06-07 Thread Derrick Stolee
Signed-off-by: Derrick Stolee 
---
 packfile.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/packfile.c b/packfile.c
index 638e113972..059b2aa097 100644
--- a/packfile.c
+++ b/packfile.c
@@ -819,11 +819,14 @@ unsigned long approximate_object_count(void)
 {
if (!the_repository->objects->approximate_object_count_valid) {
unsigned long count;
+   struct midxed_git *m;
struct packed_git *p;
 
prepare_packed_git(the_repository);
count = 0;
-   for (p = the_repository->objects->packed_git; p; p = p->next) {
+   for (m = get_midxed_git(the_repository); m; m = m->next)
+   count += m->num_objects;
+   for (p = get_packed_git(the_repository); p; p = p->next) {
if (open_pack_index(p))
continue;
count += p->num_objects;
-- 
2.18.0.rc1



[PATCH 23/23] midx: clear midx on repack

2018-06-07 Thread Derrick Stolee
If a 'git repack' command replaces existing packfiles, then we must
clear the existing multi-pack-index before moving the packfiles it
references.

Signed-off-by: Derrick Stolee 
---
 builtin/repack.c | 8 
 midx.c   | 8 
 midx.h   | 1 +
 3 files changed, 17 insertions(+)

diff --git a/builtin/repack.c b/builtin/repack.c
index 6c636e159e..66a7d8e8ea 100644
--- a/builtin/repack.c
+++ b/builtin/repack.c
@@ -8,6 +8,7 @@
 #include "strbuf.h"
 #include "string-list.h"
 #include "argv-array.h"
+#include "midx.h"
 
 static int delta_base_offset = 1;
 static int pack_kept_objects = -1;
@@ -174,6 +175,7 @@ int cmd_repack(int argc, const char **argv, const char 
*prefix)
int no_update_server_info = 0;
int quiet = 0;
int local = 0;
+   int midx_cleared = 0;
 
struct option builtin_repack_options[] = {
OPT_BIT('a', NULL, &pack_everything,
@@ -340,6 +342,12 @@ int cmd_repack(int argc, const char **argv, const char 
*prefix)
continue;
}
 
+   if (!midx_cleared) {
+   /* if we move a packfile, it will invalidated 
the midx */
+   clear_midx_file(get_object_directory());
+   midx_cleared = 1;
+   }
+
fname_old = mkpathdup("%s/old-%s%s", packdir,
item->string, exts[ext].name);
if (file_exists(fname_old))
diff --git a/midx.c b/midx.c
index e46f392fa4..1043c01fa7 100644
--- a/midx.c
+++ b/midx.c
@@ -913,3 +913,11 @@ int write_midx_file(const char *object_dir)
FREE_AND_NULL(pack_names);
return 0;
 }
+
+void clear_midx_file(const char *object_dir)
+{
+   char *midx = get_midx_filename(object_dir);
+
+   if (remove_path(midx))
+   die(_("failed to clear multi-pack-index at %s"), midx);
+}
diff --git a/midx.h b/midx.h
index 6996b5ff6b..46f9f44c94 100644
--- a/midx.h
+++ b/midx.h
@@ -18,5 +18,6 @@ int midx_contains_pack(struct midxed_git *m, const char 
*idx_name);
 int prepare_midxed_git_one(struct repository *r, const char *object_dir);
 
 int write_midx_file(const char *object_dir);
+void clear_midx_file(const char *object_dir);
 
 #endif
-- 
2.18.0.rc1



[PATCH 13/23] midx: write object id fanout chunk

2018-06-07 Thread Derrick Stolee
Signed-off-by: Derrick Stolee 
---
 Documentation/technical/pack-format.txt |  5 +++
 builtin/midx.c  |  4 +-
 midx.c  | 53 +++--
 object-store.h  |  1 +
 t/t5319-midx.sh | 18 +
 5 files changed, 69 insertions(+), 12 deletions(-)

diff --git a/Documentation/technical/pack-format.txt 
b/Documentation/technical/pack-format.txt
index de9ac778b6..77e88f85e4 100644
--- a/Documentation/technical/pack-format.txt
+++ b/Documentation/technical/pack-format.txt
@@ -307,6 +307,11 @@ CHUNK DATA:
name. This is the only chunk not guaranteed to be a multiple of four
bytes in length, so should be the last chunk for alignment reasons.
 
+   OID Fanout (ID: {'O', 'I', 'D', 'F'}) (256 * 4 bytes)
+   The ith entry, F[i], stores the number of OIDs with first
+   byte at most i. Thus F[255] stores the total
+   number of objects (N).
+
OID Lookup (ID: {'O', 'I', 'D', 'L'}) (N * H bytes)
The OIDs for all objects in the MIDX are stored in lexicographic
order in this chunk.
diff --git a/builtin/midx.c b/builtin/midx.c
index 86edd30174..e1fd0e0de4 100644
--- a/builtin/midx.c
+++ b/builtin/midx.c
@@ -35,10 +35,12 @@ static int read_midx_file(const char *object_dir)
printf(" pack_lookup");
if (m->chunk_pack_names)
printf(" pack_names");
+   if (m->chunk_oid_fanout)
+   printf(" oid_fanout");
if (m->chunk_oid_lookup)
printf(" oid_lookup");
 
-   printf("\n");
+   printf("\nnum_objects: %d\n", m->num_objects);
 
printf("packs:\n");
for (i = 0; i < m->num_packs; i++)
diff --git a/midx.c b/midx.c
index d06bc6876a..9458ced208 100644
--- a/midx.c
+++ b/midx.c
@@ -14,12 +14,14 @@
 #define MIDX_HASH_LEN 20
 #define MIDX_MIN_SIZE (MIDX_HEADER_SIZE + MIDX_HASH_LEN)
 
-#define MIDX_MAX_CHUNKS 3
+#define MIDX_MAX_CHUNKS 4
 #define MIDX_CHUNK_ALIGNMENT 4
 #define MIDX_CHUNKID_PACKLOOKUP 0x504c4f4f /* "PLOO" */
 #define MIDX_CHUNKID_PACKNAMES 0x504e414d /* "PNAM" */
+#define MIDX_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */
 #define MIDX_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */
 #define MIDX_CHUNKLOOKUP_WIDTH (sizeof(uint32_t) + sizeof(uint64_t))
+#define MIDX_CHUNK_FANOUT_SIZE (sizeof(uint32_t) * 256)
 
 static char *get_midx_filename(const char *object_dir)
 {
@@ -96,6 +98,10 @@ struct midxed_git *load_midxed_git(const char *object_dir)
m->chunk_pack_names = m->data + chunk_offset;
break;
 
+   case MIDX_CHUNKID_OIDFANOUT:
+   m->chunk_oid_fanout = (uint32_t *)(m->data + 
chunk_offset);
+   break;
+
case MIDX_CHUNKID_OIDLOOKUP:
m->chunk_oid_lookup = m->data + chunk_offset;
break;
@@ -117,9 +123,13 @@ struct midxed_git *load_midxed_git(const char *object_dir)
die("MIDX missing required pack lookup chunk");
if (!m->chunk_pack_names)
die("MIDX missing required pack-name chunk");
+   if (!m->chunk_oid_fanout)
+   die("MIDX missing required OID fanout chunk");
if (!m->chunk_oid_lookup)
die("MIDX missing required OID lookup chunk");
 
+   m->num_objects = ntohl(m->chunk_oid_fanout[255]);
+
m->pack_names = xcalloc(m->num_packs, sizeof(const char *));
for (i = 0; i < m->num_packs; i++) {
if (i) {
@@ -377,6 +387,35 @@ static size_t write_midx_pack_names(struct hashfile *f,
return written;
 }
 
+static size_t write_midx_oid_fanout(struct hashfile *f,
+   struct pack_midx_entry *objects,
+   uint32_t nr_objects)
+{
+   struct pack_midx_entry *list = objects;
+   struct pack_midx_entry *last = objects + nr_objects;
+   uint32_t count = 0;
+   uint32_t i;
+
+   /*
+   * Write the first-level table (the list is sorted,
+   * but we use a 256-entry lookup to be able to avoid
+   * having to do eight extra binary search iterations).
+   */
+   for (i = 0; i < 256; i++) {
+   struct pack_midx_entry *next = list;
+
+   while (next < last && next->oid.hash[0] == i) {
+   count++;
+   next++;
+   }
+
+   hashwrite_be32(f, count);
+   list = next;
+   }
+
+   return MIDX_CHUNK_FANOUT_SIZE;
+}
+
 static size_t write_midx_oid_lookup(struct hashfile *f, unsigned char h

[PATCH 05/23] midx: write header information to lockfile

2018-06-07 Thread Derrick Stolee
As we begin writing the multi-pack-index format to disk, start with
the basics: the 12-byte header and the 20-byte checksum footer. Start
with these basics so we can add the rest of the format in small
increments.

As we implement the format, we will use a technique to check that our
computed offsets within the multi-pack-index file match what we are
actually writing. Each method that writes to the hashfile will return
the number of bytes written, and we will track that those values match
our expectations.

Currently, write_midx_header() returns 12, but is not checked. We will
check the return value in a later commit.

Signed-off-by: Derrick Stolee 
---
 midx.c  | 53 +
 t/t5319-midx.sh |  5 +++--
 2 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/midx.c b/midx.c
index 616af66b13..3e55422a21 100644
--- a/midx.c
+++ b/midx.c
@@ -1,9 +1,62 @@
 #include "git-compat-util.h"
 #include "cache.h"
 #include "dir.h"
+#include "csum-file.h"
+#include "lockfile.h"
 #include "midx.h"
 
+#define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */
+#define MIDX_VERSION 1
+#define MIDX_HASH_VERSION 1 /* SHA-1 */
+#define MIDX_HEADER_SIZE 12
+
+static char *get_midx_filename(const char *object_dir)
+{
+   struct strbuf midx_name = STRBUF_INIT;
+   strbuf_addstr(&midx_name, object_dir);
+   strbuf_addstr(&midx_name, "/pack/multi-pack-index");
+   return strbuf_detach(&midx_name, NULL);
+}
+
+static size_t write_midx_header(struct hashfile *f,
+   unsigned char num_chunks,
+   uint32_t num_packs)
+{
+   char byte_values[4];
+   hashwrite_be32(f, MIDX_SIGNATURE);
+   byte_values[0] = MIDX_VERSION;
+   byte_values[1] = MIDX_HASH_VERSION;
+   byte_values[2] = num_chunks;
+   byte_values[3] = 0; /* unused */
+   hashwrite(f, byte_values, sizeof(byte_values));
+   hashwrite_be32(f, num_packs);
+
+   return MIDX_HEADER_SIZE;
+}
+
 int write_midx_file(const char *object_dir)
 {
+   unsigned char num_chunks = 0;
+   uint32_t num_packs = 0;
+   char *midx_name;
+   struct hashfile *f;
+   struct lock_file lk;
+
+   midx_name = get_midx_filename(object_dir);
+   if (safe_create_leading_directories(midx_name)) {
+   UNLEAK(midx_name);
+   die_errno(_("unable to create leading directories of %s"),
+ midx_name);
+   }
+
+   hold_lock_file_for_update(&lk, midx_name, LOCK_DIE_ON_ERROR);
+   f = hashfd(lk.tempfile->fd, lk.tempfile->filename.buf);
+   FREE_AND_NULL(midx_name);
+
+   write_midx_header(f, num_chunks, num_packs);
+
+   finalize_hashfile(f, NULL, CSUM_FSYNC | CSUM_HASH_IN_STREAM);
+   commit_lock_file(&lk);
+
return 0;
 }
diff --git a/t/t5319-midx.sh b/t/t5319-midx.sh
index a590137af7..80f9389837 100755
--- a/t/t5319-midx.sh
+++ b/t/t5319-midx.sh
@@ -3,8 +3,9 @@
 test_description='multi-pack-indexes'
 . ./test-lib.sh
 
-test_expect_success 'write midx with no pakcs' '
-   git midx --object-dir=. write
+test_expect_success 'write midx with no packs' '
+   git midx --object-dir=. write &&
+   test_path_is_file pack/multi-pack-index
 '
 
 test_done
-- 
2.18.0.rc1



[PATCH 07/23] midx: expand test data

2018-06-07 Thread Derrick Stolee
As we build the multi-pack-index file format, we want to test the format
on real repositories. Add tests to t5319-midx.sh that create repository
data including multiple packfiles with both version 1 and version 2
formats.

The current 'git midx write' command will always write the same file
with no "real" data. This will be expanded in future commits, along with
the test expectations.

Signed-off-by: Derrick Stolee 
---
 t/t5319-midx.sh | 101 
 1 file changed, 101 insertions(+)

diff --git a/t/t5319-midx.sh b/t/t5319-midx.sh
index e78514d8e9..2c25a69744 100755
--- a/t/t5319-midx.sh
+++ b/t/t5319-midx.sh
@@ -14,8 +14,109 @@ midx_read_expect() {
 
 test_expect_success 'write midx with no packs' '
git midx --object-dir=. write &&
+   test_when_finished rm pack/multi-pack-index &&
test_path_is_file pack/multi-pack-index &&
midx_read_expect
 '
 
+test_expect_success 'create objects' '
+   for i in `test_seq 1 5`
+   do
+   iii=$(printf '%03i' $i)
+   test-tool genrandom "bar" 200 > wide_delta_$iii &&
+   test-tool genrandom "baz $iii" 50 >> wide_delta_$iii &&
+   test-tool genrandom "foo"$i 100 > deep_delta_$iii &&
+   test-tool genrandom "foo"$(expr $i + 1) 100 >> deep_delta_$iii 
&&
+   test-tool genrandom "foo"$(expr $i + 2) 100 >> deep_delta_$iii 
&&
+   echo $iii >file_$iii &&
+   test-tool genrandom "$iii" 8192 >>file_$iii &&
+   git update-index --add file_$iii deep_delta_$iii 
wide_delta_$iii &&
+   i=$(expr $i + 1) || return 1
+   done &&
+   { echo 101 && test-tool genrandom 100 8192; } >file_101 &&
+   git update-index --add file_101 &&
+   tree=$(git write-tree) &&
+   commit=$(git commit-tree $tree obj-list &&
+   git update-ref HEAD $commit
+'
+
+test_expect_success 'write midx with one v1 pack' '
+   pack=$(git pack-objects --index-version=1 pack/test  wide_delta_$iii &&
+   test-tool genrandom "baz $iii" 50 >> wide_delta_$iii &&
+   test-tool genrandom "foo"$i 100 > deep_delta_$iii &&
+   test-tool genrandom "foo"$(expr $i + 1) 100 >> deep_delta_$iii 
&&
+   test-tool genrandom "foo"$(expr $i + 2) 100 >> deep_delta_$iii 
&&
+   echo $iii >file_$iii &&
+   test-tool genrandom "$iii" 8192 >>file_$iii &&
+   git update-index --add file_$iii deep_delta_$iii 
wide_delta_$iii &&
+   i=$(expr $i + 1) || return 1
+   done &&
+   { echo 101 && test-tool genrandom 100 8192; } >file_101 &&
+   git update-index --add file_101 &&
+   tree=$(git write-tree) &&
+   commit=$(git commit-tree $tree -p HEADobj-list2 &&
+   git update-ref HEAD $commit
+'
+
+test_expect_success 'write midx with two packs' '
+   pack1=$(git pack-objects --index-version=1 pack/test-1  wide_delta_$iii &&
+   test-tool genrandom "baz $iii" 50 >> wide_delta_$iii &&
+   test-tool genrandom "foo"$i 100 > deep_delta_$iii &&
+   test-tool genrandom "foo"$(expr $i + 1) 100 >> deep_delta_$iii 
&&
+   test-tool genrandom "foo"$(expr $i + 2) 100 >> deep_delta_$iii 
&&
+   echo $iii >file_$iii &&
+   test-tool genrandom "$iii" 8192 >>file_$iii &&
+   git update-index --add file_$iii deep_delta_$iii 
wide_delta_$iii &&
+   { echo 101 && test-tool genrandom 100 8192; } >file_101 &&
+   git update-index --add file_101 &&
+   tree=$(git write-tree) &&
+   commit=$(git commit-tree $tree -p HEADobj-list &&
+   git update-ref HEAD $commit &&
+   git pack-objects --index-version=2 test-pack 

[PATCH 06/23] midx: struct midxed_git and 'read' subcommand

2018-06-07 Thread Derrick Stolee
As we build the multi-pack-index feature by adding chunks at a time,
we want to test that the data is being written correctly.

Create struct midxed_git to store an in-memory representation of a
multi-pack-index and a memory-map of the binary file. Initialize this
struct in load_midxed_git(object_dir).

Create the 'git midx read' subcommand to output basic information about
the multi-pack-index file. This will be expanded as more information is
written to the file.

Signed-off-by: Derrick Stolee 
---
 Documentation/git-midx.txt | 11 +++
 builtin/midx.c | 23 +-
 midx.c | 65 ++
 midx.h |  9 ++
 object-store.h | 19 +++
 t/t5319-midx.sh| 12 ++-
 6 files changed, 137 insertions(+), 2 deletions(-)

diff --git a/Documentation/git-midx.txt b/Documentation/git-midx.txt
index dcaeb1a91b..919283fdd8 100644
--- a/Documentation/git-midx.txt
+++ b/Documentation/git-midx.txt
@@ -23,6 +23,11 @@ OPTIONS
<dir>/packs/multi-pack-index for the current MIDX file, and
<dir>/packs for the pack-files to index.
 
+read::
+   When given as the verb, read the current MIDX file and output
+   basic information about its contents. Used for debugging
+   purposes only.
+
 write::
When given as the verb, write a new MIDX file to
<dir>/packs/multi-pack-index.
@@ -43,6 +48,12 @@ $ git midx write
 $ git midx --object-dir <alt> write
 ---
 
+* Read the MIDX file in the .git/objects folder.
++
+---
+$ git midx read
+---
+
 
 GIT
 ---
diff --git a/builtin/midx.c b/builtin/midx.c
index dc0a5acd3f..c7002f664a 100644
--- a/builtin/midx.c
+++ b/builtin/midx.c
@@ -6,7 +6,7 @@
 #include "midx.h"
 
 static char const * const builtin_midx_usage[] ={
-   N_("git midx [--object-dir <dir>] [write]"),
+   N_("git midx [--object-dir <dir>] [read|write]"),
NULL
 };
 
@@ -14,6 +14,25 @@ static struct opts_midx {
const char *object_dir;
 } opts;
 
+static int read_midx_file(const char *object_dir)
+{
+   struct midxed_git *m = load_midxed_git(object_dir);
+
+   if (!m)
+   return 0;
+
+   printf("header: %08x %d %d %d %d\n",
+  m->signature,
+  m->version,
+  m->hash_version,
+  m->num_chunks,
+  m->num_packs);
+
+   printf("object_dir: %s\n", m->object_dir);
+
+   return 0;
+}
+
 int cmd_midx(int argc, const char **argv, const char *prefix)
 {
static struct option builtin_midx_options[] = {
@@ -38,6 +57,8 @@ int cmd_midx(int argc, const char **argv, const char *prefix)
if (argc == 0)
return 0;
 
+   if (!strcmp(argv[0], "read"))
+   return read_midx_file(opts.object_dir);
if (!strcmp(argv[0], "write"))
return write_midx_file(opts.object_dir);
 
diff --git a/midx.c b/midx.c
index 3e55422a21..fa18770f1d 100644
--- a/midx.c
+++ b/midx.c
@@ -3,12 +3,15 @@
 #include "dir.h"
 #include "csum-file.h"
 #include "lockfile.h"
+#include "object-store.h"
 #include "midx.h"
 
 #define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */
 #define MIDX_VERSION 1
 #define MIDX_HASH_VERSION 1 /* SHA-1 */
 #define MIDX_HEADER_SIZE 12
+#define MIDX_HASH_LEN 20
+#define MIDX_MIN_SIZE (MIDX_HEADER_SIZE + MIDX_HASH_LEN)
 
 static char *get_midx_filename(const char *object_dir)
 {
@@ -18,6 +21,68 @@ static char *get_midx_filename(const char *object_dir)
return strbuf_detach(&midx_name, NULL);
 }
 
+struct midxed_git *load_midxed_git(const char *object_dir)
+{
+   struct midxed_git *m;
+   int fd;
+   struct stat st;
+   size_t midx_size;
+   void *midx_map;
+   const char *midx_name = get_midx_filename(object_dir);
+
+   fd = git_open(midx_name);
+   if (fd < 0)
+   return NULL;
+   if (fstat(fd, &st)) {
+   close(fd);
+   return NULL;
+   }
+   midx_size = xsize_t(st.st_size);
+
+   if (midx_size < MIDX_MIN_SIZE) {
+   close(fd);
+   die("multi-pack-index file %s is too small", midx_name);
+   }
+
+   midx_map = xmmap(NULL, midx_size, PROT_READ, MAP_PRIVATE, fd, 0);
+
+   m = xcalloc(1, sizeof(*m) + strlen(object_dir) + 1);
+   strcpy(m->object_dir, object_dir);
+   m->data = midx_map;
+
+   m->signature = get_be32(m->data);
+   if (m->signature != MIDX_SIGNATURE) {
+   error("multi-pack-index signature %X does not match signature 
%X",
+ m->signature, MIDX_SIGNATURE);
+   goto cleanup_fail;
+   }
+
+   m->version = *(m->data + 4);
+   if (m->versi

[PATCH 08/23] midx: read packfiles from pack directory

2018-06-07 Thread Derrick Stolee
When constructing a multi-pack-index file for a given object directory,
read the files within the enclosed pack directory and find matches that
end with ".idx" and find the correct paired packfile using
add_packed_git().

Signed-off-by: Derrick Stolee 
---
 midx.c  | 51 +++--
 t/t5319-midx.sh | 15 ---
 2 files changed, 57 insertions(+), 9 deletions(-)

diff --git a/midx.c b/midx.c
index fa18770f1d..9fb89c80a2 100644
--- a/midx.c
+++ b/midx.c
@@ -102,10 +102,15 @@ static size_t write_midx_header(struct hashfile *f,
 int write_midx_file(const char *object_dir)
 {
unsigned char num_chunks = 0;
-   uint32_t num_packs = 0;
char *midx_name;
struct hashfile *f;
struct lock_file lk;
+   struct packed_git **packs = NULL;
+   uint32_t i, nr_packs = 0, alloc_packs = 0;
+   DIR *dir;
+   struct dirent *de;
+   struct strbuf pack_dir = STRBUF_INIT;
+   size_t pack_dir_len;
 
midx_name = get_midx_filename(object_dir);
if (safe_create_leading_directories(midx_name)) {
@@ -114,14 +119,56 @@ int write_midx_file(const char *object_dir)
  midx_name);
}
 
+   strbuf_addf(&pack_dir, "%s/pack", object_dir);
+   dir = opendir(pack_dir.buf);
+
+   if (!dir) {
+   error_errno("unable to open pack directory: %s",
+   pack_dir.buf);
+   strbuf_release(&pack_dir);
+   return 1;
+   }
+
+   strbuf_addch(&pack_dir, '/');
+   pack_dir_len = pack_dir.len;
+   ALLOC_ARRAY(packs, alloc_packs);
+   while ((de = readdir(dir)) != NULL) {
+   if (is_dot_or_dotdot(de->d_name))
+   continue;
+
+   if (ends_with(de->d_name, ".idx")) {
+   ALLOC_GROW(packs, nr_packs + 1, alloc_packs);
+
+   strbuf_setlen(&pack_dir, pack_dir_len);
+   strbuf_addstr(&pack_dir, de->d_name);
+
+   packs[nr_packs] = add_packed_git(pack_dir.buf,
+pack_dir.len,
+0);
+   if (!packs[nr_packs])
+   warning("failed to add packfile '%s'",
+   pack_dir.buf);
+   else
+   nr_packs++;
+   }
+   }
+   closedir(dir);
+   strbuf_release(&pack_dir);
+
hold_lock_file_for_update(&lk, midx_name, LOCK_DIE_ON_ERROR);
f = hashfd(lk.tempfile->fd, lk.tempfile->filename.buf);
FREE_AND_NULL(midx_name);
 
-   write_midx_header(f, num_chunks, num_packs);
+   write_midx_header(f, num_chunks, nr_packs);
 
finalize_hashfile(f, NULL, CSUM_FSYNC | CSUM_HASH_IN_STREAM);
commit_lock_file(&lk);
 
+   for (i = 0; i < nr_packs; i++) {
+   close_pack(packs[i]);
+   FREE_AND_NULL(packs[i]);
+   }
+
+   FREE_AND_NULL(packs);
return 0;
 }
diff --git a/t/t5319-midx.sh b/t/t5319-midx.sh
index 2c25a69744..abe545c7c4 100755
--- a/t/t5319-midx.sh
+++ b/t/t5319-midx.sh
@@ -4,8 +4,9 @@ test_description='multi-pack-indexes'
 . ./test-lib.sh
 
 midx_read_expect() {
+   NUM_PACKS=$1
cat >expect <<- EOF
-   header: 4d494458 1 1 0 0
+   header: 4d494458 1 1 0 $NUM_PACKS
object_dir: .
EOF
git midx read --object-dir=. >actual &&
@@ -16,7 +17,7 @@ test_expect_success 'write midx with no packs' '
git midx --object-dir=. write &&
test_when_finished rm pack/multi-pack-index &&
test_path_is_file pack/multi-pack-index &&
-   midx_read_expect
+   midx_read_expect 0
 '
 
 test_expect_success 'create objects' '
@@ -47,14 +48,14 @@ test_expect_success 'write midx with one v1 pack' '
pack=$(git pack-objects --index-version=1 pack/test obj-list &&
git update-ref HEAD $commit &&
-   git pack-objects --index-version=2 test-pack 

[PATCH 17/23] midx: read objects from multi-pack-index

2018-06-07 Thread Derrick Stolee
Signed-off-by: Derrick Stolee 
---
 midx.c | 96 --
 midx.h |  2 ++
 object-store.h |  1 +
 packfile.c |  8 -
 4 files changed, 104 insertions(+), 3 deletions(-)

diff --git a/midx.c b/midx.c
index 5e9290ca8f..6eca8f1b12 100644
--- a/midx.c
+++ b/midx.c
@@ -3,6 +3,7 @@
 #include "dir.h"
 #include "csum-file.h"
 #include "lockfile.h"
+#include "sha1-lookup.h"
 #include "object-store.h"
 #include "packfile.h"
 #include "midx.h"
@@ -64,7 +65,7 @@ struct midxed_git *load_midxed_git(const char *object_dir)
 
m = xcalloc(1, sizeof(*m) + strlen(object_dir) + 1);
strcpy(m->object_dir, object_dir);
-   m->data = midx_map;
+   m->data = (const unsigned char*)midx_map;
 
m->signature = get_be32(m->data);
if (m->signature != MIDX_SIGNATURE) {
@@ -145,7 +146,9 @@ struct midxed_git *load_midxed_git(const char *object_dir)
 
m->num_objects = ntohl(m->chunk_oid_fanout[255]);
 
-   m->pack_names = xcalloc(m->num_packs, sizeof(const char *));
+   m->packs = xcalloc(m->num_packs, sizeof(*m->packs));
+
+   ALLOC_ARRAY(m->pack_names, m->num_packs);
for (i = 0; i < m->num_packs; i++) {
if (i) {
if (ntohl(m->chunk_pack_lookup[i]) <= 
ntohl(m->chunk_pack_lookup[i - 1])) {
@@ -175,6 +178,95 @@ struct midxed_git *load_midxed_git(const char *object_dir)
exit(1);
 }
 
+static int prepare_midx_pack(struct midxed_git *m, uint32_t pack_int_id)
+{
+   struct strbuf pack_name = STRBUF_INIT;
+
+   if (pack_int_id >= m->num_packs)
+   BUG("bad pack-int-id");
+
+   if (m->packs[pack_int_id])
+   return 0;
+
+   strbuf_addstr(&pack_name, m->object_dir);
+   strbuf_addstr(&pack_name, "/pack/");
+   strbuf_addstr(&pack_name, m->pack_names[pack_int_id]);
+
+   m->packs[pack_int_id] = add_packed_git(pack_name.buf, pack_name.len, 1);
+   strbuf_release(&pack_name);
+   return !m->packs[pack_int_id];
+}
+
+int bsearch_midx(const struct object_id *oid, struct midxed_git *m, uint32_t 
*result)
+{
+   return bsearch_hash(oid->hash, m->chunk_oid_fanout, m->chunk_oid_lookup,
+   MIDX_HASH_LEN, result);
+}
+
+static off_t nth_midxed_offset(struct midxed_git *m, uint32_t pos)
+{
+   const unsigned char *offset_data;
+   uint32_t offset32;
+
+   offset_data = m->chunk_object_offsets + pos * MIDX_CHUNK_OFFSET_WIDTH;
+   offset32 = get_be32(offset_data + sizeof(uint32_t));
+
+   if (m->chunk_large_offsets && offset32 & MIDX_LARGE_OFFSET_NEEDED) {
+   if (sizeof(offset32) < sizeof(uint64_t))
+   die(_("multi-pack-index stores a 64-bit offset, but 
off_t is too small"));
+
+   offset32 ^= MIDX_LARGE_OFFSET_NEEDED;
+   return get_be64(m->chunk_large_offsets + sizeof(uint64_t) * 
offset32);
+   }
+
+   return offset32;
+}
+
+static uint32_t nth_midxed_pack_int_id(struct midxed_git *m, uint32_t pos)
+{
+   return get_be32(m->chunk_object_offsets + pos * 
MIDX_CHUNK_OFFSET_WIDTH);
+}
+
+static int nth_midxed_pack_entry(struct midxed_git *m, struct pack_entry *e, 
uint32_t pos)
+{
+   uint32_t pack_int_id;
+   struct packed_git *p;
+
+   if (pos >= m->num_objects)
+   return 0;
+
+   pack_int_id = nth_midxed_pack_int_id(m, pos);
+
+   if (prepare_midx_pack(m, pack_int_id))
+   die(_("error preparing packfile from multi-pack-index"));
+   p = m->packs[pack_int_id];
+
+   /*
+   * We are about to tell the caller where they can locate the
+   * requested object.  We better make sure the packfile is
+   * still here and can be accessed before supplying that
+   * answer, as it may have been deleted since the MIDX was
+   * loaded!
+   */
+   if (!is_pack_valid(p))
+   return 0;
+
+   e->offset = nth_midxed_offset(m, pos);
+   e->p = p;
+
+   return 1;
+}
+
+int fill_midx_entry(const struct object_id *oid, struct pack_entry *e, struct 
midxed_git *m)
+{
+   uint32_t pos;
+
+   if (!bsearch_midx(oid, m, &pos))
+   return 0;
+
+   return nth_midxed_pack_entry(m, e, pos);
+}
+
 int prepare_midxed_git_one(struct repository *r, const char *object_dir)
 {
struct midxed_git *m = r->objects->midxed_git;
diff --git a/midx.h b/midx.h
index 793203fc4a..0c66812229 100644
--- a/midx.h
+++ b/midx.h
@@ -8,6 +8,8 @@
 #include "repository.h"
 
 struct midxed_git *load_midxed_git(const char *object_dir);
+int bsearch_midx(const struct object_id *oid, struct midxed_git *m, uint32_t 
*result);
+int fill_midx_entry(const struct object_id *oid, struct pack_entry *e, str

[PATCH 02/23] midx: add midx format details to pack-format.txt

2018-06-07 Thread Derrick Stolee
The multi-pack-index (MIDX) feature generalizes the existing pack-
index (IDX) feature by indexing objects across multiple pack-files.

Describe the basic file format, using a 12-byte header followed by
a lookup table for a list of "chunks" which will be described later.
The file ends with a footer containing a checksum using the hash
algorithm.

The header allows later versions to create breaking changes by
advancing the version number. We can also change the hash algorithm
using a different version value.

We will add the individual chunk format information as we introduce
the code that writes that information.

Signed-off-by: Derrick Stolee 
---
 Documentation/technical/pack-format.txt | 49 +
 1 file changed, 49 insertions(+)

diff --git a/Documentation/technical/pack-format.txt 
b/Documentation/technical/pack-format.txt
index 70a99fd142..17666b4bfc 100644
--- a/Documentation/technical/pack-format.txt
+++ b/Documentation/technical/pack-format.txt
@@ -252,3 +252,52 @@ Pack file entry: <+
 corresponding packfile.
 
 20-byte SHA-1-checksum of all of the above.
+
+== midx-*.midx files have the following format:
+
+The meta-index files refer to multiple pack-files and loose objects.
+
+In order to allow extensions that add extra data to the MIDX, we organize
+the body into "chunks" and provide a lookup table at the beginning of the
+body. The header includes certain length values, such as the number of packs,
+the number of base MIDX files, hash lengths and types.
+
+All 4-byte numbers are in network order.
+
+HEADER:
+
+   4-byte signature:
+   The signature is: {'M', 'I', 'D', 'X'}
+
+   1-byte version number:
+   Git only writes or recognizes version 1
+
+   1-byte Object Id Version
+   Git only writes or recognizes version 1 (SHA-1)
+
+   1-byte number (C) of "chunks"
+
+   1-byte number (I) of base multi-pack-index files:
+   This value is currently always zero.
+
+   4-byte number (P) of pack files
+
+CHUNK LOOKUP:
+
+   (C + 1) * 12 bytes providing the chunk offsets:
+   First 4 bytes describe chunk id. Value 0 is a terminating label.
+   Other 8 bytes provide offset in current file for chunk to start.
+   (Chunks are provided in file-order, so you can infer the length
+   using the next chunk position if necessary.)
+
+   The remaining data in the body is described one chunk at a time, and
+   these chunks may be given in any order. Chunks are required unless
+   otherwise specified.
+
+CHUNK DATA:
+
+   (This section intentionally left incomplete.)
+
+TRAILER:
+
+   H-byte HASH-checksum of all of the above.
-- 
2.18.0.rc1



[PATCH 10/23] midx: write a lookup into the pack names chunk

2018-06-07 Thread Derrick Stolee
Signed-off-by: Derrick Stolee 
---
 Documentation/technical/pack-format.txt |  5 +++
 builtin/midx.c  |  7 
 midx.c  | 56 +++--
 object-store.h  |  2 +
 t/t5319-midx.sh | 11 +++--
 5 files changed, 75 insertions(+), 6 deletions(-)

diff --git a/Documentation/technical/pack-format.txt 
b/Documentation/technical/pack-format.txt
index 2b37be7b33..29bf87283a 100644
--- a/Documentation/technical/pack-format.txt
+++ b/Documentation/technical/pack-format.txt
@@ -296,6 +296,11 @@ CHUNK LOOKUP:
 
 CHUNK DATA:
 
+   Packfile Name Lookup (ID: {'P', 'L', 'O', 'O'}) (P * 4 bytes)
+   P * 4 bytes storing the offset in the packfile name chunk for
+   the null-terminated string containing the filename for the
+   ith packfile.
+
Packfile Names (ID: {'P', 'N', 'A', 'M'})
Stores the packfile names as concatenated, null-terminated strings.
Packfiles must be listed in lexicographic order for fast lookups by
diff --git a/builtin/midx.c b/builtin/midx.c
index fe56560853..3a261e9bbf 100644
--- a/builtin/midx.c
+++ b/builtin/midx.c
@@ -16,6 +16,7 @@ static struct opts_midx {
 
 static int read_midx_file(const char *object_dir)
 {
+   uint32_t i;
struct midxed_git *m = load_midxed_git(object_dir);
 
if (!m)
@@ -30,11 +31,17 @@ static int read_midx_file(const char *object_dir)
 
printf("chunks:");
 
+   if (m->chunk_pack_lookup)
+   printf(" pack_lookup");
if (m->chunk_pack_names)
printf(" pack_names");
 
printf("\n");
 
+   printf("packs:\n");
+   for (i = 0; i < m->num_packs; i++)
+   printf("%s\n", m->pack_names[i]);
+
printf("object_dir: %s\n", m->object_dir);
 
return 0;
diff --git a/midx.c b/midx.c
index d4f4a01a51..923acda72e 100644
--- a/midx.c
+++ b/midx.c
@@ -13,8 +13,9 @@
 #define MIDX_HASH_LEN 20
 #define MIDX_MIN_SIZE (MIDX_HEADER_SIZE + MIDX_HASH_LEN)
 
-#define MIDX_MAX_CHUNKS 1
+#define MIDX_MAX_CHUNKS 2
 #define MIDX_CHUNK_ALIGNMENT 4
+#define MIDX_CHUNKID_PACKLOOKUP 0x504c4f4f /* "PLOO" */
 #define MIDX_CHUNKID_PACKNAMES 0x504e414d /* "PNAM" */
 #define MIDX_CHUNKLOOKUP_WIDTH (sizeof(uint32_t) + sizeof(uint64_t))
 
@@ -85,6 +86,10 @@ struct midxed_git *load_midxed_git(const char *object_dir)
uint64_t chunk_offset = get_be64(m->data + 16 + 
MIDX_CHUNKLOOKUP_WIDTH * i);
 
switch (chunk_id) {
+   case MIDX_CHUNKID_PACKLOOKUP:
+   m->chunk_pack_lookup = (uint32_t *)(m->data + 
chunk_offset);
+   break;
+
case MIDX_CHUNKID_PACKNAMES:
m->chunk_pack_names = m->data + chunk_offset;
break;
@@ -102,9 +107,32 @@ struct midxed_git *load_midxed_git(const char *object_dir)
}
}
 
+   if (!m->chunk_pack_lookup)
+   die("MIDX missing required pack lookup chunk");
if (!m->chunk_pack_names)
die("MIDX missing required pack-name chunk");
 
+   m->pack_names = xcalloc(m->num_packs, sizeof(const char *));
+   for (i = 0; i < m->num_packs; i++) {
+   if (i) {
+   if (ntohl(m->chunk_pack_lookup[i]) <= 
ntohl(m->chunk_pack_lookup[i - 1])) {
+   error("MIDX pack lookup value %d before %d",
+ ntohl(m->chunk_pack_lookup[i - 1]),
+ ntohl(m->chunk_pack_lookup[i]));
+   goto cleanup_fail;
+   }
+   }
+
+   m->pack_names[i] = (const char *)(m->chunk_pack_names + 
ntohl(m->chunk_pack_lookup[i]));
+
+   if (i && strcmp(m->pack_names[i], m->pack_names[i - 1]) <= 0) {
+   error("MIDX pack names out of order: '%s' before '%s'",
+ m->pack_names[i - 1],
+ m->pack_names[i]);
+   goto cleanup_fail;
+   }
+   }
+
return m;
 
 cleanup_fail:
@@ -162,6 +190,20 @@ static void sort_packs_by_name(char **pack_names, uint32_t 
nr_packs, uint32_t *p
}
 }
 
+static size_t write_midx_pack_lookup(struct hashfile *f,
+char **pack_names,
+uint32_t nr_packs)
+{
+   uint32_t i, cur_len = 0;
+
+   for (i = 0; i < nr_packs; i++) {
+   hashwrite_be32(f, cur_len);
+   cur_len += strlen(pack_names[i]) + 1;
+   }
+
+   return sizeof(uint32_t) * (

[PATCH 04/23] midx: add 'write' subcommand and basic wiring

2018-06-07 Thread Derrick Stolee
In anticipation of writing multi-pack-indexes (MIDX files), add a
'git midx write' subcommand and send the options to a write_midx_file()
method. Also create a basic test file that tests the 'write' subcommand.

Signed-off-by: Derrick Stolee 
---
 Documentation/git-midx.txt | 22 +-
 Makefile   |  1 +
 builtin/midx.c |  9 -
 midx.c |  9 +
 midx.h |  4 
 t/t5319-midx.sh| 10 ++
 6 files changed, 53 insertions(+), 2 deletions(-)
 create mode 100644 midx.c
 create mode 100644 midx.h
 create mode 100755 t/t5319-midx.sh

diff --git a/Documentation/git-midx.txt b/Documentation/git-midx.txt
index 2bd886f1a2..dcaeb1a91b 100644
--- a/Documentation/git-midx.txt
+++ b/Documentation/git-midx.txt
@@ -9,7 +9,7 @@ git-midx - Write and verify multi-pack-indexes (MIDX files).
 SYNOPSIS
 
 [verse]
-'git midx' [--object-dir <dir>]
+'git midx' [--object-dir <dir>] <verb>
 
 DESCRIPTION
 ---
@@ -23,6 +23,26 @@ OPTIONS
<dir>/packs/multi-pack-index for the current MIDX file, and
<dir>/packs for the pack-files to index.
 
+write::
+   When given as the verb, write a new MIDX file to
+   <dir>/packs/multi-pack-index.
+
+
+EXAMPLES
+
+
+* Write a MIDX file for the packfiles in the current .git folder.
++
+---
+$ git midx write
+---
+
+* Write a MIDX file for the packfiles in an alternate.
++
+---
+$ git midx --object-dir <alt> write
+---
+
 
 GIT
 ---
diff --git a/Makefile b/Makefile
index 88958c7b42..aa86fcd8ec 100644
--- a/Makefile
+++ b/Makefile
@@ -890,6 +890,7 @@ LIB_OBJS += merge.o
 LIB_OBJS += merge-blobs.o
 LIB_OBJS += merge-recursive.o
 LIB_OBJS += mergesort.o
+LIB_OBJS += midx.o
 LIB_OBJS += name-hash.o
 LIB_OBJS += notes.o
 LIB_OBJS += notes-cache.o
diff --git a/builtin/midx.c b/builtin/midx.c
index 59ea92178f..dc0a5acd3f 100644
--- a/builtin/midx.c
+++ b/builtin/midx.c
@@ -3,9 +3,10 @@
 #include "config.h"
 #include "git-compat-util.h"
 #include "parse-options.h"
+#include "midx.h"
 
 static char const * const builtin_midx_usage[] ={
-   N_("git midx [--object-dir <dir>]"),
+   N_("git midx [--object-dir <dir>] [write]"),
NULL
 };
 
@@ -34,5 +35,11 @@ int cmd_midx(int argc, const char **argv, const char *prefix)
if (!opts.object_dir)
opts.object_dir = get_object_directory();
 
+   if (argc == 0)
+   return 0;
+
+   if (!strcmp(argv[0], "write"))
+   return write_midx_file(opts.object_dir);
+
return 0;
 }
diff --git a/midx.c b/midx.c
new file mode 100644
index 00..616af66b13
--- /dev/null
+++ b/midx.c
@@ -0,0 +1,9 @@
+#include "git-compat-util.h"
+#include "cache.h"
+#include "dir.h"
+#include "midx.h"
+
+int write_midx_file(const char *object_dir)
+{
+   return 0;
+}
diff --git a/midx.h b/midx.h
new file mode 100644
index 00..3a63673952
--- /dev/null
+++ b/midx.h
@@ -0,0 +1,4 @@
+#include "cache.h"
+#include "packfile.h"
+
+int write_midx_file(const char *object_dir);
diff --git a/t/t5319-midx.sh b/t/t5319-midx.sh
new file mode 100755
index 00..a590137af7
--- /dev/null
+++ b/t/t5319-midx.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+
+test_description='multi-pack-indexes'
+. ./test-lib.sh
+
+test_expect_success 'write midx with no pakcs' '
+   git midx --object-dir=. write
+'
+
+test_done
-- 
2.18.0.rc1



[PATCH 12/23] midx: write object ids in a chunk

2018-06-07 Thread Derrick Stolee
Signed-off-by: Derrick Stolee 
---
 Documentation/technical/pack-format.txt |  4 ++
 builtin/midx.c  |  2 +
 midx.c  | 50 +++--
 object-store.h  |  1 +
 t/t5319-midx.sh |  4 +-
 5 files changed, 55 insertions(+), 6 deletions(-)

diff --git a/Documentation/technical/pack-format.txt 
b/Documentation/technical/pack-format.txt
index 29bf87283a..de9ac778b6 100644
--- a/Documentation/technical/pack-format.txt
+++ b/Documentation/technical/pack-format.txt
@@ -307,6 +307,10 @@ CHUNK DATA:
name. This is the only chunk not guaranteed to be a multiple of four
bytes in length, so should be the last chunk for alignment reasons.
 
+   OID Lookup (ID: {'O', 'I', 'D', 'L'}) (N * H bytes)
+   The OIDs for all objects in the MIDX are stored in lexicographic
+   order in this chunk.
+
(This section intentionally left incomplete.)
 
 TRAILER:
diff --git a/builtin/midx.c b/builtin/midx.c
index 3a261e9bbf..86edd30174 100644
--- a/builtin/midx.c
+++ b/builtin/midx.c
@@ -35,6 +35,8 @@ static int read_midx_file(const char *object_dir)
printf(" pack_lookup");
if (m->chunk_pack_names)
printf(" pack_names");
+   if (m->chunk_oid_lookup)
+   printf(" oid_lookup");
 
printf("\n");
 
diff --git a/midx.c b/midx.c
index b20d52713c..d06bc6876a 100644
--- a/midx.c
+++ b/midx.c
@@ -14,10 +14,11 @@
 #define MIDX_HASH_LEN 20
 #define MIDX_MIN_SIZE (MIDX_HEADER_SIZE + MIDX_HASH_LEN)
 
-#define MIDX_MAX_CHUNKS 2
+#define MIDX_MAX_CHUNKS 3
 #define MIDX_CHUNK_ALIGNMENT 4
 #define MIDX_CHUNKID_PACKLOOKUP 0x504c4f4f /* "PLOO" */
 #define MIDX_CHUNKID_PACKNAMES 0x504e414d /* "PNAM" */
+#define MIDX_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */
 #define MIDX_CHUNKLOOKUP_WIDTH (sizeof(uint32_t) + sizeof(uint64_t))
 
 static char *get_midx_filename(const char *object_dir)
@@ -95,6 +96,10 @@ struct midxed_git *load_midxed_git(const char *object_dir)
m->chunk_pack_names = m->data + chunk_offset;
break;
 
+   case MIDX_CHUNKID_OIDLOOKUP:
+   m->chunk_oid_lookup = m->data + chunk_offset;
+   break;
+
case 0:
die("terminating MIDX chunk id appears earlier 
than expected");
break;
@@ -112,6 +117,8 @@ struct midxed_git *load_midxed_git(const char *object_dir)
die("MIDX missing required pack lookup chunk");
if (!m->chunk_pack_names)
die("MIDX missing required pack-name chunk");
+   if (!m->chunk_oid_lookup)
+   die("MIDX missing required OID lookup chunk");
 
m->pack_names = xcalloc(m->num_packs, sizeof(const char *));
for (i = 0; i < m->num_packs; i++) {
@@ -370,6 +377,32 @@ static size_t write_midx_pack_names(struct hashfile *f,
return written;
 }
 
+static size_t write_midx_oid_lookup(struct hashfile *f, unsigned char hash_len,
+   struct pack_midx_entry *objects,
+   uint32_t nr_objects)
+{
+   struct pack_midx_entry *list = objects;
+   uint32_t i;
+   size_t written = 0;
+
+   for (i = 0; i < nr_objects; i++) {
+   struct pack_midx_entry *obj = list++;
+
+   if (i < nr_objects - 1) {
+   struct pack_midx_entry *next = list;
+   if (oidcmp(&obj->oid, &next->oid) >= 0)
+   BUG("OIDs not in order: %s >= %s",
+   oid_to_hex(&obj->oid),
+   oid_to_hex(&next->oid));
+   }
+
+   hashwrite(f, obj->oid.hash, (int)hash_len);
+   written += hash_len;
+   }
+
+   return written;
+}
+
 int write_midx_file(const char *object_dir)
 {
unsigned char cur_chunk, num_chunks = 0;
@@ -389,6 +422,7 @@ int write_midx_file(const char *object_dir)
uint64_t written = 0;
uint32_t chunk_ids[MIDX_MAX_CHUNKS + 1];
uint64_t chunk_offsets[MIDX_MAX_CHUNKS + 1];
+   struct pack_midx_entry *entries;
uint32_t nr_entries;
 
midx_name = get_midx_filename(object_dir);
@@ -448,14 +482,14 @@ int write_midx_file(const char *object_dir)
ALLOC_ARRAY(pack_perm, nr_packs);
sort_packs_by_name(pack_names, nr_packs, pack_perm);
 
-   get_sorted_entries(packs, pack_perm, nr_packs, &nr_entries);
+   entries = get_sorted_entries(packs, pack_perm, nr_packs, &nr_entries);
 
hold_lock_file_for_update(&lk, midx_name, LOCK_DIE_ON_ERROR);
f = hashfd(lk.tempfile->fd, 

[PATCH 00/23] Multi-pack-index (MIDX)

2018-06-07 Thread Derrick Stolee
226427-1-dsto...@microsoft.com/
    A patch series on abbreviation speedups


Derrick Stolee (23):
  midx: add design document
  midx: add midx format details to pack-format.txt
  midx: add midx builtin
  midx: add 'write' subcommand and basic wiring
  midx: write header information to lockfile
  midx: struct midxed_git and 'read' subcommand
  midx: expand test data
  midx: read packfiles from pack directory
  midx: write pack names in chunk
  midx: write a lookup into the pack names chunk
  midx: sort and deduplicate objects from packfiles
  midx: write object ids in a chunk
  midx: write object id fanout chunk
  midx: write object offsets
  midx: create core.midx config setting
  midx: prepare midxed_git struct
  midx: read objects from multi-pack-index
  midx: use midx in abbreviation calculations
  midx: use existing midx when writing new one
  midx: use midx in approximate_object_count
  midx: prevent duplicate packfile loads
  midx: use midx to find ref-deltas
  midx: clear midx on repack

 .gitignore  |   1 +
 Documentation/config.txt|   4 +
 Documentation/git-midx.txt  |  60 ++
 Documentation/technical/midx.txt| 109 +++
 Documentation/technical/pack-format.txt |  82 +++
 Makefile|   2 +
 builtin.h   |   1 +
 builtin/midx.c  |  88 +++
 builtin/repack.c|   8 +
 cache.h |   1 +
 command-list.txt|   1 +
 config.c|   5 +
 environment.c   |   1 +
 git.c   |   1 +
 midx.c  | 923 
 midx.h  |  23 +
 object-store.h  |  35 +
 packfile.c  |  47 +-
 packfile.h  |   1 +
 sha1-name.c |  70 ++
 t/t5319-midx.sh | 192 +
 21 files changed, 1652 insertions(+), 3 deletions(-)
 create mode 100644 Documentation/git-midx.txt
 create mode 100644 Documentation/technical/midx.txt
 create mode 100644 builtin/midx.c
 create mode 100644 midx.c
 create mode 100644 midx.h
 create mode 100755 t/t5319-midx.sh

-- 
2.18.0.rc1



[PATCH 03/23] midx: add midx builtin

2018-06-07 Thread Derrick Stolee
This new 'git midx' builtin will be the plumbing access for writing,
reading, and checking multi-pack-index (MIDX) files. The initial
implementation is a no-op.

Signed-off-by: Derrick Stolee 
---
 .gitignore |  1 +
 Documentation/git-midx.txt | 29 +
 Makefile   |  1 +
 builtin.h  |  1 +
 builtin/midx.c | 38 ++
 command-list.txt   |  1 +
 git.c  |  1 +
 7 files changed, 72 insertions(+)
 create mode 100644 Documentation/git-midx.txt
 create mode 100644 builtin/midx.c

diff --git a/.gitignore b/.gitignore
index 388cc4beee..e309644d6b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -97,6 +97,7 @@
 /git-merge-subtree
 /git-mergetool
 /git-mergetool--lib
+/git-midx
 /git-mktag
 /git-mktree
 /git-name-rev
diff --git a/Documentation/git-midx.txt b/Documentation/git-midx.txt
new file mode 100644
index 00..2bd886f1a2
--- /dev/null
+++ b/Documentation/git-midx.txt
@@ -0,0 +1,29 @@
+git-midx(1)
+
+
+NAME
+
+git-midx - Write and verify multi-pack-indexes (MIDX files).
+
+
+SYNOPSIS
+
+[verse]
+'git midx' [--object-dir <dir>]
+
+DESCRIPTION
+---
+Write or verify a MIDX file.
+
+OPTIONS
+---
+
+--object-dir <dir>::
+   Use given directory for the location of Git objects. We check
+   <dir>/packs/multi-pack-index for the current MIDX file, and
+   <dir>/packs for the pack-files to index.
+
+
+GIT
+---
+Part of the linkgit:git[1] suite
diff --git a/Makefile b/Makefile
index 1d27f36365..88958c7b42 100644
--- a/Makefile
+++ b/Makefile
@@ -1045,6 +1045,7 @@ BUILTIN_OBJS += builtin/merge-index.o
 BUILTIN_OBJS += builtin/merge-ours.o
 BUILTIN_OBJS += builtin/merge-recursive.o
 BUILTIN_OBJS += builtin/merge-tree.o
+BUILTIN_OBJS += builtin/midx.o
 BUILTIN_OBJS += builtin/mktag.o
 BUILTIN_OBJS += builtin/mktree.o
 BUILTIN_OBJS += builtin/mv.o
diff --git a/builtin.h b/builtin.h
index 4e0f64723e..7b5bd46c7d 100644
--- a/builtin.h
+++ b/builtin.h
@@ -189,6 +189,7 @@ extern int cmd_merge_ours(int argc, const char **argv, 
const char *prefix);
 extern int cmd_merge_file(int argc, const char **argv, const char *prefix);
 extern int cmd_merge_recursive(int argc, const char **argv, const char 
*prefix);
 extern int cmd_merge_tree(int argc, const char **argv, const char *prefix);
+extern int cmd_midx(int argc, const char **argv, const char *prefix);
 extern int cmd_mktag(int argc, const char **argv, const char *prefix);
 extern int cmd_mktree(int argc, const char **argv, const char *prefix);
 extern int cmd_mv(int argc, const char **argv, const char *prefix);
diff --git a/builtin/midx.c b/builtin/midx.c
new file mode 100644
index 00..59ea92178f
--- /dev/null
+++ b/builtin/midx.c
@@ -0,0 +1,38 @@
+#include "builtin.h"
+#include "cache.h"
+#include "config.h"
+#include "git-compat-util.h"
+#include "parse-options.h"
+
+static char const * const builtin_midx_usage[] ={
+   N_("git midx [--object-dir <dir>]"),
+   NULL
+};
+
+static struct opts_midx {
+   const char *object_dir;
+} opts;
+
+int cmd_midx(int argc, const char **argv, const char *prefix)
+{
+   static struct option builtin_midx_options[] = {
+   { OPTION_STRING, 0, "object-dir", &opts.object_dir,
+ N_("dir"),
+ N_("The object directory containing set of packfile and 
pack-index pairs.") },
+   OPT_END(),
+   };
+
+   if (argc == 2 && !strcmp(argv[1], "-h"))
+   usage_with_options(builtin_midx_usage, builtin_midx_options);
+
+   git_config(git_default_config, NULL);
+
+   argc = parse_options(argc, argv, prefix,
+builtin_midx_options,
+builtin_midx_usage, 0);
+
+   if (!opts.object_dir)
+   opts.object_dir = get_object_directory();
+
+   return 0;
+}
diff --git a/command-list.txt b/command-list.txt
index e1c26c1bb7..a21bd7470e 100644
--- a/command-list.txt
+++ b/command-list.txt
@@ -123,6 +123,7 @@ git-merge-index plumbingmanipulators
 git-merge-one-file  purehelpers
 git-mergetool   ancillarymanipulators   
complete
 git-merge-tree  ancillaryinterrogators
+git-midxplumbingmanipulators
 git-mktag   plumbingmanipulators
 git-mktree  plumbingmanipulators
 git-mv  mainporcelain   worktree
diff --git a/git.c b/git.c
index c2f48d53dd..400fadd677 100644
--- a/git.c
+++ b/git.c
@@ -503,6 +503,7 @@ static struct cmd_struct commands[] = {
{ "merge-recursive-theirs", cmd_merge_recursive, RUN_SETUP | 
NEED_WORK_TREE | NO_PARSEOPT },
{ "merge-subtree", cmd_merge_recursive,

[PATCH 09/23] midx: write pack names in chunk

2018-06-07 Thread Derrick Stolee
The multi-pack-index (MIDX) needs to track which pack-files are covered
by the MIDX file. Store these in our first required chunk. Since
filenames are not well structured, add padding to keep good alignment in
later chunks.

Modify the 'git midx read' subcommand to output the existence of the
pack-file name chunk. Modify t5319-midx.sh to reflect this new output
and the new expected number of chunks.

Defense in depth: A pattern we are using in the multi-pack-index feature
is to verify the data as we write it. We want to ensure we never write
invalid data to the multi-pack-index. There are many checks during the
write of a MIDX file that double-check that the values we are writing
fit the format definitions. If any value is incorrect, then we notice
before writing invalid data. This mainly helps developers while working
on the feature, but it can also identify issues that only appear when
dealing with very large data sets. These large sets are hard to encode
into test cases.

Signed-off-by: Derrick Stolee 
---
 Documentation/technical/pack-format.txt |   6 +
 builtin/midx.c  |   7 +
 midx.c  | 176 +++-
 object-store.h  |   2 +
 t/t5319-midx.sh |   3 +-
 5 files changed, 188 insertions(+), 6 deletions(-)

diff --git a/Documentation/technical/pack-format.txt 
b/Documentation/technical/pack-format.txt
index 17666b4bfc..2b37be7b33 100644
--- a/Documentation/technical/pack-format.txt
+++ b/Documentation/technical/pack-format.txt
@@ -296,6 +296,12 @@ CHUNK LOOKUP:
 
 CHUNK DATA:
 
+   Packfile Names (ID: {'P', 'N', 'A', 'M'})
+   Stores the packfile names as concatenated, null-terminated strings.
+   Packfiles must be listed in lexicographic order for fast lookups by
+   name. This is the only chunk not guaranteed to be a multiple of four
+   bytes in length, so should be the last chunk for alignment reasons.
+
(This section intentionally left incomplete.)
 
 TRAILER:
diff --git a/builtin/midx.c b/builtin/midx.c
index c7002f664a..fe56560853 100644
--- a/builtin/midx.c
+++ b/builtin/midx.c
@@ -28,6 +28,13 @@ static int read_midx_file(const char *object_dir)
   m->num_chunks,
   m->num_packs);
 
+   printf("chunks:");
+
+   if (m->chunk_pack_names)
+   printf(" pack_names");
+
+   printf("\n");
+
printf("object_dir: %s\n", m->object_dir);
 
return 0;
diff --git a/midx.c b/midx.c
index 9fb89c80a2..d4f4a01a51 100644
--- a/midx.c
+++ b/midx.c
@@ -13,6 +13,11 @@
 #define MIDX_HASH_LEN 20
 #define MIDX_MIN_SIZE (MIDX_HEADER_SIZE + MIDX_HASH_LEN)
 
+#define MIDX_MAX_CHUNKS 1
+#define MIDX_CHUNK_ALIGNMENT 4
+#define MIDX_CHUNKID_PACKNAMES 0x504e414d /* "PNAM" */
+#define MIDX_CHUNKLOOKUP_WIDTH (sizeof(uint32_t) + sizeof(uint64_t))
+
 static char *get_midx_filename(const char *object_dir)
 {
struct strbuf midx_name = STRBUF_INIT;
@@ -29,6 +34,7 @@ struct midxed_git *load_midxed_git(const char *object_dir)
size_t midx_size;
void *midx_map;
const char *midx_name = get_midx_filename(object_dir);
+   uint32_t i;
 
fd = git_open(midx_name);
if (fd < 0)
@@ -74,6 +80,31 @@ struct midxed_git *load_midxed_git(const char *object_dir)
m->num_chunks = *(m->data + 6);
m->num_packs = get_be32(m->data + 8);
 
+   for (i = 0; i < m->num_chunks; i++) {
+   uint32_t chunk_id = get_be32(m->data + 12 + 
MIDX_CHUNKLOOKUP_WIDTH * i);
+   uint64_t chunk_offset = get_be64(m->data + 16 + 
MIDX_CHUNKLOOKUP_WIDTH * i);
+
+   switch (chunk_id) {
+   case MIDX_CHUNKID_PACKNAMES:
+   m->chunk_pack_names = m->data + chunk_offset;
+   break;
+
+   case 0:
+   die("terminating MIDX chunk id appears earlier 
than expected");
+   break;
+
+   default:
+   /*
+* Do nothing on unrecognized chunks, allowing 
future
+* extensions to add optional chunks.
+*/
+   break;
+   }
+   }
+
+   if (!m->chunk_pack_names)
+   die("MIDX missing required pack-name chunk");
+
return m;
 
 cleanup_fail:
@@ -99,18 +130,88 @@ static size_t write_midx_header(struct hashfile *f,
return MIDX_HEADER_SIZE;
 }
 
+struct pack_pair {
+   uint32_t pack_int_id;
+   char *pack_name;
+};
+
+static int pack_pair_compare(const void *_a, const void *_b)
+{
+   struct pack_pair *a = (struct pack_pair *)_a;
+   struct pack_pair *b = (struct pack_pair *)_b;
+ 

[PATCH 16/23] midx: prepare midxed_git struct

2018-06-07 Thread Derrick Stolee
Signed-off-by: Derrick Stolee 
---
 midx.c | 22 ++
 midx.h |  2 ++
 object-store.h |  7 +++
 packfile.c |  6 +-
 4 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/midx.c b/midx.c
index a49300bf75..5e9290ca8f 100644
--- a/midx.c
+++ b/midx.c
@@ -175,6 +175,28 @@ struct midxed_git *load_midxed_git(const char *object_dir)
exit(1);
 }
 
+int prepare_midxed_git_one(struct repository *r, const char *object_dir)
+{
+   struct midxed_git *m = r->objects->midxed_git;
+   struct midxed_git *m_search;
+
+   if (!core_midx)
+   return 0;
+
+   for (m_search = m; m_search; m_search = m_search->next)
+   if (!strcmp(object_dir, m_search->object_dir))
+   return 1;
+
+   r->objects->midxed_git = load_midxed_git(object_dir);
+
+   if (r->objects->midxed_git) {
+   r->objects->midxed_git->next = m;
+   return 1;
+   }
+
+   return 0;
+}
+
 static size_t write_midx_header(struct hashfile *f,
unsigned char num_chunks,
uint32_t num_packs)
diff --git a/midx.h b/midx.h
index a1d18ed991..793203fc4a 100644
--- a/midx.h
+++ b/midx.h
@@ -5,8 +5,10 @@
 #include "cache.h"
 #include "object-store.h"
 #include "packfile.h"
+#include "repository.h"
 
 struct midxed_git *load_midxed_git(const char *object_dir);
+int prepare_midxed_git_one(struct repository *r, const char *object_dir);
 
 int write_midx_file(const char *object_dir);
 
diff --git a/object-store.h b/object-store.h
index 9b671f1b0a..7908d46e34 100644
--- a/object-store.h
+++ b/object-store.h
@@ -130,6 +130,13 @@ struct raw_object_store {
 */
struct oidmap *replace_map;
 
+   /*
+* private data
+*
+* should only be accessed directly by packfile.c and midx.c
+*/
+   struct midxed_git *midxed_git;
+
/*
 * private data
 *
diff --git a/packfile.c b/packfile.c
index 1a714fbde9..b91ca9b9f5 100644
--- a/packfile.c
+++ b/packfile.c
@@ -15,6 +15,7 @@
 #include "tree-walk.h"
 #include "tree.h"
 #include "object-store.h"
+#include "midx.h"
 
 char *odb_pack_name(struct strbuf *buf,
const unsigned char *sha1,
@@ -893,10 +894,13 @@ static void prepare_packed_git(struct repository *r)
 
if (r->objects->packed_git_initialized)
return;
+   prepare_midxed_git_one(r, r->objects->objectdir);
prepare_packed_git_one(r, r->objects->objectdir, 1);
prepare_alt_odb(r);
-   for (alt = r->objects->alt_odb_list; alt; alt = alt->next)
+   for (alt = r->objects->alt_odb_list; alt; alt = alt->next) {
+   prepare_midxed_git_one(r, alt->path);
prepare_packed_git_one(r, alt->path, 0);
+   }
rearrange_packed_git(r);
prepare_packed_git_mru(r);
r->objects->packed_git_initialized = 1;
-- 
2.18.0.rc1



[PATCH 01/23] midx: add design document

2018-06-07 Thread Derrick Stolee
Signed-off-by: Derrick Stolee 
---
 Documentation/technical/midx.txt | 109 +++
 1 file changed, 109 insertions(+)
 create mode 100644 Documentation/technical/midx.txt

diff --git a/Documentation/technical/midx.txt b/Documentation/technical/midx.txt
new file mode 100644
index 00..789f410d71
--- /dev/null
+++ b/Documentation/technical/midx.txt
@@ -0,0 +1,109 @@
+Multi-Pack-Index (MIDX) Design Notes
+
+
+The Git object directory contains a 'pack' directory containing
+packfiles (with suffix ".pack") and pack-indexes (with suffix
+".idx"). The pack-indexes provide a way to lookup objects and
+navigate to their offset within the pack, but these must come
+in pairs with the packfiles. This pairing depends on the file
+names, as the pack-index differs only in suffix with its pack-
+file. While the pack-indexes provide fast lookup per packfile,
+this performance degrades as the number of packfiles increases,
+because abbreviations need to inspect every packfile and we are
+more likely to have a miss on our most-recently-used packfile.
+For some large repositories, repacking into a single packfile
+is not feasible due to storage space or excessive repack times.
+
+The multi-pack-index (MIDX for short) stores a list of objects
+and their offsets into multiple packfiles. It contains:
+
+- A list of packfile names.
+- A sorted list of object IDs.
+- A list of metadata for the ith object ID including:
+  - A value j referring to the jth packfile.
+  - An offset within the jth packfile for the object.
+- If large offsets are required, we use another list of large
+  offsets similar to version 2 pack-indexes.
+
+Thus, we can provide O(log N) lookup time for any number
+of packfiles.
+
+Design Details
+--
+
+- The MIDX is stored in a file named 'multi-pack-index' in the
+  .git/objects/pack directory. This could be stored in the pack
+  directory of an alternate. It refers only to packfiles in that
+  same directory.
+
+- The core.midx config setting must be on to consume MIDX files.
+
+- The file format includes parameters for the object ID hash
+  function, so a future change of hash algorithm does not require
+  a change in format.
+
+- The MIDX keeps only one record per object ID. If an object appears
+  in multiple packfiles, then the MIDX selects the copy in the most-
+  recently modified packfile.
+
+- If there exist packfiles in the pack directory not registered in
+  the MIDX, then those packfiles are loaded into the `packed_git`
+  list and `packed_git_mru` cache.
+
+- The pack-indexes (.idx files) remain in the pack directory so we
+  can delete the MIDX file, set core.midx to false, or downgrade
+  without any loss of information.
+
+- The MIDX file format uses a chunk-based approach (similar to the
+  commit-graph file) that allows optional data to be added.
+
+Future Work
+---
+
+- Add a 'verify' subcommand to the 'git midx' builtin to verify the
+  contents of the multi-pack-index file match the offsets listed in
+  the corresponding pack-indexes.
+
+- The multi-pack-index allows many packfiles, especially in a context
+  where repacking is expensive (such as a very large repo), or
+  unexpected maintenance time is unacceptable (such as a high-demand
+  build machine). However, the multi-pack-index needs to be rewritten
+  in full every time. We can extend the format to be incremental, so
+  writes are fast. By storing a small "tip" multi-pack-index that
+  points to large "base" MIDX files, we can keep writes fast while
+  still reducing the number of binary searches required for object
+  lookups.
+
+- The reachability bitmap is currently paired directly with a single
+  packfile, using the pack-order as the object order to hopefully
+  compress the bitmaps well using run-length encoding. This could be
+  extended to pair a reachability bitmap with a multi-pack-index. If
+  the multi-pack-index is extended to store a "stable object order"
+  (a function Order(hash) = integer that is constant for a given hash,
+  even as the multi-pack-index is updated) then a reachability bitmap
+  could point to a multi-pack-index and be updated independently.
+
+- Packfiles can be marked as "special" using empty files that share
+  the initial name but replace ".pack" with ".keep" or ".promisor".
+  We can add an optional chunk of data to the multi-pack-index that
+  records flags of information about the packfiles. This allows new
+  states, such as 'repacked' or 'redeltified', that can help with
+  pack maintenance in a multi-pack environment. It may also be
+  helpful to organize packfiles by object type (commit, tree, blob,
+  etc.) and use this metadata to help that maintenance.
+
+- The partial clone feature records special "promisor" packs that
+  may point to objects that are not stored locally, but available
+  on re

Re: [PATCH 00/23] Multi-pack-index (MIDX)

2018-06-07 Thread Derrick Stolee

On 6/7/2018 10:03 AM, Derrick Stolee wrote:

This patch series includes a rewrite of the previous
multi-pack-index RFC [1] using the feedback from the
commit-graph feature.


Sorry to everyone who got a duplicate copy of this series. I misspelled 
'kernel.org' and it didn't go to the list.


I also have this series available as a GitHub PR [1]

[1] https://github.com/derrickstolee/git/pull/7



[PATCH v6 04/21] commit: force commit to parse from object database

2018-06-08 Thread Derrick Stolee
In anticipation of verifying commit-graph file contents against the
object database, create parse_commit_internal() to allow side-stepping
the commit-graph file and parse directly from the object database.

Due to the use of generation numbers, this method should not be called
unless the intention is explicit in avoiding commits from the
commit-graph file.

Signed-off-by: Derrick Stolee 
---
 commit.c | 9 +++--
 commit.h | 1 +
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/commit.c b/commit.c
index d53dc16d72..720c6acddf 100644
--- a/commit.c
+++ b/commit.c
@@ -418,7 +418,7 @@ int parse_commit_buffer(struct commit *item, const void 
*buffer, unsigned long s
return 0;
 }
 
-int parse_commit_gently(struct commit *item, int quiet_on_missing)
+int parse_commit_internal(struct commit *item, int quiet_on_missing, int 
use_commit_graph)
 {
enum object_type type;
void *buffer;
@@ -429,7 +429,7 @@ int parse_commit_gently(struct commit *item, int 
quiet_on_missing)
return -1;
if (item->object.parsed)
return 0;
-   if (parse_commit_in_graph(item))
+   if (use_commit_graph && parse_commit_in_graph(item))
return 0;
buffer = read_object_file(&item->object.oid, &type, &size);
if (!buffer)
@@ -450,6 +450,11 @@ int parse_commit_gently(struct commit *item, int 
quiet_on_missing)
return ret;
 }
 
+int parse_commit_gently(struct commit *item, int quiet_on_missing)
+{
+   return parse_commit_internal(item, quiet_on_missing, 1);
+}
+
 void parse_commit_or_die(struct commit *item)
 {
if (parse_commit(item))
diff --git a/commit.h b/commit.h
index 3ad07c2e3d..7e0f273720 100644
--- a/commit.h
+++ b/commit.h
@@ -77,6 +77,7 @@ struct commit *lookup_commit_reference_by_name(const char 
*name);
 struct commit *lookup_commit_or_die(const struct object_id *oid, const char 
*ref_name);
 
 int parse_commit_buffer(struct commit *item, const void *buffer, unsigned long 
size, int check_graph);
+int parse_commit_internal(struct commit *item, int quiet_on_missing, int 
use_commit_graph);
 int parse_commit_gently(struct commit *item, int quiet_on_missing);
 static inline int parse_commit(struct commit *item)
 {
-- 
2.18.0.rc1



[PATCH v6 01/21] commit-graph: UNLEAK before die()

2018-06-08 Thread Derrick Stolee
Signed-off-by: Derrick Stolee 
---
 builtin/commit-graph.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
index 37420ae0fd..f0875b8bf3 100644
--- a/builtin/commit-graph.c
+++ b/builtin/commit-graph.c
@@ -51,8 +51,11 @@ static int graph_read(int argc, const char **argv)
graph_name = get_commit_graph_filename(opts.obj_dir);
graph = load_commit_graph_one(graph_name);
 
-   if (!graph)
+   if (!graph) {
+   UNLEAK(graph_name);
die("graph file %s does not exist", graph_name);
+   }
+
FREE_AND_NULL(graph_name);
 
printf("header: %08x %d %d %d %d\n",
-- 
2.18.0.rc1



[PATCH v6 11/21] commit-graph: verify root tree OIDs

2018-06-08 Thread Derrick Stolee
The 'verify' subcommand must compare the commit content parsed from the
commit-graph against the content in the object database. Use
lookup_commit() and parse_commit_in_graph_one() to parse the commits
from the graph and compare against a commit that is loaded separately
and parsed directly from the object database.

Add checks for the root tree OID.

Signed-off-by: Derrick Stolee 
---
 commit-graph.c  | 17 -
 t/t5318-commit-graph.sh |  7 +++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/commit-graph.c b/commit-graph.c
index 00e89b71e9..5df18394f9 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -866,6 +866,8 @@ int verify_commit_graph(struct repository *r, struct 
commit_graph *g)
return verify_commit_graph_error;
 
for (i = 0; i < g->num_commits; i++) {
+   struct commit *graph_commit;
+
hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
 
if (i && oidcmp(&prev_oid, &cur_oid) >= 0)
@@ -883,6 +885,11 @@ int verify_commit_graph(struct repository *r, struct 
commit_graph *g)
 
cur_fanout_pos++;
}
+
+   graph_commit = lookup_commit(&cur_oid);
+   if (!parse_commit_in_graph_one(g, graph_commit))
+   graph_report("failed to parse %s from commit-graph",
+oid_to_hex(&cur_oid));
}
 
while (cur_fanout_pos < 256) {
@@ -899,16 +906,24 @@ int verify_commit_graph(struct repository *r, struct 
commit_graph *g)
return verify_commit_graph_error;
 
for (i = 0; i < g->num_commits; i++) {
-   struct commit *odb_commit;
+   struct commit *graph_commit, *odb_commit;
 
hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
 
+   graph_commit = lookup_commit(&cur_oid);
odb_commit = (struct commit *)create_object(r, cur_oid.hash, 
alloc_commit_node(r));
if (parse_commit_internal(odb_commit, 0, 0)) {
graph_report("failed to parse %s from object database",
 oid_to_hex(&cur_oid));
continue;
}
+
+   if (oidcmp(&get_commit_tree_in_graph_one(g, 
graph_commit)->object.oid,
+  get_commit_tree_oid(odb_commit)))
+   graph_report("root tree OID for commit %s in 
commit-graph is %s != %s",
+oid_to_hex(&cur_oid),
+
oid_to_hex(get_commit_tree_oid(graph_commit)),
+
oid_to_hex(get_commit_tree_oid(odb_commit)));
}
 
return verify_commit_graph_error;
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index af5e34c0cb..2b9214bc83 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -267,6 +267,8 @@ GRAPH_BYTE_FANOUT2=$(($GRAPH_FANOUT_OFFSET + 4 * 255))
 GRAPH_OID_LOOKUP_OFFSET=$(($GRAPH_FANOUT_OFFSET + 4 * 256))
 GRAPH_BYTE_OID_LOOKUP_ORDER=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN * 8))
 GRAPH_BYTE_OID_LOOKUP_MISSING=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN * 4 + 
10))
+GRAPH_COMMIT_DATA_OFFSET=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN * 
$NUM_COMMITS))
+GRAPH_BYTE_COMMIT_TREE=$GRAPH_COMMIT_DATA_OFFSET
 
 # usage: corrupt_graph_and_verify <position> <data> <string>
 # Manipulates the commit-graph file at the position
@@ -341,4 +343,9 @@ test_expect_success 'detect OID not in object database' '
"from object database"
 '
 
+test_expect_success 'detect incorrect tree OID' '
+   corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_TREE "\01" \
+   "root tree OID for commit"
+'
+
 test_done
-- 
2.18.0.rc1



[PATCH v6 14/21] commit-graph: verify commit date

2018-06-08 Thread Derrick Stolee
Signed-off-by: Derrick Stolee 
---
 commit-graph.c  | 6 ++
 t/t5318-commit-graph.sh | 6 ++
 2 files changed, 12 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index e0f71658da..6d6c6beff9 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -986,6 +986,12 @@ int verify_commit_graph(struct repository *r, struct 
commit_graph *g)
 oid_to_hex(&cur_oid),
 graph_commit->generation,
 max_generation + 1);
+
+   if (graph_commit->date != odb_commit->date)
+   graph_report("commit date for commit %s in commit-graph 
is %"PRItime" != %"PRItime,
+oid_to_hex(&cur_oid),
+graph_commit->date,
+odb_commit->date);
}
 
return verify_commit_graph_error;
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 5b75c4dca2..b7b4410e75 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -273,6 +273,7 @@ GRAPH_BYTE_COMMIT_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + 
$HASH_LEN))
 GRAPH_BYTE_COMMIT_EXTRA_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 4))
 GRAPH_BYTE_COMMIT_WRONG_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 3))
 GRAPH_BYTE_COMMIT_GENERATION=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 11))
+GRAPH_BYTE_COMMIT_DATE=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 12))
 
 # usage: corrupt_graph_and_verify <position> <data> <string>
 # Manipulates the commit-graph file at the position
@@ -377,4 +378,9 @@ test_expect_success 'detect incorrect generation number' '
"non-zero generation number"
 '
 
+test_expect_success 'detect incorrect commit date' '
+   corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_DATE "\01" \
+   "commit date"
+'
+
 test_done
-- 
2.18.0.rc1



[PATCH v6 07/21] commit-graph: verify catches corrupt signature

2018-06-08 Thread Derrick Stolee
This is the first of several commits that add a test to check that
'git commit-graph verify' catches corruption in the commit-graph
file. The first test checks that the command catches an error in
the file signature. This is a check that exists in the existing
commit-graph reading code.

Add a helper method 'corrupt_graph_and_verify' to the test script
t5318-commit-graph.sh. This helper corrupts the commit-graph file
at a certain location, runs 'git commit-graph verify', and reports
the output to the 'err' file. This data is filtered to remove the
lines added by 'test_must_fail' when the test is run verbosely.
Then, the output is checked to contain a specific error message.

Most messages from 'git commit-graph verify' will not be marked
for translation. There will be one exception: the message that
reports an invalid checksum will be marked for translation, as that
is the only message that is intended for a typical user.

Helped-by: Szeder Gábor 
Signed-off-by: Derrick Stolee 
---
 t/t5318-commit-graph.sh | 43 +
 1 file changed, 43 insertions(+)

diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 6ca451dfd2..8f96e2636c 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -235,9 +235,52 @@ test_expect_success 'perform fast-forward merge in full 
repo' '
test_cmp expect output
 '
 
+# the verify tests below expect the commit-graph to contain
+# exactly the commits reachable from the commits/8 branch.
+# If the file changes the set of commits in the list, then the
+# offsets into the binary file will result in different edits
+# and the tests will likely break.
+
 test_expect_success 'git commit-graph verify' '
cd "$TRASH_DIRECTORY/full" &&
+   git rev-parse commits/8 | git commit-graph write --stdin-commits &&
git commit-graph verify >output
 '
 
+GRAPH_BYTE_VERSION=4
+GRAPH_BYTE_HASH=5
+
+# usage: corrupt_graph_and_verify <position> <data> <string>
+# Manipulates the commit-graph file at the position
+# by inserting the data, then runs 'git commit-graph verify'
+# and places the output in the file 'err'. Test 'err' for
+# the given string.
+corrupt_graph_and_verify() {
+   pos=$1
+   data="${2:-\0}"
+   grepstr=$3
+   cd "$TRASH_DIRECTORY/full" &&
+   test_when_finished mv commit-graph-backup $objdir/info/commit-graph &&
+   cp $objdir/info/commit-graph commit-graph-backup &&
+   printf "$data" | dd of="$objdir/info/commit-graph" bs=1 seek="$pos" 
conv=notrunc &&
+   test_must_fail git commit-graph verify 2>test_err &&
+   grep -v "^+" test_err >err
+   test_i18ngrep "$grepstr" err
+}
+
+test_expect_success 'detect bad signature' '
+   corrupt_graph_and_verify 0 "\0" \
+   "graph signature"
+'
+
+test_expect_success 'detect bad version' '
+   corrupt_graph_and_verify $GRAPH_BYTE_VERSION "\02" \
+   "graph version"
+'
+
+test_expect_success 'detect bad hash version' '
+   corrupt_graph_and_verify $GRAPH_BYTE_HASH "\02" \
+   "hash version"
+'
+
 test_done
-- 
2.18.0.rc1



[PATCH v6 08/21] commit-graph: verify required chunks are present

2018-06-08 Thread Derrick Stolee
The commit-graph file requires the following three chunks:

* OID Fanout
* OID Lookup
* Commit Data

If any of these are missing, then the 'verify' subcommand should
report a failure. This includes cases where the chunk IDs are
malformed or the chunk count is truncated.

Signed-off-by: Derrick Stolee 
---
 commit-graph.c  |  9 +
 t/t5318-commit-graph.sh | 29 +
 2 files changed, 38 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index 22ef696e18..f30b4ccee9 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -848,5 +848,14 @@ int verify_commit_graph(struct repository *r, struct 
commit_graph *g)
return 1;
}
 
+   verify_commit_graph_error = 0;
+
+   if (!g->chunk_oid_fanout)
+   graph_report("commit-graph is missing the OID Fanout chunk");
+   if (!g->chunk_oid_lookup)
+   graph_report("commit-graph is missing the OID Lookup chunk");
+   if (!g->chunk_commit_data)
+   graph_report("commit-graph is missing the Commit Data chunk");
+
return verify_commit_graph_error;
 }
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 8f96e2636c..c03792a8ed 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -249,6 +249,15 @@ test_expect_success 'git commit-graph verify' '
 
 GRAPH_BYTE_VERSION=4
 GRAPH_BYTE_HASH=5
+GRAPH_BYTE_CHUNK_COUNT=6
+GRAPH_CHUNK_LOOKUP_OFFSET=8
+GRAPH_CHUNK_LOOKUP_WIDTH=12
+GRAPH_CHUNK_LOOKUP_ROWS=5
+GRAPH_BYTE_OID_FANOUT_ID=$GRAPH_CHUNK_LOOKUP_OFFSET
+GRAPH_BYTE_OID_LOOKUP_ID=$(($GRAPH_CHUNK_LOOKUP_OFFSET + \
+   1 * $GRAPH_CHUNK_LOOKUP_WIDTH))
+GRAPH_BYTE_COMMIT_DATA_ID=$(($GRAPH_CHUNK_LOOKUP_OFFSET + \
+2 * $GRAPH_CHUNK_LOOKUP_WIDTH))
 
 # usage: corrupt_graph_and_verify <position> <data> <string>
 # Manipulates the commit-graph file at the position
@@ -283,4 +292,24 @@ test_expect_success 'detect bad hash version' '
"hash version"
 '
 
+test_expect_success 'detect low chunk count' '
+   corrupt_graph_and_verify $GRAPH_BYTE_CHUNK_COUNT "\02" \
+   "missing the .* chunk"
+'
+
+test_expect_success 'detect missing OID fanout chunk' '
+   corrupt_graph_and_verify $GRAPH_BYTE_OID_FANOUT_ID "\0" \
+   "missing the OID Fanout chunk"
+'
+
+test_expect_success 'detect missing OID lookup chunk' '
+   corrupt_graph_and_verify $GRAPH_BYTE_OID_LOOKUP_ID "\0" \
+   "missing the OID Lookup chunk"
+'
+
+test_expect_success 'detect missing commit data chunk' '
+   corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_DATA_ID "\0" \
+   "missing the Commit Data chunk"
+'
+
 test_done
-- 
2.18.0.rc1



[PATCH v6 13/21] commit-graph: verify generation number

2018-06-08 Thread Derrick Stolee
While iterating through the commit parents, perform the generation
number calculation and compare against the value stored in the
commit-graph.

The tests demonstrate that having a different set of parents affects
the generation number calculation, and this value propagates to
descendants. Hence, we drop the single-line condition on the output.

Since Git will ship with the commit-graph feature without generation
numbers, we need to accept commit-graphs with all generation numbers
equal to zero. In this case, ignore the generation number calculation.

However, verify that we never have a mix of zero and non-zero
generation numbers. Create a test that sets one commit to generation
zero, so that all following commits report a failure because they have
a non-zero generation in a file that contains generation number zero.

Signed-off-by: Derrick Stolee 
---
 commit-graph.c  | 34 ++
 t/t5318-commit-graph.sh | 11 +++
 2 files changed, 45 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index 6d8d774eb0..e0f71658da 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -846,10 +846,14 @@ static void graph_report(const char *fmt, ...)
va_end(ap);
 }
 
+#define GENERATION_ZERO_EXISTS 1
+#define GENERATION_NUMBER_EXISTS 2
+
 int verify_commit_graph(struct repository *r, struct commit_graph *g)
 {
uint32_t i, cur_fanout_pos = 0;
struct object_id prev_oid, cur_oid;
+   int generation_zero = 0;
 
if (!g) {
graph_report("no commit-graph file loaded");
@@ -911,6 +915,7 @@ int verify_commit_graph(struct repository *r, struct 
commit_graph *g)
for (i = 0; i < g->num_commits; i++) {
struct commit *graph_commit, *odb_commit;
struct commit_list *graph_parents, *odb_parents;
+   uint32_t max_generation = 0;
 
hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
 
@@ -945,6 +950,9 @@ int verify_commit_graph(struct repository *r, struct 
commit_graph *g)
 
oid_to_hex(_parents->item->object.oid),
 
oid_to_hex(_parents->item->object.oid));
 
+   if (graph_parents->item->generation > max_generation)
+   max_generation = 
graph_parents->item->generation;
+
graph_parents = graph_parents->next;
odb_parents = odb_parents->next;
}
@@ -952,6 +960,32 @@ int verify_commit_graph(struct repository *r, struct 
commit_graph *g)
if (odb_parents != NULL)
graph_report("commit-graph parent list for commit %s 
terminates early",
 oid_to_hex(_oid));
+
+   if (!graph_commit->generation) {
+   if (generation_zero == GENERATION_NUMBER_EXISTS)
+   graph_report("commit-graph has generation 
number zero for commit %s, but non-zero elsewhere",
+oid_to_hex(_oid));
+   generation_zero = GENERATION_ZERO_EXISTS;
+   } else if (generation_zero == GENERATION_ZERO_EXISTS)
+   graph_report("commit-graph has non-zero generation 
number for commit %s, but zero elsewhere",
+oid_to_hex(_oid));
+
+   if (generation_zero == GENERATION_ZERO_EXISTS)
+   continue;
+
+   /*
+* If one of our parents has generation GENERATION_NUMBER_MAX, 
then
+* our generation is also GENERATION_NUMBER_MAX. Decrement to 
avoid
+* extra logic in the following condition.
+*/
+   if (max_generation == GENERATION_NUMBER_MAX)
+   max_generation--;
+
+   if (graph_commit->generation != max_generation + 1)
+   graph_report("commit-graph generation for commit %s is 
%u != %u",
+oid_to_hex(_oid),
+graph_commit->generation,
+max_generation + 1);
}
 
return verify_commit_graph_error;
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 9a3481c30f..5b75c4dca2 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -272,6 +272,7 @@ GRAPH_BYTE_COMMIT_TREE=$GRAPH_COMMIT_DATA_OFFSET
 GRAPH_BYTE_COMMIT_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN))
 GRAPH_BYTE_COMMIT_EXTRA_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 4))
 GRAPH_BYTE_COMMIT_WRONG_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 3))
+GRAPH_BYTE_COMMIT_GENERATION=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 11))
 
 # usage: corrupt_graph_and_verify <position> <data> <string>
 # Manipul

[PATCH v6 17/21] fsck: verify commit-graph

2018-06-08 Thread Derrick Stolee
If core.commitGraph is true, verify the contents of the commit-graph
during 'git fsck' using the 'git commit-graph verify' subcommand. Run
this check on all alternates, as well.

We use a new process for two reasons:

1. The subcommand decouples the details of loading and verifying a
   commit-graph file from the other fsck details.

2. The commit-graph verification requires the commits to be loaded
   in a specific order to guarantee we parse from the commit-graph
   file for some objects and from the object database for others.

Signed-off-by: Derrick Stolee 
---
 Documentation/git-fsck.txt |  3 +++
 builtin/fsck.c | 21 +
 t/t5318-commit-graph.sh|  8 
 3 files changed, 32 insertions(+)

diff --git a/Documentation/git-fsck.txt b/Documentation/git-fsck.txt
index b9f060e3b2..ab9a93fb9b 100644
--- a/Documentation/git-fsck.txt
+++ b/Documentation/git-fsck.txt
@@ -110,6 +110,9 @@ Any corrupt objects you will have to find in backups or 
other archives
 (i.e., you can just remove them and do an 'rsync' with some other site in
 the hopes that somebody else has the object you have corrupted).
 
+If core.commitGraph is true, the commit-graph file will also be inspected
+using 'git commit-graph verify'. See linkgit:git-commit-graph[1].
+
 Extracted Diagnostics
 ---------------------
 
diff --git a/builtin/fsck.c b/builtin/fsck.c
index 3ad4f160f9..9fb2edc69f 100644
--- a/builtin/fsck.c
+++ b/builtin/fsck.c
@@ -18,6 +18,7 @@
 #include "decorate.h"
 #include "packfile.h"
 #include "object-store.h"
+#include "run-command.h"
 
 #define REACHABLE 0x0001
 #define SEEN  0x0002
@@ -47,6 +48,7 @@ static int name_objects;
 #define ERROR_REACHABLE 02
 #define ERROR_PACK 04
 #define ERROR_REFS 010
+#define ERROR_COMMIT_GRAPH 020
 
 static const char *describe_object(struct object *obj)
 {
@@ -822,5 +824,24 @@ int cmd_fsck(int argc, const char **argv, const char 
*prefix)
}
 
check_connectivity();
+
+   if (core_commit_graph) {
+   struct child_process commit_graph_verify = CHILD_PROCESS_INIT;
+   const char *verify_argv[] = { "commit-graph", "verify", NULL, 
NULL, NULL };
+   commit_graph_verify.argv = verify_argv;
+   commit_graph_verify.git_cmd = 1;
+
+   if (run_command(_graph_verify))
+   errors_found |= ERROR_COMMIT_GRAPH;
+
+   prepare_alt_odb(the_repository);
+   for (alt =  the_repository->objects->alt_odb_list; alt; alt = 
alt->next) {
+   verify_argv[2] = "--object-dir";
+   verify_argv[3] = alt->path;
+   if (run_command(_graph_verify))
+   errors_found |= ERROR_COMMIT_GRAPH;
+   }
+   }
+
return errors_found;
 }
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index bd5b8428af..0da5a51552 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -399,4 +399,12 @@ test_expect_success 'detect invalid checksum hash' '
"incorrect checksum"
 '
 
+test_expect_success 'git fsck (checks commit-graph)' '
+   cd "$TRASH_DIRECTORY/full" &&
+   git fsck &&
+   corrupt_graph_and_verify $GRAPH_BYTE_FOOTER "\00" \
+   "incorrect checksum" &&
+   test_must_fail git fsck
+'
+
 test_done
-- 
2.18.0.rc1



[PATCH v6 12/21] commit-graph: verify parent list

2018-06-08 Thread Derrick Stolee
The commit-graph file stores parents in a two-column portion of the
commit data chunk. If there is only one parent, then the second column
stores 0x70000000 to indicate no second parent.

The 'verify' subcommand compares the parent list of the commit loaded
from the commit-graph against the one parsed from the object database.
Test these checks for corrupt parents, too many parents, and wrong parents.

Add a boundary check to insert_parent_or_die() for when the parent
position value is out of range.

The octopus merge will be tested in a later commit.

Signed-off-by: Derrick Stolee 
---
 commit-graph.c  | 28 
 t/t5318-commit-graph.sh | 18 ++
 2 files changed, 46 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index 5df18394f9..6d8d774eb0 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -244,6 +244,9 @@ static struct commit_list **insert_parent_or_die(struct 
commit_graph *g,
struct commit *c;
struct object_id oid;
 
+   if (pos >= g->num_commits)
+   die("invalid parent position %"PRIu64, pos);
+
hashcpy(oid.hash, g->chunk_oid_lookup + g->hash_len * pos);
c = lookup_commit();
if (!c)
@@ -907,6 +910,7 @@ int verify_commit_graph(struct repository *r, struct 
commit_graph *g)
 
for (i = 0; i < g->num_commits; i++) {
struct commit *graph_commit, *odb_commit;
+   struct commit_list *graph_parents, *odb_parents;
 
hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
 
@@ -924,6 +928,30 @@ int verify_commit_graph(struct repository *r, struct 
commit_graph *g)
 oid_to_hex(_oid),
 
oid_to_hex(get_commit_tree_oid(graph_commit)),
 
oid_to_hex(get_commit_tree_oid(odb_commit)));
+
+   graph_parents = graph_commit->parents;
+   odb_parents = odb_commit->parents;
+
+   while (graph_parents) {
+   if (odb_parents == NULL) {
+   graph_report("commit-graph parent list for 
commit %s is too long",
+oid_to_hex(_oid));
+   break;
+   }
+
+   if (oidcmp(_parents->item->object.oid, 
_parents->item->object.oid))
+   graph_report("commit-graph parent for %s is %s 
!= %s",
+oid_to_hex(_oid),
+
oid_to_hex(_parents->item->object.oid),
+
oid_to_hex(_parents->item->object.oid));
+
+   graph_parents = graph_parents->next;
+   odb_parents = odb_parents->next;
+   }
+
+   if (odb_parents != NULL)
+   graph_report("commit-graph parent list for commit %s 
terminates early",
+oid_to_hex(_oid));
}
 
return verify_commit_graph_error;
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 2b9214bc83..9a3481c30f 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -269,6 +269,9 @@ GRAPH_BYTE_OID_LOOKUP_ORDER=$(($GRAPH_OID_LOOKUP_OFFSET + 
$HASH_LEN * 8))
 GRAPH_BYTE_OID_LOOKUP_MISSING=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN * 4 + 
10))
 GRAPH_COMMIT_DATA_OFFSET=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN * 
$NUM_COMMITS))
 GRAPH_BYTE_COMMIT_TREE=$GRAPH_COMMIT_DATA_OFFSET
+GRAPH_BYTE_COMMIT_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN))
+GRAPH_BYTE_COMMIT_EXTRA_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 4))
+GRAPH_BYTE_COMMIT_WRONG_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 3))
 
 # usage: corrupt_graph_and_verify <position> <data> <string>
 # Manipulates the commit-graph file at the position
@@ -348,4 +351,19 @@ test_expect_success 'detect incorrect tree OID' '
"root tree OID for commit"
 '
 
+test_expect_success 'detect incorrect parent int-id' '
+   corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_PARENT "\01" \
+   "invalid parent"
+'
+
+test_expect_success 'detect extra parent int-id' '
+   corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_EXTRA_PARENT "\00" \
+   "is too long"
+'
+
+test_expect_success 'detect wrong parent' '
+   corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_WRONG_PARENT "\01" \
+   "commit-graph parent for"
+'
+
 test_done
-- 
2.18.0.rc1



[PATCH v6 09/21] commit-graph: verify corrupt OID fanout and lookup

2018-06-08 Thread Derrick Stolee
In the commit-graph file, the OID fanout chunk provides an index into
the OID lookup. The 'verify' subcommand should find incorrect values
in the fanout.

Similarly, the 'verify' subcommand should find out-of-order values in
the OID lookup.

Signed-off-by: Derrick Stolee 
---
 commit-graph.c  | 36 
 t/t5318-commit-graph.sh | 22 ++
 2 files changed, 58 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index f30b4ccee9..866a9e7e41 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -843,6 +843,9 @@ static void graph_report(const char *fmt, ...)
 
 int verify_commit_graph(struct repository *r, struct commit_graph *g)
 {
+   uint32_t i, cur_fanout_pos = 0;
+   struct object_id prev_oid, cur_oid;
+
if (!g) {
graph_report("no commit-graph file loaded");
return 1;
@@ -857,5 +860,38 @@ int verify_commit_graph(struct repository *r, struct 
commit_graph *g)
if (!g->chunk_commit_data)
graph_report("commit-graph is missing the Commit Data chunk");
 
+   if (verify_commit_graph_error)
+   return verify_commit_graph_error;
+
+   for (i = 0; i < g->num_commits; i++) {
+   hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
+
+   if (i && oidcmp(_oid, _oid) >= 0)
+   graph_report("commit-graph has incorrect OID order: %s 
then %s",
+oid_to_hex(_oid),
+oid_to_hex(_oid));
+
+   oidcpy(_oid, _oid);
+
+   while (cur_oid.hash[0] > cur_fanout_pos) {
+   uint32_t fanout_value = get_be32(g->chunk_oid_fanout + 
cur_fanout_pos);
+   if (i != fanout_value)
+   graph_report("commit-graph has incorrect fanout 
value: fanout[%d] = %u != %u",
+cur_fanout_pos, fanout_value, i);
+
+   cur_fanout_pos++;
+   }
+   }
+
+   while (cur_fanout_pos < 256) {
+   uint32_t fanout_value = get_be32(g->chunk_oid_fanout + 
cur_fanout_pos);
+
+   if (g->num_commits != fanout_value)
+   graph_report("commit-graph has incorrect fanout value: 
fanout[%d] = %u != %u",
+cur_fanout_pos, fanout_value, i);
+
+   cur_fanout_pos++;
+   }
+
return verify_commit_graph_error;
 }
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index c03792a8ed..4809cc881f 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -247,6 +247,7 @@ test_expect_success 'git commit-graph verify' '
git commit-graph verify >output
 '
 
+HASH_LEN=20
 GRAPH_BYTE_VERSION=4
 GRAPH_BYTE_HASH=5
 GRAPH_BYTE_CHUNK_COUNT=6
@@ -258,6 +259,12 @@ GRAPH_BYTE_OID_LOOKUP_ID=$(($GRAPH_CHUNK_LOOKUP_OFFSET + \
1 * $GRAPH_CHUNK_LOOKUP_WIDTH))
 GRAPH_BYTE_COMMIT_DATA_ID=$(($GRAPH_CHUNK_LOOKUP_OFFSET + \
 2 * $GRAPH_CHUNK_LOOKUP_WIDTH))
+GRAPH_FANOUT_OFFSET=$(($GRAPH_CHUNK_LOOKUP_OFFSET + \
+  $GRAPH_CHUNK_LOOKUP_WIDTH * $GRAPH_CHUNK_LOOKUP_ROWS))
+GRAPH_BYTE_FANOUT1=$(($GRAPH_FANOUT_OFFSET + 4 * 4))
+GRAPH_BYTE_FANOUT2=$(($GRAPH_FANOUT_OFFSET + 4 * 255))
+GRAPH_OID_LOOKUP_OFFSET=$(($GRAPH_FANOUT_OFFSET + 4 * 256))
+GRAPH_BYTE_OID_LOOKUP_ORDER=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN * 8))
 
 # usage: corrupt_graph_and_verify <position> <data> <string>
 # Manipulates the commit-graph file at the position
@@ -312,4 +319,19 @@ test_expect_success 'detect missing commit data chunk' '
"missing the Commit Data chunk"
 '
 
+test_expect_success 'detect incorrect fanout' '
+   corrupt_graph_and_verify $GRAPH_BYTE_FANOUT1 "\01" \
+   "fanout value"
+'
+
+test_expect_success 'detect incorrect fanout final value' '
+   corrupt_graph_and_verify $GRAPH_BYTE_FANOUT2 "\01" \
+   "fanout value"
+'
+
+test_expect_success 'detect incorrect OID order' '
+   corrupt_graph_and_verify $GRAPH_BYTE_OID_LOOKUP_ORDER "\01" \
+   "incorrect OID order"
+'
+
 test_done
-- 
2.18.0.rc1



[PATCH v6 16/21] commit-graph: verify contents match checksum

2018-06-08 Thread Derrick Stolee
The commit-graph file ends with a SHA1 hash of the previous contents. If
a commit-graph file has errors but the checksum hash is correct, then we
know that the problem is a bug in Git and not simply file corruption
after-the-fact.

Compute the checksum right away so it is the first error that appears,
and make the message translatable since this error can be "corrected" by
a user by simply deleting the file and recomputing. The rest of the
errors are useful only to developers.

Be sure to continue checking the rest of the file data if the checksum
is wrong. This is important for our tests, as we break the checksum as
we modify bytes of the commit-graph file.

Signed-off-by: Derrick Stolee 
---
 commit-graph.c  | 16 ++--
 t/t5318-commit-graph.sh |  6 ++
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 6d6c6beff9..d926c4b59f 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -833,6 +833,7 @@ void write_commit_graph(const char *obj_dir,
oids.nr = 0;
 }
 
+#define VERIFY_COMMIT_GRAPH_ERROR_HASH 2
 static int verify_commit_graph_error;
 
 static void graph_report(const char *fmt, ...)
@@ -852,8 +853,10 @@ static void graph_report(const char *fmt, ...)
 int verify_commit_graph(struct repository *r, struct commit_graph *g)
 {
uint32_t i, cur_fanout_pos = 0;
-   struct object_id prev_oid, cur_oid;
+   struct object_id prev_oid, cur_oid, checksum;
int generation_zero = 0;
+   struct hashfile *f;
+   int devnull;
 
if (!g) {
graph_report("no commit-graph file loaded");
@@ -872,6 +875,15 @@ int verify_commit_graph(struct repository *r, struct 
commit_graph *g)
if (verify_commit_graph_error)
return verify_commit_graph_error;
 
+   devnull = open("/dev/null", O_WRONLY);
+   f = hashfd(devnull, NULL);
+   hashwrite(f, g->data, g->data_len - g->hash_len);
+   finalize_hashfile(f, checksum.hash, CSUM_CLOSE);
+   if (hashcmp(checksum.hash, g->data + g->data_len - g->hash_len)) {
+   graph_report(_("the commit-graph file has incorrect checksum 
and is likely corrupt"));
+   verify_commit_graph_error = VERIFY_COMMIT_GRAPH_ERROR_HASH;
+   }
+
for (i = 0; i < g->num_commits; i++) {
struct commit *graph_commit;
 
@@ -909,7 +921,7 @@ int verify_commit_graph(struct repository *r, struct 
commit_graph *g)
cur_fanout_pos++;
}
 
-   if (verify_commit_graph_error)
+   if (verify_commit_graph_error & ~VERIFY_COMMIT_GRAPH_ERROR_HASH)
return verify_commit_graph_error;
 
for (i = 0; i < g->num_commits; i++) {
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index cbd6462226..bd5b8428af 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -279,6 +279,7 @@ GRAPH_COMMIT_DATA_WIDTH=$(($HASH_LEN + 16))
 GRAPH_OCTOPUS_DATA_OFFSET=$(($GRAPH_COMMIT_DATA_OFFSET + \
 $GRAPH_COMMIT_DATA_WIDTH * $NUM_COMMITS))
 GRAPH_BYTE_OCTOPUS=$(($GRAPH_OCTOPUS_DATA_OFFSET + 4))
+GRAPH_BYTE_FOOTER=$(($GRAPH_OCTOPUS_DATA_OFFSET + 4 * $NUM_OCTOPUS_EDGES))
 
 # usage: corrupt_graph_and_verify <position> <data> <string>
 # Manipulates the commit-graph file at the position
@@ -393,4 +394,9 @@ test_expect_success 'detect incorrect parent for octopus 
merge' '
"invalid parent"
 '
 
+test_expect_success 'detect invalid checksum hash' '
+   corrupt_graph_and_verify $GRAPH_BYTE_FOOTER "\00" \
+   "incorrect checksum"
+'
+
 test_done
-- 
2.18.0.rc1



[PATCH v6 21/21] commit-graph: update design document

2018-06-08 Thread Derrick Stolee
The commit-graph feature is now integrated with 'fsck' and 'gc',
so remove those items from the "Future Work" section of the
commit-graph design document.

Also remove the section on lazy-loading trees, as that was completed
in an earlier patch series.

Signed-off-by: Derrick Stolee 
---
 Documentation/technical/commit-graph.txt | 22 --
 1 file changed, 22 deletions(-)

diff --git a/Documentation/technical/commit-graph.txt 
b/Documentation/technical/commit-graph.txt
index e1a883eb46..c664acbd76 100644
--- a/Documentation/technical/commit-graph.txt
+++ b/Documentation/technical/commit-graph.txt
@@ -118,9 +118,6 @@ Future Work
 - The commit graph feature currently does not honor commit grafts. This can
   be remedied by duplicating or refactoring the current graft logic.
 
-- The 'commit-graph' subcommand does not have a "verify" mode that is
-  necessary for integration with fsck.
-
 - After computing and storing generation numbers, we must make graph
   walks aware of generation numbers to gain the performance benefits they
   enable. This will mostly be accomplished by swapping a commit-date-ordered
@@ -130,25 +127,6 @@ Future Work
 - 'log --topo-order'
 - 'tag --merged'
 
-- Currently, parse_commit_gently() requires filling in the root tree
-  object for a commit. This passes through lookup_tree() and consequently
-  lookup_object(). Also, it calls lookup_commit() when loading the parents.
-  These method calls check the ODB for object existence, even if the
-  consumer does not need the content. For example, we do not need the
-  tree contents when computing merge bases. Now that commit parsing is
-  removed from the computation time, these lookup operations are the
-  slowest operations keeping graph walks from being fast. Consider
-  loading these objects without verifying their existence in the ODB and
-  only loading them fully when consumers need them. Consider a method
-  such as "ensure_tree_loaded(commit)" that fully loads a tree before
-  using commit->tree.
-
-- The current design uses the 'commit-graph' subcommand to generate the graph.
-  When this feature stabilizes enough to recommend to most users, we should
-  add automatic graph writes to common operations that create many commits.
-  For example, one could compute a graph on 'clone', 'fetch', or 'repack'
-  commands.
-
 - A server could provide a commit graph file as part of the network protocol
   to avoid extra calculations by clients. This feature is only of benefit if
   the user is willing to trust the file, because verifying the file is correct
-- 
2.18.0.rc1



[PATCH v6 10/21] commit-graph: verify objects exist

2018-06-08 Thread Derrick Stolee
In the 'verify' subcommand, load commits directly from the object
database to ensure they exist. Parse them without consulting the
commit-graph.

Signed-off-by: Derrick Stolee 
---
 commit-graph.c  | 18 ++
 t/t5318-commit-graph.sh |  7 +++
 2 files changed, 25 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index 866a9e7e41..00e89b71e9 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -11,6 +11,7 @@
 #include "sha1-lookup.h"
 #include "commit-graph.h"
 #include "object-store.h"
+#include "alloc.h"
 
 #define GRAPH_SIGNATURE 0x43475048 /* "CGPH" */
 #define GRAPH_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */
@@ -242,6 +243,7 @@ static struct commit_list **insert_parent_or_die(struct 
commit_graph *g,
 {
struct commit *c;
struct object_id oid;
+
hashcpy(oid.hash, g->chunk_oid_lookup + g->hash_len * pos);
c = lookup_commit();
if (!c)
@@ -893,5 +895,21 @@ int verify_commit_graph(struct repository *r, struct 
commit_graph *g)
cur_fanout_pos++;
}
 
+   if (verify_commit_graph_error)
+   return verify_commit_graph_error;
+
+   for (i = 0; i < g->num_commits; i++) {
+   struct commit *odb_commit;
+
+   hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
+
+   odb_commit = (struct commit *)create_object(r, cur_oid.hash, 
alloc_commit_node(r));
+   if (parse_commit_internal(odb_commit, 0, 0)) {
+   graph_report("failed to parse %s from object database",
+oid_to_hex(_oid));
+   continue;
+   }
+   }
+
return verify_commit_graph_error;
 }
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 4809cc881f..af5e34c0cb 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -247,6 +247,7 @@ test_expect_success 'git commit-graph verify' '
git commit-graph verify >output
 '
 
+NUM_COMMITS=9
 HASH_LEN=20
 GRAPH_BYTE_VERSION=4
 GRAPH_BYTE_HASH=5
@@ -265,6 +266,7 @@ GRAPH_BYTE_FANOUT1=$(($GRAPH_FANOUT_OFFSET + 4 * 4))
 GRAPH_BYTE_FANOUT2=$(($GRAPH_FANOUT_OFFSET + 4 * 255))
 GRAPH_OID_LOOKUP_OFFSET=$(($GRAPH_FANOUT_OFFSET + 4 * 256))
 GRAPH_BYTE_OID_LOOKUP_ORDER=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN * 8))
+GRAPH_BYTE_OID_LOOKUP_MISSING=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN * 4 + 
10))
 
 # usage: corrupt_graph_and_verify <position> <data> <string>
 # Manipulates the commit-graph file at the position
@@ -334,4 +336,9 @@ test_expect_success 'detect incorrect OID order' '
"incorrect OID order"
 '
 
+test_expect_success 'detect OID not in object database' '
+   corrupt_graph_and_verify $GRAPH_BYTE_OID_LOOKUP_MISSING "\01" \
+   "from object database"
+'
+
 test_done
-- 
2.18.0.rc1



<    1   2   3   4   5   6   7   8   9   10   >