[PATCH v6 18/21] commit-graph: use string-list API for input

2018-06-08 Thread Derrick Stolee
Signed-off-by: Derrick Stolee 
---
 builtin/commit-graph.c | 39 +--
 commit-graph.c | 15 +++
 commit-graph.h |  7 +++
 3 files changed, 23 insertions(+), 38 deletions(-)

diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
index 9d108f43a9..ea28bc311a 100644
--- a/builtin/commit-graph.c
+++ b/builtin/commit-graph.c
@@ -119,13 +119,9 @@ static int graph_read(int argc, const char **argv)
 
 static int graph_write(int argc, const char **argv)
 {
-   const char **pack_indexes = NULL;
-   int packs_nr = 0;
-   const char **commit_hex = NULL;
-   int commits_nr = 0;
-   const char **lines = NULL;
-   int lines_nr = 0;
-   int lines_alloc = 0;
+   struct string_list *pack_indexes = NULL;
+   struct string_list *commit_hex = NULL;
+   struct string_list lines;
 
static struct option builtin_commit_graph_write_options[] = {
OPT_STRING(0, "object-dir", _dir,
@@ -149,34 +145,25 @@ static int graph_write(int argc, const char **argv)
if (!opts.obj_dir)
opts.obj_dir = get_object_directory();
 
+   string_list_init(, 0);
if (opts.stdin_packs || opts.stdin_commits) {
struct strbuf buf = STRBUF_INIT;
-   lines_nr = 0;
-   lines_alloc = 128;
-   ALLOC_ARRAY(lines, lines_alloc);
-
-   while (strbuf_getline(, stdin) != EOF) {
-   ALLOC_GROW(lines, lines_nr + 1, lines_alloc);
-   lines[lines_nr++] = strbuf_detach(, NULL);
-   }
-
-   if (opts.stdin_packs) {
-   pack_indexes = lines;
-   packs_nr = lines_nr;
-   }
-   if (opts.stdin_commits) {
-   commit_hex = lines;
-   commits_nr = lines_nr;
-   }
+
+   while (strbuf_getline(, stdin) != EOF)
+   string_list_append(, strbuf_detach(, NULL));
+
+   if (opts.stdin_packs)
+   pack_indexes = 
+   if (opts.stdin_commits)
+   commit_hex = 
}
 
write_commit_graph(opts.obj_dir,
   pack_indexes,
-  packs_nr,
   commit_hex,
-  commits_nr,
   opts.append);
 
+   string_list_clear(, 0);
return 0;
 }
 
diff --git a/commit-graph.c b/commit-graph.c
index d926c4b59f..a06e7e9549 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -657,10 +657,8 @@ static void compute_generation_numbers(struct 
packed_commit_list* commits)
 }
 
 void write_commit_graph(const char *obj_dir,
-   const char **pack_indexes,
-   int nr_packs,
-   const char **commit_hex,
-   int nr_commits,
+   struct string_list *pack_indexes,
+   struct string_list *commit_hex,
int append)
 {
struct packed_oid_list oids;
@@ -701,10 +699,10 @@ void write_commit_graph(const char *obj_dir,
int dirlen;
strbuf_addf(, "%s/pack/", obj_dir);
dirlen = packname.len;
-   for (i = 0; i < nr_packs; i++) {
+   for (i = 0; i < pack_indexes->nr; i++) {
struct packed_git *p;
strbuf_setlen(, dirlen);
-   strbuf_addstr(, pack_indexes[i]);
+   strbuf_addstr(, pack_indexes->items[i].string);
p = add_packed_git(packname.buf, packname.len, 1);
if (!p)
die("error adding pack %s", packname.buf);
@@ -717,12 +715,13 @@ void write_commit_graph(const char *obj_dir,
}
 
if (commit_hex) {
-   for (i = 0; i < nr_commits; i++) {
+   for (i = 0; i < commit_hex->nr; i++) {
const char *end;
struct object_id oid;
struct commit *result;
 
-   if (commit_hex[i] && parse_oid_hex(commit_hex[i], , 
))
+   if (commit_hex->items[i].string &&
+   parse_oid_hex(commit_hex->items[i].string, , 
))
continue;
 
result = lookup_commit_reference_gently(, 1);
diff --git a/commit-graph.h b/commit-graph.h
index 4359812fb4..a79b854470 100644
--- a/commit-graph.h
+++ b/commit-graph.h
@@ -3,6 +3,7 @@
 
 #include "git-compat-util.h"
 #include "repository.h"
+#include "string-list.h"
 
 char *get_commit_graph_filename(const char *obj_dir);
 
@@ -48,10 +49,8 @@ struct commit_graph {
 struct commit_graph *load_commit_graph_one

[PATCH v6 06/21] commit-graph: add 'verify' subcommand

2018-06-08 Thread Derrick Stolee
If the commit-graph file becomes corrupt, we need a way to verify
that its contents match the object database. In the manner of
'git fsck' we will implement a 'git commit-graph verify' subcommand
to report all issues with the file.

Add the 'verify' subcommand to the 'commit-graph' builtin and its
documentation. The subcommand is currently a no-op except for
loading the commit-graph into memory, which may trigger run-time
errors that would be caught by normal use. Add a simple test that
ensures the command returns a zero error code.

If no commit-graph file exists, this is an acceptable state. Do
not report any errors.

Helped-by: Ramsay Jones 
Signed-off-by: Derrick Stolee 
---
 Documentation/git-commit-graph.txt |  6 +
 builtin/commit-graph.c | 39 ++
 commit-graph.c | 23 ++
 commit-graph.h |  3 +++
 t/t5318-commit-graph.sh| 10 
 5 files changed, 81 insertions(+)

diff --git a/Documentation/git-commit-graph.txt 
b/Documentation/git-commit-graph.txt
index 4c97b555cc..a222cfab08 100644
--- a/Documentation/git-commit-graph.txt
+++ b/Documentation/git-commit-graph.txt
@@ -10,6 +10,7 @@ SYNOPSIS
 
 [verse]
 'git commit-graph read' [--object-dir ]
+'git commit-graph verify' [--object-dir ]
 'git commit-graph write'  [--object-dir ]
 
 
@@ -52,6 +53,11 @@ existing commit-graph file.
 Read a graph file given by the commit-graph file and output basic
 details about the graph file. Used for debugging purposes.
 
+'verify'::
+
+Read the commit-graph file and verify its contents against the object
+database. Used to check for corrupted data.
+
 
 EXAMPLES
 
diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
index f0875b8bf3..9d108f43a9 100644
--- a/builtin/commit-graph.c
+++ b/builtin/commit-graph.c
@@ -3,15 +3,22 @@
 #include "dir.h"
 #include "lockfile.h"
 #include "parse-options.h"
+#include "repository.h"
 #include "commit-graph.h"
 
 static char const * const builtin_commit_graph_usage[] = {
N_("git commit-graph [--object-dir ]"),
N_("git commit-graph read [--object-dir ]"),
+   N_("git commit-graph verify [--object-dir ]"),
N_("git commit-graph write [--object-dir ] [--append] 
[--stdin-packs|--stdin-commits]"),
NULL
 };
 
+static const char * const builtin_commit_graph_verify_usage[] = {
+   N_("git commit-graph verify [--object-dir ]"),
+   NULL
+};
+
 static const char * const builtin_commit_graph_read_usage[] = {
N_("git commit-graph read [--object-dir ]"),
NULL
@@ -29,6 +36,36 @@ static struct opts_commit_graph {
int append;
 } opts;
 
+
+static int graph_verify(int argc, const char **argv)
+{
+   struct commit_graph *graph = NULL;
+   char *graph_name;
+
+   static struct option builtin_commit_graph_verify_options[] = {
+   OPT_STRING(0, "object-dir", _dir,
+  N_("dir"),
+  N_("The object directory to store the graph")),
+   OPT_END(),
+   };
+
+   argc = parse_options(argc, argv, NULL,
+builtin_commit_graph_verify_options,
+builtin_commit_graph_verify_usage, 0);
+
+   if (!opts.obj_dir)
+   opts.obj_dir = get_object_directory();
+
+   graph_name = get_commit_graph_filename(opts.obj_dir);
+   graph = load_commit_graph_one(graph_name);
+   FREE_AND_NULL(graph_name);
+
+   if (!graph)
+   return 0;
+
+   return verify_commit_graph(the_repository, graph);
+}
+
 static int graph_read(int argc, const char **argv)
 {
struct commit_graph *graph = NULL;
@@ -165,6 +202,8 @@ int cmd_commit_graph(int argc, const char **argv, const 
char *prefix)
if (argc > 0) {
if (!strcmp(argv[0], "read"))
return graph_read(argc, argv);
+   if (!strcmp(argv[0], "verify"))
+   return graph_verify(argc, argv);
if (!strcmp(argv[0], "write"))
return graph_write(argc, argv);
}
diff --git a/commit-graph.c b/commit-graph.c
index 9e228d3bb5..22ef696e18 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -827,3 +827,26 @@ void write_commit_graph(const char *obj_dir,
oids.alloc = 0;
oids.nr = 0;
 }
+
+static int verify_commit_graph_error;
+
+static void graph_report(const char *fmt, ...)
+{
+   va_list ap;
+   verify_commit_graph_error = 1;
+
+   va_start(ap, fmt);
+   vfprintf(stderr, fmt, ap);
+   fprintf(stderr, "\n");
+   va_end(ap);
+}
+
+int verify_commit_graph(struct repository *r, struct commit_graph *g)
+{
+   if (!g) {
+   graph_report("no commit-graph fil

[PATCH v6 03/21] commit-graph: parse commit from chosen graph

2018-06-08 Thread Derrick Stolee
Before verifying a commit-graph file against the object database, we
need to parse all commits from the given commit-graph file. Create
parse_commit_in_graph_one() to target a given struct commit_graph.

Signed-off-by: Derrick Stolee 
---
 commit-graph.c | 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index f83f6d2373..e77b19971d 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -314,7 +314,7 @@ static int find_commit_in_graph(struct commit *item, struct 
commit_graph *g, uin
}
 }
 
-int parse_commit_in_graph(struct commit *item)
+static int parse_commit_in_graph_one(struct commit_graph *g, struct commit 
*item)
 {
uint32_t pos;
 
@@ -322,9 +322,21 @@ int parse_commit_in_graph(struct commit *item)
return 0;
if (item->object.parsed)
return 1;
+
+   if (find_commit_in_graph(item, g, ))
+   return fill_commit_in_graph(item, g, pos);
+
+   return 0;
+}
+
+int parse_commit_in_graph(struct commit *item)
+{
+   if (!core_commit_graph)
+   return 0;
+
prepare_commit_graph();
-   if (commit_graph && find_commit_in_graph(item, commit_graph, ))
-   return fill_commit_in_graph(item, commit_graph, pos);
+   if (commit_graph)
+   return parse_commit_in_graph_one(commit_graph, item);
return 0;
 }
 
-- 
2.18.0.rc1



[PATCH v6 19/21] commit-graph: add '--reachable' option

2018-06-08 Thread Derrick Stolee
When writing commit-graph files, it can be convenient to ask for all
reachable commits (starting at the ref set) in the resulting file. This
is particularly helpful when writing to stdin is complicated, such as a
future integration with 'git gc'.

Signed-off-by: Derrick Stolee 
---
 Documentation/git-commit-graph.txt |  8 ++--
 builtin/commit-graph.c | 16 
 commit-graph.c | 18 ++
 commit-graph.h |  1 +
 t/t5318-commit-graph.sh| 10 ++
 5 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/Documentation/git-commit-graph.txt 
b/Documentation/git-commit-graph.txt
index a222cfab08..dececb79d7 100644
--- a/Documentation/git-commit-graph.txt
+++ b/Documentation/git-commit-graph.txt
@@ -38,12 +38,16 @@ Write a commit graph file based on the commits found in 
packfiles.
 +
 With the `--stdin-packs` option, generate the new commit graph by
 walking objects only in the specified pack-indexes. (Cannot be combined
-with --stdin-commits.)
+with `--stdin-commits` or `--reachable`.)
 +
 With the `--stdin-commits` option, generate the new commit graph by
 walking commits starting at the commits specified in stdin as a list
 of OIDs in hex, one OID per line. (Cannot be combined with
---stdin-packs.)
+`--stdin-packs` or `--reachable`.)
++
+With the `--reachable` option, generate the new commit graph by walking
+commits starting at all refs. (Cannot be combined with `--stdin-commits`
+or `--stdin-packs`.)
 +
 With the `--append` option, include all commits that are present in the
 existing commit-graph file.
diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
index ea28bc311a..c7d0db5ab4 100644
--- a/builtin/commit-graph.c
+++ b/builtin/commit-graph.c
@@ -10,7 +10,7 @@ static char const * const builtin_commit_graph_usage[] = {
N_("git commit-graph [--object-dir ]"),
N_("git commit-graph read [--object-dir ]"),
N_("git commit-graph verify [--object-dir ]"),
-   N_("git commit-graph write [--object-dir ] [--append] 
[--stdin-packs|--stdin-commits]"),
+   N_("git commit-graph write [--object-dir ] [--append] 
[--reachable|--stdin-packs|--stdin-commits]"),
NULL
 };
 
@@ -25,12 +25,13 @@ static const char * const builtin_commit_graph_read_usage[] 
= {
 };
 
 static const char * const builtin_commit_graph_write_usage[] = {
-   N_("git commit-graph write [--object-dir ] [--append] 
[--stdin-packs|--stdin-commits]"),
+   N_("git commit-graph write [--object-dir ] [--append] 
[--reachable|--stdin-packs|--stdin-commits]"),
NULL
 };
 
 static struct opts_commit_graph {
const char *obj_dir;
+   int reachable;
int stdin_packs;
int stdin_commits;
int append;
@@ -127,6 +128,8 @@ static int graph_write(int argc, const char **argv)
OPT_STRING(0, "object-dir", _dir,
N_("dir"),
N_("The object directory to store the graph")),
+   OPT_BOOL(0, "reachable", ,
+   N_("start walk at all refs")),
OPT_BOOL(0, "stdin-packs", _packs,
N_("scan pack-indexes listed by stdin for commits")),
OPT_BOOL(0, "stdin-commits", _commits,
@@ -140,11 +143,16 @@ static int graph_write(int argc, const char **argv)
 builtin_commit_graph_write_options,
 builtin_commit_graph_write_usage, 0);
 
-   if (opts.stdin_packs && opts.stdin_commits)
-   die(_("cannot use both --stdin-commits and --stdin-packs"));
+   if (opts.reachable + opts.stdin_packs + opts.stdin_commits > 1)
+   die(_("use at most one of --reachable, --stdin-commits, or 
--stdin-packs"));
if (!opts.obj_dir)
opts.obj_dir = get_object_directory();
 
+   if (opts.reachable) {
+   write_commit_graph_reachable(opts.obj_dir, opts.append);
+   return 0;
+   }
+
string_list_init(, 0);
if (opts.stdin_packs || opts.stdin_commits) {
struct strbuf buf = STRBUF_INIT;
diff --git a/commit-graph.c b/commit-graph.c
index a06e7e9549..adf54e3fe7 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -7,6 +7,7 @@
 #include "packfile.h"
 #include "commit.h"
 #include "object.h"
+#include "refs.h"
 #include "revision.h"
 #include "sha1-lookup.h"
 #include "commit-graph.h"
@@ -656,6 +657,23 @@ static void compute_generation_numbers(struct 
packed_commit_list* commits)
}
 }
 
+static int add_ref_to_list(const char *refname,
+  const struct object_id *oid,
+  int flags, void *cb

[PATCH v6 20/21] gc: automatically write commit-graph files

2018-06-08 Thread Derrick Stolee
The commit-graph file is a very helpful feature for speeding up git
operations. In order to make it more useful, make it possible to
write the commit-graph file during standard garbage collection
operations.

Add a 'gc.commitGraph' config setting that triggers writing a
commit-graph file after any non-trivial 'git gc' command. Defaults to
false while the commit-graph feature matures. We specifically do not
want to have this on by default until the commit-graph feature is fully
integrated with history-modifying features like shallow clones.

Helped-by: Ævar Arnfjörð Bjarmason 
Signed-off-by: Derrick Stolee 
---
 Documentation/config.txt | 10 +-
 Documentation/git-gc.txt |  4 
 builtin/gc.c |  6 ++
 t/t5318-commit-graph.sh  | 14 ++
 4 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index ab641bf5a9..f2b5ed17c8 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -906,7 +906,8 @@ the `GIT_NOTES_REF` environment variable.  See 
linkgit:git-notes[1].
 
 core.commitGraph::
Enable git commit graph feature. Allows reading from the
-   commit-graph file.
+   commit-graph file. See `gc.commitGraph` for automatically
+   maintaining the file.
 
 core.sparseCheckout::
Enable "sparse checkout" feature. See section "Sparse checkout" in
@@ -1647,6 +1648,13 @@ this configuration variable is ignored, all packs except 
the base pack
 will be repacked. After this the number of packs should go below
 gc.autoPackLimit and gc.bigPackThreshold should be respected again.
 
+gc.commitGraph::
+   If true, then gc will rewrite the commit-graph file when
+   linkgit:git-gc[1] is run. When using linkgit:git-gc[1]
+   '--auto' the commit-graph will be updated if housekeeping is
+   required. Default is false. See linkgit:git-commit-graph[1]
+   for details.
+
 gc.logExpiry::
If the file gc.log exists, then `git gc --auto` won't run
unless that file is more than 'gc.logExpiry' old.  Default is
diff --git a/Documentation/git-gc.txt b/Documentation/git-gc.txt
index 24b2dd44fe..f5bc98ccb3 100644
--- a/Documentation/git-gc.txt
+++ b/Documentation/git-gc.txt
@@ -136,6 +136,10 @@ The optional configuration variable `gc.packRefs` 
determines if
 it within all non-bare repos or it can be set to a boolean value.
 This defaults to true.
 
+The optional configuration variable `gc.commitGraph` determines if
+'git gc' should run 'git commit-graph write'. This can be set to a
+boolean value. This defaults to false.
+
 The optional configuration variable `gc.aggressiveWindow` controls how
 much time is spent optimizing the delta compression of the objects in
 the repository when the --aggressive option is specified.  The larger
diff --git a/builtin/gc.c b/builtin/gc.c
index ccfb1ceaeb..4e06e8372d 100644
--- a/builtin/gc.c
+++ b/builtin/gc.c
@@ -20,6 +20,7 @@
 #include "sigchain.h"
 #include "argv-array.h"
 #include "commit.h"
+#include "commit-graph.h"
 #include "packfile.h"
 #include "object-store.h"
 #include "pack.h"
@@ -40,6 +41,7 @@ static int aggressive_depth = 50;
 static int aggressive_window = 250;
 static int gc_auto_threshold = 6700;
 static int gc_auto_pack_limit = 50;
+static int gc_commit_graph = 0;
 static int detach_auto = 1;
 static timestamp_t gc_log_expire_time;
 static const char *gc_log_expire = "1.day.ago";
@@ -129,6 +131,7 @@ static void gc_config(void)
git_config_get_int("gc.aggressivedepth", _depth);
git_config_get_int("gc.auto", _auto_threshold);
git_config_get_int("gc.autopacklimit", _auto_pack_limit);
+   git_config_get_bool("gc.commitgraph", _commit_graph);
git_config_get_bool("gc.autodetach", _auto);
git_config_get_expiry("gc.pruneexpire", _expire);
git_config_get_expiry("gc.worktreepruneexpire", 
_worktrees_expire);
@@ -641,6 +644,9 @@ int cmd_gc(int argc, const char **argv, const char *prefix)
if (pack_garbage.nr > 0)
clean_pack_garbage();
 
+   if (gc_commit_graph)
+   write_commit_graph_reachable(get_object_directory(), 0);
+
if (auto_gc && too_many_loose_objects())
warning(_("There are too many unreachable loose objects; "
"run 'git prune' to remove them."));
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 06ecbb3f4a..9a0661983c 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -245,6 +245,20 @@ test_expect_success 'perform fast-forward merge in full 
repo' '
test_cmp expect output
 '
 
+test_expect_success 'check that gc computes commit-graph' '
+   cd "$TRASH_DIRECTORY/full" &&
+   git commit --allow-empty -m "

[PATCH v6 15/21] commit-graph: test for corrupted octopus edge

2018-06-08 Thread Derrick Stolee
The commit-graph file has an extra chunk to store the parent int-ids for
parents beyond the first parent for octopus merges. Our test repo has a
single octopus merge that we can manipulate to demonstrate the 'verify'
subcommand detects incorrect values in that chunk.

Signed-off-by: Derrick Stolee 
---
 t/t5318-commit-graph.sh | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index b7b4410e75..cbd6462226 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -248,6 +248,7 @@ test_expect_success 'git commit-graph verify' '
 '
 
 NUM_COMMITS=9
+NUM_OCTOPUS_EDGES=2
 HASH_LEN=20
 GRAPH_BYTE_VERSION=4
 GRAPH_BYTE_HASH=5
@@ -274,6 +275,10 @@ 
GRAPH_BYTE_COMMIT_EXTRA_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 4))
 GRAPH_BYTE_COMMIT_WRONG_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 3))
 GRAPH_BYTE_COMMIT_GENERATION=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 11))
 GRAPH_BYTE_COMMIT_DATE=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 12))
+GRAPH_COMMIT_DATA_WIDTH=$(($HASH_LEN + 16))
+GRAPH_OCTOPUS_DATA_OFFSET=$(($GRAPH_COMMIT_DATA_OFFSET + \
+$GRAPH_COMMIT_DATA_WIDTH * $NUM_COMMITS))
+GRAPH_BYTE_OCTOPUS=$(($GRAPH_OCTOPUS_DATA_OFFSET + 4))
 
 # usage: corrupt_graph_and_verify   
 # Manipulates the commit-graph file at the position
@@ -383,4 +388,9 @@ test_expect_success 'detect incorrect commit date' '
"commit date"
 '
 
+test_expect_success 'detect incorrect parent for octopus merge' '
+   corrupt_graph_and_verify $GRAPH_BYTE_OCTOPUS "\01" \
+   "invalid parent"
+'
+
 test_done
-- 
2.18.0.rc1



[PATCH v6 02/21] commit-graph: fix GRAPH_MIN_SIZE

2018-06-08 Thread Derrick Stolee
The GRAPH_MIN_SIZE macro should be the smallest size of a parsable
commit-graph file. However, the minimum number of chunks was wrong.
It is possible to write a commit-graph file with zero commits, and
that violates this macro's value.

Rewrite the macro, and use extra macros to better explain the magic
constants.

Signed-off-by: Derrick Stolee 
---
 commit-graph.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index b63a1fc85e..f83f6d2373 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -35,10 +35,11 @@
 
 #define GRAPH_LAST_EDGE 0x8000
 
+#define GRAPH_HEADER_SIZE 8
 #define GRAPH_FANOUT_SIZE (4 * 256)
 #define GRAPH_CHUNKLOOKUP_WIDTH 12
-#define GRAPH_MIN_SIZE (5 * GRAPH_CHUNKLOOKUP_WIDTH + GRAPH_FANOUT_SIZE + \
-   GRAPH_OID_LEN + 8)
+#define GRAPH_MIN_SIZE (GRAPH_HEADER_SIZE + 4 * GRAPH_CHUNKLOOKUP_WIDTH \
+   + GRAPH_FANOUT_SIZE + GRAPH_OID_LEN)
 
 char *get_commit_graph_filename(const char *obj_dir)
 {
-- 
2.18.0.rc1



[PATCH v6 00/21] Integrate commit-graph into 'fsck' and 'gc'

2018-06-08 Thread Derrick Stolee
+ 
10))
-GRAPH_COMMIT_DATA_OFFSET=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 
$NUM_COMMITS))
+  $GRAPH_CHUNK_LOOKUP_WIDTH * $GRAPH_CHUNK_LOOKUP_ROWS))
+GRAPH_BYTE_FANOUT1=$(($GRAPH_FANOUT_OFFSET + 4 * 4))
+GRAPH_BYTE_FANOUT2=$(($GRAPH_FANOUT_OFFSET + 4 * 255))
+GRAPH_OID_LOOKUP_OFFSET=$(($GRAPH_FANOUT_OFFSET + 4 * 256))
+GRAPH_BYTE_OID_LOOKUP_ORDER=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN * 8))
+GRAPH_BYTE_OID_LOOKUP_MISSING=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN * 4 + 
10))
+GRAPH_COMMIT_DATA_OFFSET=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN * 
$NUM_COMMITS))
 GRAPH_BYTE_COMMIT_TREE=$GRAPH_COMMIT_DATA_OFFSET
 GRAPH_BYTE_COMMIT_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN))
 GRAPH_BYTE_COMMIT_EXTRA_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 4))
@@ -301,9 +301,9 @@ GRAPH_BYTE_COMMIT_GENERATION=$(($GRAPH_COMMIT_DATA_OFFSET + 
$HASH_LEN + 11))
 GRAPH_BYTE_COMMIT_DATE=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 12))
 GRAPH_COMMIT_DATA_WIDTH=$(($HASH_LEN + 16))
 GRAPH_OCTOPUS_DATA_OFFSET=$(($GRAPH_COMMIT_DATA_OFFSET + \
-$GRAPH_COMMIT_DATA_WIDTH \* $NUM_COMMITS))
+$GRAPH_COMMIT_DATA_WIDTH * $NUM_COMMITS))
 GRAPH_BYTE_OCTOPUS=$(($GRAPH_OCTOPUS_DATA_OFFSET + 4))
-GRAPH_BYTE_FOOTER=$(($GRAPH_OCTOPUS_DATA_OFFSET + 4 \* $NUM_OCTOPUS_EDGES))
+GRAPH_BYTE_FOOTER=$(($GRAPH_OCTOPUS_DATA_OFFSET + 4 * $NUM_OCTOPUS_EDGES))

 # usage: corrupt_graph_and_verify   
 # Manipulates the commit-graph file at the position

---

Derrick Stolee (21):
  commit-graph: UNLEAK before die()
  commit-graph: fix GRAPH_MIN_SIZE
  commit-graph: parse commit from chosen graph
  commit: force commit to parse from object database
  commit-graph: load a root tree from specific graph
  commit-graph: add 'verify' subcommand
  commit-graph: verify catches corrupt signature
  commit-graph: verify required chunks are present
  commit-graph: verify corrupt OID fanout and lookup
  commit-graph: verify objects exist
  commit-graph: verify root tree OIDs
  commit-graph: verify parent list
  commit-graph: verify generation number
  commit-graph: verify commit date
  commit-graph: test for corrupted octopus edge
  commit-graph: verify contents match checksum
  fsck: verify commit-graph
  commit-graph: use string-list API for input
  commit-graph: add '--reachable' option
  gc: automatically write commit-graph files
  commit-graph: update design document

 Documentation/config.txt |  10 +-
 Documentation/git-commit-graph.txt   |  14 +-
 Documentation/git-fsck.txt   |   3 +
 Documentation/git-gc.txt |   4 +
 Documentation/technical/commit-graph.txt |  22 --
 builtin/commit-graph.c   |  99 ++---
 builtin/fsck.c   |  21 ++
 builtin/gc.c |   6 +
 commit-graph.c   | 249 +--
 commit-graph.h   |  11 +-
 commit.c |   9 +-
 commit.h |   1 +
 t/t5318-commit-graph.sh  | 201 ++
 13 files changed, 572 insertions(+), 78 deletions(-)

-- 
2.18.0.rc1



[PATCH v6 05/21] commit-graph: load a root tree from specific graph

2018-06-08 Thread Derrick Stolee
When lazy-loading a tree for a commit, it will be important to select
the tree from a specific struct commit_graph. Create a new method that
specifies the commit-graph file and use that in
get_commit_tree_in_graph().

Signed-off-by: Derrick Stolee 
---
 commit-graph.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index e77b19971d..9e228d3bb5 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -362,14 +362,20 @@ static struct tree *load_tree_for_commit(struct 
commit_graph *g, struct commit *
return c->maybe_tree;
 }
 
-struct tree *get_commit_tree_in_graph(const struct commit *c)
+static struct tree *get_commit_tree_in_graph_one(struct commit_graph *g,
+const struct commit *c)
 {
if (c->maybe_tree)
return c->maybe_tree;
if (c->graph_pos == COMMIT_NOT_FROM_GRAPH)
-   BUG("get_commit_tree_in_graph called from non-commit-graph 
commit");
+   BUG("get_commit_tree_in_graph_one called from non-commit-graph 
commit");
+
+   return load_tree_for_commit(g, (struct commit *)c);
+}
 
-   return load_tree_for_commit(commit_graph, (struct commit *)c);
+struct tree *get_commit_tree_in_graph(const struct commit *c)
+{
+   return get_commit_tree_in_graph_one(commit_graph, c);
 }
 
 static void write_graph_chunk_fanout(struct hashfile *f,
-- 
2.18.0.rc1



Re: [PATCH v5 18/21] commit-graph: use string-list API for input

2018-06-08 Thread Derrick Stolee

On 6/6/2018 8:45 AM, Derrick Stolee wrote:

On 6/6/2018 8:26 AM, Ævar Arnfjörð Bjarmason wrote:

On Wed, Jun 06 2018, Derrick Stolee wrote:


On 6/6/2018 8:11 AM, Ævar Arnfjörð Bjarmason wrote:

On Wed, Jun 06 2018, Derrick Stolee wrote:


Signed-off-by: Derrick Stolee 

+    string_list_clear(, 0);
   return 0;
   }

This results in an invalid free() & segfault because you're freeing
 which may not have been allocated by string_list_init().

Good point. Did my tests not catch this? (seems it requires calling
`git commit-graph write` with no `--stdin-packs` or
`--stdin-commits`).

Most of your tests (t5318-commit-graph.sh) segfaulted, but presumably
you're on a more forgiving compiler/platform/options. I compiled with
-O0 -g on clang 4.0.1-8 + Debian testing.


I appreciate the extra platform testing. I'm using GCC on Ubuntu (gcc 
(Ubuntu 7.3.0-16ubuntu3) 7.3.0).


While the errors didn't repro on my box, they did on the VSTS Linux 
build machines [1]. I created an internal PR just so I could run those 
tests (and on our OSX agents which use clang) and so I can verify I 
fixed the situation. I'll send a v6 soon so Junio only picks up a 
version that succeeds in tests.


Thanks,
-Stolee

[1] 
https://github.com/Microsoft/vsts-agent-docker/blob/master/ubuntu/16.04/standard/Dockerfile

    What's installed on a VSTS Linux build agent


Re: [PATCH 00/23] Multi-pack-index (MIDX)

2018-06-07 Thread Derrick Stolee

On 6/7/2018 10:45 AM, Ævar Arnfjörð Bjarmason wrote:

On Thu, Jun 07 2018, Derrick Stolee wrote:


To test the performance in this situation, I created a
script that organizes the Linux repository in a similar
fashion. I split the commit history into 50 parts by
creating branches on every 10,000 commits of the first-
parent history. Then, `git rev-list --objects A ^B`
provides the list of objects reachable from A but not B,
so I could send that to `git pack-objects` to create
these "time-based" packfiles. With these 50 packfiles
(deleting the old one from my fresh clone, and deleting
all tags as they were no longer on-disk) I could then
test 'git rev-list --objects HEAD^{tree}' and see:

 Before: 0.17s
 After:  0.13s
 % Diff: -23.5%

By adding logic to count hits and misses to bsearch_pack,
I was able to see that the command above calls that
method 266,930 times with a hit rate of 33%. The MIDX
has the same number of calls with a 100% hit rate.

Do you have the script you used for this? It would be very interesting
as something we could stick in t/perf/ to test this use-case in the
future.

How does this & the numbers below compare to just a naïve
--max-pack-size= on linux.git?

Is it possible for you to tar this test repo up and share it as a
one-off? I've been polishing the core.validateAbbrev series I have, and
it would be interesting to compare some of the (abbrev) numbers.


Here is what I used. You will want to adjust your constants for whatever 
repo you are using. This is for the Linux kernel which has a 
first-parent history of ~50,000 commits. It also leaves a bunch of extra 
files around, so it is nowhere near incorporating into the code.


#!/bin/bash

for i in `seq 1 50`
do
    ORDER=$((51 - $i))
    NUM_BACK=$((1000 * ($i - 1)))
    echo creating batch/$ORDER
    git branch -f batch/$ORDER HEAD~$NUM_BACK
    echo batch/$ORDER
    git rev-parse batch/$ORDER
done

lastbranch=""
for i in `seq 1 50`
do
    branch=batch/$i
    if [$lastbranch -eq ""]
    then
    echo "$branch"
    git rev-list --objects $branch | sed 's/ .*//' 
>objects-$i.txt

    else
    echo "$lastbranch"
    echo "$branch"
    git rev-list --objects $branch ^$lastbranch | sed 's/ 
.*//' >objects-$i.txt

    fi

    git pack-objects --no-reuse-delta 
.git/objects/pack/branch-split2 
    lastbranch=$branch
done


for tag in `git tag --list`
do
    git tag -d $tag
done

rm -rf .git/objects/pack/pack-*
git midx write



Re: [PATCH 03/23] midx: add midx builtin

2018-06-18 Thread Derrick Stolee

On 6/7/2018 1:20 PM, Duy Nguyen wrote:

On Thu, Jun 7, 2018 at 4:03 PM, Derrick Stolee  wrote:

diff --git a/Documentation/git-midx.txt b/Documentation/git-midx.txt
new file mode 100644
index 00..2bd886f1a2
--- /dev/null
+++ b/Documentation/git-midx.txt
@@ -0,0 +1,29 @@
+git-midx(1)
+
+
+NAME
+
+git-midx - Write and verify multi-pack-indexes (MIDX files).

No full stop. This head line is collected automatically with others
and its having a full stop while the rest does not looks strange/


diff --git a/builtin/midx.c b/builtin/midx.c
new file mode 100644
index 00..59ea92178f
--- /dev/null
+++ b/builtin/midx.c
@@ -0,0 +1,38 @@
+#include "builtin.h"
+#include "cache.h"
+#include "config.h"
+#include "git-compat-util.h"

You only need either cache.h or git-compat-util.h. If cache.h is here,
git-compat-util can be removed.


+#include "parse-options.h"
+
+static char const * const builtin_midx_usage[] ={
+   N_("git midx [--object-dir ]"),
+   NULL
+};
+
+static struct opts_midx {
+   const char *object_dir;
+} opts;
+
+int cmd_midx(int argc, const char **argv, const char *prefix)
+{
+   static struct option builtin_midx_options[] = {
+   { OPTION_STRING, 0, "object-dir", _dir,

For paths (including dir), OPTION_FILENAME may be a better option to
handle correctly when the command is run in a subdir. See df217ed643
(parse-opts: add OPT_FILENAME and transition builtins - 2009-05-23)
for more info.

Thanks for the pointer!




+ N_("dir"),
+ N_("The object directory containing set of packfile and pack-index 
pairs.") },

Other help strings do not have full stop either (I only checked a
couple commands though)

Also, doesn't OPT_STRING() work here too (if you avoid OPTION_FILENAME
for some reason)?


+   OPT_END(),
+   };
+
+   if (argc == 2 && !strcmp(argv[1], "-h"))
+   usage_with_options(builtin_midx_usage, builtin_midx_options);
+
+   git_config(git_default_config, NULL);
+
+   argc = parse_options(argc, argv, prefix,
+builtin_midx_options,
+builtin_midx_usage, 0);
+
+   if (!opts.object_dir)
+   opts.object_dir = get_object_directory();
+
+   return 0;
+}
diff --git a/git.c b/git.c
index c2f48d53dd..400fadd677 100644
--- a/git.c
+++ b/git.c
@@ -503,6 +503,7 @@ static struct cmd_struct commands[] = {
 { "merge-recursive-theirs", cmd_merge_recursive, RUN_SETUP | 
NEED_WORK_TREE | NO_PARSEOPT },
 { "merge-subtree", cmd_merge_recursive, RUN_SETUP | NEED_WORK_TREE | 
NO_PARSEOPT },
 { "merge-tree", cmd_merge_tree, RUN_SETUP | NO_PARSEOPT },
+   { "midx", cmd_midx, RUN_SETUP },

If it's a plumbing and can take an --object-dir, then I don't think
you should require it to run in a repo (with RUN_SETUP).
RUN_SETUP_GENTLY may be better. You could even leave it empty here and
only call setup_git_directory() only when --object-dir is not set.


I agree. Good point. This could be run to maintain an alternate without 
any .git folder.





 { "mktag", cmd_mktag, RUN_SETUP | NO_PARSEOPT },
 { "mktree", cmd_mktree, RUN_SETUP },
 { "mv", cmd_mv, RUN_SETUP | NEED_WORK_TREE },




Re: [PATCH 03/23] midx: add midx builtin

2018-06-18 Thread Derrick Stolee

On 6/11/2018 5:02 PM, Stefan Beller wrote:

Hi Derrick,
On Thu, Jun 7, 2018 at 7:03 AM Derrick Stolee  wrote:

This new 'git midx' builtin will be the plumbing access for writing,
reading, and checking multi-pack-index (MIDX) files. The initial
implementation is a no-op.

Let's talk about the name for a second:

.idx files are written by git-index-pack or as part of
git-pack-objects (which just calls write_idx_file as part
of finish_tmp_packfile), and the name actually suggests
it writes the index files. I have a hard time understanding
what the git-midx command does[1].

With both commit graph as well as multi index we introduce
a command that is centered around that concept (similar to
git-remote or git-config that are centered around a concept,
that is closely resembled by a file), but for indexes for packs
it was integrated differently into Git. So I am not sure if I want
to suggest to integrate it into the packfile commands as that
doesn't really fit. But maybe we can have a name that is human
readable instead of the file suffix? Maybe

   git multi-pack-index ?

I suppose that eventually this command is not really used by
users as it will be used by other porcelain commands in the
background or even as part of repack/gc so I am not worried
about a long name, but I'd be more worried about understandability.


I'll use "git multi-pack-index" in v2. I'll keep "midx.c" in the root, 
though, if that is OK.



[1] While these names are not perfect for the layman, it is okay?
   I am sure you are aware of https://git-man-page-generator.lokaltog.net/


I was not, and enjoyed that quite a bit.

Thanks,
-Stolee





new file mode 100644
index 00..2bd886f1a2
--- /dev/null
+++ b/Documentation/git-midx.txt
@@ -0,0 +1,29 @@
+git-midx(1)
+
+
+NAME
+
+git-midx - Write and verify multi-pack-indexes (MIDX files).

The reading is done as part of all other commands.


I like to think the 'read' verb is a subset of "verify" because we are 
checking for information about the MIDX, and mostly for tests or debugging.





+
+
+SYNOPSIS
+
+[verse]
+'git midx' [--object-dir ]
+
+DESCRIPTION
+---
+Write or verify a MIDX file.
+
+OPTIONS
+---
+
+--object-dir ::
+   Use given directory for the location of Git objects. We check
+   /packs/multi-pack-index for the current MIDX file, and
+   /packs for the pack-files to index.
+
+

Maybe we could have a SEE ALSO section that points at
the explanation of multi index files?
(c.f. man git-submodule that has a  SEE ALSO
gitsubmodules(7), gitmodules(5) explaining concepts(7)
and the file(5))

But as this is plumbing and users should not need to worry about it
this is optional, I would think.


The design document is also in 'Documentation/technical' instead of just 
'Documentation/'. Do we have a pattern of linking to the technical 
documents?


Thanks,
-Stolee


Re: [PATCH 03/23] midx: add midx builtin

2018-06-18 Thread Derrick Stolee

On 6/18/2018 3:55 PM, Stefan Beller wrote:

But as this is plumbing and users should not need to worry about it
this is optional, I would think.

The design document is also in 'Documentation/technical' instead of just
'Documentation/'. Do we have a pattern of linking to the technical
documents?

Apparently we do (and I was not aware of it):

 $ git -C Documentation/ grep link:technical
 git-credential.txt:23:link:technical/api-credentials.html[the Git
credential API] for more
 git.txt:839:link:technical/api-index.html[Git API documentation].
 gitcredentials.txt:184:link:technical/api-credentials.html[credentials
API] for details.
 technical/http-protocol.txt:517:link:technical/pack-protocol.html
 technical/http-protocol.txt:518:link:technical/protocol-capabilities.html
 user-manual.txt:3220:found in link:technical/pack-format.html[pack format].


Thanks! I'll add some links.


Re: [PATCH 01/23] midx: add design document

2018-06-18 Thread Derrick Stolee

On 6/11/2018 3:04 PM, Stefan Beller wrote:

On Thu, Jun 7, 2018 at 7:03 AM Derrick Stolee  wrote:

Signed-off-by: Derrick Stolee 
---
  Documentation/technical/midx.txt | 109 +++
  1 file changed, 109 insertions(+)
  create mode 100644 Documentation/technical/midx.txt

diff --git a/Documentation/technical/midx.txt b/Documentation/technical/midx.txt
new file mode 100644
index 00..789f410d71
--- /dev/null
+++ b/Documentation/technical/midx.txt
@@ -0,0 +1,109 @@
+Multi-Pack-Index (MIDX) Design Notes
+
+
+The Git object directory contains a 'pack' directory containing
+packfiles (with suffix ".pack") and pack-indexes (with suffix
+".idx"). The pack-indexes provide a way to lookup objects and
+navigate to their offset within the pack, but these must come
+in pairs with the packfiles. This pairing depends on the file
+names, as the pack-index differs only in suffix with its pack-
+file. While the pack-indexes provide fast lookup per packfile,
+this performance degrades as the number of packfiles increases,
+because abbreviations need to inspect every packfile and we are
+more likely to have a miss on our most-recently-used packfile.
+For some large repositories, repacking into a single packfile
+is not feasible due to storage space or excessive repack times.

This leads to the question how MIDX will cope with large repos or
a large number of packs. As it is just an index and not a pack itself,
I guess it is smaller by some orders of magnitude, such that it
is ok for now.


The MIDX file is only slightly larger than the union of the IDX files 
for those packfiles.



+The multi-pack-index (MIDX for short) stores a list of objects
+and their offsets into multiple packfiles. It contains:
+
+- A list of packfile names.
+- A sorted list of object IDs.
+- A list of metadata for the ith object ID including:
+  - A value j referring to the jth packfile.
+  - An offset within the jth packfile for the object.
+- If large offsets are required, we use another list of large
+  offsets similar to version 2 pack-indexes.
+
+Thus, we can provide O(log N) lookup time for any number
+of packfiles.

This sounds great for the lookup case!
Though that is for the repo-read case.
Let's read on how the dynamics of a repository are dealt with,
e.g. integrating new packs into the MIDX, or how we deal with
objects in multiple packs.


+
+Design Details
+--
+
+- The MIDX is stored in a file named 'multi-pack-index' in the
+  .git/objects/pack directory. This could be stored in the pack
+  directory of an alternate. It refers only to packfiles in that
+  same directory.

So there is one and only one multi pack index?
That makes the case of preparing the next MIDX that contains more
pack references more interesting, as then we have to atomically update
that file.


There is only one, but we can make the file incremental without changing 
this name similar to how the split index works.





+- The core.midx config setting must be on to consume MIDX files.

Looking through current config options, I would rename this to a more
suggestive name. I searched for the core.idx counterpart that enables
idx files -- it turns out that is named pack.indexVersion.

So maybe pack.MultiIndex ? That could start out as a boolean as in this
series and then evolve into a version number or such later.


I'll use that name and rename this file to 
Documentation/technical/multi-pack-index.txt





+- The file format includes parameters for the object ID hash
+  function, so a future change of hash algorithm does not require
+  a change in format.
+
+- The MIDX keeps only one record per object ID. If an object appears
+  in multiple packfiles, then the MIDX selects the copy in the most-
+  recently modified packfile.

Okay. That answers the question from above. Though this is just the tie
breaking decision and not a hard limitation? (i.e. we could change this
this later to that pack that has e.g. shortest delta chain for that object or
such)


This is a soft requirement. It is an easy thing to track at the moment. 
We can compute the MIDX without opening a packfile, for instance.





+- If there exist packfiles in the pack directory not registered in
+  the MIDX, then those packfiles are loaded into the `packed_git`
+  list and `packed_git_mru` cache.

Not sure I understand the implications of this?
Does that mean we first look at the multi index and if an object is not
found, we'll search linearly through all packs that are not part of the
MIDX? That would require the MIDX to be kepot up to date reasonably
to be useful.


If you add a packfile to the pack directory, you can immediately start 
consuming it. You do not need to wait for the MIDX to be updated. The 
more asynchronous these auxiliary data structures (MIDX, commit-graph) 
can be, the better. This is in direct contrast to the reachability 
bitmap which is useless without its corresponding packfi

Re: [PATCH 02/23] midx: add midx format details to pack-format.txt

2018-06-18 Thread Derrick Stolee

On 6/11/2018 3:19 PM, Stefan Beller wrote:

Hi Derrick,
On Thu, Jun 7, 2018 at 7:03 AM Derrick Stolee  wrote:

The multi-pack-index (MIDX) feature generalizes the existing pack-
index (IDX) feature by indexing objects across multiple pack-files.

Describe the basic file format, using a 12-byte header followed by
a lookup table for a list of "chunks" which will be described later.
The file ends with a footer containing a checksum using the hash
algorithm.

The header allows later versions to create breaking changes by
advancing the version number. We can also change the hash algorithm
using a different version value.

We will add the individual chunk format information as we introduce
the code that writes that information.

Signed-off-by: Derrick Stolee 
---
  Documentation/technical/pack-format.txt | 49 +
  1 file changed, 49 insertions(+)

diff --git a/Documentation/technical/pack-format.txt 
b/Documentation/technical/pack-format.txt
index 70a99fd142..17666b4bfc 100644
--- a/Documentation/technical/pack-format.txt
+++ b/Documentation/technical/pack-format.txt
@@ -252,3 +252,52 @@ Pack file entry: <+
  corresponding packfile.

  20-byte SHA-1-checksum of all of the above.
+
+== midx-*.midx files have the following format:
+
+The meta-index files refer to multiple pack-files and loose objects.

So is it meta or multi?


Good catch. We were calling this the meta-index internally before 
changing to "multi-pack-index" (helps to not change the acronym).





+In order to allow extensions that add extra data to the MIDX, we organize
+the body into "chunks" and provide a lookup table at the beginning of the
+body. The header includes certain length values, such as the number of packs,
+the number of base MIDX files, hash lengths and types.
+
+All 4-byte numbers are in network order.
+
+HEADER:
+
+   4-byte signature:
+   The signature is: {'M', 'I', 'D', 'X'}
+
+   1-byte version number:
+   Git only writes or recognizes version 1
+
+   1-byte Object Id Version
+   Git only writes or recognizes verion 1 (SHA-1)

s/verion/version/


+   1-byte number (C) of "chunks"
+
+   1-byte number (I) of base multi-pack-index files:
+   This value is currently always zero.

Oh? Are meta-index and multi-index files different things?


Not intended to be different things, but this number is related to 
making the feature incremental.





+   4-byte number (P) of pack files
+
+CHUNK LOOKUP:
+
+   (C + 1) * 12 bytes providing the chunk offsets:
+   First 4 bytes describe chunk id. Value 0 is a terminating label.
+   Other 8 bytes provide offset in current file for chunk to start.
+   (Chunks are provided in file-order, so you can infer the length
+   using the next chunk position if necessary.)

It is so nice to have the header also have 12 bytes, so it fits right into the
lookup table. So an alternative point of view:

   If a chunk needs to store more than 8 bytes, we'll have an offset after
   the first 4 bytes that describe the chunk, otherwise you can store the 8 
bytes
   of information directly after the 4 bytes.
"MIDX" is a special chunk and must come first (does it?) and only once
   as it contains the version number.


This sounds feasible, but unnecessarily complicated. I don't think any 
other chunk will be this small.



+   The remaining data in the body is described one chunk at a time, and
+   these chunks may be given in any order. Chunks are required unless
+   otherwise specified.
+
+CHUNK DATA:
+
+   (This section intentionally left incomplete.)
+
+TRAILER:
+
+   H-byte HASH-checksum of all of the above.

This means we have to rehash the whole file for updating its contents.
okay.




Re: [PATCH v2 05/31] tree: add repository argument to lookup_tree

2018-06-14 Thread Derrick Stolee

On 6/13/2018 7:04 PM, Stefan Beller wrote:

diff --git a/commit-graph.c b/commit-graph.c
index 71125d7cbb6..88a4b0d2a47 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -261,7 +261,7 @@ static int fill_commit_in_graph(struct commit *item, struct 
commit_graph *g, uin
item->graph_pos = pos;
  
  	hashcpy(oid.hash, commit_data);

-   item->tree = lookup_tree();
+   item->tree = lookup_tree(the_repository, );
  
  	date_high = get_be32(commit_data + g->hash_len + 8) & 0x3;

date_low = get_be32(commit_data + g->hash_len + 12);
diff --git a/commit.c b/commit.c
index d76d64d4dfc..755b8b9d94f 100644
--- a/commit.c
+++ b/commit.c
@@ -354,7 +354,7 @@ int parse_commit_buffer(struct commit *item, const void 
*buffer, unsigned long s
if (get_sha1_hex(bufptr + 5, parent.hash) < 0)
return error("bad tree pointer in commit %s",
 oid_to_hex(>object.oid));
-   item->tree = lookup_tree();
+   item->tree = lookup_tree(the_repository, );
bufptr += tree_entry_len + 1; /* "tree " + "hex sha1" + "\n" */
pptr = >parents;
  
diff --git a/fsck.c b/fsck.c

index 2d372f2a3f3..b07abb9796c 100644


I'm a bit late here, but you don't have ds/lazy-load-trees merged in 
your history, so this will conflict with 'master'. I caught this as I 
was trying to merge ds/generation-numbers with your branch.


The 'tree' member of 'struct commit' was renamed to 'maybe_tree'.

Thanks,
-Stolee


Re: [PATCH 4/3] ewah: adjust callers of ewah_read_mmap()

2018-06-15 Thread Derrick Stolee

On 6/14/2018 11:44 PM, Jeff King wrote:

The return value of ewah_read_mmap() is now an ssize_t,
since we could (in theory) process up to 32GB of data. This
would never happen in practice, but a corrupt or malicious
.bitmap or index file could convince us to do so.


Aside: Why a 32GB limit? Usually I see 32-bit issues limiting by 2 
(signed) or 4 GB (unsigned). Is there something specifically in the 
bitmap format that stops at this size?


Thanks,
-Stolee



Re: [PATCH 3/3] ewah: drop ewah_serialize_native function

2018-06-15 Thread Derrick Stolee

On 6/15/2018 9:56 AM, Ramsay Jones wrote:


On 15/06/18 04:32, Jeff King wrote:

We don't call this function, and never have. The on-disk
bitmap format uses network-byte-order integers, meaning that
we cannot use the native-byte-order format written here.

Let's drop it in the name of simplicity.

Hmm, if you are in the mood to drop ewah dead code, how about:

   ewah/bitmap.o   - bitmap_clear
   ewah/bitmap.o   - bitmap_each_bit
   ewah/ewah_bitmap.o  - ewah_and
   ewah/ewah_bitmap.o  - ewah_and_not
   ewah/ewah_bitmap.o  - ewah_not
   ewah/ewah_bitmap.o  - ewah_or

... in addition to these *(de)serialize* functions. ;-)



Normally, I would say we should keep this folder as close to the 
"original" [1] as possible, so we could more easily take improvements to 
that library. However, it appears that code is not being updated. 
Perhaps this is in favor of other EWAH libraries [2] or other compressed 
bitmap formats [3].


For that reason, I agree we should clean this up. We shouldn't block 
Peff's patch for that reason. I'll send a patch that deletes these methods.


Thanks,
-Stolee:

[1] https://github.com/vmg/libewok

[2] https://github.com/lemire/EWAHBoolArray

[3] https://github.com/RoaringBitmap/CRoaring


[PATCH 6/8] ewah_bitmap: delete unused 'ewah_or()'

2018-06-15 Thread Derrick Stolee
Reported-by: Ramsay Jones 
Signed-off-by: Derrick Stolee 
---
 ewah/ewah_bitmap.c | 69 --
 ewah/ewok.h|  5 
 2 files changed, 74 deletions(-)

diff --git a/ewah/ewah_bitmap.c b/ewah/ewah_bitmap.c
index 3fa3edf2a3..017c677f98 100644
--- a/ewah/ewah_bitmap.c
+++ b/ewah/ewah_bitmap.c
@@ -440,75 +440,6 @@ void ewah_xor(
out->bit_size = max_size(ewah_i->bit_size, ewah_j->bit_size);
 }
 
-void ewah_or(
-   struct ewah_bitmap *ewah_i,
-   struct ewah_bitmap *ewah_j,
-   struct ewah_bitmap *out)
-{
-   struct rlw_iterator rlw_i;
-   struct rlw_iterator rlw_j;
-   size_t literals;
-
-   rlwit_init(_i, ewah_i);
-   rlwit_init(_j, ewah_j);
-
-   while (rlwit_word_size(_i) > 0 && rlwit_word_size(_j) > 0) {
-   while (rlw_i.rlw.running_len > 0 || rlw_j.rlw.running_len > 0) {
-   struct rlw_iterator *prey, *predator;
-
-   if (rlw_i.rlw.running_len < rlw_j.rlw.running_len) {
-   prey = _i;
-   predator = _j;
-   } else {
-   prey = _j;
-   predator = _i;
-   }
-
-   if (predator->rlw.running_bit) {
-   ewah_add_empty_words(out, 0,
-   predator->rlw.running_len);
-   rlwit_discard_first_words(prey,
-   predator->rlw.running_len);
-   rlwit_discard_first_words(predator,
-   predator->rlw.running_len);
-   } else {
-   size_t index = rlwit_discharge(prey, out,
-   predator->rlw.running_len, 0);
-   ewah_add_empty_words(out, 0,
-   predator->rlw.running_len - index);
-   rlwit_discard_first_words(predator,
-   predator->rlw.running_len);
-   }
-   }
-
-   literals = min_size(
-   rlw_i.rlw.literal_words,
-   rlw_j.rlw.literal_words);
-
-   if (literals) {
-   size_t k;
-
-   for (k = 0; k < literals; ++k) {
-   ewah_add(out,
-   rlw_i.buffer[rlw_i.literal_word_start + 
k] |
-   rlw_j.buffer[rlw_j.literal_word_start + 
k]
-   );
-   }
-
-   rlwit_discard_first_words(_i, literals);
-   rlwit_discard_first_words(_j, literals);
-   }
-   }
-
-   if (rlwit_word_size(_i) > 0)
-   rlwit_discharge(_i, out, ~0, 0);
-   else
-   rlwit_discharge(_j, out, ~0, 0);
-
-   out->bit_size = max_size(ewah_i->bit_size, ewah_j->bit_size);
-}
-
-
 #define BITMAP_POOL_MAX 16
 static struct ewah_bitmap *bitmap_pool[BITMAP_POOL_MAX];
 static size_t bitmap_pool_size;
diff --git a/ewah/ewok.h b/ewah/ewok.h
index 4b665d09d9..cd826a0136 100644
--- a/ewah/ewok.h
+++ b/ewah/ewok.h
@@ -157,11 +157,6 @@ void ewah_iterator_init(struct ewah_iterator *it, struct 
ewah_bitmap *parent);
  */
 int ewah_iterator_next(eword_t *next, struct ewah_iterator *it);
 
-void ewah_or(
-   struct ewah_bitmap *ewah_i,
-   struct ewah_bitmap *ewah_j,
-   struct ewah_bitmap *out);
-
 void ewah_xor(
struct ewah_bitmap *ewah_i,
struct ewah_bitmap *ewah_j,
-- 
2.18.0.rc1



[PATCH 3/8] ewah_bitmap: delete unused 'ewah_and()'

2018-06-15 Thread Derrick Stolee
Reported-by: Ramsay Jones 
Signed-off-by: Derrick Stolee 
---
 ewah/ewah_bitmap.c | 68 --
 ewah/ewok.h|  5 
 2 files changed, 73 deletions(-)

diff --git a/ewah/ewah_bitmap.c b/ewah/ewah_bitmap.c
index b9fdda1d3d..5a4d3a6eb6 100644
--- a/ewah/ewah_bitmap.c
+++ b/ewah/ewah_bitmap.c
@@ -459,74 +459,6 @@ void ewah_xor(
out->bit_size = max_size(ewah_i->bit_size, ewah_j->bit_size);
 }
 
-void ewah_and(
-   struct ewah_bitmap *ewah_i,
-   struct ewah_bitmap *ewah_j,
-   struct ewah_bitmap *out)
-{
-   struct rlw_iterator rlw_i;
-   struct rlw_iterator rlw_j;
-   size_t literals;
-
-   rlwit_init(_i, ewah_i);
-   rlwit_init(_j, ewah_j);
-
-   while (rlwit_word_size(_i) > 0 && rlwit_word_size(_j) > 0) {
-   while (rlw_i.rlw.running_len > 0 || rlw_j.rlw.running_len > 0) {
-   struct rlw_iterator *prey, *predator;
-
-   if (rlw_i.rlw.running_len < rlw_j.rlw.running_len) {
-   prey = _i;
-   predator = _j;
-   } else {
-   prey = _j;
-   predator = _i;
-   }
-
-   if (predator->rlw.running_bit == 0) {
-   ewah_add_empty_words(out, 0,
-   predator->rlw.running_len);
-   rlwit_discard_first_words(prey,
-   predator->rlw.running_len);
-   rlwit_discard_first_words(predator,
-   predator->rlw.running_len);
-   } else {
-   size_t index = rlwit_discharge(prey, out,
-   predator->rlw.running_len, 0);
-   ewah_add_empty_words(out, 0,
-   predator->rlw.running_len - index);
-   rlwit_discard_first_words(predator,
-   predator->rlw.running_len);
-   }
-   }
-
-   literals = min_size(
-   rlw_i.rlw.literal_words,
-   rlw_j.rlw.literal_words);
-
-   if (literals) {
-   size_t k;
-
-   for (k = 0; k < literals; ++k) {
-   ewah_add(out,
-   rlw_i.buffer[rlw_i.literal_word_start + 
k] &
-   rlw_j.buffer[rlw_j.literal_word_start + 
k]
-   );
-   }
-
-   rlwit_discard_first_words(_i, literals);
-   rlwit_discard_first_words(_j, literals);
-   }
-   }
-
-   if (rlwit_word_size(_i) > 0)
-   rlwit_discharge_empty(_i, out);
-   else
-   rlwit_discharge_empty(_j, out);
-
-   out->bit_size = max_size(ewah_i->bit_size, ewah_j->bit_size);
-}
-
 void ewah_and_not(
struct ewah_bitmap *ewah_i,
struct ewah_bitmap *ewah_j,
diff --git a/ewah/ewok.h b/ewah/ewok.h
index dc43d05b64..5841c83507 100644
--- a/ewah/ewok.h
+++ b/ewah/ewok.h
@@ -179,11 +179,6 @@ void ewah_xor(
struct ewah_bitmap *ewah_j,
struct ewah_bitmap *out);
 
-void ewah_and(
-   struct ewah_bitmap *ewah_i,
-   struct ewah_bitmap *ewah_j,
-   struct ewah_bitmap *out);
-
 /**
  * Direct word access
  */
-- 
2.18.0.rc1



[PATCH 8/8] ewah_io: delete unused 'ewah_serialize_native()'

2018-06-15 Thread Derrick Stolee
Signed-off-by: Derrick Stolee 
---
 ewah/ewah_io.c | 26 --
 ewah/ewok.h|  1 -
 2 files changed, 27 deletions(-)

diff --git a/ewah/ewah_io.c b/ewah/ewah_io.c
index 86d3f1d02e..dde736bcd9 100644
--- a/ewah/ewah_io.c
+++ b/ewah/ewah_io.c
@@ -20,32 +20,6 @@
 #include "ewok.h"
 #include "strbuf.h"
 
-int ewah_serialize_native(struct ewah_bitmap *self, int fd)
-{
-   uint32_t write32;
-   size_t to_write = self->buffer_size * 8;
-
-   /* 32 bit -- bit size for the map */
-   write32 = (uint32_t)self->bit_size;
-   if (write(fd, , 4) != 4)
-   return -1;
-
-   /** 32 bit -- number of compressed 64-bit words */
-   write32 = (uint32_t)self->buffer_size;
-   if (write(fd, , 4) != 4)
-   return -1;
-
-   if (write(fd, self->buffer, to_write) != to_write)
-   return -1;
-
-   /** 32 bit -- position for the RLW */
-   write32 = self->rlw - self->buffer;
-   if (write(fd, , 4) != 4)
-   return -1;
-
-   return (3 * 4) + to_write;
-}
-
 int ewah_serialize_to(struct ewah_bitmap *self,
  int (*write_fun)(void *, const void *, size_t),
  void *data)
diff --git a/ewah/ewok.h b/ewah/ewok.h
index 1055706e44..8a2b334278 100644
--- a/ewah/ewok.h
+++ b/ewah/ewok.h
@@ -86,7 +86,6 @@ void ewah_free(struct ewah_bitmap *self);
 int ewah_serialize_to(struct ewah_bitmap *self,
  int (*write_fun)(void *out, const void *buf, size_t len),
  void *out);
-int ewah_serialize_native(struct ewah_bitmap *self, int fd);
 int ewah_serialize_strbuf(struct ewah_bitmap *self, struct strbuf *);
 
 int ewah_deserialize(struct ewah_bitmap *self, int fd);
-- 
2.18.0.rc1



[PATCH 7/8] ewah_io: delete unused 'ewah_serialize()'

2018-06-15 Thread Derrick Stolee
Reported-by: Ramsay Jones 
Signed-off-by: Derrick Stolee 
---
 ewah/ewah_io.c | 10 --
 ewah/ewok.h|  1 -
 2 files changed, 11 deletions(-)

diff --git a/ewah/ewah_io.c b/ewah/ewah_io.c
index bed1994551..86d3f1d02e 100644
--- a/ewah/ewah_io.c
+++ b/ewah/ewah_io.c
@@ -100,16 +100,6 @@ int ewah_serialize_to(struct ewah_bitmap *self,
return (3 * 4) + (self->buffer_size * 8);
 }
 
-static int write_helper(void *fd, const void *buf, size_t len)
-{
-   return write((intptr_t)fd, buf, len);
-}
-
-int ewah_serialize(struct ewah_bitmap *self, int fd)
-{
-   return ewah_serialize_to(self, write_helper, (void *)(intptr_t)fd);
-}
-
 static int write_strbuf(void *user_data, const void *data, size_t len)
 {
struct strbuf *sb = user_data;
diff --git a/ewah/ewok.h b/ewah/ewok.h
index cd826a0136..1055706e44 100644
--- a/ewah/ewok.h
+++ b/ewah/ewok.h
@@ -86,7 +86,6 @@ void ewah_free(struct ewah_bitmap *self);
 int ewah_serialize_to(struct ewah_bitmap *self,
  int (*write_fun)(void *out, const void *buf, size_t len),
  void *out);
-int ewah_serialize(struct ewah_bitmap *self, int fd);
 int ewah_serialize_native(struct ewah_bitmap *self, int fd);
 int ewah_serialize_strbuf(struct ewah_bitmap *self, struct strbuf *);
 
-- 
2.18.0.rc1



[PATCH 5/8] ewah_bitmap: delete unused 'ewah_not()'

2018-06-15 Thread Derrick Stolee
Reported-by: Ramsay Jones 
Signed-off-by: Derrick Stolee 
---
 ewah/ewah_bitmap.c | 19 ---
 ewah/ewok.h|  7 ---
 2 files changed, 26 deletions(-)

diff --git a/ewah/ewah_bitmap.c b/ewah/ewah_bitmap.c
index 559adde37c..3fa3edf2a3 100644
--- a/ewah/ewah_bitmap.c
+++ b/ewah/ewah_bitmap.c
@@ -376,25 +376,6 @@ void ewah_iterator_init(struct ewah_iterator *it, struct 
ewah_bitmap *parent)
read_new_rlw(it);
 }
 
-void ewah_not(struct ewah_bitmap *self)
-{
-   size_t pointer = 0;
-
-   while (pointer < self->buffer_size) {
-   eword_t *word = >buffer[pointer];
-   size_t literals, k;
-
-   rlw_xor_run_bit(word);
-   ++pointer;
-
-   literals = rlw_get_literal_words(word);
-   for (k = 0; k < literals; ++k) {
-   self->buffer[pointer] = ~self->buffer[pointer];
-   ++pointer;
-   }
-   }
-}
-
 void ewah_xor(
struct ewah_bitmap *ewah_i,
struct ewah_bitmap *ewah_j,
diff --git a/ewah/ewok.h b/ewah/ewok.h
index 21c420770e..4b665d09d9 100644
--- a/ewah/ewok.h
+++ b/ewah/ewok.h
@@ -95,13 +95,6 @@ int ewah_read_mmap(struct ewah_bitmap *self, const void 
*map, size_t len);
 
 uint32_t ewah_checksum(struct ewah_bitmap *self);
 
-/**
- * Logical not (bitwise negation) in-place on the bitmap
- *
- * This operation is linear time based on the size of the bitmap.
- */
-void ewah_not(struct ewah_bitmap *self);
-
 /**
  * Call the given callback with the position of every single bit
  * that has been set on the bitmap.
-- 
2.18.0.rc1



[PATCH 2/8] ewah/bitmap.c: delete unused 'bitmap_each_bit()'

2018-06-15 Thread Derrick Stolee
Reported-by: Ramsay Jones 
Signed-off-by: Derrick Stolee 
---
 ewah/bitmap.c | 24 
 1 file changed, 24 deletions(-)

diff --git a/ewah/bitmap.c b/ewah/bitmap.c
index d61dc6114a..52f1178db4 100644
--- a/ewah/bitmap.c
+++ b/ewah/bitmap.c
@@ -129,30 +129,6 @@ void bitmap_or_ewah(struct bitmap *self, struct 
ewah_bitmap *other)
self->words[i++] |= word;
 }
 
-void bitmap_each_bit(struct bitmap *self, ewah_callback callback, void *data)
-{
-   size_t pos = 0, i;
-
-   for (i = 0; i < self->word_alloc; ++i) {
-   eword_t word = self->words[i];
-   uint32_t offset;
-
-   if (word == (eword_t)~0) {
-   for (offset = 0; offset < BITS_IN_EWORD; ++offset)
-   callback(pos++, data);
-   } else {
-   for (offset = 0; offset < BITS_IN_EWORD; ++offset) {
-   if ((word >> offset) == 0)
-   break;
-
-   offset += ewah_bit_ctz64(word >> offset);
-   callback(pos + offset, data);
-   }
-   pos += BITS_IN_EWORD;
-   }
-   }
-}
-
 size_t bitmap_popcount(struct bitmap *self)
 {
size_t i, count = 0;
-- 
2.18.0.rc1



Re: [PATCH 8/8] ewah_io: delete unused 'ewah_serialize_native()'

2018-06-15 Thread Derrick Stolee

On 6/15/2018 11:01 AM, Ramsay Jones wrote:


On 15/06/18 15:30, Derrick Stolee wrote:

Signed-off-by: Derrick Stolee 
---
  ewah/ewah_io.c | 26 --
  ewah/ewok.h|  1 -
  2 files changed, 27 deletions(-)

This duplicates Jeff's patch #3.
Thanks. I got a bit hasty as I was working from 'maint' and didn't 
double-check.


-Stolee


[PATCH 1/8] ewah/bitmap.c: delete unused 'bitmap_clear()'

2018-06-15 Thread Derrick Stolee
Reported-by: Ramsay Jones 
Signed-off-by: Derrick Stolee 
---
 ewah/bitmap.c | 8 
 1 file changed, 8 deletions(-)

diff --git a/ewah/bitmap.c b/ewah/bitmap.c
index 756bdd050e..d61dc6114a 100644
--- a/ewah/bitmap.c
+++ b/ewah/bitmap.c
@@ -45,14 +45,6 @@ void bitmap_set(struct bitmap *self, size_t pos)
self->words[block] |= EWAH_MASK(pos);
 }
 
-void bitmap_clear(struct bitmap *self, size_t pos)
-{
-   size_t block = EWAH_BLOCK(pos);
-
-   if (block < self->word_alloc)
-   self->words[block] &= ~EWAH_MASK(pos);
-}
-
 int bitmap_get(struct bitmap *self, size_t pos)
 {
size_t block = EWAH_BLOCK(pos);
-- 
2.18.0.rc1



[PATCH 0/8] Delete unused methods in EWAH bitmap

2018-06-15 Thread Derrick Stolee
The EWAH bitmap code includes several logical operations that are
important for a general-purpose bitmap library. However, Git only
uses the XOR operation for storing deltas between reachability
bitmaps. This means that we can delete the following unused methods:

* ewah_and()
* ewah_and_not()
* ewah_not()
* ewah_or()
* ewah_serialize()
* ewah_serialize_native()

We can also delete the unused methods bitmap_clear() and
bitmap_each_bit().

Derrick Stolee (8):
  ewah/bitmap.c: delete unused 'bitmap_clear()'
  ewah/bitmap.c: delete unused 'bitmap_each_bit()'
  ewah_bitmap: delete unused 'ewah_and()'
  ewah_bitmap: delete unused 'ewah_and_not()'
  ewah_bitmap: delete unused 'ewah_not()'
  ewah_bitmap: delete unused 'ewah_or()'
  ewah_io: delete unused 'ewah_serialize()'
  ewah_io: delete unused 'ewah_serialize_native()'

 ewah/bitmap.c  |  32 ---
 ewah/ewah_bitmap.c | 229 -
 ewah/ewah_io.c |  36 ---
 ewah/ewok.h|  24 -
 4 files changed, 321 deletions(-)


base-commit: fc54c1af3ec09bab8b8ea09768c2da4069b7f53e
-- 
2.18.0.rc1



[PATCH 4/8] ewah_bitmap: delete unused 'ewah_and_not()'

2018-06-15 Thread Derrick Stolee
Reported-by: Ramsay Jones 
Signed-off-by: Derrick Stolee 
---
 ewah/ewah_bitmap.c | 73 --
 ewah/ewok.h|  5 
 2 files changed, 78 deletions(-)

diff --git a/ewah/ewah_bitmap.c b/ewah/ewah_bitmap.c
index 5a4d3a6eb6..559adde37c 100644
--- a/ewah/ewah_bitmap.c
+++ b/ewah/ewah_bitmap.c
@@ -459,79 +459,6 @@ void ewah_xor(
out->bit_size = max_size(ewah_i->bit_size, ewah_j->bit_size);
 }
 
-void ewah_and_not(
-   struct ewah_bitmap *ewah_i,
-   struct ewah_bitmap *ewah_j,
-   struct ewah_bitmap *out)
-{
-   struct rlw_iterator rlw_i;
-   struct rlw_iterator rlw_j;
-   size_t literals;
-
-   rlwit_init(_i, ewah_i);
-   rlwit_init(_j, ewah_j);
-
-   while (rlwit_word_size(_i) > 0 && rlwit_word_size(_j) > 0) {
-   while (rlw_i.rlw.running_len > 0 || rlw_j.rlw.running_len > 0) {
-   struct rlw_iterator *prey, *predator;
-
-   if (rlw_i.rlw.running_len < rlw_j.rlw.running_len) {
-   prey = _i;
-   predator = _j;
-   } else {
-   prey = _j;
-   predator = _i;
-   }
-
-   if ((predator->rlw.running_bit && prey == _i) ||
-   (!predator->rlw.running_bit && prey != _i)) 
{
-   ewah_add_empty_words(out, 0,
-   predator->rlw.running_len);
-   rlwit_discard_first_words(prey,
-   predator->rlw.running_len);
-   rlwit_discard_first_words(predator,
-   predator->rlw.running_len);
-   } else {
-   size_t index;
-   int negate_words;
-
-   negate_words = (_i != prey);
-   index = rlwit_discharge(prey, out,
-   predator->rlw.running_len, 
negate_words);
-   ewah_add_empty_words(out, negate_words,
-   predator->rlw.running_len - index);
-   rlwit_discard_first_words(predator,
-   predator->rlw.running_len);
-   }
-   }
-
-   literals = min_size(
-   rlw_i.rlw.literal_words,
-   rlw_j.rlw.literal_words);
-
-   if (literals) {
-   size_t k;
-
-   for (k = 0; k < literals; ++k) {
-   ewah_add(out,
-   rlw_i.buffer[rlw_i.literal_word_start + 
k] &
-   ~(rlw_j.buffer[rlw_j.literal_word_start 
+ k])
-   );
-   }
-
-   rlwit_discard_first_words(_i, literals);
-   rlwit_discard_first_words(_j, literals);
-   }
-   }
-
-   if (rlwit_word_size(_i) > 0)
-   rlwit_discharge(_i, out, ~0, 0);
-   else
-   rlwit_discharge_empty(_j, out);
-
-   out->bit_size = max_size(ewah_i->bit_size, ewah_j->bit_size);
-}
-
 void ewah_or(
struct ewah_bitmap *ewah_i,
struct ewah_bitmap *ewah_j,
diff --git a/ewah/ewok.h b/ewah/ewok.h
index 5841c83507..21c420770e 100644
--- a/ewah/ewok.h
+++ b/ewah/ewok.h
@@ -169,11 +169,6 @@ void ewah_or(
struct ewah_bitmap *ewah_j,
struct ewah_bitmap *out);
 
-void ewah_and_not(
-   struct ewah_bitmap *ewah_i,
-   struct ewah_bitmap *ewah_j,
-   struct ewah_bitmap *out);
-
 void ewah_xor(
struct ewah_bitmap *ewah_i,
struct ewah_bitmap *ewah_j,
-- 
2.18.0.rc1



Re: [PATCH 0/8] Delete unused methods in EWAH bitmap

2018-06-15 Thread Derrick Stolee

On 6/15/2018 10:30 AM, Derrick Stolee wrote:

The EWAH bitmap code includes several logical operations that are
important for a general-purpose bitmap library. However, Git only
uses the XOR operation for storing deltas between reachability
bitmaps. This means that we can delete the following unused methods:

* ewah_and()
* ewah_and_not()
* ewah_not()
* ewah_or()
* ewah_serialize()
* ewah_serialize_native()

We can also delete the unused methods bitmap_clear() and
bitmap_each_bit().

Derrick Stolee (8):
   ewah/bitmap.c: delete unused 'bitmap_clear()'
   ewah/bitmap.c: delete unused 'bitmap_each_bit()'
   ewah_bitmap: delete unused 'ewah_and()'
   ewah_bitmap: delete unused 'ewah_and_not()'
   ewah_bitmap: delete unused 'ewah_not()'
   ewah_bitmap: delete unused 'ewah_or()'
   ewah_io: delete unused 'ewah_serialize()'
   ewah_io: delete unused 'ewah_serialize_native()'

  ewah/bitmap.c  |  32 ---
  ewah/ewah_bitmap.c | 229 -
  ewah/ewah_io.c |  36 ---
  ewah/ewok.h|  24 -
  4 files changed, 321 deletions(-)


base-commit: fc54c1af3ec09bab8b8ea09768c2da4069b7f53e


Responders to this thread beware: I accidentally added an extra letter 
in Peff's email address, so be careful with reply-all.


Sorry!
-Stolee


Re: [PATCH 1/8] ewah/bitmap.c: delete unused 'bitmap_clear()'

2018-06-15 Thread Derrick Stolee

On 6/15/2018 10:46 AM, Ramsay Jones wrote:


On 15/06/18 15:30, Derrick Stolee wrote:

Reported-by: Ramsay Jones 
Signed-off-by: Derrick Stolee 
---
  ewah/bitmap.c | 8 
  1 file changed, 8 deletions(-)

diff --git a/ewah/bitmap.c b/ewah/bitmap.c
index 756bdd050e..d61dc6114a 100644
--- a/ewah/bitmap.c
+++ b/ewah/bitmap.c
@@ -45,14 +45,6 @@ void bitmap_set(struct bitmap *self, size_t pos)
self->words[block] |= EWAH_MASK(pos);
  }
  
-void bitmap_clear(struct bitmap *self, size_t pos)

-{
-   size_t block = EWAH_BLOCK(pos);
-
-   if (block < self->word_alloc)
-   self->words[block] &= ~EWAH_MASK(pos);
-}
-
  int bitmap_get(struct bitmap *self, size_t pos)
  {
size_t block = EWAH_BLOCK(pos);


I haven't read all the patches yet, but what about the extern
declaration in ewah/ewok.h?


Thanks! I'll get that, too.


Re: [PATCH v2 05/31] tree: add repository argument to lookup_tree

2018-06-14 Thread Derrick Stolee

On 6/14/2018 1:55 PM, Derrick Stolee wrote:

On 6/13/2018 7:04 PM, Stefan Beller wrote:

diff --git a/commit-graph.c b/commit-graph.c
index 71125d7cbb6..88a4b0d2a47 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -261,7 +261,7 @@ static int fill_commit_in_graph(struct commit 
*item, struct commit_graph *g, uin

  item->graph_pos = pos;
    hashcpy(oid.hash, commit_data);
-    item->tree = lookup_tree();
+    item->tree = lookup_tree(the_repository, );
    date_high = get_be32(commit_data + g->hash_len + 8) & 0x3;
  date_low = get_be32(commit_data + g->hash_len + 12);
diff --git a/commit.c b/commit.c
index d76d64d4dfc..755b8b9d94f 100644
--- a/commit.c
+++ b/commit.c
@@ -354,7 +354,7 @@ int parse_commit_buffer(struct commit *item, 
const void *buffer, unsigned long s

  if (get_sha1_hex(bufptr + 5, parent.hash) < 0)
  return error("bad tree pointer in commit %s",
   oid_to_hex(>object.oid));
-    item->tree = lookup_tree();
+    item->tree = lookup_tree(the_repository, );
  bufptr += tree_entry_len + 1; /* "tree " + "hex sha1" + "\n" */
  pptr = >parents;
  diff --git a/fsck.c b/fsck.c
index 2d372f2a3f3..b07abb9796c 100644


I'm a bit late here, but you don't have ds/lazy-load-trees merged in 
your history, so this will conflict with 'master'. I caught this as I 
was trying to merge ds/generation-numbers with your branch.


The 'tree' member of 'struct commit' was renamed to 'maybe_tree'.


Resolving the merge was not very simple. I have a working merge 
available on GitHub [1] as commit 99a899f7c12ef73840dbe79c71acb11034d707dd.


Thanks,
-Stolee

[1] 
https://github.com/derrickstolee/git/commits/generation-numbers-object-store


Re: [PATCH v2 0/7] Delete unused methods in EWAH bitmap

2018-06-15 Thread Derrick Stolee

On 6/15/2018 2:51 PM, Junio C Hamano wrote:

Derrick Stolee  writes:


The EWAH bitmap code includes several logical operations that are
important for a general-purpose bitmap library. However, Git only
uses the XOR operation for storing deltas between reachability
bitmaps. This means that we can delete the following unused methods:

* ewah_and()
* ewah_and_not()
* ewah_not()
* ewah_or()
* ewah_serialize()

We can also delete the unused methods bitmap_clear() and
bitmap_each_bit().

Derrick Stolee (7):
   ewah/bitmap.c: delete unused 'bitmap_clear()'
   ewah/bitmap.c: delete unused 'bitmap_each_bit()'
   ewah_bitmap: delete unused 'ewah_and()'
   ewah_bitmap: delete unused 'ewah_and_not()'
   ewah_bitmap: delete unused 'ewah_not()'
   ewah_bitmap: delete unused 'ewah_or()'
   ewah_io: delete unused 'ewah_serialize()'

  ewah/bitmap.c  |  32 ---
  ewah/ewah_bitmap.c | 229 -
  ewah/ewah_io.c |  10 --
  ewah/ewok.h|  25 -
  4 files changed, 296 deletions(-)


base-commit: fc54c1af3ec09bab8b8ea09768c2da4069b7f53e

Thanks.

ewah_clear() can become file-scope static, and
rlwit_discharge_empty() can be eliminated.  I do not know if either
is worth doing, though.


With Peff's patches, this is true. When I applied your diff to my patch 
alone we could not do that.


If you want to create a commit with the below, I'm happy to add my 
"Reviewed-by: Derrick Stolee "


Good team effort today!



  ewah/ewah_bitmap.c | 20 
  ewah/ewah_rlw.c|  8 
  ewah/ewok.h|  6 --
  ewah/ewok_rlw.h|  1 -
  4 files changed, 12 insertions(+), 23 deletions(-)

diff --git a/ewah/ewah_bitmap.c b/ewah/ewah_bitmap.c
index 017c677f98..d59b1afe3d 100644
--- a/ewah/ewah_bitmap.c
+++ b/ewah/ewah_bitmap.c
@@ -276,6 +276,18 @@ void ewah_each_bit(struct ewah_bitmap *self, void 
(*callback)(size_t, void*), vo
}
  }
  
+/**

+ * Clear all the bits in the bitmap. Does not free or resize
+ * memory.
+ */
+static void ewah_clear(struct ewah_bitmap *self)
+{
+   self->buffer_size = 1;
+   self->buffer[0] = 0;
+   self->bit_size = 0;
+   self->rlw = self->buffer;
+}
+
  struct ewah_bitmap *ewah_new(void)
  {
struct ewah_bitmap *self;
@@ -288,14 +300,6 @@ struct ewah_bitmap *ewah_new(void)
return self;
  }
  
-void ewah_clear(struct ewah_bitmap *self)

-{
-   self->buffer_size = 1;
-   self->buffer[0] = 0;
-   self->bit_size = 0;
-   self->rlw = self->buffer;
-}
-
  void ewah_free(struct ewah_bitmap *self)
  {
if (!self)
diff --git a/ewah/ewah_rlw.c b/ewah/ewah_rlw.c
index b9643b7d0f..5093d43e2f 100644
--- a/ewah/ewah_rlw.c
+++ b/ewah/ewah_rlw.c
@@ -104,11 +104,3 @@ size_t rlwit_discharge(
  
  	return index;

  }
-
-void rlwit_discharge_empty(struct rlw_iterator *it, struct ewah_bitmap *out)
-{
-   while (rlwit_word_size(it) > 0) {
-   ewah_add_empty_words(out, 0, rlwit_word_size(it));
-   rlwit_discard_first_words(it, rlwit_word_size(it));
-   }
-}
diff --git a/ewah/ewok.h b/ewah/ewok.h
index 0c504f28e2..84b2a29faa 100644
--- a/ewah/ewok.h
+++ b/ewah/ewok.h
@@ -72,12 +72,6 @@ void ewah_pool_free(struct ewah_bitmap *self);
   */
  struct ewah_bitmap *ewah_new(void);
  
-/**

- * Clear all the bits in the bitmap. Does not free or resize
- * memory.
- */
-void ewah_clear(struct ewah_bitmap *self);
-
  /**
   * Free all the memory of the bitmap
   */
diff --git a/ewah/ewok_rlw.h b/ewah/ewok_rlw.h
index bb3c6ff7e0..7cdfdd0c02 100644
--- a/ewah/ewok_rlw.h
+++ b/ewah/ewok_rlw.h
@@ -98,7 +98,6 @@ void rlwit_init(struct rlw_iterator *it, struct ewah_bitmap 
*bitmap);
  void rlwit_discard_first_words(struct rlw_iterator *it, size_t x);
  size_t rlwit_discharge(
struct rlw_iterator *it, struct ewah_bitmap *out, size_t max, int 
negate);
-void rlwit_discharge_empty(struct rlw_iterator *it, struct ewah_bitmap *out);
  
  static inline size_t rlwit_word_size(struct rlw_iterator *it)

  {






Re: [PATCH 4/3] ewah: adjust callers of ewah_read_mmap()

2018-06-15 Thread Derrick Stolee

On 6/15/2018 1:31 PM, Jeff King wrote:

On Fri, Jun 15, 2018 at 09:41:46AM -0700, Junio C Hamano wrote:


Derrick Stolee  writes:


On 6/14/2018 11:44 PM, Jeff King wrote:

The return value of ewah_read_mmap() is now an ssize_t,
since we could (in theory) process up to 32GB of data. This
would never happen in practice, but a corrupt or malicious
.bitmap or index file could convince us to do so.

Aside: Why a 32GB limit? Usually I see 32-bit issues limiting by 2
(signed) or 4 GB (unsigned). Is there something specifically in the
bitmap format that stops at this size?

The proposed log message for 1/3 has this bit

   - check the size for integer overflow (this should be
 impossible on 64-bit, as the size is given as a 32-bit
 count of 8-byte words, but is possible on a 32-bit
 system)

4 Giga 8-byte words makes 32 Giga bytes, I'd guess.

Yes, exactly. It's definitely an odd size. I think using the same
varints we use in the packfile would be more efficient (and they're
already variable-length records), though we tend to have few enough of
these that it probably doesn't matter.

I think a more useful change for the bitmap format would be some kind of
index. IIRC, right now readers have to linearly scan the whole file in
order to use a single bitmap.

With Stolee's commit-metadata files, we could potentially move to
storing reachability bitmaps as a data element there. But there are two
complications:

  - the bitmaps themselves are dependent on having an ordered list of
objects (one per bit) to talk about. And that's why they're
integrated with packfiles, since that provides a constrained universe
with a set ordering.

  - the existing storage isn't entirely independent between bitmaps. Some
of them are basically "deltas" off of nearby bitmaps.


The VSTS reachability bitmap is different from the Git bitmap in a few ways.

1. It uses Roaring+Run bitmaps [1] instead of EWAH bitmap. This format 
is simpler (in my opinion) in several ways, especially in how it splits 
the bitmap into 16-bit address ranges and compresses each on its own. 
This makes set containment queries much faster, as we can navigate 
directly to the region that is important (and then binary search if that 
chunk is an "array" or "run" chunk). I say this is simpler because I can 
explain the entire compression format to you in five minutes and a 
whiteboard. The paper [2] is a simple enough read (start at the "Roaring 
Bitmap" section at the end of page 4).


2. Our file uses a chunk-based approach similar to the commit-graph 
file: one chunk is simply a list of the commits that have bitmaps. 
Another chunk is the raw binary data of all bitmaps concatenated 
together. The last chunk connects the other two chunks by a) providing 
an offset into the binary data for the bitmap at that commit, and b) the 
commit that provides a delta base for the bitmap.


If we are considering changing the reachability bitmap, then I'm very 
intrigued. I think the number one thing to consider is to use the 
multi-pack-index as a reference point (with a stable object order) so 
the objects can be repacked independently from the reachability bitmap 
computation. If we are changing the model at that level, then it is 
worth thinking about other questions, like how we index the file or how 
we compress the bitmaps.


Thanks,
-Stolee

[1] https://roaringbitmap.org/about/

[2] https://arxiv.org/abs/1603.06549
    Consistently faster and smaller compressed bitmaps with Roaring


[PATCH v2 7/7] ewah_io: delete unused 'ewah_serialize()'

2018-06-15 Thread Derrick Stolee
Reported-by: Ramsay Jones 
Signed-off-by: Derrick Stolee 
---
 ewah/ewah_io.c | 10 --
 ewah/ewok.h|  1 -
 2 files changed, 11 deletions(-)

diff --git a/ewah/ewah_io.c b/ewah/ewah_io.c
index bed1994551..86d3f1d02e 100644
--- a/ewah/ewah_io.c
+++ b/ewah/ewah_io.c
@@ -100,16 +100,6 @@ int ewah_serialize_to(struct ewah_bitmap *self,
return (3 * 4) + (self->buffer_size * 8);
 }
 
-static int write_helper(void *fd, const void *buf, size_t len)
-{
-   return write((intptr_t)fd, buf, len);
-}
-
-int ewah_serialize(struct ewah_bitmap *self, int fd)
-{
-   return ewah_serialize_to(self, write_helper, (void *)(intptr_t)fd);
-}
-
 static int write_strbuf(void *user_data, const void *data, size_t len)
 {
struct strbuf *sb = user_data;
diff --git a/ewah/ewok.h b/ewah/ewok.h
index a540df2aa9..f3ac78ef1a 100644
--- a/ewah/ewok.h
+++ b/ewah/ewok.h
@@ -86,7 +86,6 @@ void ewah_free(struct ewah_bitmap *self);
 int ewah_serialize_to(struct ewah_bitmap *self,
  int (*write_fun)(void *out, const void *buf, size_t len),
  void *out);
-int ewah_serialize(struct ewah_bitmap *self, int fd);
 int ewah_serialize_native(struct ewah_bitmap *self, int fd);
 int ewah_serialize_strbuf(struct ewah_bitmap *self, struct strbuf *);
 
-- 
2.18.0.rc1



[PATCH v2 2/7] ewah/bitmap.c: delete unused 'bitmap_each_bit()'

2018-06-15 Thread Derrick Stolee
Reported-by: Ramsay Jones 
Signed-off-by: Derrick Stolee 
---
 ewah/bitmap.c | 24 
 ewah/ewok.h   |  1 -
 2 files changed, 25 deletions(-)

diff --git a/ewah/bitmap.c b/ewah/bitmap.c
index d61dc6114a..52f1178db4 100644
--- a/ewah/bitmap.c
+++ b/ewah/bitmap.c
@@ -129,30 +129,6 @@ void bitmap_or_ewah(struct bitmap *self, struct 
ewah_bitmap *other)
self->words[i++] |= word;
 }
 
-void bitmap_each_bit(struct bitmap *self, ewah_callback callback, void *data)
-{
-   size_t pos = 0, i;
-
-   for (i = 0; i < self->word_alloc; ++i) {
-   eword_t word = self->words[i];
-   uint32_t offset;
-
-   if (word == (eword_t)~0) {
-   for (offset = 0; offset < BITS_IN_EWORD; ++offset)
-   callback(pos++, data);
-   } else {
-   for (offset = 0; offset < BITS_IN_EWORD; ++offset) {
-   if ((word >> offset) == 0)
-   break;
-
-   offset += ewah_bit_ctz64(word >> offset);
-   callback(pos + offset, data);
-   }
-   pos += BITS_IN_EWORD;
-   }
-   }
-}
-
 size_t bitmap_popcount(struct bitmap *self)
 {
size_t i, count = 0;
diff --git a/ewah/ewok.h b/ewah/ewok.h
index 2d25e93b9e..71704ed37f 100644
--- a/ewah/ewok.h
+++ b/ewah/ewok.h
@@ -217,7 +217,6 @@ void bitmap_and_not(struct bitmap *self, struct bitmap 
*other);
 void bitmap_or_ewah(struct bitmap *self, struct ewah_bitmap *other);
 void bitmap_or(struct bitmap *self, const struct bitmap *other);
 
-void bitmap_each_bit(struct bitmap *self, ewah_callback callback, void *data);
 size_t bitmap_popcount(struct bitmap *self);
 
 #endif
-- 
2.18.0.rc1



[PATCH v2 1/7] ewah/bitmap.c: delete unused 'bitmap_clear()'

2018-06-15 Thread Derrick Stolee
Reported-by: Ramsay Jones 
Signed-off-by: Derrick Stolee 
---
 ewah/bitmap.c | 8 
 ewah/ewok.h   | 1 -
 2 files changed, 9 deletions(-)

diff --git a/ewah/bitmap.c b/ewah/bitmap.c
index 756bdd050e..d61dc6114a 100644
--- a/ewah/bitmap.c
+++ b/ewah/bitmap.c
@@ -45,14 +45,6 @@ void bitmap_set(struct bitmap *self, size_t pos)
self->words[block] |= EWAH_MASK(pos);
 }
 
-void bitmap_clear(struct bitmap *self, size_t pos)
-{
-   size_t block = EWAH_BLOCK(pos);
-
-   if (block < self->word_alloc)
-   self->words[block] &= ~EWAH_MASK(pos);
-}
-
 int bitmap_get(struct bitmap *self, size_t pos)
 {
size_t block = EWAH_BLOCK(pos);
diff --git a/ewah/ewok.h b/ewah/ewok.h
index dc43d05b64..2d25e93b9e 100644
--- a/ewah/ewok.h
+++ b/ewah/ewok.h
@@ -204,7 +204,6 @@ struct bitmap {
 
 struct bitmap *bitmap_new(void);
 void bitmap_set(struct bitmap *self, size_t pos);
-void bitmap_clear(struct bitmap *self, size_t pos);
 int bitmap_get(struct bitmap *self, size_t pos);
 void bitmap_reset(struct bitmap *self);
 void bitmap_free(struct bitmap *self);
-- 
2.18.0.rc1



[PATCH v2 4/7] ewah_bitmap: delete unused 'ewah_and_not()'

2018-06-15 Thread Derrick Stolee
Reported-by: Ramsay Jones 
Signed-off-by: Derrick Stolee 
---
 ewah/ewah_bitmap.c | 73 --
 ewah/ewok.h|  5 
 2 files changed, 78 deletions(-)

diff --git a/ewah/ewah_bitmap.c b/ewah/ewah_bitmap.c
index 5a4d3a6eb6..559adde37c 100644
--- a/ewah/ewah_bitmap.c
+++ b/ewah/ewah_bitmap.c
@@ -459,79 +459,6 @@ void ewah_xor(
out->bit_size = max_size(ewah_i->bit_size, ewah_j->bit_size);
 }
 
-void ewah_and_not(
-   struct ewah_bitmap *ewah_i,
-   struct ewah_bitmap *ewah_j,
-   struct ewah_bitmap *out)
-{
-   struct rlw_iterator rlw_i;
-   struct rlw_iterator rlw_j;
-   size_t literals;
-
-   rlwit_init(_i, ewah_i);
-   rlwit_init(_j, ewah_j);
-
-   while (rlwit_word_size(_i) > 0 && rlwit_word_size(_j) > 0) {
-   while (rlw_i.rlw.running_len > 0 || rlw_j.rlw.running_len > 0) {
-   struct rlw_iterator *prey, *predator;
-
-   if (rlw_i.rlw.running_len < rlw_j.rlw.running_len) {
-   prey = _i;
-   predator = _j;
-   } else {
-   prey = _j;
-   predator = _i;
-   }
-
-   if ((predator->rlw.running_bit && prey == _i) ||
-   (!predator->rlw.running_bit && prey != _i)) 
{
-   ewah_add_empty_words(out, 0,
-   predator->rlw.running_len);
-   rlwit_discard_first_words(prey,
-   predator->rlw.running_len);
-   rlwit_discard_first_words(predator,
-   predator->rlw.running_len);
-   } else {
-   size_t index;
-   int negate_words;
-
-   negate_words = (_i != prey);
-   index = rlwit_discharge(prey, out,
-   predator->rlw.running_len, 
negate_words);
-   ewah_add_empty_words(out, negate_words,
-   predator->rlw.running_len - index);
-   rlwit_discard_first_words(predator,
-   predator->rlw.running_len);
-   }
-   }
-
-   literals = min_size(
-   rlw_i.rlw.literal_words,
-   rlw_j.rlw.literal_words);
-
-   if (literals) {
-   size_t k;
-
-   for (k = 0; k < literals; ++k) {
-   ewah_add(out,
-   rlw_i.buffer[rlw_i.literal_word_start + 
k] &
-   ~(rlw_j.buffer[rlw_j.literal_word_start 
+ k])
-   );
-   }
-
-   rlwit_discard_first_words(_i, literals);
-   rlwit_discard_first_words(_j, literals);
-   }
-   }
-
-   if (rlwit_word_size(_i) > 0)
-   rlwit_discharge(_i, out, ~0, 0);
-   else
-   rlwit_discharge_empty(_j, out);
-
-   out->bit_size = max_size(ewah_i->bit_size, ewah_j->bit_size);
-}
-
 void ewah_or(
struct ewah_bitmap *ewah_i,
struct ewah_bitmap *ewah_j,
diff --git a/ewah/ewok.h b/ewah/ewok.h
index f4ada6979e..8dae8a608c 100644
--- a/ewah/ewok.h
+++ b/ewah/ewok.h
@@ -169,11 +169,6 @@ void ewah_or(
struct ewah_bitmap *ewah_j,
struct ewah_bitmap *out);
 
-void ewah_and_not(
-   struct ewah_bitmap *ewah_i,
-   struct ewah_bitmap *ewah_j,
-   struct ewah_bitmap *out);
-
 void ewah_xor(
struct ewah_bitmap *ewah_i,
struct ewah_bitmap *ewah_j,
-- 
2.18.0.rc1



[PATCH v2 6/7] ewah_bitmap: delete unused 'ewah_or()'

2018-06-15 Thread Derrick Stolee
Reported-by: Ramsay Jones 
Signed-off-by: Derrick Stolee 
---
 ewah/ewah_bitmap.c | 69 --
 ewah/ewok.h|  5 
 2 files changed, 74 deletions(-)

diff --git a/ewah/ewah_bitmap.c b/ewah/ewah_bitmap.c
index 3fa3edf2a3..017c677f98 100644
--- a/ewah/ewah_bitmap.c
+++ b/ewah/ewah_bitmap.c
@@ -440,75 +440,6 @@ void ewah_xor(
out->bit_size = max_size(ewah_i->bit_size, ewah_j->bit_size);
 }
 
-void ewah_or(
-   struct ewah_bitmap *ewah_i,
-   struct ewah_bitmap *ewah_j,
-   struct ewah_bitmap *out)
-{
-   struct rlw_iterator rlw_i;
-   struct rlw_iterator rlw_j;
-   size_t literals;
-
-   rlwit_init(_i, ewah_i);
-   rlwit_init(_j, ewah_j);
-
-   while (rlwit_word_size(_i) > 0 && rlwit_word_size(_j) > 0) {
-   while (rlw_i.rlw.running_len > 0 || rlw_j.rlw.running_len > 0) {
-   struct rlw_iterator *prey, *predator;
-
-   if (rlw_i.rlw.running_len < rlw_j.rlw.running_len) {
-   prey = _i;
-   predator = _j;
-   } else {
-   prey = _j;
-   predator = _i;
-   }
-
-   if (predator->rlw.running_bit) {
-   ewah_add_empty_words(out, 0,
-   predator->rlw.running_len);
-   rlwit_discard_first_words(prey,
-   predator->rlw.running_len);
-   rlwit_discard_first_words(predator,
-   predator->rlw.running_len);
-   } else {
-   size_t index = rlwit_discharge(prey, out,
-   predator->rlw.running_len, 0);
-   ewah_add_empty_words(out, 0,
-   predator->rlw.running_len - index);
-   rlwit_discard_first_words(predator,
-   predator->rlw.running_len);
-   }
-   }
-
-   literals = min_size(
-   rlw_i.rlw.literal_words,
-   rlw_j.rlw.literal_words);
-
-   if (literals) {
-   size_t k;
-
-   for (k = 0; k < literals; ++k) {
-   ewah_add(out,
-   rlw_i.buffer[rlw_i.literal_word_start + 
k] |
-   rlw_j.buffer[rlw_j.literal_word_start + 
k]
-   );
-   }
-
-   rlwit_discard_first_words(_i, literals);
-   rlwit_discard_first_words(_j, literals);
-   }
-   }
-
-   if (rlwit_word_size(_i) > 0)
-   rlwit_discharge(_i, out, ~0, 0);
-   else
-   rlwit_discharge(_j, out, ~0, 0);
-
-   out->bit_size = max_size(ewah_i->bit_size, ewah_j->bit_size);
-}
-
-
 #define BITMAP_POOL_MAX 16
 static struct ewah_bitmap *bitmap_pool[BITMAP_POOL_MAX];
 static size_t bitmap_pool_size;
diff --git a/ewah/ewok.h b/ewah/ewok.h
index a467645eec..a540df2aa9 100644
--- a/ewah/ewok.h
+++ b/ewah/ewok.h
@@ -157,11 +157,6 @@ void ewah_iterator_init(struct ewah_iterator *it, struct 
ewah_bitmap *parent);
  */
 int ewah_iterator_next(eword_t *next, struct ewah_iterator *it);
 
-void ewah_or(
-   struct ewah_bitmap *ewah_i,
-   struct ewah_bitmap *ewah_j,
-   struct ewah_bitmap *out);
-
 void ewah_xor(
struct ewah_bitmap *ewah_i,
struct ewah_bitmap *ewah_j,
-- 
2.18.0.rc1



[PATCH v2 5/7] ewah_bitmap: delete unused 'ewah_not()'

2018-06-15 Thread Derrick Stolee
Reported-by: Ramsay Jones 
Signed-off-by: Derrick Stolee 
---
 ewah/ewah_bitmap.c | 19 ---
 ewah/ewok.h|  7 ---
 2 files changed, 26 deletions(-)

diff --git a/ewah/ewah_bitmap.c b/ewah/ewah_bitmap.c
index 559adde37c..3fa3edf2a3 100644
--- a/ewah/ewah_bitmap.c
+++ b/ewah/ewah_bitmap.c
@@ -376,25 +376,6 @@ void ewah_iterator_init(struct ewah_iterator *it, struct 
ewah_bitmap *parent)
read_new_rlw(it);
 }
 
-void ewah_not(struct ewah_bitmap *self)
-{
-   size_t pointer = 0;
-
-   while (pointer < self->buffer_size) {
-   eword_t *word = >buffer[pointer];
-   size_t literals, k;
-
-   rlw_xor_run_bit(word);
-   ++pointer;
-
-   literals = rlw_get_literal_words(word);
-   for (k = 0; k < literals; ++k) {
-   self->buffer[pointer] = ~self->buffer[pointer];
-   ++pointer;
-   }
-   }
-}
-
 void ewah_xor(
struct ewah_bitmap *ewah_i,
struct ewah_bitmap *ewah_j,
diff --git a/ewah/ewok.h b/ewah/ewok.h
index 8dae8a608c..a467645eec 100644
--- a/ewah/ewok.h
+++ b/ewah/ewok.h
@@ -95,13 +95,6 @@ int ewah_read_mmap(struct ewah_bitmap *self, const void 
*map, size_t len);
 
 uint32_t ewah_checksum(struct ewah_bitmap *self);
 
-/**
- * Logical not (bitwise negation) in-place on the bitmap
- *
- * This operation is linear time based on the size of the bitmap.
- */
-void ewah_not(struct ewah_bitmap *self);
-
 /**
  * Call the given callback with the position of every single bit
  * that has been set on the bitmap.
-- 
2.18.0.rc1



[PATCH v2 3/7] ewah_bitmap: delete unused 'ewah_and()'

2018-06-15 Thread Derrick Stolee
Reported-by: Ramsay Jones 
Signed-off-by: Derrick Stolee 
---
 ewah/ewah_bitmap.c | 68 --
 ewah/ewok.h|  5 
 2 files changed, 73 deletions(-)

diff --git a/ewah/ewah_bitmap.c b/ewah/ewah_bitmap.c
index b9fdda1d3d..5a4d3a6eb6 100644
--- a/ewah/ewah_bitmap.c
+++ b/ewah/ewah_bitmap.c
@@ -459,74 +459,6 @@ void ewah_xor(
out->bit_size = max_size(ewah_i->bit_size, ewah_j->bit_size);
 }
 
-void ewah_and(
-   struct ewah_bitmap *ewah_i,
-   struct ewah_bitmap *ewah_j,
-   struct ewah_bitmap *out)
-{
-   struct rlw_iterator rlw_i;
-   struct rlw_iterator rlw_j;
-   size_t literals;
-
-   rlwit_init(_i, ewah_i);
-   rlwit_init(_j, ewah_j);
-
-   while (rlwit_word_size(_i) > 0 && rlwit_word_size(_j) > 0) {
-   while (rlw_i.rlw.running_len > 0 || rlw_j.rlw.running_len > 0) {
-   struct rlw_iterator *prey, *predator;
-
-   if (rlw_i.rlw.running_len < rlw_j.rlw.running_len) {
-   prey = _i;
-   predator = _j;
-   } else {
-   prey = _j;
-   predator = _i;
-   }
-
-   if (predator->rlw.running_bit == 0) {
-   ewah_add_empty_words(out, 0,
-   predator->rlw.running_len);
-   rlwit_discard_first_words(prey,
-   predator->rlw.running_len);
-   rlwit_discard_first_words(predator,
-   predator->rlw.running_len);
-   } else {
-   size_t index = rlwit_discharge(prey, out,
-   predator->rlw.running_len, 0);
-   ewah_add_empty_words(out, 0,
-   predator->rlw.running_len - index);
-   rlwit_discard_first_words(predator,
-   predator->rlw.running_len);
-   }
-   }
-
-   literals = min_size(
-   rlw_i.rlw.literal_words,
-   rlw_j.rlw.literal_words);
-
-   if (literals) {
-   size_t k;
-
-   for (k = 0; k < literals; ++k) {
-   ewah_add(out,
-   rlw_i.buffer[rlw_i.literal_word_start + 
k] &
-   rlw_j.buffer[rlw_j.literal_word_start + 
k]
-   );
-   }
-
-   rlwit_discard_first_words(_i, literals);
-   rlwit_discard_first_words(_j, literals);
-   }
-   }
-
-   if (rlwit_word_size(_i) > 0)
-   rlwit_discharge_empty(_i, out);
-   else
-   rlwit_discharge_empty(_j, out);
-
-   out->bit_size = max_size(ewah_i->bit_size, ewah_j->bit_size);
-}
-
 void ewah_and_not(
struct ewah_bitmap *ewah_i,
struct ewah_bitmap *ewah_j,
diff --git a/ewah/ewok.h b/ewah/ewok.h
index 71704ed37f..f4ada6979e 100644
--- a/ewah/ewok.h
+++ b/ewah/ewok.h
@@ -179,11 +179,6 @@ void ewah_xor(
struct ewah_bitmap *ewah_j,
struct ewah_bitmap *out);
 
-void ewah_and(
-   struct ewah_bitmap *ewah_i,
-   struct ewah_bitmap *ewah_j,
-   struct ewah_bitmap *out);
-
 /**
  * Direct word access
  */
-- 
2.18.0.rc1



[PATCH v2 0/7] Delete unused methods in EWAH bitmap

2018-06-15 Thread Derrick Stolee
The EWAH bitmap code includes several logical operations that are
important for a general-purpose bitmap library. However, Git only
uses the XOR operation for storing deltas between reachability
bitmaps. This means that we can delete the following unused methods:

* ewah_and()
* ewah_and_not()
* ewah_not()
* ewah_or()
* ewah_serialize()

We can also delete the unused methods bitmap_clear() and
bitmap_each_bit().

Derrick Stolee (7):
  ewah/bitmap.c: delete unused 'bitmap_clear()'
  ewah/bitmap.c: delete unused 'bitmap_each_bit()'
  ewah_bitmap: delete unused 'ewah_and()'
  ewah_bitmap: delete unused 'ewah_and_not()'
  ewah_bitmap: delete unused 'ewah_not()'
  ewah_bitmap: delete unused 'ewah_or()'
  ewah_io: delete unused 'ewah_serialize()'

 ewah/bitmap.c  |  32 ---
 ewah/ewah_bitmap.c | 229 -
 ewah/ewah_io.c |  10 --
 ewah/ewok.h|  25 -
 4 files changed, 296 deletions(-)


base-commit: fc54c1af3ec09bab8b8ea09768c2da4069b7f53e
-- 
2.18.0.rc1



Re: [PATCH 05/23] midx: write header information to lockfile

2018-06-19 Thread Derrick Stolee

On 6/19/2018 10:59 AM, Duy Nguyen wrote:

On Tue, Jun 19, 2018 at 2:54 PM Derrick Stolee  wrote:

On 6/12/2018 11:00 AM, Duy Nguyen wrote:

On Thu, Jun 7, 2018 at 7:01 PM Derrick Stolee  wrote:

diff --git a/midx.c b/midx.c
index 616af66b13..3e55422a21 100644
--- a/midx.c
+++ b/midx.c
@@ -1,9 +1,62 @@
   #include "git-compat-util.h"
   #include "cache.h"
   #include "dir.h"
+#include "csum-file.h"
+#include "lockfile.h"
   #include "midx.h"

+#define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */
+#define MIDX_VERSION 1
+#define MIDX_HASH_VERSION 1 /* SHA-1 */

...

+static size_t write_midx_header(struct hashfile *f,
+   unsigned char num_chunks,
+   uint32_t num_packs)
+{
+   char byte_values[4];
+   hashwrite_be32(f, MIDX_SIGNATURE);
+   byte_values[0] = MIDX_VERSION;
+   byte_values[1] = MIDX_HASH_VERSION;

Quoting from "State of NewHash work, future directions, and discussion" [1]

* If you need to serialize an algorithm identifier into your data
format, use the format_id field of struct git_hash_algo.  It's
designed specifically for that purpose.

[1] 
https://public-inbox.org/git/20180612024252.ga141...@aiede.svl.corp.google.com/T/#m5fdd09dcaf31266c45343fb6c0beaaa3e928bc60

Thanks! I'll also use the_hash_algo->rawsz to infer the length of the
hash function.

BTW, since you're the author of commit-graph.c and may notice it has
the same problem. Don't touch that code. Brian already has some WIP
changes [1]. We just make sure new code does not add extra work for
him. I expect he'll send all those patches out soon.

[1] 
https://github.com/bk2204/git/commit/3f9031e06cfb21534eb7dfff7b54e7598ac1149f


Thanks for the link. It seems he is creating an oid_version() method 
that returns a 1-byte version for the hash version instead of the 4-byte 
signature of the_hash_algo->format_id. I look forward to incorporating 
that into the MIDX format. I'll keep my macros for now, as we work out 
the other details, and while Brain's patch is cooking.


Thanks,
-Stolee


Re: [PATCH 01/15] contrib: add cocci script to replace index compat macros

2018-06-19 Thread Derrick Stolee

On 6/19/2018 10:51 AM, Duy Nguyen wrote:

On Tue, Jun 19, 2018 at 1:41 PM Derrick Stolee  wrote:

Duy,

Here is the patch that was generated by `make coccicheck`.

Thanks,
-Stolee

-->8--

--- builtin/add.c

Ah right. This is on purpose. I think I mentioned in the commit
message that builtin/ is not touched. Do we run 'make coccicheck'
automatically somewhere? If true, I need to move this script elsewhere
because it's meant to run manually. You run it when you intend to do
more manual fixups afterwards. For builtin/, I think I'll wait until
'struct repository *' conversion is complete then maybe fix them one
by one.


I don't think it is part of the CI runs, but some community members run 
this themselves on 'next' and 'master'.


I'm CC'ing Szeder, as he has contributed a fix to my own Coccinelle 
script [1].


[1] 
https://public-inbox.org/git/20180430093153.13040-1-szeder@gmail.com/
    [PATCH] coccinelle: avoid wrong transformation suggestions from 
commit.cocci


Re: [PATCH] ewah: delete unused 'rlwit_discharge_empty()'

2018-06-19 Thread Derrick Stolee

On 6/19/2018 5:51 PM, Ramsay Jones wrote:

From: Junio C Hamano 

Complete the removal of unused 'ewah bitmap' code by removing the now
unused 'rlwit_discharge_empty()' function. Also, the 'ewah_clear()'
function can now be made a file-scope static symbol.

Signed-off-by: Ramsay Jones 
---

Hi Junio,

Can you please add this to the 'ds/ewah-cleanup' branch, before
we forget to do so! ;-)

Thanks!

ATB,
Ramsay Jones



Looks good to me! Thanks!

-Stolee




  ewah/ewah_bitmap.c | 20 
  ewah/ewah_rlw.c|  8 
  ewah/ewok.h|  6 --
  ewah/ewok_rlw.h|  1 -
  4 files changed, 12 insertions(+), 23 deletions(-)

diff --git a/ewah/ewah_bitmap.c b/ewah/ewah_bitmap.c
index 017c677f9..d59b1afe3 100644
--- a/ewah/ewah_bitmap.c
+++ b/ewah/ewah_bitmap.c
@@ -276,6 +276,18 @@ void ewah_each_bit(struct ewah_bitmap *self, void 
(*callback)(size_t, void*), vo
}
  }
  
+/**

+ * Clear all the bits in the bitmap. Does not free or resize
+ * memory.
+ */
+static void ewah_clear(struct ewah_bitmap *self)
+{
+   self->buffer_size = 1;
+   self->buffer[0] = 0;
+   self->bit_size = 0;
+   self->rlw = self->buffer;
+}
+
  struct ewah_bitmap *ewah_new(void)
  {
struct ewah_bitmap *self;
@@ -288,14 +300,6 @@ struct ewah_bitmap *ewah_new(void)
return self;
  }
  
-void ewah_clear(struct ewah_bitmap *self)

-{
-   self->buffer_size = 1;
-   self->buffer[0] = 0;
-   self->bit_size = 0;
-   self->rlw = self->buffer;
-}
-
  void ewah_free(struct ewah_bitmap *self)
  {
if (!self)
diff --git a/ewah/ewah_rlw.c b/ewah/ewah_rlw.c
index b9643b7d0..5093d43e2 100644
--- a/ewah/ewah_rlw.c
+++ b/ewah/ewah_rlw.c
@@ -104,11 +104,3 @@ size_t rlwit_discharge(
  
  	return index;

  }
-
-void rlwit_discharge_empty(struct rlw_iterator *it, struct ewah_bitmap *out)
-{
-   while (rlwit_word_size(it) > 0) {
-   ewah_add_empty_words(out, 0, rlwit_word_size(it));
-   rlwit_discard_first_words(it, rlwit_word_size(it));
-   }
-}
diff --git a/ewah/ewok.h b/ewah/ewok.h
index 0c504f28e..84b2a29fa 100644
--- a/ewah/ewok.h
+++ b/ewah/ewok.h
@@ -72,12 +72,6 @@ void ewah_pool_free(struct ewah_bitmap *self);
   */
  struct ewah_bitmap *ewah_new(void);
  
-/**

- * Clear all the bits in the bitmap. Does not free or resize
- * memory.
- */
-void ewah_clear(struct ewah_bitmap *self);
-
  /**
   * Free all the memory of the bitmap
   */
diff --git a/ewah/ewok_rlw.h b/ewah/ewok_rlw.h
index bb3c6ff7e..7cdfdd0c0 100644
--- a/ewah/ewok_rlw.h
+++ b/ewah/ewok_rlw.h
@@ -98,7 +98,6 @@ void rlwit_init(struct rlw_iterator *it, struct ewah_bitmap 
*bitmap);
  void rlwit_discard_first_words(struct rlw_iterator *it, size_t x);
  size_t rlwit_discharge(
struct rlw_iterator *it, struct ewah_bitmap *out, size_t max, int 
negate);
-void rlwit_discharge_empty(struct rlw_iterator *it, struct ewah_bitmap *out);
  
  static inline size_t rlwit_word_size(struct rlw_iterator *it)

  {


Re: [PATCH 03/35] object: add repository argument to lookup_unknown_object

2018-06-13 Thread Derrick Stolee

On 6/13/2018 3:30 PM, Stefan Beller wrote:

On Wed, Jun 6, 2018 at 12:38 PM Duy Nguyen  wrote:

On Wed, May 30, 2018 at 2:47 AM, Stefan Beller  wrote:

diff --git a/object.c b/object.c
index 4de4fa58d59..def3c71cac2 100644
--- a/object.c
+++ b/object.c
@@ -177,7 +177,7 @@ void *object_as_type(struct object *obj, enum object_type 
type, int quiet)
 }
  }

-struct object *lookup_unknown_object(const unsigned char *sha1)
+struct object *lookup_unknown_object_the_repository(const unsigned char *sha1)

I'm looking at your branch and this function (with the _the_repository
suffix) is still there. Did you forget to send a patch to convert this
function?

This and parse_commit and parse_commit_gently have not been converted.

I stopped with this series as soon as I hit the commit-graph code, which needs
to be updated, too. I'll start redoing this series soon and will fix
those conversions.


Yes, we are colliding a bit at the moment.

 I'm planning to delay rerolling my patches until your last few series 
get merged.


Thanks,

-Stolee



Re: [PATCH v2 00/31] object-store: lookup_commit

2018-06-13 Thread Derrick Stolee

On 6/13/2018 7:04 PM, Stefan Beller wrote

* Once this is in good shape we can talk about converting parts of the revision
   walking code,


This is another reason why I'll be waiting for this series of series. I 
plan to rework the revision walking code around 
sort_in_topological_order(), but I'll wait for your changes to land 
before submitting a patch. I'm itching to play with it, so I may start 
testing things out before then.


In the meantime, I'll do my part by reviewing the current series.

Thanks,

-Stolee



Re: [PATCH v2 00/31] object-store: lookup_commit

2018-06-13 Thread Derrick Stolee



On 6/13/2018 9:23 PM, Derrick Stolee wrote:

On 6/13/2018 7:04 PM, Stefan Beller wrote
* Once this is in good shape we can talk about converting parts of 
the revision

   walking code,


This is another reason why I'll be waiting for this series of series. 
I plan to rework the revision walking code around 
sort_in_topological_order(), but I'll wait for your changes to land 
before submitting a patch. I'm itching to play with it, so I may start 
testing things out before then.


In the meantime, I'll do my part by reviewing the current series.


Looks good to me.

Reviewed-by: Derrick Stolee 



Re: [PATCH 03/35] object: add repository argument to lookup_unknown_object

2018-05-30 Thread Derrick Stolee

On 5/29/2018 8:47 PM, Stefan Beller wrote:

From: Jonathan Nieder 

Add a repository argument to allow callers of lookup_unknown_object to
be more specific about which repository to handle. This is a small
mechanical change; it doesn't change the implementation to handle
repositories other than the_repository yet.

As with the previous commits, use a macro to catch callers passing a
repositor other than the_repository at compile time.

The included coccinelle semantic patch will adapt any new callers in
the diff produced by `make coccicheck`.


I don't see any coccinelle change in this patch.

Perhaps it is in a later commit?

Thanks,

-Stolee



Signed-off-by: Jonathan Nieder 
Signed-off-by: Stefan Beller 
---
  builtin/fsck.c   | 2 +-
  builtin/pack-objects.c   | 2 +-
  http-push.c  | 2 +-
  object.c | 2 +-
  object.h | 3 ++-
  refs.c   | 2 +-
  t/helper/test-example-decorate.c | 6 +++---
  upload-pack.c| 2 +-
  walker.c | 2 +-
  9 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/builtin/fsck.c b/builtin/fsck.c
index 98fdeef5407..700739804fc 100644
--- a/builtin/fsck.c
+++ b/builtin/fsck.c
@@ -638,7 +638,7 @@ static int fsck_cache_tree(struct cache_tree *it)
  
  static void mark_object_for_connectivity(const struct object_id *oid)

  {
-   struct object *obj = lookup_unknown_object(oid->hash);
+   struct object *obj = lookup_unknown_object(the_repository, oid->hash);
obj->flags |= HAS_OBJ;
  }
  
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c

index 8c108109985..6eae39cf858 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -2689,7 +2689,7 @@ static void add_objects_in_unpacked_packs(struct rev_info 
*revs)
  
  		for (i = 0; i < p->num_objects; i++) {

nth_packed_object_oid(, p, i);
-   o = lookup_unknown_object(oid.hash);
+   o = lookup_unknown_object(the_repository, oid.hash);
if (!(o->flags & OBJECT_ADDED))
mark_in_pack_object(o, p, _pack);
o->flags |= OBJECT_ADDED;
diff --git a/http-push.c b/http-push.c
index 2615c823d60..04d95bd5da8 100644
--- a/http-push.c
+++ b/http-push.c
@@ -1429,7 +1429,7 @@ static void one_remote_ref(const char *refname)
 * may be required for updating server info later.
 */
if (repo->can_update_info_refs && !has_object_file(>old_oid)) {
-   obj = lookup_unknown_object(ref->old_oid.hash);
+   obj = lookup_unknown_object(the_repository, ref->old_oid.hash);
fprintf(stderr, "  fetch %s for %s\n",
oid_to_hex(>old_oid), refname);
add_fetch_request(obj);
diff --git a/object.c b/object.c
index 4de4fa58d59..def3c71cac2 100644
--- a/object.c
+++ b/object.c
@@ -177,7 +177,7 @@ void *object_as_type(struct object *obj, enum object_type 
type, int quiet)
}
  }
  
-struct object *lookup_unknown_object(const unsigned char *sha1)

+struct object *lookup_unknown_object_the_repository(const unsigned char *sha1)
  {
struct object *obj = lookup_object(the_repository, sha1);
if (!obj)
diff --git a/object.h b/object.h
index fa41d711f44..778f83bf0f7 100644
--- a/object.h
+++ b/object.h
@@ -139,7 +139,8 @@ struct object *parse_object_or_die(const struct object_id 
*oid, const char *name
  struct object *parse_object_buffer(const struct object_id *oid, enum 
object_type type, unsigned long size, void *buffer, int *eaten_p);
  
  /** Returns the object, with potentially excess memory allocated. **/

-struct object *lookup_unknown_object(const unsigned  char *sha1);
+#define lookup_unknown_object(r, s) lookup_unknown_object_##r(s)
+struct object *lookup_unknown_object_the_repository(const unsigned char *sha1);
  
  struct object_list *object_list_insert(struct object *item,

   struct object_list **list_p);
diff --git a/refs.c b/refs.c
index 23d53957deb..3b9e8463656 100644
--- a/refs.c
+++ b/refs.c
@@ -301,7 +301,7 @@ static int filter_refs(const char *refname, const struct 
object_id *oid,
  
  enum peel_status peel_object(const struct object_id *name, struct object_id *oid)

  {
-   struct object *o = lookup_unknown_object(name->hash);
+   struct object *o = lookup_unknown_object(the_repository, name->hash);
  
  	if (o->type == OBJ_NONE) {

int type = oid_object_info(the_repository, name, NULL);
diff --git a/t/helper/test-example-decorate.c b/t/helper/test-example-decorate.c
index 081115bf8eb..33e727f7fc5 100644
--- a/t/helper/test-example-decorate.c
+++ b/t/helper/test-example-decorate.c
@@ -26,8 +26,8 @@ int cmd__example_decorate(int argc, const char **argv)
 * Add 2 objects, one with a non-NULL decoration and one with a NULL
 * 

Re: [PATCH 06/35] blob: add repository argument to lookup_blob

2018-05-30 Thread Derrick Stolee

On 5/29/2018 8:47 PM, Stefan Beller wrote:

Add a repository argument to allow the callers of lookup_blob
to be more specific about which repository to act on. This is a small
mechanical change; it doesn't change the implementation to handle
repositories other than the_repository yet.

As with the previous commits, use a macro to catch callers passing a
repository other than the_repository at compile time.

Add the cocci patch that converted the callers.
(this is the last time I'll point this out. I'll wait until I read to 
the end to see if the coccinelle patch is included in the series.)


Signed-off-by: Jonathan Nieder 
Signed-off-by: Stefan Beller 
---
  blob.c   | 2 +-
  blob.h   | 3 ++-
  builtin/fast-export.c| 2 +-
  builtin/fsck.c   | 3 ++-
  builtin/index-pack.c | 2 +-
  builtin/merge-tree.c | 3 ++-
  builtin/unpack-objects.c | 2 +-
  fsck.c   | 2 +-
  http-push.c  | 3 ++-
  list-objects.c   | 2 +-
  object.c | 4 ++--
  reachable.c  | 2 +-
  revision.c   | 4 ++--
  tag.c| 2 +-
  walker.c | 3 ++-
  15 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/blob.c b/blob.c
index dada295698c..17b9314f0a0 100644
--- a/blob.c
+++ b/blob.c
@@ -5,7 +5,7 @@
  
  const char *blob_type = "blob";
  
-struct blob *lookup_blob(const struct object_id *oid)

+struct blob *lookup_blob_the_repository(const struct object_id *oid)
  {
struct object *obj = lookup_object(the_repository, oid->hash);
if (!obj)
diff --git a/blob.h b/blob.h
index 44606168310..08bc34487a0 100644
--- a/blob.h
+++ b/blob.h
@@ -9,7 +9,8 @@ struct blob {
struct object object;
  };
  
-struct blob *lookup_blob(const struct object_id *oid);

+#define lookup_blob(r, o) lookup_blob_##r(o)
+struct blob *lookup_blob_the_repository(const struct object_id *oid);
  
  int parse_blob_buffer(struct blob *item, void *buffer, unsigned long size);
  
diff --git a/builtin/fast-export.c b/builtin/fast-export.c

index a34ab9768f4..23ca46e6008 100644
--- a/builtin/fast-export.c
+++ b/builtin/fast-export.c
@@ -237,7 +237,7 @@ static void export_blob(const struct object_id *oid)
  
  	if (anonymize) {

buf = anonymize_blob();
-   object = (struct object *)lookup_blob(oid);
+   object = (struct object *)lookup_blob(the_repository, oid);
eaten = 0;
} else {
buf = read_object_file(oid, , );
diff --git a/builtin/fsck.c b/builtin/fsck.c
index 82a5acaf76e..6e2b204e938 100644
--- a/builtin/fsck.c
+++ b/builtin/fsck.c
@@ -808,7 +808,8 @@ int cmd_fsck(int argc, const char **argv, const char 
*prefix)
mode = active_cache[i]->ce_mode;
if (S_ISGITLINK(mode))
continue;
-   blob = lookup_blob(_cache[i]->oid);
+   blob = lookup_blob(the_repository,
+  _cache[i]->oid);
if (!blob)
continue;
obj = >object;
diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 0dd10693597..51c5244a15a 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -832,7 +832,7 @@ static void sha1_object(const void *data, struct 
object_entry *obj_entry,
if (strict || do_fsck_object) {
read_lock();
if (type == OBJ_BLOB) {
-   struct blob *blob = lookup_blob(oid);
+   struct blob *blob = lookup_blob(the_repository, oid);
if (blob)
blob->object.flags |= FLAG_CHECKED;
else
diff --git a/builtin/merge-tree.c b/builtin/merge-tree.c
index 8a8d5797520..f8023bae1e2 100644
--- a/builtin/merge-tree.c
+++ b/builtin/merge-tree.c
@@ -2,6 +2,7 @@
  #include "tree-walk.h"
  #include "xdiff-interface.h"
  #include "object-store.h"
+#include "repository.h"
  #include "blob.h"
  #include "exec-cmd.h"
  #include "merge-blobs.h"
@@ -170,7 +171,7 @@ static struct merge_list *create_entry(unsigned stage, 
unsigned mode, const stru
res->stage = stage;
res->path = path;
res->mode = mode;
-   res->blob = lookup_blob(oid);
+   res->blob = lookup_blob(the_repository, oid);
return res;
  }
  
diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c

index 8e454c48649..dfea7790fde 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -254,7 +254,7 @@ static void write_object(unsigned nr, enum object_type type,
added_object(nr, type, buf, size);
free(buf);
  
-		blob = lookup_blob(_list[nr].oid);

+   blob = lookup_blob(the_repository, _list[nr].oid);
if (blob)
blob->object.flags |= FLAG_WRITTEN;
 

Re: [PATCH v3 06/20] commit-graph: add 'verify' subcommand

2018-05-30 Thread Derrick Stolee

On 5/27/2018 6:55 PM, Jakub Narebski wrote:

Derrick Stolee  writes:


If the commit-graph file becomes corrupt, we need a way to verify
that its contents match the object database. In the manner of
'git fsck' we will implement a 'git commit-graph verify' subcommand
to report all issues with the file.

Add the 'verify' subcommand to the 'commit-graph' builtin and its
documentation. The subcommand is currently a no-op except for
loading the commit-graph into memory, which may trigger run-time
errors that would be caught by normal use.

So this commit is simply getting the boilerplate out of the way for
implementing 'git commit-graph verify' subcommand.  Good.


Add a simple test that
ensures the command returns a zero error code.

Nice.


If no commit-graph file exists, this is an acceptable state. Do
not report any errors.

All right.  I assume that as it is explicit verification call, it does
ignore core.commitGraph setting, isn't it?


Signed-off-by: Derrick Stolee 
---
  Documentation/git-commit-graph.txt |  6 ++
  builtin/commit-graph.c | 38 ++
  commit-graph.c | 26 ++
  commit-graph.h |  2 ++
  t/t5318-commit-graph.sh| 10 ++
  5 files changed, 82 insertions(+)

diff --git a/Documentation/git-commit-graph.txt 
b/Documentation/git-commit-graph.txt
index 4c97b555cc..a222cfab08 100644
--- a/Documentation/git-commit-graph.txt
+++ b/Documentation/git-commit-graph.txt
@@ -10,6 +10,7 @@ SYNOPSIS
  
  [verse]
  'git commit-graph read' [--object-dir ]
+'git commit-graph verify' [--object-dir ]
  'git commit-graph write'  [--object-dir ]

In alphabetical order, good.

  
  
@@ -52,6 +53,11 @@ existing commit-graph file.

  Read a graph file given by the commit-graph file and output basic
  details about the graph file. Used for debugging purposes.
  
+'verify'::

+
+Read the commit-graph file and verify its contents against the object
+database. Used to check for corrupted data.
+

All right, good enough description.

  
  EXAMPLES

  
diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
index f0875b8bf3..0433dd6e20 100644
--- a/builtin/commit-graph.c
+++ b/builtin/commit-graph.c
@@ -8,10 +8,16 @@
  static char const * const builtin_commit_graph_usage[] = {
N_("git commit-graph [--object-dir ]"),
N_("git commit-graph read [--object-dir ]"),
+   N_("git commit-graph verify [--object-dir ]"),
N_("git commit-graph write [--object-dir ] [--append] 
[--stdin-packs|--stdin-commits]"),
NULL
  };

In alphabetical order, same as in the manpage for git-commit-graph.

  
+static const char * const builtin_commit_graph_verify_usage[] = {

+   N_("git commit-graph verify [--object-dir ]"),
+   NULL
+};
+
  static const char * const builtin_commit_graph_read_usage[] = {
N_("git commit-graph read [--object-dir ]"),
NULL
@@ -29,6 +35,36 @@ static struct opts_commit_graph {
int append;
  } opts;
  
+

+static int graph_verify(int argc, const char **argv)
+{
+   struct commit_graph *graph = 0;
+   char *graph_name;
+
+   static struct option builtin_commit_graph_verify_options[] = {
+   OPT_STRING(0, "object-dir", _dir,
+  N_("dir"),
+  N_("The object directory to store the graph")),
+   OPT_END(),
+   };
+
+   argc = parse_options(argc, argv, NULL,
+builtin_commit_graph_verify_options,
+builtin_commit_graph_verify_usage, 0);
+
+   if (!opts.obj_dir)
+   opts.obj_dir = get_object_directory();

Getting the boilerplate of implementing the command mostly out of the
way.  Good.


+
+   graph_name = get_commit_graph_filename(opts.obj_dir);
+   graph = load_commit_graph_one(graph_name);

So we are verifying only the commit-graph file belonging directly to
current repository, as I have expected.  This is needed to for warnings
and error messages from the 'verify' action, to be able to tell in which
file there are problems.

This means that it is possible that there would be problems with
commit-graph files that running 'git commit-graph verify' would not
find, because they are in commit-graph file in one of the alternates.

It is very easy, though, to check all commit-graph files that would be
read and its data concatenated when using commit-graph feature
(e.g. 'git commit-graph read', IIRC):

   $ git commit-graph verify
   $ for obj_dir in $(cat .git/objects/info/alternates) do;
 git commit-graph --object-dir="$obj_dir";
 done

Note: I have not checked the above that it works.


+   FREE_AND_NULL(graph_name);

Freeing the resources, always nice to have.


+
+   if (!grap

Re: [PATCH 04/35] object: add repository argument to parse_object_buffer

2018-05-30 Thread Derrick Stolee

On 5/29/2018 8:47 PM, Stefan Beller wrote:

Add a repository argument to allow the callers of parse_object_buffer
to be more specific about which repository to act on. This is a small
mechanical change; it doesn't change the implementation to handle
repositories other than the_repository yet.

As with the previous commits, use a macro to catch callers passing a
repository other than the_repository at compile time.

Add the cocci patch that converted the callers.


Again, I'm missing the cocci patch here.



Signed-off-by: Jonathan Nieder 
Signed-off-by: Stefan Beller 
---
  builtin/fast-export.c| 3 ++-
  builtin/fsck.c   | 6 --
  builtin/index-pack.c | 3 ++-
  builtin/unpack-objects.c | 3 ++-
  object.c | 5 +++--
  object.h | 3 ++-
  ref-filter.c | 3 ++-
  7 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/builtin/fast-export.c b/builtin/fast-export.c
index 24d42842f9d..a34ab9768f4 100644
--- a/builtin/fast-export.c
+++ b/builtin/fast-export.c
@@ -245,7 +245,8 @@ static void export_blob(const struct object_id *oid)
die ("Could not read blob %s", oid_to_hex(oid));
if (check_object_signature(oid, buf, size, type_name(type)) < 0)
die("sha1 mismatch in blob %s", oid_to_hex(oid));
-   object = parse_object_buffer(oid, type, size, buf, );
+   object = parse_object_buffer(the_repository, oid, type,
+size, buf, );
}
  
  	if (!object)

diff --git a/builtin/fsck.c b/builtin/fsck.c
index 700739804fc..e6d6eb266eb 100644
--- a/builtin/fsck.c
+++ b/builtin/fsck.c
@@ -392,7 +392,8 @@ static int fsck_obj_buffer(const struct object_id *oid, 
enum object_type type,
 * verify_packfile(), data_valid variable for details.
 */
struct object *obj;
-   obj = parse_object_buffer(oid, type, size, buffer, eaten);
+   obj = parse_object_buffer(the_repository, oid, type, size, buffer,
+ eaten);
if (!obj) {
errors_found |= ERROR_OBJECT;
return error("%s: object corrupt or missing", oid_to_hex(oid));
@@ -522,7 +523,8 @@ static struct object *parse_loose_object(const struct 
object_id *oid,
if (!contents && type != OBJ_BLOB)
die("BUG: read_loose_object streamed a non-blob");
  
-	obj = parse_object_buffer(oid, type, size, contents, );

+   obj = parse_object_buffer(the_repository, oid, type, size,
+ contents, );
  
  	if (!eaten)

free(contents);
diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index e2f670bef9e..0dd10693597 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -848,7 +848,8 @@ static void sha1_object(const void *data, struct 
object_entry *obj_entry,
 * we do not need to free the memory here, as the
 * buf is deleted by the caller.
 */
-   obj = parse_object_buffer(oid, type, size, buf,
+   obj = parse_object_buffer(the_repository, oid, type,
+ size, buf,
  );
if (!obj)
die(_("invalid %s"), type_name(type));
diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 9a4d2708123..8e454c48649 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -265,7 +265,8 @@ static void write_object(unsigned nr, enum object_type type,
int eaten;
hash_object_file(buf, size, type_name(type), _list[nr].oid);
added_object(nr, type, buf, size);
-   obj = parse_object_buffer(_list[nr].oid, type, size, buf,
+   obj = parse_object_buffer(the_repository, _list[nr].oid,
+ type, size, buf,
  );
if (!obj)
die("invalid %s", type_name(type));
diff --git a/object.c b/object.c
index def3c71cac2..4250ddd3f7f 100644
--- a/object.c
+++ b/object.c
@@ -186,7 +186,7 @@ struct object *lookup_unknown_object_the_repository(const 
unsigned char *sha1)
return obj;
  }
  
-struct object *parse_object_buffer(const struct object_id *oid, enum object_type type, unsigned long size, void *buffer, int *eaten_p)

+struct object *parse_object_buffer_the_repository(const struct object_id *oid, 
enum object_type type, unsigned long size, void *buffer, int *eaten_p)
  {
struct object *obj;
*eaten_p = 0;
@@ -278,7 +278,8 @@ struct object *parse_object_the_repository(const struct 
object_id *oid)
return NULL;
}
  
-		obj = parse_object_buffer(oid, type, size, buffer, );

+   obj = 

Re: [PATCH v3 09/20] commit-graph: verify corrupt OID fanout and lookup

2018-05-30 Thread Derrick Stolee



On 5/30/2018 9:34 AM, Jakub Narebski wrote:

Derrick Stolee  writes:


In the commit-graph file, the OID fanout chunk provides an index into
the OID lookup. The 'verify' subcommand should find incorrect values
in the fanout.

Similarly, the 'verify' subcommand should find out-of-order values in
the OID lookup.

O.K., the OID Lookup chunk is checked together with OID Fanout chunk
because those two chunks are tightly connected: OID Fanout is fanout
into OID Lookup.


Signed-off-by: Derrick Stolee 
---
  commit-graph.c  | 36 
  t/t5318-commit-graph.sh | 22 ++
  2 files changed, 58 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index 06e3e4f9ba..cbd1aae514 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -855,6 +855,9 @@ static void graph_report(const char *fmt, ...)
  
  int verify_commit_graph(struct commit_graph *g)

  {
+   uint32_t i, cur_fanout_pos = 0;
+   struct object_id prev_oid, cur_oid;

Minor nitpick about the naming convention: why cur_*, and not curr_*?


+
if (!g) {
graph_report("no commit-graph file loaded");
return 1;
@@ -869,5 +872,38 @@ int verify_commit_graph(struct commit_graph *g)
if (!g->chunk_commit_data)
graph_report("commit-graph is missing the Commit Data chunk");
  
+	if (verify_commit_graph_error)

+   return verify_commit_graph_error;
+

Before checking that commits in OID Lookup are sorted, and that OID
Fanout correctly points into OID Lookup (thus also checking that OID
Lookup is sorted), it would be good to verify that sizes of those chunks
are correct.

Out of 4 standrd chunks, 1 of them (OIDF) has constant size, and 2 of
them have size given by number of commits and hash size
  - OID Fanout is (256 * 4 bytes)
  - OID Lookup is (N * H bytes),
where N is number of commits, and H is hash size

The one that is more significant is if number of commits gets corrupted
upwards, and walking through OID Lookup would lead us out of bounds,
outside the file size.

IIRC we have checked that all chunks fit within file size, isn't it?


+   for (i = 0; i < g->num_commits; i++) {
+   hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);

Why do you use hashcpy() here, but oidcpy() below?


We are copying from a section of binary data, not from a struct object_id *.




+
+   if (i && oidcmp(_oid, _oid) >= 0)

All right, OIDs needs to be sorted in ascending lexicographical order,
and the above condition checks that this constraint is fullfilled.


+   graph_report("commit-graph has incorrect OID order: %s then 
%s",
+oid_to_hex(_oid),
+oid_to_hex(_oid));

Nice informative error message.


+
+   oidcpy(_oid, _oid);
+
+   while (cur_oid.hash[0] > cur_fanout_pos) {
+   uint32_t fanout_value = get_be32(g->chunk_oid_fanout + 
cur_fanout_pos);
+   if (i != fanout_value)
+   graph_report("commit-graph has incorrect fanout 
value: fanout[%d] = %u != %u",
+cur_fanout_pos, fanout_value, i);
+
+   cur_fanout_pos++;
+   }

The k-th entry of fanout, F[k], stores the number of OIDs with first
byte at most k.

Let's examine it in detail on a simple example:

fanout lookup
-- --
0 : 0  ---> 0: 1
1 : 2  -\   1: 1
2 : 2  --\> 2: 3
3 : 3  ---> 3: 4

We are checking that after most significant byte (first byte) changes,
then all fanout position up to the current first byte value (exclusive)
point to current position in OID Lookup chunk.

Seems all right; it would be nice to come up with rigorous proof that it
is all right.


+   }
+
+   while (cur_fanout_pos < 256) {
+   uint32_t fanout_value = get_be32(g->chunk_oid_fanout + 
cur_fanout_pos);
+
+   if (g->num_commits != fanout_value)
+   graph_report("commit-graph has incorrect fanout value: 
fanout[%d] = %u != %u",
+cur_fanout_pos, fanout_value, i);
+
+   cur_fanout_pos++;
+   }

All right, this checks that all remaining fanout entries, up and
including the 255-ith entry store the total number of commits, N.


+
return verify_commit_graph_error;
  }
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 4ef3fe3dc2..c050ef980b 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -247,6 +247,7 @@ test_expect_success 'git commit-graph verify' '
git commit-graph verify >output
  '
 

Re: [PATCH 27/35] commit-slabs: remove realloc counter outside of slab struct

2018-05-30 Thread Derrick Stolee

On 5/29/2018 8:48 PM, Stefan Beller wrote:

The realloc counter is declared outside the struct for the given slabname,
which makes it harder for a follow up patch to move the declaration of the
struct around as then the counter variable would need special treatment.

As the reallocation counter is currently unused we can just remove it.
If we ever need to count the reallocations again, we can reintroduce
the counter as part of 'struct slabname' in commit-slab-decl.h.


It's worth noting that this patch is different from the other mechanical 
patches. I do agree that we should remove this unused portion of the 
slab API. Likely this was built for testing purposes.


CC'ing Junio, who I found introduced this in a84b794ad "commit-slab: 
introduce a macro to define a slab for new type". Looking at that diff, 
the realloc portion did not appear to exist in the previous 
in-degree-specific version of slabs.




Signed-off-by: Stefan Beller 
---
  commit-slab-impl.h | 3 ---
  1 file changed, 3 deletions(-)

diff --git a/commit-slab-impl.h b/commit-slab-impl.h
index 87a9cadfcca..ac1e6d409ad 100644
--- a/commit-slab-impl.h
+++ b/commit-slab-impl.h
@@ -11,8 +11,6 @@
  
  #define implement_commit_slab(slabname, elemtype, scope)		\

\
-static int stat_ ##slabname## realloc; \
-   \
  scope void init_ ##slabname## _with_stride(struct slabname *s,
\
   unsigned stride) \
  { \
@@ -54,7 +52,6 @@ scope elemtype *slabname## _at_peek(struct slabname *s,   
\
if (!add_if_missing)\
return NULL;\
REALLOC_ARRAY(s->slab, nth_slab + 1);\
-   stat_ ##slabname## realloc++;   \
for (i = s->slab_count; i <= nth_slab; i++)   \
s->slab[i] = NULL;   \
s->slab_count = nth_slab + 1;\


Re: [RFC PATCH 00/35] object-store: lookup_commit

2018-05-30 Thread Derrick Stolee

On 5/29/2018 11:18 PM, Stefan Beller wrote:

On Tue, May 29, 2018 at 6:05 PM, Derrick Stolee  wrote:

On 5/29/2018 8:47 PM, Stefan Beller wrote:

This applies on the merge of nd/commit-util-to-slab and
sb/object-store-grafts,
and is available at http://github.com/stefanbeller/ as branch
object-store-lookup-commit
as the merge has some merge conflicts as well as syntactical conflicts
(upload-pack.c
and fetch-pack.c introduce new calls of functions that would want to take
a repository struct
in the object-store-grafts series)

As layed out in
https://public-inbox.org/git/20180517225154.9200-1-sbel...@google.com/
this is getting close to finishing the set of object store series though
the last
unfinished part of this RFC hints at new work on the plate:
* To give this series a nice polish, we'd want to convert parse_commit,
too.
But that requires the conversion of the new commit graph. Maybe we need
to split this series into 2.


I'll take a look at this series tomorrow. I've been working in
ds/commit-graph-fsck to make many of the methods take a 'struct commit_graph
*' parameter, which could easily be 'r->commit_graph' for a 'struct
the_repository *r' instead of the global 'commit_graph'. Those graph-local
methods will make the transformation to be repo-local a lot easier. (There
still may be some need to insert a repository, though.)

Since you are working on the commit-slab stuff in this patch, I'll (continue
to) delay my patch series on using the commit-slab for generation numbers to
avoid collisions with your work.

This series touches the commit slabs in the last few commits only.
I don't think there will be huge conflicts, as all I had to do was
move the definition from a global to inside the parsed object store.


I went through the whole series and didn't find any issue with the code. 
The commit messages reference coccinelle scripts that I never saw, so 
those should be struck from the messages.


If anyone else plans to review, these two commits are worth special 
attention compared to the others:


[PATCH 27/35] commit-slabs: remove realloc counter outside of slab struct
[PATCH 28/35] commit.c: migrate the commit buffer to the parsed object store

I found no issue with them, but they do actually change behavior slightly as 
opposed to the other commits which have no change other than to relax the 
dependence on the_repository as a global. I also like the pattern of storing 
the slabs inside the parsed object structure in the_repository. I'll use a 
similar pattern when looking into using slabs for generation number.

For Junio: I know this will conflict with ds/commit-graph-fsck (specifically, I 
add new references to lookup_commit() and manipulate parse_commit_gently()). If 
this series is ready to be queued quickly, I can rebase my v4 onto these 
commits as they are placed into 'pu' (and please eject ds/commit-graph-fsck at 
that time).

Thanks,
-Stolee



Re: [RFC PATCH 00/35] object-store: lookup_commit

2018-05-29 Thread Derrick Stolee

On 5/29/2018 8:47 PM, Stefan Beller wrote:

This applies on the merge of nd/commit-util-to-slab and sb/object-store-grafts,
and is available at http://github.com/stefanbeller/ as branch 
object-store-lookup-commit
as the merge has some merge conflicts as well as syntactical conflicts 
(upload-pack.c
and fetch-pack.c introduce new calls of functions that would want to take a 
repository struct
in the object-store-grafts series)

As layed out in 
https://public-inbox.org/git/20180517225154.9200-1-sbel...@google.com/
this is getting close to finishing the set of object store series though the 
last
unfinished part of this RFC hints at new work on the plate:
* To give this series a nice polish, we'd want to convert parse_commit, too.
   But that requires the conversion of the new commit graph. Maybe we need
   to split this series into 2.


I'll take a look at this series tomorrow. I've been working in 
ds/commit-graph-fsck to make many of the methods take a 'struct 
commit_graph *' parameter, which could easily be 'r->commit_graph' for a 
'struct the_repository *r' instead of the global 'commit_graph'. Those 
graph-local methods will make the transformation to be repo-local a lot 
easier. (There still may be some need to insert a repository, though.)


Since you are working on the commit-slab stuff in this patch, I'll 
(continue to) delay my patch series on using the commit-slab for 
generation numbers to avoid collisions with your work.


Thanks,

-Stolee



Re: [PATCH] commit-graph: fix a sparse 'integer as NULL pointer' warning

2018-05-29 Thread Derrick Stolee

n 5/29/2018 4:01 PM, Ramsay Jones wrote:

Signed-off-by: Ramsay Jones 
---

Hi Derrick,

If you need to re-roll your 'ds/commit-graph-fsck' branch (pu@a84e06bc0f),
could you please squash this into the relevant patch (commit 80453b4529,
"commit-graph: add 'verify' subcommand", 2018-05-24).

[No, No, that was the one in graph_read(). :-D ]


Thanks for helping rid me of my bad habits! I'll get this one in v4.



Thanks!

ATB,
Ramsay Jones

  builtin/commit-graph.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
index 20ce6437a..6abfde5e6 100644
--- a/builtin/commit-graph.c
+++ b/builtin/commit-graph.c
@@ -39,7 +39,7 @@ static struct opts_commit_graph {
  
  static int graph_verify(int argc, const char **argv)

  {
-   struct commit_graph *graph = 0;
+   struct commit_graph *graph = NULL;
char *graph_name;
  
  	static struct option builtin_commit_graph_verify_options[] = {


Re: [PATCH v3 07/20] commit-graph: verify catches corrupt signature

2018-05-29 Thread Derrick Stolee

On 5/28/2018 10:05 AM, Jakub Narebski wrote:

Derrick Stolee  writes:


This is the first of several commits that add a test to check that
'git commit-graph verify' catches corruption in the commit-graph
file. The first test checks that the command catches an error in
the file signature. This is a check that exists in the existing
commit-graph reading code.

Good start.

This handles 3 out of 5 checks in load_commit_graph_one().  The
remaining are:
* too short file (length smaller than minimal commit-graph size)
* more than one chunk of one of 4 defined types


Add a helper method 'corrupt_graph_and_verify' to the test script
t5318-commit-graph.sh. This helper corrupts the commit-graph file
at a certain location, runs 'git commit-graph verify', and reports
the output to the 'err' file. This data is filtered to remove the
lines added by 'test_must_fail' when the test is run verbosely.
Then, the output is checked to contain a specific error message.

Thanks for an explanation.


Signed-off-by: Derrick Stolee 
---
  t/t5318-commit-graph.sh | 43 +++
  1 file changed, 43 insertions(+)

diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 6ca451dfd2..bd64481c7a 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -235,9 +235,52 @@ test_expect_success 'perform fast-forward merge in full 
repo' '
test_cmp expect output
  '
  
+# the verify tests below expect the commit-graph to contain

+# exactly the commits reachable from the commits/8 branch.
+# If the file changes the set of commits in the list, then the
+# offsets into the binary file will result in different edits
+# and the tests will likely break.
+
  test_expect_success 'git commit-graph verify' '
cd "$TRASH_DIRECTORY/full" &&
+   git rev-parse commits/8 | git commit-graph write --stdin-commits &&
git commit-graph verify >output
  '

I don't quite understand what the change is meant to do.


This gives us a constant commit-graph file to work with in the later tests.

To get the "independent test" structure you want for the tests that are 
coming, we need to do one of the following:


1. Write a new commit-graph file for every test (slows things down).
2. Do all corruption/verify checks in a single test (reduces the 
information from a failed test, as it only reports the first failure).


I don't like either of these options, so I went with this "prepare" step.


Also, as I said earlier, I'd prefer if tests were as indpendent of each
other as possible, to make running individual tests (e.g. only
previously falling tests) easier.

I especially do not like mixing running actual test with setting up the
repository for future tests, as here.

  
+GRAPH_BYTE_VERSION=4

+GRAPH_BYTE_HASH=5
+
+# usage: corrupt_graph_and_verify   
+# Manipulates the commit-graph file at the position
+# by inserting the data, then runs 'git commit-graph verify'
+# and places the output in the file 'err'. Test 'err' for
+# the given string.

Very nice to have this description.


+corrupt_graph_and_verify() {
+   pos=$1
+   data="${2:-\0}"
+   grepstr=$3
+   cd "$TRASH_DIRECTORY/full" &&
+   test_when_finished mv commit-graph-backup $objdir/info/commit-graph &&
+   cp $objdir/info/commit-graph commit-graph-backup &&
+   printf "$data" | dd of="$objdir/info/commit-graph" bs=1 seek="$pos" 
conv=notrunc &&

Using 'printf' with octal is much more portable than relying on 'echo'
supporting octal escape sequences (or supporting escape sequences at
all).


+   test_must_fail git commit-graph verify 2>test_err &&
+   grep -v "^+" test_err >err
+   grep "$grepstr" err

Shouldn't this last 'grep' be 'test_i18ngrep' instead, to allow for
translated messages from 'git commit-graph verify' / 'git fsck'?


+}

This function makes actual tests short and simple, without duplicated
code.  Very good.


+
+test_expect_success 'detect bad signature' '
+   corrupt_graph_and_verify 0 "\0" \
+   "graph signature"
+'
+
+test_expect_success 'detect bad version' '
+   corrupt_graph_and_verify $GRAPH_BYTE_VERSION "\02" \
+   "graph version"
+'
+
+test_expect_success 'detect bad hash version' '
+   corrupt_graph_and_verify $GRAPH_BYTE_HASH "\02" \

When we move from SHA-1 (hash version 1) to NewHash (hash version 2),
this test would start failing... which is actually not a bad idea.


+   "hash version"
+'
+
  test_done




Re: [PATCH v3 03/20] commit-graph: parse commit from chosen graph

2018-05-29 Thread Derrick Stolee

On 5/27/2018 6:23 AM, Jakub Narebski wrote:

Derrick Stolee  writes:


Before verifying a commit-graph file against the object database, we
need to parse all commits from the given commit-graph file. Create
parse_commit_in_graph_one() to target a given struct commit_graph.

If I understand it properly the problem is that when verifying against
the object database we want to check one single commit-graph file, not
concatenation of data from commit-graph file for the repository and
commit-graph files from its alternates -- like prepare_commit_graph()
does; which is called by parse_commit_in_graph().


Signed-off-by: Derrick Stolee 

O.K., so you introduce here a layer of indirection; parse_commit_in_graph()
now just uses parse_commit_in_graph_one(), passing core_commit_graph
(or the_commit_graph) to it, after checking that core_commit_graph is set
(which handles the case when feature is not turned off) and loading
commit-graph file.

Nice and simple 'split function' refactoring, with new function taking
over when there is commit graph file prepared.


So, after the changes:
* parse_commit_in_graph() is responsible for checking whether to use
   commit-graph feature and ensuring that data from commit-graph is
   loaded, where it passes the control to parse_commit_in_graph_one()
* parse_commit_in_graph_one() checks whether commit-graph feature is
   turned on, whether commit we are interested in was already parsed,
   and then uses fill_commit_in_graph() to actually get the data
* fill_commit_in_graph() gets data out of commit-graph file, extracting
   it from commit data chunk (and if needed large edges chunk).

All those functions return 1 if they got data from commit-graph, and 0
if they didn't.


One minor nitpick / complaint / question is about naming of global
variables used here, namely:
* static struct commit_graph *commit_graph
   from commit-graph.c for global storage of commit-graph[s] data
* int core_commit_graph
   from environment.c for storing core.commitGraph config

But I see that at least the latter is common convention in Git source
code; I guess that the former maybe follows convention as used for "the
index" and "the repository" - additionally it is static / file-local.


See also `struct packed_git *packed_git;` from cache.h.




---
  commit-graph.c | 18 +++---
  1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 82295f0975..78ba0edc80 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -310,7 +310,7 @@ static int find_commit_in_graph(struct commit *item, struct 
commit_graph *g, uin
}
  }
  
-int parse_commit_in_graph(struct commit *item)

+static int parse_commit_in_graph_one(struct commit_graph *g, struct commit 
*item)
  {
uint32_t pos;
  
@@ -318,9 +318,21 @@ static int parse_commit_in_graph_one(struct commit_graph *g, struct commit *item)

if (!core_commit_graph)
return 0;

All right, we check that commit-graph feature is enabled because
parse_commit_in_graph_one() will be used standalone, not by being
invoked from parse_commit_in_graph().

This check is fast.


if (item->object.parsed)
return 1;

Sidenote: I just wonder why object.parsed and not for example
object.graph_pos is used to checck whether the object was filled if
possible with commit-graph data...


Here we are filling all data, including commit date and parents. We use 
load_commit_graph_info() when we only need the graph_pos and generation 
values.





+
+   if (find_commit_in_graph(item, g, ))
+   return fill_commit_in_graph(item, g, pos);
+
+   return 0;
+}
+
+int parse_commit_in_graph(struct commit *item)
+{
+   if (!core_commit_graph)
+   return 0;

All right, this check is here to short-circuit and make it so git does
not even try to lod commit-graph file[s] if the feature is disabled.


+
prepare_commit_graph();
-   if (commit_graph && find_commit_in_graph(item, commit_graph, ))
-   return fill_commit_in_graph(item, commit_graph, pos);
+   if (commit_graph)
+   return parse_commit_in_graph_one(commit_graph, item);
return 0;
  }




Re: [PATCH] t990X: use '.git/objects' as 'deep inside .git' path

2018-05-29 Thread Derrick Stolee

On 5/27/2018 12:49 AM, Michael Haggerty wrote:

On Sat, May 26, 2018 at 8:47 AM, Christian Couder
 wrote:

Tests t9902-completion.sh and t9903-bash-prompt.sh each have tests
that check what happens when we are "in the '.git' directory" and
when we are "deep inside the '.git' directory".

To test the case when we are "deep inside the '.git' directory" the
test scripts used to perform a `cd .git/refs/heads`.

As there are plans to implement other ref storage systems, let's
use '.git/objects' instead of '.git/refs/heads' as the "deep inside
the '.git' directory" path.

Seems reasonable to me. +1.

Michael

Looks good to me, too.

Thanks,
-Stolee


Re: [PATCH v3 00/20] Integrate commit-graph into 'fsck' and 'gc'

2018-05-29 Thread Derrick Stolee

On 5/29/2018 12:27 AM, Junio C Hamano wrote:

Derrick Stolee  writes:


Thanks for all the feedback on v2. I've tried to make this round's
review a bit easier by splitting up the commits into smaller pieces.
Also, the test script now has less boilerplate and uses variables and
clear arithmetic to explain which bytes are being modified.

One other change worth mentioning: in "commit-graph: add '--reachable'
option" I put the ref-iteration into a new external
'write_commit_graph_reachable()' method inside commit-graph.c. This
makes the 'gc: automatically write commit-graph files' a simpler change.

I finally managed to find time to resolve conflicts this topic has
with other topics (of your own included, if I am not mistaken).
Please double check the resolution when I push out the day's
integration result later today.


Sorry about the confusion. I was operating on my copy of 
ds/generation-numbers (34fdd43339) but fetching just now I see you 
updated that branch to 1472978ec6. I reset my local branch to 
ds/commit-graph-fsk (53dd1e6600). The only diff I see between my v3 
branch and that commit are the changes from ds/commit-graph-lockfile-fix 
(33286dcd6d).


Thanks,
-Stolee


[RFC PATCH 5/6] fetch: destroy commit graph on shallow parameters

2018-05-31 Thread Derrick Stolee
The commit-graph feature is not compatible with history-rewriting
features such as shallow clones. When running a 'git fetch' with
any of the shallow/unshallow options, destroy the commit-graph
file and override core.commitGraph to be false.

Signed-off-by: Derrick Stolee 
---
 builtin/fetch.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/builtin/fetch.c b/builtin/fetch.c
index af896e4b74..2001dca881 100644
--- a/builtin/fetch.c
+++ b/builtin/fetch.c
@@ -1452,6 +1452,12 @@ int cmd_fetch(int argc, const char **argv, const char 
*prefix)
}
}
 
+   if (update_shallow || depth || deepen_since || deepen_not.nr ||
+   deepen_relative || unshallow || update_shallow || 
is_repository_shallow()) {
+   destroy_commit_graph(get_object_directory());
+   core_commit_graph = 0;
+   }
+
if (remote) {
if (filter_options.choice || repository_format_partial_clone)
fetch_one_setup_partial(remote);
-- 
2.16.2.338.gcfe06ae955



[RFC PATCH 2/6] DO NOT MERGE: write commit-graph on every fetch

2018-05-31 Thread Derrick Stolee
THIS IS ONLY FOR TESTING TO INCREASE TEST COVERAGE

Signed-off-by: Derrick Stolee 
---
 builtin/fetch.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/builtin/fetch.c b/builtin/fetch.c
index 8ee998ea2e..af896e4b74 100644
--- a/builtin/fetch.c
+++ b/builtin/fetch.c
@@ -20,6 +20,7 @@
 #include "utf8.h"
 #include "packfile.h"
 #include "list-objects-filter-options.h"
+#include "commit-graph.h"
 
 static const char * const builtin_fetch_usage[] = {
N_("git fetch [] [ [...]]"),
@@ -1480,6 +1481,9 @@ int cmd_fetch(int argc, const char **argv, const char 
*prefix)
 
close_all_packs();
 
+   if (core_commit_graph)
+   write_commit_graph_reachable(get_object_directory(), 1);
+
argv_array_pushl(_gc_auto, "gc", "--auto", NULL);
if (verbosity < 0)
argv_array_push(_gc_auto, "--quiet");
-- 
2.16.2.338.gcfe06ae955



[RFC PATCH 1/6] DO NOT MERGE: compute commit-graph on every commit

2018-05-31 Thread Derrick Stolee
Also enable core.commitGraph and gc.commitGraph. Helps to test the
commit-graph feature with the rest of the test infrastructure.

Signed-off-by: Derrick Stolee 
---
 builtin/commit.c | 5 +
 builtin/gc.c | 2 +-
 environment.c| 2 +-
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/builtin/commit.c b/builtin/commit.c
index e8e8d13be4..8751b816c1 100644
--- a/builtin/commit.c
+++ b/builtin/commit.c
@@ -32,6 +32,7 @@
 #include "column.h"
 #include "sequencer.h"
 #include "mailmap.h"
+#include "commit-graph.h"
 
 static const char * const builtin_commit_usage[] = {
N_("git commit [] [--] ..."),
@@ -1623,5 +1624,9 @@ int cmd_commit(int argc, const char **argv, const char 
*prefix)
 
UNLEAK(err);
UNLEAK(sb);
+
+   if (core_commit_graph)
+   write_commit_graph_reachable(get_object_directory(), 1);
+
return 0;
 }
diff --git a/builtin/gc.c b/builtin/gc.c
index efd214a59f..999c09d8e2 100644
--- a/builtin/gc.c
+++ b/builtin/gc.c
@@ -35,7 +35,7 @@ static int aggressive_depth = 50;
 static int aggressive_window = 250;
 static int gc_auto_threshold = 6700;
 static int gc_auto_pack_limit = 50;
-static int gc_commit_graph = 0;
+static int gc_commit_graph = 1;
 static int detach_auto = 1;
 static timestamp_t gc_log_expire_time;
 static const char *gc_log_expire = "1.day.ago";
diff --git a/environment.c b/environment.c
index 8853e2f0dd..fdb2d56d90 100644
--- a/environment.c
+++ b/environment.c
@@ -62,7 +62,7 @@ enum push_default_type push_default = 
PUSH_DEFAULT_UNSPECIFIED;
 enum object_creation_mode object_creation_mode = OBJECT_CREATION_MODE;
 char *notes_ref_name;
 int grafts_replace_parents = 1;
-int core_commit_graph;
+int core_commit_graph = 1;
 int core_apply_sparse_checkout;
 int merge_log_config = -1;
 int precomposed_unicode = -1; /* see probe_utf8_pathname_composition() */
-- 
2.16.2.338.gcfe06ae955



[RFC PATCH 4/6] commit-graph: avoid writing when repo is shallow

2018-05-31 Thread Derrick Stolee
Shallow clones do not interact well with the commit-graph feature for
several reasons. Instead of doing the hard thing to fix those
interactions, instead prevent reading or writing a commit-graph file for
shallow repositories.

Signed-off-by: Derrick Stolee 
---
 commit-graph.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index 95af4ed519..80e377b90f 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -208,6 +208,9 @@ static void prepare_commit_graph(void)
return;
prepare_commit_graph_run_once = 1;
 
+   if (is_repository_shallow())
+   return;
+
obj_dir = get_object_directory();
prepare_commit_graph_one(obj_dir);
prepare_alt_odb();
@@ -711,6 +714,15 @@ void write_commit_graph(const char *obj_dir,
int num_extra_edges;
struct commit_list *parent;
 
+   /*
+* Shallow clones are not supproted, as they create bad
+* generation skips as they are un-shallowed.
+*/
+   if (is_repository_shallow()) {
+   warning("writing a commit-graph in a shallow repository is not 
supported");
+   return;
+   }
+
oids.nr = 0;
oids.alloc = approximate_object_count() / 4;
 
-- 
2.16.2.338.gcfe06ae955



[RFC PATCH 6/6] commit-graph: revert to odb on missing parents

2018-05-31 Thread Derrick Stolee
The commit-graph format includes a way to specify a parent is
"missing" from the commit-graph (i.e. we do not have a record of
that parent in our list of object IDs, and hence cannot provide
a graph position). For mose cases, this does not occur due to
the close_reachable() method adding all reachable commits. However,
in a shallow clone, we will try to record the parents of a commit
on the shallow boundary, but the parents are not in the repository.

The GRAPH_PARENT_MISSING value that is stored in the format is
purposeful, especially for future plans to make the commit-graph file
incremental or transporting sections of a commit-graph file across
the network.

In the meantime, check if a commit has a missing parent while filling
its details from the commit-graph. If a parent is missing, still
assign the generation number and graph position for that item, but
report that the commit-graph failed to fill the contents. Then the
caller is responsible for filling the rest of the data from a commit
buffer.

Signed-off-by: Derrick Stolee 
---
 commit-graph.c | 33 ++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 80e377b90f..3e33d061fe 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -278,17 +278,44 @@ static int fill_commit_in_graph(struct commit *item, 
struct commit_graph *g, uin
struct commit_list **pptr;
const unsigned char *commit_data = g->chunk_commit_data + (g->hash_len 
+ 16) * pos;
 
-   item->object.parsed = 1;
+   item->generation = get_be32(commit_data + g->hash_len + 8) >> 2;
item->graph_pos = pos;
 
+   /*
+* If we have any edges marked as GRAPH_PARENT_MISSING, we must not 
parse any
+* more of this object and leave it to the commit buffer to parse.
+*/
+   edge_value = get_be32(commit_data + g->hash_len);
+   if (edge_value == GRAPH_PARENT_MISSING)
+   return 0;
+   if (edge_value == GRAPH_PARENT_NONE)
+   goto continue_parsing;
+
+   edge_value = get_be32(commit_data + g->hash_len + 4);
+   if (edge_value == GRAPH_PARENT_MISSING)
+   return 0;
+   if (edge_value == GRAPH_PARENT_NONE)
+   goto continue_parsing;
+   if (!(edge_value & GRAPH_OCTOPUS_EDGES_NEEDED))
+   goto continue_parsing;
+
+   parent_data_ptr = (uint32_t*)(g->chunk_large_edges +
+ 4 * (uint64_t)(edge_value & GRAPH_EDGE_LAST_MASK));
+   do {
+   edge_value = get_be32(parent_data_ptr);
+   if (edge_value == GRAPH_PARENT_MISSING)
+   return 0;
+   parent_data_ptr++;
+   } while (!(edge_value & GRAPH_LAST_EDGE));
+   
+continue_parsing:
+   item->object.parsed = 1;
item->maybe_tree = NULL;
 
date_high = get_be32(commit_data + g->hash_len + 8) & 0x3;
date_low = get_be32(commit_data + g->hash_len + 12);
item->date = (timestamp_t)((date_high << 32) | date_low);
 
-   item->generation = get_be32(commit_data + g->hash_len + 8) >> 2;
-
pptr = >parents;
 
edge_value = get_be32(commit_data + g->hash_len);
-- 
2.16.2.338.gcfe06ae955



[RFC PATCH 3/6] commit-graph: enable replace-object and grafts

2018-05-31 Thread Derrick Stolee
Create destroy_commit_graph() method to delete the commit-graph file
when history is altered by a replace-object call. If the commit-graph
is rebuilt after that, we will load the correct object while reading
the commit-graph.

When parsing a commit, first check if the commit was grafted. If so,
then ignore the commit-graph for that commit and insted use the parents
loaded by parsing the commit buffer and comparing against the graft
file. 

Signed-off-by: Derrick Stolee 
---
 builtin/replace.c |  3 +++
 commit-graph.c| 20 +++-
 commit-graph.h|  9 +
 commit.c  |  5 +
 4 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/builtin/replace.c b/builtin/replace.c
index 9f01f3fc7f..d553aadcdc 100644
--- a/builtin/replace.c
+++ b/builtin/replace.c
@@ -15,6 +15,7 @@
 #include "parse-options.h"
 #include "run-command.h"
 #include "tag.h"
+#include "commit-graph.h"
 
 static const char * const git_replace_usage[] = {
N_("git replace [-f]  "),
@@ -468,6 +469,8 @@ int cmd_replace(int argc, const char **argv, const char 
*prefix)
usage_msg_opt("--raw only makes sense with --edit",
  git_replace_usage, options);
 
+   destroy_commit_graph(get_object_directory());
+
switch (cmdmode) {
case MODE_DELETE:
if (argc < 1)
diff --git a/commit-graph.c b/commit-graph.c
index e9195dfb17..95af4ed519 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -6,6 +6,7 @@
 #include "pack.h"
 #include "packfile.h"
 #include "commit.h"
+#include "dir.h"
 #include "object.h"
 #include "refs.h"
 #include "revision.h"
@@ -240,15 +241,22 @@ static struct commit_list **insert_parent_or_die(struct 
commit_graph *g,
 {
struct commit *c;
struct object_id oid;
+   const unsigned char *real;
 
if (pos >= g->num_commits)
die("invalid parent position %"PRIu64, pos);
 
hashcpy(oid.hash, g->chunk_oid_lookup + g->hash_len * pos);
+
+   real = lookup_replace_object(oid.hash);
+
c = lookup_commit();
if (!c)
die("could not find commit %s", oid_to_hex());
-   c->graph_pos = pos;
+
+   if (!hashcmp(real, oid.hash))
+   c->graph_pos = pos;
+
return _list_insert(c, pptr)->next;
 }
 
@@ -1019,3 +1027,13 @@ int verify_commit_graph(struct commit_graph *g)
 
return verify_commit_graph_error;
 }
+
+void destroy_commit_graph(const char *obj_dir)
+{
+   char *graph_name;
+   close_commit_graph();
+
+   graph_name = get_commit_graph_filename(obj_dir);
+   remove_path(graph_name);
+   FREE_AND_NULL(graph_name);
+}
diff --git a/commit-graph.h b/commit-graph.h
index 9a06a5f188..1d17da1582 100644
--- a/commit-graph.h
+++ b/commit-graph.h
@@ -56,4 +56,13 @@ void write_commit_graph(const char *obj_dir,
 
 int verify_commit_graph(struct commit_graph *g);
 
+/*
+ * Delete the commit-graph file in the given object directory.
+ *
+ * WARNING: this deletes data, so should only be used when
+ * performing history-altering actions like replace-object
+ * or grafts.
+ */
+void destroy_commit_graph(const char *obj_dir);
+
 #endif
diff --git a/commit.c b/commit.c
index 6eaed0174c..2fe31cde77 100644
--- a/commit.c
+++ b/commit.c
@@ -403,6 +403,11 @@ int parse_commit_internal(struct commit *item, int 
quiet_on_missing, int use_com
return -1;
if (item->object.parsed)
return 0;
+
+   prepare_commit_graft();
+   if (commit_graft_pos(item->object.oid.hash) >= 0)
+   use_commit_graph = 0;
+
if (use_commit_graph && parse_commit_in_graph(item))
return 0;
buffer = read_sha1_file(item->object.oid.hash, , );
-- 
2.16.2.338.gcfe06ae955



[RFC PATCH 0/6] Fix commit-graph/graft/replace/shallow combo

2018-05-31 Thread Derrick Stolee
The commit-graph file stores a condensed version of the commit history.
This helps speed up several operations involving commit walks. This
feature does not work well if those commits "change" using features like
commit grafts, replace objects, or shallow clones.

Since the commit-graph feature is optional, hidden behind the
'core.commitGraph' config option, and requires manual maintenance (until
ds/commit-graph-fsck' is merged), these issues only arise for expert
users who decided to opt-in.

This RFC is a first attempt at rectify the issues that come up when
these features interact. I have not succeeded entirely, as I will
discuss below.

The first two "DO NOT MERGE" commits are not intended to be part of the
main product, but are ways to help the full test suite run while
computing and consuming commit-graph files. This is acheived by calling
write_commit_graph_reachable() from `git fetch` and `git commit`. I
believe this is the only dependence on ds/commit-graph-fsck. The other
commits should only depend on ds/commit-graph-lockfile-fix.

Running the full test suite after these DO NO MERGE commits results in
the following test failures, which I categorize by type of failure.

The following tests are red herrings. Most work by deleting a commit
from the object database and trying to demonstrate a failure. Others
rely on how certain objects are loaded. These are not bugs, but will
add noise if you run the tests suite with this patch.

t0410-partial-clone.sh  Failed tests:  5, 8
t5307-pack-missing-commit.shFailed tests:  3-4
t6011-rev-list-with-bad-commit.sh   Failed test:  4
t6024-recursive-merge.shFailed test:  4

The following tests are fixed in "commit-graph: enable replace-object
and grafts".

t6001-rev-list-graft.sh Failed tests:  3, 5, 7, 9, 11, 13
t6050-replace.shFailed tests:  11-15, 23-24, 29

The following tests involve shallow clones.

t5500-fetch-pack.sh Failed tests:  30-31, 34, 348-349
t5537-fetch-shallow.sh  Failed tests:  4-7, 9
t5802-connect-helper.sh Failed test:  3

These tests are mostly fixed by three commits:

* commit-graph: avoid writing when repo is shallow
* fetch: destroy commit graph on shallow parameters
* commit-graph: revert to odb on missing parents

Each of these cases cover a different interaction that occurs with
shallow clones. Some are due to a commit becoming shallow, while others
are due to a commit becoming unshallow (and hence invalidating its
generation number).

After these changes, there is one test case that still fails, and I
cannot understand why:

t5500-fetch-pack.sh Failed test:  348

This test fails when performing a `git fetch --shallow-exclude`. When I
halt the test using `t5500-fetch-pack.sh -d -i` and navigate to the
directory to replay the fetch it performs as expected. After struggling
with it for a while, I figured I should just put this series up for
discussion. Maybe someone with more experience in shallow clones can
point out the obvious issues I'm having.

Thanks,
-Stolee

Derrick Stolee (6):
  DO NOT MERGE: compute commit-graph on every commit
  DO NOT MERGE: write commit-graph on every fetch
  commit-graph: enable replace-object and grafts
  commit-graph: avoid writing when repo is shallow
  fetch: destroy commit graph on shallow parameters
  commit-graph: revert to odb on missing parents

 builtin/commit.c  |  5 +
 builtin/fetch.c   | 10 +
 builtin/gc.c  |  2 +-
 builtin/replace.c |  3 +++
 commit-graph.c| 65 +++
 commit-graph.h|  9 
 commit.c  |  5 +
 environment.c |  2 +-
 8 files changed, 95 insertions(+), 6 deletions(-)

-- 
2.16.2.338.gcfe06ae955



Re: [PATCH v3 16/20] commit-graph: verify contents match checksum

2018-06-04 Thread Derrick Stolee

On 6/2/2018 11:52 AM, Jakub Narebski wrote:

Derrick Stolee  writes:


The commit-graph file ends with a SHA1 hash of the previous contents. If
a commit-graph file has errors but the checksum hash is correct, then we
know that the problem is a bug in Git and not simply file corruption
after-the-fact.

Compute the checksum right away so it is the first error that appears,
and make the message translatable since this error can be "corrected" by
a user by simply deleting the file and recomputing. The rest of the
errors are useful only to developers.

Should we then provide --quiet / --verbose options, so that ordinary
user is not flooded with error messages meant for power users and Git
developers, then?


Be sure to continue checking the rest of the file data if the checksum
is wrong. This is important for our tests, as we break the checksum as
we modify bytes of the commit-graph file.

Well, we could have used sha1sum program, or test-sha1 helper to fix the
checksum after corrupting the commit-graph file...


Signed-off-by: Derrick Stolee 
---
  commit-graph.c  | 16 ++--
  t/t5318-commit-graph.sh |  6 ++
  2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index d2b291aca2..a33600c584 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -841,6 +841,7 @@ void write_commit_graph(const char *obj_dir,
oids.nr = 0;
  }
  
+#define VERIFY_COMMIT_GRAPH_ERROR_HASH 2

  static int verify_commit_graph_error;
  
  static void graph_report(const char *fmt, ...)

@@ -860,7 +861,9 @@ static void graph_report(const char *fmt, ...)
  int verify_commit_graph(struct commit_graph *g)
  {
uint32_t i, cur_fanout_pos = 0;
-   struct object_id prev_oid, cur_oid;
+   struct object_id prev_oid, cur_oid, checksum;
+   struct hashfile *f;
+   int devnull;
  
  	if (!g) {

graph_report("no commit-graph file loaded");
@@ -879,6 +882,15 @@ int verify_commit_graph(struct commit_graph *g)
if (verify_commit_graph_error)
return verify_commit_graph_error;
  
+	devnull = open("/dev/null", O_WRONLY);

+   f = hashfd(devnull, NULL);
+   hashwrite(f, g->data, g->data_len - g->hash_len);
+   finalize_hashfile(f, checksum.hash, CSUM_CLOSE);
+   if (hashcmp(checksum.hash, g->data + g->data_len - g->hash_len)) {
+   graph_report(_("the commit-graph file has incorrect checksum and is 
likely corrupt"));
+   verify_commit_graph_error = VERIFY_COMMIT_GRAPH_ERROR_HASH;
+   }

Is it the best way of calculating the SHA-1 checksum that out internal
APIs provide?  Is it how SHA-1 checksum is calculated and checked for
packfiles?
This pattern is similar to hashfd_check() in csum-file.c, except we are 
hashing the file data directly instead of re-creating it from scratch 
(as is done for 'git index-pack --verify').





+
for (i = 0; i < g->num_commits; i++) {
struct commit *graph_commit;
  
@@ -916,7 +928,7 @@ int verify_commit_graph(struct commit_graph *g)

cur_fanout_pos++;
}
  
-	if (verify_commit_graph_error)

+   if (verify_commit_graph_error & ~VERIFY_COMMIT_GRAPH_ERROR_HASH)
return verify_commit_graph_error;

So if we detected that checksum do not match, but we have not found an
error, we say that it is all right?


This only prevents us from stopping early. We will still report an error.



  
  	for (i = 0; i < g->num_commits; i++) {

diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 240aef6add..2680a2ebff 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -279,6 +279,7 @@ GRAPH_COMMIT_DATA_WIDTH=`expr $HASH_LEN + 16`
  GRAPH_OCTOPUS_DATA_OFFSET=`expr $GRAPH_COMMIT_DATA_OFFSET + \
$GRAPH_COMMIT_DATA_WIDTH \* $NUM_COMMITS`
  GRAPH_BYTE_OCTOPUS=`expr $GRAPH_OCTOPUS_DATA_OFFSET + 4`
+GRAPH_BYTE_FOOTER=`expr $GRAPH_OCTOPUS_DATA_OFFSET + 4 \* $NUM_OCTOPUS_EDGES`
  
  # usage: corrupt_graph_and_verify   

  # Manipulates the commit-graph file at the position
@@ -388,4 +389,9 @@ test_expect_success 'detect incorrect parent for octopus 
merge' '
"invalid parent"
  '
  
+test_expect_success 'detect invalid checksum hash' '

+   corrupt_graph_and_verify $GRAPH_BYTE_FOOTER "\00" \
+   "incorrect checksum"

This would not work under GETTEXT_POISON, as the message is marked as
translatable, but corrupt_graph_and_verify uses 'grep' and not
'test_i18grep' from t/test-lib-functions.sh


I fixed this locally based on Szeder's comment.




+'

If it is pure checksum corruption, wouldn't this fail because it is not
a failure (exit code is 0)?


It is not zero, so the test passes.



[PATCH] t5318-commit-graph.sh: use core.commitGraph

2018-06-04 Thread Derrick Stolee
The commit-graph tests should be checking that normal Git operations
succeed and have matching output with and without the commit-graph
feature enabled. However, the test was toggling 'core.graph' instead
of the correct 'core.commitGraph' variable.

Reported-by: Jakub Narebski 
Signed-off-by: Derrick Stolee 
---
 t/t5318-commit-graph.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 77d85aefe7..59d0be2877 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -28,8 +28,8 @@ test_expect_success 'create commits and repack' '
 '
 
 graph_git_two_modes() {
-   git -c core.graph=true $1 >output
-   git -c core.graph=false $1 >expect
+   git -c core.commitGraph=true $1 >output
+   git -c core.commitGraph=false $1 >expect
test_cmp output expect
 }
 
-- 
2.18.0.rc0



Re: [PATCH v3 09/20] commit-graph: verify corrupt OID fanout and lookup

2018-06-04 Thread Derrick Stolee

On 6/2/2018 12:38 AM, Duy Nguyen wrote:

On Thu, May 24, 2018 at 6:25 PM, Derrick Stolee  wrote:

+   if (i && oidcmp(_oid, _oid) >= 0)
+   graph_report("commit-graph has incorrect OID order: %s then 
%s",
+oid_to_hex(_oid),
+oid_to_hex(_oid));

Should these strings be marked for translation with _()?
I've been asking myself "Is this message helpful to anyone other than a 
Git developer?" and for this series the only one that is helpful to an 
end-user is the message about the final hash. If the hash is correct, 
but these other messages appear, then there is a bug in the code that 
wrote the file. Otherwise, file corruption is more likely and the 
correct course of action is to delete and rebuild.


Thanks for being diligent in checking.

Thanks,
-Stolee


Re: [PATCH v3 17/20] fsck: verify commit-graph

2018-06-04 Thread Derrick Stolee

On 6/2/2018 12:17 PM, Jakub Narebski wrote:

Derrick Stolee  writes:


If core.commitGraph is true, verify the contents of the commit-graph
during 'git fsck' using the 'git commit-graph verify' subcommand. Run
this check on all alternates, as well.

All right, so we have one config variable to control the use of
serialized commit-graph feaature.  Nice.


We use a new process for two reasons:

1. The subcommand decouples the details of loading and verifying a
commit-graph file from the other fsck details.

All right, I can agree with that.

On the other hand using subcommand makes debugging harder, though not in
this case (well separated functionality that can be easily called with a
standalone command to be debugged).


2. The commit-graph verification requires the commits to be loaded
in a specific order to guarantee we parse from the commit-graph
file for some objects and from the object database for others.

I don't quite understand this.  Could you explain it in more detail?


We use `lookup_commit()` when verifying the commit-graph. If these 
commits were loaded earlier in the process and parsed directly from the 
object database, then we aren't comparing the commit-graph file contents 
against the ODB.





Signed-off-by: Derrick Stolee 
---
  Documentation/git-fsck.txt |  3 +++
  builtin/fsck.c | 21 +
  t/t5318-commit-graph.sh|  8 
  3 files changed, 32 insertions(+)

diff --git a/Documentation/git-fsck.txt b/Documentation/git-fsck.txt
index b9f060e3b2..ab9a93fb9b 100644
--- a/Documentation/git-fsck.txt
+++ b/Documentation/git-fsck.txt
@@ -110,6 +110,9 @@ Any corrupt objects you will have to find in backups or 
other archives
  (i.e., you can just remove them and do an 'rsync' with some other site in
  the hopes that somebody else has the object you have corrupted).
  
+If core.commitGraph is true, the commit-graph file will also be inspected

Shouldn't we use `core.commitGraph` here?


+using 'git commit-graph verify'. See linkgit:git-commit-graph[1].
+
  Extracted Diagnostics
  -
  
diff --git a/builtin/fsck.c b/builtin/fsck.c

index ef78c6c00c..a6d5045b77 100644
--- a/builtin/fsck.c
+++ b/builtin/fsck.c
@@ -16,6 +16,7 @@
  #include "streaming.h"
  #include "decorate.h"
  #include "packfile.h"
+#include "run-command.h"
  
  #define REACHABLE 0x0001

  #define SEEN  0x0002
@@ -45,6 +46,7 @@ static int name_objects;
  #define ERROR_REACHABLE 02
  #define ERROR_PACK 04
  #define ERROR_REFS 010
+#define ERROR_COMMIT_GRAPH 020

Minor nitpick and a sidenote: I wonder if it wouldn't be better to
either use hexadecimal constants, or use (1 << n) for all ERROR_*
preprocesor constants.

  
  static const char *describe_object(struct object *obj)

  {
@@ -815,5 +817,24 @@ int cmd_fsck(int argc, const char **argv, const char 
*prefix)
}
  
  	check_connectivity();

+
+   if (core_commit_graph) {
+   struct child_process commit_graph_verify = CHILD_PROCESS_INIT;
+   const char *verify_argv[] = { "commit-graph", "verify", NULL, 
NULL, NULL, NULL };

I see that NULL at index 2 and 3 (at 3rd and 4th place) are here for
"--object-dir" and , the last one is
terminator for that case, but what is next to last NULL (at 5th place)
for?


+   commit_graph_verify.argv = verify_argv;
+   commit_graph_verify.git_cmd = 1;
+
+   if (run_command(_graph_verify))
+   errors_found |= ERROR_COMMIT_GRAPH;
+
+   prepare_alt_odb();
+   for (alt = alt_odb_list; alt; alt = alt->next) {
+   verify_argv[2] = "--object-dir";
+   verify_argv[3] = alt->path;
+   if (run_command(_graph_verify))
+   errors_found |= ERROR_COMMIT_GRAPH;
+   }
+   }

For performance reasons it may be better to start those 'git
commit-graph verify' commands asynchronously earlier, so that they can
run in parallel / concurrently wth other checks, and wait for them and
get their error code at the end of git-fsck run.

But that is probably better left for a separate commit.


+
return errors_found;
  }
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 2680a2ebff..4941937163 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -394,4 +394,12 @@ test_expect_success 'detect invalid checksum hash' '
"incorrect checksum"
  '
  
+test_expect_success 'git fsck (checks commit-graph)' '

+   cd "$TRASH_DIRECTORY/full" &&
+   git fsck &&
+   corrupt_graph_and_verify $GRAPH_BYTE_FOOTER "\00" \
+   "incorrect checksum" &&
+   test_must_fail git fsck
+'

All right; though the same caveats apply as with previous commit in
series.  Perhaps it would be better to truncate commit-graph file, or
corrupt it in some 'random' place.


+
  test_done

Best,




Re: [PATCH v3 15/20] commit-graph: test for corrupted octopus edge

2018-06-04 Thread Derrick Stolee

On 6/2/2018 8:39 AM, Jakub Narebski wrote:

Derrick Stolee  writes:


The commit-graph file has an extra chunk to store the parent int-ids for
parents beyond the first parent for octopus merges. Our test repo has a
single octopus merge that we can manipulate to demonstrate the 'verify'
subcommand detects incorrect values in that chunk.

If I understand it correctly the above means that our _reading_ code
checks for validity (which then 'git commit-graph verify' uses), just
there were not any tests for that.


Signed-off-by: Derrick Stolee 
---
  t/t5318-commit-graph.sh | 10 ++
  1 file changed, 10 insertions(+)

diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 58adb8246d..240aef6add 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -248,6 +248,7 @@ test_expect_success 'git commit-graph verify' '
  '
  
  NUM_COMMITS=9

+NUM_OCTOPUS_EDGES=2
  HASH_LEN=20
  GRAPH_BYTE_VERSION=4
  GRAPH_BYTE_HASH=5
@@ -274,6 +275,10 @@ GRAPH_BYTE_COMMIT_EXTRA_PARENT=`expr 
$GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 4`
  GRAPH_BYTE_COMMIT_WRONG_PARENT=`expr $GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 
3`
  GRAPH_BYTE_COMMIT_GENERATION=`expr $GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 8`
  GRAPH_BYTE_COMMIT_DATE=`expr $GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 12`
+GRAPH_COMMIT_DATA_WIDTH=`expr $HASH_LEN + 16`
+GRAPH_OCTOPUS_DATA_OFFSET=`expr $GRAPH_COMMIT_DATA_OFFSET + \
+   $GRAPH_COMMIT_DATA_WIDTH \* $NUM_COMMITS`
+GRAPH_BYTE_OCTOPUS=`expr $GRAPH_OCTOPUS_DATA_OFFSET + 4`
  
  # usage: corrupt_graph_and_verify   

  # Manipulates the commit-graph file at the position
@@ -378,4 +383,9 @@ test_expect_success 'detect incorrect commit date' '
"commit date"
  '
  
+test_expect_success 'detect incorrect parent for octopus merge' '

+   corrupt_graph_and_verify $GRAPH_BYTE_OCTOPUS "\01" \
+   "invalid parent"
+'

So we change the int-id to non-existing commit, and check that
commit-graph code checks for that.

What about the case when there are octopus merges, but no EDGE chunk
(which I think we can emulate by changing / corrupting number of
chunks)?

What about the case where int-id of edge in EDGE chunk is correct, that
is points to a valid commit, but does not agree with what is in the
object database (what parents octopus merge has in reality)?

Do we detect the situation where the second parent value in the commit
data stores an array position within a Large Edge chunk, but we do not
reach a value with the most-significant bit on when reaching the end of
Large Edge chunk?


There are a few holes like this, but I think they are better suited to a 
follow-up series, as this series is already quite large.


-Stolee


Re: [PATCH v3 06/20] commit-graph: add 'verify' subcommand

2018-06-04 Thread Derrick Stolee

On 6/2/2018 5:19 PM, Jakub Narebski wrote:

Derrick Stolee  writes:

Do we have a way to run individual steps of the test suite? I am
unfamiliar with that process.

The t/README describes three such ways in "Skipping Tests" section:

- GIT_SKIP_TESTS environment variable, which can either can match the
   "t[0-9]{4}" part to skip the whole test, or t[0-9]{4} followed by
   ".$number" to say which particular test to skip

- For an individual test suite --run could be used to specify that
   only some tests should be run or that some tests should be
   excluded from a run (the latter with '!' prefix).

- 'prove' harness can also run individual tests; one of more useful
   options is --state, which for example would allow to run only failed
   tests with --state=failed,save ... if the tests were independent.


Adding the complexity of storing a copy of the commit-graph file for
re-use in a later test is wasted energy right now, because we need to
run the steps of the test that create the repo shape with the commits
laid out as set earlier in the test. This shape changes as we test
different states of the commit-graph (exists and contains all commits,
exists and doesn't contain all commits, etc.)

I think we can solve most of the problem by separating validation tests
(which all or almost all use the same commit-graph file) and other test;
putting them in different test scripts.  This means that the more
complicated case would be limited to the subset of tests.

Anyway, if the setup stages are clearly separated and clearly marked as
such, we would be able to at least manually skip tests, or manually run
only a subset of tests.

Test independence is certainly something nice to have, but as the git
testsuite is not in best shape wrt this, it is not a requirement.


I'm all for making the test suite better. In this case, I will hold that 
for a later series (that is entirely focused on that feature) as I 
expect we will want to discuss the correct pattern in detail.


Thanks,
-Stolee


Re: [PATCH v3 19/20] gc: automatically write commit-graph files

2018-06-04 Thread Derrick Stolee

On 6/2/2018 2:03 PM, Jakub Narebski wrote:

Derrick Stolee  writes:


The commit-graph file is a very helpful feature for speeding up git
operations. In order to make it more useful, write the commit-graph file
by default during standard garbage collection operations.

I think you meant here "make it possible to write the commit-graph file
during standard garbage collection operations." (i.e. add "make it
possible" because it hides behind new config option, and remove "by
default" because currently it is not turned on by default).


Add a 'gc.commitGraph' config setting that triggers writing a
commit-graph file after any non-trivial 'git gc' command. Defaults to
false while the commit-graph feature matures. We specifically do not
want to turn this on by default until the commit-graph feature is fully

s/turn this on/have this on/  I think.


integrated with history-modifying features like shallow clones.

Two things.

First, shallow clones, replacement mechanims (git-replace) and grafts
are not "history-modifying" features; this name is in my opinion
reserved for history-rewriting features such as interactive rebase, the
`git filter-branch` feature or out-of-tree BFG Repo Cleaner or
reposurgeon tools.  They alter the _view_ of history; they should be
IMVHO named "history-view-altering" features -- though I agree this is
mouthful.

Second, shouldn't we, as Martin Ågren said, warn about the issue in the
manpage for git-commit-graph?


Signed-off-by: Derrick Stolee 
---
  Documentation/config.txt |  6 ++
  Documentation/git-gc.txt |  4 
  builtin/gc.c |  6 ++
  t/t5318-commit-graph.sh  | 14 ++
  4 files changed, 30 insertions(+)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 11f027194e..9a3abd87e7 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -1553,6 +1553,12 @@ gc.autoDetach::
Make `git gc --auto` return immediately and run in background
if the system supports it. Default is true.
  
+gc.commitGraph::

+   If true, then gc will rewrite the commit-graph file after any
+   change to the object database. If '--auto' is used, then the
+   commit-graph will not be updated unless the threshold is met.

What threshold?  Ah, thresholds defined for `git gc --auto` (gc.auto,
gc.autoPackLimit, gc.logExpiry,...).


+   See linkgit:git-commit-graph[1] for details.

You missed declaring the default value for this config option.


+
  gc.logExpiry::
If the file gc.log exists, then `git gc --auto` won't run
unless that file is more than 'gc.logExpiry' old.  Default is
diff --git a/Documentation/git-gc.txt b/Documentation/git-gc.txt
index 571b5a7e3c..17dd654a59 100644
--- a/Documentation/git-gc.txt
+++ b/Documentation/git-gc.txt
@@ -119,6 +119,10 @@ The optional configuration variable `gc.packRefs` 
determines if
  it within all non-bare repos or it can be set to a boolean value.
  This defaults to true.
  
+The optional configuration variable 'gc.commitGraph' determines if

+'git gc' runs 'git commit-graph write'. This can be set to a boolean

Should it be "runs" or "should run"?


+value. This defaults to false.

Should it be '...' or `...`?  Below we have `gc.aggresiveWindow`, above
we have 'gc.commitGraph', for example.


+
  The optional configuration variable `gc.aggressiveWindow` controls how
  much time is spent optimizing the delta compression of the objects in
  the repository when the --aggressive option is specified.  The larger
diff --git a/builtin/gc.c b/builtin/gc.c
index 77fa720bd0..efd214a59f 100644
--- a/builtin/gc.c
+++ b/builtin/gc.c
@@ -20,6 +20,7 @@
  #include "argv-array.h"
  #include "commit.h"
  #include "packfile.h"
+#include "commit-graph.h"
  
  #define FAILED_RUN "failed to run %s"
  
@@ -34,6 +35,7 @@ static int aggressive_depth = 50;

  static int aggressive_window = 250;
  static int gc_auto_threshold = 6700;
  static int gc_auto_pack_limit = 50;
+static int gc_commit_graph = 0;
  static int detach_auto = 1;
  static timestamp_t gc_log_expire_time;
  static const char *gc_log_expire = "1.day.ago";
@@ -121,6 +123,7 @@ static void gc_config(void)
git_config_get_int("gc.aggressivedepth", _depth);
git_config_get_int("gc.auto", _auto_threshold);
git_config_get_int("gc.autopacklimit", _auto_pack_limit);
+   git_config_get_bool("gc.commitgraph", _commit_graph);
git_config_get_bool("gc.autodetach", _auto);
git_config_get_expiry("gc.pruneexpire", _expire);
git_config_get_expiry("gc.worktreepruneexpire", 
_worktrees_expire);
@@ -480,6 +483,9 @@ int cmd_gc(int argc, const char **argv, const char *prefix)
if (pack_garbage.nr > 0)
clean_pack_garbage();
  
+	if (gc_commit_graph)

+

Re: [PATCH v3 13/20] commit-graph: verify generation number

2018-06-04 Thread Derrick Stolee

On 6/2/2018 8:23 AM, Jakub Narebski wrote:

Derrick Stolee  writes:


While iterating through the commit parents, perform the generation
number calculation and compare against the value stored in the
commit-graph.

All right, that's good.

What about commit-graph files that have GENERATION_NUMBER_ZERO for all
its commits (because we verify single commit-graph file only, there
wouldn't be GENERATION_NUMBER_ZERO mixed with non-zero generation
numbers)?

Unless we can assume that no commit-graph file in the wild would have
GENERATION_NUMBER_ZERO.


I was expecting that we would not have files in the wild with 
GENERATION_NUMBER_ZERO, but it looks like 2.18 will create files like 
that. I'll put in logic to verify "all are GENERATION_NUMBER_ZERO or all 
have 'correct' generation number".





The tests demonstrate that having a different set of parents affects
the generation number calculation, and this value propagates to
descendants. Hence, we drop the single-line condition on the output.

I don't understand what part of changes this paragraph of the commit
message refers to.

Anyway, changing parents may not lead to changed generation numbers;
take for example commit with single parent, which we change to other
commit with the same generation number.


The tests introduced in the previous commit change the parent list, 
which then changes the generation number in some cases (the stored 
generation number doesn't match the generation number computed based on 
the loaded parents). Since we report as many errors as possible (instead 
of failing on first error) those tests would fail if we say "the _only_ 
error should be the parent list". (This comes up again when we report 
the hash is incorrect, which would appear in every test.)


The test introduced in this commit only changes the generation number, 
so that test will have the one error.





Signed-off-by: Derrick Stolee 
---
  commit-graph.c  | 18 ++
  t/t5318-commit-graph.sh |  6 ++

Sidenote: I have just realized that it may be better to put
validation-related tests into different test file.


  2 files changed, 24 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index fff22dc0c3..ead92460c1 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -922,6 +922,7 @@ int verify_commit_graph(struct commit_graph *g)
for (i = 0; i < g->num_commits; i++) {
struct commit *graph_commit, *odb_commit;
struct commit_list *graph_parents, *odb_parents;
+   uint32_t max_generation = 0;
  
  		hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
  
@@ -956,6 +957,9 @@ int verify_commit_graph(struct commit_graph *g)

 
oid_to_hex(_parents->item->object.oid),
 
oid_to_hex(_parents->item->object.oid));
  
+			if (graph_parents->item->generation > max_generation)

+   max_generation = 
graph_parents->item->generation;
+

All right, that calculates the maximum of generation number of commit
parents.


graph_parents = graph_parents->next;
odb_parents = odb_parents->next;
}
@@ -963,6 +967,20 @@ int verify_commit_graph(struct commit_graph *g)
if (odb_parents != NULL)
graph_report("commit-graph parent list for commit %s 
terminates early",
 oid_to_hex(_oid));
+
+   /*
+* If one of our parents has generation GENERATION_NUMBER_MAX, 
then
+* our generation is also GENERATION_NUMBER_MAX. Decrement to 
avoid
+* extra logic in the following condition.
+*/

Nice trick.


+   if (max_generation == GENERATION_NUMBER_MAX)
+   max_generation--;

What about GENERATION_NUMBER_ZERO?


+
+   if (graph_commit->generation != max_generation + 1)
+   graph_report("commit-graph generation for commit %s is %u != 
%u",
+oid_to_hex(_oid),
+graph_commit->generation,
+max_generation + 1);

I think we should also check that generation numbers do not exceed
GENERATION_NUMBER_MAX... unless it is already taken care of?


We get that for free. First, the condition above would fail for at least 
one commit. Second, we literally cannot store a value larger than 
GENERATION_NUMBER_MAX in the commit-graph as there are only 30 bits 
dedicated to the generation number.





}
  
  	return verify_commit_graph_error;

diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 12f0d7f54d..673b0d37d5 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -272,6 +272,7 @@ GRAPH_BYTE_COMMIT_TREE=$GRAP

Re: [RFC PATCH 0/6] Fix commit-graph/graft/replace/shallow combo

2018-05-31 Thread Derrick Stolee

On 5/31/2018 2:33 PM, Stefan Beller wrote:

On Thu, May 31, 2018 at 10:40 AM, Derrick Stolee  wrote:

The commit-graph file stores a condensed version of the commit history.
This helps speed up several operations involving commit walks. This
feature does not work well if those commits "change" using features like
commit grafts, replace objects, or shallow clones.

Since the commit-graph feature is optional, hidden behind the
'core.commitGraph' config option, and requires manual maintenance (until
ds/commit-graph-fsck' is merged), these issues only arise for expert
users who decided to opt-in.

This RFC is a first attempt at rectify the issues that come up when
these features interact. I have not succeeded entirely, as I will
discuss below.

The first two "DO NOT MERGE" commits are not intended to be part of the
main product, but are ways to help the full test suite run while
computing and consuming commit-graph files. This is acheived by calling
write_commit_graph_reachable() from `git fetch` and `git commit`. I
believe this is the only dependence on ds/commit-graph-fsck. The other
commits should only depend on ds/commit-graph-lockfile-fix.

I applied these patches on top of ds/commit-graph-fsck
(it would have been nice if you mentioned that it applies there)
Running the test suite with all patches applied results in:

./t0410-partial-clone.sh(Wstat: 256 Tests: 15 Failed: 2)
   Failed tests:  5, 8
./t5307-pack-missing-commit.sh  (Wstat: 256 Tests: 5 Failed: 2)
   Failed tests:  3-4
./t5500-fetch-pack.sh   (Wstat: 256 Tests: 353 Failed: 1)
   Failed test:  348
./t6011-rev-list-with-bad-commit.sh (Wstat: 256 Tests: 6 Failed: 1)
   Failed test:  4
./t6024-recursive-merge.sh  (Wstat: 256 Tests: 6 Failed: 1)
   Failed test:  4

which you identified as 4x noise and t5500 as not understood.

$ GIT_TRACE=1 ./t5500-fetch-pack.sh -d -i -v -x
[...]
expecting success:
git -C shallow12 fetch --shallow-exclude one origin &&
git -C shallow12 log --pretty=tformat:%s origin/master >actual &&
test_write_lines three two >expected &&
test_cmp expected actual

++ git -C shallow12 fetch --shallow-exclude one origin
trace: built-in: git fetch --shallow-exclude one origin
trace: run_command: unset GIT_PREFIX; 'git-upload-pack
'\''/u/git/t/trash directory.t5500-fetch-pack/shallow-exclude/.'\'''
trace: run_command: git --shallow-file  pack-objects --revs --thin
--stdout --shallow --progress --delta-base-offset --include-tag
trace: built-in: git pack-objects --revs --thin --stdout --shallow
--progress --delta-base-offset --include-tag
remote: Counting objects: 4, done.
remote: Compressing objects: 100% (3/3), done.
remote: Total 4 (delta 0), reused 0 (delta 0)
trace: run_command: git --shallow-file  unpack-objects --pack_header=2,4
trace: built-in: git unpack-objects --pack_header=2,4
Unpacking objects: 100% (4/4), done.
trace: run_command: git rev-list --objects --stdin --not --all --quiet
trace: built-in: git rev-list --objects --stdin --not --all --quiet
trace: run_command: unset GIT_PREFIX; 'git-upload-pack
'\''/u/git/t/trash directory.t5500-fetch-pack/shallow-exclude/.'\'''
trace: run_command: git pack-objects --revs --thin --stdout --progress
--delta-base-offset
trace: built-in: git pack-objects --revs --thin --stdout --progress
--delta-base-offset
remote: Total 0 (delta 0), reused 0 (delta 0)
trace: run_command: git unpack-objects --pack_header=2,0
trace: built-in: git unpack-objects --pack_header=2,0
trace: run_command: git rev-list --objects --stdin --not --all --quiet
trace: built-in: git rev-list --objects --stdin --not --all --quiet
 From file:///u/git/t/trash directory.t5500-fetch-pack/shallow-exclude/.
  * [new tag] one-> one
  * [new tag] two-> two
run_processes_parallel: preparing to run up to 1 tasks
run_processes_parallel: done
trace: run_command: git gc --auto
trace: built-in: git gc --auto
++ git -C shallow12 log --pretty=tformat:%s origin/master
trace: built-in: git log '--pretty=tformat:%s' origin/master
++ test_write_lines three two
++ printf '%s\n' three two
++ test_cmp expected actual
++ diff -u expected actual
--- expected 2018-05-31 18:14:25.944540810 +
+++ actual 2018-05-31 18:14:25.944540810 +
@@ -1,2 +1,3 @@
  three
  two
+one
error: last command exited with $?=1
not ok 348 - fetch exclude tag one
#
# git -C shallow12 fetch --shallow-exclude one origin &&
# git -C shallow12 log --pretty=tformat:%s origin/master >actual &&
# test_write_lines three two >expected &&
# test_cmp expected actual
#



After these changes, there is one test case that still fails, and I
cannot understand why:

t5500-fetch-pack.sh Failed test:  348

This test fails when performing a `git fetch --shallow-exclude`. When I
halt the test using `t5500-fetch-pack.sh -d -i` and navigate to the
directory to replay t

Re: ds/generation-numbers (was Re: What's cooking in git.git (Jun 2018, #01; Fri, 1))

2018-06-01 Thread Derrick Stolee

On 6/1/2018 7:15 PM, Junio C Hamano wrote:

Derrick Stolee  writes:

   A recently added "commit-graph" datafile has learned to store
   pre-computed generation numbers to speed up the decisions to stop
   history traversal.

   Will cook in 'next'.

On Wednesday, these were marked as "Will merge to 'master'" What changed?

Nothing has changed.  "Will merge to 'master'" means "This is now in
'next', and unless there is some blocking breakage found, this topic
will be merged to 'master' eventually".  It does not even say if
that eventuality is before or after the release the current cycle is
working towards.

When it comes near pre-release feature freeze, things in 'next' need
to be sifted into various bins, and their labels are updated.  The
ones that are too big to be comfortably merged to the upcoming
release after -rc0 has passed (i.e. biggies are better merged early
to the 'master' in the cycle to give it full cycle of larger
exposure) will be kept in 'next' so that it can go first after the
final, the ones that are low risk but with higher importance will be
merged to 'master' before the release, the ones that are trivial,
distracting and lower value (i.e.  the ones that force i18n teams
extra work) may be held in 'next', and the ones that deserve a
chance to freshly restart are marked to be kicked back to 'pu'.
Etc. etc.


Thanks, Junio. This explanation is what I expected. I suppose the small 
extra bit of information of "Will cook in 'next' until after next 
release" would have answered my question in advance. Thanks for the 
patience as I get used to your workflow.


I am a little disappointed this didn't make 2.18 because this gives some 
of the biggest speedups for typically painful computations (like 'git 
branch --contains'). The generation numbers are what give us more than a 
constant-multiple speedup; what is in master only parses commit 
relationships faster, doesn't reduce the number of commits walked. It 
also means we will release a version of Git that writes commit-graph 
file with GENERATION_ZERO and so we will never be able to deprecate that 
logic in the code.


Thanks,

-Stolee




ds/generation-numbers (was Re: What's cooking in git.git (Jun 2018, #01; Fri, 1))

2018-06-01 Thread Derrick Stolee

On 6/1/2018 3:21 AM, Junio C Hamano wrote:


* ds/commit-graph-lockfile-fix (2018-05-22) 1 commit
   (merged to 'next' on 2018-05-24 at 3d12a02b0c)
  + commit-graph: fix UX issue when .lock file exists
  (this branch is used by ds/commit-graph-fsck; uses ds/generation-numbers.)

  Update to ds/generation-numbers topic.

  Wait for ds/generation-numbers


* ds/generation-numbers (2018-05-22) 11 commits
   (merged to 'next' on 2018-05-24 at 56fc38a1b6)
  + commit-graph.txt: update design document
  + merge: check config before loading commits
  + commit: use generation number in remove_redundant()
  + commit: add short-circuit to paint_down_to_common()
  + commit: use generation numbers for in_merge_bases()
  + ref-filter: use generation number for --contains
  + commit-graph: always load commit-graph information
  + commit: use generations in paint_down_to_common()
  + commit-graph: compute generation numbers
  + commit: add generation number to struct commit
  + ref-filter: fix outdated comment on in_commit_list
  (this branch is used by ds/commit-graph-fsck and 
ds/commit-graph-lockfile-fix.)

  A recently added "commit-graph" datafile has learned to store
  pre-computed generation numbers to speed up the decisions to stop
  history traversal.

  Will cook in 'next'.


On Wednesday, these were marked as "Will merge to 'master'" What changed?

Thanks,
-Stolee


Re: [RFC PATCH 4/6] commit-graph: avoid writing when repo is shallow

2018-06-01 Thread Derrick Stolee

On 5/31/2018 10:30 PM, Junio C Hamano wrote:

Derrick Stolee  writes:


Shallow clones do not interact well with the commit-graph feature for
several reasons. Instead of doing the hard thing to fix those
interactions, instead prevent reading or writing a commit-graph file for
shallow repositories.

The latter instead would want to vanish, I would guess.


Do you mean that we should call destroy_commit_graph() if we detect a 
shallow repository during write_commit_graph(), then I can make that change.





Signed-off-by: Derrick Stolee 
---
  commit-graph.c | 12 
  1 file changed, 12 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index 95af4ed519..80e377b90f 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -208,6 +208,9 @@ static void prepare_commit_graph(void)
return;
prepare_commit_graph_run_once = 1;
  
+	if (is_repository_shallow())

+   return;
+
obj_dir = get_object_directory();
prepare_commit_graph_one(obj_dir);
prepare_alt_odb();
@@ -711,6 +714,15 @@ void write_commit_graph(const char *obj_dir,
int num_extra_edges;
struct commit_list *parent;
  
+	/*

+* Shallow clones are not supproted, as they create bad
+* generation skips as they are un-shallowed.
+*/
+   if (is_repository_shallow()) {
+   warning("writing a commit-graph in a shallow repository is not 
supported");
+   return;
+   }
+
oids.nr = 0;
oids.alloc = approximate_object_count() / 4;




Re: [PATCH v4 00/21] Integrate commit-graph into 'fsck' and 'gc'

2018-06-05 Thread Derrick Stolee

On 6/5/2018 10:51 AM, Ævar Arnfjörð Bjarmason wrote:

On Mon, Jun 4, 2018 at 6:52 PM, Derrick Stolee  wrote:

Thanks for the feedback on v3. There were several small cleanups, but
perhaps the biggest change is the addition of "commit-graph: use
string-list API for input" which makes "commit-graph: add '--reachable'
option" much simpler.

Do you have a version of this pushed somewhere? Your fsck/upstream
branch conflicts with e.g. long-since-merged nd/repack-keep-pack.


Sorry I forgot to push. It is now available on GitHub [1]. I based my 
commits on 'next' including the core.commitGraph fix for 
t5318-commit-graph.sh I already sent [2].


[1] https://github.com/derrickstolee/git/tree/fsck/upstream
    fsck/upstream branch matches this patch series

[2] 
https://public-inbox.org/git/20180604123906.136417-1-dsto...@microsoft.com/

    [PATCH] t5318-commit-graph.sh: use core.commitGraph


Re: t5318-commit-graph.sh breaks travis gettext poison job

2018-06-01 Thread Derrick Stolee

On 6/1/2018 12:17 PM, Duy Nguyen wrote:

In case you're not checking travis, [1] reports

Test Summary Report
---
t5318-commit-graph.sh(Wstat: 256 Tests: 62
Failed: 2)
   Failed tests:  61-62
   Non-zero exit status: 1

This usually means you're grepping an i18n string

[1] https://travis-ci.org/git/git/jobs/386532326



Thanks, Duy. I believe this is related to the place Szeder pointed out 
[1] with the one translatable string in verify_commit_graph().


I've fixed this in my local branch, and plan to push v4 when review of 
v3 is complete.


Thanks,
-Stolee

[1] 
https://public-inbox.org/git/20180530123529.32090-1-szeder@gmail.com/

    Re: [PATCH v3 16/20] commit-graph: verify contents match checksum


Re: [PATCH v3 10/20] commit-graph: verify objects exist

2018-05-31 Thread Derrick Stolee

On 5/30/2018 3:22 PM, Jakub Narebski wrote:

Derrick Stolee  writes:


In the 'verify' subcommand, load commits directly from the object
database to ensure they exist. Parse by skipping the commit-graph.

All right, before we check that the commit data matches, we need to
check that all the commits in cache (in the serialized commit graph) are
present in real data (in the object database of the repository).

What's nice of this series is that the operation that actually removes
unreachable commits from the object database, that is `git gc`, would
also update commit-gaph file.


Signed-off-by: Derrick Stolee 
---
  commit-graph.c  | 20 
  t/t5318-commit-graph.sh |  7 +++
  2 files changed, 27 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index cbd1aae514..0420ebcd87 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -238,6 +238,10 @@ static struct commit_list **insert_parent_or_die(struct 
commit_graph *g,
  {
struct commit *c;
struct object_id oid;
+
+   if (pos >= g->num_commits)
+   die("invalid parent position %"PRIu64, pos);
+

This change is not described in the commit message.
This change should go in "commit-graph: verify parent list" which adds a 
test that fails without it. Thanks.



hashcpy(oid.hash, g->chunk_oid_lookup + g->hash_len * pos);
c = lookup_commit();
if (!c)
@@ -905,5 +909,21 @@ int verify_commit_graph(struct commit_graph *g)
cur_fanout_pos++;
}
  
+	if (verify_commit_graph_error)

+   return verify_commit_graph_error;

All right, so we by default short-circuit so that errors found earlier
wouldn't cause crash when checking other things.

Is it needed, though, in this case?  Though I guess it is better to be
conservative; lso by terminating after a batch of one type of errors we
don't get many different error messages from the same error (i.e. error
propagation).


+
+   for (i = 0; i < g->num_commits; i++) {
+   struct commit *odb_commit;
+
+   hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
+
+   odb_commit = (struct commit *)create_object(cur_oid.hash, 
alloc_commit_node());

Do we really need to keep all those commits from the object database in
memory (in the object::obj_hash hash table)?  Perhaps using something
like Flywheel / Recycler pattern would be a better solution (if
possible)?

Though perhaps this doesn't matter much with respect to memory use.


+   if (parse_commit_internal(odb_commit, 0, 0)) {

Just a reminder to myself: the signature is

   int parse_commit_internal(struct commit *item, int quiet_on_missing, int 
use_commit_graph)


Hmmm... I wonder if with two boolean paramaters wouldn't it be better to
use flags parameter, i.e.

   int parse_commit_internal(struct commit *item, int flags)

   ...

   parse_commit_internal(commit, QUIET_ON_MISSING | USE_COMMIT_GRAPH)

But I guess that it is not worth it (especially for internal-ish
function).


+   graph_report("failed to parse %s from object database",
+oid_to_hex(_oid));

Wouldn't parse_commit_internal() show it's own error message, in
addition to the one above?


+   continue;
+   }
+   }
+
return verify_commit_graph_error;
  }
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index c050ef980b..996a016239 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -247,6 +247,7 @@ test_expect_success 'git commit-graph verify' '
git commit-graph verify >output
  '
  
+NUM_COMMITS=9

  HASH_LEN=20
  GRAPH_BYTE_VERSION=4
  GRAPH_BYTE_HASH=5
@@ -265,6 +266,7 @@ GRAPH_BYTE_FANOUT1=`expr $GRAPH_FANOUT_OFFSET + 4 \* 4`
  GRAPH_BYTE_FANOUT2=`expr $GRAPH_FANOUT_OFFSET + 4 \* 255`
  GRAPH_OID_LOOKUP_OFFSET=`expr $GRAPH_FANOUT_OFFSET + 4 \* 256`
  GRAPH_BYTE_OID_LOOKUP_ORDER=`expr $GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 8`
+GRAPH_BYTE_OID_LOOKUP_MISSING=`expr $GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 4 
+ 10`

All right, so we modify 10-the byte of OID of 5-th commit out of 9,
am I correct here?

  
  # usage: corrupt_graph_and_verify   

  # Manipulates the commit-graph file at the position
@@ -334,4 +336,9 @@ test_expect_success 'detect incorrect OID order' '
"incorrect OID order"
  '
  
+test_expect_success 'detect OID not in object database' '

+   corrupt_graph_and_verify $GRAPH_BYTE_OID_LOOKUP_MISSING "\01" \
+   "from object database"
+'

I guess that if we ensure that OIDs are constant, you have chosen the
change to actually corrupt the OID in OID Lookup chunk to point to OID
that is not in the object database (while still not violating the
constraint that OID in OID Lookup chunk needs to be sorted).


+
  test_done

All right (well, except for `expr ... ` --> $(( ... )) change).





Re: [PATCH v3 11/20] commit-graph: verify root tree OIDs

2018-05-31 Thread Derrick Stolee

On 5/30/2018 6:24 PM, Jakub Narebski wrote:

Derrick Stolee  writes:


The 'verify' subcommand must compare the commit content parsed from the
commit-graph and compare it against the content in the object database.

You have "compare" twice in the above sentence.


Use lookup_commit() and parse_commit_in_graph_one() to parse the commits
from the graph and compare against a commit that is loaded separately
and parsed directly from the object database.

All right, that looks like a nice extension of what was done in previous
patch.  We want to check that cache (serialized commit graph) matches
reality (object database).


Add checks for the root tree OID.

All right; isn't it that now we check almost all information from
commit-graph that hs match in object database (with exception of commit
parents, I think).


Signed-off-by: Derrick Stolee 
---
  commit-graph.c  | 17 -
  t/t5318-commit-graph.sh |  7 +++
  2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/commit-graph.c b/commit-graph.c
index 0420ebcd87..19ea369fc6 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -880,6 +880,8 @@ int verify_commit_graph(struct commit_graph *g)
return verify_commit_graph_error;

NOTE: we will be checking Commit Data chunk; I think it would be good
idea to verify that size of Commit Data chunk matches (N * (H + 16) bytes)
that format gives us, so that we don't accidentally red outside of
memory if something got screwed up (like number of commits being wrong,
or file truncated).


This is actually how we calculate 'num_commits' during 
load_commit_graph_one():


    if (last_chunk_id == GRAPH_CHUNKID_OIDLOOKUP)
    {
        graph->num_commits = (chunk_offset - last_chunk_offset)
                     / graph->hash_len;
    }

So, if the chunk doesn't match N*(H+16), we detect this because 
FANOUT[255] != N.


(There is one caveat here: (chunk_offset - last_chunk_offset) may not be 
a multiple of hash_len, and "accidentally" truncate to N in the 
division. I'll add more careful checks for this.)


We also check out-of-bounds offsets in that method.



  
  	for (i = 0; i < g->num_commits; i++) {

+   struct commit *graph_commit;
+
hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
  
  		if (i && oidcmp(_oid, _oid) >= 0)

@@ -897,6 +899,11 @@ int verify_commit_graph(struct commit_graph *g)
  
  			cur_fanout_pos++;

}
+
+   graph_commit = lookup_commit(_oid);

So now I see why we add all commits to memory (to hash structure).


+   if (!parse_commit_in_graph_one(g, graph_commit))
+   graph_report("failed to parse %s from commit-graph",
+oid_to_hex(_oid));

All right, this verifies that commit in OID Lookup chunk has parse-able
data in Commit Data chunk, isn't it?


}
  
  	while (cur_fanout_pos < 256) {

@@ -913,16 +920,24 @@ int verify_commit_graph(struct commit_graph *g)
return verify_commit_graph_error;
  
  	for (i = 0; i < g->num_commits; i++) {

-   struct commit *odb_commit;
+   struct commit *graph_commit, *odb_commit;
  
  		hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
  
+		graph_commit = lookup_commit(_oid);

odb_commit = (struct commit *)create_object(cur_oid.hash, 
alloc_commit_node());

All right, so we have commit data from graph, and commit data from the
object database.


if (parse_commit_internal(odb_commit, 0, 0)) {
graph_report("failed to parse %s from object database",
 oid_to_hex(_oid));
continue;
}
+
+   if (oidcmp(_commit_tree_in_graph_one(g, 
graph_commit)->object.oid,
+  get_commit_tree_oid(odb_commit)))
+   graph_report("root tree OID for commit %s in commit-graph is 
%s != %s",
+oid_to_hex(_oid),
+
oid_to_hex(get_commit_tree_oid(graph_commit)),
+
oid_to_hex(get_commit_tree_oid(odb_commit)));

Maybe explicitly say that it doesn't match the value from the object
database; OTOH this may be too verbose.


}
  
  	return verify_commit_graph_error;

diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 996a016239..21cc8e82f3 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -267,6 +267,8 @@ GRAPH_BYTE_FANOUT2=`expr $GRAPH_FANOUT_OFFSET + 4 \* 255`
  GRAPH_OID_LOOKUP_OFFSET=`expr $GRAPH_FANOUT_OFFSET + 4 \* 256`
  GRAPH_BYTE_OID_LOOKUP_ORDER=`expr $GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 8`
  GRAPH_BYTE_OID_LOOKUP_MISSING=`expr $GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 4 
+ 10`
+GRAPH_COMMIT_DATA_OFFSET=`expr $GRAPH_

[PATCH v4 06/21] commit-graph: add 'verify' subcommand

2018-06-04 Thread Derrick Stolee
If the commit-graph file becomes corrupt, we need a way to verify
that its contents match the object database. In the manner of
'git fsck' we will implement a 'git commit-graph verify' subcommand
to report all issues with the file.

Add the 'verify' subcommand to the 'commit-graph' builtin and its
documentation. The subcommand is currently a no-op except for
loading the commit-graph into memory, which may trigger run-time
errors that would be caught by normal use. Add a simple test that
ensures the command returns a zero error code.

If no commit-graph file exists, this is an acceptable state. Do
not report any errors.

Helped-by: Ramsay Jones 
Signed-off-by: Derrick Stolee 
---
 Documentation/git-commit-graph.txt |  6 +
 builtin/commit-graph.c | 38 ++
 commit-graph.c | 23 ++
 commit-graph.h |  2 ++
 t/t5318-commit-graph.sh| 10 
 5 files changed, 79 insertions(+)

diff --git a/Documentation/git-commit-graph.txt 
b/Documentation/git-commit-graph.txt
index 4c97b555cc..a222cfab08 100644
--- a/Documentation/git-commit-graph.txt
+++ b/Documentation/git-commit-graph.txt
@@ -10,6 +10,7 @@ SYNOPSIS
 
 [verse]
 'git commit-graph read' [--object-dir ]
+'git commit-graph verify' [--object-dir ]
 'git commit-graph write'  [--object-dir ]
 
 
@@ -52,6 +53,11 @@ existing commit-graph file.
 Read a graph file given by the commit-graph file and output basic
 details about the graph file. Used for debugging purposes.
 
+'verify'::
+
+Read the commit-graph file and verify its contents against the object
+database. Used to check for corrupted data.
+
 
 EXAMPLES
 
diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
index f0875b8bf3..3079cde6f9 100644
--- a/builtin/commit-graph.c
+++ b/builtin/commit-graph.c
@@ -8,10 +8,16 @@
 static char const * const builtin_commit_graph_usage[] = {
N_("git commit-graph [--object-dir ]"),
N_("git commit-graph read [--object-dir ]"),
+   N_("git commit-graph verify [--object-dir ]"),
N_("git commit-graph write [--object-dir ] [--append] 
[--stdin-packs|--stdin-commits]"),
NULL
 };
 
+static const char * const builtin_commit_graph_verify_usage[] = {
+   N_("git commit-graph verify [--object-dir ]"),
+   NULL
+};
+
 static const char * const builtin_commit_graph_read_usage[] = {
N_("git commit-graph read [--object-dir ]"),
NULL
@@ -29,6 +35,36 @@ static struct opts_commit_graph {
int append;
 } opts;
 
+
+static int graph_verify(int argc, const char **argv)
+{
+   struct commit_graph *graph = NULL;
+   char *graph_name;
+
+   static struct option builtin_commit_graph_verify_options[] = {
+   OPT_STRING(0, "object-dir", _dir,
+  N_("dir"),
+  N_("The object directory to store the graph")),
+   OPT_END(),
+   };
+
+   argc = parse_options(argc, argv, NULL,
+builtin_commit_graph_verify_options,
+builtin_commit_graph_verify_usage, 0);
+
+   if (!opts.obj_dir)
+   opts.obj_dir = get_object_directory();
+
+   graph_name = get_commit_graph_filename(opts.obj_dir);
+   graph = load_commit_graph_one(graph_name);
+   FREE_AND_NULL(graph_name);
+
+   if (!graph)
+   return 0;
+
+   return verify_commit_graph(graph);
+}
+
 static int graph_read(int argc, const char **argv)
 {
struct commit_graph *graph = NULL;
@@ -165,6 +201,8 @@ int cmd_commit_graph(int argc, const char **argv, const 
char *prefix)
if (argc > 0) {
if (!strcmp(argv[0], "read"))
return graph_read(argc, argv);
+   if (!strcmp(argv[0], "verify"))
+   return graph_verify(argc, argv);
if (!strcmp(argv[0], "write"))
return graph_write(argc, argv);
}
diff --git a/commit-graph.c b/commit-graph.c
index fee8437ce3..c860cbe721 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -824,3 +824,26 @@ void write_commit_graph(const char *obj_dir,
oids.alloc = 0;
oids.nr = 0;
 }
+
+static int verify_commit_graph_error;
+
+static void graph_report(const char *fmt, ...)
+{
+   va_list ap;
+   verify_commit_graph_error = 1;
+
+   va_start(ap, fmt);
+   vfprintf(stderr, fmt, ap);
+   fprintf(stderr, "\n");
+   va_end(ap);
+}
+
+int verify_commit_graph(struct commit_graph *g)
+{
+   if (!g) {
+   graph_report("no commit-graph file loaded");
+   return 1;
+   }
+
+   return verify_commit_graph_error;
+}
diff --git a/commit-graph.h b/commit-graph.h
index 96cccb10f3..71a39c5a57 100644
--- a/commit-graph.h
+++ b/co

[PATCH v4 07/21] commit-graph: verify catches corrupt signature

2018-06-04 Thread Derrick Stolee
This is the first of several commits that add a test to check that
'git commit-graph verify' catches corruption in the commit-graph
file. The first test checks that the command catches an error in
the file signature. This is a check that exists in the existing
commit-graph reading code.

Add a helper method 'corrupt_graph_and_verify' to the test script
t5318-commit-graph.sh. This helper corrupts the commit-graph file
at a certain location, runs 'git commit-graph verify', and reports
the output to the 'err' file. This data is filtered to remove the
lines added by 'test_must_fail' when the test is run verbosely.
Then, the output is checked to contain a specific error message.

Most messages from 'git commit-graph verify' will not be marked
for translation. There will be one exception: the message that
reports an invalid checksum will be marked for translation, as that
is the only message that is intended for a typical user.

Helped-by: Szeder Gábor 
Signed-off-by: Derrick Stolee 
---
 t/t5318-commit-graph.sh | 43 +
 1 file changed, 43 insertions(+)

diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 0830ef9fdd..c0c1ff09b9 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -235,9 +235,52 @@ test_expect_success 'perform fast-forward merge in full 
repo' '
test_cmp expect output
 '
 
+# the verify tests below expect the commit-graph to contain
+# exactly the commits reachable from the commits/8 branch.
+# If the file changes the set of commits in the list, then the
+# offsets into the binary file will result in different edits
+# and the tests will likely break.
+
 test_expect_success 'git commit-graph verify' '
cd "$TRASH_DIRECTORY/full" &&
+   git rev-parse commits/8 | git commit-graph write --stdin-commits &&
git commit-graph verify >output
 '
 
+GRAPH_BYTE_VERSION=4
+GRAPH_BYTE_HASH=5
+
+# usage: corrupt_graph_and_verify   
+# Manipulates the commit-graph file at the position
+# by inserting the data, then runs 'git commit-graph verify'
+# and places the output in the file 'err'. Test 'err' for
+# the given string.
+corrupt_graph_and_verify() {
+   pos=$1
+   data="${2:-\0}"
+   grepstr=$3
+   cd "$TRASH_DIRECTORY/full" &&
+   test_when_finished mv commit-graph-backup $objdir/info/commit-graph &&
+   cp $objdir/info/commit-graph commit-graph-backup &&
+   printf "$data" | dd of="$objdir/info/commit-graph" bs=1 seek="$pos" 
conv=notrunc &&
+   test_must_fail git commit-graph verify 2>test_err &&
+   grep -v "^+" test_err >err
+   test_i18ngrep "$grepstr" err
+}
+
+test_expect_success 'detect bad signature' '
+   corrupt_graph_and_verify 0 "\0" \
+   "graph signature"
+'
+
+test_expect_success 'detect bad version' '
+   corrupt_graph_and_verify $GRAPH_BYTE_VERSION "\02" \
+   "graph version"
+'
+
+test_expect_success 'detect bad hash version' '
+   corrupt_graph_and_verify $GRAPH_BYTE_HASH "\02" \
+   "hash version"
+'
+
 test_done
-- 
2.18.0.rc1



[PATCH v4 03/21] commit-graph: parse commit from chosen graph

2018-06-04 Thread Derrick Stolee
Before verifying a commit-graph file against the object database, we
need to parse all commits from the given commit-graph file. Create
parse_commit_in_graph_one() to target a given struct commit_graph.

Signed-off-by: Derrick Stolee 
---
 commit-graph.c | 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index c09e87c3c2..472b41ec30 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -311,7 +311,7 @@ static int find_commit_in_graph(struct commit *item, struct 
commit_graph *g, uin
}
 }
 
-int parse_commit_in_graph(struct commit *item)
+static int parse_commit_in_graph_one(struct commit_graph *g, struct commit 
*item)
 {
uint32_t pos;
 
@@ -319,9 +319,21 @@ int parse_commit_in_graph(struct commit *item)
return 0;
if (item->object.parsed)
return 1;
+
+   if (find_commit_in_graph(item, g, ))
+   return fill_commit_in_graph(item, g, pos);
+
+   return 0;
+}
+
+int parse_commit_in_graph(struct commit *item)
+{
+   if (!core_commit_graph)
+   return 0;
+
prepare_commit_graph();
-   if (commit_graph && find_commit_in_graph(item, commit_graph, ))
-   return fill_commit_in_graph(item, commit_graph, pos);
+   if (commit_graph)
+   return parse_commit_in_graph_one(commit_graph, item);
return 0;
 }
 
-- 
2.18.0.rc1



[PATCH v4 08/21] commit-graph: verify required chunks are present

2018-06-04 Thread Derrick Stolee
The commit-graph file requires the following three chunks:

* OID Fanout
* OID Lookup
* Commit Data

If any of these are missing, then the 'verify' subcommand should
report a failure. This includes the chunk IDs malformed or the
chunk count is truncated.

Signed-off-by: Derrick Stolee 
---
 commit-graph.c  |  9 +
 t/t5318-commit-graph.sh | 29 +
 2 files changed, 38 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index c860cbe721..25d5edea82 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -845,5 +845,14 @@ int verify_commit_graph(struct commit_graph *g)
return 1;
}
 
+   verify_commit_graph_error = 0;
+
+   if (!g->chunk_oid_fanout)
+   graph_report("commit-graph is missing the OID Fanout chunk");
+   if (!g->chunk_oid_lookup)
+   graph_report("commit-graph is missing the OID Lookup chunk");
+   if (!g->chunk_commit_data)
+   graph_report("commit-graph is missing the Commit Data chunk");
+
return verify_commit_graph_error;
 }
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index c0c1ff09b9..846396665e 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -249,6 +249,15 @@ test_expect_success 'git commit-graph verify' '
 
 GRAPH_BYTE_VERSION=4
 GRAPH_BYTE_HASH=5
+GRAPH_BYTE_CHUNK_COUNT=6
+GRAPH_CHUNK_LOOKUP_OFFSET=8
+GRAPH_CHUNK_LOOKUP_WIDTH=12
+GRAPH_CHUNK_LOOKUP_ROWS=5
+GRAPH_BYTE_OID_FANOUT_ID=$GRAPH_CHUNK_LOOKUP_OFFSET
+GRAPH_BYTE_OID_LOOKUP_ID=$(($GRAPH_CHUNK_LOOKUP_OFFSET + \
+   1 \* $GRAPH_CHUNK_LOOKUP_WIDTH))
+GRAPH_BYTE_COMMIT_DATA_ID=$(($GRAPH_CHUNK_LOOKUP_OFFSET + \
+2 \* $GRAPH_CHUNK_LOOKUP_WIDTH))
 
 # usage: corrupt_graph_and_verify   
 # Manipulates the commit-graph file at the position
@@ -283,4 +292,24 @@ test_expect_success 'detect bad hash version' '
"hash version"
 '
 
+test_expect_success 'detect low chunk count' '
+   corrupt_graph_and_verify $GRAPH_BYTE_CHUNK_COUNT "\02" \
+   "missing the .* chunk"
+'
+
+test_expect_success 'detect missing OID fanout chunk' '
+   corrupt_graph_and_verify $GRAPH_BYTE_OID_FANOUT_ID "\0" \
+   "missing the OID Fanout chunk"
+'
+
+test_expect_success 'detect missing OID lookup chunk' '
+   corrupt_graph_and_verify $GRAPH_BYTE_OID_LOOKUP_ID "\0" \
+   "missing the OID Lookup chunk"
+'
+
+test_expect_success 'detect missing commit data chunk' '
+   corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_DATA_ID "\0" \
+   "missing the Commit Data chunk"
+'
+
 test_done
-- 
2.18.0.rc1



[PATCH v4 05/21] commit-graph: load a root tree from specific graph

2018-06-04 Thread Derrick Stolee
When lazy-loading a tree for a commit, it will be important to select
the tree from a specific struct commit_graph. Create a new method that
specifies the commit-graph file and use that in
get_commit_tree_in_graph().

Signed-off-by: Derrick Stolee 
---
 commit-graph.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 472b41ec30..fee8437ce3 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -359,14 +359,20 @@ static struct tree *load_tree_for_commit(struct 
commit_graph *g, struct commit *
return c->maybe_tree;
 }
 
-struct tree *get_commit_tree_in_graph(const struct commit *c)
+static struct tree *get_commit_tree_in_graph_one(struct commit_graph *g,
+const struct commit *c)
 {
if (c->maybe_tree)
return c->maybe_tree;
if (c->graph_pos == COMMIT_NOT_FROM_GRAPH)
-   BUG("get_commit_tree_in_graph called from non-commit-graph 
commit");
+   BUG("get_commit_tree_in_graph_one called from non-commit-graph 
commit");
+
+   return load_tree_for_commit(g, (struct commit *)c);
+}
 
-   return load_tree_for_commit(commit_graph, (struct commit *)c);
+struct tree *get_commit_tree_in_graph(const struct commit *c)
+{
+   return get_commit_tree_in_graph_one(commit_graph, c);
 }
 
 static void write_graph_chunk_fanout(struct hashfile *f,
-- 
2.18.0.rc1



[PATCH v4 00/21] Integrate commit-graph into 'fsck' and 'gc'

2018-06-04 Thread Derrick Stolee
Thanks for the feedback on v3. There were several small cleanups, but
perhaps the biggest change is the addition of "commit-graph: use
string-list API for input" which makes "commit-graph: add '--reachable'
option" much simpler.

The inter-diff is still reasonably large, but I'll send it in a
follow-up PR.

Thanks,
-Stolee

Derrick Stolee (21):
  commit-graph: UNLEAK before die()
  commit-graph: fix GRAPH_MIN_SIZE
  commit-graph: parse commit from chosen graph
  commit: force commit to parse from object database
  commit-graph: load a root tree from specific graph
  commit-graph: add 'verify' subcommand
  commit-graph: verify catches corrupt signature
  commit-graph: verify required chunks are present
  commit-graph: verify corrupt OID fanout and lookup
  commit-graph: verify objects exist
  commit-graph: verify root tree OIDs
  commit-graph: verify parent list
  commit-graph: verify generation number
  commit-graph: verify commit date
  commit-graph: test for corrupted octopus edge
  commit-graph: verify contents match checksum
  fsck: verify commit-graph
  commit-graph: use string-list API for input
  commit-graph: add '--reachable' option
  gc: automatically write commit-graph files
  commit-graph: update design document

 Documentation/config.txt |  10 +-
 Documentation/git-commit-graph.txt   |  14 +-
 Documentation/git-fsck.txt   |   3 +
 Documentation/git-gc.txt |   4 +
 Documentation/technical/commit-graph.txt |  22 --
 builtin/commit-graph.c   |  98 ++---
 builtin/fsck.c   |  21 ++
 builtin/gc.c |   6 +
 commit-graph.c   | 248 +--
 commit-graph.h   |  10 +-
 commit.c |   9 +-
 commit.h |   1 +
 t/t5318-commit-graph.sh  | 201 ++
 13 files changed, 569 insertions(+), 78 deletions(-)

-- 
2.18.0.rc1



[PATCH v4 11/21] commit-graph: verify root tree OIDs

2018-06-04 Thread Derrick Stolee
The 'verify' subcommand must compare the commit content parsed from the
commit-graph against the content in the object database. Use
lookup_commit() and parse_commit_in_graph_one() to parse the commits
from the graph and compare against a commit that is loaded separately
and parsed directly from the object database.

Add checks for the root tree OID.

Signed-off-by: Derrick Stolee 
---
 commit-graph.c  | 17 -
 t/t5318-commit-graph.sh |  7 +++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/commit-graph.c b/commit-graph.c
index 0cf1b61d80..50a8e27910 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -862,6 +862,8 @@ int verify_commit_graph(struct commit_graph *g)
return verify_commit_graph_error;
 
for (i = 0; i < g->num_commits; i++) {
+   struct commit *graph_commit;
+
hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
 
if (i && oidcmp(_oid, _oid) >= 0)
@@ -879,6 +881,11 @@ int verify_commit_graph(struct commit_graph *g)
 
cur_fanout_pos++;
}
+
+   graph_commit = lookup_commit(_oid);
+   if (!parse_commit_in_graph_one(g, graph_commit))
+   graph_report("failed to parse %s from commit-graph",
+oid_to_hex(_oid));
}
 
while (cur_fanout_pos < 256) {
@@ -895,16 +902,24 @@ int verify_commit_graph(struct commit_graph *g)
return verify_commit_graph_error;
 
for (i = 0; i < g->num_commits; i++) {
-   struct commit *odb_commit;
+   struct commit *graph_commit, *odb_commit;
 
hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
 
+   graph_commit = lookup_commit(_oid);
odb_commit = (struct commit *)create_object(cur_oid.hash, 
alloc_commit_node());
if (parse_commit_internal(odb_commit, 0, 0)) {
graph_report("failed to parse %s from object database",
 oid_to_hex(_oid));
continue;
}
+
+   if (oidcmp(_commit_tree_in_graph_one(g, 
graph_commit)->object.oid,
+  get_commit_tree_oid(odb_commit)))
+   graph_report("root tree OID for commit %s in 
commit-graph is %s != %s",
+oid_to_hex(_oid),
+
oid_to_hex(get_commit_tree_oid(graph_commit)),
+
oid_to_hex(get_commit_tree_oid(odb_commit)));
}
 
return verify_commit_graph_error;
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index cf60e48496..c0c1248eda 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -267,6 +267,8 @@ GRAPH_BYTE_FANOUT2=$(($GRAPH_FANOUT_OFFSET + 4 \* 255))
 GRAPH_OID_LOOKUP_OFFSET=$(($GRAPH_FANOUT_OFFSET + 4 \* 256))
 GRAPH_BYTE_OID_LOOKUP_ORDER=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 8))
 GRAPH_BYTE_OID_LOOKUP_MISSING=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 4 + 
10))
+GRAPH_COMMIT_DATA_OFFSET=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 
$NUM_COMMITS))
+GRAPH_BYTE_COMMIT_TREE=$GRAPH_COMMIT_DATA_OFFSET
 
 # usage: corrupt_graph_and_verify   
 # Manipulates the commit-graph file at the position
@@ -341,4 +343,9 @@ test_expect_success 'detect OID not in object database' '
"from object database"
 '
 
+test_expect_success 'detect incorrect tree OID' '
+   corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_TREE "\01" \
+   "root tree OID for commit"
+'
+
 test_done
-- 
2.18.0.rc1



[PATCH v4 09/21] commit-graph: verify corrupt OID fanout and lookup

2018-06-04 Thread Derrick Stolee
In the commit-graph file, the OID fanout chunk provides an index into
the OID lookup. The 'verify' subcommand should find incorrect values
in the fanout.

Similarly, the 'verify' subcommand should find out-of-order values in
the OID lookup.

Signed-off-by: Derrick Stolee 
---
 commit-graph.c  | 36 
 t/t5318-commit-graph.sh | 22 ++
 2 files changed, 58 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index 25d5edea82..a90cc2e6f4 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -840,6 +840,9 @@ static void graph_report(const char *fmt, ...)
 
 int verify_commit_graph(struct commit_graph *g)
 {
+   uint32_t i, cur_fanout_pos = 0;
+   struct object_id prev_oid, cur_oid;
+
if (!g) {
graph_report("no commit-graph file loaded");
return 1;
@@ -854,5 +857,38 @@ int verify_commit_graph(struct commit_graph *g)
if (!g->chunk_commit_data)
graph_report("commit-graph is missing the Commit Data chunk");
 
+   if (verify_commit_graph_error)
+   return verify_commit_graph_error;
+
+   for (i = 0; i < g->num_commits; i++) {
+   hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
+
+   if (i && oidcmp(_oid, _oid) >= 0)
+   graph_report("commit-graph has incorrect OID order: %s 
then %s",
+oid_to_hex(_oid),
+oid_to_hex(_oid));
+
+   oidcpy(_oid, _oid);
+
+   while (cur_oid.hash[0] > cur_fanout_pos) {
+   uint32_t fanout_value = get_be32(g->chunk_oid_fanout + 
cur_fanout_pos);
+   if (i != fanout_value)
+   graph_report("commit-graph has incorrect fanout 
value: fanout[%d] = %u != %u",
+cur_fanout_pos, fanout_value, i);
+
+   cur_fanout_pos++;
+   }
+   }
+
+   while (cur_fanout_pos < 256) {
+   uint32_t fanout_value = get_be32(g->chunk_oid_fanout + 
cur_fanout_pos);
+
+   if (g->num_commits != fanout_value)
+   graph_report("commit-graph has incorrect fanout value: 
fanout[%d] = %u != %u",
+cur_fanout_pos, fanout_value, i);
+
+   cur_fanout_pos++;
+   }
+
return verify_commit_graph_error;
 }
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 846396665e..c29eae47c9 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -247,6 +247,7 @@ test_expect_success 'git commit-graph verify' '
git commit-graph verify >output
 '
 
+HASH_LEN=20
 GRAPH_BYTE_VERSION=4
 GRAPH_BYTE_HASH=5
 GRAPH_BYTE_CHUNK_COUNT=6
@@ -258,6 +259,12 @@ GRAPH_BYTE_OID_LOOKUP_ID=$(($GRAPH_CHUNK_LOOKUP_OFFSET + \
1 \* $GRAPH_CHUNK_LOOKUP_WIDTH))
 GRAPH_BYTE_COMMIT_DATA_ID=$(($GRAPH_CHUNK_LOOKUP_OFFSET + \
 2 \* $GRAPH_CHUNK_LOOKUP_WIDTH))
+GRAPH_FANOUT_OFFSET=$(($GRAPH_CHUNK_LOOKUP_OFFSET + \
+  $GRAPH_CHUNK_LOOKUP_WIDTH \* $GRAPH_CHUNK_LOOKUP_ROWS))
+GRAPH_BYTE_FANOUT1=$(($GRAPH_FANOUT_OFFSET + 4 \* 4))
+GRAPH_BYTE_FANOUT2=$(($GRAPH_FANOUT_OFFSET + 4 \* 255))
+GRAPH_OID_LOOKUP_OFFSET=$(($GRAPH_FANOUT_OFFSET + 4 \* 256))
+GRAPH_BYTE_OID_LOOKUP_ORDER=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 8))
 
 # usage: corrupt_graph_and_verify   
 # Manipulates the commit-graph file at the position
@@ -312,4 +319,19 @@ test_expect_success 'detect missing commit data chunk' '
"missing the Commit Data chunk"
 '
 
+test_expect_success 'detect incorrect fanout' '
+   corrupt_graph_and_verify $GRAPH_BYTE_FANOUT1 "\01" \
+   "fanout value"
+'
+
+test_expect_success 'detect incorrect fanout final value' '
+   corrupt_graph_and_verify $GRAPH_BYTE_FANOUT2 "\01" \
+   "fanout value"
+'
+
+test_expect_success 'detect incorrect OID order' '
+   corrupt_graph_and_verify $GRAPH_BYTE_OID_LOOKUP_ORDER "\01" \
+   "incorrect OID order"
+'
+
 test_done
-- 
2.18.0.rc1



[PATCH v4 12/21] commit-graph: verify parent list

2018-06-04 Thread Derrick Stolee
The commit-graph file stores parents in a two-column portion of the
commit data chunk. If there is only one parent, then the second column
stores 0x to indicate no second parent.

The 'verify' subcommand checks the parent list for the commit loaded
from the commit-graph and the one parsed from the object database. Test
these checks for corrupt parents, too many parents, and wrong parents.

Add a boundary check to insert_parent_or_die() for when the parent
position value is out of range.

The octopus merge will be tested in a later commit.

Signed-off-by: Derrick Stolee 
---
 commit-graph.c  | 28 
 t/t5318-commit-graph.sh | 18 ++
 2 files changed, 46 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index 50a8e27910..d2a86e329e 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -240,6 +240,9 @@ static struct commit_list **insert_parent_or_die(struct 
commit_graph *g,
struct commit *c;
struct object_id oid;
 
+   if (pos >= g->num_commits)
+   die("invalid parent position %"PRIu64, pos);
+
hashcpy(oid.hash, g->chunk_oid_lookup + g->hash_len * pos);
c = lookup_commit();
if (!c)
@@ -903,6 +906,7 @@ int verify_commit_graph(struct commit_graph *g)
 
for (i = 0; i < g->num_commits; i++) {
struct commit *graph_commit, *odb_commit;
+   struct commit_list *graph_parents, *odb_parents;
 
hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
 
@@ -920,6 +924,30 @@ int verify_commit_graph(struct commit_graph *g)
 oid_to_hex(_oid),
 
oid_to_hex(get_commit_tree_oid(graph_commit)),
 
oid_to_hex(get_commit_tree_oid(odb_commit)));
+
+   graph_parents = graph_commit->parents;
+   odb_parents = odb_commit->parents;
+
+   while (graph_parents) {
+   if (odb_parents == NULL) {
+   graph_report("commit-graph parent list for 
commit %s is too long",
+oid_to_hex(_oid));
+   break;
+   }
+
+   if (oidcmp(_parents->item->object.oid, 
_parents->item->object.oid))
+   graph_report("commit-graph parent for %s is %s 
!= %s",
+oid_to_hex(_oid),
+
oid_to_hex(_parents->item->object.oid),
+
oid_to_hex(_parents->item->object.oid));
+
+   graph_parents = graph_parents->next;
+   odb_parents = odb_parents->next;
+   }
+
+   if (odb_parents != NULL)
+   graph_report("commit-graph parent list for commit %s 
terminates early",
+oid_to_hex(_oid));
}
 
return verify_commit_graph_error;
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index c0c1248eda..ec0964112a 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -269,6 +269,9 @@ GRAPH_BYTE_OID_LOOKUP_ORDER=$(($GRAPH_OID_LOOKUP_OFFSET + 
$HASH_LEN \* 8))
 GRAPH_BYTE_OID_LOOKUP_MISSING=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 4 + 
10))
 GRAPH_COMMIT_DATA_OFFSET=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 
$NUM_COMMITS))
 GRAPH_BYTE_COMMIT_TREE=$GRAPH_COMMIT_DATA_OFFSET
+GRAPH_BYTE_COMMIT_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN))
+GRAPH_BYTE_COMMIT_EXTRA_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 4))
+GRAPH_BYTE_COMMIT_WRONG_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 3))
 
 # usage: corrupt_graph_and_verify   
 # Manipulates the commit-graph file at the position
@@ -348,4 +351,19 @@ test_expect_success 'detect incorrect tree OID' '
"root tree OID for commit"
 '
 
+test_expect_success 'detect incorrect parent int-id' '
+   corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_PARENT "\01" \
+   "invalid parent"
+'
+
+test_expect_success 'detect extra parent int-id' '
+   corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_EXTRA_PARENT "\00" \
+   "is too long"
+'
+
+test_expect_success 'detect wrong parent' '
+   corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_WRONG_PARENT "\01" \
+   "commit-graph parent for"
+'
+
 test_done
-- 
2.18.0.rc1



[PATCH v4 02/21] commit-graph: fix GRAPH_MIN_SIZE

2018-06-04 Thread Derrick Stolee
The GRAPH_MIN_SIZE macro should be the smallest size of a parsable
commit-graph file. However, the minimum number of chunks was wrong.
It is possible to write a commit-graph file with zero commits, and
that violates this macro's value.

Rewrite the macro, and use extra macros to better explain the magic
constants.

Signed-off-by: Derrick Stolee 
---
 commit-graph.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index bb54c1214c..c09e87c3c2 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -34,10 +34,11 @@
 
 #define GRAPH_LAST_EDGE 0x8000
 
+#define GRAPH_HEADER_SIZE 8
 #define GRAPH_FANOUT_SIZE (4 * 256)
 #define GRAPH_CHUNKLOOKUP_WIDTH 12
-#define GRAPH_MIN_SIZE (5 * GRAPH_CHUNKLOOKUP_WIDTH + GRAPH_FANOUT_SIZE + \
-   GRAPH_OID_LEN + 8)
+#define GRAPH_MIN_SIZE (GRAPH_HEADER_SIZE + 4 * GRAPH_CHUNKLOOKUP_WIDTH \
+   + GRAPH_FANOUT_SIZE + GRAPH_OID_LEN)
 
 char *get_commit_graph_filename(const char *obj_dir)
 {
-- 
2.18.0.rc1



[PATCH v4 01/21] commit-graph: UNLEAK before die()

2018-06-04 Thread Derrick Stolee
Signed-off-by: Derrick Stolee 
---
 builtin/commit-graph.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
index 37420ae0fd..f0875b8bf3 100644
--- a/builtin/commit-graph.c
+++ b/builtin/commit-graph.c
@@ -51,8 +51,11 @@ static int graph_read(int argc, const char **argv)
graph_name = get_commit_graph_filename(opts.obj_dir);
graph = load_commit_graph_one(graph_name);
 
-   if (!graph)
+   if (!graph) {
+   UNLEAK(graph_name);
die("graph file %s does not exist", graph_name);
+   }
+
FREE_AND_NULL(graph_name);
 
printf("header: %08x %d %d %d %d\n",
-- 
2.18.0.rc1



[PATCH v4 10/21] commit-graph: verify objects exist

2018-06-04 Thread Derrick Stolee
In the 'verify' subcommand, load commits directly from the object
database to ensure they exist. Parse by skipping the commit-graph.

Signed-off-by: Derrick Stolee 
---
 commit-graph.c  | 17 +
 t/t5318-commit-graph.sh |  7 +++
 2 files changed, 24 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index a90cc2e6f4..0cf1b61d80 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -239,6 +239,7 @@ static struct commit_list **insert_parent_or_die(struct 
commit_graph *g,
 {
struct commit *c;
struct object_id oid;
+
hashcpy(oid.hash, g->chunk_oid_lookup + g->hash_len * pos);
c = lookup_commit();
if (!c)
@@ -890,5 +891,21 @@ int verify_commit_graph(struct commit_graph *g)
cur_fanout_pos++;
}
 
+   if (verify_commit_graph_error)
+   return verify_commit_graph_error;
+
+   for (i = 0; i < g->num_commits; i++) {
+   struct commit *odb_commit;
+
+   hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
+
+   odb_commit = (struct commit *)create_object(cur_oid.hash, 
alloc_commit_node());
+   if (parse_commit_internal(odb_commit, 0, 0)) {
+   graph_report("failed to parse %s from object database",
+oid_to_hex(_oid));
+   continue;
+   }
+   }
+
return verify_commit_graph_error;
 }
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index c29eae47c9..cf60e48496 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -247,6 +247,7 @@ test_expect_success 'git commit-graph verify' '
git commit-graph verify >output
 '
 
+NUM_COMMITS=9
 HASH_LEN=20
 GRAPH_BYTE_VERSION=4
 GRAPH_BYTE_HASH=5
@@ -265,6 +266,7 @@ GRAPH_BYTE_FANOUT1=$(($GRAPH_FANOUT_OFFSET + 4 \* 4))
 GRAPH_BYTE_FANOUT2=$(($GRAPH_FANOUT_OFFSET + 4 \* 255))
 GRAPH_OID_LOOKUP_OFFSET=$(($GRAPH_FANOUT_OFFSET + 4 \* 256))
 GRAPH_BYTE_OID_LOOKUP_ORDER=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 8))
+GRAPH_BYTE_OID_LOOKUP_MISSING=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 4 + 
10))
 
 # usage: corrupt_graph_and_verify   
 # Manipulates the commit-graph file at the position
@@ -334,4 +336,9 @@ test_expect_success 'detect incorrect OID order' '
"incorrect OID order"
 '
 
+test_expect_success 'detect OID not in object database' '
+   corrupt_graph_and_verify $GRAPH_BYTE_OID_LOOKUP_MISSING "\01" \
+   "from object database"
+'
+
 test_done
-- 
2.18.0.rc1



[PATCH v4 15/21] commit-graph: test for corrupted octopus edge

2018-06-04 Thread Derrick Stolee
The commit-graph file has an extra chunk to store the parent int-ids for
parents beyond the first parent for octopus merges. Our test repo has a
single octopus merge that we can manipulate to demonstrate the 'verify'
subcommand detects incorrect values in that chunk.

Signed-off-by: Derrick Stolee 
---
 t/t5318-commit-graph.sh | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 6a873bfda8..cf67fb391a 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -248,6 +248,7 @@ test_expect_success 'git commit-graph verify' '
 '
 
 NUM_COMMITS=9
+NUM_OCTOPUS_EDGES=2
 HASH_LEN=20
 GRAPH_BYTE_VERSION=4
 GRAPH_BYTE_HASH=5
@@ -274,6 +275,10 @@ 
GRAPH_BYTE_COMMIT_EXTRA_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 4))
 GRAPH_BYTE_COMMIT_WRONG_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 3))
 GRAPH_BYTE_COMMIT_GENERATION=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 11))
 GRAPH_BYTE_COMMIT_DATE=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 12))
+GRAPH_COMMIT_DATA_WIDTH=$(($HASH_LEN + 16))
+GRAPH_OCTOPUS_DATA_OFFSET=$(($GRAPH_COMMIT_DATA_OFFSET + \
+$GRAPH_COMMIT_DATA_WIDTH \* $NUM_COMMITS))
+GRAPH_BYTE_OCTOPUS=$(($GRAPH_OCTOPUS_DATA_OFFSET + 4))
 
 # usage: corrupt_graph_and_verify   
 # Manipulates the commit-graph file at the position
@@ -383,4 +388,9 @@ test_expect_success 'detect incorrect commit date' '
"commit date"
 '
 
+test_expect_success 'detect incorrect parent for octopus merge' '
+   corrupt_graph_and_verify $GRAPH_BYTE_OCTOPUS "\01" \
+   "invalid parent"
+'
+
 test_done
-- 
2.18.0.rc1



[PATCH v4 04/21] commit: force commit to parse from object database

2018-06-04 Thread Derrick Stolee
In anticipation of verifying commit-graph file contents against the
object database, create parse_commit_internal() to allow side-stepping
the commit-graph file and parse directly from the object database.

Due to the use of generation numbers, this method should not be called
unless the intention is explicit in avoiding commits from the
commit-graph file.

Signed-off-by: Derrick Stolee 
---
 commit.c | 9 +++--
 commit.h | 1 +
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/commit.c b/commit.c
index 1d28677dfb..6eaed0174c 100644
--- a/commit.c
+++ b/commit.c
@@ -392,7 +392,7 @@ int parse_commit_buffer(struct commit *item, const void 
*buffer, unsigned long s
return 0;
 }
 
-int parse_commit_gently(struct commit *item, int quiet_on_missing)
+int parse_commit_internal(struct commit *item, int quiet_on_missing, int 
use_commit_graph)
 {
enum object_type type;
void *buffer;
@@ -403,7 +403,7 @@ int parse_commit_gently(struct commit *item, int 
quiet_on_missing)
return -1;
if (item->object.parsed)
return 0;
-   if (parse_commit_in_graph(item))
+   if (use_commit_graph && parse_commit_in_graph(item))
return 0;
buffer = read_sha1_file(item->object.oid.hash, , );
if (!buffer)
@@ -424,6 +424,11 @@ int parse_commit_gently(struct commit *item, int 
quiet_on_missing)
return ret;
 }
 
+int parse_commit_gently(struct commit *item, int quiet_on_missing)
+{
+   return parse_commit_internal(item, quiet_on_missing, 1);
+}
+
 void parse_commit_or_die(struct commit *item)
 {
if (parse_commit(item))
diff --git a/commit.h b/commit.h
index b5afde1ae9..5fde74fcd7 100644
--- a/commit.h
+++ b/commit.h
@@ -73,6 +73,7 @@ struct commit *lookup_commit_reference_by_name(const char 
*name);
 struct commit *lookup_commit_or_die(const struct object_id *oid, const char 
*ref_name);
 
 int parse_commit_buffer(struct commit *item, const void *buffer, unsigned long 
size, int check_graph);
+int parse_commit_internal(struct commit *item, int quiet_on_missing, int 
use_commit_graph);
 int parse_commit_gently(struct commit *item, int quiet_on_missing);
 static inline int parse_commit(struct commit *item)
 {
-- 
2.18.0.rc1



[PATCH v4 18/21] commit-graph: use string-list API for input

2018-06-04 Thread Derrick Stolee
Signed-off-by: Derrick Stolee 
---
 builtin/commit-graph.c | 39 +--
 commit-graph.c | 15 +++
 commit-graph.h |  7 +++
 3 files changed, 23 insertions(+), 38 deletions(-)

diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
index 3079cde6f9..d8eb8278b3 100644
--- a/builtin/commit-graph.c
+++ b/builtin/commit-graph.c
@@ -118,13 +118,9 @@ static int graph_read(int argc, const char **argv)
 
 static int graph_write(int argc, const char **argv)
 {
-   const char **pack_indexes = NULL;
-   int packs_nr = 0;
-   const char **commit_hex = NULL;
-   int commits_nr = 0;
-   const char **lines = NULL;
-   int lines_nr = 0;
-   int lines_alloc = 0;
+   struct string_list *pack_indexes = NULL;
+   struct string_list *commit_hex = NULL;
+   struct string_list lines;
 
static struct option builtin_commit_graph_write_options[] = {
OPT_STRING(0, "object-dir", _dir,
@@ -150,32 +146,23 @@ static int graph_write(int argc, const char **argv)
 
if (opts.stdin_packs || opts.stdin_commits) {
struct strbuf buf = STRBUF_INIT;
-   lines_nr = 0;
-   lines_alloc = 128;
-   ALLOC_ARRAY(lines, lines_alloc);
-
-   while (strbuf_getline(, stdin) != EOF) {
-   ALLOC_GROW(lines, lines_nr + 1, lines_alloc);
-   lines[lines_nr++] = strbuf_detach(, NULL);
-   }
-
-   if (opts.stdin_packs) {
-   pack_indexes = lines;
-   packs_nr = lines_nr;
-   }
-   if (opts.stdin_commits) {
-   commit_hex = lines;
-   commits_nr = lines_nr;
-   }
+   string_list_init(, 0);
+
+   while (strbuf_getline(, stdin) != EOF)
+   string_list_append(, strbuf_detach(, NULL));
+
+   if (opts.stdin_packs)
+   pack_indexes = 
+   if (opts.stdin_commits)
+   commit_hex = 
}
 
write_commit_graph(opts.obj_dir,
   pack_indexes,
-  packs_nr,
   commit_hex,
-  commits_nr,
   opts.append);
 
+   string_list_clear(, 0);
return 0;
 }
 
diff --git a/commit-graph.c b/commit-graph.c
index 05879de26c..c6070735c2 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -653,10 +653,8 @@ static void compute_generation_numbers(struct 
packed_commit_list* commits)
 }
 
 void write_commit_graph(const char *obj_dir,
-   const char **pack_indexes,
-   int nr_packs,
-   const char **commit_hex,
-   int nr_commits,
+   struct string_list *pack_indexes,
+   struct string_list *commit_hex,
int append)
 {
struct packed_oid_list oids;
@@ -697,10 +695,10 @@ void write_commit_graph(const char *obj_dir,
int dirlen;
strbuf_addf(, "%s/pack/", obj_dir);
dirlen = packname.len;
-   for (i = 0; i < nr_packs; i++) {
+   for (i = 0; i < pack_indexes->nr; i++) {
struct packed_git *p;
strbuf_setlen(, dirlen);
-   strbuf_addstr(, pack_indexes[i]);
+   strbuf_addstr(, pack_indexes->items[i].string);
p = add_packed_git(packname.buf, packname.len, 1);
if (!p)
die("error adding pack %s", packname.buf);
@@ -713,12 +711,13 @@ void write_commit_graph(const char *obj_dir,
}
 
if (commit_hex) {
-   for (i = 0; i < nr_commits; i++) {
+   for (i = 0; i < commit_hex->nr; i++) {
const char *end;
struct object_id oid;
struct commit *result;
 
-   if (commit_hex[i] && parse_oid_hex(commit_hex[i], , 
))
+   if (commit_hex->items[i].string &&
+   parse_oid_hex(commit_hex->items[i].string, , 
))
continue;
 
result = lookup_commit_reference_gently(, 1);
diff --git a/commit-graph.h b/commit-graph.h
index 71a39c5a57..1e1fc5 100644
--- a/commit-graph.h
+++ b/commit-graph.h
@@ -2,6 +2,7 @@
 #define COMMIT_GRAPH_H
 
 #include "git-compat-util.h"
+#include "string-list.h"
 
 char *get_commit_graph_filename(const char *obj_dir);
 
@@ -47,10 +48,8 @@ struct commit_graph {
 struct commit_graph *load_commit_graph_one(const char *graph_file);
 
 void write_commit_graph(const char *obj

[PATCH v4 13/21] commit-graph: verify generation number

2018-06-04 Thread Derrick Stolee
While iterating through the commit parents, perform the generation
number calculation and compare against the value stored in the
commit-graph.

The tests demonstrate that having a different set of parents affects
the generation number calculation, and this value propagates to
descendants. Hence, we drop the single-line condition on the output.

Since Git will ship with the commit-graph feature without generation
numbers, we need to accept commit-graphs with all generation numbers
equal to zero. In this case, ignore the generation number calculation.

However, verify that we should never have a mix of zero and non-zero
generation numbers. Create a test that sets one commit to generation
zero and all following commits report a failure as they have non-zero
generation in a file that contains generation number zero.

Signed-off-by: Derrick Stolee 
---
 commit-graph.c  | 34 ++
 t/t5318-commit-graph.sh | 11 +++
 2 files changed, 45 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index d2a86e329e..5faecae2a7 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -842,10 +842,14 @@ static void graph_report(const char *fmt, ...)
va_end(ap);
 }
 
+#define GENERATION_ZERO_EXISTS 1
+#define GENERATION_NUMBER_EXISTS 2
+
 int verify_commit_graph(struct commit_graph *g)
 {
uint32_t i, cur_fanout_pos = 0;
struct object_id prev_oid, cur_oid;
+   int generation_zero = 0;
 
if (!g) {
graph_report("no commit-graph file loaded");
@@ -907,6 +911,7 @@ int verify_commit_graph(struct commit_graph *g)
for (i = 0; i < g->num_commits; i++) {
struct commit *graph_commit, *odb_commit;
struct commit_list *graph_parents, *odb_parents;
+   uint32_t max_generation = 0;
 
hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i);
 
@@ -941,6 +946,9 @@ int verify_commit_graph(struct commit_graph *g)
 
oid_to_hex(_parents->item->object.oid),
 
oid_to_hex(_parents->item->object.oid));
 
+   if (graph_parents->item->generation > max_generation)
+   max_generation = 
graph_parents->item->generation;
+
graph_parents = graph_parents->next;
odb_parents = odb_parents->next;
}
@@ -948,6 +956,32 @@ int verify_commit_graph(struct commit_graph *g)
if (odb_parents != NULL)
graph_report("commit-graph parent list for commit %s 
terminates early",
 oid_to_hex(_oid));
+
+   if (!graph_commit->generation) {
+   if (generation_zero == GENERATION_NUMBER_EXISTS)
+   graph_report("commit-graph has generation 
number zero for commit %s, but non-zero elsewhere",
+oid_to_hex(_oid));
+   generation_zero = GENERATION_ZERO_EXISTS;
+   } else if (generation_zero == GENERATION_ZERO_EXISTS)
+   graph_report("commit-graph has non-zero generation 
number for commit %s, but zero elsewhere",
+oid_to_hex(_oid));
+
+   if (generation_zero == GENERATION_ZERO_EXISTS)
+   continue;
+
+   /*
+* If one of our parents has generation GENERATION_NUMBER_MAX, 
then
+* our generation is also GENERATION_NUMBER_MAX. Decrement to 
avoid
+* extra logic in the following condition.
+*/
+   if (max_generation == GENERATION_NUMBER_MAX)
+   max_generation--;
+
+   if (graph_commit->generation != max_generation + 1)
+   graph_report("commit-graph generation for commit %s is 
%u != %u",
+oid_to_hex(_oid),
+graph_commit->generation,
+max_generation + 1);
}
 
return verify_commit_graph_error;
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index ec0964112a..a6ea1341dc 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -272,6 +272,7 @@ GRAPH_BYTE_COMMIT_TREE=$GRAPH_COMMIT_DATA_OFFSET
 GRAPH_BYTE_COMMIT_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN))
 GRAPH_BYTE_COMMIT_EXTRA_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 4))
 GRAPH_BYTE_COMMIT_WRONG_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 3))
+GRAPH_BYTE_COMMIT_GENERATION=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 11))
 
 # usage: corrupt_graph_and_verify   
 # Manipulates the commit-graph file at the position
@@ -366,4 +367,14 @@ test_expect_success 'detect wrong parent' '

<    1   2   3   4   5   6   7   8   9   10   >