[PATCH v6 18/21] commit-graph: use string-list API for input
Signed-off-by: Derrick Stolee --- builtin/commit-graph.c | 39 +-- commit-graph.c | 15 +++ commit-graph.h | 7 +++ 3 files changed, 23 insertions(+), 38 deletions(-) diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c index 9d108f43a9..ea28bc311a 100644 --- a/builtin/commit-graph.c +++ b/builtin/commit-graph.c @@ -119,13 +119,9 @@ static int graph_read(int argc, const char **argv) static int graph_write(int argc, const char **argv) { - const char **pack_indexes = NULL; - int packs_nr = 0; - const char **commit_hex = NULL; - int commits_nr = 0; - const char **lines = NULL; - int lines_nr = 0; - int lines_alloc = 0; + struct string_list *pack_indexes = NULL; + struct string_list *commit_hex = NULL; + struct string_list lines; static struct option builtin_commit_graph_write_options[] = { OPT_STRING(0, "object-dir", _dir, @@ -149,34 +145,25 @@ static int graph_write(int argc, const char **argv) if (!opts.obj_dir) opts.obj_dir = get_object_directory(); + string_list_init(, 0); if (opts.stdin_packs || opts.stdin_commits) { struct strbuf buf = STRBUF_INIT; - lines_nr = 0; - lines_alloc = 128; - ALLOC_ARRAY(lines, lines_alloc); - - while (strbuf_getline(, stdin) != EOF) { - ALLOC_GROW(lines, lines_nr + 1, lines_alloc); - lines[lines_nr++] = strbuf_detach(, NULL); - } - - if (opts.stdin_packs) { - pack_indexes = lines; - packs_nr = lines_nr; - } - if (opts.stdin_commits) { - commit_hex = lines; - commits_nr = lines_nr; - } + + while (strbuf_getline(, stdin) != EOF) + string_list_append(, strbuf_detach(, NULL)); + + if (opts.stdin_packs) + pack_indexes = + if (opts.stdin_commits) + commit_hex = } write_commit_graph(opts.obj_dir, pack_indexes, - packs_nr, commit_hex, - commits_nr, opts.append); + string_list_clear(, 0); return 0; } diff --git a/commit-graph.c b/commit-graph.c index d926c4b59f..a06e7e9549 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -657,10 +657,8 @@ static void compute_generation_numbers(struct packed_commit_list* commits) } void write_commit_graph(const char *obj_dir, - const char **pack_indexes, - int nr_packs, - const char **commit_hex, - int nr_commits, + struct string_list *pack_indexes, + struct string_list *commit_hex, int append) { struct packed_oid_list oids; @@ -701,10 +699,10 @@ void write_commit_graph(const char *obj_dir, int dirlen; strbuf_addf(, "%s/pack/", obj_dir); dirlen = packname.len; - for (i = 0; i < nr_packs; i++) { + for (i = 0; i < pack_indexes->nr; i++) { struct packed_git *p; strbuf_setlen(, dirlen); - strbuf_addstr(, pack_indexes[i]); + strbuf_addstr(, pack_indexes->items[i].string); p = add_packed_git(packname.buf, packname.len, 1); if (!p) die("error adding pack %s", packname.buf); @@ -717,12 +715,13 @@ void write_commit_graph(const char *obj_dir, } if (commit_hex) { - for (i = 0; i < nr_commits; i++) { + for (i = 0; i < commit_hex->nr; i++) { const char *end; struct object_id oid; struct commit *result; - if (commit_hex[i] && parse_oid_hex(commit_hex[i], , )) + if (commit_hex->items[i].string && + parse_oid_hex(commit_hex->items[i].string, , )) continue; result = lookup_commit_reference_gently(, 1); diff --git a/commit-graph.h b/commit-graph.h index 4359812fb4..a79b854470 100644 --- a/commit-graph.h +++ b/commit-graph.h @@ -3,6 +3,7 @@ #include "git-compat-util.h" #include "repository.h" +#include "string-list.h" char *get_commit_graph_filename(const char *obj_dir); @@ -48,10 +49,8 @@ struct commit_graph { struct commit_graph *load_commit_graph_one
[PATCH v6 06/21] commit-graph: add 'verify' subcommand
If the commit-graph file becomes corrupt, we need a way to verify that its contents match the object database. In the manner of 'git fsck' we will implement a 'git commit-graph verify' subcommand to report all issues with the file. Add the 'verify' subcommand to the 'commit-graph' builtin and its documentation. The subcommand is currently a no-op except for loading the commit-graph into memory, which may trigger run-time errors that would be caught by normal use. Add a simple test that ensures the command returns a zero error code. If no commit-graph file exists, this is an acceptable state. Do not report any errors. Helped-by: Ramsay Jones Signed-off-by: Derrick Stolee --- Documentation/git-commit-graph.txt | 6 + builtin/commit-graph.c | 39 ++ commit-graph.c | 23 ++ commit-graph.h | 3 +++ t/t5318-commit-graph.sh| 10 5 files changed, 81 insertions(+) diff --git a/Documentation/git-commit-graph.txt b/Documentation/git-commit-graph.txt index 4c97b555cc..a222cfab08 100644 --- a/Documentation/git-commit-graph.txt +++ b/Documentation/git-commit-graph.txt @@ -10,6 +10,7 @@ SYNOPSIS [verse] 'git commit-graph read' [--object-dir ] +'git commit-graph verify' [--object-dir ] 'git commit-graph write' [--object-dir ] @@ -52,6 +53,11 @@ existing commit-graph file. Read a graph file given by the commit-graph file and output basic details about the graph file. Used for debugging purposes. +'verify':: + +Read the commit-graph file and verify its contents against the object +database. Used to check for corrupted data. + EXAMPLES diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c index f0875b8bf3..9d108f43a9 100644 --- a/builtin/commit-graph.c +++ b/builtin/commit-graph.c @@ -3,15 +3,22 @@ #include "dir.h" #include "lockfile.h" #include "parse-options.h" +#include "repository.h" #include "commit-graph.h" static char const * const builtin_commit_graph_usage[] = { N_("git commit-graph [--object-dir ]"), N_("git commit-graph read [--object-dir ]"), + N_("git commit-graph verify [--object-dir ]"), N_("git commit-graph write [--object-dir ] [--append] [--stdin-packs|--stdin-commits]"), NULL }; +static const char * const builtin_commit_graph_verify_usage[] = { + N_("git commit-graph verify [--object-dir ]"), + NULL +}; + static const char * const builtin_commit_graph_read_usage[] = { N_("git commit-graph read [--object-dir ]"), NULL @@ -29,6 +36,36 @@ static struct opts_commit_graph { int append; } opts; + +static int graph_verify(int argc, const char **argv) +{ + struct commit_graph *graph = NULL; + char *graph_name; + + static struct option builtin_commit_graph_verify_options[] = { + OPT_STRING(0, "object-dir", _dir, + N_("dir"), + N_("The object directory to store the graph")), + OPT_END(), + }; + + argc = parse_options(argc, argv, NULL, +builtin_commit_graph_verify_options, +builtin_commit_graph_verify_usage, 0); + + if (!opts.obj_dir) + opts.obj_dir = get_object_directory(); + + graph_name = get_commit_graph_filename(opts.obj_dir); + graph = load_commit_graph_one(graph_name); + FREE_AND_NULL(graph_name); + + if (!graph) + return 0; + + return verify_commit_graph(the_repository, graph); +} + static int graph_read(int argc, const char **argv) { struct commit_graph *graph = NULL; @@ -165,6 +202,8 @@ int cmd_commit_graph(int argc, const char **argv, const char *prefix) if (argc > 0) { if (!strcmp(argv[0], "read")) return graph_read(argc, argv); + if (!strcmp(argv[0], "verify")) + return graph_verify(argc, argv); if (!strcmp(argv[0], "write")) return graph_write(argc, argv); } diff --git a/commit-graph.c b/commit-graph.c index 9e228d3bb5..22ef696e18 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -827,3 +827,26 @@ void write_commit_graph(const char *obj_dir, oids.alloc = 0; oids.nr = 0; } + +static int verify_commit_graph_error; + +static void graph_report(const char *fmt, ...) +{ + va_list ap; + verify_commit_graph_error = 1; + + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + fprintf(stderr, "\n"); + va_end(ap); +} + +int verify_commit_graph(struct repository *r, struct commit_graph *g) +{ + if (!g) { + graph_report("no commit-graph fil
[PATCH v6 03/21] commit-graph: parse commit from chosen graph
Before verifying a commit-graph file against the object database, we need to parse all commits from the given commit-graph file. Create parse_commit_in_graph_one() to target a given struct commit_graph. Signed-off-by: Derrick Stolee --- commit-graph.c | 18 +++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/commit-graph.c b/commit-graph.c index f83f6d2373..e77b19971d 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -314,7 +314,7 @@ static int find_commit_in_graph(struct commit *item, struct commit_graph *g, uin } } -int parse_commit_in_graph(struct commit *item) +static int parse_commit_in_graph_one(struct commit_graph *g, struct commit *item) { uint32_t pos; @@ -322,9 +322,21 @@ int parse_commit_in_graph(struct commit *item) return 0; if (item->object.parsed) return 1; + + if (find_commit_in_graph(item, g, )) + return fill_commit_in_graph(item, g, pos); + + return 0; +} + +int parse_commit_in_graph(struct commit *item) +{ + if (!core_commit_graph) + return 0; + prepare_commit_graph(); - if (commit_graph && find_commit_in_graph(item, commit_graph, )) - return fill_commit_in_graph(item, commit_graph, pos); + if (commit_graph) + return parse_commit_in_graph_one(commit_graph, item); return 0; } -- 2.18.0.rc1
[PATCH v6 19/21] commit-graph: add '--reachable' option
When writing commit-graph files, it can be convenient to ask for all reachable commits (starting at the ref set) in the resulting file. This is particularly helpful when writing to stdin is complicated, such as a future integration with 'git gc'. Signed-off-by: Derrick Stolee --- Documentation/git-commit-graph.txt | 8 ++-- builtin/commit-graph.c | 16 commit-graph.c | 18 ++ commit-graph.h | 1 + t/t5318-commit-graph.sh| 10 ++ 5 files changed, 47 insertions(+), 6 deletions(-) diff --git a/Documentation/git-commit-graph.txt b/Documentation/git-commit-graph.txt index a222cfab08..dececb79d7 100644 --- a/Documentation/git-commit-graph.txt +++ b/Documentation/git-commit-graph.txt @@ -38,12 +38,16 @@ Write a commit graph file based on the commits found in packfiles. + With the `--stdin-packs` option, generate the new commit graph by walking objects only in the specified pack-indexes. (Cannot be combined -with --stdin-commits.) +with `--stdin-commits` or `--reachable`.) + With the `--stdin-commits` option, generate the new commit graph by walking commits starting at the commits specified in stdin as a list of OIDs in hex, one OID per line. (Cannot be combined with ---stdin-packs.) +`--stdin-packs` or `--reachable`.) ++ +With the `--reachable` option, generate the new commit graph by walking +commits starting at all refs. (Cannot be combined with `--stdin-commits` +or `--stdin-packs`.) + With the `--append` option, include all commits that are present in the existing commit-graph file. diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c index ea28bc311a..c7d0db5ab4 100644 --- a/builtin/commit-graph.c +++ b/builtin/commit-graph.c @@ -10,7 +10,7 @@ static char const * const builtin_commit_graph_usage[] = { N_("git commit-graph [--object-dir ]"), N_("git commit-graph read [--object-dir ]"), N_("git commit-graph verify [--object-dir ]"), - N_("git commit-graph write [--object-dir ] [--append] [--stdin-packs|--stdin-commits]"), + N_("git commit-graph write [--object-dir ] [--append] [--reachable|--stdin-packs|--stdin-commits]"), NULL }; @@ -25,12 +25,13 @@ static const char * const builtin_commit_graph_read_usage[] = { }; static const char * const builtin_commit_graph_write_usage[] = { - N_("git commit-graph write [--object-dir ] [--append] [--stdin-packs|--stdin-commits]"), + N_("git commit-graph write [--object-dir ] [--append] [--reachable|--stdin-packs|--stdin-commits]"), NULL }; static struct opts_commit_graph { const char *obj_dir; + int reachable; int stdin_packs; int stdin_commits; int append; @@ -127,6 +128,8 @@ static int graph_write(int argc, const char **argv) OPT_STRING(0, "object-dir", _dir, N_("dir"), N_("The object directory to store the graph")), + OPT_BOOL(0, "reachable", , + N_("start walk at all refs")), OPT_BOOL(0, "stdin-packs", _packs, N_("scan pack-indexes listed by stdin for commits")), OPT_BOOL(0, "stdin-commits", _commits, @@ -140,11 +143,16 @@ static int graph_write(int argc, const char **argv) builtin_commit_graph_write_options, builtin_commit_graph_write_usage, 0); - if (opts.stdin_packs && opts.stdin_commits) - die(_("cannot use both --stdin-commits and --stdin-packs")); + if (opts.reachable + opts.stdin_packs + opts.stdin_commits > 1) + die(_("use at most one of --reachable, --stdin-commits, or --stdin-packs")); if (!opts.obj_dir) opts.obj_dir = get_object_directory(); + if (opts.reachable) { + write_commit_graph_reachable(opts.obj_dir, opts.append); + return 0; + } + string_list_init(, 0); if (opts.stdin_packs || opts.stdin_commits) { struct strbuf buf = STRBUF_INIT; diff --git a/commit-graph.c b/commit-graph.c index a06e7e9549..adf54e3fe7 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -7,6 +7,7 @@ #include "packfile.h" #include "commit.h" #include "object.h" +#include "refs.h" #include "revision.h" #include "sha1-lookup.h" #include "commit-graph.h" @@ -656,6 +657,23 @@ static void compute_generation_numbers(struct packed_commit_list* commits) } } +static int add_ref_to_list(const char *refname, + const struct object_id *oid, + int flags, void *cb
[PATCH v6 20/21] gc: automatically write commit-graph files
The commit-graph file is a very helpful feature for speeding up git operations. In order to make it more useful, make it possible to write the commit-graph file during standard garbage collection operations. Add a 'gc.commitGraph' config setting that triggers writing a commit-graph file after any non-trivial 'git gc' command. Defaults to false while the commit-graph feature matures. We specifically do not want to have this on by default until the commit-graph feature is fully integrated with history-modifying features like shallow clones. Helped-by: Ævar Arnfjörð Bjarmason Signed-off-by: Derrick Stolee --- Documentation/config.txt | 10 +- Documentation/git-gc.txt | 4 builtin/gc.c | 6 ++ t/t5318-commit-graph.sh | 14 ++ 4 files changed, 33 insertions(+), 1 deletion(-) diff --git a/Documentation/config.txt b/Documentation/config.txt index ab641bf5a9..f2b5ed17c8 100644 --- a/Documentation/config.txt +++ b/Documentation/config.txt @@ -906,7 +906,8 @@ the `GIT_NOTES_REF` environment variable. See linkgit:git-notes[1]. core.commitGraph:: Enable git commit graph feature. Allows reading from the - commit-graph file. + commit-graph file. See `gc.commitGraph` for automatically + maintaining the file. core.sparseCheckout:: Enable "sparse checkout" feature. See section "Sparse checkout" in @@ -1647,6 +1648,13 @@ this configuration variable is ignored, all packs except the base pack will be repacked. After this the number of packs should go below gc.autoPackLimit and gc.bigPackThreshold should be respected again. +gc.commitGraph:: + If true, then gc will rewrite the commit-graph file when + linkgit:git-gc[1] is run. When using linkgit:git-gc[1] + '--auto' the commit-graph will be updated if housekeeping is + required. Default is false. See linkgit:git-commit-graph[1] + for details. + gc.logExpiry:: If the file gc.log exists, then `git gc --auto` won't run unless that file is more than 'gc.logExpiry' old. Default is diff --git a/Documentation/git-gc.txt b/Documentation/git-gc.txt index 24b2dd44fe..f5bc98ccb3 100644 --- a/Documentation/git-gc.txt +++ b/Documentation/git-gc.txt @@ -136,6 +136,10 @@ The optional configuration variable `gc.packRefs` determines if it within all non-bare repos or it can be set to a boolean value. This defaults to true. +The optional configuration variable `gc.commitGraph` determines if +'git gc' should run 'git commit-graph write'. This can be set to a +boolean value. This defaults to false. + The optional configuration variable `gc.aggressiveWindow` controls how much time is spent optimizing the delta compression of the objects in the repository when the --aggressive option is specified. The larger diff --git a/builtin/gc.c b/builtin/gc.c index ccfb1ceaeb..4e06e8372d 100644 --- a/builtin/gc.c +++ b/builtin/gc.c @@ -20,6 +20,7 @@ #include "sigchain.h" #include "argv-array.h" #include "commit.h" +#include "commit-graph.h" #include "packfile.h" #include "object-store.h" #include "pack.h" @@ -40,6 +41,7 @@ static int aggressive_depth = 50; static int aggressive_window = 250; static int gc_auto_threshold = 6700; static int gc_auto_pack_limit = 50; +static int gc_commit_graph = 0; static int detach_auto = 1; static timestamp_t gc_log_expire_time; static const char *gc_log_expire = "1.day.ago"; @@ -129,6 +131,7 @@ static void gc_config(void) git_config_get_int("gc.aggressivedepth", _depth); git_config_get_int("gc.auto", _auto_threshold); git_config_get_int("gc.autopacklimit", _auto_pack_limit); + git_config_get_bool("gc.commitgraph", _commit_graph); git_config_get_bool("gc.autodetach", _auto); git_config_get_expiry("gc.pruneexpire", _expire); git_config_get_expiry("gc.worktreepruneexpire", _worktrees_expire); @@ -641,6 +644,9 @@ int cmd_gc(int argc, const char **argv, const char *prefix) if (pack_garbage.nr > 0) clean_pack_garbage(); + if (gc_commit_graph) + write_commit_graph_reachable(get_object_directory(), 0); + if (auto_gc && too_many_loose_objects()) warning(_("There are too many unreachable loose objects; " "run 'git prune' to remove them.")); diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh index 06ecbb3f4a..9a0661983c 100755 --- a/t/t5318-commit-graph.sh +++ b/t/t5318-commit-graph.sh @@ -245,6 +245,20 @@ test_expect_success 'perform fast-forward merge in full repo' ' test_cmp expect output ' +test_expect_success 'check that gc computes commit-graph' ' + cd "$TRASH_DIRECTORY/full" && + git commit --allow-empty -m "
[PATCH v6 15/21] commit-graph: test for corrupted octopus edge
The commit-graph file has an extra chunk to store the parent int-ids for parents beyond the first parent for octopus merges. Our test repo has a single octopus merge that we can manipulate to demonstrate the 'verify' subcommand detects incorrect values in that chunk. Signed-off-by: Derrick Stolee --- t/t5318-commit-graph.sh | 10 ++ 1 file changed, 10 insertions(+) diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh index b7b4410e75..cbd6462226 100755 --- a/t/t5318-commit-graph.sh +++ b/t/t5318-commit-graph.sh @@ -248,6 +248,7 @@ test_expect_success 'git commit-graph verify' ' ' NUM_COMMITS=9 +NUM_OCTOPUS_EDGES=2 HASH_LEN=20 GRAPH_BYTE_VERSION=4 GRAPH_BYTE_HASH=5 @@ -274,6 +275,10 @@ GRAPH_BYTE_COMMIT_EXTRA_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 4)) GRAPH_BYTE_COMMIT_WRONG_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 3)) GRAPH_BYTE_COMMIT_GENERATION=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 11)) GRAPH_BYTE_COMMIT_DATE=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 12)) +GRAPH_COMMIT_DATA_WIDTH=$(($HASH_LEN + 16)) +GRAPH_OCTOPUS_DATA_OFFSET=$(($GRAPH_COMMIT_DATA_OFFSET + \ +$GRAPH_COMMIT_DATA_WIDTH * $NUM_COMMITS)) +GRAPH_BYTE_OCTOPUS=$(($GRAPH_OCTOPUS_DATA_OFFSET + 4)) # usage: corrupt_graph_and_verify # Manipulates the commit-graph file at the position @@ -383,4 +388,9 @@ test_expect_success 'detect incorrect commit date' ' "commit date" ' +test_expect_success 'detect incorrect parent for octopus merge' ' + corrupt_graph_and_verify $GRAPH_BYTE_OCTOPUS "\01" \ + "invalid parent" +' + test_done -- 2.18.0.rc1
[PATCH v6 02/21] commit-graph: fix GRAPH_MIN_SIZE
The GRAPH_MIN_SIZE macro should be the smallest size of a parsable commit-graph file. However, the minimum number of chunks was wrong. It is possible to write a commit-graph file with zero commits, and that violates this macro's value. Rewrite the macro, and use extra macros to better explain the magic constants. Signed-off-by: Derrick Stolee --- commit-graph.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/commit-graph.c b/commit-graph.c index b63a1fc85e..f83f6d2373 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -35,10 +35,11 @@ #define GRAPH_LAST_EDGE 0x8000 +#define GRAPH_HEADER_SIZE 8 #define GRAPH_FANOUT_SIZE (4 * 256) #define GRAPH_CHUNKLOOKUP_WIDTH 12 -#define GRAPH_MIN_SIZE (5 * GRAPH_CHUNKLOOKUP_WIDTH + GRAPH_FANOUT_SIZE + \ - GRAPH_OID_LEN + 8) +#define GRAPH_MIN_SIZE (GRAPH_HEADER_SIZE + 4 * GRAPH_CHUNKLOOKUP_WIDTH \ + + GRAPH_FANOUT_SIZE + GRAPH_OID_LEN) char *get_commit_graph_filename(const char *obj_dir) { -- 2.18.0.rc1
[PATCH v6 00/21] Integrate commit-graph into 'fsck' and 'gc'
+ 10)) -GRAPH_COMMIT_DATA_OFFSET=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* $NUM_COMMITS)) + $GRAPH_CHUNK_LOOKUP_WIDTH * $GRAPH_CHUNK_LOOKUP_ROWS)) +GRAPH_BYTE_FANOUT1=$(($GRAPH_FANOUT_OFFSET + 4 * 4)) +GRAPH_BYTE_FANOUT2=$(($GRAPH_FANOUT_OFFSET + 4 * 255)) +GRAPH_OID_LOOKUP_OFFSET=$(($GRAPH_FANOUT_OFFSET + 4 * 256)) +GRAPH_BYTE_OID_LOOKUP_ORDER=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN * 8)) +GRAPH_BYTE_OID_LOOKUP_MISSING=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN * 4 + 10)) +GRAPH_COMMIT_DATA_OFFSET=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN * $NUM_COMMITS)) GRAPH_BYTE_COMMIT_TREE=$GRAPH_COMMIT_DATA_OFFSET GRAPH_BYTE_COMMIT_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN)) GRAPH_BYTE_COMMIT_EXTRA_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 4)) @@ -301,9 +301,9 @@ GRAPH_BYTE_COMMIT_GENERATION=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 11)) GRAPH_BYTE_COMMIT_DATE=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 12)) GRAPH_COMMIT_DATA_WIDTH=$(($HASH_LEN + 16)) GRAPH_OCTOPUS_DATA_OFFSET=$(($GRAPH_COMMIT_DATA_OFFSET + \ -$GRAPH_COMMIT_DATA_WIDTH \* $NUM_COMMITS)) +$GRAPH_COMMIT_DATA_WIDTH * $NUM_COMMITS)) GRAPH_BYTE_OCTOPUS=$(($GRAPH_OCTOPUS_DATA_OFFSET + 4)) -GRAPH_BYTE_FOOTER=$(($GRAPH_OCTOPUS_DATA_OFFSET + 4 \* $NUM_OCTOPUS_EDGES)) +GRAPH_BYTE_FOOTER=$(($GRAPH_OCTOPUS_DATA_OFFSET + 4 * $NUM_OCTOPUS_EDGES)) # usage: corrupt_graph_and_verify # Manipulates the commit-graph file at the position --- Derrick Stolee (21): commit-graph: UNLEAK before die() commit-graph: fix GRAPH_MIN_SIZE commit-graph: parse commit from chosen graph commit: force commit to parse from object database commit-graph: load a root tree from specific graph commit-graph: add 'verify' subcommand commit-graph: verify catches corrupt signature commit-graph: verify required chunks are present commit-graph: verify corrupt OID fanout and lookup commit-graph: verify objects exist commit-graph: verify root tree OIDs commit-graph: verify parent list commit-graph: verify generation number commit-graph: verify commit date commit-graph: test for corrupted octopus edge commit-graph: verify contents match checksum fsck: verify commit-graph commit-graph: use string-list API for input commit-graph: add '--reachable' option gc: automatically write commit-graph files commit-graph: update design document Documentation/config.txt | 10 +- Documentation/git-commit-graph.txt | 14 +- Documentation/git-fsck.txt | 3 + Documentation/git-gc.txt | 4 + Documentation/technical/commit-graph.txt | 22 -- builtin/commit-graph.c | 99 ++--- builtin/fsck.c | 21 ++ builtin/gc.c | 6 + commit-graph.c | 249 +-- commit-graph.h | 11 +- commit.c | 9 +- commit.h | 1 + t/t5318-commit-graph.sh | 201 ++ 13 files changed, 572 insertions(+), 78 deletions(-) -- 2.18.0.rc1
[PATCH v6 05/21] commit-graph: load a root tree from specific graph
When lazy-loading a tree for a commit, it will be important to select the tree from a specific struct commit_graph. Create a new method that specifies the commit-graph file and use that in get_commit_tree_in_graph(). Signed-off-by: Derrick Stolee --- commit-graph.c | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/commit-graph.c b/commit-graph.c index e77b19971d..9e228d3bb5 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -362,14 +362,20 @@ static struct tree *load_tree_for_commit(struct commit_graph *g, struct commit * return c->maybe_tree; } -struct tree *get_commit_tree_in_graph(const struct commit *c) +static struct tree *get_commit_tree_in_graph_one(struct commit_graph *g, +const struct commit *c) { if (c->maybe_tree) return c->maybe_tree; if (c->graph_pos == COMMIT_NOT_FROM_GRAPH) - BUG("get_commit_tree_in_graph called from non-commit-graph commit"); + BUG("get_commit_tree_in_graph_one called from non-commit-graph commit"); + + return load_tree_for_commit(g, (struct commit *)c); +} - return load_tree_for_commit(commit_graph, (struct commit *)c); +struct tree *get_commit_tree_in_graph(const struct commit *c) +{ + return get_commit_tree_in_graph_one(commit_graph, c); } static void write_graph_chunk_fanout(struct hashfile *f, -- 2.18.0.rc1
Re: [PATCH v5 18/21] commit-graph: use string-list API for input
On 6/6/2018 8:45 AM, Derrick Stolee wrote: On 6/6/2018 8:26 AM, Ævar Arnfjörð Bjarmason wrote: On Wed, Jun 06 2018, Derrick Stolee wrote: On 6/6/2018 8:11 AM, Ævar Arnfjörð Bjarmason wrote: On Wed, Jun 06 2018, Derrick Stolee wrote: Signed-off-by: Derrick Stolee + string_list_clear(, 0); return 0; } This results in an invalid free() & segfault because you're freeing which may not have been allocated by string_list_init(). Good point. Did my tests not catch this? (seems it requires calling `git commit-graph write` with no `--stdin-packs` or `--stdin-commits`). Most of your tests (t5318-commit-graph.sh) segfaulted, but presumably you're on a more forgiving compiler/platform/options. I compiled with -O0 -g on clang 4.0.1-8 + Debian testing. I appreciate the extra platform testing. I'm using GCC on Ubuntu (gcc (Ubuntu 7.3.0-16ubuntu3) 7.3.0). While the errors didn't repro on my box, they did on the VSTS Linux build machines [1]. I created an internal PR just so I could run those tests (and on our OSX agents which use clang) and so I can verify I fixed the situation. I'll send a v6 soon so Junio only picks up a version that succeeds in tests. Thanks, -Stolee [1] https://github.com/Microsoft/vsts-agent-docker/blob/master/ubuntu/16.04/standard/Dockerfile What's installed on a VSTS Linux build agent
Re: [PATCH 00/23] Multi-pack-index (MIDX)
On 6/7/2018 10:45 AM, Ævar Arnfjörð Bjarmason wrote: On Thu, Jun 07 2018, Derrick Stolee wrote: To test the performance in this situation, I created a script that organizes the Linux repository in a similar fashion. I split the commit history into 50 parts by creating branches on every 10,000 commits of the first- parent history. Then, `git rev-list --objects A ^B` provides the list of objects reachable from A but not B, so I could send that to `git pack-objects` to create these "time-based" packfiles. With these 50 packfiles (deleting the old one from my fresh clone, and deleting all tags as they were no longer on-disk) I could then test 'git rev-list --objects HEAD^{tree}' and see: Before: 0.17s After: 0.13s % Diff: -23.5% By adding logic to count hits and misses to bsearch_pack, I was able to see that the command above calls that method 266,930 times with a hit rate of 33%. The MIDX has the same number of calls with a 100% hit rate. Do you have the script you used for this? It would be very interesting as something we could stick in t/perf/ to test this use-case in the future. How does this & the numbers below compare to just a naïve --max-pack-size= on linux.git? Is it possible for you to tar this test repo up and share it as a one-off? I've been polishing the core.validateAbbrev series I have, and it would be interesting to compare some of the (abbrev) numbers. Here is what I used. You will want to adjust your constants for whatever repo you are using. This is for the Linux kernel which has a first-parent history of ~50,000 commits. It also leaves a bunch of extra files around, so it is nowhere near incorporating into the code. #!/bin/bash for i in `seq 1 50` do ORDER=$((51 - $i)) NUM_BACK=$((1000 * ($i - 1))) echo creating batch/$ORDER git branch -f batch/$ORDER HEAD~$NUM_BACK echo batch/$ORDER git rev-parse batch/$ORDER done lastbranch="" for i in `seq 1 50` do branch=batch/$i if [$lastbranch -eq ""] then echo "$branch" git rev-list --objects $branch | sed 's/ .*//' >objects-$i.txt else echo "$lastbranch" echo "$branch" git rev-list --objects $branch ^$lastbranch | sed 's/ .*//' >objects-$i.txt fi git pack-objects --no-reuse-delta .git/objects/pack/branch-split2 lastbranch=$branch done for tag in `git tag --list` do git tag -d $tag done rm -rf .git/objects/pack/pack-* git midx write
Re: [PATCH 03/23] midx: add midx builtin
On 6/7/2018 1:20 PM, Duy Nguyen wrote: On Thu, Jun 7, 2018 at 4:03 PM, Derrick Stolee wrote: diff --git a/Documentation/git-midx.txt b/Documentation/git-midx.txt new file mode 100644 index 00..2bd886f1a2 --- /dev/null +++ b/Documentation/git-midx.txt @@ -0,0 +1,29 @@ +git-midx(1) + + +NAME + +git-midx - Write and verify multi-pack-indexes (MIDX files). No full stop. This head line is collected automatically with others and its having a full stop while the rest does not looks strange/ diff --git a/builtin/midx.c b/builtin/midx.c new file mode 100644 index 00..59ea92178f --- /dev/null +++ b/builtin/midx.c @@ -0,0 +1,38 @@ +#include "builtin.h" +#include "cache.h" +#include "config.h" +#include "git-compat-util.h" You only need either cache.h or git-compat-util.h. If cache.h is here, git-compat-util can be removed. +#include "parse-options.h" + +static char const * const builtin_midx_usage[] ={ + N_("git midx [--object-dir ]"), + NULL +}; + +static struct opts_midx { + const char *object_dir; +} opts; + +int cmd_midx(int argc, const char **argv, const char *prefix) +{ + static struct option builtin_midx_options[] = { + { OPTION_STRING, 0, "object-dir", _dir, For paths (including dir), OPTION_FILENAME may be a better option to handle correctly when the command is run in a subdir. See df217ed643 (parse-opts: add OPT_FILENAME and transition builtins - 2009-05-23) for more info. Thanks for the pointer! + N_("dir"), + N_("The object directory containing set of packfile and pack-index pairs.") }, Other help strings do not have full stop either (I only checked a couple commands though) Also, doesn't OPT_STRING() work here too (if you avoid OPTION_FILENAME for some reason)? + OPT_END(), + }; + + if (argc == 2 && !strcmp(argv[1], "-h")) + usage_with_options(builtin_midx_usage, builtin_midx_options); + + git_config(git_default_config, NULL); + + argc = parse_options(argc, argv, prefix, +builtin_midx_options, +builtin_midx_usage, 0); + + if (!opts.object_dir) + opts.object_dir = get_object_directory(); + + return 0; +} diff --git a/git.c b/git.c index c2f48d53dd..400fadd677 100644 --- a/git.c +++ b/git.c @@ -503,6 +503,7 @@ static struct cmd_struct commands[] = { { "merge-recursive-theirs", cmd_merge_recursive, RUN_SETUP | NEED_WORK_TREE | NO_PARSEOPT }, { "merge-subtree", cmd_merge_recursive, RUN_SETUP | NEED_WORK_TREE | NO_PARSEOPT }, { "merge-tree", cmd_merge_tree, RUN_SETUP | NO_PARSEOPT }, + { "midx", cmd_midx, RUN_SETUP }, If it's a plumbing and can take an --object-dir, then I don't think you should require it to run in a repo (with RUN_SETUP). RUN_SETUP_GENTLY may be better. You could even leave it empty here and only call setup_git_directory() only when --object-dir is not set. I agree. Good point. This could be run to maintain an alternate without any .git folder. { "mktag", cmd_mktag, RUN_SETUP | NO_PARSEOPT }, { "mktree", cmd_mktree, RUN_SETUP }, { "mv", cmd_mv, RUN_SETUP | NEED_WORK_TREE },
Re: [PATCH 03/23] midx: add midx builtin
On 6/11/2018 5:02 PM, Stefan Beller wrote: Hi Derrick, On Thu, Jun 7, 2018 at 7:03 AM Derrick Stolee wrote: This new 'git midx' builtin will be the plumbing access for writing, reading, and checking multi-pack-index (MIDX) files. The initial implementation is a no-op. Let's talk about the name for a second: .idx files are written by git-index-pack or as part of git-pack-objects (which just calls write_idx_file as part of finish_tmp_packfile), and the name actually suggests it writes the index files. I have a hard time understanding what the git-midx command does[1]. With both commit graph as well as multi index we introduce a command that is centered around that concept (similar to git-remote or git-config that are centered around a concept, that is closely resembled by a file), but for indexes for packs it was integrated differently into Git. So I am not sure if I want to suggest to integrate it into the packfile commands as that doesn't really fit. But maybe we can have a name that is human readable instead of the file suffix? Maybe git multi-pack-index ? I suppose that eventually this command is not really used by users as it will be used by other porcelain commands in the background or even as part of repack/gc so I am not worried about a long name, but I'd be more worried about understandability. I'll use "git multi-pack-index" in v2. I'll keep "midx.c" in the root, though, if that is OK. [1] While these names are not perfect for the layman, it is okay? I am sure you are aware of https://git-man-page-generator.lokaltog.net/ I was not, and enjoyed that quite a bit. Thanks, -Stolee new file mode 100644 index 00..2bd886f1a2 --- /dev/null +++ b/Documentation/git-midx.txt @@ -0,0 +1,29 @@ +git-midx(1) + + +NAME + +git-midx - Write and verify multi-pack-indexes (MIDX files). The reading is done as part of all other commands. I like to think the 'read' verb is a subset of "verify" because we are checking for information about the MIDX, and mostly for tests or debugging. + + +SYNOPSIS + +[verse] +'git midx' [--object-dir ] + +DESCRIPTION +--- +Write or verify a MIDX file. + +OPTIONS +--- + +--object-dir :: + Use given directory for the location of Git objects. We check + /packs/multi-pack-index for the current MIDX file, and + /packs for the pack-files to index. + + Maybe we could have a SEE ALSO section that points at the explanation of multi index files? (c.f. man git-submodule that has a SEE ALSO gitsubmodules(7), gitmodules(5) explaining concepts(7) and the file(5)) But as this is plumbing and users should not need to worry about it this is optional, I would think. The design document is also in 'Documentation/technical' instead of just 'Documentation/'. Do we have a pattern of linking to the technical documents? Thanks, -Stolee
Re: [PATCH 03/23] midx: add midx builtin
On 6/18/2018 3:55 PM, Stefan Beller wrote: But as this is plumbing and users should not need to worry about it this is optional, I would think. The design document is also in 'Documentation/technical' instead of just 'Documentation/'. Do we have a pattern of linking to the technical documents? Apparently we do (and I was not aware of it): $ git -C Documentation/ grep link:technical git-credential.txt:23:link:technical/api-credentials.html[the Git credential API] for more git.txt:839:link:technical/api-index.html[Git API documentation]. gitcredentials.txt:184:link:technical/api-credentials.html[credentials API] for details. technical/http-protocol.txt:517:link:technical/pack-protocol.html technical/http-protocol.txt:518:link:technical/protocol-capabilities.html user-manual.txt:3220:found in link:technical/pack-format.html[pack format]. Thanks! I'll add some links.
Re: [PATCH 01/23] midx: add design document
On 6/11/2018 3:04 PM, Stefan Beller wrote: On Thu, Jun 7, 2018 at 7:03 AM Derrick Stolee wrote: Signed-off-by: Derrick Stolee --- Documentation/technical/midx.txt | 109 +++ 1 file changed, 109 insertions(+) create mode 100644 Documentation/technical/midx.txt diff --git a/Documentation/technical/midx.txt b/Documentation/technical/midx.txt new file mode 100644 index 00..789f410d71 --- /dev/null +++ b/Documentation/technical/midx.txt @@ -0,0 +1,109 @@ +Multi-Pack-Index (MIDX) Design Notes + + +The Git object directory contains a 'pack' directory containing +packfiles (with suffix ".pack") and pack-indexes (with suffix +".idx"). The pack-indexes provide a way to lookup objects and +navigate to their offset within the pack, but these must come +in pairs with the packfiles. This pairing depends on the file +names, as the pack-index differs only in suffix with its pack- +file. While the pack-indexes provide fast lookup per packfile, +this performance degrades as the number of packfiles increases, +because abbreviations need to inspect every packfile and we are +more likely to have a miss on our most-recently-used packfile. +For some large repositories, repacking into a single packfile +is not feasible due to storage space or excessive repack times. This leads to the question how MIDX will cope with large repos or a large number of packs. As it is just an index and not a pack itself, I guess it is smaller by some orders of magnitude, such that it is ok for now. The MIDX file is only slightly larger than the union of the IDX files for those packfiles. +The multi-pack-index (MIDX for short) stores a list of objects +and their offsets into multiple packfiles. It contains: + +- A list of packfile names. +- A sorted list of object IDs. +- A list of metadata for the ith object ID including: + - A value j referring to the jth packfile. + - An offset within the jth packfile for the object. +- If large offsets are required, we use another list of large + offsets similar to version 2 pack-indexes. + +Thus, we can provide O(log N) lookup time for any number +of packfiles. This sounds great for the lookup case! Though that is for the repo-read case. Let's read on how the dynamics of a repository are dealt with, e.g. integrating new packs into the MIDX, or how we deal with objects in multiple packs. + +Design Details +-- + +- The MIDX is stored in a file named 'multi-pack-index' in the + .git/objects/pack directory. This could be stored in the pack + directory of an alternate. It refers only to packfiles in that + same directory. So there is one and only one multi pack index? That makes the case of preparing the next MIDX that contains more pack references more interesting, as then we have to atomically update that file. There is only one, but we can make the file incremental without changing this name similar to how the split index works. +- The core.midx config setting must be on to consume MIDX files. Looking through current config options, I would rename this to a more suggestive name. I searched for the core.idx counterpart that enables idx files -- it turns out that is named pack.indexVersion. So maybe pack.MultiIndex ? That could start out as a boolean as in this series and then evolve into a version number or such later. I'll use that name and rename this file to Documentation/technical/multi-pack-index.txt +- The file format includes parameters for the object ID hash + function, so a future change of hash algorithm does not require + a change in format. + +- The MIDX keeps only one record per object ID. If an object appears + in multiple packfiles, then the MIDX selects the copy in the most- + recently modified packfile. Okay. That answers the question from above. Though this is just the tie breaking decision and not a hard limitation? (i.e. we could change this this later to that pack that has e.g. shortest delta chain for that object or such) This is a soft requirement. It is an easy thing to track at the moment. We can compute the MIDX without opening a packfile, for instance. +- If there exist packfiles in the pack directory not registered in + the MIDX, then those packfiles are loaded into the `packed_git` + list and `packed_git_mru` cache. Not sure I understand the implications of this? Does that mean we first look at the multi index and if an object is not found, we'll search linearly through all packs that are not part of the MIDX? That would require the MIDX to be kepot up to date reasonably to be useful. If you add a packfile to the pack directory, you can immediately start consuming it. You do not need to wait for the MIDX to be updated. The more asynchronous these auxiliary data structures (MIDX, commit-graph) can be, the better. This is in direct contrast to the reachability bitmap which is useless without its corresponding packfi
Re: [PATCH 02/23] midx: add midx format details to pack-format.txt
On 6/11/2018 3:19 PM, Stefan Beller wrote: Hi Derrick, On Thu, Jun 7, 2018 at 7:03 AM Derrick Stolee wrote: The multi-pack-index (MIDX) feature generalizes the existing pack- index (IDX) feature by indexing objects across multiple pack-files. Describe the basic file format, using a 12-byte header followed by a lookup table for a list of "chunks" which will be described later. The file ends with a footer containing a checksum using the hash algorithm. The header allows later versions to create breaking changes by advancing the version number. We can also change the hash algorithm using a different version value. We will add the individual chunk format information as we introduce the code that writes that information. Signed-off-by: Derrick Stolee --- Documentation/technical/pack-format.txt | 49 + 1 file changed, 49 insertions(+) diff --git a/Documentation/technical/pack-format.txt b/Documentation/technical/pack-format.txt index 70a99fd142..17666b4bfc 100644 --- a/Documentation/technical/pack-format.txt +++ b/Documentation/technical/pack-format.txt @@ -252,3 +252,52 @@ Pack file entry: <+ corresponding packfile. 20-byte SHA-1-checksum of all of the above. + +== midx-*.midx files have the following format: + +The meta-index files refer to multiple pack-files and loose objects. So is it meta or multi? Good catch. We were calling this the meta-index internally before changing to "multi-pack-index" (helps to not change the acronym). +In order to allow extensions that add extra data to the MIDX, we organize +the body into "chunks" and provide a lookup table at the beginning of the +body. The header includes certain length values, such as the number of packs, +the number of base MIDX files, hash lengths and types. + +All 4-byte numbers are in network order. + +HEADER: + + 4-byte signature: + The signature is: {'M', 'I', 'D', 'X'} + + 1-byte version number: + Git only writes or recognizes version 1 + + 1-byte Object Id Version + Git only writes or recognizes verion 1 (SHA-1) s/verion/version/ + 1-byte number (C) of "chunks" + + 1-byte number (I) of base multi-pack-index files: + This value is currently always zero. Oh? Are meta-index and multi-index files different things? Not intended to be different things, but this number is related to making the feature incremental. + 4-byte number (P) of pack files + +CHUNK LOOKUP: + + (C + 1) * 12 bytes providing the chunk offsets: + First 4 bytes describe chunk id. Value 0 is a terminating label. + Other 8 bytes provide offset in current file for chunk to start. + (Chunks are provided in file-order, so you can infer the length + using the next chunk position if necessary.) It is so nice to have the header also have 12 bytes, so it fits right into the lookup table. So an alternative point of view: If a chunk needs to store more than 8 bytes, we'll have an offset after the first 4 bytes that describe the chunk, otherwise you can store the 8 bytes of information directly after the 4 bytes. "MIDX" is a special chunk and must come first (does it?) and only once as it contains the version number. This sounds feasible, but unnecessarily complicated. I don't think any other chunk will be this small. + The remaining data in the body is described one chunk at a time, and + these chunks may be given in any order. Chunks are required unless + otherwise specified. + +CHUNK DATA: + + (This section intentionally left incomplete.) + +TRAILER: + + H-byte HASH-checksum of all of the above. This means we have to rehash the whole file for updating its contents. okay.
Re: [PATCH v2 05/31] tree: add repository argument to lookup_tree
On 6/13/2018 7:04 PM, Stefan Beller wrote: diff --git a/commit-graph.c b/commit-graph.c index 71125d7cbb6..88a4b0d2a47 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -261,7 +261,7 @@ static int fill_commit_in_graph(struct commit *item, struct commit_graph *g, uin item->graph_pos = pos; hashcpy(oid.hash, commit_data); - item->tree = lookup_tree(); + item->tree = lookup_tree(the_repository, ); date_high = get_be32(commit_data + g->hash_len + 8) & 0x3; date_low = get_be32(commit_data + g->hash_len + 12); diff --git a/commit.c b/commit.c index d76d64d4dfc..755b8b9d94f 100644 --- a/commit.c +++ b/commit.c @@ -354,7 +354,7 @@ int parse_commit_buffer(struct commit *item, const void *buffer, unsigned long s if (get_sha1_hex(bufptr + 5, parent.hash) < 0) return error("bad tree pointer in commit %s", oid_to_hex(>object.oid)); - item->tree = lookup_tree(); + item->tree = lookup_tree(the_repository, ); bufptr += tree_entry_len + 1; /* "tree " + "hex sha1" + "\n" */ pptr = >parents; diff --git a/fsck.c b/fsck.c index 2d372f2a3f3..b07abb9796c 100644 I'm a bit late here, but you don't have ds/lazy-load-trees merged in your history, so this will conflict with 'master'. I caught this as I was trying to merge ds/generation-numbers with your branch. The 'tree' member of 'struct commit' was renamed to 'maybe_tree'. Thanks, -Stolee
Re: [PATCH 4/3] ewah: adjust callers of ewah_read_mmap()
On 6/14/2018 11:44 PM, Jeff King wrote: The return value of ewah_read_mmap() is now an ssize_t, since we could (in theory) process up to 32GB of data. This would never happen in practice, but a corrupt or malicious .bitmap or index file could convince us to do so. Aside: Why a 32GB limit? Usually I see 32-bit issues limiting by 2 (signed) or 4 GB (unsigned). Is there something specifically in the bitmap format that stops at this size? Thanks, -Stolee
Re: [PATCH 3/3] ewah: drop ewah_serialize_native function
On 6/15/2018 9:56 AM, Ramsay Jones wrote: On 15/06/18 04:32, Jeff King wrote: We don't call this function, and never have. The on-disk bitmap format uses network-byte-order integers, meaning that we cannot use the native-byte-order format written here. Let's drop it in the name of simplicity. Hmm, if you are in the mood to drop ewah dead code, how about: ewah/bitmap.o - bitmap_clear ewah/bitmap.o - bitmap_each_bit ewah/ewah_bitmap.o - ewah_and ewah/ewah_bitmap.o - ewah_and_not ewah/ewah_bitmap.o - ewah_not ewah/ewah_bitmap.o - ewah_or ... in addition to these *(de)serialize* functions. ;-) Normally, I would say we should keep this folder as close to the "original" [1] as possible, so we could more easily take improvements to that library. However, it appears that code is not being updated. Perhaps this is in favor of other EWAH libraries [2] or other compressed bitmap formats [3]. For that reason, I agree we should clean this up. We shouldn't block Peff's patch for that reason. I'll send a patch that deletes these methods. Thanks, -Stolee: [1] https://github.com/vmg/libewok [2] https://github.com/lemire/EWAHBoolArray [3] https://github.com/RoaringBitmap/CRoaring
[PATCH 6/8] ewah_bitmap: delete unused 'ewah_or()'
Reported-by: Ramsay Jones Signed-off-by: Derrick Stolee --- ewah/ewah_bitmap.c | 69 -- ewah/ewok.h| 5 2 files changed, 74 deletions(-) diff --git a/ewah/ewah_bitmap.c b/ewah/ewah_bitmap.c index 3fa3edf2a3..017c677f98 100644 --- a/ewah/ewah_bitmap.c +++ b/ewah/ewah_bitmap.c @@ -440,75 +440,6 @@ void ewah_xor( out->bit_size = max_size(ewah_i->bit_size, ewah_j->bit_size); } -void ewah_or( - struct ewah_bitmap *ewah_i, - struct ewah_bitmap *ewah_j, - struct ewah_bitmap *out) -{ - struct rlw_iterator rlw_i; - struct rlw_iterator rlw_j; - size_t literals; - - rlwit_init(_i, ewah_i); - rlwit_init(_j, ewah_j); - - while (rlwit_word_size(_i) > 0 && rlwit_word_size(_j) > 0) { - while (rlw_i.rlw.running_len > 0 || rlw_j.rlw.running_len > 0) { - struct rlw_iterator *prey, *predator; - - if (rlw_i.rlw.running_len < rlw_j.rlw.running_len) { - prey = _i; - predator = _j; - } else { - prey = _j; - predator = _i; - } - - if (predator->rlw.running_bit) { - ewah_add_empty_words(out, 0, - predator->rlw.running_len); - rlwit_discard_first_words(prey, - predator->rlw.running_len); - rlwit_discard_first_words(predator, - predator->rlw.running_len); - } else { - size_t index = rlwit_discharge(prey, out, - predator->rlw.running_len, 0); - ewah_add_empty_words(out, 0, - predator->rlw.running_len - index); - rlwit_discard_first_words(predator, - predator->rlw.running_len); - } - } - - literals = min_size( - rlw_i.rlw.literal_words, - rlw_j.rlw.literal_words); - - if (literals) { - size_t k; - - for (k = 0; k < literals; ++k) { - ewah_add(out, - rlw_i.buffer[rlw_i.literal_word_start + k] | - rlw_j.buffer[rlw_j.literal_word_start + k] - ); - } - - rlwit_discard_first_words(_i, literals); - rlwit_discard_first_words(_j, literals); - } - } - - if (rlwit_word_size(_i) > 0) - rlwit_discharge(_i, out, ~0, 0); - else - rlwit_discharge(_j, out, ~0, 0); - - out->bit_size = max_size(ewah_i->bit_size, ewah_j->bit_size); -} - - #define BITMAP_POOL_MAX 16 static struct ewah_bitmap *bitmap_pool[BITMAP_POOL_MAX]; static size_t bitmap_pool_size; diff --git a/ewah/ewok.h b/ewah/ewok.h index 4b665d09d9..cd826a0136 100644 --- a/ewah/ewok.h +++ b/ewah/ewok.h @@ -157,11 +157,6 @@ void ewah_iterator_init(struct ewah_iterator *it, struct ewah_bitmap *parent); */ int ewah_iterator_next(eword_t *next, struct ewah_iterator *it); -void ewah_or( - struct ewah_bitmap *ewah_i, - struct ewah_bitmap *ewah_j, - struct ewah_bitmap *out); - void ewah_xor( struct ewah_bitmap *ewah_i, struct ewah_bitmap *ewah_j, -- 2.18.0.rc1
[PATCH 3/8] ewah_bitmap: delete unused 'ewah_and()'
Reported-by: Ramsay Jones Signed-off-by: Derrick Stolee --- ewah/ewah_bitmap.c | 68 -- ewah/ewok.h| 5 2 files changed, 73 deletions(-) diff --git a/ewah/ewah_bitmap.c b/ewah/ewah_bitmap.c index b9fdda1d3d..5a4d3a6eb6 100644 --- a/ewah/ewah_bitmap.c +++ b/ewah/ewah_bitmap.c @@ -459,74 +459,6 @@ void ewah_xor( out->bit_size = max_size(ewah_i->bit_size, ewah_j->bit_size); } -void ewah_and( - struct ewah_bitmap *ewah_i, - struct ewah_bitmap *ewah_j, - struct ewah_bitmap *out) -{ - struct rlw_iterator rlw_i; - struct rlw_iterator rlw_j; - size_t literals; - - rlwit_init(_i, ewah_i); - rlwit_init(_j, ewah_j); - - while (rlwit_word_size(_i) > 0 && rlwit_word_size(_j) > 0) { - while (rlw_i.rlw.running_len > 0 || rlw_j.rlw.running_len > 0) { - struct rlw_iterator *prey, *predator; - - if (rlw_i.rlw.running_len < rlw_j.rlw.running_len) { - prey = _i; - predator = _j; - } else { - prey = _j; - predator = _i; - } - - if (predator->rlw.running_bit == 0) { - ewah_add_empty_words(out, 0, - predator->rlw.running_len); - rlwit_discard_first_words(prey, - predator->rlw.running_len); - rlwit_discard_first_words(predator, - predator->rlw.running_len); - } else { - size_t index = rlwit_discharge(prey, out, - predator->rlw.running_len, 0); - ewah_add_empty_words(out, 0, - predator->rlw.running_len - index); - rlwit_discard_first_words(predator, - predator->rlw.running_len); - } - } - - literals = min_size( - rlw_i.rlw.literal_words, - rlw_j.rlw.literal_words); - - if (literals) { - size_t k; - - for (k = 0; k < literals; ++k) { - ewah_add(out, - rlw_i.buffer[rlw_i.literal_word_start + k] & - rlw_j.buffer[rlw_j.literal_word_start + k] - ); - } - - rlwit_discard_first_words(_i, literals); - rlwit_discard_first_words(_j, literals); - } - } - - if (rlwit_word_size(_i) > 0) - rlwit_discharge_empty(_i, out); - else - rlwit_discharge_empty(_j, out); - - out->bit_size = max_size(ewah_i->bit_size, ewah_j->bit_size); -} - void ewah_and_not( struct ewah_bitmap *ewah_i, struct ewah_bitmap *ewah_j, diff --git a/ewah/ewok.h b/ewah/ewok.h index dc43d05b64..5841c83507 100644 --- a/ewah/ewok.h +++ b/ewah/ewok.h @@ -179,11 +179,6 @@ void ewah_xor( struct ewah_bitmap *ewah_j, struct ewah_bitmap *out); -void ewah_and( - struct ewah_bitmap *ewah_i, - struct ewah_bitmap *ewah_j, - struct ewah_bitmap *out); - /** * Direct word access */ -- 2.18.0.rc1
[PATCH 8/8] ewah_io: delete unused 'ewah_serialize_native()'
Signed-off-by: Derrick Stolee --- ewah/ewah_io.c | 26 -- ewah/ewok.h| 1 - 2 files changed, 27 deletions(-) diff --git a/ewah/ewah_io.c b/ewah/ewah_io.c index 86d3f1d02e..dde736bcd9 100644 --- a/ewah/ewah_io.c +++ b/ewah/ewah_io.c @@ -20,32 +20,6 @@ #include "ewok.h" #include "strbuf.h" -int ewah_serialize_native(struct ewah_bitmap *self, int fd) -{ - uint32_t write32; - size_t to_write = self->buffer_size * 8; - - /* 32 bit -- bit size for the map */ - write32 = (uint32_t)self->bit_size; - if (write(fd, , 4) != 4) - return -1; - - /** 32 bit -- number of compressed 64-bit words */ - write32 = (uint32_t)self->buffer_size; - if (write(fd, , 4) != 4) - return -1; - - if (write(fd, self->buffer, to_write) != to_write) - return -1; - - /** 32 bit -- position for the RLW */ - write32 = self->rlw - self->buffer; - if (write(fd, , 4) != 4) - return -1; - - return (3 * 4) + to_write; -} - int ewah_serialize_to(struct ewah_bitmap *self, int (*write_fun)(void *, const void *, size_t), void *data) diff --git a/ewah/ewok.h b/ewah/ewok.h index 1055706e44..8a2b334278 100644 --- a/ewah/ewok.h +++ b/ewah/ewok.h @@ -86,7 +86,6 @@ void ewah_free(struct ewah_bitmap *self); int ewah_serialize_to(struct ewah_bitmap *self, int (*write_fun)(void *out, const void *buf, size_t len), void *out); -int ewah_serialize_native(struct ewah_bitmap *self, int fd); int ewah_serialize_strbuf(struct ewah_bitmap *self, struct strbuf *); int ewah_deserialize(struct ewah_bitmap *self, int fd); -- 2.18.0.rc1
[PATCH 7/8] ewah_io: delete unused 'ewah_serialize()'
Reported-by: Ramsay Jones Signed-off-by: Derrick Stolee --- ewah/ewah_io.c | 10 -- ewah/ewok.h| 1 - 2 files changed, 11 deletions(-) diff --git a/ewah/ewah_io.c b/ewah/ewah_io.c index bed1994551..86d3f1d02e 100644 --- a/ewah/ewah_io.c +++ b/ewah/ewah_io.c @@ -100,16 +100,6 @@ int ewah_serialize_to(struct ewah_bitmap *self, return (3 * 4) + (self->buffer_size * 8); } -static int write_helper(void *fd, const void *buf, size_t len) -{ - return write((intptr_t)fd, buf, len); -} - -int ewah_serialize(struct ewah_bitmap *self, int fd) -{ - return ewah_serialize_to(self, write_helper, (void *)(intptr_t)fd); -} - static int write_strbuf(void *user_data, const void *data, size_t len) { struct strbuf *sb = user_data; diff --git a/ewah/ewok.h b/ewah/ewok.h index cd826a0136..1055706e44 100644 --- a/ewah/ewok.h +++ b/ewah/ewok.h @@ -86,7 +86,6 @@ void ewah_free(struct ewah_bitmap *self); int ewah_serialize_to(struct ewah_bitmap *self, int (*write_fun)(void *out, const void *buf, size_t len), void *out); -int ewah_serialize(struct ewah_bitmap *self, int fd); int ewah_serialize_native(struct ewah_bitmap *self, int fd); int ewah_serialize_strbuf(struct ewah_bitmap *self, struct strbuf *); -- 2.18.0.rc1
[PATCH 5/8] ewah_bitmap: delete unused 'ewah_not()'
Reported-by: Ramsay Jones Signed-off-by: Derrick Stolee --- ewah/ewah_bitmap.c | 19 --- ewah/ewok.h| 7 --- 2 files changed, 26 deletions(-) diff --git a/ewah/ewah_bitmap.c b/ewah/ewah_bitmap.c index 559adde37c..3fa3edf2a3 100644 --- a/ewah/ewah_bitmap.c +++ b/ewah/ewah_bitmap.c @@ -376,25 +376,6 @@ void ewah_iterator_init(struct ewah_iterator *it, struct ewah_bitmap *parent) read_new_rlw(it); } -void ewah_not(struct ewah_bitmap *self) -{ - size_t pointer = 0; - - while (pointer < self->buffer_size) { - eword_t *word = >buffer[pointer]; - size_t literals, k; - - rlw_xor_run_bit(word); - ++pointer; - - literals = rlw_get_literal_words(word); - for (k = 0; k < literals; ++k) { - self->buffer[pointer] = ~self->buffer[pointer]; - ++pointer; - } - } -} - void ewah_xor( struct ewah_bitmap *ewah_i, struct ewah_bitmap *ewah_j, diff --git a/ewah/ewok.h b/ewah/ewok.h index 21c420770e..4b665d09d9 100644 --- a/ewah/ewok.h +++ b/ewah/ewok.h @@ -95,13 +95,6 @@ int ewah_read_mmap(struct ewah_bitmap *self, const void *map, size_t len); uint32_t ewah_checksum(struct ewah_bitmap *self); -/** - * Logical not (bitwise negation) in-place on the bitmap - * - * This operation is linear time based on the size of the bitmap. - */ -void ewah_not(struct ewah_bitmap *self); - /** * Call the given callback with the position of every single bit * that has been set on the bitmap. -- 2.18.0.rc1
[PATCH 2/8] ewah/bitmap.c: delete unused 'bitmap_each_bit()'
Reported-by: Ramsay Jones Signed-off-by: Derrick Stolee --- ewah/bitmap.c | 24 1 file changed, 24 deletions(-) diff --git a/ewah/bitmap.c b/ewah/bitmap.c index d61dc6114a..52f1178db4 100644 --- a/ewah/bitmap.c +++ b/ewah/bitmap.c @@ -129,30 +129,6 @@ void bitmap_or_ewah(struct bitmap *self, struct ewah_bitmap *other) self->words[i++] |= word; } -void bitmap_each_bit(struct bitmap *self, ewah_callback callback, void *data) -{ - size_t pos = 0, i; - - for (i = 0; i < self->word_alloc; ++i) { - eword_t word = self->words[i]; - uint32_t offset; - - if (word == (eword_t)~0) { - for (offset = 0; offset < BITS_IN_EWORD; ++offset) - callback(pos++, data); - } else { - for (offset = 0; offset < BITS_IN_EWORD; ++offset) { - if ((word >> offset) == 0) - break; - - offset += ewah_bit_ctz64(word >> offset); - callback(pos + offset, data); - } - pos += BITS_IN_EWORD; - } - } -} - size_t bitmap_popcount(struct bitmap *self) { size_t i, count = 0; -- 2.18.0.rc1
Re: [PATCH 8/8] ewah_io: delete unused 'ewah_serialize_native()'
On 6/15/2018 11:01 AM, Ramsay Jones wrote: On 15/06/18 15:30, Derrick Stolee wrote: Signed-off-by: Derrick Stolee --- ewah/ewah_io.c | 26 -- ewah/ewok.h| 1 - 2 files changed, 27 deletions(-) This duplicates Jeff's patch #3. Thanks. I got a bit hasty as I was working from 'maint' and didn't double-check. -Stolee
[PATCH 1/8] ewah/bitmap.c: delete unused 'bitmap_clear()'
Reported-by: Ramsay Jones Signed-off-by: Derrick Stolee --- ewah/bitmap.c | 8 1 file changed, 8 deletions(-) diff --git a/ewah/bitmap.c b/ewah/bitmap.c index 756bdd050e..d61dc6114a 100644 --- a/ewah/bitmap.c +++ b/ewah/bitmap.c @@ -45,14 +45,6 @@ void bitmap_set(struct bitmap *self, size_t pos) self->words[block] |= EWAH_MASK(pos); } -void bitmap_clear(struct bitmap *self, size_t pos) -{ - size_t block = EWAH_BLOCK(pos); - - if (block < self->word_alloc) - self->words[block] &= ~EWAH_MASK(pos); -} - int bitmap_get(struct bitmap *self, size_t pos) { size_t block = EWAH_BLOCK(pos); -- 2.18.0.rc1
[PATCH 0/8] Delete unused methods in EWAH bitmap
The EWAH bitmap code includes several logical operations that are important for a general-purpose bitmap library. However, Git only uses the XOR operation for storing deltas between reachability bitmaps. This means that we can delete the following unused methods: * ewah_and() * ewah_and_not() * ewah_not() * ewah_or() * ewah_serialize() * ewah_serialize_native() We can also delete the unused methods bitmap_clear() and bitmap_each_bit(). Derrick Stolee (8): ewah/bitmap.c: delete unused 'bitmap_clear()' ewah/bitmap.c: delete unused 'bitmap_each_bit()' ewah_bitmap: delete unused 'ewah_and()' ewah_bitmap: delete unused 'ewah_and_not()' ewah_bitmap: delete unused 'ewah_not()' ewah_bitmap: delete unused 'ewah_or()' ewah_io: delete unused 'ewah_serialize()' ewah_io: delete unused 'ewah_serialize_native()' ewah/bitmap.c | 32 --- ewah/ewah_bitmap.c | 229 - ewah/ewah_io.c | 36 --- ewah/ewok.h| 24 - 4 files changed, 321 deletions(-) base-commit: fc54c1af3ec09bab8b8ea09768c2da4069b7f53e -- 2.18.0.rc1
[PATCH 4/8] ewah_bitmap: delete unused 'ewah_and_not()'
Reported-by: Ramsay Jones Signed-off-by: Derrick Stolee --- ewah/ewah_bitmap.c | 73 -- ewah/ewok.h| 5 2 files changed, 78 deletions(-) diff --git a/ewah/ewah_bitmap.c b/ewah/ewah_bitmap.c index 5a4d3a6eb6..559adde37c 100644 --- a/ewah/ewah_bitmap.c +++ b/ewah/ewah_bitmap.c @@ -459,79 +459,6 @@ void ewah_xor( out->bit_size = max_size(ewah_i->bit_size, ewah_j->bit_size); } -void ewah_and_not( - struct ewah_bitmap *ewah_i, - struct ewah_bitmap *ewah_j, - struct ewah_bitmap *out) -{ - struct rlw_iterator rlw_i; - struct rlw_iterator rlw_j; - size_t literals; - - rlwit_init(_i, ewah_i); - rlwit_init(_j, ewah_j); - - while (rlwit_word_size(_i) > 0 && rlwit_word_size(_j) > 0) { - while (rlw_i.rlw.running_len > 0 || rlw_j.rlw.running_len > 0) { - struct rlw_iterator *prey, *predator; - - if (rlw_i.rlw.running_len < rlw_j.rlw.running_len) { - prey = _i; - predator = _j; - } else { - prey = _j; - predator = _i; - } - - if ((predator->rlw.running_bit && prey == _i) || - (!predator->rlw.running_bit && prey != _i)) { - ewah_add_empty_words(out, 0, - predator->rlw.running_len); - rlwit_discard_first_words(prey, - predator->rlw.running_len); - rlwit_discard_first_words(predator, - predator->rlw.running_len); - } else { - size_t index; - int negate_words; - - negate_words = (_i != prey); - index = rlwit_discharge(prey, out, - predator->rlw.running_len, negate_words); - ewah_add_empty_words(out, negate_words, - predator->rlw.running_len - index); - rlwit_discard_first_words(predator, - predator->rlw.running_len); - } - } - - literals = min_size( - rlw_i.rlw.literal_words, - rlw_j.rlw.literal_words); - - if (literals) { - size_t k; - - for (k = 0; k < literals; ++k) { - ewah_add(out, - rlw_i.buffer[rlw_i.literal_word_start + k] & - ~(rlw_j.buffer[rlw_j.literal_word_start + k]) - ); - } - - rlwit_discard_first_words(_i, literals); - rlwit_discard_first_words(_j, literals); - } - } - - if (rlwit_word_size(_i) > 0) - rlwit_discharge(_i, out, ~0, 0); - else - rlwit_discharge_empty(_j, out); - - out->bit_size = max_size(ewah_i->bit_size, ewah_j->bit_size); -} - void ewah_or( struct ewah_bitmap *ewah_i, struct ewah_bitmap *ewah_j, diff --git a/ewah/ewok.h b/ewah/ewok.h index 5841c83507..21c420770e 100644 --- a/ewah/ewok.h +++ b/ewah/ewok.h @@ -169,11 +169,6 @@ void ewah_or( struct ewah_bitmap *ewah_j, struct ewah_bitmap *out); -void ewah_and_not( - struct ewah_bitmap *ewah_i, - struct ewah_bitmap *ewah_j, - struct ewah_bitmap *out); - void ewah_xor( struct ewah_bitmap *ewah_i, struct ewah_bitmap *ewah_j, -- 2.18.0.rc1
Re: [PATCH 0/8] Delete unused methods in EWAH bitmap
On 6/15/2018 10:30 AM, Derrick Stolee wrote: The EWAH bitmap code includes several logical operations that are important for a general-purpose bitmap library. However, Git only uses the XOR operation for storing deltas between reachability bitmaps. This means that we can delete the following unused methods: * ewah_and() * ewah_and_not() * ewah_not() * ewah_or() * ewah_serialize() * ewah_serialize_native() We can also delete the unused methods bitmap_clear() and bitmap_each_bit(). Derrick Stolee (8): ewah/bitmap.c: delete unused 'bitmap_clear()' ewah/bitmap.c: delete unused 'bitmap_each_bit()' ewah_bitmap: delete unused 'ewah_and()' ewah_bitmap: delete unused 'ewah_and_not()' ewah_bitmap: delete unused 'ewah_not()' ewah_bitmap: delete unused 'ewah_or()' ewah_io: delete unused 'ewah_serialize()' ewah_io: delete unused 'ewah_serialize_native()' ewah/bitmap.c | 32 --- ewah/ewah_bitmap.c | 229 - ewah/ewah_io.c | 36 --- ewah/ewok.h| 24 - 4 files changed, 321 deletions(-) base-commit: fc54c1af3ec09bab8b8ea09768c2da4069b7f53e Responders to this thread beware: I accidentally added an extra letter in Peff's email address, so be careful with reply-all. Sorry! -Stolee
Re: [PATCH 1/8] ewah/bitmap.c: delete unused 'bitmap_clear()'
On 6/15/2018 10:46 AM, Ramsay Jones wrote: On 15/06/18 15:30, Derrick Stolee wrote: Reported-by: Ramsay Jones Signed-off-by: Derrick Stolee --- ewah/bitmap.c | 8 1 file changed, 8 deletions(-) diff --git a/ewah/bitmap.c b/ewah/bitmap.c index 756bdd050e..d61dc6114a 100644 --- a/ewah/bitmap.c +++ b/ewah/bitmap.c @@ -45,14 +45,6 @@ void bitmap_set(struct bitmap *self, size_t pos) self->words[block] |= EWAH_MASK(pos); } -void bitmap_clear(struct bitmap *self, size_t pos) -{ - size_t block = EWAH_BLOCK(pos); - - if (block < self->word_alloc) - self->words[block] &= ~EWAH_MASK(pos); -} - int bitmap_get(struct bitmap *self, size_t pos) { size_t block = EWAH_BLOCK(pos); I haven't read all the patches yet, but what about the extern declaration in ewah/ewok.h? Thanks! I'll get that, too.
Re: [PATCH v2 05/31] tree: add repository argument to lookup_tree
On 6/14/2018 1:55 PM, Derrick Stolee wrote: On 6/13/2018 7:04 PM, Stefan Beller wrote: diff --git a/commit-graph.c b/commit-graph.c index 71125d7cbb6..88a4b0d2a47 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -261,7 +261,7 @@ static int fill_commit_in_graph(struct commit *item, struct commit_graph *g, uin item->graph_pos = pos; hashcpy(oid.hash, commit_data); - item->tree = lookup_tree(); + item->tree = lookup_tree(the_repository, ); date_high = get_be32(commit_data + g->hash_len + 8) & 0x3; date_low = get_be32(commit_data + g->hash_len + 12); diff --git a/commit.c b/commit.c index d76d64d4dfc..755b8b9d94f 100644 --- a/commit.c +++ b/commit.c @@ -354,7 +354,7 @@ int parse_commit_buffer(struct commit *item, const void *buffer, unsigned long s if (get_sha1_hex(bufptr + 5, parent.hash) < 0) return error("bad tree pointer in commit %s", oid_to_hex(>object.oid)); - item->tree = lookup_tree(); + item->tree = lookup_tree(the_repository, ); bufptr += tree_entry_len + 1; /* "tree " + "hex sha1" + "\n" */ pptr = >parents; diff --git a/fsck.c b/fsck.c index 2d372f2a3f3..b07abb9796c 100644 I'm a bit late here, but you don't have ds/lazy-load-trees merged in your history, so this will conflict with 'master'. I caught this as I was trying to merge ds/generation-numbers with your branch. The 'tree' member of 'struct commit' was renamed to 'maybe_tree'. Resolving the merge was not very simple. I have a working merge available on GitHub [1] as commit 99a899f7c12ef73840dbe79c71acb11034d707dd. Thanks, -Stolee [1] https://github.com/derrickstolee/git/commits/generation-numbers-object-store
Re: [PATCH v2 0/7] Delete unused methods in EWAH bitmap
On 6/15/2018 2:51 PM, Junio C Hamano wrote: Derrick Stolee writes: The EWAH bitmap code includes several logical operations that are important for a general-purpose bitmap library. However, Git only uses the XOR operation for storing deltas between reachability bitmaps. This means that we can delete the following unused methods: * ewah_and() * ewah_and_not() * ewah_not() * ewah_or() * ewah_serialize() We can also delete the unused methods bitmap_clear() and bitmap_each_bit(). Derrick Stolee (7): ewah/bitmap.c: delete unused 'bitmap_clear()' ewah/bitmap.c: delete unused 'bitmap_each_bit()' ewah_bitmap: delete unused 'ewah_and()' ewah_bitmap: delete unused 'ewah_and_not()' ewah_bitmap: delete unused 'ewah_not()' ewah_bitmap: delete unused 'ewah_or()' ewah_io: delete unused 'ewah_serialize()' ewah/bitmap.c | 32 --- ewah/ewah_bitmap.c | 229 - ewah/ewah_io.c | 10 -- ewah/ewok.h| 25 - 4 files changed, 296 deletions(-) base-commit: fc54c1af3ec09bab8b8ea09768c2da4069b7f53e Thanks. ewah_clear() can become file-scope static, and rlwit_discharge_empty() can be eliminated. I do not know if either is worth doing, though. With Peff's patches, this is true. When I applied your diff to my patch alone we could not do that. If you want to create a commit with the below, I'm happy to add my "Reviewed-by: Derrick Stolee " Good team effort today! ewah/ewah_bitmap.c | 20 ewah/ewah_rlw.c| 8 ewah/ewok.h| 6 -- ewah/ewok_rlw.h| 1 - 4 files changed, 12 insertions(+), 23 deletions(-) diff --git a/ewah/ewah_bitmap.c b/ewah/ewah_bitmap.c index 017c677f98..d59b1afe3d 100644 --- a/ewah/ewah_bitmap.c +++ b/ewah/ewah_bitmap.c @@ -276,6 +276,18 @@ void ewah_each_bit(struct ewah_bitmap *self, void (*callback)(size_t, void*), vo } } +/** + * Clear all the bits in the bitmap. Does not free or resize + * memory. + */ +static void ewah_clear(struct ewah_bitmap *self) +{ + self->buffer_size = 1; + self->buffer[0] = 0; + self->bit_size = 0; + self->rlw = self->buffer; +} + struct ewah_bitmap *ewah_new(void) { struct ewah_bitmap *self; @@ -288,14 +300,6 @@ struct ewah_bitmap *ewah_new(void) return self; } -void ewah_clear(struct ewah_bitmap *self) -{ - self->buffer_size = 1; - self->buffer[0] = 0; - self->bit_size = 0; - self->rlw = self->buffer; -} - void ewah_free(struct ewah_bitmap *self) { if (!self) diff --git a/ewah/ewah_rlw.c b/ewah/ewah_rlw.c index b9643b7d0f..5093d43e2f 100644 --- a/ewah/ewah_rlw.c +++ b/ewah/ewah_rlw.c @@ -104,11 +104,3 @@ size_t rlwit_discharge( return index; } - -void rlwit_discharge_empty(struct rlw_iterator *it, struct ewah_bitmap *out) -{ - while (rlwit_word_size(it) > 0) { - ewah_add_empty_words(out, 0, rlwit_word_size(it)); - rlwit_discard_first_words(it, rlwit_word_size(it)); - } -} diff --git a/ewah/ewok.h b/ewah/ewok.h index 0c504f28e2..84b2a29faa 100644 --- a/ewah/ewok.h +++ b/ewah/ewok.h @@ -72,12 +72,6 @@ void ewah_pool_free(struct ewah_bitmap *self); */ struct ewah_bitmap *ewah_new(void); -/** - * Clear all the bits in the bitmap. Does not free or resize - * memory. - */ -void ewah_clear(struct ewah_bitmap *self); - /** * Free all the memory of the bitmap */ diff --git a/ewah/ewok_rlw.h b/ewah/ewok_rlw.h index bb3c6ff7e0..7cdfdd0c02 100644 --- a/ewah/ewok_rlw.h +++ b/ewah/ewok_rlw.h @@ -98,7 +98,6 @@ void rlwit_init(struct rlw_iterator *it, struct ewah_bitmap *bitmap); void rlwit_discard_first_words(struct rlw_iterator *it, size_t x); size_t rlwit_discharge( struct rlw_iterator *it, struct ewah_bitmap *out, size_t max, int negate); -void rlwit_discharge_empty(struct rlw_iterator *it, struct ewah_bitmap *out); static inline size_t rlwit_word_size(struct rlw_iterator *it) {
Re: [PATCH 4/3] ewah: adjust callers of ewah_read_mmap()
On 6/15/2018 1:31 PM, Jeff King wrote: On Fri, Jun 15, 2018 at 09:41:46AM -0700, Junio C Hamano wrote: Derrick Stolee writes: On 6/14/2018 11:44 PM, Jeff King wrote: The return value of ewah_read_mmap() is now an ssize_t, since we could (in theory) process up to 32GB of data. This would never happen in practice, but a corrupt or malicious .bitmap or index file could convince us to do so. Aside: Why a 32GB limit? Usually I see 32-bit issues limiting by 2 (signed) or 4 GB (unsigned). Is there something specifically in the bitmap format that stops at this size? The proposed log message for 1/3 has this bit - check the size for integer overflow (this should be impossible on 64-bit, as the size is given as a 32-bit count of 8-byte words, but is possible on a 32-bit system) 4 Giga 8-byte words makes 32 Giga bytes, I'd guess. Yes, exactly. It's definitely an odd size. I think using the same varints we use in the packfile would be more efficient (and they're already variable-length records), though we tend to have few enough of these that it probably doesn't matter. I think a more useful change for the bitmap format would be some kind of index. IIRC, right now readers have to linearly scan the whole file in order to use a single bitmap. With Stolee's commit-metadata files, we could potentially move to storing reachability bitmaps as a data element there. But there are two complications: - the bitmaps themselves are dependent on having an ordered list of objects (one per bit) to talk about. And that's why they're integrated with packfiles, since that provides a constrained universe with a set ordering. - the existing storage isn't entirely independent between bitmaps. Some of them are basically "deltas" off of nearby bitmaps. The VSTS reachability bitmap is different from the Git bitmap in a few ways. 1. It uses Roaring+Run bitmaps [1] instead of EWAH bitmap. This format is simpler (in my opinion) in several ways, especially in how it splits the bitmap into 16-bit address ranges and compresses each on its own. This makes set containment queries much faster, as we can navigate directly to the region that is important (and then binary search if that chunk is an "array" or "run" chunk). I say this is simpler because I can explain the entire compression format to you in five minutes and a whiteboard. The paper [2] is a simple enough read (start at the "Roaring Bitmap" section at the end of page 4). 2. Our file uses a chunk-based approach similar to the commit-graph file: one chunk is simply a list of the commits that have bitmaps. Another chunk is the raw binary data of all bitmaps concatenated together. The last chunk connects the other two chunks by a) providing an offset into the binary data for the bitmap at that commit, and b) the commit that provides a delta base for the bitmap. If we are considering changing the reachability bitmap, then I'm very intrigued. I think the number one thing to consider is to use the multi-pack-index as a reference point (with a stable object order) so the objects can be repacked independently from the reachability bitmap computation. If we are changing the model at that level, then it is worth thinking about other questions, like how we index the file or how we compress the bitmaps. Thanks, -Stolee [1] https://roaringbitmap.org/about/ [2] https://arxiv.org/abs/1603.06549 Consistently faster and smaller compressed bitmaps with Roaring
[PATCH v2 7/7] ewah_io: delete unused 'ewah_serialize()'
Reported-by: Ramsay Jones Signed-off-by: Derrick Stolee --- ewah/ewah_io.c | 10 -- ewah/ewok.h| 1 - 2 files changed, 11 deletions(-) diff --git a/ewah/ewah_io.c b/ewah/ewah_io.c index bed1994551..86d3f1d02e 100644 --- a/ewah/ewah_io.c +++ b/ewah/ewah_io.c @@ -100,16 +100,6 @@ int ewah_serialize_to(struct ewah_bitmap *self, return (3 * 4) + (self->buffer_size * 8); } -static int write_helper(void *fd, const void *buf, size_t len) -{ - return write((intptr_t)fd, buf, len); -} - -int ewah_serialize(struct ewah_bitmap *self, int fd) -{ - return ewah_serialize_to(self, write_helper, (void *)(intptr_t)fd); -} - static int write_strbuf(void *user_data, const void *data, size_t len) { struct strbuf *sb = user_data; diff --git a/ewah/ewok.h b/ewah/ewok.h index a540df2aa9..f3ac78ef1a 100644 --- a/ewah/ewok.h +++ b/ewah/ewok.h @@ -86,7 +86,6 @@ void ewah_free(struct ewah_bitmap *self); int ewah_serialize_to(struct ewah_bitmap *self, int (*write_fun)(void *out, const void *buf, size_t len), void *out); -int ewah_serialize(struct ewah_bitmap *self, int fd); int ewah_serialize_native(struct ewah_bitmap *self, int fd); int ewah_serialize_strbuf(struct ewah_bitmap *self, struct strbuf *); -- 2.18.0.rc1
[PATCH v2 2/7] ewah/bitmap.c: delete unused 'bitmap_each_bit()'
Reported-by: Ramsay Jones Signed-off-by: Derrick Stolee --- ewah/bitmap.c | 24 ewah/ewok.h | 1 - 2 files changed, 25 deletions(-) diff --git a/ewah/bitmap.c b/ewah/bitmap.c index d61dc6114a..52f1178db4 100644 --- a/ewah/bitmap.c +++ b/ewah/bitmap.c @@ -129,30 +129,6 @@ void bitmap_or_ewah(struct bitmap *self, struct ewah_bitmap *other) self->words[i++] |= word; } -void bitmap_each_bit(struct bitmap *self, ewah_callback callback, void *data) -{ - size_t pos = 0, i; - - for (i = 0; i < self->word_alloc; ++i) { - eword_t word = self->words[i]; - uint32_t offset; - - if (word == (eword_t)~0) { - for (offset = 0; offset < BITS_IN_EWORD; ++offset) - callback(pos++, data); - } else { - for (offset = 0; offset < BITS_IN_EWORD; ++offset) { - if ((word >> offset) == 0) - break; - - offset += ewah_bit_ctz64(word >> offset); - callback(pos + offset, data); - } - pos += BITS_IN_EWORD; - } - } -} - size_t bitmap_popcount(struct bitmap *self) { size_t i, count = 0; diff --git a/ewah/ewok.h b/ewah/ewok.h index 2d25e93b9e..71704ed37f 100644 --- a/ewah/ewok.h +++ b/ewah/ewok.h @@ -217,7 +217,6 @@ void bitmap_and_not(struct bitmap *self, struct bitmap *other); void bitmap_or_ewah(struct bitmap *self, struct ewah_bitmap *other); void bitmap_or(struct bitmap *self, const struct bitmap *other); -void bitmap_each_bit(struct bitmap *self, ewah_callback callback, void *data); size_t bitmap_popcount(struct bitmap *self); #endif -- 2.18.0.rc1
[PATCH v2 1/7] ewah/bitmap.c: delete unused 'bitmap_clear()'
Reported-by: Ramsay Jones Signed-off-by: Derrick Stolee --- ewah/bitmap.c | 8 ewah/ewok.h | 1 - 2 files changed, 9 deletions(-) diff --git a/ewah/bitmap.c b/ewah/bitmap.c index 756bdd050e..d61dc6114a 100644 --- a/ewah/bitmap.c +++ b/ewah/bitmap.c @@ -45,14 +45,6 @@ void bitmap_set(struct bitmap *self, size_t pos) self->words[block] |= EWAH_MASK(pos); } -void bitmap_clear(struct bitmap *self, size_t pos) -{ - size_t block = EWAH_BLOCK(pos); - - if (block < self->word_alloc) - self->words[block] &= ~EWAH_MASK(pos); -} - int bitmap_get(struct bitmap *self, size_t pos) { size_t block = EWAH_BLOCK(pos); diff --git a/ewah/ewok.h b/ewah/ewok.h index dc43d05b64..2d25e93b9e 100644 --- a/ewah/ewok.h +++ b/ewah/ewok.h @@ -204,7 +204,6 @@ struct bitmap { struct bitmap *bitmap_new(void); void bitmap_set(struct bitmap *self, size_t pos); -void bitmap_clear(struct bitmap *self, size_t pos); int bitmap_get(struct bitmap *self, size_t pos); void bitmap_reset(struct bitmap *self); void bitmap_free(struct bitmap *self); -- 2.18.0.rc1
[PATCH v2 4/7] ewah_bitmap: delete unused 'ewah_and_not()'
Reported-by: Ramsay Jones Signed-off-by: Derrick Stolee --- ewah/ewah_bitmap.c | 73 -- ewah/ewok.h| 5 2 files changed, 78 deletions(-) diff --git a/ewah/ewah_bitmap.c b/ewah/ewah_bitmap.c index 5a4d3a6eb6..559adde37c 100644 --- a/ewah/ewah_bitmap.c +++ b/ewah/ewah_bitmap.c @@ -459,79 +459,6 @@ void ewah_xor( out->bit_size = max_size(ewah_i->bit_size, ewah_j->bit_size); } -void ewah_and_not( - struct ewah_bitmap *ewah_i, - struct ewah_bitmap *ewah_j, - struct ewah_bitmap *out) -{ - struct rlw_iterator rlw_i; - struct rlw_iterator rlw_j; - size_t literals; - - rlwit_init(_i, ewah_i); - rlwit_init(_j, ewah_j); - - while (rlwit_word_size(_i) > 0 && rlwit_word_size(_j) > 0) { - while (rlw_i.rlw.running_len > 0 || rlw_j.rlw.running_len > 0) { - struct rlw_iterator *prey, *predator; - - if (rlw_i.rlw.running_len < rlw_j.rlw.running_len) { - prey = _i; - predator = _j; - } else { - prey = _j; - predator = _i; - } - - if ((predator->rlw.running_bit && prey == _i) || - (!predator->rlw.running_bit && prey != _i)) { - ewah_add_empty_words(out, 0, - predator->rlw.running_len); - rlwit_discard_first_words(prey, - predator->rlw.running_len); - rlwit_discard_first_words(predator, - predator->rlw.running_len); - } else { - size_t index; - int negate_words; - - negate_words = (_i != prey); - index = rlwit_discharge(prey, out, - predator->rlw.running_len, negate_words); - ewah_add_empty_words(out, negate_words, - predator->rlw.running_len - index); - rlwit_discard_first_words(predator, - predator->rlw.running_len); - } - } - - literals = min_size( - rlw_i.rlw.literal_words, - rlw_j.rlw.literal_words); - - if (literals) { - size_t k; - - for (k = 0; k < literals; ++k) { - ewah_add(out, - rlw_i.buffer[rlw_i.literal_word_start + k] & - ~(rlw_j.buffer[rlw_j.literal_word_start + k]) - ); - } - - rlwit_discard_first_words(_i, literals); - rlwit_discard_first_words(_j, literals); - } - } - - if (rlwit_word_size(_i) > 0) - rlwit_discharge(_i, out, ~0, 0); - else - rlwit_discharge_empty(_j, out); - - out->bit_size = max_size(ewah_i->bit_size, ewah_j->bit_size); -} - void ewah_or( struct ewah_bitmap *ewah_i, struct ewah_bitmap *ewah_j, diff --git a/ewah/ewok.h b/ewah/ewok.h index f4ada6979e..8dae8a608c 100644 --- a/ewah/ewok.h +++ b/ewah/ewok.h @@ -169,11 +169,6 @@ void ewah_or( struct ewah_bitmap *ewah_j, struct ewah_bitmap *out); -void ewah_and_not( - struct ewah_bitmap *ewah_i, - struct ewah_bitmap *ewah_j, - struct ewah_bitmap *out); - void ewah_xor( struct ewah_bitmap *ewah_i, struct ewah_bitmap *ewah_j, -- 2.18.0.rc1
[PATCH v2 6/7] ewah_bitmap: delete unused 'ewah_or()'
Reported-by: Ramsay Jones Signed-off-by: Derrick Stolee --- ewah/ewah_bitmap.c | 69 -- ewah/ewok.h| 5 2 files changed, 74 deletions(-) diff --git a/ewah/ewah_bitmap.c b/ewah/ewah_bitmap.c index 3fa3edf2a3..017c677f98 100644 --- a/ewah/ewah_bitmap.c +++ b/ewah/ewah_bitmap.c @@ -440,75 +440,6 @@ void ewah_xor( out->bit_size = max_size(ewah_i->bit_size, ewah_j->bit_size); } -void ewah_or( - struct ewah_bitmap *ewah_i, - struct ewah_bitmap *ewah_j, - struct ewah_bitmap *out) -{ - struct rlw_iterator rlw_i; - struct rlw_iterator rlw_j; - size_t literals; - - rlwit_init(_i, ewah_i); - rlwit_init(_j, ewah_j); - - while (rlwit_word_size(_i) > 0 && rlwit_word_size(_j) > 0) { - while (rlw_i.rlw.running_len > 0 || rlw_j.rlw.running_len > 0) { - struct rlw_iterator *prey, *predator; - - if (rlw_i.rlw.running_len < rlw_j.rlw.running_len) { - prey = _i; - predator = _j; - } else { - prey = _j; - predator = _i; - } - - if (predator->rlw.running_bit) { - ewah_add_empty_words(out, 0, - predator->rlw.running_len); - rlwit_discard_first_words(prey, - predator->rlw.running_len); - rlwit_discard_first_words(predator, - predator->rlw.running_len); - } else { - size_t index = rlwit_discharge(prey, out, - predator->rlw.running_len, 0); - ewah_add_empty_words(out, 0, - predator->rlw.running_len - index); - rlwit_discard_first_words(predator, - predator->rlw.running_len); - } - } - - literals = min_size( - rlw_i.rlw.literal_words, - rlw_j.rlw.literal_words); - - if (literals) { - size_t k; - - for (k = 0; k < literals; ++k) { - ewah_add(out, - rlw_i.buffer[rlw_i.literal_word_start + k] | - rlw_j.buffer[rlw_j.literal_word_start + k] - ); - } - - rlwit_discard_first_words(_i, literals); - rlwit_discard_first_words(_j, literals); - } - } - - if (rlwit_word_size(_i) > 0) - rlwit_discharge(_i, out, ~0, 0); - else - rlwit_discharge(_j, out, ~0, 0); - - out->bit_size = max_size(ewah_i->bit_size, ewah_j->bit_size); -} - - #define BITMAP_POOL_MAX 16 static struct ewah_bitmap *bitmap_pool[BITMAP_POOL_MAX]; static size_t bitmap_pool_size; diff --git a/ewah/ewok.h b/ewah/ewok.h index a467645eec..a540df2aa9 100644 --- a/ewah/ewok.h +++ b/ewah/ewok.h @@ -157,11 +157,6 @@ void ewah_iterator_init(struct ewah_iterator *it, struct ewah_bitmap *parent); */ int ewah_iterator_next(eword_t *next, struct ewah_iterator *it); -void ewah_or( - struct ewah_bitmap *ewah_i, - struct ewah_bitmap *ewah_j, - struct ewah_bitmap *out); - void ewah_xor( struct ewah_bitmap *ewah_i, struct ewah_bitmap *ewah_j, -- 2.18.0.rc1
[PATCH v2 5/7] ewah_bitmap: delete unused 'ewah_not()'
Reported-by: Ramsay Jones Signed-off-by: Derrick Stolee --- ewah/ewah_bitmap.c | 19 --- ewah/ewok.h| 7 --- 2 files changed, 26 deletions(-) diff --git a/ewah/ewah_bitmap.c b/ewah/ewah_bitmap.c index 559adde37c..3fa3edf2a3 100644 --- a/ewah/ewah_bitmap.c +++ b/ewah/ewah_bitmap.c @@ -376,25 +376,6 @@ void ewah_iterator_init(struct ewah_iterator *it, struct ewah_bitmap *parent) read_new_rlw(it); } -void ewah_not(struct ewah_bitmap *self) -{ - size_t pointer = 0; - - while (pointer < self->buffer_size) { - eword_t *word = >buffer[pointer]; - size_t literals, k; - - rlw_xor_run_bit(word); - ++pointer; - - literals = rlw_get_literal_words(word); - for (k = 0; k < literals; ++k) { - self->buffer[pointer] = ~self->buffer[pointer]; - ++pointer; - } - } -} - void ewah_xor( struct ewah_bitmap *ewah_i, struct ewah_bitmap *ewah_j, diff --git a/ewah/ewok.h b/ewah/ewok.h index 8dae8a608c..a467645eec 100644 --- a/ewah/ewok.h +++ b/ewah/ewok.h @@ -95,13 +95,6 @@ int ewah_read_mmap(struct ewah_bitmap *self, const void *map, size_t len); uint32_t ewah_checksum(struct ewah_bitmap *self); -/** - * Logical not (bitwise negation) in-place on the bitmap - * - * This operation is linear time based on the size of the bitmap. - */ -void ewah_not(struct ewah_bitmap *self); - /** * Call the given callback with the position of every single bit * that has been set on the bitmap. -- 2.18.0.rc1
[PATCH v2 3/7] ewah_bitmap: delete unused 'ewah_and()'
Reported-by: Ramsay Jones Signed-off-by: Derrick Stolee --- ewah/ewah_bitmap.c | 68 -- ewah/ewok.h| 5 2 files changed, 73 deletions(-) diff --git a/ewah/ewah_bitmap.c b/ewah/ewah_bitmap.c index b9fdda1d3d..5a4d3a6eb6 100644 --- a/ewah/ewah_bitmap.c +++ b/ewah/ewah_bitmap.c @@ -459,74 +459,6 @@ void ewah_xor( out->bit_size = max_size(ewah_i->bit_size, ewah_j->bit_size); } -void ewah_and( - struct ewah_bitmap *ewah_i, - struct ewah_bitmap *ewah_j, - struct ewah_bitmap *out) -{ - struct rlw_iterator rlw_i; - struct rlw_iterator rlw_j; - size_t literals; - - rlwit_init(_i, ewah_i); - rlwit_init(_j, ewah_j); - - while (rlwit_word_size(_i) > 0 && rlwit_word_size(_j) > 0) { - while (rlw_i.rlw.running_len > 0 || rlw_j.rlw.running_len > 0) { - struct rlw_iterator *prey, *predator; - - if (rlw_i.rlw.running_len < rlw_j.rlw.running_len) { - prey = _i; - predator = _j; - } else { - prey = _j; - predator = _i; - } - - if (predator->rlw.running_bit == 0) { - ewah_add_empty_words(out, 0, - predator->rlw.running_len); - rlwit_discard_first_words(prey, - predator->rlw.running_len); - rlwit_discard_first_words(predator, - predator->rlw.running_len); - } else { - size_t index = rlwit_discharge(prey, out, - predator->rlw.running_len, 0); - ewah_add_empty_words(out, 0, - predator->rlw.running_len - index); - rlwit_discard_first_words(predator, - predator->rlw.running_len); - } - } - - literals = min_size( - rlw_i.rlw.literal_words, - rlw_j.rlw.literal_words); - - if (literals) { - size_t k; - - for (k = 0; k < literals; ++k) { - ewah_add(out, - rlw_i.buffer[rlw_i.literal_word_start + k] & - rlw_j.buffer[rlw_j.literal_word_start + k] - ); - } - - rlwit_discard_first_words(_i, literals); - rlwit_discard_first_words(_j, literals); - } - } - - if (rlwit_word_size(_i) > 0) - rlwit_discharge_empty(_i, out); - else - rlwit_discharge_empty(_j, out); - - out->bit_size = max_size(ewah_i->bit_size, ewah_j->bit_size); -} - void ewah_and_not( struct ewah_bitmap *ewah_i, struct ewah_bitmap *ewah_j, diff --git a/ewah/ewok.h b/ewah/ewok.h index 71704ed37f..f4ada6979e 100644 --- a/ewah/ewok.h +++ b/ewah/ewok.h @@ -179,11 +179,6 @@ void ewah_xor( struct ewah_bitmap *ewah_j, struct ewah_bitmap *out); -void ewah_and( - struct ewah_bitmap *ewah_i, - struct ewah_bitmap *ewah_j, - struct ewah_bitmap *out); - /** * Direct word access */ -- 2.18.0.rc1
[PATCH v2 0/7] Delete unused methods in EWAH bitmap
The EWAH bitmap code includes several logical operations that are important for a general-purpose bitmap library. However, Git only uses the XOR operation for storing deltas between reachability bitmaps. This means that we can delete the following unused methods: * ewah_and() * ewah_and_not() * ewah_not() * ewah_or() * ewah_serialize() We can also delete the unused methods bitmap_clear() and bitmap_each_bit(). Derrick Stolee (7): ewah/bitmap.c: delete unused 'bitmap_clear()' ewah/bitmap.c: delete unused 'bitmap_each_bit()' ewah_bitmap: delete unused 'ewah_and()' ewah_bitmap: delete unused 'ewah_and_not()' ewah_bitmap: delete unused 'ewah_not()' ewah_bitmap: delete unused 'ewah_or()' ewah_io: delete unused 'ewah_serialize()' ewah/bitmap.c | 32 --- ewah/ewah_bitmap.c | 229 - ewah/ewah_io.c | 10 -- ewah/ewok.h| 25 - 4 files changed, 296 deletions(-) base-commit: fc54c1af3ec09bab8b8ea09768c2da4069b7f53e -- 2.18.0.rc1
Re: [PATCH 05/23] midx: write header information to lockfile
On 6/19/2018 10:59 AM, Duy Nguyen wrote: On Tue, Jun 19, 2018 at 2:54 PM Derrick Stolee wrote: On 6/12/2018 11:00 AM, Duy Nguyen wrote: On Thu, Jun 7, 2018 at 7:01 PM Derrick Stolee wrote: diff --git a/midx.c b/midx.c index 616af66b13..3e55422a21 100644 --- a/midx.c +++ b/midx.c @@ -1,9 +1,62 @@ #include "git-compat-util.h" #include "cache.h" #include "dir.h" +#include "csum-file.h" +#include "lockfile.h" #include "midx.h" +#define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */ +#define MIDX_VERSION 1 +#define MIDX_HASH_VERSION 1 /* SHA-1 */ ... +static size_t write_midx_header(struct hashfile *f, + unsigned char num_chunks, + uint32_t num_packs) +{ + char byte_values[4]; + hashwrite_be32(f, MIDX_SIGNATURE); + byte_values[0] = MIDX_VERSION; + byte_values[1] = MIDX_HASH_VERSION; Quoting from "State of NewHash work, future directions, and discussion" [1] * If you need to serialize an algorithm identifier into your data format, use the format_id field of struct git_hash_algo. It's designed specifically for that purpose. [1] https://public-inbox.org/git/20180612024252.ga141...@aiede.svl.corp.google.com/T/#m5fdd09dcaf31266c45343fb6c0beaaa3e928bc60 Thanks! I'll also use the_hash_algo->rawsz to infer the length of the hash function. BTW, since you're the author of commit-graph.c and may notice it has the same problem. Don't touch that code. Brian already has some WIP changes [1]. We just make sure new code does not add extra work for him. I expect he'll send all those patches out soon. [1] https://github.com/bk2204/git/commit/3f9031e06cfb21534eb7dfff7b54e7598ac1149f Thanks for the link. It seems he is creating an oid_version() method that returns a 1-byte version for the hash version instead of the 4-byte signature of the_hash_algo->format_id. I look forward to incorporating that into the MIDX format. I'll keep my macros for now, as we work out the other details, and while Brain's patch is cooking. Thanks, -Stolee
Re: [PATCH 01/15] contrib: add cocci script to replace index compat macros
On 6/19/2018 10:51 AM, Duy Nguyen wrote: On Tue, Jun 19, 2018 at 1:41 PM Derrick Stolee wrote: Duy, Here is the patch that was generated by `make coccicheck`. Thanks, -Stolee -->8-- --- builtin/add.c Ah right. This is on purpose. I think I mentioned in the commit message that builtin/ is not touched. Do we run 'make coccicheck' automatically somewhere? If true, I need to move this script elsewhere because it's meant to run manually. You run it when you intend to do more manual fixups afterwards. For builtin/, I think I'll wait until 'struct repository *' conversion is complete then maybe fix them one by one. I don't think it is part of the CI runs, but some community members run this themselves on 'next' and 'master'. I'm CC'ing Szeder, as he has contributed a fix to my own Coccinelle script [1]. [1] https://public-inbox.org/git/20180430093153.13040-1-szeder@gmail.com/ [PATCH] coccinelle: avoid wrong transformation suggestions from commit.cocci
Re: [PATCH] ewah: delete unused 'rlwit_discharge_empty()'
On 6/19/2018 5:51 PM, Ramsay Jones wrote: From: Junio C Hamano Complete the removal of unused 'ewah bitmap' code by removing the now unused 'rlwit_discharge_empty()' function. Also, the 'ewah_clear()' function can now be made a file-scope static symbol. Signed-off-by: Ramsay Jones --- Hi Junio, Can you please add this to the 'ds/ewah-cleanup' branch, before we forget to do so! ;-) Thanks! ATB, Ramsay Jones Looks good to me! Thanks! -Stolee ewah/ewah_bitmap.c | 20 ewah/ewah_rlw.c| 8 ewah/ewok.h| 6 -- ewah/ewok_rlw.h| 1 - 4 files changed, 12 insertions(+), 23 deletions(-) diff --git a/ewah/ewah_bitmap.c b/ewah/ewah_bitmap.c index 017c677f9..d59b1afe3 100644 --- a/ewah/ewah_bitmap.c +++ b/ewah/ewah_bitmap.c @@ -276,6 +276,18 @@ void ewah_each_bit(struct ewah_bitmap *self, void (*callback)(size_t, void*), vo } } +/** + * Clear all the bits in the bitmap. Does not free or resize + * memory. + */ +static void ewah_clear(struct ewah_bitmap *self) +{ + self->buffer_size = 1; + self->buffer[0] = 0; + self->bit_size = 0; + self->rlw = self->buffer; +} + struct ewah_bitmap *ewah_new(void) { struct ewah_bitmap *self; @@ -288,14 +300,6 @@ struct ewah_bitmap *ewah_new(void) return self; } -void ewah_clear(struct ewah_bitmap *self) -{ - self->buffer_size = 1; - self->buffer[0] = 0; - self->bit_size = 0; - self->rlw = self->buffer; -} - void ewah_free(struct ewah_bitmap *self) { if (!self) diff --git a/ewah/ewah_rlw.c b/ewah/ewah_rlw.c index b9643b7d0..5093d43e2 100644 --- a/ewah/ewah_rlw.c +++ b/ewah/ewah_rlw.c @@ -104,11 +104,3 @@ size_t rlwit_discharge( return index; } - -void rlwit_discharge_empty(struct rlw_iterator *it, struct ewah_bitmap *out) -{ - while (rlwit_word_size(it) > 0) { - ewah_add_empty_words(out, 0, rlwit_word_size(it)); - rlwit_discard_first_words(it, rlwit_word_size(it)); - } -} diff --git a/ewah/ewok.h b/ewah/ewok.h index 0c504f28e..84b2a29fa 100644 --- a/ewah/ewok.h +++ b/ewah/ewok.h @@ -72,12 +72,6 @@ void ewah_pool_free(struct ewah_bitmap *self); */ struct ewah_bitmap *ewah_new(void); -/** - * Clear all the bits in the bitmap. Does not free or resize - * memory. - */ -void ewah_clear(struct ewah_bitmap *self); - /** * Free all the memory of the bitmap */ diff --git a/ewah/ewok_rlw.h b/ewah/ewok_rlw.h index bb3c6ff7e..7cdfdd0c0 100644 --- a/ewah/ewok_rlw.h +++ b/ewah/ewok_rlw.h @@ -98,7 +98,6 @@ void rlwit_init(struct rlw_iterator *it, struct ewah_bitmap *bitmap); void rlwit_discard_first_words(struct rlw_iterator *it, size_t x); size_t rlwit_discharge( struct rlw_iterator *it, struct ewah_bitmap *out, size_t max, int negate); -void rlwit_discharge_empty(struct rlw_iterator *it, struct ewah_bitmap *out); static inline size_t rlwit_word_size(struct rlw_iterator *it) {
Re: [PATCH 03/35] object: add repository argument to lookup_unknown_object
On 6/13/2018 3:30 PM, Stefan Beller wrote: On Wed, Jun 6, 2018 at 12:38 PM Duy Nguyen wrote: On Wed, May 30, 2018 at 2:47 AM, Stefan Beller wrote: diff --git a/object.c b/object.c index 4de4fa58d59..def3c71cac2 100644 --- a/object.c +++ b/object.c @@ -177,7 +177,7 @@ void *object_as_type(struct object *obj, enum object_type type, int quiet) } } -struct object *lookup_unknown_object(const unsigned char *sha1) +struct object *lookup_unknown_object_the_repository(const unsigned char *sha1) I'm looking at your branch and this function (with the _the_repository suffix) is still there. Did you forget to send a patch to convert this function? This and parse_commit and parse_commit_gently have not been converted. I stopped with this series as soon as I hit the commit-graph code, which needs to be updated, too. I'll start redoing this series soon and will fix those conversions. Yes, we are colliding a bit at the moment. I'm planning to delay rerolling my patches until your last few series get merged. Thanks, -Stolee
Re: [PATCH v2 00/31] object-store: lookup_commit
On 6/13/2018 7:04 PM, Stefan Beller wrote * Once this is in good shape we can talk about converting parts of the revision walking code, This is another reason why I'll be waiting for this series of series. I plan to rework the revision walking code around sort_in_topological_order(), but I'll wait for your changes to land before submitting a patch. I'm itching to play with it, so I may start testing things out before then. In the meantime, I'll do my part by reviewing the current series. Thanks, -Stolee
Re: [PATCH v2 00/31] object-store: lookup_commit
On 6/13/2018 9:23 PM, Derrick Stolee wrote: On 6/13/2018 7:04 PM, Stefan Beller wrote * Once this is in good shape we can talk about converting parts of the revision walking code, This is another reason why I'll be waiting for this series of series. I plan to rework the revision walking code around sort_in_topological_order(), but I'll wait for your changes to land before submitting a patch. I'm itching to play with it, so I may start testing things out before then. In the meantime, I'll do my part by reviewing the current series. Looks good to me. Reviewed-by: Derrick Stolee
Re: [PATCH 03/35] object: add repository argument to lookup_unknown_object
On 5/29/2018 8:47 PM, Stefan Beller wrote: From: Jonathan Nieder Add a repository argument to allow callers of lookup_unknown_object to be more specific about which repository to handle. This is a small mechanical change; it doesn't change the implementation to handle repositories other than the_repository yet. As with the previous commits, use a macro to catch callers passing a repositor other than the_repository at compile time. The included coccinelle semantic patch will adapt any new callers in the diff produced by `make coccicheck`. I don't see any coccinelle change in this patch. Perhaps it is in a later commit? Thanks, -Stolee Signed-off-by: Jonathan Nieder Signed-off-by: Stefan Beller --- builtin/fsck.c | 2 +- builtin/pack-objects.c | 2 +- http-push.c | 2 +- object.c | 2 +- object.h | 3 ++- refs.c | 2 +- t/helper/test-example-decorate.c | 6 +++--- upload-pack.c| 2 +- walker.c | 2 +- 9 files changed, 12 insertions(+), 11 deletions(-) diff --git a/builtin/fsck.c b/builtin/fsck.c index 98fdeef5407..700739804fc 100644 --- a/builtin/fsck.c +++ b/builtin/fsck.c @@ -638,7 +638,7 @@ static int fsck_cache_tree(struct cache_tree *it) static void mark_object_for_connectivity(const struct object_id *oid) { - struct object *obj = lookup_unknown_object(oid->hash); + struct object *obj = lookup_unknown_object(the_repository, oid->hash); obj->flags |= HAS_OBJ; } diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 8c108109985..6eae39cf858 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -2689,7 +2689,7 @@ static void add_objects_in_unpacked_packs(struct rev_info *revs) for (i = 0; i < p->num_objects; i++) { nth_packed_object_oid(, p, i); - o = lookup_unknown_object(oid.hash); + o = lookup_unknown_object(the_repository, oid.hash); if (!(o->flags & OBJECT_ADDED)) mark_in_pack_object(o, p, _pack); o->flags |= OBJECT_ADDED; diff --git a/http-push.c b/http-push.c index 2615c823d60..04d95bd5da8 100644 --- a/http-push.c +++ b/http-push.c @@ -1429,7 +1429,7 @@ static void one_remote_ref(const char *refname) * may be required for updating server info later. */ if (repo->can_update_info_refs && !has_object_file(>old_oid)) { - obj = lookup_unknown_object(ref->old_oid.hash); + obj = lookup_unknown_object(the_repository, ref->old_oid.hash); fprintf(stderr, " fetch %s for %s\n", oid_to_hex(>old_oid), refname); add_fetch_request(obj); diff --git a/object.c b/object.c index 4de4fa58d59..def3c71cac2 100644 --- a/object.c +++ b/object.c @@ -177,7 +177,7 @@ void *object_as_type(struct object *obj, enum object_type type, int quiet) } } -struct object *lookup_unknown_object(const unsigned char *sha1) +struct object *lookup_unknown_object_the_repository(const unsigned char *sha1) { struct object *obj = lookup_object(the_repository, sha1); if (!obj) diff --git a/object.h b/object.h index fa41d711f44..778f83bf0f7 100644 --- a/object.h +++ b/object.h @@ -139,7 +139,8 @@ struct object *parse_object_or_die(const struct object_id *oid, const char *name struct object *parse_object_buffer(const struct object_id *oid, enum object_type type, unsigned long size, void *buffer, int *eaten_p); /** Returns the object, with potentially excess memory allocated. **/ -struct object *lookup_unknown_object(const unsigned char *sha1); +#define lookup_unknown_object(r, s) lookup_unknown_object_##r(s) +struct object *lookup_unknown_object_the_repository(const unsigned char *sha1); struct object_list *object_list_insert(struct object *item, struct object_list **list_p); diff --git a/refs.c b/refs.c index 23d53957deb..3b9e8463656 100644 --- a/refs.c +++ b/refs.c @@ -301,7 +301,7 @@ static int filter_refs(const char *refname, const struct object_id *oid, enum peel_status peel_object(const struct object_id *name, struct object_id *oid) { - struct object *o = lookup_unknown_object(name->hash); + struct object *o = lookup_unknown_object(the_repository, name->hash); if (o->type == OBJ_NONE) { int type = oid_object_info(the_repository, name, NULL); diff --git a/t/helper/test-example-decorate.c b/t/helper/test-example-decorate.c index 081115bf8eb..33e727f7fc5 100644 --- a/t/helper/test-example-decorate.c +++ b/t/helper/test-example-decorate.c @@ -26,8 +26,8 @@ int cmd__example_decorate(int argc, const char **argv) * Add 2 objects, one with a non-NULL decoration and one with a NULL *
Re: [PATCH 06/35] blob: add repository argument to lookup_blob
On 5/29/2018 8:47 PM, Stefan Beller wrote: Add a repository argument to allow the callers of lookup_blob to be more specific about which repository to act on. This is a small mechanical change; it doesn't change the implementation to handle repositories other than the_repository yet. As with the previous commits, use a macro to catch callers passing a repository other than the_repository at compile time. Add the cocci patch that converted the callers. (this is the last time I'll point this out. I'll wait until I read to the end to see if the coccinelle patch is included in the series.) Signed-off-by: Jonathan Nieder Signed-off-by: Stefan Beller --- blob.c | 2 +- blob.h | 3 ++- builtin/fast-export.c| 2 +- builtin/fsck.c | 3 ++- builtin/index-pack.c | 2 +- builtin/merge-tree.c | 3 ++- builtin/unpack-objects.c | 2 +- fsck.c | 2 +- http-push.c | 3 ++- list-objects.c | 2 +- object.c | 4 ++-- reachable.c | 2 +- revision.c | 4 ++-- tag.c| 2 +- walker.c | 3 ++- 15 files changed, 22 insertions(+), 17 deletions(-) diff --git a/blob.c b/blob.c index dada295698c..17b9314f0a0 100644 --- a/blob.c +++ b/blob.c @@ -5,7 +5,7 @@ const char *blob_type = "blob"; -struct blob *lookup_blob(const struct object_id *oid) +struct blob *lookup_blob_the_repository(const struct object_id *oid) { struct object *obj = lookup_object(the_repository, oid->hash); if (!obj) diff --git a/blob.h b/blob.h index 44606168310..08bc34487a0 100644 --- a/blob.h +++ b/blob.h @@ -9,7 +9,8 @@ struct blob { struct object object; }; -struct blob *lookup_blob(const struct object_id *oid); +#define lookup_blob(r, o) lookup_blob_##r(o) +struct blob *lookup_blob_the_repository(const struct object_id *oid); int parse_blob_buffer(struct blob *item, void *buffer, unsigned long size); diff --git a/builtin/fast-export.c b/builtin/fast-export.c index a34ab9768f4..23ca46e6008 100644 --- a/builtin/fast-export.c +++ b/builtin/fast-export.c @@ -237,7 +237,7 @@ static void export_blob(const struct object_id *oid) if (anonymize) { buf = anonymize_blob(); - object = (struct object *)lookup_blob(oid); + object = (struct object *)lookup_blob(the_repository, oid); eaten = 0; } else { buf = read_object_file(oid, , ); diff --git a/builtin/fsck.c b/builtin/fsck.c index 82a5acaf76e..6e2b204e938 100644 --- a/builtin/fsck.c +++ b/builtin/fsck.c @@ -808,7 +808,8 @@ int cmd_fsck(int argc, const char **argv, const char *prefix) mode = active_cache[i]->ce_mode; if (S_ISGITLINK(mode)) continue; - blob = lookup_blob(_cache[i]->oid); + blob = lookup_blob(the_repository, + _cache[i]->oid); if (!blob) continue; obj = >object; diff --git a/builtin/index-pack.c b/builtin/index-pack.c index 0dd10693597..51c5244a15a 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -832,7 +832,7 @@ static void sha1_object(const void *data, struct object_entry *obj_entry, if (strict || do_fsck_object) { read_lock(); if (type == OBJ_BLOB) { - struct blob *blob = lookup_blob(oid); + struct blob *blob = lookup_blob(the_repository, oid); if (blob) blob->object.flags |= FLAG_CHECKED; else diff --git a/builtin/merge-tree.c b/builtin/merge-tree.c index 8a8d5797520..f8023bae1e2 100644 --- a/builtin/merge-tree.c +++ b/builtin/merge-tree.c @@ -2,6 +2,7 @@ #include "tree-walk.h" #include "xdiff-interface.h" #include "object-store.h" +#include "repository.h" #include "blob.h" #include "exec-cmd.h" #include "merge-blobs.h" @@ -170,7 +171,7 @@ static struct merge_list *create_entry(unsigned stage, unsigned mode, const stru res->stage = stage; res->path = path; res->mode = mode; - res->blob = lookup_blob(oid); + res->blob = lookup_blob(the_repository, oid); return res; } diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c index 8e454c48649..dfea7790fde 100644 --- a/builtin/unpack-objects.c +++ b/builtin/unpack-objects.c @@ -254,7 +254,7 @@ static void write_object(unsigned nr, enum object_type type, added_object(nr, type, buf, size); free(buf); - blob = lookup_blob(_list[nr].oid); + blob = lookup_blob(the_repository, _list[nr].oid); if (blob) blob->object.flags |= FLAG_WRITTEN;
Re: [PATCH v3 06/20] commit-graph: add 'verify' subcommand
On 5/27/2018 6:55 PM, Jakub Narebski wrote: Derrick Stolee writes: If the commit-graph file becomes corrupt, we need a way to verify that its contents match the object database. In the manner of 'git fsck' we will implement a 'git commit-graph verify' subcommand to report all issues with the file. Add the 'verify' subcommand to the 'commit-graph' builtin and its documentation. The subcommand is currently a no-op except for loading the commit-graph into memory, which may trigger run-time errors that would be caught by normal use. So this commit is simply getting the boilerplate out of the way for implementing 'git commit-graph verify' subcommand. Good. Add a simple test that ensures the command returns a zero error code. Nice. If no commit-graph file exists, this is an acceptable state. Do not report any errors. All right. I assume that as it is explicit verification call, it does ignore core.commitGraph setting, isn't it? Signed-off-by: Derrick Stolee --- Documentation/git-commit-graph.txt | 6 ++ builtin/commit-graph.c | 38 ++ commit-graph.c | 26 ++ commit-graph.h | 2 ++ t/t5318-commit-graph.sh| 10 ++ 5 files changed, 82 insertions(+) diff --git a/Documentation/git-commit-graph.txt b/Documentation/git-commit-graph.txt index 4c97b555cc..a222cfab08 100644 --- a/Documentation/git-commit-graph.txt +++ b/Documentation/git-commit-graph.txt @@ -10,6 +10,7 @@ SYNOPSIS [verse] 'git commit-graph read' [--object-dir ] +'git commit-graph verify' [--object-dir ] 'git commit-graph write' [--object-dir ] In alphabetical order, good. @@ -52,6 +53,11 @@ existing commit-graph file. Read a graph file given by the commit-graph file and output basic details about the graph file. Used for debugging purposes. +'verify':: + +Read the commit-graph file and verify its contents against the object +database. Used to check for corrupted data. + All right, good enough description. EXAMPLES diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c index f0875b8bf3..0433dd6e20 100644 --- a/builtin/commit-graph.c +++ b/builtin/commit-graph.c @@ -8,10 +8,16 @@ static char const * const builtin_commit_graph_usage[] = { N_("git commit-graph [--object-dir ]"), N_("git commit-graph read [--object-dir ]"), + N_("git commit-graph verify [--object-dir ]"), N_("git commit-graph write [--object-dir ] [--append] [--stdin-packs|--stdin-commits]"), NULL }; In alphabetical order, same as in the manpage for git-commit-graph. +static const char * const builtin_commit_graph_verify_usage[] = { + N_("git commit-graph verify [--object-dir ]"), + NULL +}; + static const char * const builtin_commit_graph_read_usage[] = { N_("git commit-graph read [--object-dir ]"), NULL @@ -29,6 +35,36 @@ static struct opts_commit_graph { int append; } opts; + +static int graph_verify(int argc, const char **argv) +{ + struct commit_graph *graph = 0; + char *graph_name; + + static struct option builtin_commit_graph_verify_options[] = { + OPT_STRING(0, "object-dir", _dir, + N_("dir"), + N_("The object directory to store the graph")), + OPT_END(), + }; + + argc = parse_options(argc, argv, NULL, +builtin_commit_graph_verify_options, +builtin_commit_graph_verify_usage, 0); + + if (!opts.obj_dir) + opts.obj_dir = get_object_directory(); Getting the boilerplate of implementing the command mostly out of the way. Good. + + graph_name = get_commit_graph_filename(opts.obj_dir); + graph = load_commit_graph_one(graph_name); So we are verifying only the commit-graph file belonging directly to current repository, as I have expected. This is needed to for warnings and error messages from the 'verify' action, to be able to tell in which file there are problems. This means that it is possible that there would be problems with commit-graph files that running 'git commit-graph verify' would not find, because they are in commit-graph file in one of the alternates. It is very easy, though, to check all commit-graph files that would be read and its data concatenated when using commit-graph feature (e.g. 'git commit-graph read', IIRC): $ git commit-graph verify $ for obj_dir in $(cat .git/objects/info/alternates) do; git commit-graph --object-dir="$obj_dir"; done Note: I have not checked the above that it works. + FREE_AND_NULL(graph_name); Freeing the resources, always nice to have. + + if (!grap
Re: [PATCH 04/35] object: add repository argument to parse_object_buffer
On 5/29/2018 8:47 PM, Stefan Beller wrote: Add a repository argument to allow the callers of parse_object_buffer to be more specific about which repository to act on. This is a small mechanical change; it doesn't change the implementation to handle repositories other than the_repository yet. As with the previous commits, use a macro to catch callers passing a repository other than the_repository at compile time. Add the cocci patch that converted the callers. Again, I'm missing the cocci patch here. Signed-off-by: Jonathan Nieder Signed-off-by: Stefan Beller --- builtin/fast-export.c| 3 ++- builtin/fsck.c | 6 -- builtin/index-pack.c | 3 ++- builtin/unpack-objects.c | 3 ++- object.c | 5 +++-- object.h | 3 ++- ref-filter.c | 3 ++- 7 files changed, 17 insertions(+), 9 deletions(-) diff --git a/builtin/fast-export.c b/builtin/fast-export.c index 24d42842f9d..a34ab9768f4 100644 --- a/builtin/fast-export.c +++ b/builtin/fast-export.c @@ -245,7 +245,8 @@ static void export_blob(const struct object_id *oid) die ("Could not read blob %s", oid_to_hex(oid)); if (check_object_signature(oid, buf, size, type_name(type)) < 0) die("sha1 mismatch in blob %s", oid_to_hex(oid)); - object = parse_object_buffer(oid, type, size, buf, ); + object = parse_object_buffer(the_repository, oid, type, +size, buf, ); } if (!object) diff --git a/builtin/fsck.c b/builtin/fsck.c index 700739804fc..e6d6eb266eb 100644 --- a/builtin/fsck.c +++ b/builtin/fsck.c @@ -392,7 +392,8 @@ static int fsck_obj_buffer(const struct object_id *oid, enum object_type type, * verify_packfile(), data_valid variable for details. */ struct object *obj; - obj = parse_object_buffer(oid, type, size, buffer, eaten); + obj = parse_object_buffer(the_repository, oid, type, size, buffer, + eaten); if (!obj) { errors_found |= ERROR_OBJECT; return error("%s: object corrupt or missing", oid_to_hex(oid)); @@ -522,7 +523,8 @@ static struct object *parse_loose_object(const struct object_id *oid, if (!contents && type != OBJ_BLOB) die("BUG: read_loose_object streamed a non-blob"); - obj = parse_object_buffer(oid, type, size, contents, ); + obj = parse_object_buffer(the_repository, oid, type, size, + contents, ); if (!eaten) free(contents); diff --git a/builtin/index-pack.c b/builtin/index-pack.c index e2f670bef9e..0dd10693597 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -848,7 +848,8 @@ static void sha1_object(const void *data, struct object_entry *obj_entry, * we do not need to free the memory here, as the * buf is deleted by the caller. */ - obj = parse_object_buffer(oid, type, size, buf, + obj = parse_object_buffer(the_repository, oid, type, + size, buf, ); if (!obj) die(_("invalid %s"), type_name(type)); diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c index 9a4d2708123..8e454c48649 100644 --- a/builtin/unpack-objects.c +++ b/builtin/unpack-objects.c @@ -265,7 +265,8 @@ static void write_object(unsigned nr, enum object_type type, int eaten; hash_object_file(buf, size, type_name(type), _list[nr].oid); added_object(nr, type, buf, size); - obj = parse_object_buffer(_list[nr].oid, type, size, buf, + obj = parse_object_buffer(the_repository, _list[nr].oid, + type, size, buf, ); if (!obj) die("invalid %s", type_name(type)); diff --git a/object.c b/object.c index def3c71cac2..4250ddd3f7f 100644 --- a/object.c +++ b/object.c @@ -186,7 +186,7 @@ struct object *lookup_unknown_object_the_repository(const unsigned char *sha1) return obj; } -struct object *parse_object_buffer(const struct object_id *oid, enum object_type type, unsigned long size, void *buffer, int *eaten_p) +struct object *parse_object_buffer_the_repository(const struct object_id *oid, enum object_type type, unsigned long size, void *buffer, int *eaten_p) { struct object *obj; *eaten_p = 0; @@ -278,7 +278,8 @@ struct object *parse_object_the_repository(const struct object_id *oid) return NULL; } - obj = parse_object_buffer(oid, type, size, buffer, ); + obj =
Re: [PATCH v3 09/20] commit-graph: verify corrupt OID fanout and lookup
On 5/30/2018 9:34 AM, Jakub Narebski wrote: Derrick Stolee writes: In the commit-graph file, the OID fanout chunk provides an index into the OID lookup. The 'verify' subcommand should find incorrect values in the fanout. Similarly, the 'verify' subcommand should find out-of-order values in the OID lookup. O.K., the OID Lookup chunk is checked together with OID Fanout chunk because those two chunks are tightly connected: OID Fanout is fanout into OID Lookup. Signed-off-by: Derrick Stolee --- commit-graph.c | 36 t/t5318-commit-graph.sh | 22 ++ 2 files changed, 58 insertions(+) diff --git a/commit-graph.c b/commit-graph.c index 06e3e4f9ba..cbd1aae514 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -855,6 +855,9 @@ static void graph_report(const char *fmt, ...) int verify_commit_graph(struct commit_graph *g) { + uint32_t i, cur_fanout_pos = 0; + struct object_id prev_oid, cur_oid; Minor nitpick about the naming convention: why cur_*, and not curr_*? + if (!g) { graph_report("no commit-graph file loaded"); return 1; @@ -869,5 +872,38 @@ int verify_commit_graph(struct commit_graph *g) if (!g->chunk_commit_data) graph_report("commit-graph is missing the Commit Data chunk"); + if (verify_commit_graph_error) + return verify_commit_graph_error; + Before checking that commits in OID Lookup are sorted, and that OID Fanout correctly points into OID Lookup (thus also checking that OID Lookup is sorted), it would be good to verify that sizes of those chunks are correct. Out of 4 standrd chunks, 1 of them (OIDF) has constant size, and 2 of them have size given by number of commits and hash size - OID Fanout is (256 * 4 bytes) - OID Lookup is (N * H bytes), where N is number of commits, and H is hash size The one that is more significant is if number of commits gets corrupted upwards, and walking through OID Lookup would lead us out of bounds, outside the file size. IIRC we have checked that all chunks fit within file size, isn't it? + for (i = 0; i < g->num_commits; i++) { + hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i); Why do you use hashcpy() here, but oidcpy() below? We are copying from a section of binary data, not from a struct object_id *. + + if (i && oidcmp(_oid, _oid) >= 0) All right, OIDs needs to be sorted in ascending lexicographical order, and the above condition checks that this constraint is fullfilled. + graph_report("commit-graph has incorrect OID order: %s then %s", +oid_to_hex(_oid), +oid_to_hex(_oid)); Nice informative error message. + + oidcpy(_oid, _oid); + + while (cur_oid.hash[0] > cur_fanout_pos) { + uint32_t fanout_value = get_be32(g->chunk_oid_fanout + cur_fanout_pos); + if (i != fanout_value) + graph_report("commit-graph has incorrect fanout value: fanout[%d] = %u != %u", +cur_fanout_pos, fanout_value, i); + + cur_fanout_pos++; + } The k-th entry of fanout, F[k], stores the number of OIDs with first byte at most k. Let's examine it in detail on a simple example: fanout lookup -- -- 0 : 0 ---> 0: 1 1 : 2 -\ 1: 1 2 : 2 --\> 2: 3 3 : 3 ---> 3: 4 We are checking that after most significant byte (first byte) changes, then all fanout position up to the current first byte value (exclusive) point to current position in OID Lookup chunk. Seems all right; it would be nice to come up with rigorous proof that it is all right. + } + + while (cur_fanout_pos < 256) { + uint32_t fanout_value = get_be32(g->chunk_oid_fanout + cur_fanout_pos); + + if (g->num_commits != fanout_value) + graph_report("commit-graph has incorrect fanout value: fanout[%d] = %u != %u", +cur_fanout_pos, fanout_value, i); + + cur_fanout_pos++; + } All right, this checks that all remaining fanout entries, up and including the 255-ith entry store the total number of commits, N. + return verify_commit_graph_error; } diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh index 4ef3fe3dc2..c050ef980b 100755 --- a/t/t5318-commit-graph.sh +++ b/t/t5318-commit-graph.sh @@ -247,6 +247,7 @@ test_expect_success 'git commit-graph verify' ' git commit-graph verify >output '
Re: [PATCH 27/35] commit-slabs: remove realloc counter outside of slab struct
On 5/29/2018 8:48 PM, Stefan Beller wrote: The realloc counter is declared outside the struct for the given slabname, which makes it harder for a follow up patch to move the declaration of the struct around as then the counter variable would need special treatment. As the reallocation counter is currently unused we can just remove it. If we ever need to count the reallocations again, we can reintroduce the counter as part of 'struct slabname' in commit-slab-decl.h. It's worth noting that this patch is different from the other mechanical patches. I do agree that we should remove this unused portion of the slab API. Likely this was built for testing purposes. CC'ing Junio, who I found introduced this in a84b794ad "commit-slab: introduce a macro to define a slab for new type". Looking at that diff, the realloc portion did not appear to exist in the previous in-degree-specific version of slabs. Signed-off-by: Stefan Beller --- commit-slab-impl.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/commit-slab-impl.h b/commit-slab-impl.h index 87a9cadfcca..ac1e6d409ad 100644 --- a/commit-slab-impl.h +++ b/commit-slab-impl.h @@ -11,8 +11,6 @@ #define implement_commit_slab(slabname, elemtype, scope) \ \ -static int stat_ ##slabname## realloc; \ - \ scope void init_ ##slabname## _with_stride(struct slabname *s, \ unsigned stride) \ { \ @@ -54,7 +52,6 @@ scope elemtype *slabname## _at_peek(struct slabname *s, \ if (!add_if_missing)\ return NULL;\ REALLOC_ARRAY(s->slab, nth_slab + 1);\ - stat_ ##slabname## realloc++; \ for (i = s->slab_count; i <= nth_slab; i++) \ s->slab[i] = NULL; \ s->slab_count = nth_slab + 1;\
Re: [RFC PATCH 00/35] object-store: lookup_commit
On 5/29/2018 11:18 PM, Stefan Beller wrote: On Tue, May 29, 2018 at 6:05 PM, Derrick Stolee wrote: On 5/29/2018 8:47 PM, Stefan Beller wrote: This applies on the merge of nd/commit-util-to-slab and sb/object-store-grafts, and is available at http://github.com/stefanbeller/ as branch object-store-lookup-commit as the merge has some merge conflicts as well as syntactical conflicts (upload-pack.c and fetch-pack.c introduce new calls of functions that would want to take a repository struct in the object-store-grafts series) As layed out in https://public-inbox.org/git/20180517225154.9200-1-sbel...@google.com/ this is getting close to finishing the set of object store series though the last unfinished part of this RFC hints at new work on the plate: * To give this series a nice polish, we'd want to convert parse_commit, too. But that requires the conversion of the new commit graph. Maybe we need to split this series into 2. I'll take a look at this series tomorrow. I've been working in ds/commit-graph-fsck to make many of the methods take a 'struct commit_graph *' parameter, which could easily be 'r->commit_graph' for a 'struct the_repository *r' instead of the global 'commit_graph'. Those graph-local methods will make the transformation to be repo-local a lot easier. (There still may be some need to insert a repository, though.) Since you are working on the commit-slab stuff in this patch, I'll (continue to) delay my patch series on using the commit-slab for generation numbers to avoid collisions with your work. This series touches the commit slabs in the last few commits only. I don't think there will be huge conflicts, as all I had to do was move the definition from a global to inside the parsed object store. I went through the whole series and didn't find any issue with the code. The commit messages reference coccinelle scripts that I never saw, so those should be struck from the messages. If anyone else plans to review, these two commits are worth special attention compared to the others: [PATCH 27/35] commit-slabs: remove realloc counter outside of slab struct [PATCH 28/35] commit.c: migrate the commit buffer to the parsed object store I found no issue with them, but they do actually change behavior slightly as opposed to the other commits which have no change other than to relax the dependence on the_repository as a global. I also like the pattern of storing the slabs inside the parsed object structure in the_repository. I'll use a similar pattern when looking into using slabs for generation number. For Junio: I know this will conflict with ds/commit-graph-fsck (specifically, I add new references to lookup_commit() and manipulate parse_commit_gently()). If this series is ready to be queued quickly, I can rebase my v4 onto these commits as they are placed into 'pu' (and please eject ds/commit-graph-fsck at that time). Thanks, -Stolee
Re: [RFC PATCH 00/35] object-store: lookup_commit
On 5/29/2018 8:47 PM, Stefan Beller wrote: This applies on the merge of nd/commit-util-to-slab and sb/object-store-grafts, and is available at http://github.com/stefanbeller/ as branch object-store-lookup-commit as the merge has some merge conflicts as well as syntactical conflicts (upload-pack.c and fetch-pack.c introduce new calls of functions that would want to take a repository struct in the object-store-grafts series) As layed out in https://public-inbox.org/git/20180517225154.9200-1-sbel...@google.com/ this is getting close to finishing the set of object store series though the last unfinished part of this RFC hints at new work on the plate: * To give this series a nice polish, we'd want to convert parse_commit, too. But that requires the conversion of the new commit graph. Maybe we need to split this series into 2. I'll take a look at this series tomorrow. I've been working in ds/commit-graph-fsck to make many of the methods take a 'struct commit_graph *' parameter, which could easily be 'r->commit_graph' for a 'struct the_repository *r' instead of the global 'commit_graph'. Those graph-local methods will make the transformation to be repo-local a lot easier. (There still may be some need to insert a repository, though.) Since you are working on the commit-slab stuff in this patch, I'll (continue to) delay my patch series on using the commit-slab for generation numbers to avoid collisions with your work. Thanks, -Stolee
Re: [PATCH] commit-graph: fix a sparse 'integer as NULL pointer' warning
n 5/29/2018 4:01 PM, Ramsay Jones wrote: Signed-off-by: Ramsay Jones --- Hi Derrick, If you need to re-roll your 'ds/commit-graph-fsck' branch (pu@a84e06bc0f), could you please squash this into the relevant patch (commit 80453b4529, "commit-graph: add 'verify' subcommand", 2018-05-24). [No, No, that was the one in graph_read(). :-D ] Thanks for helping rid me of my bad habits! I'll get this one in v4. Thanks! ATB, Ramsay Jones builtin/commit-graph.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c index 20ce6437a..6abfde5e6 100644 --- a/builtin/commit-graph.c +++ b/builtin/commit-graph.c @@ -39,7 +39,7 @@ static struct opts_commit_graph { static int graph_verify(int argc, const char **argv) { - struct commit_graph *graph = 0; + struct commit_graph *graph = NULL; char *graph_name; static struct option builtin_commit_graph_verify_options[] = {
Re: [PATCH v3 07/20] commit-graph: verify catches corrupt signature
On 5/28/2018 10:05 AM, Jakub Narebski wrote: Derrick Stolee writes: This is the first of several commits that add a test to check that 'git commit-graph verify' catches corruption in the commit-graph file. The first test checks that the command catches an error in the file signature. This is a check that exists in the existing commit-graph reading code. Good start. This handles 3 out of 5 checks in load_commit_graph_one(). The remaining are: * too short file (length smaller than minimal commit-graph size) * more than one chunk of one of 4 defined types Add a helper method 'corrupt_graph_and_verify' to the test script t5318-commit-graph.sh. This helper corrupts the commit-graph file at a certain location, runs 'git commit-graph verify', and reports the output to the 'err' file. This data is filtered to remove the lines added by 'test_must_fail' when the test is run verbosely. Then, the output is checked to contain a specific error message. Thanks for an explanation. Signed-off-by: Derrick Stolee --- t/t5318-commit-graph.sh | 43 +++ 1 file changed, 43 insertions(+) diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh index 6ca451dfd2..bd64481c7a 100755 --- a/t/t5318-commit-graph.sh +++ b/t/t5318-commit-graph.sh @@ -235,9 +235,52 @@ test_expect_success 'perform fast-forward merge in full repo' ' test_cmp expect output ' +# the verify tests below expect the commit-graph to contain +# exactly the commits reachable from the commits/8 branch. +# If the file changes the set of commits in the list, then the +# offsets into the binary file will result in different edits +# and the tests will likely break. + test_expect_success 'git commit-graph verify' ' cd "$TRASH_DIRECTORY/full" && + git rev-parse commits/8 | git commit-graph write --stdin-commits && git commit-graph verify >output ' I don't quite understand what the change is meant to do. This gives us a constant commit-graph file to work with in the later tests. To get the "independent test" structure you want for the tests that are coming, we need to do one of the following: 1. Write a new commit-graph file for every test (slows things down). 2. Do all corruption/verify checks in a single test (reduces the information from a failed test, as it only reports the first failure). I don't like either of these options, so I went with this "prepare" step. Also, as I said earlier, I'd prefer if tests were as indpendent of each other as possible, to make running individual tests (e.g. only previously falling tests) easier. I especially do not like mixing running actual test with setting up the repository for future tests, as here. +GRAPH_BYTE_VERSION=4 +GRAPH_BYTE_HASH=5 + +# usage: corrupt_graph_and_verify +# Manipulates the commit-graph file at the position +# by inserting the data, then runs 'git commit-graph verify' +# and places the output in the file 'err'. Test 'err' for +# the given string. Very nice to have this description. +corrupt_graph_and_verify() { + pos=$1 + data="${2:-\0}" + grepstr=$3 + cd "$TRASH_DIRECTORY/full" && + test_when_finished mv commit-graph-backup $objdir/info/commit-graph && + cp $objdir/info/commit-graph commit-graph-backup && + printf "$data" | dd of="$objdir/info/commit-graph" bs=1 seek="$pos" conv=notrunc && Using 'printf' with octal is much more portable than relying on 'echo' supporting octal escape sequences (or supporting escape sequences at all). + test_must_fail git commit-graph verify 2>test_err && + grep -v "^+" test_err >err + grep "$grepstr" err Shouldn't this last 'grep' be 'test_i18ngrep' instead, to allow for translated messages from 'git commit-graph verify' / 'git fsck'? +} This function makes actual tests short and simple, without duplicated code. Very good. + +test_expect_success 'detect bad signature' ' + corrupt_graph_and_verify 0 "\0" \ + "graph signature" +' + +test_expect_success 'detect bad version' ' + corrupt_graph_and_verify $GRAPH_BYTE_VERSION "\02" \ + "graph version" +' + +test_expect_success 'detect bad hash version' ' + corrupt_graph_and_verify $GRAPH_BYTE_HASH "\02" \ When we move from SHA-1 (hash version 1) to NewHash (hash version 2), this test would start failing... which is actually not a bad idea. + "hash version" +' + test_done
Re: [PATCH v3 03/20] commit-graph: parse commit from chosen graph
On 5/27/2018 6:23 AM, Jakub Narebski wrote: Derrick Stolee writes: Before verifying a commit-graph file against the object database, we need to parse all commits from the given commit-graph file. Create parse_commit_in_graph_one() to target a given struct commit_graph. If I understand it properly the problem is that when verifying against the object database we want to check one single commit-graph file, not concatenation of data from commit-graph file for the repository and commit-graph files from its alternates -- like prepare_commit_graph() does; which is called by parse_commit_in_graph(). Signed-off-by: Derrick Stolee O.K., so you introduce here a layer of indirection; parse_commit_in_graph() now just uses parse_commit_in_graph_one(), passing core_commit_graph (or the_commit_graph) to it, after checking that core_commit_graph is set (which handles the case when feature is not turned off) and loading commit-graph file. Nice and simple 'split function' refactoring, with new function taking over when there is commit graph file prepared. So, after the changes: * parse_commit_in_graph() is responsible for checking whether to use commit-graph feature and ensuring that data from commit-graph is loaded, where it passes the control to parse_commit_in_graph_one() * parse_commit_in_graph_one() checks whether commit-graph feature is turned on, whether commit we are interested in was already parsed, and then uses fill_commit_in_graph() to actually get the data * fill_commit_in_graph() gets data out of commit-graph file, extracting it from commit data chunk (and if needed large edges chunk). All those functions return 1 if they got data from commit-graph, and 0 if they didn't. One minor nitpick / complaint / question is about naming of global variables used here, namely: * static struct commit_graph *commit_graph from commit-graph.c for global storage of commit-graph[s] data * int core_commit_graph from environment.c for storing core.commitGraph config But I see that at least the latter is common convention in Git source code; I guess that the former maybe follows convention as used for "the index" and "the repository" - additionally it is static / file-local. See also `struct packed_git *packed_git;` from cache.h. --- commit-graph.c | 18 +++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/commit-graph.c b/commit-graph.c index 82295f0975..78ba0edc80 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -310,7 +310,7 @@ static int find_commit_in_graph(struct commit *item, struct commit_graph *g, uin } } -int parse_commit_in_graph(struct commit *item) +static int parse_commit_in_graph_one(struct commit_graph *g, struct commit *item) { uint32_t pos; @@ -318,9 +318,21 @@ static int parse_commit_in_graph_one(struct commit_graph *g, struct commit *item) if (!core_commit_graph) return 0; All right, we check that commit-graph feature is enabled because parse_commit_in_graph_one() will be used standalone, not by being invoked from parse_commit_in_graph(). This check is fast. if (item->object.parsed) return 1; Sidenote: I just wonder why object.parsed and not for example object.graph_pos is used to checck whether the object was filled if possible with commit-graph data... Here we are filling all data, including commit date and parents. We use load_commit_graph_info() when we only need the graph_pos and generation values. + + if (find_commit_in_graph(item, g, )) + return fill_commit_in_graph(item, g, pos); + + return 0; +} + +int parse_commit_in_graph(struct commit *item) +{ + if (!core_commit_graph) + return 0; All right, this check is here to short-circuit and make it so git does not even try to lod commit-graph file[s] if the feature is disabled. + prepare_commit_graph(); - if (commit_graph && find_commit_in_graph(item, commit_graph, )) - return fill_commit_in_graph(item, commit_graph, pos); + if (commit_graph) + return parse_commit_in_graph_one(commit_graph, item); return 0; }
Re: [PATCH] t990X: use '.git/objects' as 'deep inside .git' path
On 5/27/2018 12:49 AM, Michael Haggerty wrote: On Sat, May 26, 2018 at 8:47 AM, Christian Couder wrote: Tests t9902-completion.sh and t9903-bash-prompt.sh each have tests that check what happens when we are "in the '.git' directory" and when we are "deep inside the '.git' directory". To test the case when we are "deep inside the '.git' directory" the test scripts used to perform a `cd .git/refs/heads`. As there are plans to implement other ref storage systems, let's use '.git/objects' instead of '.git/refs/heads' as the "deep inside the '.git' directory" path. Seems reasonable to me. +1. Michael Looks good to me, too. Thanks, -Stolee
Re: [PATCH v3 00/20] Integrate commit-graph into 'fsck' and 'gc'
On 5/29/2018 12:27 AM, Junio C Hamano wrote: Derrick Stolee writes: Thanks for all the feedback on v2. I've tried to make this round's review a bit easier by splitting up the commits into smaller pieces. Also, the test script now has less boilerplate and uses variables and clear arithmetic to explain which bytes are being modified. One other change worth mentioning: in "commit-graph: add '--reachable' option" I put the ref-iteration into a new external 'write_commit_graph_reachable()' method inside commit-graph.c. This makes the 'gc: automatically write commit-graph files' a simpler change. I finally managed to find time to resolve conflicts this topic has with other topics (of your own included, if I am not mistaken). Please double check the resolution when I push out the day's integration result later today. Sorry about the confusion. I was operating on my copy of ds/generation-numbers (34fdd43339) but fetching just now I see you updated that branch to 1472978ec6. I reset my local branch to ds/commit-graph-fsk (53dd1e6600). The only diff I see between my v3 branch and that commit are the changes from ds/commit-graph-lockfile-fix (33286dcd6d). Thanks, -Stolee
[RFC PATCH 5/6] fetch: destroy commit graph on shallow parameters
The commit-graph feature is not compatible with history-rewriting features such as shallow clones. When running a 'git fetch' with any of the shallow/unshallow options, destroy the commit-graph file and override core.commitGraph to be false. Signed-off-by: Derrick Stolee --- builtin/fetch.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/builtin/fetch.c b/builtin/fetch.c index af896e4b74..2001dca881 100644 --- a/builtin/fetch.c +++ b/builtin/fetch.c @@ -1452,6 +1452,12 @@ int cmd_fetch(int argc, const char **argv, const char *prefix) } } + if (update_shallow || depth || deepen_since || deepen_not.nr || + deepen_relative || unshallow || update_shallow || is_repository_shallow()) { + destroy_commit_graph(get_object_directory()); + core_commit_graph = 0; + } + if (remote) { if (filter_options.choice || repository_format_partial_clone) fetch_one_setup_partial(remote); -- 2.16.2.338.gcfe06ae955
[RFC PATCH 2/6] DO NOT MERGE: write commit-graph on every fetch
THIS IS ONLY FOR TESTING TO INCREASE TEST COVERAGE Signed-off-by: Derrick Stolee --- builtin/fetch.c | 4 1 file changed, 4 insertions(+) diff --git a/builtin/fetch.c b/builtin/fetch.c index 8ee998ea2e..af896e4b74 100644 --- a/builtin/fetch.c +++ b/builtin/fetch.c @@ -20,6 +20,7 @@ #include "utf8.h" #include "packfile.h" #include "list-objects-filter-options.h" +#include "commit-graph.h" static const char * const builtin_fetch_usage[] = { N_("git fetch [] [ [...]]"), @@ -1480,6 +1481,9 @@ int cmd_fetch(int argc, const char **argv, const char *prefix) close_all_packs(); + if (core_commit_graph) + write_commit_graph_reachable(get_object_directory(), 1); + argv_array_pushl(_gc_auto, "gc", "--auto", NULL); if (verbosity < 0) argv_array_push(_gc_auto, "--quiet"); -- 2.16.2.338.gcfe06ae955
[RFC PATCH 1/6] DO NOT MERGE: compute commit-graph on every commit
Also enable core.commitGraph and gc.commitGraph. Helps to test the commit-graph feature with the rest of the test infrastructure. Signed-off-by: Derrick Stolee --- builtin/commit.c | 5 + builtin/gc.c | 2 +- environment.c| 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/builtin/commit.c b/builtin/commit.c index e8e8d13be4..8751b816c1 100644 --- a/builtin/commit.c +++ b/builtin/commit.c @@ -32,6 +32,7 @@ #include "column.h" #include "sequencer.h" #include "mailmap.h" +#include "commit-graph.h" static const char * const builtin_commit_usage[] = { N_("git commit [] [--] ..."), @@ -1623,5 +1624,9 @@ int cmd_commit(int argc, const char **argv, const char *prefix) UNLEAK(err); UNLEAK(sb); + + if (core_commit_graph) + write_commit_graph_reachable(get_object_directory(), 1); + return 0; } diff --git a/builtin/gc.c b/builtin/gc.c index efd214a59f..999c09d8e2 100644 --- a/builtin/gc.c +++ b/builtin/gc.c @@ -35,7 +35,7 @@ static int aggressive_depth = 50; static int aggressive_window = 250; static int gc_auto_threshold = 6700; static int gc_auto_pack_limit = 50; -static int gc_commit_graph = 0; +static int gc_commit_graph = 1; static int detach_auto = 1; static timestamp_t gc_log_expire_time; static const char *gc_log_expire = "1.day.ago"; diff --git a/environment.c b/environment.c index 8853e2f0dd..fdb2d56d90 100644 --- a/environment.c +++ b/environment.c @@ -62,7 +62,7 @@ enum push_default_type push_default = PUSH_DEFAULT_UNSPECIFIED; enum object_creation_mode object_creation_mode = OBJECT_CREATION_MODE; char *notes_ref_name; int grafts_replace_parents = 1; -int core_commit_graph; +int core_commit_graph = 1; int core_apply_sparse_checkout; int merge_log_config = -1; int precomposed_unicode = -1; /* see probe_utf8_pathname_composition() */ -- 2.16.2.338.gcfe06ae955
[RFC PATCH 4/6] commit-graph: avoid writing when repo is shallow
Shallow clones do not interact well with the commit-graph feature for several reasons. Instead of doing the hard thing to fix those interactions, instead prevent reading or writing a commit-graph file for shallow repositories. Signed-off-by: Derrick Stolee --- commit-graph.c | 12 1 file changed, 12 insertions(+) diff --git a/commit-graph.c b/commit-graph.c index 95af4ed519..80e377b90f 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -208,6 +208,9 @@ static void prepare_commit_graph(void) return; prepare_commit_graph_run_once = 1; + if (is_repository_shallow()) + return; + obj_dir = get_object_directory(); prepare_commit_graph_one(obj_dir); prepare_alt_odb(); @@ -711,6 +714,15 @@ void write_commit_graph(const char *obj_dir, int num_extra_edges; struct commit_list *parent; + /* +* Shallow clones are not supproted, as they create bad +* generation skips as they are un-shallowed. +*/ + if (is_repository_shallow()) { + warning("writing a commit-graph in a shallow repository is not supported"); + return; + } + oids.nr = 0; oids.alloc = approximate_object_count() / 4; -- 2.16.2.338.gcfe06ae955
[RFC PATCH 6/6] commit-graph: revert to odb on missing parents
The commit-graph format includes a way to specify a parent is "missing" from the commit-graph (i.e. we do not have a record of that parent in our list of object IDs, and hence cannot provide a graph position). For mose cases, this does not occur due to the close_reachable() method adding all reachable commits. However, in a shallow clone, we will try to record the parents of a commit on the shallow boundary, but the parents are not in the repository. The GRAPH_PARENT_MISSING value that is stored in the format is purposeful, especially for future plans to make the commit-graph file incremental or transporting sections of a commit-graph file across the network. In the meantime, check if a commit has a missing parent while filling its details from the commit-graph. If a parent is missing, still assign the generation number and graph position for that item, but report that the commit-graph failed to fill the contents. Then the caller is responsible for filling the rest of the data from a commit buffer. Signed-off-by: Derrick Stolee --- commit-graph.c | 33 ++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/commit-graph.c b/commit-graph.c index 80e377b90f..3e33d061fe 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -278,17 +278,44 @@ static int fill_commit_in_graph(struct commit *item, struct commit_graph *g, uin struct commit_list **pptr; const unsigned char *commit_data = g->chunk_commit_data + (g->hash_len + 16) * pos; - item->object.parsed = 1; + item->generation = get_be32(commit_data + g->hash_len + 8) >> 2; item->graph_pos = pos; + /* +* If we have any edges marked as GRAPH_PARENT_MISSING, we must not parse any +* more of this object and leave it to the commit buffer to parse. +*/ + edge_value = get_be32(commit_data + g->hash_len); + if (edge_value == GRAPH_PARENT_MISSING) + return 0; + if (edge_value == GRAPH_PARENT_NONE) + goto continue_parsing; + + edge_value = get_be32(commit_data + g->hash_len + 4); + if (edge_value == GRAPH_PARENT_MISSING) + return 0; + if (edge_value == GRAPH_PARENT_NONE) + goto continue_parsing; + if (!(edge_value & GRAPH_OCTOPUS_EDGES_NEEDED)) + goto continue_parsing; + + parent_data_ptr = (uint32_t*)(g->chunk_large_edges + + 4 * (uint64_t)(edge_value & GRAPH_EDGE_LAST_MASK)); + do { + edge_value = get_be32(parent_data_ptr); + if (edge_value == GRAPH_PARENT_MISSING) + return 0; + parent_data_ptr++; + } while (!(edge_value & GRAPH_LAST_EDGE)); + +continue_parsing: + item->object.parsed = 1; item->maybe_tree = NULL; date_high = get_be32(commit_data + g->hash_len + 8) & 0x3; date_low = get_be32(commit_data + g->hash_len + 12); item->date = (timestamp_t)((date_high << 32) | date_low); - item->generation = get_be32(commit_data + g->hash_len + 8) >> 2; - pptr = >parents; edge_value = get_be32(commit_data + g->hash_len); -- 2.16.2.338.gcfe06ae955
[RFC PATCH 3/6] commit-graph: enable replace-object and grafts
Create destroy_commit_graph() method to delete the commit-graph file when history is altered by a replace-object call. If the commit-graph is rebuilt after that, we will load the correct object while reading the commit-graph. When parsing a commit, first check if the commit was grafted. If so, then ignore the commit-graph for that commit and insted use the parents loaded by parsing the commit buffer and comparing against the graft file. Signed-off-by: Derrick Stolee --- builtin/replace.c | 3 +++ commit-graph.c| 20 +++- commit-graph.h| 9 + commit.c | 5 + 4 files changed, 36 insertions(+), 1 deletion(-) diff --git a/builtin/replace.c b/builtin/replace.c index 9f01f3fc7f..d553aadcdc 100644 --- a/builtin/replace.c +++ b/builtin/replace.c @@ -15,6 +15,7 @@ #include "parse-options.h" #include "run-command.h" #include "tag.h" +#include "commit-graph.h" static const char * const git_replace_usage[] = { N_("git replace [-f] "), @@ -468,6 +469,8 @@ int cmd_replace(int argc, const char **argv, const char *prefix) usage_msg_opt("--raw only makes sense with --edit", git_replace_usage, options); + destroy_commit_graph(get_object_directory()); + switch (cmdmode) { case MODE_DELETE: if (argc < 1) diff --git a/commit-graph.c b/commit-graph.c index e9195dfb17..95af4ed519 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -6,6 +6,7 @@ #include "pack.h" #include "packfile.h" #include "commit.h" +#include "dir.h" #include "object.h" #include "refs.h" #include "revision.h" @@ -240,15 +241,22 @@ static struct commit_list **insert_parent_or_die(struct commit_graph *g, { struct commit *c; struct object_id oid; + const unsigned char *real; if (pos >= g->num_commits) die("invalid parent position %"PRIu64, pos); hashcpy(oid.hash, g->chunk_oid_lookup + g->hash_len * pos); + + real = lookup_replace_object(oid.hash); + c = lookup_commit(); if (!c) die("could not find commit %s", oid_to_hex()); - c->graph_pos = pos; + + if (!hashcmp(real, oid.hash)) + c->graph_pos = pos; + return _list_insert(c, pptr)->next; } @@ -1019,3 +1027,13 @@ int verify_commit_graph(struct commit_graph *g) return verify_commit_graph_error; } + +void destroy_commit_graph(const char *obj_dir) +{ + char *graph_name; + close_commit_graph(); + + graph_name = get_commit_graph_filename(obj_dir); + remove_path(graph_name); + FREE_AND_NULL(graph_name); +} diff --git a/commit-graph.h b/commit-graph.h index 9a06a5f188..1d17da1582 100644 --- a/commit-graph.h +++ b/commit-graph.h @@ -56,4 +56,13 @@ void write_commit_graph(const char *obj_dir, int verify_commit_graph(struct commit_graph *g); +/* + * Delete the commit-graph file in the given object directory. + * + * WARNING: this deletes data, so should only be used when + * performing history-altering actions like replace-object + * or grafts. + */ +void destroy_commit_graph(const char *obj_dir); + #endif diff --git a/commit.c b/commit.c index 6eaed0174c..2fe31cde77 100644 --- a/commit.c +++ b/commit.c @@ -403,6 +403,11 @@ int parse_commit_internal(struct commit *item, int quiet_on_missing, int use_com return -1; if (item->object.parsed) return 0; + + prepare_commit_graft(); + if (commit_graft_pos(item->object.oid.hash) >= 0) + use_commit_graph = 0; + if (use_commit_graph && parse_commit_in_graph(item)) return 0; buffer = read_sha1_file(item->object.oid.hash, , ); -- 2.16.2.338.gcfe06ae955
[RFC PATCH 0/6] Fix commit-graph/graft/replace/shallow combo
The commit-graph file stores a condensed version of the commit history. This helps speed up several operations involving commit walks. This feature does not work well if those commits "change" using features like commit grafts, replace objects, or shallow clones. Since the commit-graph feature is optional, hidden behind the 'core.commitGraph' config option, and requires manual maintenance (until ds/commit-graph-fsck' is merged), these issues only arise for expert users who decided to opt-in. This RFC is a first attempt at rectify the issues that come up when these features interact. I have not succeeded entirely, as I will discuss below. The first two "DO NOT MERGE" commits are not intended to be part of the main product, but are ways to help the full test suite run while computing and consuming commit-graph files. This is acheived by calling write_commit_graph_reachable() from `git fetch` and `git commit`. I believe this is the only dependence on ds/commit-graph-fsck. The other commits should only depend on ds/commit-graph-lockfile-fix. Running the full test suite after these DO NO MERGE commits results in the following test failures, which I categorize by type of failure. The following tests are red herrings. Most work by deleting a commit from the object database and trying to demonstrate a failure. Others rely on how certain objects are loaded. These are not bugs, but will add noise if you run the tests suite with this patch. t0410-partial-clone.sh Failed tests: 5, 8 t5307-pack-missing-commit.shFailed tests: 3-4 t6011-rev-list-with-bad-commit.sh Failed test: 4 t6024-recursive-merge.shFailed test: 4 The following tests are fixed in "commit-graph: enable replace-object and grafts". t6001-rev-list-graft.sh Failed tests: 3, 5, 7, 9, 11, 13 t6050-replace.shFailed tests: 11-15, 23-24, 29 The following tests involve shallow clones. t5500-fetch-pack.sh Failed tests: 30-31, 34, 348-349 t5537-fetch-shallow.sh Failed tests: 4-7, 9 t5802-connect-helper.sh Failed test: 3 These tests are mostly fixed by three commits: * commit-graph: avoid writing when repo is shallow * fetch: destroy commit graph on shallow parameters * commit-graph: revert to odb on missing parents Each of these cases cover a different interaction that occurs with shallow clones. Some are due to a commit becoming shallow, while others are due to a commit becoming unshallow (and hence invalidating its generation number). After these changes, there is one test case that still fails, and I cannot understand why: t5500-fetch-pack.sh Failed test: 348 This test fails when performing a `git fetch --shallow-exclude`. When I halt the test using `t5500-fetch-pack.sh -d -i` and navigate to the directory to replay the fetch it performs as expected. After struggling with it for a while, I figured I should just put this series up for discussion. Maybe someone with more experience in shallow clones can point out the obvious issues I'm having. Thanks, -Stolee Derrick Stolee (6): DO NOT MERGE: compute commit-graph on every commit DO NOT MERGE: write commit-graph on every fetch commit-graph: enable replace-object and grafts commit-graph: avoid writing when repo is shallow fetch: destroy commit graph on shallow parameters commit-graph: revert to odb on missing parents builtin/commit.c | 5 + builtin/fetch.c | 10 + builtin/gc.c | 2 +- builtin/replace.c | 3 +++ commit-graph.c| 65 +++ commit-graph.h| 9 commit.c | 5 + environment.c | 2 +- 8 files changed, 95 insertions(+), 6 deletions(-) -- 2.16.2.338.gcfe06ae955
Re: [PATCH v3 16/20] commit-graph: verify contents match checksum
On 6/2/2018 11:52 AM, Jakub Narebski wrote: Derrick Stolee writes: The commit-graph file ends with a SHA1 hash of the previous contents. If a commit-graph file has errors but the checksum hash is correct, then we know that the problem is a bug in Git and not simply file corruption after-the-fact. Compute the checksum right away so it is the first error that appears, and make the message translatable since this error can be "corrected" by a user by simply deleting the file and recomputing. The rest of the errors are useful only to developers. Should we then provide --quiet / --verbose options, so that ordinary user is not flooded with error messages meant for power users and Git developers, then? Be sure to continue checking the rest of the file data if the checksum is wrong. This is important for our tests, as we break the checksum as we modify bytes of the commit-graph file. Well, we could have used sha1sum program, or test-sha1 helper to fix the checksum after corrupting the commit-graph file... Signed-off-by: Derrick Stolee --- commit-graph.c | 16 ++-- t/t5318-commit-graph.sh | 6 ++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/commit-graph.c b/commit-graph.c index d2b291aca2..a33600c584 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -841,6 +841,7 @@ void write_commit_graph(const char *obj_dir, oids.nr = 0; } +#define VERIFY_COMMIT_GRAPH_ERROR_HASH 2 static int verify_commit_graph_error; static void graph_report(const char *fmt, ...) @@ -860,7 +861,9 @@ static void graph_report(const char *fmt, ...) int verify_commit_graph(struct commit_graph *g) { uint32_t i, cur_fanout_pos = 0; - struct object_id prev_oid, cur_oid; + struct object_id prev_oid, cur_oid, checksum; + struct hashfile *f; + int devnull; if (!g) { graph_report("no commit-graph file loaded"); @@ -879,6 +882,15 @@ int verify_commit_graph(struct commit_graph *g) if (verify_commit_graph_error) return verify_commit_graph_error; + devnull = open("/dev/null", O_WRONLY); + f = hashfd(devnull, NULL); + hashwrite(f, g->data, g->data_len - g->hash_len); + finalize_hashfile(f, checksum.hash, CSUM_CLOSE); + if (hashcmp(checksum.hash, g->data + g->data_len - g->hash_len)) { + graph_report(_("the commit-graph file has incorrect checksum and is likely corrupt")); + verify_commit_graph_error = VERIFY_COMMIT_GRAPH_ERROR_HASH; + } Is it the best way of calculating the SHA-1 checksum that out internal APIs provide? Is it how SHA-1 checksum is calculated and checked for packfiles? This pattern is similar to hashfd_check() in csum-file.c, except we are hashing the file data directly instead of re-creating it from scratch (as is done for 'git index-pack --verify'). + for (i = 0; i < g->num_commits; i++) { struct commit *graph_commit; @@ -916,7 +928,7 @@ int verify_commit_graph(struct commit_graph *g) cur_fanout_pos++; } - if (verify_commit_graph_error) + if (verify_commit_graph_error & ~VERIFY_COMMIT_GRAPH_ERROR_HASH) return verify_commit_graph_error; So if we detected that checksum do not match, but we have not found an error, we say that it is all right? This only prevents us from stopping early. We will still report an error. for (i = 0; i < g->num_commits; i++) { diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh index 240aef6add..2680a2ebff 100755 --- a/t/t5318-commit-graph.sh +++ b/t/t5318-commit-graph.sh @@ -279,6 +279,7 @@ GRAPH_COMMIT_DATA_WIDTH=`expr $HASH_LEN + 16` GRAPH_OCTOPUS_DATA_OFFSET=`expr $GRAPH_COMMIT_DATA_OFFSET + \ $GRAPH_COMMIT_DATA_WIDTH \* $NUM_COMMITS` GRAPH_BYTE_OCTOPUS=`expr $GRAPH_OCTOPUS_DATA_OFFSET + 4` +GRAPH_BYTE_FOOTER=`expr $GRAPH_OCTOPUS_DATA_OFFSET + 4 \* $NUM_OCTOPUS_EDGES` # usage: corrupt_graph_and_verify # Manipulates the commit-graph file at the position @@ -388,4 +389,9 @@ test_expect_success 'detect incorrect parent for octopus merge' ' "invalid parent" ' +test_expect_success 'detect invalid checksum hash' ' + corrupt_graph_and_verify $GRAPH_BYTE_FOOTER "\00" \ + "incorrect checksum" This would not work under GETTEXT_POISON, as the message is marked as translatable, but corrupt_graph_and_verify uses 'grep' and not 'test_i18grep' from t/test-lib-functions.sh I fixed this locally based on Szeder's comment. +' If it is pure checksum corruption, wouldn't this fail because it is not a failure (exit code is 0)? It is not zero, so the test passes.
[PATCH] t5318-commit-graph.sh: use core.commitGraph
The commit-graph tests should be checking that normal Git operations succeed and have matching output with and without the commit-graph feature enabled. However, the test was toggling 'core.graph' instead of the correct 'core.commitGraph' variable. Reported-by: Jakub Narebski Signed-off-by: Derrick Stolee --- t/t5318-commit-graph.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh index 77d85aefe7..59d0be2877 100755 --- a/t/t5318-commit-graph.sh +++ b/t/t5318-commit-graph.sh @@ -28,8 +28,8 @@ test_expect_success 'create commits and repack' ' ' graph_git_two_modes() { - git -c core.graph=true $1 >output - git -c core.graph=false $1 >expect + git -c core.commitGraph=true $1 >output + git -c core.commitGraph=false $1 >expect test_cmp output expect } -- 2.18.0.rc0
Re: [PATCH v3 09/20] commit-graph: verify corrupt OID fanout and lookup
On 6/2/2018 12:38 AM, Duy Nguyen wrote: On Thu, May 24, 2018 at 6:25 PM, Derrick Stolee wrote: + if (i && oidcmp(_oid, _oid) >= 0) + graph_report("commit-graph has incorrect OID order: %s then %s", +oid_to_hex(_oid), +oid_to_hex(_oid)); Should these strings be marked for translation with _()? I've been asking myself "Is this message helpful to anyone other than a Git developer?" and for this series the only one that is helpful to an end-user is the message about the final hash. If the hash is correct, but these other messages appear, then there is a bug in the code that wrote the file. Otherwise, file corruption is more likely and the correct course of action is to delete and rebuild. Thanks for being diligent in checking. Thanks, -Stolee
Re: [PATCH v3 17/20] fsck: verify commit-graph
On 6/2/2018 12:17 PM, Jakub Narebski wrote: Derrick Stolee writes: If core.commitGraph is true, verify the contents of the commit-graph during 'git fsck' using the 'git commit-graph verify' subcommand. Run this check on all alternates, as well. All right, so we have one config variable to control the use of serialized commit-graph feaature. Nice. We use a new process for two reasons: 1. The subcommand decouples the details of loading and verifying a commit-graph file from the other fsck details. All right, I can agree with that. On the other hand using subcommand makes debugging harder, though not in this case (well separated functionality that can be easily called with a standalone command to be debugged). 2. The commit-graph verification requires the commits to be loaded in a specific order to guarantee we parse from the commit-graph file for some objects and from the object database for others. I don't quite understand this. Could you explain it in more detail? We use `lookup_commit()` when verifying the commit-graph. If these commits were loaded earlier in the process and parsed directly from the object database, then we aren't comparing the commit-graph file contents against the ODB. Signed-off-by: Derrick Stolee --- Documentation/git-fsck.txt | 3 +++ builtin/fsck.c | 21 + t/t5318-commit-graph.sh| 8 3 files changed, 32 insertions(+) diff --git a/Documentation/git-fsck.txt b/Documentation/git-fsck.txt index b9f060e3b2..ab9a93fb9b 100644 --- a/Documentation/git-fsck.txt +++ b/Documentation/git-fsck.txt @@ -110,6 +110,9 @@ Any corrupt objects you will have to find in backups or other archives (i.e., you can just remove them and do an 'rsync' with some other site in the hopes that somebody else has the object you have corrupted). +If core.commitGraph is true, the commit-graph file will also be inspected Shouldn't we use `core.commitGraph` here? +using 'git commit-graph verify'. See linkgit:git-commit-graph[1]. + Extracted Diagnostics - diff --git a/builtin/fsck.c b/builtin/fsck.c index ef78c6c00c..a6d5045b77 100644 --- a/builtin/fsck.c +++ b/builtin/fsck.c @@ -16,6 +16,7 @@ #include "streaming.h" #include "decorate.h" #include "packfile.h" +#include "run-command.h" #define REACHABLE 0x0001 #define SEEN 0x0002 @@ -45,6 +46,7 @@ static int name_objects; #define ERROR_REACHABLE 02 #define ERROR_PACK 04 #define ERROR_REFS 010 +#define ERROR_COMMIT_GRAPH 020 Minor nitpick and a sidenote: I wonder if it wouldn't be better to either use hexadecimal constants, or use (1 << n) for all ERROR_* preprocesor constants. static const char *describe_object(struct object *obj) { @@ -815,5 +817,24 @@ int cmd_fsck(int argc, const char **argv, const char *prefix) } check_connectivity(); + + if (core_commit_graph) { + struct child_process commit_graph_verify = CHILD_PROCESS_INIT; + const char *verify_argv[] = { "commit-graph", "verify", NULL, NULL, NULL, NULL }; I see that NULL at index 2 and 3 (at 3rd and 4th place) are here for "--object-dir" and , the last one is terminator for that case, but what is next to last NULL (at 5th place) for? + commit_graph_verify.argv = verify_argv; + commit_graph_verify.git_cmd = 1; + + if (run_command(_graph_verify)) + errors_found |= ERROR_COMMIT_GRAPH; + + prepare_alt_odb(); + for (alt = alt_odb_list; alt; alt = alt->next) { + verify_argv[2] = "--object-dir"; + verify_argv[3] = alt->path; + if (run_command(_graph_verify)) + errors_found |= ERROR_COMMIT_GRAPH; + } + } For performance reasons it may be better to start those 'git commit-graph verify' commands asynchronously earlier, so that they can run in parallel / concurrently wth other checks, and wait for them and get their error code at the end of git-fsck run. But that is probably better left for a separate commit. + return errors_found; } diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh index 2680a2ebff..4941937163 100755 --- a/t/t5318-commit-graph.sh +++ b/t/t5318-commit-graph.sh @@ -394,4 +394,12 @@ test_expect_success 'detect invalid checksum hash' ' "incorrect checksum" ' +test_expect_success 'git fsck (checks commit-graph)' ' + cd "$TRASH_DIRECTORY/full" && + git fsck && + corrupt_graph_and_verify $GRAPH_BYTE_FOOTER "\00" \ + "incorrect checksum" && + test_must_fail git fsck +' All right; though the same caveats apply as with previous commit in series. Perhaps it would be better to truncate commit-graph file, or corrupt it in some 'random' place. + test_done Best,
Re: [PATCH v3 15/20] commit-graph: test for corrupted octopus edge
On 6/2/2018 8:39 AM, Jakub Narebski wrote: Derrick Stolee writes: The commit-graph file has an extra chunk to store the parent int-ids for parents beyond the first parent for octopus merges. Our test repo has a single octopus merge that we can manipulate to demonstrate the 'verify' subcommand detects incorrect values in that chunk. If I understand it correctly the above means that our _reading_ code checks for validity (which then 'git commit-graph verify' uses), just there were not any tests for that. Signed-off-by: Derrick Stolee --- t/t5318-commit-graph.sh | 10 ++ 1 file changed, 10 insertions(+) diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh index 58adb8246d..240aef6add 100755 --- a/t/t5318-commit-graph.sh +++ b/t/t5318-commit-graph.sh @@ -248,6 +248,7 @@ test_expect_success 'git commit-graph verify' ' ' NUM_COMMITS=9 +NUM_OCTOPUS_EDGES=2 HASH_LEN=20 GRAPH_BYTE_VERSION=4 GRAPH_BYTE_HASH=5 @@ -274,6 +275,10 @@ GRAPH_BYTE_COMMIT_EXTRA_PARENT=`expr $GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 4` GRAPH_BYTE_COMMIT_WRONG_PARENT=`expr $GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 3` GRAPH_BYTE_COMMIT_GENERATION=`expr $GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 8` GRAPH_BYTE_COMMIT_DATE=`expr $GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 12` +GRAPH_COMMIT_DATA_WIDTH=`expr $HASH_LEN + 16` +GRAPH_OCTOPUS_DATA_OFFSET=`expr $GRAPH_COMMIT_DATA_OFFSET + \ + $GRAPH_COMMIT_DATA_WIDTH \* $NUM_COMMITS` +GRAPH_BYTE_OCTOPUS=`expr $GRAPH_OCTOPUS_DATA_OFFSET + 4` # usage: corrupt_graph_and_verify # Manipulates the commit-graph file at the position @@ -378,4 +383,9 @@ test_expect_success 'detect incorrect commit date' ' "commit date" ' +test_expect_success 'detect incorrect parent for octopus merge' ' + corrupt_graph_and_verify $GRAPH_BYTE_OCTOPUS "\01" \ + "invalid parent" +' So we change the int-id to non-existing commit, and check that commit-graph code checks for that. What about the case when there are octopus merges, but no EDGE chunk (which I think we can emulate by changing / corrupting number of chunks)? What about the case where int-id of edge in EDGE chunk is correct, that is points to a valid commit, but does not agree with what is in the object database (what parents octopus merge has in reality)? Do we detect the situation where the second parent value in the commit data stores an array position within a Large Edge chunk, but we do not reach a value with the most-significant bit on when reaching the end of Large Edge chunk? There are a few holes like this, but I think they are better suited to a follow-up series, as this series is already quite large. -Stolee
Re: [PATCH v3 06/20] commit-graph: add 'verify' subcommand
On 6/2/2018 5:19 PM, Jakub Narebski wrote: Derrick Stolee writes: Do we have a way to run individual steps of the test suite? I am unfamiliar with that process. The t/README describes three such ways in "Skipping Tests" section: - GIT_SKIP_TESTS environment variable, which can either can match the "t[0-9]{4}" part to skip the whole test, or t[0-9]{4} followed by ".$number" to say which particular test to skip - For an individual test suite --run could be used to specify that only some tests should be run or that some tests should be excluded from a run (the latter with '!' prefix). - 'prove' harness can also run individual tests; one of more useful options is --state, which for example would allow to run only failed tests with --state=failed,save ... if the tests were independent. Adding the complexity of storing a copy of the commit-graph file for re-use in a later test is wasted energy right now, because we need to run the steps of the test that create the repo shape with the commits laid out as set earlier in the test. This shape changes as we test different states of the commit-graph (exists and contains all commits, exists and doesn't contain all commits, etc.) I think we can solve most of the problem by separating validation tests (which all or almost all use the same commit-graph file) and other test; putting them in different test scripts. This means that the more complicated case would be limited to the subset of tests. Anyway, if the setup stages are clearly separated and clearly marked as such, we would be able to at least manually skip tests, or manually run only a subset of tests. Test independence is certainly something nice to have, but as the git testsuite is not in best shape wrt this, it is not a requirement. I'm all for making the test suite better. In this case, I will hold that for a later series (that is entirely focused on that feature) as I expect we will want to discuss the correct pattern in detail. Thanks, -Stolee
Re: [PATCH v3 19/20] gc: automatically write commit-graph files
On 6/2/2018 2:03 PM, Jakub Narebski wrote: Derrick Stolee writes: The commit-graph file is a very helpful feature for speeding up git operations. In order to make it more useful, write the commit-graph file by default during standard garbage collection operations. I think you meant here "make it possible to write the commit-graph file during standard garbage collection operations." (i.e. add "make it possible" because it hides behind new config option, and remove "by default" because currently it is not turned on by default). Add a 'gc.commitGraph' config setting that triggers writing a commit-graph file after any non-trivial 'git gc' command. Defaults to false while the commit-graph feature matures. We specifically do not want to turn this on by default until the commit-graph feature is fully s/turn this on/have this on/ I think. integrated with history-modifying features like shallow clones. Two things. First, shallow clones, replacement mechanims (git-replace) and grafts are not "history-modifying" features; this name is in my opinion reserved for history-rewriting features such as interactive rebase, the `git filter-branch` feature or out-of-tree BFG Repo Cleaner or reposurgeon tools. They alter the _view_ of history; they should be IMVHO named "history-view-altering" features -- though I agree this is mouthful. Second, shouldn't we, as Martin Ågren said, warn about the issue in the manpage for git-commit-graph? Signed-off-by: Derrick Stolee --- Documentation/config.txt | 6 ++ Documentation/git-gc.txt | 4 builtin/gc.c | 6 ++ t/t5318-commit-graph.sh | 14 ++ 4 files changed, 30 insertions(+) diff --git a/Documentation/config.txt b/Documentation/config.txt index 11f027194e..9a3abd87e7 100644 --- a/Documentation/config.txt +++ b/Documentation/config.txt @@ -1553,6 +1553,12 @@ gc.autoDetach:: Make `git gc --auto` return immediately and run in background if the system supports it. Default is true. +gc.commitGraph:: + If true, then gc will rewrite the commit-graph file after any + change to the object database. If '--auto' is used, then the + commit-graph will not be updated unless the threshold is met. What threshold? Ah, thresholds defined for `git gc --auto` (gc.auto, gc.autoPackLimit, gc.logExpiry,...). + See linkgit:git-commit-graph[1] for details. You missed declaring the default value for this config option. + gc.logExpiry:: If the file gc.log exists, then `git gc --auto` won't run unless that file is more than 'gc.logExpiry' old. Default is diff --git a/Documentation/git-gc.txt b/Documentation/git-gc.txt index 571b5a7e3c..17dd654a59 100644 --- a/Documentation/git-gc.txt +++ b/Documentation/git-gc.txt @@ -119,6 +119,10 @@ The optional configuration variable `gc.packRefs` determines if it within all non-bare repos or it can be set to a boolean value. This defaults to true. +The optional configuration variable 'gc.commitGraph' determines if +'git gc' runs 'git commit-graph write'. This can be set to a boolean Should it be "runs" or "should run"? +value. This defaults to false. Should it be '...' or `...`? Below we have `gc.aggresiveWindow`, above we have 'gc.commitGraph', for example. + The optional configuration variable `gc.aggressiveWindow` controls how much time is spent optimizing the delta compression of the objects in the repository when the --aggressive option is specified. The larger diff --git a/builtin/gc.c b/builtin/gc.c index 77fa720bd0..efd214a59f 100644 --- a/builtin/gc.c +++ b/builtin/gc.c @@ -20,6 +20,7 @@ #include "argv-array.h" #include "commit.h" #include "packfile.h" +#include "commit-graph.h" #define FAILED_RUN "failed to run %s" @@ -34,6 +35,7 @@ static int aggressive_depth = 50; static int aggressive_window = 250; static int gc_auto_threshold = 6700; static int gc_auto_pack_limit = 50; +static int gc_commit_graph = 0; static int detach_auto = 1; static timestamp_t gc_log_expire_time; static const char *gc_log_expire = "1.day.ago"; @@ -121,6 +123,7 @@ static void gc_config(void) git_config_get_int("gc.aggressivedepth", _depth); git_config_get_int("gc.auto", _auto_threshold); git_config_get_int("gc.autopacklimit", _auto_pack_limit); + git_config_get_bool("gc.commitgraph", _commit_graph); git_config_get_bool("gc.autodetach", _auto); git_config_get_expiry("gc.pruneexpire", _expire); git_config_get_expiry("gc.worktreepruneexpire", _worktrees_expire); @@ -480,6 +483,9 @@ int cmd_gc(int argc, const char **argv, const char *prefix) if (pack_garbage.nr > 0) clean_pack_garbage(); + if (gc_commit_graph) +
Re: [PATCH v3 13/20] commit-graph: verify generation number
On 6/2/2018 8:23 AM, Jakub Narebski wrote: Derrick Stolee writes: While iterating through the commit parents, perform the generation number calculation and compare against the value stored in the commit-graph. All right, that's good. What about commit-graph files that have GENERATION_NUMBER_ZERO for all its commits (because we verify single commit-graph file only, there wouldn't be GENERATION_NUMBER_ZERO mixed with non-zero generation numbers)? Unless we can assume that no commit-graph file in the wild would have GENERATION_NUMBER_ZERO. I was expecting that we would not have files in the wild with GENERATION_NUMBER_ZERO, but it looks like 2.18 will create files like that. I'll put in logic to verify "all are GENERATION_NUMBER_ZERO or all have 'correct' generation number". The tests demonstrate that having a different set of parents affects the generation number calculation, and this value propagates to descendants. Hence, we drop the single-line condition on the output. I don't understand what part of changes this paragraph of the commit message refers to. Anyway, changing parents may not lead to changed generation numbers; take for example commit with single parent, which we change to other commit with the same generation number. The tests introduced in the previous commit change the parent list, which then changes the generation number in some cases (the stored generation number doesn't match the generation number computed based on the loaded parents). Since we report as many errors as possible (instead of failing on first error) those tests would fail if we say "the _only_ error should be the parent list". (This comes up again when we report the hash is incorrect, which would appear in every test.) The test introduced in this commit only changes the generation number, so that test will have the one error. Signed-off-by: Derrick Stolee --- commit-graph.c | 18 ++ t/t5318-commit-graph.sh | 6 ++ Sidenote: I have just realized that it may be better to put validation-related tests into different test file. 2 files changed, 24 insertions(+) diff --git a/commit-graph.c b/commit-graph.c index fff22dc0c3..ead92460c1 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -922,6 +922,7 @@ int verify_commit_graph(struct commit_graph *g) for (i = 0; i < g->num_commits; i++) { struct commit *graph_commit, *odb_commit; struct commit_list *graph_parents, *odb_parents; + uint32_t max_generation = 0; hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i); @@ -956,6 +957,9 @@ int verify_commit_graph(struct commit_graph *g) oid_to_hex(_parents->item->object.oid), oid_to_hex(_parents->item->object.oid)); + if (graph_parents->item->generation > max_generation) + max_generation = graph_parents->item->generation; + All right, that calculates the maximum of generation number of commit parents. graph_parents = graph_parents->next; odb_parents = odb_parents->next; } @@ -963,6 +967,20 @@ int verify_commit_graph(struct commit_graph *g) if (odb_parents != NULL) graph_report("commit-graph parent list for commit %s terminates early", oid_to_hex(_oid)); + + /* +* If one of our parents has generation GENERATION_NUMBER_MAX, then +* our generation is also GENERATION_NUMBER_MAX. Decrement to avoid +* extra logic in the following condition. +*/ Nice trick. + if (max_generation == GENERATION_NUMBER_MAX) + max_generation--; What about GENERATION_NUMBER_ZERO? + + if (graph_commit->generation != max_generation + 1) + graph_report("commit-graph generation for commit %s is %u != %u", +oid_to_hex(_oid), +graph_commit->generation, +max_generation + 1); I think we should also check that generation numbers do not exceed GENERATION_NUMBER_MAX... unless it is already taken care of? We get that for free. First, the condition above would fail for at least one commit. Second, we literally cannot store a value larger than GENERATION_NUMBER_MAX in the commit-graph as there are only 30 bits dedicated to the generation number. } return verify_commit_graph_error; diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh index 12f0d7f54d..673b0d37d5 100755 --- a/t/t5318-commit-graph.sh +++ b/t/t5318-commit-graph.sh @@ -272,6 +272,7 @@ GRAPH_BYTE_COMMIT_TREE=$GRAP
Re: [RFC PATCH 0/6] Fix commit-graph/graft/replace/shallow combo
On 5/31/2018 2:33 PM, Stefan Beller wrote: On Thu, May 31, 2018 at 10:40 AM, Derrick Stolee wrote: The commit-graph file stores a condensed version of the commit history. This helps speed up several operations involving commit walks. This feature does not work well if those commits "change" using features like commit grafts, replace objects, or shallow clones. Since the commit-graph feature is optional, hidden behind the 'core.commitGraph' config option, and requires manual maintenance (until ds/commit-graph-fsck' is merged), these issues only arise for expert users who decided to opt-in. This RFC is a first attempt at rectify the issues that come up when these features interact. I have not succeeded entirely, as I will discuss below. The first two "DO NOT MERGE" commits are not intended to be part of the main product, but are ways to help the full test suite run while computing and consuming commit-graph files. This is acheived by calling write_commit_graph_reachable() from `git fetch` and `git commit`. I believe this is the only dependence on ds/commit-graph-fsck. The other commits should only depend on ds/commit-graph-lockfile-fix. I applied these patches on top of ds/commit-graph-fsck (it would have been nice if you mentioned that it applies there) Running the test suite with all patches applied results in: ./t0410-partial-clone.sh(Wstat: 256 Tests: 15 Failed: 2) Failed tests: 5, 8 ./t5307-pack-missing-commit.sh (Wstat: 256 Tests: 5 Failed: 2) Failed tests: 3-4 ./t5500-fetch-pack.sh (Wstat: 256 Tests: 353 Failed: 1) Failed test: 348 ./t6011-rev-list-with-bad-commit.sh (Wstat: 256 Tests: 6 Failed: 1) Failed test: 4 ./t6024-recursive-merge.sh (Wstat: 256 Tests: 6 Failed: 1) Failed test: 4 which you identified as 4x noise and t5500 as not understood. $ GIT_TRACE=1 ./t5500-fetch-pack.sh -d -i -v -x [...] expecting success: git -C shallow12 fetch --shallow-exclude one origin && git -C shallow12 log --pretty=tformat:%s origin/master >actual && test_write_lines three two >expected && test_cmp expected actual ++ git -C shallow12 fetch --shallow-exclude one origin trace: built-in: git fetch --shallow-exclude one origin trace: run_command: unset GIT_PREFIX; 'git-upload-pack '\''/u/git/t/trash directory.t5500-fetch-pack/shallow-exclude/.'\''' trace: run_command: git --shallow-file pack-objects --revs --thin --stdout --shallow --progress --delta-base-offset --include-tag trace: built-in: git pack-objects --revs --thin --stdout --shallow --progress --delta-base-offset --include-tag remote: Counting objects: 4, done. remote: Compressing objects: 100% (3/3), done. remote: Total 4 (delta 0), reused 0 (delta 0) trace: run_command: git --shallow-file unpack-objects --pack_header=2,4 trace: built-in: git unpack-objects --pack_header=2,4 Unpacking objects: 100% (4/4), done. trace: run_command: git rev-list --objects --stdin --not --all --quiet trace: built-in: git rev-list --objects --stdin --not --all --quiet trace: run_command: unset GIT_PREFIX; 'git-upload-pack '\''/u/git/t/trash directory.t5500-fetch-pack/shallow-exclude/.'\''' trace: run_command: git pack-objects --revs --thin --stdout --progress --delta-base-offset trace: built-in: git pack-objects --revs --thin --stdout --progress --delta-base-offset remote: Total 0 (delta 0), reused 0 (delta 0) trace: run_command: git unpack-objects --pack_header=2,0 trace: built-in: git unpack-objects --pack_header=2,0 trace: run_command: git rev-list --objects --stdin --not --all --quiet trace: built-in: git rev-list --objects --stdin --not --all --quiet From file:///u/git/t/trash directory.t5500-fetch-pack/shallow-exclude/. * [new tag] one-> one * [new tag] two-> two run_processes_parallel: preparing to run up to 1 tasks run_processes_parallel: done trace: run_command: git gc --auto trace: built-in: git gc --auto ++ git -C shallow12 log --pretty=tformat:%s origin/master trace: built-in: git log '--pretty=tformat:%s' origin/master ++ test_write_lines three two ++ printf '%s\n' three two ++ test_cmp expected actual ++ diff -u expected actual --- expected 2018-05-31 18:14:25.944540810 + +++ actual 2018-05-31 18:14:25.944540810 + @@ -1,2 +1,3 @@ three two +one error: last command exited with $?=1 not ok 348 - fetch exclude tag one # # git -C shallow12 fetch --shallow-exclude one origin && # git -C shallow12 log --pretty=tformat:%s origin/master >actual && # test_write_lines three two >expected && # test_cmp expected actual # After these changes, there is one test case that still fails, and I cannot understand why: t5500-fetch-pack.sh Failed test: 348 This test fails when performing a `git fetch --shallow-exclude`. When I halt the test using `t5500-fetch-pack.sh -d -i` and navigate to the directory to replay t
Re: ds/generation-numbers (was Re: What's cooking in git.git (Jun 2018, #01; Fri, 1))
On 6/1/2018 7:15 PM, Junio C Hamano wrote: Derrick Stolee writes: A recently added "commit-graph" datafile has learned to store pre-computed generation numbers to speed up the decisions to stop history traversal. Will cook in 'next'. On Wednesday, these were marked as "Will merge to 'master'" What changed? Nothing has changed. "Will merge to 'master'" means "This is now in 'next', and unless there is some blocking breakage found, this topic will be merged to 'master' eventually". It does not even say if that eventuality is before or after the release the current cycle is working towards. When it comes near pre-release feature freeze, things in 'next' need to be sifted into various bins, and their labels are updated. The ones that are too big to be comfortably merged to the upcoming release after -rc0 has passed (i.e. biggies are better merged early to the 'master' in the cycle to give it full cycle of larger exposure) will be kept in 'next' so that it can go first after the final, the ones that are low risk but with higher importance will be merged to 'master' before the release, the ones that are trivial, distracting and lower value (i.e. the ones that force i18n teams extra work) may be held in 'next', and the ones that deserve a chance to freshly restart are marked to be kicked back to 'pu'. Etc. etc. Thanks, Junio. This explanation is what I expected. I suppose the small extra bit of information of "Will cook in 'next' until after next release" would have answered my question in advance. Thanks for the patience as I get used to your workflow. I am a little disappointed this didn't make 2.18 because this gives some of the biggest speedups for typically painful computations (like 'git branch --contains'). The generation numbers are what give us more than a constant-multiple speedup; what is in master only parses commit relationships faster, doesn't reduce the number of commits walked. It also means we will release a version of Git that writes commit-graph file with GENERATION_ZERO and so we will never be able to deprecate that logic in the code. Thanks, -Stolee
ds/generation-numbers (was Re: What's cooking in git.git (Jun 2018, #01; Fri, 1))
On 6/1/2018 3:21 AM, Junio C Hamano wrote: * ds/commit-graph-lockfile-fix (2018-05-22) 1 commit (merged to 'next' on 2018-05-24 at 3d12a02b0c) + commit-graph: fix UX issue when .lock file exists (this branch is used by ds/commit-graph-fsck; uses ds/generation-numbers.) Update to ds/generation-numbers topic. Wait for ds/generation-numbers * ds/generation-numbers (2018-05-22) 11 commits (merged to 'next' on 2018-05-24 at 56fc38a1b6) + commit-graph.txt: update design document + merge: check config before loading commits + commit: use generation number in remove_redundant() + commit: add short-circuit to paint_down_to_common() + commit: use generation numbers for in_merge_bases() + ref-filter: use generation number for --contains + commit-graph: always load commit-graph information + commit: use generations in paint_down_to_common() + commit-graph: compute generation numbers + commit: add generation number to struct commit + ref-filter: fix outdated comment on in_commit_list (this branch is used by ds/commit-graph-fsck and ds/commit-graph-lockfile-fix.) A recently added "commit-graph" datafile has learned to store pre-computed generation numbers to speed up the decisions to stop history traversal. Will cook in 'next'. On Wednesday, these were marked as "Will merge to 'master'" What changed? Thanks, -Stolee
Re: [RFC PATCH 4/6] commit-graph: avoid writing when repo is shallow
On 5/31/2018 10:30 PM, Junio C Hamano wrote: Derrick Stolee writes: Shallow clones do not interact well with the commit-graph feature for several reasons. Instead of doing the hard thing to fix those interactions, instead prevent reading or writing a commit-graph file for shallow repositories. The latter instead would want to vanish, I would guess. Do you mean that we should call destroy_commit_graph() if we detect a shallow repository during write_commit_graph(), then I can make that change. Signed-off-by: Derrick Stolee --- commit-graph.c | 12 1 file changed, 12 insertions(+) diff --git a/commit-graph.c b/commit-graph.c index 95af4ed519..80e377b90f 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -208,6 +208,9 @@ static void prepare_commit_graph(void) return; prepare_commit_graph_run_once = 1; + if (is_repository_shallow()) + return; + obj_dir = get_object_directory(); prepare_commit_graph_one(obj_dir); prepare_alt_odb(); @@ -711,6 +714,15 @@ void write_commit_graph(const char *obj_dir, int num_extra_edges; struct commit_list *parent; + /* +* Shallow clones are not supproted, as they create bad +* generation skips as they are un-shallowed. +*/ + if (is_repository_shallow()) { + warning("writing a commit-graph in a shallow repository is not supported"); + return; + } + oids.nr = 0; oids.alloc = approximate_object_count() / 4;
Re: [PATCH v4 00/21] Integrate commit-graph into 'fsck' and 'gc'
On 6/5/2018 10:51 AM, Ævar Arnfjörð Bjarmason wrote: On Mon, Jun 4, 2018 at 6:52 PM, Derrick Stolee wrote: Thanks for the feedback on v3. There were several small cleanups, but perhaps the biggest change is the addition of "commit-graph: use string-list API for input" which makes "commit-graph: add '--reachable' option" much simpler. Do you have a version of this pushed somewhere? Your fsck/upstream branch conflicts with e.g. long-since-merged nd/repack-keep-pack. Sorry I forgot to push. It is now available on GitHub [1]. I based my commits on 'next' including the core.commitGraph fix for t5318-commit-graph.sh I already sent [2]. [1] https://github.com/derrickstolee/git/tree/fsck/upstream fsck/upstream branch matches this patch series [2] https://public-inbox.org/git/20180604123906.136417-1-dsto...@microsoft.com/ [PATCH] t5318-commit-graph.sh: use core.commitGraph
Re: t5318-commit-graph.sh breaks travis gettext poison job
On 6/1/2018 12:17 PM, Duy Nguyen wrote: In case you're not checking travis, [1] reports Test Summary Report --- t5318-commit-graph.sh(Wstat: 256 Tests: 62 Failed: 2) Failed tests: 61-62 Non-zero exit status: 1 This usually means you're grepping an i18n string [1] https://travis-ci.org/git/git/jobs/386532326 Thanks, Duy. I believe this is related to the place Szeder pointed out [1] with the one translatable string in verify_commit_graph(). I've fixed this in my local branch, and plan to push v4 when review of v3 is complete. Thanks, -Stolee [1] https://public-inbox.org/git/20180530123529.32090-1-szeder@gmail.com/ Re: [PATCH v3 16/20] commit-graph: verify contents match checksum
Re: [PATCH v3 10/20] commit-graph: verify objects exist
On 5/30/2018 3:22 PM, Jakub Narebski wrote: Derrick Stolee writes: In the 'verify' subcommand, load commits directly from the object database to ensure they exist. Parse by skipping the commit-graph. All right, before we check that the commit data matches, we need to check that all the commits in cache (in the serialized commit graph) are present in real data (in the object database of the repository). What's nice of this series is that the operation that actually removes unreachable commits from the object database, that is `git gc`, would also update commit-gaph file. Signed-off-by: Derrick Stolee --- commit-graph.c | 20 t/t5318-commit-graph.sh | 7 +++ 2 files changed, 27 insertions(+) diff --git a/commit-graph.c b/commit-graph.c index cbd1aae514..0420ebcd87 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -238,6 +238,10 @@ static struct commit_list **insert_parent_or_die(struct commit_graph *g, { struct commit *c; struct object_id oid; + + if (pos >= g->num_commits) + die("invalid parent position %"PRIu64, pos); + This change is not described in the commit message. This change should go in "commit-graph: verify parent list" which adds a test that fails without it. Thanks. hashcpy(oid.hash, g->chunk_oid_lookup + g->hash_len * pos); c = lookup_commit(); if (!c) @@ -905,5 +909,21 @@ int verify_commit_graph(struct commit_graph *g) cur_fanout_pos++; } + if (verify_commit_graph_error) + return verify_commit_graph_error; All right, so we by default short-circuit so that errors found earlier wouldn't cause crash when checking other things. Is it needed, though, in this case? Though I guess it is better to be conservative; lso by terminating after a batch of one type of errors we don't get many different error messages from the same error (i.e. error propagation). + + for (i = 0; i < g->num_commits; i++) { + struct commit *odb_commit; + + hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i); + + odb_commit = (struct commit *)create_object(cur_oid.hash, alloc_commit_node()); Do we really need to keep all those commits from the object database in memory (in the object::obj_hash hash table)? Perhaps using something like Flywheel / Recycler pattern would be a better solution (if possible)? Though perhaps this doesn't matter much with respect to memory use. + if (parse_commit_internal(odb_commit, 0, 0)) { Just a reminder to myself: the signature is int parse_commit_internal(struct commit *item, int quiet_on_missing, int use_commit_graph) Hmmm... I wonder if with two boolean paramaters wouldn't it be better to use flags parameter, i.e. int parse_commit_internal(struct commit *item, int flags) ... parse_commit_internal(commit, QUIET_ON_MISSING | USE_COMMIT_GRAPH) But I guess that it is not worth it (especially for internal-ish function). + graph_report("failed to parse %s from object database", +oid_to_hex(_oid)); Wouldn't parse_commit_internal() show it's own error message, in addition to the one above? + continue; + } + } + return verify_commit_graph_error; } diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh index c050ef980b..996a016239 100755 --- a/t/t5318-commit-graph.sh +++ b/t/t5318-commit-graph.sh @@ -247,6 +247,7 @@ test_expect_success 'git commit-graph verify' ' git commit-graph verify >output ' +NUM_COMMITS=9 HASH_LEN=20 GRAPH_BYTE_VERSION=4 GRAPH_BYTE_HASH=5 @@ -265,6 +266,7 @@ GRAPH_BYTE_FANOUT1=`expr $GRAPH_FANOUT_OFFSET + 4 \* 4` GRAPH_BYTE_FANOUT2=`expr $GRAPH_FANOUT_OFFSET + 4 \* 255` GRAPH_OID_LOOKUP_OFFSET=`expr $GRAPH_FANOUT_OFFSET + 4 \* 256` GRAPH_BYTE_OID_LOOKUP_ORDER=`expr $GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 8` +GRAPH_BYTE_OID_LOOKUP_MISSING=`expr $GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 4 + 10` All right, so we modify 10-the byte of OID of 5-th commit out of 9, am I correct here? # usage: corrupt_graph_and_verify # Manipulates the commit-graph file at the position @@ -334,4 +336,9 @@ test_expect_success 'detect incorrect OID order' ' "incorrect OID order" ' +test_expect_success 'detect OID not in object database' ' + corrupt_graph_and_verify $GRAPH_BYTE_OID_LOOKUP_MISSING "\01" \ + "from object database" +' I guess that if we ensure that OIDs are constant, you have chosen the change to actually corrupt the OID in OID Lookup chunk to point to OID that is not in the object database (while still not violating the constraint that OID in OID Lookup chunk needs to be sorted). + test_done All right (well, except for `expr ... ` --> $(( ... )) change).
Re: [PATCH v3 11/20] commit-graph: verify root tree OIDs
On 5/30/2018 6:24 PM, Jakub Narebski wrote: Derrick Stolee writes: The 'verify' subcommand must compare the commit content parsed from the commit-graph and compare it against the content in the object database. You have "compare" twice in the above sentence. Use lookup_commit() and parse_commit_in_graph_one() to parse the commits from the graph and compare against a commit that is loaded separately and parsed directly from the object database. All right, that looks like a nice extension of what was done in previous patch. We want to check that cache (serialized commit graph) matches reality (object database). Add checks for the root tree OID. All right; isn't it that now we check almost all information from commit-graph that hs match in object database (with exception of commit parents, I think). Signed-off-by: Derrick Stolee --- commit-graph.c | 17 - t/t5318-commit-graph.sh | 7 +++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/commit-graph.c b/commit-graph.c index 0420ebcd87..19ea369fc6 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -880,6 +880,8 @@ int verify_commit_graph(struct commit_graph *g) return verify_commit_graph_error; NOTE: we will be checking Commit Data chunk; I think it would be good idea to verify that size of Commit Data chunk matches (N * (H + 16) bytes) that format gives us, so that we don't accidentally red outside of memory if something got screwed up (like number of commits being wrong, or file truncated). This is actually how we calculate 'num_commits' during load_commit_graph_one(): if (last_chunk_id == GRAPH_CHUNKID_OIDLOOKUP) { graph->num_commits = (chunk_offset - last_chunk_offset) / graph->hash_len; } So, if the chunk doesn't match N*(H+16), we detect this because FANOUT[255] != N. (There is one caveat here: (chunk_offset - last_chunk_offset) may not be a multiple of hash_len, and "accidentally" truncate to N in the division. I'll add more careful checks for this.) We also check out-of-bounds offsets in that method. for (i = 0; i < g->num_commits; i++) { + struct commit *graph_commit; + hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i); if (i && oidcmp(_oid, _oid) >= 0) @@ -897,6 +899,11 @@ int verify_commit_graph(struct commit_graph *g) cur_fanout_pos++; } + + graph_commit = lookup_commit(_oid); So now I see why we add all commits to memory (to hash structure). + if (!parse_commit_in_graph_one(g, graph_commit)) + graph_report("failed to parse %s from commit-graph", +oid_to_hex(_oid)); All right, this verifies that commit in OID Lookup chunk has parse-able data in Commit Data chunk, isn't it? } while (cur_fanout_pos < 256) { @@ -913,16 +920,24 @@ int verify_commit_graph(struct commit_graph *g) return verify_commit_graph_error; for (i = 0; i < g->num_commits; i++) { - struct commit *odb_commit; + struct commit *graph_commit, *odb_commit; hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i); + graph_commit = lookup_commit(_oid); odb_commit = (struct commit *)create_object(cur_oid.hash, alloc_commit_node()); All right, so we have commit data from graph, and commit data from the object database. if (parse_commit_internal(odb_commit, 0, 0)) { graph_report("failed to parse %s from object database", oid_to_hex(_oid)); continue; } + + if (oidcmp(_commit_tree_in_graph_one(g, graph_commit)->object.oid, + get_commit_tree_oid(odb_commit))) + graph_report("root tree OID for commit %s in commit-graph is %s != %s", +oid_to_hex(_oid), + oid_to_hex(get_commit_tree_oid(graph_commit)), + oid_to_hex(get_commit_tree_oid(odb_commit))); Maybe explicitly say that it doesn't match the value from the object database; OTOH this may be too verbose. } return verify_commit_graph_error; diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh index 996a016239..21cc8e82f3 100755 --- a/t/t5318-commit-graph.sh +++ b/t/t5318-commit-graph.sh @@ -267,6 +267,8 @@ GRAPH_BYTE_FANOUT2=`expr $GRAPH_FANOUT_OFFSET + 4 \* 255` GRAPH_OID_LOOKUP_OFFSET=`expr $GRAPH_FANOUT_OFFSET + 4 \* 256` GRAPH_BYTE_OID_LOOKUP_ORDER=`expr $GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 8` GRAPH_BYTE_OID_LOOKUP_MISSING=`expr $GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 4 + 10` +GRAPH_COMMIT_DATA_OFFSET=`expr $GRAPH_
[PATCH v4 06/21] commit-graph: add 'verify' subcommand
If the commit-graph file becomes corrupt, we need a way to verify that its contents match the object database. In the manner of 'git fsck' we will implement a 'git commit-graph verify' subcommand to report all issues with the file. Add the 'verify' subcommand to the 'commit-graph' builtin and its documentation. The subcommand is currently a no-op except for loading the commit-graph into memory, which may trigger run-time errors that would be caught by normal use. Add a simple test that ensures the command returns a zero error code. If no commit-graph file exists, this is an acceptable state. Do not report any errors. Helped-by: Ramsay Jones Signed-off-by: Derrick Stolee --- Documentation/git-commit-graph.txt | 6 + builtin/commit-graph.c | 38 ++ commit-graph.c | 23 ++ commit-graph.h | 2 ++ t/t5318-commit-graph.sh| 10 5 files changed, 79 insertions(+) diff --git a/Documentation/git-commit-graph.txt b/Documentation/git-commit-graph.txt index 4c97b555cc..a222cfab08 100644 --- a/Documentation/git-commit-graph.txt +++ b/Documentation/git-commit-graph.txt @@ -10,6 +10,7 @@ SYNOPSIS [verse] 'git commit-graph read' [--object-dir ] +'git commit-graph verify' [--object-dir ] 'git commit-graph write' [--object-dir ] @@ -52,6 +53,11 @@ existing commit-graph file. Read a graph file given by the commit-graph file and output basic details about the graph file. Used for debugging purposes. +'verify':: + +Read the commit-graph file and verify its contents against the object +database. Used to check for corrupted data. + EXAMPLES diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c index f0875b8bf3..3079cde6f9 100644 --- a/builtin/commit-graph.c +++ b/builtin/commit-graph.c @@ -8,10 +8,16 @@ static char const * const builtin_commit_graph_usage[] = { N_("git commit-graph [--object-dir ]"), N_("git commit-graph read [--object-dir ]"), + N_("git commit-graph verify [--object-dir ]"), N_("git commit-graph write [--object-dir ] [--append] [--stdin-packs|--stdin-commits]"), NULL }; +static const char * const builtin_commit_graph_verify_usage[] = { + N_("git commit-graph verify [--object-dir ]"), + NULL +}; + static const char * const builtin_commit_graph_read_usage[] = { N_("git commit-graph read [--object-dir ]"), NULL @@ -29,6 +35,36 @@ static struct opts_commit_graph { int append; } opts; + +static int graph_verify(int argc, const char **argv) +{ + struct commit_graph *graph = NULL; + char *graph_name; + + static struct option builtin_commit_graph_verify_options[] = { + OPT_STRING(0, "object-dir", _dir, + N_("dir"), + N_("The object directory to store the graph")), + OPT_END(), + }; + + argc = parse_options(argc, argv, NULL, +builtin_commit_graph_verify_options, +builtin_commit_graph_verify_usage, 0); + + if (!opts.obj_dir) + opts.obj_dir = get_object_directory(); + + graph_name = get_commit_graph_filename(opts.obj_dir); + graph = load_commit_graph_one(graph_name); + FREE_AND_NULL(graph_name); + + if (!graph) + return 0; + + return verify_commit_graph(graph); +} + static int graph_read(int argc, const char **argv) { struct commit_graph *graph = NULL; @@ -165,6 +201,8 @@ int cmd_commit_graph(int argc, const char **argv, const char *prefix) if (argc > 0) { if (!strcmp(argv[0], "read")) return graph_read(argc, argv); + if (!strcmp(argv[0], "verify")) + return graph_verify(argc, argv); if (!strcmp(argv[0], "write")) return graph_write(argc, argv); } diff --git a/commit-graph.c b/commit-graph.c index fee8437ce3..c860cbe721 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -824,3 +824,26 @@ void write_commit_graph(const char *obj_dir, oids.alloc = 0; oids.nr = 0; } + +static int verify_commit_graph_error; + +static void graph_report(const char *fmt, ...) +{ + va_list ap; + verify_commit_graph_error = 1; + + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + fprintf(stderr, "\n"); + va_end(ap); +} + +int verify_commit_graph(struct commit_graph *g) +{ + if (!g) { + graph_report("no commit-graph file loaded"); + return 1; + } + + return verify_commit_graph_error; +} diff --git a/commit-graph.h b/commit-graph.h index 96cccb10f3..71a39c5a57 100644 --- a/commit-graph.h +++ b/co
[PATCH v4 07/21] commit-graph: verify catches corrupt signature
This is the first of several commits that add a test to check that 'git commit-graph verify' catches corruption in the commit-graph file. The first test checks that the command catches an error in the file signature. This is a check that exists in the existing commit-graph reading code. Add a helper method 'corrupt_graph_and_verify' to the test script t5318-commit-graph.sh. This helper corrupts the commit-graph file at a certain location, runs 'git commit-graph verify', and reports the output to the 'err' file. This data is filtered to remove the lines added by 'test_must_fail' when the test is run verbosely. Then, the output is checked to contain a specific error message. Most messages from 'git commit-graph verify' will not be marked for translation. There will be one exception: the message that reports an invalid checksum will be marked for translation, as that is the only message that is intended for a typical user. Helped-by: Szeder Gábor Signed-off-by: Derrick Stolee --- t/t5318-commit-graph.sh | 43 + 1 file changed, 43 insertions(+) diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh index 0830ef9fdd..c0c1ff09b9 100755 --- a/t/t5318-commit-graph.sh +++ b/t/t5318-commit-graph.sh @@ -235,9 +235,52 @@ test_expect_success 'perform fast-forward merge in full repo' ' test_cmp expect output ' +# the verify tests below expect the commit-graph to contain +# exactly the commits reachable from the commits/8 branch. +# If the file changes the set of commits in the list, then the +# offsets into the binary file will result in different edits +# and the tests will likely break. + test_expect_success 'git commit-graph verify' ' cd "$TRASH_DIRECTORY/full" && + git rev-parse commits/8 | git commit-graph write --stdin-commits && git commit-graph verify >output ' +GRAPH_BYTE_VERSION=4 +GRAPH_BYTE_HASH=5 + +# usage: corrupt_graph_and_verify +# Manipulates the commit-graph file at the position +# by inserting the data, then runs 'git commit-graph verify' +# and places the output in the file 'err'. Test 'err' for +# the given string. +corrupt_graph_and_verify() { + pos=$1 + data="${2:-\0}" + grepstr=$3 + cd "$TRASH_DIRECTORY/full" && + test_when_finished mv commit-graph-backup $objdir/info/commit-graph && + cp $objdir/info/commit-graph commit-graph-backup && + printf "$data" | dd of="$objdir/info/commit-graph" bs=1 seek="$pos" conv=notrunc && + test_must_fail git commit-graph verify 2>test_err && + grep -v "^+" test_err >err + test_i18ngrep "$grepstr" err +} + +test_expect_success 'detect bad signature' ' + corrupt_graph_and_verify 0 "\0" \ + "graph signature" +' + +test_expect_success 'detect bad version' ' + corrupt_graph_and_verify $GRAPH_BYTE_VERSION "\02" \ + "graph version" +' + +test_expect_success 'detect bad hash version' ' + corrupt_graph_and_verify $GRAPH_BYTE_HASH "\02" \ + "hash version" +' + test_done -- 2.18.0.rc1
[PATCH v4 03/21] commit-graph: parse commit from chosen graph
Before verifying a commit-graph file against the object database, we need to parse all commits from the given commit-graph file. Create parse_commit_in_graph_one() to target a given struct commit_graph. Signed-off-by: Derrick Stolee --- commit-graph.c | 18 +++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/commit-graph.c b/commit-graph.c index c09e87c3c2..472b41ec30 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -311,7 +311,7 @@ static int find_commit_in_graph(struct commit *item, struct commit_graph *g, uin } } -int parse_commit_in_graph(struct commit *item) +static int parse_commit_in_graph_one(struct commit_graph *g, struct commit *item) { uint32_t pos; @@ -319,9 +319,21 @@ int parse_commit_in_graph(struct commit *item) return 0; if (item->object.parsed) return 1; + + if (find_commit_in_graph(item, g, )) + return fill_commit_in_graph(item, g, pos); + + return 0; +} + +int parse_commit_in_graph(struct commit *item) +{ + if (!core_commit_graph) + return 0; + prepare_commit_graph(); - if (commit_graph && find_commit_in_graph(item, commit_graph, )) - return fill_commit_in_graph(item, commit_graph, pos); + if (commit_graph) + return parse_commit_in_graph_one(commit_graph, item); return 0; } -- 2.18.0.rc1
[PATCH v4 08/21] commit-graph: verify required chunks are present
The commit-graph file requires the following three chunks: * OID Fanout * OID Lookup * Commit Data If any of these are missing, then the 'verify' subcommand should report a failure. This includes the chunk IDs malformed or the chunk count is truncated. Signed-off-by: Derrick Stolee --- commit-graph.c | 9 + t/t5318-commit-graph.sh | 29 + 2 files changed, 38 insertions(+) diff --git a/commit-graph.c b/commit-graph.c index c860cbe721..25d5edea82 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -845,5 +845,14 @@ int verify_commit_graph(struct commit_graph *g) return 1; } + verify_commit_graph_error = 0; + + if (!g->chunk_oid_fanout) + graph_report("commit-graph is missing the OID Fanout chunk"); + if (!g->chunk_oid_lookup) + graph_report("commit-graph is missing the OID Lookup chunk"); + if (!g->chunk_commit_data) + graph_report("commit-graph is missing the Commit Data chunk"); + return verify_commit_graph_error; } diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh index c0c1ff09b9..846396665e 100755 --- a/t/t5318-commit-graph.sh +++ b/t/t5318-commit-graph.sh @@ -249,6 +249,15 @@ test_expect_success 'git commit-graph verify' ' GRAPH_BYTE_VERSION=4 GRAPH_BYTE_HASH=5 +GRAPH_BYTE_CHUNK_COUNT=6 +GRAPH_CHUNK_LOOKUP_OFFSET=8 +GRAPH_CHUNK_LOOKUP_WIDTH=12 +GRAPH_CHUNK_LOOKUP_ROWS=5 +GRAPH_BYTE_OID_FANOUT_ID=$GRAPH_CHUNK_LOOKUP_OFFSET +GRAPH_BYTE_OID_LOOKUP_ID=$(($GRAPH_CHUNK_LOOKUP_OFFSET + \ + 1 \* $GRAPH_CHUNK_LOOKUP_WIDTH)) +GRAPH_BYTE_COMMIT_DATA_ID=$(($GRAPH_CHUNK_LOOKUP_OFFSET + \ +2 \* $GRAPH_CHUNK_LOOKUP_WIDTH)) # usage: corrupt_graph_and_verify # Manipulates the commit-graph file at the position @@ -283,4 +292,24 @@ test_expect_success 'detect bad hash version' ' "hash version" ' +test_expect_success 'detect low chunk count' ' + corrupt_graph_and_verify $GRAPH_BYTE_CHUNK_COUNT "\02" \ + "missing the .* chunk" +' + +test_expect_success 'detect missing OID fanout chunk' ' + corrupt_graph_and_verify $GRAPH_BYTE_OID_FANOUT_ID "\0" \ + "missing the OID Fanout chunk" +' + +test_expect_success 'detect missing OID lookup chunk' ' + corrupt_graph_and_verify $GRAPH_BYTE_OID_LOOKUP_ID "\0" \ + "missing the OID Lookup chunk" +' + +test_expect_success 'detect missing commit data chunk' ' + corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_DATA_ID "\0" \ + "missing the Commit Data chunk" +' + test_done -- 2.18.0.rc1
[PATCH v4 05/21] commit-graph: load a root tree from specific graph
When lazy-loading a tree for a commit, it will be important to select the tree from a specific struct commit_graph. Create a new method that specifies the commit-graph file and use that in get_commit_tree_in_graph(). Signed-off-by: Derrick Stolee --- commit-graph.c | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/commit-graph.c b/commit-graph.c index 472b41ec30..fee8437ce3 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -359,14 +359,20 @@ static struct tree *load_tree_for_commit(struct commit_graph *g, struct commit * return c->maybe_tree; } -struct tree *get_commit_tree_in_graph(const struct commit *c) +static struct tree *get_commit_tree_in_graph_one(struct commit_graph *g, +const struct commit *c) { if (c->maybe_tree) return c->maybe_tree; if (c->graph_pos == COMMIT_NOT_FROM_GRAPH) - BUG("get_commit_tree_in_graph called from non-commit-graph commit"); + BUG("get_commit_tree_in_graph_one called from non-commit-graph commit"); + + return load_tree_for_commit(g, (struct commit *)c); +} - return load_tree_for_commit(commit_graph, (struct commit *)c); +struct tree *get_commit_tree_in_graph(const struct commit *c) +{ + return get_commit_tree_in_graph_one(commit_graph, c); } static void write_graph_chunk_fanout(struct hashfile *f, -- 2.18.0.rc1
[PATCH v4 00/21] Integrate commit-graph into 'fsck' and 'gc'
Thanks for the feedback on v3. There were several small cleanups, but perhaps the biggest change is the addition of "commit-graph: use string-list API for input" which makes "commit-graph: add '--reachable' option" much simpler. The inter-diff is still reasonably large, but I'll send it in a follow-up PR. Thanks, -Stolee Derrick Stolee (21): commit-graph: UNLEAK before die() commit-graph: fix GRAPH_MIN_SIZE commit-graph: parse commit from chosen graph commit: force commit to parse from object database commit-graph: load a root tree from specific graph commit-graph: add 'verify' subcommand commit-graph: verify catches corrupt signature commit-graph: verify required chunks are present commit-graph: verify corrupt OID fanout and lookup commit-graph: verify objects exist commit-graph: verify root tree OIDs commit-graph: verify parent list commit-graph: verify generation number commit-graph: verify commit date commit-graph: test for corrupted octopus edge commit-graph: verify contents match checksum fsck: verify commit-graph commit-graph: use string-list API for input commit-graph: add '--reachable' option gc: automatically write commit-graph files commit-graph: update design document Documentation/config.txt | 10 +- Documentation/git-commit-graph.txt | 14 +- Documentation/git-fsck.txt | 3 + Documentation/git-gc.txt | 4 + Documentation/technical/commit-graph.txt | 22 -- builtin/commit-graph.c | 98 ++--- builtin/fsck.c | 21 ++ builtin/gc.c | 6 + commit-graph.c | 248 +-- commit-graph.h | 10 +- commit.c | 9 +- commit.h | 1 + t/t5318-commit-graph.sh | 201 ++ 13 files changed, 569 insertions(+), 78 deletions(-) -- 2.18.0.rc1
[PATCH v4 11/21] commit-graph: verify root tree OIDs
The 'verify' subcommand must compare the commit content parsed from the commit-graph against the content in the object database. Use lookup_commit() and parse_commit_in_graph_one() to parse the commits from the graph and compare against a commit that is loaded separately and parsed directly from the object database. Add checks for the root tree OID. Signed-off-by: Derrick Stolee --- commit-graph.c | 17 - t/t5318-commit-graph.sh | 7 +++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/commit-graph.c b/commit-graph.c index 0cf1b61d80..50a8e27910 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -862,6 +862,8 @@ int verify_commit_graph(struct commit_graph *g) return verify_commit_graph_error; for (i = 0; i < g->num_commits; i++) { + struct commit *graph_commit; + hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i); if (i && oidcmp(_oid, _oid) >= 0) @@ -879,6 +881,11 @@ int verify_commit_graph(struct commit_graph *g) cur_fanout_pos++; } + + graph_commit = lookup_commit(_oid); + if (!parse_commit_in_graph_one(g, graph_commit)) + graph_report("failed to parse %s from commit-graph", +oid_to_hex(_oid)); } while (cur_fanout_pos < 256) { @@ -895,16 +902,24 @@ int verify_commit_graph(struct commit_graph *g) return verify_commit_graph_error; for (i = 0; i < g->num_commits; i++) { - struct commit *odb_commit; + struct commit *graph_commit, *odb_commit; hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i); + graph_commit = lookup_commit(_oid); odb_commit = (struct commit *)create_object(cur_oid.hash, alloc_commit_node()); if (parse_commit_internal(odb_commit, 0, 0)) { graph_report("failed to parse %s from object database", oid_to_hex(_oid)); continue; } + + if (oidcmp(_commit_tree_in_graph_one(g, graph_commit)->object.oid, + get_commit_tree_oid(odb_commit))) + graph_report("root tree OID for commit %s in commit-graph is %s != %s", +oid_to_hex(_oid), + oid_to_hex(get_commit_tree_oid(graph_commit)), + oid_to_hex(get_commit_tree_oid(odb_commit))); } return verify_commit_graph_error; diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh index cf60e48496..c0c1248eda 100755 --- a/t/t5318-commit-graph.sh +++ b/t/t5318-commit-graph.sh @@ -267,6 +267,8 @@ GRAPH_BYTE_FANOUT2=$(($GRAPH_FANOUT_OFFSET + 4 \* 255)) GRAPH_OID_LOOKUP_OFFSET=$(($GRAPH_FANOUT_OFFSET + 4 \* 256)) GRAPH_BYTE_OID_LOOKUP_ORDER=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 8)) GRAPH_BYTE_OID_LOOKUP_MISSING=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 4 + 10)) +GRAPH_COMMIT_DATA_OFFSET=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* $NUM_COMMITS)) +GRAPH_BYTE_COMMIT_TREE=$GRAPH_COMMIT_DATA_OFFSET # usage: corrupt_graph_and_verify # Manipulates the commit-graph file at the position @@ -341,4 +343,9 @@ test_expect_success 'detect OID not in object database' ' "from object database" ' +test_expect_success 'detect incorrect tree OID' ' + corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_TREE "\01" \ + "root tree OID for commit" +' + test_done -- 2.18.0.rc1
[PATCH v4 09/21] commit-graph: verify corrupt OID fanout and lookup
In the commit-graph file, the OID fanout chunk provides an index into the OID lookup. The 'verify' subcommand should find incorrect values in the fanout. Similarly, the 'verify' subcommand should find out-of-order values in the OID lookup. Signed-off-by: Derrick Stolee --- commit-graph.c | 36 t/t5318-commit-graph.sh | 22 ++ 2 files changed, 58 insertions(+) diff --git a/commit-graph.c b/commit-graph.c index 25d5edea82..a90cc2e6f4 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -840,6 +840,9 @@ static void graph_report(const char *fmt, ...) int verify_commit_graph(struct commit_graph *g) { + uint32_t i, cur_fanout_pos = 0; + struct object_id prev_oid, cur_oid; + if (!g) { graph_report("no commit-graph file loaded"); return 1; @@ -854,5 +857,38 @@ int verify_commit_graph(struct commit_graph *g) if (!g->chunk_commit_data) graph_report("commit-graph is missing the Commit Data chunk"); + if (verify_commit_graph_error) + return verify_commit_graph_error; + + for (i = 0; i < g->num_commits; i++) { + hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i); + + if (i && oidcmp(_oid, _oid) >= 0) + graph_report("commit-graph has incorrect OID order: %s then %s", +oid_to_hex(_oid), +oid_to_hex(_oid)); + + oidcpy(_oid, _oid); + + while (cur_oid.hash[0] > cur_fanout_pos) { + uint32_t fanout_value = get_be32(g->chunk_oid_fanout + cur_fanout_pos); + if (i != fanout_value) + graph_report("commit-graph has incorrect fanout value: fanout[%d] = %u != %u", +cur_fanout_pos, fanout_value, i); + + cur_fanout_pos++; + } + } + + while (cur_fanout_pos < 256) { + uint32_t fanout_value = get_be32(g->chunk_oid_fanout + cur_fanout_pos); + + if (g->num_commits != fanout_value) + graph_report("commit-graph has incorrect fanout value: fanout[%d] = %u != %u", +cur_fanout_pos, fanout_value, i); + + cur_fanout_pos++; + } + return verify_commit_graph_error; } diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh index 846396665e..c29eae47c9 100755 --- a/t/t5318-commit-graph.sh +++ b/t/t5318-commit-graph.sh @@ -247,6 +247,7 @@ test_expect_success 'git commit-graph verify' ' git commit-graph verify >output ' +HASH_LEN=20 GRAPH_BYTE_VERSION=4 GRAPH_BYTE_HASH=5 GRAPH_BYTE_CHUNK_COUNT=6 @@ -258,6 +259,12 @@ GRAPH_BYTE_OID_LOOKUP_ID=$(($GRAPH_CHUNK_LOOKUP_OFFSET + \ 1 \* $GRAPH_CHUNK_LOOKUP_WIDTH)) GRAPH_BYTE_COMMIT_DATA_ID=$(($GRAPH_CHUNK_LOOKUP_OFFSET + \ 2 \* $GRAPH_CHUNK_LOOKUP_WIDTH)) +GRAPH_FANOUT_OFFSET=$(($GRAPH_CHUNK_LOOKUP_OFFSET + \ + $GRAPH_CHUNK_LOOKUP_WIDTH \* $GRAPH_CHUNK_LOOKUP_ROWS)) +GRAPH_BYTE_FANOUT1=$(($GRAPH_FANOUT_OFFSET + 4 \* 4)) +GRAPH_BYTE_FANOUT2=$(($GRAPH_FANOUT_OFFSET + 4 \* 255)) +GRAPH_OID_LOOKUP_OFFSET=$(($GRAPH_FANOUT_OFFSET + 4 \* 256)) +GRAPH_BYTE_OID_LOOKUP_ORDER=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 8)) # usage: corrupt_graph_and_verify # Manipulates the commit-graph file at the position @@ -312,4 +319,19 @@ test_expect_success 'detect missing commit data chunk' ' "missing the Commit Data chunk" ' +test_expect_success 'detect incorrect fanout' ' + corrupt_graph_and_verify $GRAPH_BYTE_FANOUT1 "\01" \ + "fanout value" +' + +test_expect_success 'detect incorrect fanout final value' ' + corrupt_graph_and_verify $GRAPH_BYTE_FANOUT2 "\01" \ + "fanout value" +' + +test_expect_success 'detect incorrect OID order' ' + corrupt_graph_and_verify $GRAPH_BYTE_OID_LOOKUP_ORDER "\01" \ + "incorrect OID order" +' + test_done -- 2.18.0.rc1
[PATCH v4 12/21] commit-graph: verify parent list
The commit-graph file stores parents in a two-column portion of the commit data chunk. If there is only one parent, then the second column stores 0x to indicate no second parent. The 'verify' subcommand checks the parent list for the commit loaded from the commit-graph and the one parsed from the object database. Test these checks for corrupt parents, too many parents, and wrong parents. Add a boundary check to insert_parent_or_die() for when the parent position value is out of range. The octopus merge will be tested in a later commit. Signed-off-by: Derrick Stolee --- commit-graph.c | 28 t/t5318-commit-graph.sh | 18 ++ 2 files changed, 46 insertions(+) diff --git a/commit-graph.c b/commit-graph.c index 50a8e27910..d2a86e329e 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -240,6 +240,9 @@ static struct commit_list **insert_parent_or_die(struct commit_graph *g, struct commit *c; struct object_id oid; + if (pos >= g->num_commits) + die("invalid parent position %"PRIu64, pos); + hashcpy(oid.hash, g->chunk_oid_lookup + g->hash_len * pos); c = lookup_commit(); if (!c) @@ -903,6 +906,7 @@ int verify_commit_graph(struct commit_graph *g) for (i = 0; i < g->num_commits; i++) { struct commit *graph_commit, *odb_commit; + struct commit_list *graph_parents, *odb_parents; hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i); @@ -920,6 +924,30 @@ int verify_commit_graph(struct commit_graph *g) oid_to_hex(_oid), oid_to_hex(get_commit_tree_oid(graph_commit)), oid_to_hex(get_commit_tree_oid(odb_commit))); + + graph_parents = graph_commit->parents; + odb_parents = odb_commit->parents; + + while (graph_parents) { + if (odb_parents == NULL) { + graph_report("commit-graph parent list for commit %s is too long", +oid_to_hex(_oid)); + break; + } + + if (oidcmp(_parents->item->object.oid, _parents->item->object.oid)) + graph_report("commit-graph parent for %s is %s != %s", +oid_to_hex(_oid), + oid_to_hex(_parents->item->object.oid), + oid_to_hex(_parents->item->object.oid)); + + graph_parents = graph_parents->next; + odb_parents = odb_parents->next; + } + + if (odb_parents != NULL) + graph_report("commit-graph parent list for commit %s terminates early", +oid_to_hex(_oid)); } return verify_commit_graph_error; diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh index c0c1248eda..ec0964112a 100755 --- a/t/t5318-commit-graph.sh +++ b/t/t5318-commit-graph.sh @@ -269,6 +269,9 @@ GRAPH_BYTE_OID_LOOKUP_ORDER=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 8)) GRAPH_BYTE_OID_LOOKUP_MISSING=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 4 + 10)) GRAPH_COMMIT_DATA_OFFSET=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* $NUM_COMMITS)) GRAPH_BYTE_COMMIT_TREE=$GRAPH_COMMIT_DATA_OFFSET +GRAPH_BYTE_COMMIT_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN)) +GRAPH_BYTE_COMMIT_EXTRA_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 4)) +GRAPH_BYTE_COMMIT_WRONG_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 3)) # usage: corrupt_graph_and_verify # Manipulates the commit-graph file at the position @@ -348,4 +351,19 @@ test_expect_success 'detect incorrect tree OID' ' "root tree OID for commit" ' +test_expect_success 'detect incorrect parent int-id' ' + corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_PARENT "\01" \ + "invalid parent" +' + +test_expect_success 'detect extra parent int-id' ' + corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_EXTRA_PARENT "\00" \ + "is too long" +' + +test_expect_success 'detect wrong parent' ' + corrupt_graph_and_verify $GRAPH_BYTE_COMMIT_WRONG_PARENT "\01" \ + "commit-graph parent for" +' + test_done -- 2.18.0.rc1
[PATCH v4 02/21] commit-graph: fix GRAPH_MIN_SIZE
The GRAPH_MIN_SIZE macro should be the smallest size of a parsable commit-graph file. However, the minimum number of chunks was wrong. It is possible to write a commit-graph file with zero commits, and that violates this macro's value. Rewrite the macro, and use extra macros to better explain the magic constants. Signed-off-by: Derrick Stolee --- commit-graph.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/commit-graph.c b/commit-graph.c index bb54c1214c..c09e87c3c2 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -34,10 +34,11 @@ #define GRAPH_LAST_EDGE 0x8000 +#define GRAPH_HEADER_SIZE 8 #define GRAPH_FANOUT_SIZE (4 * 256) #define GRAPH_CHUNKLOOKUP_WIDTH 12 -#define GRAPH_MIN_SIZE (5 * GRAPH_CHUNKLOOKUP_WIDTH + GRAPH_FANOUT_SIZE + \ - GRAPH_OID_LEN + 8) +#define GRAPH_MIN_SIZE (GRAPH_HEADER_SIZE + 4 * GRAPH_CHUNKLOOKUP_WIDTH \ + + GRAPH_FANOUT_SIZE + GRAPH_OID_LEN) char *get_commit_graph_filename(const char *obj_dir) { -- 2.18.0.rc1
[PATCH v4 01/21] commit-graph: UNLEAK before die()
Signed-off-by: Derrick Stolee --- builtin/commit-graph.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c index 37420ae0fd..f0875b8bf3 100644 --- a/builtin/commit-graph.c +++ b/builtin/commit-graph.c @@ -51,8 +51,11 @@ static int graph_read(int argc, const char **argv) graph_name = get_commit_graph_filename(opts.obj_dir); graph = load_commit_graph_one(graph_name); - if (!graph) + if (!graph) { + UNLEAK(graph_name); die("graph file %s does not exist", graph_name); + } + FREE_AND_NULL(graph_name); printf("header: %08x %d %d %d %d\n", -- 2.18.0.rc1
[PATCH v4 10/21] commit-graph: verify objects exist
In the 'verify' subcommand, load commits directly from the object database to ensure they exist. Parse by skipping the commit-graph. Signed-off-by: Derrick Stolee --- commit-graph.c | 17 + t/t5318-commit-graph.sh | 7 +++ 2 files changed, 24 insertions(+) diff --git a/commit-graph.c b/commit-graph.c index a90cc2e6f4..0cf1b61d80 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -239,6 +239,7 @@ static struct commit_list **insert_parent_or_die(struct commit_graph *g, { struct commit *c; struct object_id oid; + hashcpy(oid.hash, g->chunk_oid_lookup + g->hash_len * pos); c = lookup_commit(); if (!c) @@ -890,5 +891,21 @@ int verify_commit_graph(struct commit_graph *g) cur_fanout_pos++; } + if (verify_commit_graph_error) + return verify_commit_graph_error; + + for (i = 0; i < g->num_commits; i++) { + struct commit *odb_commit; + + hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i); + + odb_commit = (struct commit *)create_object(cur_oid.hash, alloc_commit_node()); + if (parse_commit_internal(odb_commit, 0, 0)) { + graph_report("failed to parse %s from object database", +oid_to_hex(_oid)); + continue; + } + } + return verify_commit_graph_error; } diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh index c29eae47c9..cf60e48496 100755 --- a/t/t5318-commit-graph.sh +++ b/t/t5318-commit-graph.sh @@ -247,6 +247,7 @@ test_expect_success 'git commit-graph verify' ' git commit-graph verify >output ' +NUM_COMMITS=9 HASH_LEN=20 GRAPH_BYTE_VERSION=4 GRAPH_BYTE_HASH=5 @@ -265,6 +266,7 @@ GRAPH_BYTE_FANOUT1=$(($GRAPH_FANOUT_OFFSET + 4 \* 4)) GRAPH_BYTE_FANOUT2=$(($GRAPH_FANOUT_OFFSET + 4 \* 255)) GRAPH_OID_LOOKUP_OFFSET=$(($GRAPH_FANOUT_OFFSET + 4 \* 256)) GRAPH_BYTE_OID_LOOKUP_ORDER=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 8)) +GRAPH_BYTE_OID_LOOKUP_MISSING=$(($GRAPH_OID_LOOKUP_OFFSET + $HASH_LEN \* 4 + 10)) # usage: corrupt_graph_and_verify # Manipulates the commit-graph file at the position @@ -334,4 +336,9 @@ test_expect_success 'detect incorrect OID order' ' "incorrect OID order" ' +test_expect_success 'detect OID not in object database' ' + corrupt_graph_and_verify $GRAPH_BYTE_OID_LOOKUP_MISSING "\01" \ + "from object database" +' + test_done -- 2.18.0.rc1
[PATCH v4 15/21] commit-graph: test for corrupted octopus edge
The commit-graph file has an extra chunk to store the parent int-ids for parents beyond the first parent for octopus merges. Our test repo has a single octopus merge that we can manipulate to demonstrate the 'verify' subcommand detects incorrect values in that chunk. Signed-off-by: Derrick Stolee --- t/t5318-commit-graph.sh | 10 ++ 1 file changed, 10 insertions(+) diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh index 6a873bfda8..cf67fb391a 100755 --- a/t/t5318-commit-graph.sh +++ b/t/t5318-commit-graph.sh @@ -248,6 +248,7 @@ test_expect_success 'git commit-graph verify' ' ' NUM_COMMITS=9 +NUM_OCTOPUS_EDGES=2 HASH_LEN=20 GRAPH_BYTE_VERSION=4 GRAPH_BYTE_HASH=5 @@ -274,6 +275,10 @@ GRAPH_BYTE_COMMIT_EXTRA_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 4)) GRAPH_BYTE_COMMIT_WRONG_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 3)) GRAPH_BYTE_COMMIT_GENERATION=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 11)) GRAPH_BYTE_COMMIT_DATE=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 12)) +GRAPH_COMMIT_DATA_WIDTH=$(($HASH_LEN + 16)) +GRAPH_OCTOPUS_DATA_OFFSET=$(($GRAPH_COMMIT_DATA_OFFSET + \ +$GRAPH_COMMIT_DATA_WIDTH \* $NUM_COMMITS)) +GRAPH_BYTE_OCTOPUS=$(($GRAPH_OCTOPUS_DATA_OFFSET + 4)) # usage: corrupt_graph_and_verify # Manipulates the commit-graph file at the position @@ -383,4 +388,9 @@ test_expect_success 'detect incorrect commit date' ' "commit date" ' +test_expect_success 'detect incorrect parent for octopus merge' ' + corrupt_graph_and_verify $GRAPH_BYTE_OCTOPUS "\01" \ + "invalid parent" +' + test_done -- 2.18.0.rc1
[PATCH v4 04/21] commit: force commit to parse from object database
In anticipation of verifying commit-graph file contents against the object database, create parse_commit_internal() to allow side-stepping the commit-graph file and parse directly from the object database. Due to the use of generation numbers, this method should not be called unless the intention is explicit in avoiding commits from the commit-graph file. Signed-off-by: Derrick Stolee --- commit.c | 9 +++-- commit.h | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/commit.c b/commit.c index 1d28677dfb..6eaed0174c 100644 --- a/commit.c +++ b/commit.c @@ -392,7 +392,7 @@ int parse_commit_buffer(struct commit *item, const void *buffer, unsigned long s return 0; } -int parse_commit_gently(struct commit *item, int quiet_on_missing) +int parse_commit_internal(struct commit *item, int quiet_on_missing, int use_commit_graph) { enum object_type type; void *buffer; @@ -403,7 +403,7 @@ int parse_commit_gently(struct commit *item, int quiet_on_missing) return -1; if (item->object.parsed) return 0; - if (parse_commit_in_graph(item)) + if (use_commit_graph && parse_commit_in_graph(item)) return 0; buffer = read_sha1_file(item->object.oid.hash, , ); if (!buffer) @@ -424,6 +424,11 @@ int parse_commit_gently(struct commit *item, int quiet_on_missing) return ret; } +int parse_commit_gently(struct commit *item, int quiet_on_missing) +{ + return parse_commit_internal(item, quiet_on_missing, 1); +} + void parse_commit_or_die(struct commit *item) { if (parse_commit(item)) diff --git a/commit.h b/commit.h index b5afde1ae9..5fde74fcd7 100644 --- a/commit.h +++ b/commit.h @@ -73,6 +73,7 @@ struct commit *lookup_commit_reference_by_name(const char *name); struct commit *lookup_commit_or_die(const struct object_id *oid, const char *ref_name); int parse_commit_buffer(struct commit *item, const void *buffer, unsigned long size, int check_graph); +int parse_commit_internal(struct commit *item, int quiet_on_missing, int use_commit_graph); int parse_commit_gently(struct commit *item, int quiet_on_missing); static inline int parse_commit(struct commit *item) { -- 2.18.0.rc1
[PATCH v4 18/21] commit-graph: use string-list API for input
Signed-off-by: Derrick Stolee --- builtin/commit-graph.c | 39 +-- commit-graph.c | 15 +++ commit-graph.h | 7 +++ 3 files changed, 23 insertions(+), 38 deletions(-) diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c index 3079cde6f9..d8eb8278b3 100644 --- a/builtin/commit-graph.c +++ b/builtin/commit-graph.c @@ -118,13 +118,9 @@ static int graph_read(int argc, const char **argv) static int graph_write(int argc, const char **argv) { - const char **pack_indexes = NULL; - int packs_nr = 0; - const char **commit_hex = NULL; - int commits_nr = 0; - const char **lines = NULL; - int lines_nr = 0; - int lines_alloc = 0; + struct string_list *pack_indexes = NULL; + struct string_list *commit_hex = NULL; + struct string_list lines; static struct option builtin_commit_graph_write_options[] = { OPT_STRING(0, "object-dir", _dir, @@ -150,32 +146,23 @@ static int graph_write(int argc, const char **argv) if (opts.stdin_packs || opts.stdin_commits) { struct strbuf buf = STRBUF_INIT; - lines_nr = 0; - lines_alloc = 128; - ALLOC_ARRAY(lines, lines_alloc); - - while (strbuf_getline(, stdin) != EOF) { - ALLOC_GROW(lines, lines_nr + 1, lines_alloc); - lines[lines_nr++] = strbuf_detach(, NULL); - } - - if (opts.stdin_packs) { - pack_indexes = lines; - packs_nr = lines_nr; - } - if (opts.stdin_commits) { - commit_hex = lines; - commits_nr = lines_nr; - } + string_list_init(, 0); + + while (strbuf_getline(, stdin) != EOF) + string_list_append(, strbuf_detach(, NULL)); + + if (opts.stdin_packs) + pack_indexes = + if (opts.stdin_commits) + commit_hex = } write_commit_graph(opts.obj_dir, pack_indexes, - packs_nr, commit_hex, - commits_nr, opts.append); + string_list_clear(, 0); return 0; } diff --git a/commit-graph.c b/commit-graph.c index 05879de26c..c6070735c2 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -653,10 +653,8 @@ static void compute_generation_numbers(struct packed_commit_list* commits) } void write_commit_graph(const char *obj_dir, - const char **pack_indexes, - int nr_packs, - const char **commit_hex, - int nr_commits, + struct string_list *pack_indexes, + struct string_list *commit_hex, int append) { struct packed_oid_list oids; @@ -697,10 +695,10 @@ void write_commit_graph(const char *obj_dir, int dirlen; strbuf_addf(, "%s/pack/", obj_dir); dirlen = packname.len; - for (i = 0; i < nr_packs; i++) { + for (i = 0; i < pack_indexes->nr; i++) { struct packed_git *p; strbuf_setlen(, dirlen); - strbuf_addstr(, pack_indexes[i]); + strbuf_addstr(, pack_indexes->items[i].string); p = add_packed_git(packname.buf, packname.len, 1); if (!p) die("error adding pack %s", packname.buf); @@ -713,12 +711,13 @@ void write_commit_graph(const char *obj_dir, } if (commit_hex) { - for (i = 0; i < nr_commits; i++) { + for (i = 0; i < commit_hex->nr; i++) { const char *end; struct object_id oid; struct commit *result; - if (commit_hex[i] && parse_oid_hex(commit_hex[i], , )) + if (commit_hex->items[i].string && + parse_oid_hex(commit_hex->items[i].string, , )) continue; result = lookup_commit_reference_gently(, 1); diff --git a/commit-graph.h b/commit-graph.h index 71a39c5a57..1e1fc5 100644 --- a/commit-graph.h +++ b/commit-graph.h @@ -2,6 +2,7 @@ #define COMMIT_GRAPH_H #include "git-compat-util.h" +#include "string-list.h" char *get_commit_graph_filename(const char *obj_dir); @@ -47,10 +48,8 @@ struct commit_graph { struct commit_graph *load_commit_graph_one(const char *graph_file); void write_commit_graph(const char *obj
[PATCH v4 13/21] commit-graph: verify generation number
While iterating through the commit parents, perform the generation number calculation and compare against the value stored in the commit-graph. The tests demonstrate that having a different set of parents affects the generation number calculation, and this value propagates to descendants. Hence, we drop the single-line condition on the output. Since Git will ship with the commit-graph feature without generation numbers, we need to accept commit-graphs with all generation numbers equal to zero. In this case, ignore the generation number calculation. However, verify that we should never have a mix of zero and non-zero generation numbers. Create a test that sets one commit to generation zero and all following commits report a failure as they have non-zero generation in a file that contains generation number zero. Signed-off-by: Derrick Stolee --- commit-graph.c | 34 ++ t/t5318-commit-graph.sh | 11 +++ 2 files changed, 45 insertions(+) diff --git a/commit-graph.c b/commit-graph.c index d2a86e329e..5faecae2a7 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -842,10 +842,14 @@ static void graph_report(const char *fmt, ...) va_end(ap); } +#define GENERATION_ZERO_EXISTS 1 +#define GENERATION_NUMBER_EXISTS 2 + int verify_commit_graph(struct commit_graph *g) { uint32_t i, cur_fanout_pos = 0; struct object_id prev_oid, cur_oid; + int generation_zero = 0; if (!g) { graph_report("no commit-graph file loaded"); @@ -907,6 +911,7 @@ int verify_commit_graph(struct commit_graph *g) for (i = 0; i < g->num_commits; i++) { struct commit *graph_commit, *odb_commit; struct commit_list *graph_parents, *odb_parents; + uint32_t max_generation = 0; hashcpy(cur_oid.hash, g->chunk_oid_lookup + g->hash_len * i); @@ -941,6 +946,9 @@ int verify_commit_graph(struct commit_graph *g) oid_to_hex(_parents->item->object.oid), oid_to_hex(_parents->item->object.oid)); + if (graph_parents->item->generation > max_generation) + max_generation = graph_parents->item->generation; + graph_parents = graph_parents->next; odb_parents = odb_parents->next; } @@ -948,6 +956,32 @@ int verify_commit_graph(struct commit_graph *g) if (odb_parents != NULL) graph_report("commit-graph parent list for commit %s terminates early", oid_to_hex(_oid)); + + if (!graph_commit->generation) { + if (generation_zero == GENERATION_NUMBER_EXISTS) + graph_report("commit-graph has generation number zero for commit %s, but non-zero elsewhere", +oid_to_hex(_oid)); + generation_zero = GENERATION_ZERO_EXISTS; + } else if (generation_zero == GENERATION_ZERO_EXISTS) + graph_report("commit-graph has non-zero generation number for commit %s, but zero elsewhere", +oid_to_hex(_oid)); + + if (generation_zero == GENERATION_ZERO_EXISTS) + continue; + + /* +* If one of our parents has generation GENERATION_NUMBER_MAX, then +* our generation is also GENERATION_NUMBER_MAX. Decrement to avoid +* extra logic in the following condition. +*/ + if (max_generation == GENERATION_NUMBER_MAX) + max_generation--; + + if (graph_commit->generation != max_generation + 1) + graph_report("commit-graph generation for commit %s is %u != %u", +oid_to_hex(_oid), +graph_commit->generation, +max_generation + 1); } return verify_commit_graph_error; diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh index ec0964112a..a6ea1341dc 100755 --- a/t/t5318-commit-graph.sh +++ b/t/t5318-commit-graph.sh @@ -272,6 +272,7 @@ GRAPH_BYTE_COMMIT_TREE=$GRAPH_COMMIT_DATA_OFFSET GRAPH_BYTE_COMMIT_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN)) GRAPH_BYTE_COMMIT_EXTRA_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 4)) GRAPH_BYTE_COMMIT_WRONG_PARENT=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 3)) +GRAPH_BYTE_COMMIT_GENERATION=$(($GRAPH_COMMIT_DATA_OFFSET + $HASH_LEN + 11)) # usage: corrupt_graph_and_verify # Manipulates the commit-graph file at the position @@ -366,4 +367,14 @@ test_expect_success 'detect wrong parent' '