Re: [PATCH v2 11/14] commit: integrate commit graph with commit parsing

Derrick Stolee Tue, 06 Feb 2018 06:53:22 -0800

On 2/1/2018 8:51 PM, Jonathan Tan wrote:

On Tue, 30 Jan 2018 16:39:40 -0500
Derrick Stolee <sto...@gmail.com> wrote:

+/* global storage */
+struct commit_graph *commit_graph = 0;

NULL, not 0.

+static int bsearch_graph(struct commit_graph *g, struct object_id *oid, 
uint32_t *pos)
+{
+       uint32_t last, first = 0;
+
+       if (oid->hash[0])
+               first = ntohl(*(uint32_t*)(g->chunk_oid_fanout + 4 * 
(oid->hash[0] - 1)));
+       last = ntohl(*(uint32_t*)(g->chunk_oid_fanout + 4 * oid->hash[0]));
+
+       while (first < last) {
+               uint32_t mid = first + (last - first) / 2;
+               const unsigned char *current;
+               int cmp;
+
+               current = g->chunk_oid_lookup + g->hdr->hash_len * mid;
+               cmp = hashcmp(oid->hash, current);
+               if (!cmp) {
+                       *pos = mid;
+                       return 1;
+               }
+               if (cmp > 0) {
+                       first = mid + 1;
+                       continue;
+               }
+               last = mid;
+       }
+
+       *pos = first;
+       return 0;
+}

This would be better in sha1-lookup.c, something like the reverse of commit
f1068efefe6d ("sha1_file: drop experimental GIT_USE_LOOKUP search",
2017-08-09), except that it can be done using a simple binary search.

I rebased my patch onto your binary search patch, so I'll use that inthe future.

+static int full_parse_commit(struct commit *item, struct commit_graph *g,
+                            uint32_t pos, const unsigned char *commit_data)
+{
+       struct object_id oid;
+       struct commit *new_parent;
+       uint32_t new_parent_pos;
+       uint32_t *parent_data_ptr;
+       uint64_t date_low, date_high;
+       struct commit_list **pptr;
+
+       item->object.parsed = 1;
+       item->graph_pos = pos;
+
+       hashcpy(oid.hash, commit_data);
+       item->tree = lookup_tree(&oid);
+
+       date_high = ntohl(*(uint32_t*)(commit_data + g->hdr->hash_len + 8)) & 
0x3;
+       date_low = ntohl(*(uint32_t*)(commit_data + g->hdr->hash_len + 12));
+       item->date = (timestamp_t)((date_high << 32) | date_low);
+
+       pptr = &item->parents;
+
+       new_parent_pos = ntohl(*(uint32_t*)(commit_data + g->hdr->hash_len));
+       if (new_parent_pos == GRAPH_PARENT_NONE)
+               return 1;
+       get_nth_commit_oid(g, new_parent_pos, &oid);
+       new_parent = lookup_commit(&oid);
+       if (new_parent) {
+               new_parent->graph_pos = new_parent_pos;
+               pptr = &commit_list_insert(new_parent, pptr)->next;
+       } else {
+               die("could not find commit %s", oid_to_hex(&oid));
+       }
+
+       new_parent_pos = ntohl(*(uint32_t*)(commit_data + g->hdr->hash_len + 
4));
+       if (new_parent_pos == GRAPH_PARENT_NONE)
+               return 1;
+       if (!(new_parent_pos & GRAPH_LARGE_EDGES_NEEDED)) {
+               get_nth_commit_oid(g, new_parent_pos, &oid);
+               new_parent = lookup_commit(&oid);
+               if (new_parent) {
+                       new_parent->graph_pos = new_parent_pos;
+                       pptr = &commit_list_insert(new_parent, pptr)->next;
+               } else
+                       die("could not find commit %s", oid_to_hex(&oid));
+               return 1;
+       }
+
+       parent_data_ptr = (uint32_t*)(g->chunk_large_edges + 4 * 
(new_parent_pos ^ GRAPH_LARGE_EDGES_NEEDED));
+       do {
+               new_parent_pos = ntohl(*parent_data_ptr);
+
+               get_nth_commit_oid(g, new_parent_pos & GRAPH_EDGE_LAST_MASK, 
&oid);
+               new_parent = lookup_commit(&oid);
+               if (new_parent) {
+                       new_parent->graph_pos = new_parent_pos & 
GRAPH_EDGE_LAST_MASK;
+                       pptr = &commit_list_insert(new_parent, pptr)->next;
+               } else
+                       die("could not find commit %s", oid_to_hex(&oid));
+               parent_data_ptr++;
+       } while (!(new_parent_pos & GRAPH_LAST_EDGE));
+
+       return 1;
+}

The part that converts <pointer to parent data> into <struct commit *>
seems to be duplicated 3 times. Refactor into a function?


Will do.

+/**
+ * Fill 'item' to contain all information that would be parsed by 
parse_commit_buffer.
+ */
+static int fill_commit_in_graph(struct commit *item, struct commit_graph *g, 
uint32_t pos)
+{
+       uint32_t new_parent_pos;
+       uint32_t *parent_data_ptr;
+       const unsigned char *commit_data = g->chunk_commit_data + 
(g->hdr->hash_len + 16) * pos;
+
+       new_parent_pos = ntohl(*(uint32_t*)(commit_data + g->hdr->hash_len));
+
+       if (new_parent_pos == GRAPH_PARENT_MISSING)
+               return 0;
+
+       if (new_parent_pos == GRAPH_PARENT_NONE)
+               return full_parse_commit(item, g, pos, commit_data);
+
+       new_parent_pos = ntohl(*(uint32_t*)(commit_data + g->hdr->hash_len + 
4));
+
+       if (new_parent_pos == GRAPH_PARENT_MISSING)
+               return 0;
+       if (!(new_parent_pos & GRAPH_LARGE_EDGES_NEEDED))
+               return full_parse_commit(item, g, pos, commit_data);
+
+       new_parent_pos = new_parent_pos ^ GRAPH_LARGE_EDGES_NEEDED;
+
+       if (new_parent_pos == GRAPH_PARENT_MISSING)
+               return 0;
+
+       parent_data_ptr = (uint32_t*)(g->chunk_large_edges + 4 * 
new_parent_pos);
+       do {
+               new_parent_pos = ntohl(*parent_data_ptr);
+
+               if ((new_parent_pos & GRAPH_EDGE_LAST_MASK) == 
GRAPH_PARENT_MISSING)
+                       return 0;
+
+               parent_data_ptr++;
+       } while (!(new_parent_pos & GRAPH_LAST_EDGE));
+
+       return full_parse_commit(item, g, pos, commit_data);
+}

This function seems to just check for GRAPH_PARENT_MISSING - could that
check be folded into full_parse_commit() instead? (Then
full_parse_commit can be renamed to fill_commit_in_graph.)

I'd rather not have a really long method, but I could make the two stepstheir own static methods (one for checking and one for full parsing) tomake it more clear that there are two steps here.

@@ -439,9 +656,24 @@ struct object_id *construct_commit_graph(const char 
*pack_dir)
        char *fname;
        struct commit_list *parent;

+ prepare_commit_graph();

+
        oids.num = 0;
        oids.size = 1024;
+
+       if (commit_graph && oids.size < commit_graph->num_commits)
+               oids.size = commit_graph->num_commits;
+
        ALLOC_ARRAY(oids.list, oids.size);
+
+       if (commit_graph) {
+               for (i = 0; i < commit_graph->num_commits; i++) {
+                       oids.list[i] = malloc(sizeof(struct object_id));
+                       get_nth_commit_oid(commit_graph, i, oids.list[i]);
+               }
+               oids.num = commit_graph->num_commits;
+       }
+
        for_each_packed_object(if_packed_commit_add_to_list, &oids, 0);
        QSORT(oids.list, oids.num, commit_compare);

This change auto-includes the commits that were in the existing graphinto the new graph.

@@ -525,6 +757,11 @@ struct object_id *construct_commit_graph(const char *pack_dir)

        hashcpy(f_hash->hash, final_hash);
        fname = get_commit_graph_filename_hash(pack_dir, f_hash);

+ if (commit_graph) {

+               close_commit_graph(commit_graph);
+               FREE_AND_NULL(commit_graph);
+       }
+
        if (rename(graph_name, fname))
                die("failed to rename %s to %s", graph_name, fname);

This change is necessary if we are going to use --delete-expired, as weneed to unmap the file before we can delete it. Perhaps it would bebetter to close the graph in the builtin instead so that relationship isclearer.

What is the relation of these changes to construct_commit_graph() to the
rest of the patch?


(answered above, since the two changes have different purposes)

diff --git a/commit-graph.h b/commit-graph.h
index 43eb0aec84..05ddbbe165 100644
--- a/commit-graph.h
+++ b/commit-graph.h
@@ -4,6 +4,18 @@
  #include "git-compat-util.h"
  #include "commit.h"

+/**

+ * Given a commit struct, try to fill the commit struct info, including:
+ *  1. tree object
+ *  2. date
+ *  3. parents.
+ *
+ * Returns 1 if and only if the commit was found in the packed graph.
+ *
+ * See parse_commit_buffer() for the fallback after this call.
+ */
+extern int parse_commit_in_graph(struct commit *item);
+
  extern struct object_id *get_graph_head_hash(const char *pack_dir,
                                             struct object_id *hash);
  extern char* get_commit_graph_filename_hash(const char *pack_dir,
@@ -40,7 +52,13 @@ extern struct commit_graph {

extern int close_commit_graph(struct commit_graph *g);-extern struct commit_graph *load_commit_graph_one(const char *graph_file, const char *pack_dir);

+extern struct commit_graph *load_commit_graph_one(const char *graph_file,
+                                                 const char *pack_dir);
+extern void prepare_commit_graph(void);
+
+extern struct object_id *get_nth_commit_oid(struct commit_graph *g,
+                                           uint32_t n,
+                                           struct object_id *oid);

extern struct object_id *construct_commit_graph(const char *pack_dir);

This header file now contains functions for reading the commit graph,
and functions for writing one. It seems to me that those are (and should
be) quite disjoint, so it might be better to separate them into two.

This header file provides a unified API surface for interacting withcommit graphs. I'm not a fan of how other write commands are hidden inthe builtins (like 'builtin/pack-objects.c' for writing packs). If thereis a better example of how this split has been done in the rootdirectory, I'm happy to consider it.

-int parse_commit_gently(struct commit *item, int quiet_on_missing)
+int parse_commit_internal(struct commit *item, int quiet_on_missing, int 
check_packed)
  {
        enum object_type type;
        void *buffer;
@@ -385,6 +386,8 @@ int parse_commit_gently(struct commit *item, int 
quiet_on_missing)
                return -1;
        if (item->object.parsed)
                return 0;
+       if (check_packed && parse_commit_in_graph(item))
+               return 0;
        buffer = read_sha1_file(item->object.oid.hash, &type, &size);
        if (!buffer)
                return quiet_on_missing ? -1 :
@@ -404,6 +407,11 @@ int parse_commit_gently(struct commit *item, int 
quiet_on_missing)
        return ret;
  }

+int parse_commit_gently(struct commit *item, int quiet_on_missing)

+{
+       return parse_commit_internal(item, quiet_on_missing, 1);
+}

Are you planning to use parse_commit_internal() from elsewhere? (It
doesn't seem to be the case, at least from this patch series.)

At one point I was using it, but I removed the one caller and forgot toclean up.

diff --git a/log-tree.c b/log-tree.c
index fca29d4799..156aed4541 100644
--- a/log-tree.c
+++ b/log-tree.c
@@ -659,8 +659,7 @@ void show_log(struct rev_info *opt)
                show_mergetag(opt, commit);
        }

- if (!get_cached_commit_buffer(commit, NULL))

-               return;
+       get_commit_buffer(commit, NULL);

This undoes an optimization that I discuss in my e-mail message here
[1]. If we decide to do this, it should at least be called out in the
commit message.

[1] 
https://public-inbox.org/git/b88725476d9f13ba4381d85e5fe049f6ef93f621.1506714999.git.jonathanta...@google.com/

I will call this out more clearly in my commit message next time. Myproblem with the existing code is that it doesn't just ignore the commitcontents but will actually not write a newline. I noticed during testing'git log --oneline' with the graph enabled and the output listed severalshort-shas in one line.

+_graph_git_two_modes() {

No need for the name to start with an underscore, I think.

+    git -c core.commitgraph=true $1 >output
+    git -c core.commitgraph=false $1 >expect
+    cmp output expect

Use test_cmp.

+}
+
+_graph_git_behavior() {
+    BRANCH=$1
+    COMPARE=$2
+    test_expect_success 'check normal git operations' \
+        '_graph_git_two_modes "log --oneline ${BRANCH}" &&
+         _graph_git_two_modes "log --topo-order ${BRANCH}" &&
+         _graph_git_two_modes "branch -vv" &&
+         _graph_git_two_modes "merge-base -a ${BRANCH} ${COMPARE}"'
+}

This makes it difficult to debug failing tests, since they're all named
the same. Better to just run the commands inline, and wrap the
invocations of _graph_git_behavior in an appropriately named
test_expect_success.

I'll add a parameter that adds a message to each test about the state ofthe repo and graph.


Thanks,
-Stolee

Re: [PATCH v2 11/14] commit: integrate commit graph with commit parsing

Reply via email to