On Sun, Sep 02 2018, Jeff King wrote:

> On Sun, Sep 02, 2018 at 03:55:28AM -0400, Jeff King wrote:
>
>> I still think the more interesting long-term thing here is to reuse the
>> pack verification from index-pack, which actually hashes as it does the
>> per-object countup.
>>
>> That code isn't lib-ified enough to be run in process, but I think the
>> patch below should give similar behavior to what fsck currently does.
>> We'd need to tell index-pack to use our fsck.* config for its checks, I
>> imagine. The progress here is still per-pack, but I think we could pass
>> in sufficient information to have it do one continuous meter across all
>> of the packs (see the in-code comment).
>>
>> And it makes the result multi-threaded, and lets us drop a bunch of
>> duplicate code.
>
> Here's a little polish on that to pass enough progress data to
> index-pack to let it have one nice continuous meter. I'm not sure if
> it's worth all the hackery or not. The dual-meter that index-pack
> generally uses is actually more informative (since it meters the initial
> pass through the pack and the delta reconstruction separately).

Thanks!

> And there are definitely a few nasty bits (like the way the progress is
> ended). I'm not planning on taking this further for now, but maybe
> you or somebody can find it interesting or useful.

I think it would be really nice if this were taken further. Using my
perf test in
https://public-inbox.org/git/20180903144928.30691-7-ava...@gmail.com/T/#u
I get these results:

    $ GIT_PERF_LARGE_REPO=/home/aearnfjord/g/linux GIT_PERF_REPEAT_COUNT=5 
GIT_PERF_MAKE_OPTS='-j56 CFLAGS="-O3"' ./run HEAD~ HEAD p1450-fsck.sh
    [...]
    Test           HEAD~                 HEAD
    ----------------------------------------------------------------
    1450.1: fsck   384.18(381.63+2.53)   301.52(508.28+38.34) -21.5%


I.e. this gives a 20% speedup, although of course some of that might be
because some of this might be skipping too much work, but looks really
promising.

I don't know if I'll have time for it, and besides, you wrote the code
so you already understand it *hint* *hint* :)

> ---
>  builtin/fsck.c       |  59 +++++++++------
>  builtin/index-pack.c |  43 ++++++++++-
>  pack-check.c         | 142 -----------------------------------
>  pack.h               |   1 -
>  4 files changed, 75 insertions(+), 170 deletions(-)
>
> diff --git a/builtin/fsck.c b/builtin/fsck.c
> index 250f5af118..2d774ea2e5 100644
> --- a/builtin/fsck.c
> +++ b/builtin/fsck.c
> @@ -386,25 +386,6 @@ static int fsck_obj(struct object *obj, void *buffer, 
> unsigned long size)
>       return err;
>  }
>
> -static int fsck_obj_buffer(const struct object_id *oid, enum object_type 
> type,
> -                        unsigned long size, void *buffer, int *eaten)
> -{
> -     /*
> -      * Note, buffer may be NULL if type is OBJ_BLOB. See
> -      * verify_packfile(), data_valid variable for details.
> -      */
> -     struct object *obj;
> -     obj = parse_object_buffer(the_repository, oid, type, size, buffer,
> -                               eaten);
> -     if (!obj) {
> -             errors_found |= ERROR_OBJECT;
> -             return error("%s: object corrupt or missing", oid_to_hex(oid));
> -     }
> -     obj->flags &= ~(REACHABLE | SEEN);
> -     obj->flags |= HAS_OBJ;
> -     return fsck_obj(obj, buffer, size);
> -}
> -
>  static int default_refs;
>
>  static void fsck_handle_reflog_oid(const char *refname, struct object_id 
> *oid,
> @@ -662,6 +643,32 @@ static int mark_packed_for_connectivity(const struct 
> object_id *oid,
>       return 0;
>  }
>
> +static int verify_pack(struct packed_git *p,
> +                    unsigned long count, unsigned long total)
> +{
> +     struct child_process index_pack = CHILD_PROCESS_INIT;
> +
> +     if (is_pack_valid(p) < 0)
> +             return -1;
> +     for_each_object_in_pack(p, mark_packed_for_connectivity, NULL, 0);
> +
> +     index_pack.git_cmd = 1;
> +     argv_array_pushl(&index_pack.args,
> +                      "index-pack",
> +                      "--verify-fsck",
> +                      NULL);
> +     if (show_progress)
> +             argv_array_pushf(&index_pack.args,
> +                              "--fsck-progress=%lu,%lu,Checking pack %s",
> +                              count, total, sha1_to_hex(p->sha1));
> +     argv_array_push(&index_pack.args, p->pack_name);
> +
> +     if (run_command(&index_pack))
> +             return -1;
> +
> +     return 0;
> +}
> +
>  static char const * const fsck_usage[] = {
>       N_("git fsck [<options>] [<object>...]"),
>       NULL
> @@ -737,7 +744,6 @@ int cmd_fsck(int argc, const char **argv, const char 
> *prefix)
>               if (check_full) {
>                       struct packed_git *p;
>                       uint32_t total = 0, count = 0;
> -                     struct progress *progress = NULL;
>
>                       if (show_progress) {
>                               for (p = get_packed_git(the_repository); p;
> @@ -746,18 +752,21 @@ int cmd_fsck(int argc, const char **argv, const char 
> *prefix)
>                                               continue;
>                                       total += p->num_objects;
>                               }
> -
> -                             progress = start_progress(_("Checking 
> objects"), total);
>                       }
>                       for (p = get_packed_git(the_repository); p;
>                            p = p->next) {
>                               /* verify gives error messages itself */
> -                             if (verify_pack(p, fsck_obj_buffer,
> -                                             progress, count))
> +                             if (verify_pack(p, count, total))
>                                       errors_found |= ERROR_PACK;
>                               count += p->num_objects;
>                       }
> -                     stop_progress(&progress);
> +                     /*
> +                      * Our child index-pack never calls stop_progress(),
> +                      * which lets each child appear on the same line. Now
> +                      * that we've finished all of them, we have to tie that
> +                      * off.
> +                      */
> +                     fprintf(stderr, "\n");
>               }
>
>               if (fsck_finish(&fsck_obj_options))
> diff --git a/builtin/index-pack.c b/builtin/index-pack.c
> index 9582ead950..d03ec7bb89 100644
> --- a/builtin/index-pack.c
> +++ b/builtin/index-pack.c
> @@ -85,6 +85,13 @@ static int show_resolving_progress;
>  static int show_stat;
>  static int check_self_contained_and_connected;
>
> +static int verify_fsck_mode;
> +/* unlike our normal 2-part progress, this counts up only total objects */
> +struct progress *fsck_progress;
> +static uint32_t fsck_progress_cur;
> +static uint32_t fsck_progress_total;
> +static const char *fsck_progress_title;
> +
>  static struct progress *progress;
>
>  /* We always read in 4kB chunks. */
> @@ -1100,6 +1107,8 @@ static void *threaded_second_pass(void *data)
>               int i;
>               counter_lock();
>               display_progress(progress, nr_resolved_deltas);
> +             display_progress(fsck_progress,
> +                              fsck_progress_cur + nr_resolved_deltas);
>               counter_unlock();
>               work_lock();
>               while (nr_dispatched < nr_objects &&
> @@ -1145,18 +1154,23 @@ static void parse_pack_objects(unsigned char *hash)
>                       nr_ofs_deltas++;
>                       ofs_delta->obj_no = i;
>                       ofs_delta++;
> +                     /* no fsck_progress; we only count it when we've 
> resolved the delta */
>               } else if (obj->type == OBJ_REF_DELTA) {
>                       ALLOC_GROW(ref_deltas, nr_ref_deltas + 1, 
> ref_deltas_alloc);
>                       oidcpy(&ref_deltas[nr_ref_deltas].oid, &ref_delta_oid);
>                       ref_deltas[nr_ref_deltas].obj_no = i;
>                       nr_ref_deltas++;
> +                     /* no fsck_progress; we only count it when we've 
> resolved the delta */
>               } else if (!data) {
>                       /* large blobs, check later */
>                       obj->real_type = OBJ_BAD;
>                       nr_delays++;
> -             } else
> +                     display_progress(fsck_progress, ++fsck_progress_cur);
> +             } else {
>                       sha1_object(data, NULL, obj->size, obj->type,
>                                   &obj->idx.oid);
> +                     display_progress(fsck_progress, ++fsck_progress_cur);
> +             }
>               free(data);
>               display_progress(progress, i+1);
>       }
> @@ -1238,6 +1252,8 @@ static void resolve_deltas(void)
>                       continue;
>               resolve_base(obj);
>               display_progress(progress, nr_resolved_deltas);
> +             display_progress(fsck_progress,
> +                              fsck_progress_cur + nr_resolved_deltas);
>       }
>  }
>
> @@ -1391,6 +1407,8 @@ static void fix_unresolved_deltas(struct hashfile *f)
>                                       base_obj->data, base_obj->size, type);
>               find_unresolved_deltas(base_obj);
>               display_progress(progress, nr_resolved_deltas);
> +             display_progress(fsck_progress,
> +                              fsck_progress_cur + nr_resolved_deltas);
>       }
>       free(sorted_by_pos);
>  }
> @@ -1714,6 +1732,11 @@ int cmd_index_pack(int argc, const char **argv, const 
> char *prefix)
>                               verify = 1;
>                               show_stat = 1;
>                               stat_only = 1;
> +                     } else if (!strcmp(arg, "--verify-fsck")) {
> +                             verify_fsck_mode = 1;
> +                             verify = 1;
> +                             strict = 1;
> +                             do_fsck_object = 1;
>                       } else if (skip_to_optional_arg(arg, "--keep", 
> &keep_msg)) {
>                               ; /* nothing to do */
>                       } else if (skip_to_optional_arg(arg, "--promisor", 
> &promisor_msg)) {
> @@ -1746,6 +1769,15 @@ int cmd_index_pack(int argc, const char **argv, const 
> char *prefix)
>                               verbose = 1;
>                       } else if (!strcmp(arg, "--show-resolving-progress")) {
>                               show_resolving_progress = 1;
> +                     } else if (skip_prefix(arg, "--fsck-progress=", &arg)) {
> +                             char *end;
> +                             fsck_progress_cur = strtoul(arg, &end, 10);
> +                             if (*end++ != ',')
> +                                     die("bad fsck-progress arg: %s", arg);
> +                             fsck_progress_total = strtoul(end, &end, 10);
> +                             if (*end++ != ',')
> +                                     die("bad fsck-progress arg: %s", arg);
> +                             fsck_progress_title = end;
>                       } else if (!strcmp(arg, "--report-end-of-input")) {
>                               report_end_of_input = 1;
>                       } else if (!strcmp(arg, "-o")) {
> @@ -1800,6 +1832,10 @@ int cmd_index_pack(int argc, const char **argv, const 
> char *prefix)
>       }
>  #endif
>
> +     if (fsck_progress_total)
> +             fsck_progress = start_progress(fsck_progress_title,
> +                                            fsck_progress_total);
> +
>       curr_pack = open_pack_file(pack_name);
>       parse_pack_header();
>       objects = xcalloc(st_add(nr_objects, 1), sizeof(struct object_entry));
> @@ -1833,8 +1869,11 @@ int cmd_index_pack(int argc, const char **argv, const 
> char *prefix)
>       else
>               close(input_fd);
>
> -     if (do_fsck_object && fsck_finish(&fsck_options))
> +     if (do_fsck_object && fsck_finish(&fsck_options)) {
> +             if (verify_fsck_mode)
> +                     return 1; /* quietly return error */
>               die(_("fsck error in pack objects"));
> +     }
>
>       free(objects);
>       strbuf_release(&index_name_buf);
> diff --git a/pack-check.c b/pack-check.c
> index d3a57df34f..ea1457ce53 100644
> --- a/pack-check.c
> +++ b/pack-check.c
> @@ -15,17 +15,6 @@ struct idx_entry {
>       unsigned int nr;
>  };
>
> -static int compare_entries(const void *e1, const void *e2)
> -{
> -     const struct idx_entry *entry1 = e1;
> -     const struct idx_entry *entry2 = e2;
> -     if (entry1->offset < entry2->offset)
> -             return -1;
> -     if (entry1->offset > entry2->offset)
> -             return 1;
> -     return 0;
> -}
> -
>  int check_pack_crc(struct packed_git *p, struct pack_window **w_curs,
>                  off_t offset, off_t len, unsigned int nr)
>  {
> @@ -48,121 +37,6 @@ int check_pack_crc(struct packed_git *p, struct 
> pack_window **w_curs,
>       return data_crc != ntohl(*index_crc);
>  }
>
> -static int verify_packfile(struct packed_git *p,
> -                        struct pack_window **w_curs,
> -                        verify_fn fn,
> -                        struct progress *progress, uint32_t base_count)
> -
> -{
> -     off_t index_size = p->index_size;
> -     const unsigned char *index_base = p->index_data;
> -     git_hash_ctx ctx;
> -     unsigned char hash[GIT_MAX_RAWSZ], *pack_sig;
> -     off_t offset = 0, pack_sig_ofs = 0;
> -     uint32_t nr_objects, i;
> -     int err = 0;
> -     struct idx_entry *entries;
> -
> -     if (!is_pack_valid(p))
> -             return error("packfile %s cannot be accessed", p->pack_name);
> -
> -     the_hash_algo->init_fn(&ctx);
> -     do {
> -             unsigned long remaining;
> -             unsigned char *in = use_pack(p, w_curs, offset, &remaining);
> -             offset += remaining;
> -             if (!pack_sig_ofs)
> -                     pack_sig_ofs = p->pack_size - the_hash_algo->rawsz;
> -             if (offset > pack_sig_ofs)
> -                     remaining -= (unsigned int)(offset - pack_sig_ofs);
> -             the_hash_algo->update_fn(&ctx, in, remaining);
> -     } while (offset < pack_sig_ofs);
> -     the_hash_algo->final_fn(hash, &ctx);
> -     pack_sig = use_pack(p, w_curs, pack_sig_ofs, NULL);
> -     if (hashcmp(hash, pack_sig))
> -             err = error("%s pack checksum mismatch",
> -                         p->pack_name);
> -     if (hashcmp(index_base + index_size - the_hash_algo->hexsz, pack_sig))
> -             err = error("%s pack checksum does not match its index",
> -                         p->pack_name);
> -     unuse_pack(w_curs);
> -
> -     /* Make sure everything reachable from idx is valid.  Since we
> -      * have verified that nr_objects matches between idx and pack,
> -      * we do not do scan-streaming check on the pack file.
> -      */
> -     nr_objects = p->num_objects;
> -     ALLOC_ARRAY(entries, nr_objects + 1);
> -     entries[nr_objects].offset = pack_sig_ofs;
> -     /* first sort entries by pack offset, since unpacking them is more 
> efficient that way */
> -     for (i = 0; i < nr_objects; i++) {
> -             entries[i].oid.hash = nth_packed_object_sha1(p, i);
> -             if (!entries[i].oid.hash)
> -                     die("internal error pack-check nth-packed-object");
> -             entries[i].offset = nth_packed_object_offset(p, i);
> -             entries[i].nr = i;
> -     }
> -     QSORT(entries, nr_objects, compare_entries);
> -
> -     for (i = 0; i < nr_objects; i++) {
> -             void *data;
> -             enum object_type type;
> -             unsigned long size;
> -             off_t curpos;
> -             int data_valid;
> -
> -             if (p->index_version > 1) {
> -                     off_t offset = entries[i].offset;
> -                     off_t len = entries[i+1].offset - offset;
> -                     unsigned int nr = entries[i].nr;
> -                     if (check_pack_crc(p, w_curs, offset, len, nr))
> -                             err = error("index CRC mismatch for object %s "
> -                                         "from %s at offset %"PRIuMAX"",
> -                                         oid_to_hex(entries[i].oid.oid),
> -                                         p->pack_name, (uintmax_t)offset);
> -             }
> -
> -             curpos = entries[i].offset;
> -             type = unpack_object_header(p, w_curs, &curpos, &size);
> -             unuse_pack(w_curs);
> -
> -             if (type == OBJ_BLOB && big_file_threshold <= size) {
> -                     /*
> -                      * Let check_object_signature() check it with
> -                      * the streaming interface; no point slurping
> -                      * the data in-core only to discard.
> -                      */
> -                     data = NULL;
> -                     data_valid = 0;
> -             } else {
> -                     data = unpack_entry(the_repository, p, 
> entries[i].offset, &type, &size);
> -                     data_valid = 1;
> -             }
> -
> -             if (data_valid && !data)
> -                     err = error("cannot unpack %s from %s at offset 
> %"PRIuMAX"",
> -                                 oid_to_hex(entries[i].oid.oid), 
> p->pack_name,
> -                                 (uintmax_t)entries[i].offset);
> -             else if (check_object_signature(entries[i].oid.oid, data, size, 
> type_name(type)))
> -                     err = error("packed %s from %s is corrupt",
> -                                 oid_to_hex(entries[i].oid.oid), 
> p->pack_name);
> -             else if (fn) {
> -                     int eaten = 0;
> -                     err |= fn(entries[i].oid.oid, type, size, data, &eaten);
> -                     if (eaten)
> -                             data = NULL;
> -             }
> -             if (((base_count + i) & 1023) == 0)
> -                     display_progress(progress, base_count + i);
> -             free(data);
> -
> -     }
> -     display_progress(progress, base_count + i);
> -     free(entries);
> -
> -     return err;
> -}
> -
>  int verify_pack_index(struct packed_git *p)
>  {
>       off_t index_size;
> @@ -185,19 +59,3 @@ int verify_pack_index(struct packed_git *p)
>                           p->pack_name);
>       return err;
>  }
> -
> -int verify_pack(struct packed_git *p, verify_fn fn,
> -             struct progress *progress, uint32_t base_count)
> -{
> -     int err = 0;
> -     struct pack_window *w_curs = NULL;
> -
> -     err |= verify_pack_index(p);
> -     if (!p->index_data)
> -             return -1;
> -
> -     err |= verify_packfile(p, &w_curs, fn, progress, base_count);
> -     unuse_pack(&w_curs);
> -
> -     return err;
> -}
> diff --git a/pack.h b/pack.h
> index 34a9d458b4..0d346c5e31 100644
> --- a/pack.h
> +++ b/pack.h
> @@ -80,7 +80,6 @@ typedef int (*verify_fn)(const struct object_id *, enum 
> object_type, unsigned lo
>  extern const char *write_idx_file(const char *index_name, struct 
> pack_idx_entry **objects, int nr_objects, const struct pack_idx_option *, 
> const unsigned char *sha1);
>  extern int check_pack_crc(struct packed_git *p, struct pack_window **w_curs, 
> off_t offset, off_t len, unsigned int nr);
>  extern int verify_pack_index(struct packed_git *);
> -extern int verify_pack(struct packed_git *, verify_fn fn, struct progress *, 
> uint32_t);
>  extern off_t write_pack_header(struct hashfile *f, uint32_t);
>  extern void fixup_pack_header_footer(int, unsigned char *, const char *, 
> uint32_t, unsigned char *, off_t);
>  extern char *index_pack_lockfile(int fd);

Reply via email to