On Fri, May 10, 2019 at 01:53:35PM -0700, Elijah Newren wrote:
> Automatic re-encoding of commit messages (and dropping of the encoding
> header) hurts attempts to do reversible history rewrites (e.g. sha1sum
> <-> sha256sum transitions, some subtree rewrites), and seems
> inconsistent with the general principle followed elsewhere in
> fast-export of requiring explicit user requests to modify the output
> (e.g. --signed-tags=strip, --tag-of-filtered-object=rewrite). Add a
> --reencode flag that the user can use to specify, and like other
> fast-export flags, default it to 'abort'.
>
> Signed-off-by: Elijah Newren <[email protected]>
> ---
> builtin/fast-export.c | 35 ++++++++++++++++++++++++++++++++---
> t/t9350-fast-export.sh | 37 ++++++++++++++++++++++++++++++++++---
> 2 files changed, 66 insertions(+), 6 deletions(-)
>
> diff --git a/builtin/fast-export.c b/builtin/fast-export.c
> index 66331fa401..43cc52331c 100644
> --- a/builtin/fast-export.c
> +++ b/builtin/fast-export.c
> @@ -33,6 +33,7 @@ static const char *fast_export_usage[] = {
> static int progress;
> static enum { SIGNED_TAG_ABORT, VERBATIM, WARN, WARN_STRIP, STRIP }
> signed_tag_mode = SIGNED_TAG_ABORT;
> static enum { TAG_FILTERING_ABORT, DROP, REWRITE } tag_of_filtered_mode =
> TAG_FILTERING_ABORT;
> +static enum { REENCODE_ABORT, REENCODE_PLEASE, REENCODE_NEVER }
> reencode_mode = REENCODE_ABORT;
> static int fake_missing_tagger;
> static int use_done_feature;
> static int no_data;
> @@ -77,6 +78,20 @@ static int parse_opt_tag_of_filtered_mode(const struct
> option *opt,
> return 0;
> }
>
> +static int parse_opt_reencode_mode(const struct option *opt,
> + const char *arg, int unset)
> +{
This one is good:
> + if (unset || !strcmp(arg, "abort"))
> + reencode_mode = REENCODE_ABORT;
But here: does it make sense to use REENCODE_YES/NO to be more consistant ?
> + else if (!strcmp(arg, "yes"))
> + reencode_mode = REENCODE_PLEASE;
> + else if (!strcmp(arg, "no"))
> + reencode_mode = REENCODE_NEVER;
> + else
> + return error("Unknown reencoding mode: %s", arg);
> + return 0;
> +}
> +
> static struct decoration idnums;
> static uint32_t last_idnum;
>
> @@ -633,10 +648,21 @@ static void handle_commit(struct commit *commit, struct
> rev_info *rev,
> }
>
> mark_next_object(&commit->object);
> - if (anonymize)
> + if (anonymize) {
> reencoded = anonymize_commit_message(message);
> - else if (!is_encoding_utf8(encoding))
> - reencoded = reencode_string(message, "UTF-8", encoding);
> + } else if (encoding) {
> + switch(reencode_mode) {
> + case REENCODE_PLEASE:
> + reencoded = reencode_string(message, "UTF-8", encoding);
> + break;
> + case REENCODE_NEVER:
> + break;
> + case REENCODE_ABORT:
> + die("Encountered commit-specific encoding %s in commit "
> + "%s; use --reencode=<mode> to handle it",
Should we be more helpfull and say !use --reencode=[yes|no] to handle it ?
> + encoding, oid_to_hex(&commit->object.oid));
> + }
> + }
> if (!commit->parents)
> printf("reset %s\n", refname);
> printf("commit %s\nmark :%"PRIu32"\n", refname, last_idnum);
> @@ -1091,6 +1117,9 @@ int cmd_fast_export(int argc, const char **argv, const
> char *prefix)
> OPT_CALLBACK(0, "tag-of-filtered-object",
> &tag_of_filtered_mode, N_("mode"),
> N_("select handling of tags that tag filtered
> objects"),
> parse_opt_tag_of_filtered_mode),
> + OPT_CALLBACK(0, "reencode", &reencode_mode, N_("mode"),
> + N_("select handling of commit messages in an
> alternate encoding"),
> + parse_opt_reencode_mode),
> OPT_STRING(0, "export-marks", &export_filename, N_("file"),
> N_("Dump marks to this file")),
> OPT_STRING(0, "import-marks", &import_filename, N_("file"),
> diff --git a/t/t9350-fast-export.sh b/t/t9350-fast-export.sh
> index fa124f4842..3cf4c7fc0c 100755
> --- a/t/t9350-fast-export.sh
> +++ b/t/t9350-fast-export.sh
> @@ -94,14 +94,14 @@ test_expect_success 'fast-export --show-original-ids |
> git fast-import' '
> test $MUSS = $(git rev-parse --verify refs/tags/muss)
> '
>
> -test_expect_success 'iso-8859-7' '
> +test_expect_success 'reencoding iso-8859-7' '
>
> test_when_finished "git reset --hard HEAD~1" &&
> test_config i18n.commitencoding iso-8859-7 &&
> test_tick &&
> echo rosten >file &&
> git commit -s -F
> "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
> - git fast-export wer^..wer >iso-8859-7.fi &&
> + git fast-export --reencode=yes wer^..wer >iso-8859-7.fi &&
> sed "s/wer/i18n/" iso-8859-7.fi |
> (cd new &&
> git fast-import &&
> @@ -120,13 +120,44 @@ test_expect_success 'iso-8859-7' '
> ! grep ^encoding actual)
> '
>
> +test_expect_success 'aborting on iso-8859-7' '
> +
> + test_when_finished "git reset --hard HEAD~1" &&
> + test_config i18n.commitencoding iso-8859-7 &&
> + echo rosten >file &&
> + git commit -s -F
> "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
> + test_must_fail git fast-export --reencode=abort wer^..wer >iso-8859-7.fi
> +'
> +
> +test_expect_success 'preserving iso-8859-7' '
> +
> + test_when_finished "git reset --hard HEAD~1" &&
> + test_config i18n.commitencoding iso-8859-7 &&
> + echo rosten >file &&
> + git commit -s -F
> "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
> + git fast-export --reencode=no wer^..wer >iso-8859-7.fi &&
> + sed "s/wer/i18n-no-recoding/" iso-8859-7.fi |
> + (cd new &&
> + git fast-import &&
> + # The commit object, if not re-encoded, is 240 bytes.
> + # Removing the "encoding iso-8859-7\n" header would drops 20
> + # bytes. Re-encoding the Pi character from \xF0 in
> + # iso-8859-7 to \xCF\x80 in utf-8 would add a byte. I would
> + # grep for the # specific bytes, but Windows lamely does not
This is somewhat unclear to me. What does Windows not allow ?
> + # allow that, so just search for the expected size.
> + test 240 -eq "$(git cat-file -s i18n-no-recoding)" &&
> + # Also make sure the commit has the "encoding" header
> + git cat-file commit i18n-no-recoding >actual &&
> + grep ^encoding actual)
> +'
> +
> test_expect_success 'encoding preserved if reencoding fails' '
>
> test_when_finished "git reset --hard HEAD~1" &&
> test_config i18n.commitencoding iso-8859-7 &&
> echo rosten >file &&
> git commit -s -F
> "$TEST_DIRECTORY/t9350/broken-iso-8859-7-commit-message.txt" file &&
> - git fast-export wer^..wer >iso-8859-7.fi &&
> + git fast-export --reencode=yes wer^..wer >iso-8859-7.fi &&
> sed "s/wer/i18n-invalid/" iso-8859-7.fi |
> (cd new &&
> git fast-import &&
> --
> 2.21.0.782.g2063122293
>