From: Torsten Bögershausen
When a UTF-16 file is committed and later changed, `git diff` shows
"Binary files XX and YY differ".
When the user wants a diff in UTF-8, a textconv needs to be specified
in .gitattributes and the textconv must be configured.
A more user-friendly diff can be produced for UTF-16 if
- the user did not use `git diff --binary`
- the blob is identified as binary
- the blob has a UTF-16 BOM
- the blob can be converted into UTF-8
Enhance the diff machinery to auto-detect UTF-16 blobs and show them
as UTF-8, unless the user specifies `git diff --binary` which creates
a binary diff.
Signed-off-by: Torsten Bögershausen
---
diff.c | 43 -
diffcore.h | 3 ++
t/t4066-diff-encoding.sh | 98
utf8.h | 11 ++
4 files changed, 153 insertions(+), 2 deletions(-)
create mode 100755 t/t4066-diff-encoding.sh
diff --git a/diff.c b/diff.c
index fb22b19f09..51831ee94d 100644
--- a/diff.c
+++ b/diff.c
@@ -3192,6 +3192,10 @@ static void builtin_diff(const char *name_a,
strbuf_reset(&header);
}
+ if (one && one->reencoded_from_utf16)
+ strbuf_addf(&header, "a is converted to UTF-8 from
UTF-16\n");
+ if (two && two->reencoded_from_utf16)
+ strbuf_addf(&header, "b is converted to UTF-8 from
UTF-16\n");
mf1.size = fill_textconv(textconv_one, one, &mf1.ptr);
mf2.size = fill_textconv(textconv_two, two, &mf2.ptr);
@@ -3611,8 +3615,25 @@ int diff_populate_filespec(struct diff_filespec *s,
unsigned int flags)
s->size = size;
s->should_free = 1;
}
- }
- else {
+ if (!s->binary && buffer_is_binary(s->data, s->size) &&
+ buffer_has_utf16_bom(s->data, s->size)) {
+ int outsz = 0;
+ char *outbuf;
+ outbuf = reencode_string_len(s->data, (int)s->size,
+"UTF-8", "UTF-16", &outsz);
+ if (outbuf) {
+ if (s->should_free)
+ free(s->data);
+ if (s->should_munmap)
+ munmap(s->data, s->size);
+ s->should_munmap = 0;
+ s->data = outbuf;
+ s->size = outsz;
+ s->reencoded_from_utf16 = 1;
+ s->should_free = 1;
+ }
+ }
+ } else {
enum object_type type;
if (size_only || (flags & CHECK_BINARY)) {
type = sha1_object_info(s->oid.hash, &s->size);
@@ -3629,6 +3650,19 @@ int diff_populate_filespec(struct diff_filespec *s,
unsigned int flags)
s->data = read_sha1_file(s->oid.hash, &type, &s->size);
if (!s->data)
die("unable to read %s", oid_to_hex(&s->oid));
+ if (!s->binary && buffer_is_binary(s->data, s->size) &&
+ buffer_has_utf16_bom(s->data, s->size)) {
+ int outsz = 0;
+ char *buf;
+ buf = reencode_string_len(s->data, (int)s->size,
+ "UTF-8", "UTF-16", &outsz);
+ if (buf) {
+ free(s->data);
+ s->data = buf;
+ s->size = outsz;
+ s->reencoded_from_utf16 = 1;
+ }
+ }
s->should_free = 1;
}
return 0;
@@ -5695,6 +5729,10 @@ static int diff_filespec_is_identical(struct
diff_filespec *one,
static int diff_filespec_check_stat_unmatch(struct diff_filepair *p)
{
+ if (p->binary) {
+ p->one->binary = 1;
+ p->two->binary = 1;
+ }
if (p->done_skip_stat_unmatch)
return p->skip_stat_unmatch_result;
@@ -5735,6 +5773,7 @@ static void diffcore_skip_stat_unmatch(struct
diff_options *diffopt)
for (i = 0; i < q->nr; i++) {
struct diff_filepair *p = q->queue[i];
+ p->binary = diffopt->flags.binary;
if (diff_filespec_check_stat_unmatch(p))
diff_q(&outq, p);
else {
diff --git a/diffcore.h b/diffcore.h
index a30da161da..3cd97bb93b 100644
--- a/diffcore.h
+++ b/diffcore.h
@@ -47,6 +47,8 @@ struct diff_filespec {
unsigned has_more_entries : 1; /* only appear in combined diff */
/* data should be considered "binary"; -1 means "don't know yet" */
signed int is_binary : 2;
+