Re: [PATCH/RFC 1/1] Auto diff of UTF-16 files in UTF-8

2018-02-27 Thread Jeff King
On Mon, Feb 26, 2018 at 06:27:06PM +0100, tbo...@web.de wrote:

> @@ -3611,8 +3615,25 @@ int diff_populate_filespec(struct diff_filespec *s, 
> unsigned int flags)
>   s->size = size;
>   s->should_free = 1;
>   }
> - }
> - else {
> + if (!s->binary && buffer_is_binary(s->data, s->size) &&
> + buffer_has_utf16_bom(s->data, s->size)) {
> + int outsz = 0;
> + char *outbuf;
> + outbuf = reencode_string_len(s->data, (int)s->size,
> +  "UTF-8", "UTF-16", );
> + if (outbuf) {
> + if (s->should_free)
> + free(s->data);
> + if (s->should_munmap)
> + munmap(s->data, s->size);
> + s->should_munmap = 0;
> + s->data = outbuf;
> + s->size = outsz;
> + s->reencoded_from_utf16 = 1;
> + s->should_free = 1;
> + }
> + }
> + } else {

I don't think it makes sense to do the conversion deep inside
diff_populate_filespec(), because it will kick in much more than you'd
want (e.g., "format-patch | am" would stop working with this patch, I
think).

I think you'd want to hook this in at the same level as fill_textconv().
In fact, one way to do it would be to have the get_textconv() stack just
fill in a special driver that does the auto-detection. This is similar
to my earlier patch, but it avoids overriding the user-facing config for
non-textconv things (and naturally any actual configured textconv filter
would override the auto-detection).

-Peff


Re: [PATCH/RFC 1/1] Auto diff of UTF-16 files in UTF-8

2018-02-26 Thread Peter Krefting

tbo...@web.de:


+static inline int buffer_has_utf16_bom(const void *buf, size_t len) {
+  const unsigned char *text = (unsigned char *)buf;
+  if (!text ||  len < 2)
+return 0;
+  if (text[0] == 0xff && text[1] == 0xfe)
+return 1;
+  if (text[0] == 0xfe && text[1] == 0xff)
+return 1;
+  return 0;
+}


This would also match a UTF-32LE BOM, not that I think anyone would 
actually use that for text files, but it might be worth adding a test 
for that, just in case?


  if (text[0] == 0xff && text[1] == 0xfe)
return len < 4 || (text[2] != 0 && text[3] != 0);

--
\\// Peter - http://www.softwolves.pp.se/


[PATCH/RFC 1/1] Auto diff of UTF-16 files in UTF-8

2018-02-26 Thread tboegi
From: Torsten Bögershausen 

When an UTF-16 file is commited and later changed, `git diff` shows
"Binary files XX and YY differ".

When the user wants a diff in UTF-8, a textconv needs to be specified
in .gitattributes and the textconv must be configured.

A more user-friendly diff can be produced for UTF-16 if
- the user did not use `git diff --binary`
- the blob is identified as binary
- the blob has an UTF-16 BOM
- the blob can be converted into UTF-8

Enhance the diff machinery to auto-detect UTF-16 blobs and show them
as UTF-8, unless the user specifies `git diff --binary` which creates
a binary diff.

Signed-off-by: Torsten Bögershausen 
---
 diff.c   | 43 -
 diffcore.h   |  3 ++
 t/t4066-diff-encoding.sh | 98 
 utf8.h   | 11 ++
 4 files changed, 153 insertions(+), 2 deletions(-)
 create mode 100755 t/t4066-diff-encoding.sh

diff --git a/diff.c b/diff.c
index fb22b19f09..51831ee94d 100644
--- a/diff.c
+++ b/diff.c
@@ -3192,6 +3192,10 @@ static void builtin_diff(const char *name_a,
strbuf_reset();
}
 
+   if (one && one->reencoded_from_utf16)
+   strbuf_addf(, "a is converted to UTF-8 from 
UTF-16\n");
+   if (two && two->reencoded_from_utf16)
+   strbuf_addf(, "b is converted to UTF-8 from 
UTF-16\n");
mf1.size = fill_textconv(textconv_one, one, );
mf2.size = fill_textconv(textconv_two, two, );
 
@@ -3611,8 +3615,25 @@ int diff_populate_filespec(struct diff_filespec *s, 
unsigned int flags)
s->size = size;
s->should_free = 1;
}
-   }
-   else {
+   if (!s->binary && buffer_is_binary(s->data, s->size) &&
+   buffer_has_utf16_bom(s->data, s->size)) {
+   int outsz = 0;
+   char *outbuf;
+   outbuf = reencode_string_len(s->data, (int)s->size,
+"UTF-8", "UTF-16", );
+   if (outbuf) {
+   if (s->should_free)
+   free(s->data);
+   if (s->should_munmap)
+   munmap(s->data, s->size);
+   s->should_munmap = 0;
+   s->data = outbuf;
+   s->size = outsz;
+   s->reencoded_from_utf16 = 1;
+   s->should_free = 1;
+   }
+   }
+   } else {
enum object_type type;
if (size_only || (flags & CHECK_BINARY)) {
type = sha1_object_info(s->oid.hash, >size);
@@ -3629,6 +3650,19 @@ int diff_populate_filespec(struct diff_filespec *s, 
unsigned int flags)
s->data = read_sha1_file(s->oid.hash, , >size);
if (!s->data)
die("unable to read %s", oid_to_hex(>oid));
+   if (!s->binary && buffer_is_binary(s->data, s->size) &&
+   buffer_has_utf16_bom(s->data, s->size)) {
+   int outsz = 0;
+   char *buf;
+   buf = reencode_string_len(s->data, (int)s->size,
+ "UTF-8", "UTF-16", );
+   if (buf) {
+   free(s->data);
+   s->data = buf;
+   s->size = outsz;
+   s->reencoded_from_utf16 = 1;
+   }
+   }
s->should_free = 1;
}
return 0;
@@ -5695,6 +5729,10 @@ static int diff_filespec_is_identical(struct 
diff_filespec *one,
 
 static int diff_filespec_check_stat_unmatch(struct diff_filepair *p)
 {
+   if (p->binary) {
+   p->one->binary = 1;
+   p->two->binary = 1;
+   }
if (p->done_skip_stat_unmatch)
return p->skip_stat_unmatch_result;
 
@@ -5735,6 +5773,7 @@ static void diffcore_skip_stat_unmatch(struct 
diff_options *diffopt)
for (i = 0; i < q->nr; i++) {
struct diff_filepair *p = q->queue[i];
 
+   p->binary = diffopt->flags.binary;
if (diff_filespec_check_stat_unmatch(p))
diff_q(, p);
else {
diff --git a/diffcore.h b/diffcore.h
index a30da161da..3cd97bb93b 100644
--- a/diffcore.h
+++ b/diffcore.h
@@ -47,6 +47,8 @@ struct diff_filespec {
unsigned has_more_entries : 1; /* only appear in combined diff */
/* data should be considered "binary"; -1 means "don't know yet" */
signed int is_binary : 2;
+   unsigned binary : 1;
+