Re: [PATCH 4/4] diff: mark any file larger than core.bigfilethreshold binary

2014-06-23 Thread Thomas Braun
Am 23.06.2014 14:18, schrieb Duy Nguyen:
> On Thu, Jun 19, 2014 at 7:27 PM, Thomas Braun
>  wrote:
>>> @@ -2721,6 +2721,11 @@ int diff_populate_filespec(struct diff_filespec *s, 
>>> unsigned int flags)
>>>   }
>>>   if (size_only)
>>>   return 0;
>>> + if ((flags & DIFF_POPULATE_IS_BINARY) &&
>>> + s->size > big_file_threshold && s->is_binary == -1) {
>>> + s->is_binary = 1;
>>> + return 0;
>>> + }
>>
>> Why do you check for s->is_binary == -1 here? I think it does not matter
>> what s_is_binary says here.
> 
> If some .gitattributes to mark one file not-binary, we should respect
> that, I think. Same for below too.

Reading diffcore.h I thought is_binary being -1 means it is not yet
decided if it is binary or text.
Respecting .gitattributes is obviously a good thing :)

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/4] diff: mark any file larger than core.bigfilethreshold binary

2014-06-23 Thread Duy Nguyen
On Thu, Jun 19, 2014 at 7:27 PM, Thomas Braun
 wrote:
>> @@ -2721,6 +2721,11 @@ int diff_populate_filespec(struct diff_filespec *s, 
>> unsigned int flags)
>>   }
>>   if (size_only)
>>   return 0;
>> + if ((flags & DIFF_POPULATE_IS_BINARY) &&
>> + s->size > big_file_threshold && s->is_binary == -1) {
>> + s->is_binary = 1;
>> + return 0;
>> + }
>
> Why do you check for s->is_binary == -1 here? I think it does not matter
> what s_is_binary says here.

If some .gitattributes to mark one file not-binary, we should respect
that, I think. Same for below too.

> I would also add a note to the documentation e. g:
>
> diff --git a/Documentation/config.txt b/Documentation/config.txt
> index 9f467d3..7a2f27d 100644
> --- a/Documentation/config.txt
> +++ b/Documentation/config.txt
> @@ -499,7 +499,8 @@ core.bigFileThreshold::
> Files larger than this size are stored deflated, without
> attempting delta compression.  Storing large files without
> delta compression avoids excessive memory usage, at the
> -   slight expense of increased disk usage.
> +   slight expense of increased disk usage.  Additionally files
> +   larger than this size are allways treated as binary.
>  +
>  Default is 512 MiB on all platforms.  This should be reasonable
>  for most projects as source code and other text files can still

Thanks. Will do. Sorry a little busy these days and could not reply earlier.
-- 
Duy
--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/4] diff: mark any file larger than core.bigfilethreshold binary

2014-06-19 Thread Thomas Braun
Am Donnerstag, den 29.05.2014, 19:57 +0700 schrieb Nguyễn Thái Ngọc Duy:

Hi,

sorry for chiming in so late.

I've just played around with patch 3 and 4 of that series.
And I like it very much as I work often with large files so any further 
enhancement in that area is really nice.

(see comments below)

> Too large files may lead to failure to allocate memory. If it happens
> here, it could impact quite a few commands that involve
> diff. Moreover, too large files are inefficient to compare anyway (and
> most likely non-text), so mark them binary and skip looking at their
> content.
> 
> Noticed-by: Dale R. Worley 
> Signed-off-by: Nguyễn Thái Ngọc Duy 
> ---
>  diff.c   | 26 ++
>  diffcore.h   |  1 +
>  t/t1050-large.sh |  4 
>  3 files changed, 23 insertions(+), 8 deletions(-)
> 
> diff --git a/diff.c b/diff.c
> index 54281cb..0a2f865 100644
> --- a/diff.c
> +++ b/diff.c
> @@ -2185,8 +2185,8 @@ int diff_filespec_is_binary(struct diff_filespec *one)
>   one->is_binary = one->driver->binary;
>   else {
>   if (!one->data && DIFF_FILE_VALID(one))
> - diff_populate_filespec(one, 0);
> - if (one->data)
> + diff_populate_filespec(one, 
> DIFF_POPULATE_IS_BINARY);
> + if (one->is_binary == -1 && one->data)
>   one->is_binary = buffer_is_binary(one->data,
>   one->size);
>   if (one->is_binary == -1)
> @@ -2721,6 +2721,11 @@ int diff_populate_filespec(struct diff_filespec *s, 
> unsigned int flags)
>   }
>   if (size_only)
>   return 0;
> + if ((flags & DIFF_POPULATE_IS_BINARY) &&
> + s->size > big_file_threshold && s->is_binary == -1) {
> + s->is_binary = 1;
> + return 0;
> + }

Why do you check for s->is_binary == -1 here? I think it does not matter
what s_is_binary says here.

>   fd = open(s->path, O_RDONLY);
>   if (fd < 0)
>   goto err_empty;
> @@ -2742,16 +2747,21 @@ int diff_populate_filespec(struct diff_filespec *s, 
> unsigned int flags)
>   }
>   else {
>   enum object_type type;
> - if (size_only) {
> + if (size_only || (flags & DIFF_POPULATE_IS_BINARY)) {
>   type = sha1_object_info(s->sha1, &s->size);
>   if (type < 0)
>   die("unable to read %s", sha1_to_hex(s->sha1));
> - } else {
> - s->data = read_sha1_file(s->sha1, &type, &s->size);
> - if (!s->data)
> - die("unable to read %s", sha1_to_hex(s->sha1));
> - s->should_free = 1;
> + if (size_only)
> + return 0;
> + if (s->size > big_file_threshold && s->is_binary == -1) 
> {
same as above.
> + s->is_binary = 1;
> + return 0;
> + }
>   }
> + s->data = read_sha1_file(s->sha1, &type, &s->size);
> + if (!s->data)
> + die("unable to read %s", sha1_to_hex(s->sha1));
> + s->should_free = 1;
>   }
>   return 0;
>  }
> diff --git a/diffcore.h b/diffcore.h
> index a186d7c..e7760d9 100644
> --- a/diffcore.h
> +++ b/diffcore.h
> @@ -56,6 +56,7 @@ extern void fill_filespec(struct diff_filespec *, const 
> unsigned char *,
> int, unsigned short);
>  
>  #define DIFF_POPULATE_SIZE_ONLY 1
> +#define DIFF_POPULATE_IS_BINARY 2
>  extern int diff_populate_filespec(struct diff_filespec *, unsigned int);
>  extern void diff_free_filespec_data(struct diff_filespec *);
>  extern void diff_free_filespec_blob(struct diff_filespec *);
> diff --git a/t/t1050-large.sh b/t/t1050-large.sh
> index 333909b..4d922e2 100755
> --- a/t/t1050-large.sh
> +++ b/t/t1050-large.sh
> @@ -112,6 +112,10 @@ test_expect_success 'diff --raw' '
>   git diff --raw HEAD^
>  '
>  
> +test_expect_success 'diff --stat' '
> + git diff --stat HEAD^ HEAD
> +'
> +
>  test_expect_success 'hash-object' '
>   git hash-object large1
>  '

I would also add a note to the documentation e. g:

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 9f467d3..7a2f27d 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -499,7 +499,8 @@ core.bigFileThreshold::
Files larger than this size are stored deflated, without
attempting delta compression.  Storing large files without
delta compression avoids excessive memory usage, at the
-   slight expense of increased disk usage.
+   slight expense of increased disk usage.  Additionally files
+   larger t

[PATCH 4/4] diff: mark any file larger than core.bigfilethreshold binary

2014-05-29 Thread Nguyễn Thái Ngọc Duy
Too large files may lead to failure to allocate memory. If it happens
here, it could impact quite a few commands that involve
diff. Moreover, too large files are inefficient to compare anyway (and
most likely non-text), so mark them binary and skip looking at their
content.

Noticed-by: Dale R. Worley 
Signed-off-by: Nguyễn Thái Ngọc Duy 
---
 diff.c   | 26 ++
 diffcore.h   |  1 +
 t/t1050-large.sh |  4 
 3 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/diff.c b/diff.c
index 54281cb..0a2f865 100644
--- a/diff.c
+++ b/diff.c
@@ -2185,8 +2185,8 @@ int diff_filespec_is_binary(struct diff_filespec *one)
one->is_binary = one->driver->binary;
else {
if (!one->data && DIFF_FILE_VALID(one))
-   diff_populate_filespec(one, 0);
-   if (one->data)
+   diff_populate_filespec(one, 
DIFF_POPULATE_IS_BINARY);
+   if (one->is_binary == -1 && one->data)
one->is_binary = buffer_is_binary(one->data,
one->size);
if (one->is_binary == -1)
@@ -2721,6 +2721,11 @@ int diff_populate_filespec(struct diff_filespec *s, 
unsigned int flags)
}
if (size_only)
return 0;
+   if ((flags & DIFF_POPULATE_IS_BINARY) &&
+   s->size > big_file_threshold && s->is_binary == -1) {
+   s->is_binary = 1;
+   return 0;
+   }
fd = open(s->path, O_RDONLY);
if (fd < 0)
goto err_empty;
@@ -2742,16 +2747,21 @@ int diff_populate_filespec(struct diff_filespec *s, 
unsigned int flags)
}
else {
enum object_type type;
-   if (size_only) {
+   if (size_only || (flags & DIFF_POPULATE_IS_BINARY)) {
type = sha1_object_info(s->sha1, &s->size);
if (type < 0)
die("unable to read %s", sha1_to_hex(s->sha1));
-   } else {
-   s->data = read_sha1_file(s->sha1, &type, &s->size);
-   if (!s->data)
-   die("unable to read %s", sha1_to_hex(s->sha1));
-   s->should_free = 1;
+   if (size_only)
+   return 0;
+   if (s->size > big_file_threshold && s->is_binary == -1) 
{
+   s->is_binary = 1;
+   return 0;
+   }
}
+   s->data = read_sha1_file(s->sha1, &type, &s->size);
+   if (!s->data)
+   die("unable to read %s", sha1_to_hex(s->sha1));
+   s->should_free = 1;
}
return 0;
 }
diff --git a/diffcore.h b/diffcore.h
index a186d7c..e7760d9 100644
--- a/diffcore.h
+++ b/diffcore.h
@@ -56,6 +56,7 @@ extern void fill_filespec(struct diff_filespec *, const 
unsigned char *,
  int, unsigned short);
 
 #define DIFF_POPULATE_SIZE_ONLY 1
+#define DIFF_POPULATE_IS_BINARY 2
 extern int diff_populate_filespec(struct diff_filespec *, unsigned int);
 extern void diff_free_filespec_data(struct diff_filespec *);
 extern void diff_free_filespec_blob(struct diff_filespec *);
diff --git a/t/t1050-large.sh b/t/t1050-large.sh
index 333909b..4d922e2 100755
--- a/t/t1050-large.sh
+++ b/t/t1050-large.sh
@@ -112,6 +112,10 @@ test_expect_success 'diff --raw' '
git diff --raw HEAD^
 '
 
+test_expect_success 'diff --stat' '
+   git diff --stat HEAD^ HEAD
+'
+
 test_expect_success 'hash-object' '
git hash-object large1
 '
-- 
1.9.1.346.ga2b5940

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html