From: Torsten Bögershausen
When statistics are gathered for the autocrlf handling, the search in
the content can be stopped early, e.g. when
- a search for binary is done, and a NUL character is found
- a search for CRLF is done, and the first CRLF is found.
Similarly, when statistics for binary vs non-binary are gathered,
the search can be aborted as soon as a lone CR or NUL is found.
When checking out files in "auto" mode, any file that has a "lone CR"
or a CRLF will not be converted, so the search can be aborted early.
Add the new bit, CONVERT_STAT_BITS_ANY_CR,
which is set for either lone CR or CRLF.
Many binary files have a NUL very early (within the first few bytes,
or at the latest within the first 1 to 2 KB).
It is often not necessary to load the whole content of a file or blob
into memory.
Use streaming to handle blobs and files in the worktree.
---
convert.c | 195 +-
1 file changed, 130 insertions(+), 65 deletions(-)
diff --git a/convert.c b/convert.c
index 077f5e6..6a625e5 100644
--- a/convert.c
+++ b/convert.c
@@ -3,6 +3,7 @@
#include "run-command.h"
#include "quote.h"
#include "sigchain.h"
+#include "streaming.h"
/*
* convert.c - convert a file when checking it out and checking it in.
@@ -13,10 +14,10 @@
* translation when the "text" attribute or "auto_crlf" option is set.
*/
-/* Stat bits: When BIN is set, the txt bits are unset */
#define CONVERT_STAT_BITS_TXT_LF 0x1
#define CONVERT_STAT_BITS_TXT_CRLF 0x2
#define CONVERT_STAT_BITS_BIN 0x4
+#define CONVERT_STAT_BITS_ANY_CR 0x8
enum crlf_action {
CRLF_UNDEFINED,
@@ -31,30 +32,36 @@ enum crlf_action {
struct text_stat {
/* NUL, CR, LF and CRLF counts */
- unsigned nul, lonecr, lonelf, crlf;
+ unsigned stat_bits, lonecr, lonelf, crlf;
/* These are just approximations! */
unsigned printable, nonprintable;
};
-static void gather_stats(const char *buf, unsigned long size, struct text_stat
*stats)
+static void gather_stats_partly(const char *buf, unsigned long len,
+ struct text_stat *stats, unsigned earlyout)
{
unsigned long i;
- memset(stats, 0, sizeof(*stats));
-
- for (i = 0; i < size; i++) {
+ if (!buf || !len)
+ return;
+ for (i = 0; i < len; i++) {
unsigned char c = buf[i];
if (c == '\r') {
- if (i+1 < size && buf[i+1] == '\n') {
+ stats->stat_bits |= CONVERT_STAT_BITS_ANY_CR;
+ if (i+1 < len && buf[i+1] == '\n') {
stats->crlf++;
i++;
- } else
+ stats->stat_bits |= CONVERT_STAT_BITS_TXT_CRLF;
+ } else {
stats->lonecr++;
+ stats->stat_bits |= CONVERT_STAT_BITS_BIN;
+ }
continue;
}
if (c == '\n') {
stats->lonelf++;
+ stats->stat_bits |= CONVERT_STAT_BITS_TXT_LF;
continue;
}
if (c == 127)
@@ -67,7 +74,7 @@ static void gather_stats(const char *buf, unsigned long size,
struct text_stat *
stats->printable++;
break;
case 0:
- stats->nul++;
+ stats->stat_bits |= CONVERT_STAT_BITS_BIN;
/* fall through */
default:
stats->nonprintable++;
@@ -75,10 +82,12 @@ static void gather_stats(const char *buf, unsigned long
size, struct text_stat *
}
else
stats->printable++;
+ if (stats->stat_bits & earlyout)
+ break; /* We found what we have been searching for */
}
/* If file ends with EOF then don't count this EOF as non-printable. */
- if (size >= 1 && buf[size-1] == '\032')
+ if (len >= 1 && buf[len-1] == '\032')
stats->nonprintable--;
}
@@ -86,41 +95,62 @@ static void gather_stats(const char *buf, unsigned long
size, struct text_stat *
* The same heuristics as diff.c::mmfile_is_binary()
* We treat files with bare CR as binary
*/
-static int convert_is_binary(unsigned long size, const struct text_stat *stats)
+static void convert_nonprintable(struct text_stat *stats)
{
- if (stats->lonecr)
- return 1;
- if (stats->nul)
- return 1;
if ((stats->printable >> 7) < stats->nonprintable)
- return 1;
- return 0;
+ stats->stat_bits |= CONVERT_STAT_BITS_BIN;
}
-static unsigned int gather_convert_stats(const char *data, unsigned long s