From: Torsten Bögershausen <tbo...@web.de>

When statistics are gathered for the autocrlf handling, the search
through the content can be stopped early, e.g. if
- a search for binary is done, and a NUL character is found
- a search for CRLF is done, and the first CRLF is found.

Similarly, when statistics for binary vs non-binary are gathered:
whenever a lone CR or NUL is found, the search can be aborted.

When checking out files in "auto" mode, any file that has a "lone CR"
or a CRLF will not be converted, so the search can be aborted early.

Add the new bit, CONVERT_STAT_BITS_ANY_CR,
which is set for either lone CR or CRLF.

Many binary files have a NUL very early (within the first few bytes,
at the latest within the first 1..2K).
It is therefore often not necessary to load the whole content of a file
or blob into memory.

Use streaming reads for blobs and for files in the worktree.

Signed-off-by: Torsten Bögershausen <tbo...@web.de>
---
 convert.c | 162 ++++++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 106 insertions(+), 56 deletions(-)

diff --git a/convert.c b/convert.c
index f524b8d..b6da114 100644
--- a/convert.c
+++ b/convert.c
@@ -3,6 +3,7 @@
 #include "run-command.h"
 #include "quote.h"
 #include "sigchain.h"
+#include "streaming.h"
 
 /*
  * convert.c - convert a file when checking it out and checking it in.
@@ -13,10 +14,10 @@
  * translation when the "text" attribute or "auto_crlf" option is set.
  */
 
-/* Stat bits: When BIN is set, the txt bits are unset */
 #define CONVERT_STAT_BITS_TXT_LF    0x1
 #define CONVERT_STAT_BITS_TXT_CRLF  0x2
 #define CONVERT_STAT_BITS_BIN       0x4
+#define CONVERT_STAT_BITS_ANY_CR    0x8
 
 enum crlf_action {
        CRLF_UNDEFINED,
@@ -31,30 +32,36 @@ enum crlf_action {
 
 struct text_stat {
        /* NUL, CR, LF and CRLF counts */
-       unsigned nul, lonecr, lonelf, crlf;
+       unsigned stat_bits, lonecr, lonelf, crlf;
 
        /* These are just approximations! */
        unsigned printable, nonprintable;
 };
 
-static void gather_stats(const char *buf, unsigned long size, struct text_stat *stats)
+static void do_gather_stats(const char *buf, unsigned long size,
+                           struct text_stat *stats, unsigned earlyout)
 {
        unsigned long i;
 
-       memset(stats, 0, sizeof(*stats));
-
+       if (!buf || !size)
+               return;
        for (i = 0; i < size; i++) {
                unsigned char c = buf[i];
                if (c == '\r') {
+                       stats->stat_bits |= CONVERT_STAT_BITS_ANY_CR;
                        if (i+1 < size && buf[i+1] == '\n') {
                                stats->crlf++;
                                i++;
-                       } else
+                               stats->stat_bits |= CONVERT_STAT_BITS_TXT_CRLF;
+                       } else {
                                stats->lonecr++;
+                               stats->stat_bits |= CONVERT_STAT_BITS_BIN;
+                       }
                        continue;
                }
                if (c == '\n') {
                        stats->lonelf++;
+                       stats->stat_bits |= CONVERT_STAT_BITS_TXT_LF;
                        continue;
                }
                if (c == 127)
@@ -67,7 +74,7 @@ static void gather_stats(const char *buf, unsigned long size, struct text_stat *
                                stats->printable++;
                                break;
                        case 0:
-                               stats->nul++;
+                               stats->stat_bits |= CONVERT_STAT_BITS_BIN;
                                /* fall through */
                        default:
                                stats->nonprintable++;
@@ -75,6 +82,8 @@ static void gather_stats(const char *buf, unsigned long size, struct text_stat *
                }
                else
                        stats->printable++;
+               if (stats->stat_bits & earlyout)
+                       break; /* We found what we have been searching for */
        }
 
        /* If file ends with EOF then don't count this EOF as non-printable. */
@@ -86,41 +95,63 @@ static void gather_stats(const char *buf, unsigned long size, struct text_stat *
  * The same heuristics as diff.c::mmfile_is_binary()
  * We treat files with bare CR as binary
  */
-static int convert_is_binary(unsigned long size, const struct text_stat *stats)
+static void convert_nonprintable(struct text_stat *stats)
 {
-       if (stats->lonecr)
-               return 1;
-       if (stats->nul)
-               return 1;
        if ((stats->printable >> 7) < stats->nonprintable)
-               return 1;
-       return 0;
+               stats->stat_bits |= CONVERT_STAT_BITS_BIN;
+}
+
+static void gather_stats(const char *buf, unsigned long size,
+                        struct text_stat *stats, unsigned earlyout)
+{
+       memset(stats, 0, sizeof(*stats));
+       do_gather_stats(buf, size, stats, earlyout);
+       convert_nonprintable(stats);
 }
 
-static unsigned int gather_convert_stats(const char *data, unsigned long size)
+
+static unsigned get_convert_stats_sha1(const char *path,
+                                      unsigned const char *sha1,
+                                      unsigned earlyout)
 {
+       struct git_istream *st;
        struct text_stat stats;
-       int ret = 0;
-       if (!data || !size)
-               return 0;
-       gather_stats(data, size, &stats);
-       if (convert_is_binary(size, &stats))
-               ret |= CONVERT_STAT_BITS_BIN;
-       if (stats.crlf)
-               ret |= CONVERT_STAT_BITS_TXT_CRLF;
-       if (stats.lonelf)
-               ret |=  CONVERT_STAT_BITS_TXT_LF;
+       enum object_type type;
+       unsigned long sz;
 
-       return ret;
+       if (!sha1)
+               return 0;
+       memset(&stats, 0, sizeof(stats));
+       st = open_istream(sha1, &type, &sz, NULL);
+       if (!st) {
+               return 0;
+       }
+       if (type != OBJ_BLOB)
+               goto close_and_exit_i;
+       for (;;) {
+               char buf[1024];
+               ssize_t readlen = read_istream(st, buf, sizeof(buf));
+               if (readlen < 0)
+                       break;
+               if (!readlen)
+                       break;
+               do_gather_stats(buf, (unsigned long)readlen, &stats, earlyout);
+               if (stats.stat_bits & earlyout)
+                       break; /* We found what we have been searching for */
+       }
+close_and_exit_i:
+       close_istream(st);
+       convert_nonprintable(&stats);
+       return stats.stat_bits;
 }
 
-static const char *gather_convert_stats_ascii(const char *data, unsigned long size)
+static const char *convert_stats_ascii(unsigned convert_stats)
 {
-       unsigned int convert_stats = gather_convert_stats(data, size);
-
+       unsigned mask = CONVERT_STAT_BITS_TXT_LF |
+               CONVERT_STAT_BITS_TXT_CRLF;
        if (convert_stats & CONVERT_STAT_BITS_BIN)
                return "-text";
-       switch (convert_stats) {
+       switch (convert_stats & mask) {
        case CONVERT_STAT_BITS_TXT_LF:
                return "lf";
        case CONVERT_STAT_BITS_TXT_CRLF:
@@ -132,24 +163,46 @@ static const char *gather_convert_stats_ascii(const char *data, unsigned long si
        }
 }
 
+static unsigned get_convert_stats_wt(const char *path)
+{
+       struct text_stat stats;
+       unsigned earlyout = CONVERT_STAT_BITS_BIN;
+       int fd;
+       memset(&stats, 0, sizeof(stats));
+       fd = open(path, O_RDONLY);
+       if (fd < 0)
+               return 0;
+       for (;;) {
+               char buf[1024];
+               ssize_t readlen = read(fd, buf, sizeof(buf));
+               if (readlen < 0)
+                       break;
+               if (!readlen)
+                       break;
+               do_gather_stats(buf, (unsigned long)readlen, &stats, earlyout);
+               if (stats.stat_bits & earlyout)
+                       break; /* We found what we have been searching for */
+       }
+       close(fd);
+       convert_nonprintable(&stats);
+       return stats.stat_bits;
+}
+
 const char *get_cached_convert_stats_ascii(const char *path)
 {
-       const char *ret;
-       unsigned long sz;
-       void *data = read_blob_data_from_cache(path, &sz);
-       ret = gather_convert_stats_ascii(data, sz);
-       free(data);
-       return ret;
+       unsigned convert_stats;
+       unsigned earlyout = CONVERT_STAT_BITS_BIN;
+       convert_stats = get_convert_stats_sha1(path,
+                                              get_sha1_from_cache(path),
+                                              earlyout);
+       return convert_stats_ascii(convert_stats);
 }
 
 const char *get_wt_convert_stats_ascii(const char *path)
 {
-       const char *ret = "";
-       struct strbuf sb = STRBUF_INIT;
-       if (strbuf_read_file(&sb, path, 0) >= 0)
-               ret = gather_convert_stats_ascii(sb.buf, sb.len);
-       strbuf_release(&sb);
-       return ret;
+       unsigned convert_stats;
+       convert_stats = get_convert_stats_wt(path);
+       return convert_stats_ascii(convert_stats);
 }
 
 static int text_eol_is_crlf(void)
@@ -219,16 +272,11 @@ static void check_safe_crlf(const char *path, enum crlf_action crlf_action,
 
 static int has_cr_in_index(const char *path)
 {
-       unsigned long sz;
-       void *data;
-       int has_cr;
-
-       data = read_blob_data_from_cache(path, &sz);
-       if (!data)
-               return 0;
-       has_cr = memchr(data, '\r', sz) != NULL;
-       free(data);
-       return has_cr;
+       unsigned convert_stats;
+       convert_stats = get_convert_stats_sha1(path,
+                                              get_sha1_from_cache(path),
+                                              CONVERT_STAT_BITS_ANY_CR);
+       return convert_stats & CONVERT_STAT_BITS_ANY_CR;
 }
 
 static int crlf_to_git(const char *path, const char *src, size_t len,
@@ -249,10 +297,10 @@ static int crlf_to_git(const char *path, const char *src, size_t len,
        if (!buf && !src)
                return 1;
 
-       gather_stats(src, len, &stats);
+       gather_stats(src, len, &stats, CONVERT_STAT_BITS_BIN);
 
        if (crlf_action == CRLF_AUTO || crlf_action == CRLF_AUTO_INPUT || crlf_action == CRLF_AUTO_CRLF) {
-               if (convert_is_binary(len, &stats))
+               if (stats.stat_bits & CONVERT_STAT_BITS_BIN)
                        return 0;
 
                if (crlf_action == CRLF_AUTO_INPUT || crlf_action == CRLF_AUTO_CRLF) {
@@ -309,11 +357,13 @@ static int crlf_to_worktree(const char *path, const char *src, size_t len,
 {
        char *to_free = NULL;
        struct text_stat stats;
+       unsigned earlyout = CONVERT_STAT_BITS_TXT_CRLF | CONVERT_STAT_BITS_BIN;
+
 
        if (!len || output_eol(crlf_action) != EOL_CRLF)
                return 0;
 
-       gather_stats(src, len, &stats);
+       gather_stats(src, len, &stats, earlyout);
 
        /* No "naked" LF? Nothing to convert, regardless. */
        if (!stats.lonelf)
@@ -327,7 +377,7 @@ static int crlf_to_worktree(const char *path, const char *src, size_t len,
                                return 0;
                }
 
-               if (convert_is_binary(len, &stats))
+               if (stats.stat_bits & CONVERT_STAT_BITS_BIN)
                        return 0;
        }
 
-- 
2.8.0.rc2.2.g1a4d45a.dirty

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to