When reading large indexes from disk, a portion of the time is
dominated in malloc() calls. This can be mitigated by allocating a
large block of memory and manage it ourselves via memory pools.

This change moves the cache entry allocation to be on top of memory
pools.

Design:
The index_state struct will gain a notion of an associated memory_pool
from which cache_entries will be allocated from. When reading in the
index from disk, we have information on the number of entries and
their size, which can guide us in deciding how large our initial
memory allocation should be. When an index is discarded, the
associated memory_pool will be discarded as well - so the lifetime of
a cache_entry is tied to the lifetime of the index_state that it was
allocated for.

In the case of a Split Index, the following rules are followed. 1st,
some terminology is defined:

Terminology:
  - 'the_index': represents the logical view of the index

  - 'split_index': represents the "base" cache entries. Read from the
    split index file.

'the_index' can reference a single split_index, as well as
cache_entries from the split_index. `the_index` will be discarded
before the `split_index` is.  This means that when we are allocating
cache_entries in the presence of a split index, we need to allocate
the entries from the `split_index`'s memory pool.  This allows us to
follow the pattern that `the_index` can reference cache_entries from
the `split_index`, and that the cache_entries will not be freed while
they are still being referenced.

Alternatives:
The current design does not track whether a cache_entry is allocated
from a pool or not. Instead, it relies on the caller to know how the
cache_entry will be used and to handle its lifecyle appropriately,
including calling the correct free() method. Instead, we could include
a bit of information in the cache_entry (either a bit indicating
whether cache_entry was allocated from a pool or not, or possibly even
a pointer back to the allocating pool), which can then be used to make
informed decisions about these objects. The downside of this approach
is that the cache_entry type would need to grow to incorporate this
information.

Signed-off-by: Jameson Miller <jam...@microsoft.com>
---
 cache.h       |  2 ++
 read-cache.c  | 95 +++++++++++++++++++++++++++++++++++++++++++++--------------
 split-index.c | 23 +++++++++++++--
 3 files changed, 95 insertions(+), 25 deletions(-)

diff --git a/cache.h b/cache.h
index 3760adbe25..7ed68f28e0 100644
--- a/cache.h
+++ b/cache.h
@@ -15,6 +15,7 @@
 #include "path.h"
 #include "sha1-array.h"
 #include "repository.h"
+#include "mem-pool.h"
 
 #include <zlib.h>
 typedef struct git_zstream {
@@ -328,6 +329,7 @@ struct index_state {
        struct untracked_cache *untracked;
        uint64_t fsmonitor_last_update;
        struct ewah_bitmap *fsmonitor_dirty;
+       struct mem_pool *ce_mem_pool;
 };
 
 extern struct index_state the_index;
diff --git a/read-cache.c b/read-cache.c
index 2a61cee130..01cd7fea41 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -46,6 +46,42 @@
                 CE_ENTRY_ADDED | CE_ENTRY_REMOVED | CE_ENTRY_CHANGED | \
                 SPLIT_INDEX_ORDERED | UNTRACKED_CHANGED | FSMONITOR_CHANGED)
 
+
+/*
+ * This is a guess of an pathname in the index.  We use this for V4
+ * index files to guess the un-deltafied size of the index in memory
+ * because of pathname deltafication.  This is not required for V2/V3
+ * index formats because their pathnames are not compressed.  If the
+ * initial amount of memory set aside is not sufficient, the mem pool
+ * will allocate extra memory.
+ */
+#define CACHE_ENTRY_PATH_LENGTH 80
+
+static inline struct cache_entry *mem_pool__ce_alloc(struct mem_pool 
*mem_pool, size_t len)
+{
+       return mem_pool_alloc(mem_pool, cache_entry_size(len));
+}
+
+static inline struct cache_entry *mem_pool__ce_calloc(struct mem_pool 
*mem_pool, size_t len)
+{
+       return mem_pool_calloc(mem_pool, 1, cache_entry_size(len));
+}
+
+static struct mem_pool *find_mem_pool(struct index_state *istate)
+{
+       struct mem_pool **pool_ptr;
+
+       if (istate->split_index && istate->split_index->base)
+               pool_ptr = &istate->split_index->base->ce_mem_pool;
+       else
+               pool_ptr = &istate->ce_mem_pool;
+
+       if (!*pool_ptr)
+               mem_pool_init(pool_ptr, 0);
+
+       return *pool_ptr;
+}
+
 struct index_state the_index;
 static const char *alternate_index_output;
 
@@ -746,7 +782,7 @@ int add_file_to_index(struct index_state *istate, const 
char *path, int flags)
 
 struct cache_entry *make_empty_index_cache_entry(struct index_state *istate, 
size_t len)
 {
-       return xcalloc(1, cache_entry_size(len));
+       return mem_pool__ce_calloc(find_mem_pool(istate), len);
 }
 
 struct cache_entry *make_empty_transient_cache_entry(size_t len)
@@ -1641,13 +1677,13 @@ int read_index(struct index_state *istate)
        return read_index_from(istate, get_index_file(), get_git_dir());
 }
 
-static struct cache_entry *cache_entry_from_ondisk(struct index_state *istate,
+static struct cache_entry *cache_entry_from_ondisk(struct mem_pool *mem_pool,
                                                   struct ondisk_cache_entry 
*ondisk,
                                                   unsigned int flags,
                                                   const char *name,
                                                   size_t len)
 {
-       struct cache_entry *ce = make_empty_index_cache_entry(istate, len);
+       struct cache_entry *ce = mem_pool__ce_alloc(mem_pool, len);
 
        ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
        ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
@@ -1689,7 +1725,7 @@ static unsigned long expand_name_field(struct strbuf 
*name, const char *cp_)
        return (const char *)ep + 1 - cp_;
 }
 
-static struct cache_entry *create_from_disk(struct index_state *istate,
+static struct cache_entry *create_from_disk(struct mem_pool *mem_pool,
                                            struct ondisk_cache_entry *ondisk,
                                            unsigned long *ent_size,
                                            struct strbuf *previous_name)
@@ -1721,13 +1757,13 @@ static struct cache_entry *create_from_disk(struct 
index_state *istate,
                /* v3 and earlier */
                if (len == CE_NAMEMASK)
                        len = strlen(name);
-               ce = cache_entry_from_ondisk(istate, ondisk, flags, name, len);
+               ce = cache_entry_from_ondisk(mem_pool, ondisk, flags, name, 
len);
 
                *ent_size = ondisk_ce_size(ce);
        } else {
                unsigned long consumed;
                consumed = expand_name_field(previous_name, name);
-               ce = cache_entry_from_ondisk(istate, ondisk, flags,
+               ce = cache_entry_from_ondisk(mem_pool, ondisk, flags,
                                             previous_name->buf,
                                             previous_name->len);
 
@@ -1801,6 +1837,22 @@ static void post_read_index_from(struct index_state 
*istate)
        tweak_fsmonitor(istate);
 }
 
+static size_t estimate_cache_size_from_compressed(unsigned int entries)
+{
+       return entries * (sizeof(struct cache_entry) + CACHE_ENTRY_PATH_LENGTH);
+}
+
+static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
+{
+       long per_entry = sizeof(struct cache_entry) - sizeof(struct 
ondisk_cache_entry);
+
+       /*
+        * Account for potential alignment differences.
+        */
+       per_entry += align_padding_size(sizeof(struct cache_entry), 
-sizeof(struct ondisk_cache_entry));
+       return ondisk_size + entries * per_entry;
+}
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
@@ -1847,10 +1899,15 @@ int do_read_index(struct index_state *istate, const 
char *path, int must_exist)
        istate->cache = xcalloc(istate->cache_alloc, sizeof(*istate->cache));
        istate->initialized = 1;
 
-       if (istate->version == 4)
+       if (istate->version == 4) {
                previous_name = &previous_name_buf;
-       else
+               mem_pool_init(&istate->ce_mem_pool,
+                             
estimate_cache_size_from_compressed(istate->cache_nr));
+       } else {
                previous_name = NULL;
+               mem_pool_init(&istate->ce_mem_pool,
+                             estimate_cache_size(mmap_size, istate->cache_nr));
+       }
 
        src_offset = sizeof(*hdr);
        for (i = 0; i < istate->cache_nr; i++) {
@@ -1859,7 +1916,7 @@ int do_read_index(struct index_state *istate, const char 
*path, int must_exist)
                unsigned long consumed;
 
                disk_ce = (struct ondisk_cache_entry *)((char *)mmap + 
src_offset);
-               ce = create_from_disk(istate, disk_ce, &consumed, 
previous_name);
+               ce = create_from_disk(istate->ce_mem_pool, disk_ce, &consumed, 
previous_name);
                set_index_entry(istate, i, ce);
 
                src_offset += consumed;
@@ -1956,17 +2013,6 @@ int is_index_unborn(struct index_state *istate)
 
 int discard_index(struct index_state *istate)
 {
-       int i;
-
-       for (i = 0; i < istate->cache_nr; i++) {
-               if (istate->cache[i]->index &&
-                   istate->split_index &&
-                   istate->split_index->base &&
-                   istate->cache[i]->index <= 
istate->split_index->base->cache_nr &&
-                   istate->cache[i] == 
istate->split_index->base->cache[istate->cache[i]->index - 1])
-                       continue;
-               discard_index_cache_entry(istate->cache[i]);
-       }
        resolve_undo_clear_index(istate);
        istate->cache_nr = 0;
        istate->cache_changed = 0;
@@ -1980,6 +2026,12 @@ int discard_index(struct index_state *istate)
        discard_split_index(istate);
        free_untracked_cache(istate->untracked);
        istate->untracked = NULL;
+
+       if (istate->ce_mem_pool) {
+               mem_pool_discard(istate->ce_mem_pool);
+               istate->ce_mem_pool = NULL;
+       }
+
        return 0;
 }
 
@@ -2772,11 +2824,10 @@ void move_index_extensions(struct index_state *dst, 
struct index_state *src)
 }
 
 /*
- * Free cache entry allocated for an index.
+ * Indicate that a cache entry is no longer is use
  */
 void discard_index_cache_entry(struct cache_entry *ce)
 {
-       free(ce);
 }
 
 void discard_transient_cache_entry(struct cache_entry *ce)
diff --git a/split-index.c b/split-index.c
index d3326d2645..6d82c4e148 100644
--- a/split-index.c
+++ b/split-index.c
@@ -73,16 +73,31 @@ void move_cache_to_base_index(struct index_state *istate)
        int i;
 
        /*
-        * do not delete old si->base, its index entries may be shared
-        * with istate->cache[]. Accept a bit of leaking here because
-        * this code is only used by short-lived update-index.
+        * If there was a previous base index, then transfer ownership of 
allocated
+        * entries to the parent index.
         */
+       if (si->base &&
+               si->base->ce_mem_pool) {
+
+               if (!istate->ce_mem_pool)
+                       mem_pool_init(&istate->ce_mem_pool, 0);
+
+               mem_pool_combine(istate->ce_mem_pool, 
istate->split_index->base->ce_mem_pool);
+       }
+
        si->base = xcalloc(1, sizeof(*si->base));
        si->base->version = istate->version;
        /* zero timestamp disables racy test in ce_write_index() */
        si->base->timestamp = istate->timestamp;
        ALLOC_GROW(si->base->cache, istate->cache_nr, si->base->cache_alloc);
        si->base->cache_nr = istate->cache_nr;
+
+       /*
+        * The mem_pool needs to move with the allocated entries.
+        */
+       si->base->ce_mem_pool = istate->ce_mem_pool;
+       istate->ce_mem_pool = NULL;
+
        COPY_ARRAY(si->base->cache, istate->cache, istate->cache_nr);
        mark_base_index_entries(si->base);
        for (i = 0; i < si->base->cache_nr; i++)
@@ -330,6 +345,8 @@ void add_split_index(struct index_state *istate)
 void remove_split_index(struct index_state *istate)
 {
        if (istate->split_index) {
+               mem_pool_combine(istate->ce_mem_pool, 
istate->split_index->base->ce_mem_pool);
+
                /*
                 * can't discard_split_index(&the_index); because that
                 * will destroy split_index->base->cache[], which may
-- 
2.14.3

Reply via email to