Reimplement oidset using khash.h in order to reduce its memory footprint
and make it faster.

Performance of a command that mainly checks for duplicate objects using
an oidset, with master and Clang 6.0.1:

  $ cmd="./git-cat-file --batch-all-objects --unordered --buffer --batch-check='%(objectname)'"

  $ /usr/bin/time $cmd >/dev/null
  0.22user 0.03system 0:00.25elapsed 99%CPU (0avgtext+0avgdata 48484maxresident)k
  0inputs+0outputs (0major+11204minor)pagefaults 0swaps

  $ hyperfine "$cmd"
  Benchmark #1: ./git-cat-file --batch-all-objects --unordered --buffer --batch-check='%(objectname)'

    Time (mean ± σ):     250.0 ms ±   6.0 ms    [User: 225.9 ms, System: 23.6 ms]

    Range (min … max):   242.0 ms … 261.1 ms

And with this patch:

  $ /usr/bin/time $cmd >/dev/null
  0.14user 0.00system 0:00.15elapsed 100%CPU (0avgtext+0avgdata 41396maxresident)k
  0inputs+0outputs (0major+8318minor)pagefaults 0swaps

  $ hyperfine "$cmd"
  Benchmark #1: ./git-cat-file --batch-all-objects --unordered --buffer --batch-check='%(objectname)'

    Time (mean ± σ):     151.9 ms ±   4.9 ms    [User: 130.5 ms, System: 21.2 ms]

    Range (min … max):   148.2 ms … 170.4 ms

Initial-patch-by: Jeff King <p...@peff.net>
Signed-off-by: Rene Scharfe <l....@web.de>
---
 oidset.c | 34 ++++++++++++----------------------
 oidset.h | 36 ++++++++++++++++++++++++++++--------
 2 files changed, 40 insertions(+), 30 deletions(-)

diff --git a/oidset.c b/oidset.c
index 454c54f933..9836d427ef 100644
--- a/oidset.c
+++ b/oidset.c
@@ -3,38 +3,28 @@
 
 int oidset_contains(const struct oidset *set, const struct object_id *oid)
 {
-       if (!set->map.map.tablesize)
-               return 0;
-       return !!oidmap_get(&set->map, oid);
+       khiter_t pos = kh_get_oid(&set->set, *oid);
+       return pos != kh_end(&set->set);
 }
 
 int oidset_insert(struct oidset *set, const struct object_id *oid)
 {
-       struct oidmap_entry *entry;
-
-       if (!set->map.map.tablesize)
-               oidmap_init(&set->map, 0);
-       else if (oidset_contains(set, oid))
-               return 1;
-
-       entry = xmalloc(sizeof(*entry));
-       oidcpy(&entry->oid, oid);
-
-       oidmap_put(&set->map, entry);
-       return 0;
+       int added;
+       kh_put_oid(&set->set, *oid, &added);
+       return !added;
 }
 
 int oidset_remove(struct oidset *set, const struct object_id *oid)
 {
-       struct oidmap_entry *entry;
-
-       entry = oidmap_remove(&set->map, oid);
-       free(entry);
-
-       return (entry != NULL);
+       khiter_t pos = kh_get_oid(&set->set, *oid);
+       if (pos == kh_end(&set->set))
+               return 0;
+       kh_del_oid(&set->set, pos);
+       return 1;
 }
 
 void oidset_clear(struct oidset *set)
 {
-       oidmap_free(&set->map, 1);
+       kh_release_oid(&set->set);
+       oidset_init(set, 0);
 }
diff --git a/oidset.h b/oidset.h
index 40ec5f87fe..4b90540cd4 100644
--- a/oidset.h
+++ b/oidset.h
@@ -1,7 +1,8 @@
 #ifndef OIDSET_H
 #define OIDSET_H
 
-#include "oidmap.h"
+#include "hashmap.h"
+#include "khash.h"
 
 /**
  * This API is similar to sha1-array, in that it maintains a set of object ids
@@ -15,19 +16,33 @@
  *      table overhead.
  */
 
+static inline unsigned int oid_hash(struct object_id oid)
+{
+       return sha1hash(oid.hash);
+}
+
+static inline int oid_equal(struct object_id a, struct object_id b)
+{
+       return oideq(&a, &b);
+}
+
+KHASH_INIT(oid, struct object_id, int, 0, oid_hash, oid_equal)
+
 /**
  * A single oidset; should be zero-initialized (or use OIDSET_INIT).
  */
 struct oidset {
-       struct oidmap map;
+       kh_oid_t set;
 };
 
-#define OIDSET_INIT { OIDMAP_INIT }
+#define OIDSET_INIT { { 0 } }
 
 
 static inline void oidset_init(struct oidset *set, size_t initial_size)
 {
-       oidmap_init(&set->map, initial_size);
+       memset(&set->set, 0, sizeof(set->set));
+       if (initial_size)
+               kh_resize_oid(&set->set, initial_size);
 }
 
 /**
@@ -58,19 +73,24 @@ int oidset_remove(struct oidset *set, const struct object_id *oid);
 void oidset_clear(struct oidset *set);
 
 struct oidset_iter {
-       struct oidmap_iter m_iter;
+       kh_oid_t *set;
+       khiter_t iter;
 };
 
 static inline void oidset_iter_init(struct oidset *set,
                                    struct oidset_iter *iter)
 {
-       oidmap_iter_init(&set->map, &iter->m_iter);
+       iter->set = &set->set;
+       iter->iter = kh_begin(iter->set);
 }
 
 static inline struct object_id *oidset_iter_next(struct oidset_iter *iter)
 {
-       struct oidmap_entry *e = oidmap_iter_next(&iter->m_iter);
-       return e ? &e->oid : NULL;
+       for (; iter->iter != kh_end(iter->set); iter->iter++) {
+               if (kh_exist(iter->set, iter->iter))
+                       return &kh_key(iter->set, iter->iter++);
+       }
+       return NULL;
 }
 
 static inline struct object_id *oidset_iter_first(struct oidset *set,
-- 
2.19.0

Reply via email to