Author: philip
Date: Mon Mar 27 18:41:36 2023
New Revision: 1908751

URL: http://svn.apache.org/viewvc?rev=1908751&view=rev
Log:
There are some places where we use svn_revnum_t as keys in an apr_hash_t
and it turns out that APR's default hash function doesn't work very well
in this case. For the load revmap hash it is possible for over 96% of the
revnums added to the hash to be in hash collision chains, meaning that
most hash lookups will degrade to a linked list scan.  Subversion has an
alternative hash function, available via svn_hash__make(), that works
much better in this case, so use it.

* subversion/libsvn_repos/load-fs-vtable.c
  (struct parse_baton): Add comment.
  (svn_repos_get_fs_build_parser6): Use svn_hash__make.

* subversion/libsvn_repos/reporter.c
  (struct report_baton_t): Add comment.
  (svn_repos_begin_report3): Use svn_hash__make.

* tools/dev/hash-test.c: New.

Added:
    subversion/trunk/tools/dev/hash-test.c   (with props)
Modified:
    subversion/trunk/subversion/libsvn_repos/load-fs-vtable.c
    subversion/trunk/subversion/libsvn_repos/reporter.c

Modified: subversion/trunk/subversion/libsvn_repos/load-fs-vtable.c
URL: 
http://svn.apache.org/viewvc/subversion/trunk/subversion/libsvn_repos/load-fs-vtable.c?rev=1908751&r1=1908750&r2=1908751&view=diff
==============================================================================
--- subversion/trunk/subversion/libsvn_repos/load-fs-vtable.c (original)
+++ subversion/trunk/subversion/libsvn_repos/load-fs-vtable.c Mon Mar 27 
18:41:36 2023
@@ -42,6 +42,7 @@
 #include "private/svn_dep_compat.h"
 #include "private/svn_mergeinfo_private.h"
 #include "private/svn_repos_private.h"
+#include "private/svn_subr_private.h"
 
 /*----------------------------------------------------------------------*/
 
@@ -77,6 +78,8 @@ struct parse_baton
      contents are allocated in POOL. */
   /* ### See https://issues.apache.org/jira/browse/SVN-3903
      ### for discussion about improving the memory costs of this mapping. */
+  /* Using svn_revnum_t as a key can interact badly with APR's default hash
+     see tools/dev/hash-test.c. Use svn_hash__make to get a suitable hash. */
   apr_hash_t *rev_map;
 
   /* The most recent (youngest) revision from the dump stream mapped in
@@ -1255,7 +1258,7 @@ svn_repos_get_fs_build_parser6(const svn
   pb->parent_dir = parent_dir;
   pb->pool = pool;
   pb->notify_pool = svn_pool_create(pool);
-  pb->rev_map = apr_hash_make(pool);
+  pb->rev_map = svn_hash__make(pool);
   pb->oldest_dumpstream_rev = SVN_INVALID_REVNUM;
   pb->last_rev_mapped = SVN_INVALID_REVNUM;
   pb->start_rev = start_rev;

Modified: subversion/trunk/subversion/libsvn_repos/reporter.c
URL: 
http://svn.apache.org/viewvc/subversion/trunk/subversion/libsvn_repos/reporter.c?rev=1908751&r1=1908750&r2=1908751&view=diff
==============================================================================
--- subversion/trunk/subversion/libsvn_repos/reporter.c (original)
+++ subversion/trunk/subversion/libsvn_repos/reporter.c Mon Mar 27 18:41:36 2023
@@ -138,7 +138,8 @@ typedef struct report_baton_t
   svn_fs_root_t *s_roots[NUM_CACHED_SOURCE_ROOTS];
 
   /* Cache for revision properties. This is used to eliminate redundant
-     revprop fetching. */
+     revprop fetching. svn_revnum_t keys so use svn_hash__make to get a
+     suitable hash function. */
   apr_hash_t *revision_infos;
 
   /* This will not change. So, fetch it once and reuse it. */
@@ -1628,7 +1629,7 @@ svn_repos_begin_report3(void **report_ba
   b->edit_baton = edit_baton;
   b->authz_read_func = authz_read_func;
   b->authz_read_baton = authz_read_baton;
-  b->revision_infos = apr_hash_make(pool);
+  b->revision_infos = svn_hash__make(pool);
   b->pool = pool;
   b->reader = svn_spillbuf__reader_create(1000 /* blocksize */,
                                           1000000 /* maxsize */,

Added: subversion/trunk/tools/dev/hash-test.c
URL: 
http://svn.apache.org/viewvc/subversion/trunk/tools/dev/hash-test.c?rev=1908751&view=auto
==============================================================================
--- subversion/trunk/tools/dev/hash-test.c (added)
+++ subversion/trunk/tools/dev/hash-test.c Mon Mar 27 18:41:36 2023
@@ -0,0 +1,192 @@
+/*
+ * ====================================================================
+ *    Licensed to the Apache Software Foundation (ASF) under one
+ *    or more contributor license agreements.  See the NOTICE file
+ *    distributed with this work for additional information
+ *    regarding copyright ownership.  The ASF licenses this file
+ *    to you under the Apache License, Version 2.0 (the
+ *    "License"); you may not use this file except in compliance
+ *    with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing,
+ *    software distributed under the License is distributed on an
+ *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *    KIND, either express or implied.  See the License for the
+ *    specific language governing permissions and limitations
+ *    under the License.
+ * ====================================================================
+ */
+
+/* gcc hash-test.c -I/usr/include/apr-1 -Isubversion/include -lsvn_subr-1 
-lapr-1
+
+   Shows how bad the standard APR hash function can be for 4/8-byte
+   svn_revnum_t keys. Putting the first 1,000,000 revisions into a
+   hash table reveals that 96% of the keys end up in chains with 6 or
+   7 hash collisions, that means almost all hash lookups degrade to a
+   linked list scan.
+
+   Subversion has a different hash function available, accessed by
+   using svn_hash__make() instead of apr_hash_make(), that doesn't use
+   a seed and it is much better for svn_revnum_t keys. Another option
+   would be to use the svn_revnum_t values directly as keys with a
+   no-op hash function.
+
+ */
+
+#include <apr_pools.h>
+#include <apr_hash.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "private/svn_subr_private.h"
+
+struct e_t {
+  struct e_t *next;
+  unsigned int hash;
+  const void *key;
+  apr_ssize_t klen;
+  const void *val;
+};
+
+struct i_t {
+  struct h_t *h;
+  struct e_t *t;
+  struct e_t *n;
+  unsigned int i;
+};
+
+struct h_t {
+  apr_pool_t *pool;
+  struct e_t **array;
+  struct i_t it;
+  unsigned int count;
+  unsigned int max;
+  unsigned int seed;
+  void *func;
+  struct e_t *free;
+};
+
+static void test_hash(apr_hash_t *hash, const char *name)
+{
+  struct h_t *hack = (struct h_t *)hash;
+  int num = 0, max = 0, running = 0, i;
+  const int hist_len = 15;
+  int hist[hist_len];
+
+  for (i = 0; i < hist_len; ++i)
+    hist[i] = 0;
+
+  for (i = 0; i <= hack->max; ++i)
+    {
+      struct e_t *e = hack->array[i];
+      int j = 0;
+      while (e)
+        {
+          ++j;
+          ++num;
+          e = e->next;
+        }
+      if (j)
+        {
+          if (j > max)
+            max = j;
+          if (j < hist_len)
+            ++hist[j - 1];
+        }
+    }
+
+  printf("--\n%s\n--\nalloc:%d entries:%d seed:%0x\nhistogram\n",
+         name, hack->max, hack->count, hack->seed);
+  for (i = 0; i < hist_len; ++i)
+    printf("%d ", hist[i]);
+  printf("\ncummulative\n");
+  for (i = 0; i < hist_len && running < hack->count; ++i)
+    {
+      running += (i + 1) * hist[i];
+      printf("%0.2f ", ((float)running)/num);
+    }
+  printf("\nlongest:%d found:%d\n", max, num);
+}
+
+unsigned int
+hash_simple64(const char *key, apr_ssize_t *klen)
+{
+  unsigned int *p = (unsigned int *)key;
+  return p[0] ^ p[1];
+}
+
+
+int main(int argc, char *argv[])
+{
+  apr_pool_t *pool;
+  apr_hash_t *hash;
+  long i;
+  long min = 1;
+  long max = 1000000;
+
+  if (argc > 1)
+    min = atol(argv[1]);
+  if (argc > 2)
+    max = atol(argv[2]);
+  apr_initialize();
+  apr_pool_create(&pool, NULL);
+
+  hash = apr_hash_make(pool);
+  for (i = min; i <= max; ++i)
+    {
+      apr_int32_t *mapped = apr_palloc(pool, sizeof(apr_int32_t) * 2);
+      mapped[0] = i;
+      mapped[1] = i + 1;
+      apr_hash_set(hash, mapped, sizeof(apr_int32_t), mapped + 1);
+    }
+  test_hash(hash, "apr 32-bit keys");
+  apr_pool_clear(pool);
+
+  hash = apr_hash_make(pool);
+  for (i = min; i <= max; ++i)
+    {
+      apr_int64_t *mapped = apr_palloc(pool, sizeof(apr_int64_t) * 2);
+      mapped[0] = i;
+      mapped[1] = i + 1;
+      apr_hash_set(hash, mapped, sizeof(apr_int64_t), mapped + 1);
+    }
+  test_hash(hash, "apr 64-bit keys");
+  apr_pool_clear(pool);
+
+  hash = svn_hash__make(pool);
+  for (i = min; i <= max; ++i)
+    {
+      apr_int32_t *mapped = apr_palloc(pool, sizeof(apr_int32_t) * 2);
+      mapped[0] = i;
+      mapped[1] = i + 1;
+      apr_hash_set(hash, mapped, sizeof(apr_int32_t), mapped + 1);
+    }
+  test_hash(hash, "svn 32-bit keys");
+  apr_pool_clear(pool);
+
+  hash = svn_hash__make(pool);
+  for (i = min; i <= max; ++i)
+    {
+      apr_int64_t *mapped = apr_palloc(pool, sizeof(apr_int64_t) * 2);
+      mapped[0] = i;
+      mapped[1] = i + 1;
+      apr_hash_set(hash, mapped, sizeof(apr_int64_t), mapped + 1);
+    }
+  test_hash(hash, "svn 64-bit keys");
+  apr_pool_clear(pool);
+
+  hash = apr_hash_make_custom(pool, hash_simple64);
+  for (i = min; i <= max; ++i)
+    {
+      apr_int64_t *mapped = apr_palloc(pool, sizeof(apr_int64_t) * 2);
+      mapped[0] = i;
+      mapped[1] = i + 1;
+      apr_hash_set(hash, mapped, sizeof(apr_int64_t), mapped + 1);
+    }
+  test_hash(hash, "simple 64-bit keys");
+  apr_pool_clear(pool);
+
+  apr_pool_destroy(pool);
+  return 0;
+}

Propchange: subversion/trunk/tools/dev/hash-test.c
------------------------------------------------------------------------------
    svn:eol-style = native


Reply via email to