Author: philip Date: Mon Mar 27 18:41:36 2023 New Revision: 1908751 URL: http://svn.apache.org/viewvc?rev=1908751&view=rev Log: There are some places where we use svn_revnum_t as keys in an apr_hash_t and it turns out that APR's default hash function doesn't work very well in this case. For the load revmap hash it is possible for over 96% of the revnums added to the hash to be in hash collision chains, meaning that most hash lookups will degrade to a linked list scan. Subversion has an alternative hash function, available via svn_hash__make(), that works much better in this case, so use it.
* subversion/libsvn_repos/load-fs-vtable.c (struct parse_baton): Add comment. (svn_repos_get_fs_build_parser6): Use svn_hash__make. * subversion/libsvn_repos/reporter.c (struct report_baton_t): Add comment. (svn_repos_begin_report3): Use svn_hash__make. * tools/dev/hash-test.c: New. Added: subversion/trunk/tools/dev/hash-test.c (with props) Modified: subversion/trunk/subversion/libsvn_repos/load-fs-vtable.c subversion/trunk/subversion/libsvn_repos/reporter.c Modified: subversion/trunk/subversion/libsvn_repos/load-fs-vtable.c URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/libsvn_repos/load-fs-vtable.c?rev=1908751&r1=1908750&r2=1908751&view=diff ============================================================================== --- subversion/trunk/subversion/libsvn_repos/load-fs-vtable.c (original) +++ subversion/trunk/subversion/libsvn_repos/load-fs-vtable.c Mon Mar 27 18:41:36 2023 @@ -42,6 +42,7 @@ #include "private/svn_dep_compat.h" #include "private/svn_mergeinfo_private.h" #include "private/svn_repos_private.h" +#include "private/svn_subr_private.h" /*----------------------------------------------------------------------*/ @@ -77,6 +78,8 @@ struct parse_baton contents are allocated in POOL. */ /* ### See https://issues.apache.org/jira/browse/SVN-3903 ### for discussion about improving the memory costs of this mapping. */ + /* Using svn_revnum_t as a key can interact badly with APR's default hash + see tools/dev/hash-test.c. Use svn_hash__make to get a suitable hash. */ apr_hash_t *rev_map; /* The most recent (youngest) revision from the dump stream mapped in @@ -1255,7 +1258,7 @@ svn_repos_get_fs_build_parser6(const svn pb->parent_dir = parent_dir; pb->pool = pool; pb->notify_pool = svn_pool_create(pool); - pb->rev_map = apr_hash_make(pool); + pb->rev_map = svn_hash__make(pool); pb->oldest_dumpstream_rev = SVN_INVALID_REVNUM; pb->last_rev_mapped = SVN_INVALID_REVNUM; pb->start_rev = start_rev; Modified: subversion/trunk/subversion/libsvn_repos/reporter.c URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/libsvn_repos/reporter.c?rev=1908751&r1=1908750&r2=1908751&view=diff ============================================================================== --- subversion/trunk/subversion/libsvn_repos/reporter.c (original) +++ subversion/trunk/subversion/libsvn_repos/reporter.c Mon Mar 27 18:41:36 2023 @@ -138,7 +138,8 @@ typedef struct report_baton_t svn_fs_root_t *s_roots[NUM_CACHED_SOURCE_ROOTS]; /* Cache for revision properties. This is used to eliminate redundant - revprop fetching. */ + revprop fetching. svn_revnum_t keys so use svn_hash__make to get a + suitable hash function. */ apr_hash_t *revision_infos; /* This will not change. So, fetch it once and reuse it. */ @@ -1628,7 +1629,7 @@ svn_repos_begin_report3(void **report_ba b->edit_baton = edit_baton; b->authz_read_func = authz_read_func; b->authz_read_baton = authz_read_baton; - b->revision_infos = apr_hash_make(pool); + b->revision_infos = svn_hash__make(pool); b->pool = pool; b->reader = svn_spillbuf__reader_create(1000 /* blocksize */, 1000000 /* maxsize */, Added: subversion/trunk/tools/dev/hash-test.c URL: http://svn.apache.org/viewvc/subversion/trunk/tools/dev/hash-test.c?rev=1908751&view=auto ============================================================================== --- subversion/trunk/tools/dev/hash-test.c (added) +++ subversion/trunk/tools/dev/hash-test.c Mon Mar 27 18:41:36 2023 @@ -0,0 +1,192 @@ +/* + * ==================================================================== + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * ==================================================================== + */ + +/* gcc hash-test.c -I/usr/include/apr-1 -Isubversion/include -lsvn_subr-1 -lapr-1 + + Shows how bad the standard APR hash function can be for 4/8-byte + svn_revnum_t keys. Putting the first 1,000,000 revisions into a + hash table reveals that 96% of the keys end up in chains with 6 or + 7 hash collisions, that means almost all hash lookups degrade to a + linked list scan. + + Subversion has a different hash function available, accessed by + using svn_hash__make() instead of apr_hash_make(), that doesn't use + a seed and it is much better for svn_revnum_t keys. Another option + would be to use the svn_revnum_t values directly as keys with a + no-op hash function. + + */ + +#include <apr_pools.h> +#include <apr_hash.h> +#include <stdlib.h> +#include <stdio.h> +#include "private/svn_subr_private.h" + +struct e_t { + struct e_t *next; + unsigned int hash; + const void *key; + apr_ssize_t klen; + const void *val; +}; + +struct i_t { + struct h_t *h; + struct e_t *t; + struct e_t *n; + unsigned int i; +}; + +struct h_t { + apr_pool_t *pool; + struct e_t **array; + struct i_t it; + unsigned int count; + unsigned int max; + unsigned int seed; + void *func; + struct e_t *free; +}; + +static void test_hash(apr_hash_t *hash, const char *name) +{ + struct h_t *hack = (struct h_t *)hash; + int num = 0, max = 0, running = 0, i; + const int hist_len = 15; + int hist[hist_len]; + + for (i = 0; i < hist_len; ++i) + hist[i] = 0; + + for (i = 0; i <= hack->max; ++i) + { + struct e_t *e = hack->array[i]; + int j = 0; + while (e) + { + ++j; + ++num; + e = e->next; + } + if (j) + { + if (j > max) + max = j; + if (j < hist_len) + ++hist[j - 1]; + } + } + + printf("--\n%s\n--\nalloc:%d entries:%d seed:%0x\nhistogram\n", + name, hack->max, hack->count, hack->seed); + for (i = 0; i < hist_len; ++i) + printf("%d ", hist[i]); + printf("\ncummulative\n"); + for (i = 0; i < hist_len && running < hack->count; ++i) + { + running += (i + 1) * hist[i]; + printf("%0.2f ", ((float)running)/num); + } + printf("\nlongest:%d found:%d\n", max, num); +} + +unsigned int +hash_simple64(const char *key, apr_ssize_t *klen) +{ + unsigned int *p = (unsigned int *)key; + return p[0] ^ p[1]; +} + + +int main(int argc, char *argv[]) +{ + apr_pool_t *pool; + apr_hash_t *hash; + long i; + long min = 1; + long max = 1000000; + + if (argc > 1) + min = atol(argv[1]); + if (argc > 2) + max = atol(argv[2]); + apr_initialize(); + apr_pool_create(&pool, NULL); + + hash = apr_hash_make(pool); + for (i = min; i <= max; ++i) + { + apr_int32_t *mapped = apr_palloc(pool, sizeof(apr_int32_t) * 2); + mapped[0] = i; + mapped[1] = i + 1; + apr_hash_set(hash, mapped, sizeof(apr_int32_t), mapped + 1); + } + test_hash(hash, "apr 32-bit keys"); + apr_pool_clear(pool); + + hash = apr_hash_make(pool); + for (i = min; i <= max; ++i) + { + apr_int64_t *mapped = apr_palloc(pool, sizeof(apr_int64_t) * 2); + mapped[0] = i; + mapped[1] = i + 1; + apr_hash_set(hash, mapped, sizeof(apr_int64_t), mapped + 1); + } + test_hash(hash, "apr 64-bit keys"); + apr_pool_clear(pool); + + hash = svn_hash__make(pool); + for (i = min; i <= max; ++i) + { + apr_int32_t *mapped = apr_palloc(pool, sizeof(apr_int32_t) * 2); + mapped[0] = i; + mapped[1] = i + 1; + apr_hash_set(hash, mapped, sizeof(apr_int32_t), mapped + 1); + } + test_hash(hash, "svn 32-bit keys"); + apr_pool_clear(pool); + + hash = svn_hash__make(pool); + for (i = min; i <= max; ++i) + { + apr_int64_t *mapped = apr_palloc(pool, sizeof(apr_int64_t) * 2); + mapped[0] = i; + mapped[1] = i + 1; + apr_hash_set(hash, mapped, sizeof(apr_int64_t), mapped + 1); + } + test_hash(hash, "svn 64-bit keys"); + apr_pool_clear(pool); + + hash = apr_hash_make_custom(pool, hash_simple64); + for (i = min; i <= max; ++i) + { + apr_int64_t *mapped = apr_palloc(pool, sizeof(apr_int64_t) * 2); + mapped[0] = i; + mapped[1] = i + 1; + apr_hash_set(hash, mapped, sizeof(apr_int64_t), mapped + 1); + } + test_hash(hash, "simple 64-bit keys"); + apr_pool_clear(pool); + + apr_pool_destroy(pool); + return 0; +} Propchange: subversion/trunk/tools/dev/hash-test.c ------------------------------------------------------------------------------ svn:eol-style = native