Hello. OK, here is a patch.
Benchmark, before: ``` number of transactions actually processed: 1823 latency average = 1153.495 ms latency stddev = 154.366 ms tps = 6.061104 (including connections establishing) tps = 6.061211 (excluding connections establishing) ``` Benchmark, after: ``` number of transactions actually processed: 2462 latency average = 853.862 ms latency stddev = 112.270 ms tps = 8.191861 (including connections establishing) tps = 8.192028 (excluding connections establishing) ``` +35% TPS, just as expected. Feel free to run your own benchmarks on different datasets and workloads. `perf top` shows that first bottleneck is completely eliminated. I did nothing about the second bottleneck since as Amit mentioned partition-pruning should solve this anyway and apparently any micro-optimizations don't worth an effort. Also I checked test coverage using lcov to make sure that this code is test covered. An exact script I'm using could be found here [1]. [1] https://github.com/afiskon/pgscripts/blob/master/code-coverage.sh On Wed, Mar 01, 2017 at 10:36:24AM +0900, Amit Langote wrote: > Hi, > > On 2017/02/28 23:25, Aleksander Alekseev wrote: > > Hello. > > > > I decided to figure out whether current implementation of declarative > > partitioning has any bottlenecks when there is a lot of partitions. Here > > is what I did [1]. > > Thanks for sharing. > > > Then: > > > > ``` > > # 2580 is some pk that exists > > echo 'select * from part_test where pk = 2580;' > t.sql > > pgbench -j 7 -c 7 -f t.sql -P 1 -T 300 eax > > ``` > > > > `perf top` showed to bottlenecks [2]. A stacktrace for the first one > > looks like this [3]: > > > > ``` > > 0x00000000007a42e2 in get_tabstat_entry (rel_id=25696, isshared=0 '\000') > > at pgstat.c:1689 > > 1689 if (entry->t_id == rel_id) > > #0 0x00000000007a42e2 in get_tabstat_entry (rel_id=25696, isshared=0 > > '\000') at pgstat.c:1689 > > #1 0x00000000007a4275 in pgstat_initstats (rel=0x7f4af3fd41f8) at > > pgstat.c:1666 > > #2 0x00000000004c7090 in relation_open (relationId=25696, lockmode=0) at > > heapam.c:1137 > > #3 0x00000000004c72c9 in heap_open (relationId=25696, lockmode=0) at > > heapam.c:1291 > > (skipped) > > ``` > > > > And here is a stacktrace for the second bottleneck [4]: > > > > ``` > > 0x0000000000584fb1 in find_all_inheritors (parentrelId=16393, lockmode=1, > > numparents=0x0) at pg_inherits.c:199 > > 199 forboth(lo, rels_list, li, rel_numparents) > > #0 0x0000000000584fb1 in find_all_inheritors (parentrelId=16393, > > lockmode=1, numparents=0x0) at pg_inherits.c:199 > > #1 0x000000000077fc9f in expand_inherited_rtentry (root=0x1badcb8, > > rte=0x1b630b8, rti=1) at prepunion.c:1408 > > #2 0x000000000077fb67 in expand_inherited_tables (root=0x1badcb8) at > > prepunion.c:1335 > > #3 0x0000000000767526 in subquery_planner (glob=0x1b63cc0, > > parse=0x1b62fa0, parent_root=0x0, hasRecursion=0 '\000', tuple_fraction=0) > > at planner.c:568 > > (skipped) > > ``` > > > > The first one could be easily fixed by introducing a hash table > > (rel_id -> pgStatList entry). Perhaps hash table should be used only > > after some threshold. Unless there are any objections I will send a > > corresponding patch shortly. > > I have never thought about this one really. > > > I didn't explored the second bottleneck closely yet but at first glance > > it doesn't look much more complicated. > > I don't know which way you're thinking of fixing this, but a planner patch > to implement faster partition-pruning will have taken care of this, I > think. As you may know, even declarative partitioned tables currently > depend on constraint exclusion for partition-pruning and planner's current > approach of handling inheritance requires to open all the child tables > (partitions), whereas the new approach hopefully shouldn't need to do > that. I am not sure if looking for a more localized fix for this would be > worthwhile, although I may be wrong. > > Thanks, > Amit > > -- Best regards, Aleksander Alekseev
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 2fb9a8bf58..fa906e7930 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -160,6 +160,16 @@ typedef struct TabStatusArray
static TabStatusArray *pgStatTabList = NULL;
+/* pgStatTabHash entry */
+typedef struct TabStatHashEntry
+{
+ Oid t_id;
+ PgStat_TableStatus* tsa_entry;
+} TabStatHashEntry;
+
+/* Hash table for faster t_id -> tsa_entry lookup */
+static HTAB *pgStatTabHash = NULL;
+
/*
* Backends store per-function info that's waiting to be sent to the collector
* in this hash table (indexed by function OID).
@@ -815,7 +825,13 @@ pgstat_report_stat(bool force)
pgstat_send_tabstat(this_msg);
this_msg->m_nentries = 0;
}
+
+ /*
+ * Entry will be freed soon so we need to remove it from the lookup table.
+ */
+ hash_search(pgStatTabHash, &entry->t_id, HASH_REMOVE, NULL);
}
+
/* zero out TableStatus structs after use */
MemSet(tsa->tsa_entries, 0,
tsa->tsa_used * sizeof(PgStat_TableStatus));
@@ -1672,54 +1688,64 @@ pgstat_initstats(Relation rel)
static PgStat_TableStatus *
get_tabstat_entry(Oid rel_id, bool isshared)
{
+ TabStatHashEntry* hash_entry;
PgStat_TableStatus *entry;
TabStatusArray *tsa;
- TabStatusArray *prev_tsa;
- int i;
+
+ /* Try to find an entry */
+ entry = find_tabstat_entry(rel_id);
+ if(entry != NULL)
+ return entry;
/*
- * Search the already-used tabstat slots for this relation.
+ * Entry doesn't exist - creating one.
+ * First make sure there is a free space in a last element of pgStatTabList.
*/
- prev_tsa = NULL;
- for (tsa = pgStatTabList; tsa != NULL; prev_tsa = tsa, tsa = tsa->tsa_next)
+ if (!pgStatTabList)
{
- for (i = 0; i < tsa->tsa_used; i++)
- {
- entry = &tsa->tsa_entries[i];
- if (entry->t_id == rel_id)
- return entry;
- }
+ HASHCTL ctl;
- if (tsa->tsa_used < TABSTAT_QUANTUM)
+ Assert(!pgStatTabHash);
+
+ memset(&ctl, 0, sizeof(ctl));
+ ctl.keysize = sizeof(Oid);
+ ctl.entrysize = sizeof(TabStatHashEntry);
+ ctl.hcxt = TopMemoryContext;
+
+ pgStatTabHash = hash_create("pgstat t_id to tsa_entry lookup table",
+ TABSTAT_QUANTUM, &ctl, HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+
+ pgStatTabList = (TabStatusArray *) MemoryContextAllocZero(TopMemoryContext,
+ sizeof(TabStatusArray));
+ }
+
+ Assert(pgStatTabList && pgStatTabHash);
+
+ tsa = pgStatTabList;
+ while(tsa->tsa_used == TABSTAT_QUANTUM)
+ {
+ if(tsa->tsa_next == NULL)
{
- /*
- * It must not be present, but we found a free slot instead. Fine,
- * let's use this one. We assume the entry was already zeroed,
- * either at creation or after last use.
- */
- entry = &tsa->tsa_entries[tsa->tsa_used++];
- entry->t_id = rel_id;
- entry->t_shared = isshared;
- return entry;
+ tsa->tsa_next = (TabStatusArray *) MemoryContextAllocZero(TopMemoryContext,
+ sizeof(TabStatusArray));
}
- }
- /*
- * We ran out of tabstat slots, so allocate more. Be sure they're zeroed.
- */
- tsa = (TabStatusArray *) MemoryContextAllocZero(TopMemoryContext,
- sizeof(TabStatusArray));
- if (prev_tsa)
- prev_tsa->tsa_next = tsa;
- else
- pgStatTabList = tsa;
+ tsa = tsa->tsa_next;
+ }
/*
- * Use the first entry of the new TabStatusArray.
+ * Add an entry.
*/
entry = &tsa->tsa_entries[tsa->tsa_used++];
entry->t_id = rel_id;
entry->t_shared = isshared;
+
+ /*
+ * Add a corespinding entry to pgStatTabHash.
+ */
+ hash_entry = hash_search(pgStatTabHash, &rel_id, HASH_ENTER, NULL);
+ hash_entry->tsa_entry = entry;
+
return entry;
}
@@ -1731,22 +1757,19 @@ get_tabstat_entry(Oid rel_id, bool isshared)
PgStat_TableStatus *
find_tabstat_entry(Oid rel_id)
{
- PgStat_TableStatus *entry;
- TabStatusArray *tsa;
- int i;
+ TabStatHashEntry* hash_entry;
- for (tsa = pgStatTabList; tsa != NULL; tsa = tsa->tsa_next)
- {
- for (i = 0; i < tsa->tsa_used; i++)
- {
- entry = &tsa->tsa_entries[i];
- if (entry->t_id == rel_id)
- return entry;
- }
- }
+ /*
+ * There are no entries at all.
+ */
+ if(!pgStatTabHash)
+ return NULL;
- /* Not present */
- return NULL;
+ hash_entry = hash_search(pgStatTabHash, &rel_id, HASH_FIND, NULL);
+ if(!hash_entry)
+ return NULL;
+
+ return hash_entry->tsa_entry;
}
/*
signature.asc
Description: PGP signature
