Hello.

This is intended to provide more stats, as discussed in the following thread.

https://www.postgresql.org/message-id/20171010.192616.108347483.horiguchi.kyot...@lab.ntt.co.jp

The major obstacle to having more items in the statistics
collector views comes from the mechanism used to share the values
among backends. It currently uses a file: the stats collector
writes a file when triggered by backends, then backends read the
written file. A larger file makes the latency longer, and we
don't have spare bandwidth for additional statistics items.

Nowadays PostgreSQL has a dynamic shared hash table (dshash), so
we can use it as the main storage of statistics. We can share
data without stress using this.

A PoC previously posted tried to use a "locally copied" dshash but
it didn't look fine, so I steered in a different direction.

With this patch dshash can create a local copy based on dynahash.

This patchset consists of three files.

v1-0001-Give-dshash-ability-to-make-a-local-snapshot.patch

  adds dshash to make a local copy backed by dynahash.

v1-0002-Change-stats-collector-to-an-axiliary-process.patch

  changes the stats collector to an auxiliary process so that it
  can attach dynamic shared memory.

v1-0003-dshash-based-stats-collector.patch

  implements shared-memory based stats collector.

I'll put more detailed explanation later.

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center
>From f1e033f308b9bbd2f1b1a3bb4a71f0fe2c538e82 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horiguchi.kyot...@lab.ntt.co.jp>
Date: Fri, 29 Jun 2018 16:41:04 +0900
Subject: [PATCH 1/3] Give dshash ability to make a local snapshot

Add a snapshot feature to dshash, which creates a dynahash
consisting of the same data.
---
 src/backend/lib/dshash.c | 164 +++++++++++++++++++++++++++++++++++++++++++++++
 src/include/lib/dshash.h |  21 +++++-
 2 files changed, 184 insertions(+), 1 deletion(-)

diff --git a/src/backend/lib/dshash.c b/src/backend/lib/dshash.c
index b46f7c4cfd..758435efe7 100644
--- a/src/backend/lib/dshash.c
+++ b/src/backend/lib/dshash.c
@@ -354,6 +354,48 @@ dshash_destroy(dshash_table *hash_table)
 	pfree(hash_table);
 }
 
+/*
+ * take a local snapshot of a dshash table
+ */
+HTAB *
+dshash_take_snapshot(dshash_table *org_table)
+{
+	HTAB	*dest_hash;
+	HASHCTL ctl;
+	int		num_entries = 0;
+	dshash_seq_status s;
+	void *ps;
+
+	ctl.keysize = org_table->params.key_size;
+	ctl.entrysize = org_table->params.entry_size;
+
+	/*
+	 * num_entries is not a strict parameter; we don't care if new entries
+	 * are added while taking the snapshot.
+	 */
+	num_entries = dshash_get_num_entries(org_table);
+
+	/*
+	 * dshash only supports binary hash and comparator functions, which won't
+	 * stop at intermediate NUL (\x0) bytes. Just specify HASH_BLOBS so that the
+	 * local hash behaves in the same way.
+	 */
+	dest_hash = hash_create("local snapshot of dshash",
+							num_entries, &ctl,
+							HASH_ELEM | HASH_BLOBS);
+							
+	dshash_seq_init(&s, org_table, true);
+	while ((ps = dshash_seq_next(&s)) != NULL)
+	{
+		bool found;
+		void *pd = hash_search(dest_hash, ps, HASH_ENTER, &found);
+		Assert(!found);
+		memcpy(pd, ps, ctl.entrysize);
+	}
+
+	return dest_hash;
+}
+
 /*
  * Get a handle that can be used by other processes to attach to this hash
  * table.
@@ -592,6 +634,128 @@ dshash_memhash(const void *v, size_t size, void *arg)
 	return tag_hash(v, size);
 }
 
+/*
+ * Initialize a sequential scan on the hash_table. Allows no modifications
+ * during a scan if consistent = true.
+ */
+void
+dshash_seq_init(dshash_seq_status *status, dshash_table *hash_table,
+				bool consistent)
+{
+	status->hash_table = hash_table;
+	status->curbucket = 0;
+	status->nbuckets = ((size_t) 1) << hash_table->control->size_log2;
+	status->curitem = NULL;
+	status->curpartition = -1;
+	status->consistent = consistent;
+
+	/*
+	 * Protect all partitions from modification if the caller wants a
+	 * consistent result.
+	 */
+	if (consistent)
+	{
+		int i;
+
+		for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
+		{
+			Assert(!LWLockHeldByMe(PARTITION_LOCK(hash_table, i)));
+
+			LWLockAcquire(PARTITION_LOCK(hash_table, i), LW_SHARED);
+		}
+	}
+	ensure_valid_bucket_pointers(hash_table);
+}
+
+void *
+dshash_seq_next(dshash_seq_status *status)
+{
+	dsa_pointer next_item_pointer;
+
+	if (status->curitem == NULL)
+	{
+		Assert (status->curbucket == 0);
+		Assert(!status->hash_table->find_locked);
+
+		/* first shot. grab the first item. */
+		next_item_pointer = status->hash_table->buckets[status->curbucket];
+		status->hash_table->find_locked = true;
+	}
+	else
+		next_item_pointer = status->curitem->next;
+
+	/* Move to the next bucket if we finished the current bucket */
+	while (!DsaPointerIsValid(next_item_pointer))
+	{
+		if (++status->curbucket >= status->nbuckets)
+		{
+			/* all buckets have been scanned; finish */
+			dshash_seq_release(status);
+			return NULL;
+		}
+		Assert(status->hash_table->find_locked);
+
+		next_item_pointer = status->hash_table->buckets[status->curbucket];
+
+		/*
+		 * We need a lock on the partition being scanned even if the caller
+		 * didn't request a consistent snapshot.
+		 */
+		if (!status->consistent && DsaPointerIsValid(next_item_pointer))
+		{
+			dshash_table_item  *item = dsa_get_address(status->hash_table->area,
+													   next_item_pointer);
+			int next_partition = PARTITION_FOR_HASH(item->hash);
+			if (status->curpartition != next_partition)
+			{
+				if (status->curpartition >= 0)
+					LWLockRelease(PARTITION_LOCK(status->hash_table,
+												 status->curpartition));
+				LWLockAcquire(PARTITION_LOCK(status->hash_table,
+											 next_partition),
+							  LW_SHARED);
+				status->curpartition = next_partition;
+			}
+		}
+	}
+
+	status->curitem =
+		dsa_get_address(status->hash_table->area, next_item_pointer);
+	return ENTRY_FROM_ITEM(status->curitem);
+}
+
+void
+dshash_seq_release(dshash_seq_status *status)
+{
+	Assert(status->hash_table->find_locked);
+	status->hash_table->find_locked = false;
+
+	if (status->consistent)
+	{
+		int i;
+
+		for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
+			LWLockRelease(PARTITION_LOCK(status->hash_table, i));
+	}
+	else if (status->curpartition >= 0)
+		LWLockRelease(PARTITION_LOCK(status->hash_table, status->curpartition));
+}
+
+int
+dshash_get_num_entries(dshash_table *hash_table)
+{
+	/* a shortcut implementation; should be improved */
+	dshash_seq_status s;
+	void *p;
+	int n = 0;
+
+	dshash_seq_init(&s, hash_table, false);
+	while ((p = dshash_seq_next(&s)) != NULL)
+		n++;
+
+	return n;
+}
+
 /*
  * Print debugging information about the internal state of the hash table to
  * stderr.  The caller must hold no partition locks.
diff --git a/src/include/lib/dshash.h b/src/include/lib/dshash.h
index 8c733bfe25..eb2e45cf66 100644
--- a/src/include/lib/dshash.h
+++ b/src/include/lib/dshash.h
@@ -15,6 +15,7 @@
 #define DSHASH_H
 
 #include "utils/dsa.h"
+#include "utils/hsearch.h"
 
 /* The opaque type representing a hash table. */
 struct dshash_table;
@@ -59,6 +60,18 @@ typedef struct dshash_parameters
 struct dshash_table_item;
 typedef struct dshash_table_item dshash_table_item;
 
+struct dshash_seq_status
+{
+	dshash_table	   *hash_table;
+	int					curbucket;
+	int					nbuckets;
+	dshash_table_item  *curitem;
+	int					curpartition;
+	bool				consistent;
+};
+
+typedef struct dshash_seq_status dshash_seq_status;
+
 /* Creating, sharing and destroying from hash tables. */
 extern dshash_table *dshash_create(dsa_area *area,
 			  const dshash_parameters *params,
@@ -70,7 +83,7 @@ extern dshash_table *dshash_attach(dsa_area *area,
 extern void dshash_detach(dshash_table *hash_table);
 extern dshash_table_handle dshash_get_hash_table_handle(dshash_table *hash_table);
 extern void dshash_destroy(dshash_table *hash_table);
-
+extern HTAB * dshash_take_snapshot(dshash_table *org_table);
 /* Finding, creating, deleting entries. */
 extern void *dshash_find(dshash_table *hash_table,
 			const void *key, bool exclusive);
@@ -80,6 +93,12 @@ extern bool dshash_delete_key(dshash_table *hash_table, const void *key);
 extern void dshash_delete_entry(dshash_table *hash_table, void *entry);
 extern void dshash_release_lock(dshash_table *hash_table, void *entry);
 
+/* seq scan support */
+extern void dshash_seq_init(dshash_seq_status *status, dshash_table *hash_table,
+							bool exclusive);
+extern void *dshash_seq_next(dshash_seq_status *status);
+extern void dshash_seq_release(dshash_seq_status *status);
+extern int dshash_get_num_entries(dshash_table *hash_table);
 /* Convenience hash and compare functions wrapping memcmp and tag_hash. */
 extern int	dshash_memcmp(const void *a, const void *b, size_t size, void *arg);
 extern dshash_hash dshash_memhash(const void *v, size_t size, void *arg);
-- 
2.16.3

>From 26d1d99e5584cc4868ffc7cb6c20d4303276bdf5 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horiguchi.kyot...@lab.ntt.co.jp>
Date: Fri, 29 Jun 2018 16:58:32 +0900
Subject: [PATCH 2/3] Change stats collector to an axiliary process.

Shared memory and LWLocks are required to let stats collector use
dshash. This patch makes stats collector an auxiliary process.
---
 src/backend/bootstrap/bootstrap.c   |  8 +++++
 src/backend/postmaster/pgstat.c     | 58 +++++++++++++++++++++++++------------
 src/backend/postmaster/postmaster.c | 24 +++++++++------
 src/include/miscadmin.h             |  3 +-
 src/include/pgstat.h                |  6 +++-
 5 files changed, 70 insertions(+), 29 deletions(-)

diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index 7e34bee63e..8f8327495a 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -335,6 +335,9 @@ AuxiliaryProcessMain(int argc, char *argv[])
 			case WalReceiverProcess:
 				statmsg = pgstat_get_backend_desc(B_WAL_RECEIVER);
 				break;
+			case StatsCollectorProcess:
+				statmsg = pgstat_get_backend_desc(B_STATS_COLLECTOR);
+				break;
 			default:
 				statmsg = "??? process";
 				break;
@@ -462,6 +465,11 @@ AuxiliaryProcessMain(int argc, char *argv[])
 			WalReceiverMain();
 			proc_exit(1);		/* should never return */
 
+		case StatsCollectorProcess:
+			/* don't set signals, stats collector has its own agenda */
+			PgstatCollectorMain();
+			proc_exit(1);		/* should never return */
+
 		default:
 			elog(PANIC, "unrecognized process type: %d", (int) MyAuxProcType);
 			proc_exit(1);
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 084573e77c..6d9344fcca 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -267,6 +267,7 @@ static List *pending_write_requests = NIL;
 /* Signal handler flags */
 static volatile bool need_exit = false;
 static volatile bool got_SIGHUP = false;
+static volatile bool got_SIGTERM = false;
 
 /*
  * Total time charged to functions so far in the current backend.
@@ -284,8 +285,8 @@ static instr_time total_func_time;
 static pid_t pgstat_forkexec(void);
 #endif
 
-NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]) pg_attribute_noreturn();
-static void pgstat_exit(SIGNAL_ARGS);
+static void pgstat_shutdown_handler(SIGNAL_ARGS);
+static void pgstat_quickdie_handler(SIGNAL_ARGS);
 static void pgstat_beshutdown_hook(int code, Datum arg);
 static void pgstat_sighup_handler(SIGNAL_ARGS);
 
@@ -770,11 +771,7 @@ pgstat_start(void)
 			/* Close the postmaster's sockets */
 			ClosePostmasterPorts(false);
 
-			/* Drop our connection to postmaster's shared memory, as well */
-			dsm_detach_all();
-			PGSharedMemoryDetach();
-
-			PgstatCollectorMain(0, NULL);
+			PgstatCollectorMain();
 			break;
 #endif
 
@@ -2870,6 +2867,9 @@ pgstat_bestart(void)
 			case WalReceiverProcess:
 				beentry->st_backendType = B_WAL_RECEIVER;
 				break;
+			case StatsCollectorProcess:
+				beentry->st_backendType = B_STATS_COLLECTOR;
+				break;
 			default:
 				elog(FATAL, "unrecognized process type: %d",
 					 (int) MyAuxProcType);
@@ -4132,6 +4132,9 @@ pgstat_get_backend_desc(BackendType backendType)
 		case B_WAL_WRITER:
 			backendDesc = "walwriter";
 			break;
+		case B_STATS_COLLECTOR:
+			backendDesc = "stats collector";
+			break;
 	}
 
 	return backendDesc;
@@ -4249,8 +4252,8 @@ pgstat_send_bgwriter(void)
  *	The argc/argv parameters are valid only in EXEC_BACKEND case.
  * ----------
  */
-NON_EXEC_STATIC void
-PgstatCollectorMain(int argc, char *argv[])
+void
+PgstatCollectorMain(void)
 {
 	int			len;
 	PgStat_Msg	msg;
@@ -4263,8 +4266,8 @@ PgstatCollectorMain(int argc, char *argv[])
 	 */
 	pqsignal(SIGHUP, pgstat_sighup_handler);
 	pqsignal(SIGINT, SIG_IGN);
-	pqsignal(SIGTERM, SIG_IGN);
-	pqsignal(SIGQUIT, pgstat_exit);
+	pqsignal(SIGTERM, pgstat_shutdown_handler);
+	pqsignal(SIGQUIT, pgstat_quickdie_handler);
 	pqsignal(SIGALRM, SIG_IGN);
 	pqsignal(SIGPIPE, SIG_IGN);
 	pqsignal(SIGUSR1, SIG_IGN);
@@ -4309,14 +4312,14 @@ PgstatCollectorMain(int argc, char *argv[])
 		/*
 		 * Quit if we get SIGQUIT from the postmaster.
 		 */
-		if (need_exit)
+		if (got_SIGTERM)
 			break;
 
 		/*
 		 * Inner loop iterates as long as we keep getting messages, or until
 		 * need_exit becomes set.
 		 */
-		while (!need_exit)
+		while (!got_SIGTERM)
 		{
 			/*
 			 * Reload configuration if we got SIGHUP from the postmaster.
@@ -4504,14 +4507,21 @@ PgstatCollectorMain(int argc, char *argv[])
 
 /* SIGQUIT signal handler for collector process */
 static void
-pgstat_exit(SIGNAL_ARGS)
+pgstat_quickdie_handler(SIGNAL_ARGS)
 {
-	int			save_errno = errno;
+	PG_SETMASK(&BlockSig);
 
-	need_exit = true;
-	SetLatch(MyLatch);
+	/*
+	 * We DO NOT want to run proc_exit() callbacks -- we're here because
+	 * shared memory may be corrupted, so we don't want to try to clean up our
+	 * transaction.  Just nail the windows shut and get out of town.  Now that
+	 * there's an atexit callback to prevent third-party code from breaking
+	 * things by calling exit() directly, we have to reset the callbacks
+	 * explicitly to make this work as intended.
+	 */
+	on_exit_reset();
 
-	errno = save_errno;
+	exit(2);
 }
 
 /* SIGHUP handler for collector process */
@@ -4526,6 +4536,18 @@ pgstat_sighup_handler(SIGNAL_ARGS)
 	errno = save_errno;
 }
 
+static void
+pgstat_shutdown_handler(SIGNAL_ARGS)
+{
+	int save_errno = errno;
+
+	got_SIGTERM = true;
+
+	SetLatch(MyLatch);
+
+	errno = save_errno;
+}
+
 /*
  * Subroutine to clear stats in a database entry
  *
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index a4b53b33cd..a6209cd749 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -145,7 +145,8 @@
 #define BACKEND_TYPE_AUTOVAC	0x0002	/* autovacuum worker process */
 #define BACKEND_TYPE_WALSND		0x0004	/* walsender process */
 #define BACKEND_TYPE_BGWORKER	0x0008	/* bgworker process */
-#define BACKEND_TYPE_ALL		0x000F	/* OR of all the above */
+#define BACKEND_TYPE_STATS		0x0010	/* stats collector process */
+#define BACKEND_TYPE_ALL		0x001F	/* OR of all the above */
 
 #define BACKEND_TYPE_WORKER		(BACKEND_TYPE_AUTOVAC | BACKEND_TYPE_BGWORKER)
 
@@ -551,6 +552,7 @@ static void ShmemBackendArrayRemove(Backend *bn);
 #define StartCheckpointer()		StartChildProcess(CheckpointerProcess)
 #define StartWalWriter()		StartChildProcess(WalWriterProcess)
 #define StartWalReceiver()		StartChildProcess(WalReceiverProcess)
+#define StartStatsCollector()	StartChildProcess(StatsCollectorProcess)
 
 /* Macros to check exit status of a child process */
 #define EXIT_STATUS_0(st)  ((st) == 0)
@@ -1760,7 +1762,7 @@ ServerLoop(void)
 		/* If we have lost the stats collector, try to start a new one */
 		if (PgStatPID == 0 &&
 			(pmState == PM_RUN || pmState == PM_HOT_STANDBY))
-			PgStatPID = pgstat_start();
+			PgStatPID = StartStatsCollector();
 
 		/* If we have lost the archiver, try to start a new one. */
 		if (PgArchPID == 0 && PgArchStartupAllowed())
@@ -2878,7 +2880,7 @@ reaper(SIGNAL_ARGS)
 			if (PgArchStartupAllowed() && PgArchPID == 0)
 				PgArchPID = pgarch_start();
 			if (PgStatPID == 0)
-				PgStatPID = pgstat_start();
+				PgStatPID = StartStatsCollector();
 
 			/* workers may be scheduled to start now */
 			maybe_start_bgworkers();
@@ -2951,7 +2953,7 @@ reaper(SIGNAL_ARGS)
 				 * nothing left for it to do.
 				 */
 				if (PgStatPID != 0)
-					signal_child(PgStatPID, SIGQUIT);
+					signal_child(PgStatPID, SIGTERM);
 			}
 			else
 			{
@@ -3037,10 +3039,10 @@ reaper(SIGNAL_ARGS)
 		{
 			PgStatPID = 0;
 			if (!EXIT_STATUS_0(exitstatus))
-				LogChildExit(LOG, _("statistics collector process"),
-							 pid, exitstatus);
+				HandleChildCrash(pid, exitstatus,
+								 _("statistics collector process"));
 			if (pmState == PM_RUN || pmState == PM_HOT_STANDBY)
-				PgStatPID = pgstat_start();
+				PgStatPID = StartStatsCollector();
 			continue;
 		}
 
@@ -3270,7 +3272,7 @@ CleanupBackend(int pid,
 
 /*
  * HandleChildCrash -- cleanup after failed backend, bgwriter, checkpointer,
- * walwriter, autovacuum, or background worker.
+ * walwriter, autovacuum, stats collector, or background worker.
  *
  * The objectives here are to clean up our local state about the child
  * process, and to signal all other remaining children to quickdie.
@@ -5071,7 +5073,7 @@ sigusr1_handler(SIGNAL_ARGS)
 		 * Likewise, start other special children as needed.
 		 */
 		Assert(PgStatPID == 0);
-		PgStatPID = pgstat_start();
+		PgStatPID = StartStatsCollector();
 
 		ereport(LOG,
 				(errmsg("database system is ready to accept read only connections")));
@@ -5361,6 +5363,10 @@ StartChildProcess(AuxProcType type)
 				ereport(LOG,
 						(errmsg("could not fork WAL receiver process: %m")));
 				break;
+			case StatsCollectorProcess:
+				ereport(LOG,
+						(errmsg("could not fork stats collector process: %m")));
+				break;
 			default:
 				ereport(LOG,
 						(errmsg("could not fork process: %m")));
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index e167ee8fcb..53b260cb1f 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -400,7 +400,7 @@ typedef enum
 	CheckpointerProcess,
 	WalWriterProcess,
 	WalReceiverProcess,
-
+	StatsCollectorProcess,
 	NUM_AUXPROCTYPES			/* Must be last! */
 } AuxProcType;
 
@@ -412,6 +412,7 @@ extern AuxProcType MyAuxProcType;
 #define AmCheckpointerProcess()		(MyAuxProcType == CheckpointerProcess)
 #define AmWalWriterProcess()		(MyAuxProcType == WalWriterProcess)
 #define AmWalReceiverProcess()		(MyAuxProcType == WalReceiverProcess)
+#define AmStatsCollectorProcess()	(MyAuxProcType == StatsCollectorProcess)
 
 
 /*****************************************************************************
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index be2f59239b..0b9609f96e 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -710,7 +710,8 @@ typedef enum BackendType
 	B_STARTUP,
 	B_WAL_RECEIVER,
 	B_WAL_SENDER,
-	B_WAL_WRITER
+	B_WAL_WRITER,
+	B_STATS_COLLECTOR
 } BackendType;
 
 
@@ -1352,4 +1353,7 @@ extern int	pgstat_fetch_stat_numbackends(void);
 extern PgStat_ArchiverStats *pgstat_fetch_stat_archiver(void);
 extern PgStat_GlobalStats *pgstat_fetch_global(void);
 
+/* Main loop */
+extern void PgstatCollectorMain(void);
+
 #endif							/* PGSTAT_H */
-- 
2.16.3

>From 02891365848ad56494864c8107da436e3f7909d1 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horiguchi.kyot...@lab.ntt.co.jp>
Date: Fri, 29 Jun 2018 17:05:46 +0900
Subject: [PATCH 3/3] dshash-based stats collector

Stats collector no longer uses files to distribute stats numbers. They
are now stored in dynamic shared hash.
---
 src/backend/postmaster/autovacuum.c           |    6 +-
 src/backend/postmaster/pgstat.c               | 1259 ++++++++++---------------
 src/backend/replication/basebackup.c          |   36 -
 src/backend/storage/ipc/ipci.c                |    2 +
 src/backend/storage/lmgr/lwlock.c             |    3 +
 src/backend/storage/lmgr/lwlocknames.txt      |    1 +
 src/backend/utils/misc/guc.c                  |   41 -
 src/backend/utils/misc/postgresql.conf.sample |    1 -
 src/bin/initdb/initdb.c                       |    1 -
 src/bin/pg_basebackup/t/010_pg_basebackup.pl  |    2 +-
 src/include/pgstat.h                          |   51 +-
 src/include/storage/lwlock.h                  |    3 +
 12 files changed, 547 insertions(+), 859 deletions(-)

diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c
index 02e6d8131e..4ece3f411d 100644
--- a/src/backend/postmaster/autovacuum.c
+++ b/src/backend/postmaster/autovacuum.c
@@ -2768,12 +2768,10 @@ get_pgstat_tabentry_relid(Oid relid, bool isshared, PgStat_StatDBEntry *shared,
 	if (isshared)
 	{
 		if (PointerIsValid(shared))
-			tabentry = hash_search(shared->tables, &relid,
-								   HASH_FIND, NULL);
+			tabentry = backend_get_tab_entry(shared, relid);
 	}
 	else if (PointerIsValid(dbentry))
-		tabentry = hash_search(dbentry->tables, &relid,
-							   HASH_FIND, NULL);
+		tabentry = backend_get_tab_entry(dbentry, relid);
 
 	return tabentry;
 }
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 6d9344fcca..626e3c6867 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -77,22 +77,10 @@
 #define PGSTAT_STAT_INTERVAL	500 /* Minimum time between stats file
 									 * updates; in milliseconds. */
 
-#define PGSTAT_RETRY_DELAY		10	/* How long to wait between checks for a
-									 * new file; in milliseconds. */
-
-#define PGSTAT_MAX_WAIT_TIME	10000	/* Maximum time to wait for a stats
-										 * file update; in milliseconds. */
-
-#define PGSTAT_INQ_INTERVAL		640 /* How often to ping the collector for a
-									 * new file; in milliseconds. */
-
 #define PGSTAT_RESTART_INTERVAL 60	/* How often to attempt to restart a
 									 * failed statistics collector; in
 									 * seconds. */
 
-#define PGSTAT_POLL_LOOP_COUNT	(PGSTAT_MAX_WAIT_TIME / PGSTAT_RETRY_DELAY)
-#define PGSTAT_INQ_LOOP_COUNT	(PGSTAT_INQ_INTERVAL / PGSTAT_RETRY_DELAY)
-
 /* Minimum receive buffer size for the collector's socket. */
 #define PGSTAT_MIN_RCVBUF		(100 * 1024)
 
@@ -101,7 +89,6 @@
  * The initial size hints for the hash tables used in the collector.
  * ----------
  */
-#define PGSTAT_DB_HASH_SIZE		16
 #define PGSTAT_TAB_HASH_SIZE	512
 #define PGSTAT_FUNCTION_HASH_SIZE	512
 
@@ -127,14 +114,6 @@ bool		pgstat_track_counts = false;
 int			pgstat_track_functions = TRACK_FUNC_OFF;
 int			pgstat_track_activity_query_size = 1024;
 
-/* ----------
- * Built from GUC parameter
- * ----------
- */
-char	   *pgstat_stat_directory = NULL;
-char	   *pgstat_stat_filename = NULL;
-char	   *pgstat_stat_tmpname = NULL;
-
 /*
  * BgWriter global statistics counters (unused in other processes).
  * Stored directly in a stats message structure so it can be sent
@@ -154,6 +133,42 @@ static time_t last_pgstat_start_time;
 
 static bool pgStatRunningInCollector = false;
 
+/* Shared stats bootstrap information */
+typedef struct StatsShmemStruct {
+	dsa_handle stats_dsa_handle;
+	dshash_table_handle db_stats_handle;
+	dsa_pointer	global_stats;
+	dsa_pointer	archiver_stats;
+} StatsShmemStruct;
+
+static StatsShmemStruct * StatsShmem = NULL;
+static dsa_area *area = NULL;
+static dshash_table *db_stats;
+static HTAB *snapshot_db_stats;
+
+/* dshash parameter for each type of table */
+static const dshash_parameters dsh_dbparams = {
+	sizeof(Oid),
+	sizeof(PgStat_StatDBEntry),
+	dshash_memcmp,
+	dshash_memhash,
+	LWTRANCHE_STATS_DB
+};
+static const dshash_parameters dsh_tblparams = {
+	sizeof(Oid),
+	sizeof(PgStat_StatTabEntry),
+	dshash_memcmp,
+	dshash_memhash,
+	LWTRANCHE_STATS_FUNC_TABLE
+};
+static const dshash_parameters dsh_funcparams = {
+	sizeof(Oid),
+	sizeof(PgStat_StatFuncEntry),
+	dshash_memcmp,
+	dshash_memhash,
+	LWTRANCHE_STATS_FUNC_TABLE
+};
+
 /*
  * Structures in which backends store per-table info that's waiting to be
  * sent to the collector.
@@ -250,12 +265,16 @@ static LocalPgBackendStatus *localBackendStatusTable = NULL;
 static int	localNumBackends = 0;
 
 /*
- * Cluster wide statistics, kept in the stats collector.
- * Contains statistics that are not collected per database
- * or per table.
+ * Cluster wide statistics.
+ * Contains statistics that are not collected per database or per table.
+ * shared_* are the statistics maintained by pgstats and snapshot_* are the
+ * snapshots taken on backends.
  */
-static PgStat_ArchiverStats archiverStats;
-static PgStat_GlobalStats globalStats;
+static PgStat_ArchiverStats *shared_archiverStats;
+static PgStat_ArchiverStats *snapshot_archiverStats;
+static PgStat_GlobalStats *shared_globalStats;
+static PgStat_GlobalStats *snapshot_globalStats;
+
 
 /*
  * List of OIDs of databases we need to write out.  If an entry is InvalidOid,
@@ -285,23 +304,22 @@ static instr_time total_func_time;
 static pid_t pgstat_forkexec(void);
 #endif
 
+/* functions used in stats collector */
 static void pgstat_shutdown_handler(SIGNAL_ARGS);
 static void pgstat_quickdie_handler(SIGNAL_ARGS);
 static void pgstat_beshutdown_hook(int code, Datum arg);
 static void pgstat_sighup_handler(SIGNAL_ARGS);
 
 static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create);
-static PgStat_StatTabEntry *pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry,
-					 Oid tableoid, bool create);
-static void pgstat_write_statsfiles(bool permanent, bool allDbs);
-static void pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent);
-static HTAB *pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep);
-static void pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, bool permanent);
-static void backend_read_statsfile(void);
-static void pgstat_read_current_status(void);
+static PgStat_StatTabEntry *pgstat_get_tab_entry(dshash_table *table, Oid tableoid, bool create);
+static void pgstat_write_statsfiles(void);
+static void pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry);
+static void pgstat_read_statsfiles(void);
+static void pgstat_read_db_statsfile(Oid databaseid, dshash_table *tabhash, dshash_table *funchash);
 
-static bool pgstat_write_statsfile_needed(void);
-static bool pgstat_db_requested(Oid databaseid);
+/* functions used in backends */
+static bool backend_take_stats_snapshot(void);
+static void pgstat_read_current_status(void);
 
 static void pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg);
 static void pgstat_send_funcstats(void);
@@ -320,7 +338,6 @@ static const char *pgstat_get_wait_io(WaitEventIO w);
 static void pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype);
 static void pgstat_send(void *msg, int len);
 
-static void pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len);
 static void pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len);
 static void pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len);
 static void pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len);
@@ -685,7 +702,6 @@ pgstat_reset_remove_files(const char *directory)
 void
 pgstat_reset_all(void)
 {
-	pgstat_reset_remove_files(pgstat_stat_directory);
 	pgstat_reset_remove_files(PGSTAT_STAT_PERMANENT_DIRECTORY);
 }
 
@@ -1009,6 +1025,93 @@ pgstat_send_funcstats(void)
 }
 
 
+/* ----------
+ * pgstat_attach_shared_stats() -
+ *
+ *	attach existing shared stats memory
+ * ----------
+ */
+static bool
+pgstat_attach_shared_stats(void)
+{
+	MemoryContext oldcontext;
+
+	LWLockAcquire(StatsLock, LW_EXCLUSIVE);
+	if (StatsShmem->stats_dsa_handle == DSM_HANDLE_INVALID || area != NULL)
+	{
+		LWLockRelease(StatsLock);
+		return area != NULL;
+	}
+
+	/* top-level variables; live for the lifetime of the process */
+	oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+	area = dsa_attach(StatsShmem->stats_dsa_handle);
+	dsa_pin_mapping(area);
+	db_stats = dshash_attach(area, &dsh_dbparams,
+							 StatsShmem->db_stats_handle, 0);
+	snapshot_db_stats = NULL;
+	shared_globalStats = (PgStat_GlobalStats *)
+		dsa_get_address(area, StatsShmem->global_stats);
+	shared_archiverStats =	(PgStat_ArchiverStats *)
+		dsa_get_address(area, StatsShmem->archiver_stats);
+	MemoryContextSwitchTo(oldcontext);
+	LWLockRelease(StatsLock);
+
+	return true;
+}
+
+/* ----------
+ * pgstat_create_shared_stats() -
+ *
+ *	create shared stats memory
+ * ----------
+ */
+static void
+pgstat_create_shared_stats(void)
+{
+	MemoryContext oldcontext;
+
+	LWLockAcquire(StatsLock, LW_EXCLUSIVE);
+	Assert(StatsShmem->stats_dsa_handle == DSM_HANDLE_INVALID);
+
+	/* lives for the lifetime of the process */
+	oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+	area = dsa_create(LWTRANCHE_STATS_DSA);
+	dsa_pin_mapping(area);
+
+	db_stats = dshash_create(area, &dsh_dbparams, 0);
+
+	/* create shared area and write bootstrap information */
+	StatsShmem->stats_dsa_handle = dsa_get_handle(area);
+	StatsShmem->global_stats =
+		dsa_allocate0(area, sizeof(PgStat_GlobalStats));
+	StatsShmem->archiver_stats =
+		dsa_allocate0(area, sizeof(PgStat_ArchiverStats));
+	StatsShmem->db_stats_handle =
+		dshash_get_hash_table_handle(db_stats);
+
+	/* connect to the memory */
+	snapshot_db_stats = NULL;
+	shared_globalStats = (PgStat_GlobalStats *)
+		dsa_get_address(area, StatsShmem->global_stats);
+	shared_archiverStats = (PgStat_ArchiverStats *)
+		dsa_get_address(area, StatsShmem->archiver_stats);
+	MemoryContextSwitchTo(oldcontext);
+	LWLockRelease(StatsLock);
+}
+
+/* ----------
+ * backend_get_tab_entry() -
+ *
+ *	Find a table stats entry in a backend's database snapshot.
+ */
+PgStat_StatTabEntry *
+backend_get_tab_entry(PgStat_StatDBEntry *dbent, Oid relid)
+{
+	Assert(dbent->snapshot_tables);
+	return hash_search(dbent->snapshot_tables, &relid, HASH_FIND, NULL);
+}
+
 /* ----------
  * pgstat_vacuum_stat() -
  *
@@ -1030,11 +1133,9 @@ pgstat_vacuum_stat(void)
 	if (pgStatSock == PGINVALID_SOCKET)
 		return;
 
-	/*
-	 * If not done for this transaction, read the statistics collector stats
-	 * file into some hash tables.
-	 */
-	backend_read_statsfile();
+	/* If not done for this transaction, take a snapshot of stats */
+	if (!backend_take_stats_snapshot())
+		return;
 
 	/*
 	 * Read pg_database and make a list of OIDs of all existing databases
@@ -1045,7 +1146,7 @@ pgstat_vacuum_stat(void)
 	 * Search the database hash table for dead databases and tell the
 	 * collector to drop them.
 	 */
-	hash_seq_init(&hstat, pgStatDBHash);
+	hash_seq_init(&hstat, snapshot_db_stats);
 	while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
 	{
 		Oid			dbid = dbentry->databaseid;
@@ -1064,10 +1165,10 @@ pgstat_vacuum_stat(void)
 	/*
 	 * Lookup our own database entry; if not found, nothing more to do.
 	 */
-	dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
+	dbentry = (PgStat_StatDBEntry *) hash_search(snapshot_db_stats,
 												 (void *) &MyDatabaseId,
 												 HASH_FIND, NULL);
-	if (dbentry == NULL || dbentry->tables == NULL)
+	if (dbentry == NULL || dbentry->snapshot_tables == NULL)
 		return;
 
 	/*
@@ -1083,7 +1184,7 @@ pgstat_vacuum_stat(void)
 	/*
 	 * Check for all tables listed in stats hashtable if they still exist.
 	 */
-	hash_seq_init(&hstat, dbentry->tables);
+	hash_seq_init(&hstat, dbentry->snapshot_tables);
 	while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&hstat)) != NULL)
 	{
 		Oid			tabid = tabentry->tableid;
@@ -1134,8 +1235,8 @@ pgstat_vacuum_stat(void)
 	 * Now repeat the above steps for functions.  However, we needn't bother
 	 * in the common case where no function stats are being collected.
 	 */
-	if (dbentry->functions != NULL &&
-		hash_get_num_entries(dbentry->functions) > 0)
+	if (dbentry->snapshot_functions != NULL &&
+		hash_get_num_entries(dbentry->snapshot_functions) > 0)
 	{
 		htab = pgstat_collect_oids(ProcedureRelationId);
 
@@ -1143,7 +1244,7 @@ pgstat_vacuum_stat(void)
 		f_msg.m_databaseid = MyDatabaseId;
 		f_msg.m_nentries = 0;
 
-		hash_seq_init(&hstat, dbentry->functions);
+		hash_seq_init(&hstat, dbentry->snapshot_functions);
 		while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&hstat)) != NULL)
 		{
 			Oid			funcid = funcentry->functionid;
@@ -1551,24 +1652,6 @@ pgstat_ping(void)
 	pgstat_send(&msg, sizeof(msg));
 }
 
-/* ----------
- * pgstat_send_inquiry() -
- *
- *	Notify collector that we need fresh data.
- * ----------
- */
-static void
-pgstat_send_inquiry(TimestampTz clock_time, TimestampTz cutoff_time, Oid databaseid)
-{
-	PgStat_MsgInquiry msg;
-
-	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_INQUIRY);
-	msg.clock_time = clock_time;
-	msg.cutoff_time = cutoff_time;
-	msg.databaseid = databaseid;
-	pgstat_send(&msg, sizeof(msg));
-}
-
 
 /*
  * Initialize function call usage data.
@@ -2383,16 +2466,14 @@ pgstat_twophase_postabort(TransactionId xid, uint16 info,
 PgStat_StatDBEntry *
 pgstat_fetch_stat_dbentry(Oid dbid)
 {
-	/*
-	 * If not done for this transaction, read the statistics collector stats
-	 * file into some hash tables.
-	 */
-	backend_read_statsfile();
+	/* If not done for this transaction, take a stats snapshot */
+	if (!backend_take_stats_snapshot())
+		return NULL;
 
 	/*
 	 * Lookup the requested database; return NULL if not found
 	 */
-	return (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
+	return (PgStat_StatDBEntry *) hash_search(snapshot_db_stats,
 											  (void *) &dbid,
 											  HASH_FIND, NULL);
 }
@@ -2415,23 +2496,22 @@ pgstat_fetch_stat_tabentry(Oid relid)
 	PgStat_StatTabEntry *tabentry;
 
 	/*
-	 * If not done for this transaction, read the statistics collector stats
-	 * file into some hash tables.
+	 * If not done for this transaction, take a stats snapshot
 	 */
-	backend_read_statsfile();
+	if (!backend_take_stats_snapshot())
+		return NULL;
 
-	/*
-	 * Lookup our database, then look in its table hash table.
-	 */
+	/* Lookup our database, then look in its table hash table. */
 	dbid = MyDatabaseId;
-	dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
-												 (void *) &dbid,
+	dbentry = (PgStat_StatDBEntry *) hash_search(snapshot_db_stats,
+												 (void *)&dbid,
 												 HASH_FIND, NULL);
-	if (dbentry != NULL && dbentry->tables != NULL)
+	if (dbentry != NULL && dbentry->snapshot_tables != NULL)
 	{
-		tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
-													   (void *) &relid,
-													   HASH_FIND, NULL);
+		tabentry = (PgStat_StatTabEntry *)
+			hash_search(dbentry->snapshot_tables, (void *)&relid,
+						HASH_FIND, NULL);
+
 		if (tabentry)
 			return tabentry;
 	}
@@ -2440,14 +2520,15 @@ pgstat_fetch_stat_tabentry(Oid relid)
 	 * If we didn't find it, maybe it's a shared table.
 	 */
 	dbid = InvalidOid;
-	dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
+	dbentry = (PgStat_StatDBEntry *) hash_search(snapshot_db_stats,
 												 (void *) &dbid,
 												 HASH_FIND, NULL);
-	if (dbentry != NULL && dbentry->tables != NULL)
+	if (dbentry != NULL && dbentry->snapshot_tables != NULL)
 	{
-		tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
-													   (void *) &relid,
-													   HASH_FIND, NULL);
+		tabentry = (PgStat_StatTabEntry *)
+			hash_search(dbentry->snapshot_tables, (void *) &relid,
+						HASH_FIND, NULL);
+
 		if (tabentry)
 			return tabentry;
 	}
@@ -2469,18 +2550,15 @@ pgstat_fetch_stat_funcentry(Oid func_id)
 	PgStat_StatDBEntry *dbentry;
 	PgStat_StatFuncEntry *funcentry = NULL;
 
-	/* load the stats file if needed */
-	backend_read_statsfile();
+	/* If not done for this transaction, take a stats snapshot */
+	if (!backend_take_stats_snapshot())
+		return NULL;
 
-	/* Lookup our database, then find the requested function.  */
+	/* Lookup our database, then find the requested function */
 	dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
-	if (dbentry != NULL && dbentry->functions != NULL)
-	{
-		funcentry = (PgStat_StatFuncEntry *) hash_search(dbentry->functions,
-														 (void *) &func_id,
-														 HASH_FIND, NULL);
-	}
-
+	if (dbentry != NULL && dbentry->snapshot_functions != NULL)
+		funcentry = hash_search(dbentry->snapshot_functions,
+								(void *) &func_id, HASH_FIND, NULL);
 	return funcentry;
 }
 
@@ -2555,9 +2633,11 @@ pgstat_fetch_stat_numbackends(void)
 PgStat_ArchiverStats *
 pgstat_fetch_stat_archiver(void)
 {
-	backend_read_statsfile();
+	/* If not done for this transaction, take a stats snapshot */
+	if (!backend_take_stats_snapshot())
+		return NULL;
 
-	return &archiverStats;
+	return snapshot_archiverStats;
 }
 
 
@@ -2572,9 +2652,11 @@ pgstat_fetch_stat_archiver(void)
 PgStat_GlobalStats *
 pgstat_fetch_global(void)
 {
-	backend_read_statsfile();
+	/* If not done for this transaction, take a stats snapshot */
+	if (!backend_take_stats_snapshot())
+		return NULL;
 
-	return &globalStats;
+	return snapshot_globalStats;
 }
 
 
@@ -4277,18 +4359,14 @@ PgstatCollectorMain(void)
 	pqsignal(SIGTTOU, SIG_DFL);
 	pqsignal(SIGCONT, SIG_DFL);
 	pqsignal(SIGWINCH, SIG_DFL);
-	PG_SETMASK(&UnBlockSig);
 
-	/*
-	 * Identify myself via ps
-	 */
-	init_ps_display("stats collector", "", "", "");
+	PG_SETMASK(&UnBlockSig);
 
 	/*
 	 * Read in existing stats files or initialize the stats to zero.
 	 */
 	pgStatRunningInCollector = true;
-	pgStatDBHash = pgstat_read_statsfiles(InvalidOid, true, true);
+	pgstat_read_statsfiles();
 
 	/*
 	 * Loop to process messages until we get SIGQUIT or detect ungraceful
@@ -4330,13 +4408,6 @@ PgstatCollectorMain(void)
 				ProcessConfigFile(PGC_SIGHUP);
 			}
 
-			/*
-			 * Write the stats file(s) if a new request has arrived that is
-			 * not satisfied by existing file(s).
-			 */
-			if (pgstat_write_statsfile_needed())
-				pgstat_write_statsfiles(false, false);
-
 			/*
 			 * Try to receive and process a message.  This will not block,
 			 * since the socket is set to non-blocking mode.
@@ -4385,10 +4456,6 @@ PgstatCollectorMain(void)
 				case PGSTAT_MTYPE_DUMMY:
 					break;
 
-				case PGSTAT_MTYPE_INQUIRY:
-					pgstat_recv_inquiry((PgStat_MsgInquiry *) &msg, len);
-					break;
-
 				case PGSTAT_MTYPE_TABSTAT:
 					pgstat_recv_tabstat((PgStat_MsgTabstat *) &msg, len);
 					break;
@@ -4477,9 +4544,7 @@ PgstatCollectorMain(void)
 		 * fixes that, so don't sleep indefinitely.  This is a crock of the
 		 * first water, but until somebody wants to debug exactly what's
 		 * happening there, this is the best we can do.  The two-second
-		 * timeout matches our pre-9.2 behavior, and needs to be short enough
-		 * to not provoke "using stale statistics" complaints from
-		 * backend_read_statsfile.
+		 * timeout matches our pre-9.2 behavior.
 		 */
 		wr = WaitLatchOrSocket(MyLatch,
 							   WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_SOCKET_READABLE | WL_TIMEOUT,
@@ -4499,7 +4564,7 @@ PgstatCollectorMain(void)
 	/*
 	 * Save the final stats to reuse at next startup.
 	 */
-	pgstat_write_statsfiles(true, true);
+	pgstat_write_statsfiles();
 
 	exit(0);
 }
@@ -4549,14 +4614,14 @@ pgstat_shutdown_handler(SIGNAL_ARGS)
 }
 
 /*
- * Subroutine to clear stats in a database entry
+ * Subroutine to reset stats in a shared database entry
  *
  * Tables and functions hashes are initialized to empty.
  */
 static void
 reset_dbentry_counters(PgStat_StatDBEntry *dbentry)
 {
-	HASHCTL		hash_ctl;
+	dshash_table *tbl;
 
 	dbentry->n_xact_commit = 0;
 	dbentry->n_xact_rollback = 0;
@@ -4582,20 +4647,14 @@ reset_dbentry_counters(PgStat_StatDBEntry *dbentry)
 	dbentry->stat_reset_timestamp = GetCurrentTimestamp();
 	dbentry->stats_timestamp = 0;
 
-	memset(&hash_ctl, 0, sizeof(hash_ctl));
-	hash_ctl.keysize = sizeof(Oid);
-	hash_ctl.entrysize = sizeof(PgStat_StatTabEntry);
-	dbentry->tables = hash_create("Per-database table",
-								  PGSTAT_TAB_HASH_SIZE,
-								  &hash_ctl,
-								  HASH_ELEM | HASH_BLOBS);
 
-	hash_ctl.keysize = sizeof(Oid);
-	hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry);
-	dbentry->functions = hash_create("Per-database function",
-									 PGSTAT_FUNCTION_HASH_SIZE,
-									 &hash_ctl,
-									 HASH_ELEM | HASH_BLOBS);
+	tbl = dshash_create(area, &dsh_tblparams, 0);
+	dbentry->tables = dshash_get_hash_table_handle(tbl);
+	dshash_detach(tbl);
+
+	tbl = dshash_create(area, &dsh_funcparams, 0);
+	dbentry->functions = dshash_get_hash_table_handle(tbl);
+	dshash_detach(tbl);
 }
 
 /*
@@ -4608,15 +4667,18 @@ pgstat_get_db_entry(Oid databaseid, bool create)
 {
 	PgStat_StatDBEntry *result;
 	bool		found;
-	HASHACTION	action = (create ? HASH_ENTER : HASH_FIND);
+
+	Assert(pgStatRunningInCollector);
 
 	/* Lookup or create the hash table entry for this database */
-	result = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
-												&databaseid,
-												action, &found);
+	if (create)
+		result = (PgStat_StatDBEntry *)
+			dshash_find_or_insert(db_stats,	&databaseid, &found);
+	else
+		result = (PgStat_StatDBEntry *)	dshash_find(db_stats, &databaseid, true);
 
-	if (!create && !found)
-		return NULL;
+	if (!create)
+		return result;
 
 	/*
 	 * If not found, initialize the new one.  This creates empty hash tables
@@ -4628,23 +4690,23 @@ pgstat_get_db_entry(Oid databaseid, bool create)
 	return result;
 }
 
-
 /*
  * Lookup the hash table entry for the specified table. If no hash
  * table entry exists, initialize it, if the create parameter is true.
  * Else, return NULL.
  */
 static PgStat_StatTabEntry *
-pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create)
+pgstat_get_tab_entry(dshash_table *table, Oid tableoid, bool create)
 {
 	PgStat_StatTabEntry *result;
 	bool		found;
-	HASHACTION	action = (create ? HASH_ENTER : HASH_FIND);
 
 	/* Lookup or create the hash table entry for this table */
-	result = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
-												 &tableoid,
-												 action, &found);
+	if (create)
+		result = (PgStat_StatTabEntry *)
+			dshash_find_or_insert(table, &tableoid, &found);
+	else
+		result = (PgStat_StatTabEntry *) dshash_find(table, &tableoid, false);
 
 	if (!create && !found)
 		return NULL;
@@ -4682,25 +4744,20 @@ pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create)
  * pgstat_write_statsfiles() -
  *		Write the global statistics file, as well as requested DB files.
  *
- *	'permanent' specifies writing to the permanent files not temporary ones.
- *	When true (happens only when the collector is shutting down), also remove
- *	the temporary files so that backends starting up under a new postmaster
- *	can't read old data before the new collector is ready.
- *
  *	When 'allDbs' is false, only the requested databases (listed in
  *	pending_write_requests) will be written; otherwise, all databases
  *	will be written.
  * ----------
  */
 static void
-pgstat_write_statsfiles(bool permanent, bool allDbs)
+pgstat_write_statsfiles(void)
 {
-	HASH_SEQ_STATUS hstat;
+	dshash_seq_status hstat;
 	PgStat_StatDBEntry *dbentry;
 	FILE	   *fpout;
 	int32		format_id;
-	const char *tmpfile = permanent ? PGSTAT_STAT_PERMANENT_TMPFILE : pgstat_stat_tmpname;
-	const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
+	const char *tmpfile = PGSTAT_STAT_PERMANENT_TMPFILE;
+	const char *statfile = PGSTAT_STAT_PERMANENT_FILENAME;
 	int			rc;
 
 	elog(DEBUG2, "writing stats file \"%s\"", statfile);
@@ -4721,7 +4778,7 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
 	/*
 	 * Set the timestamp of the stats file.
 	 */
-	globalStats.stats_timestamp = GetCurrentTimestamp();
+	shared_globalStats->stats_timestamp = GetCurrentTimestamp();
 
 	/*
 	 * Write the file header --- currently just a format ID.
@@ -4733,32 +4790,29 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
 	/*
 	 * Write global stats struct
 	 */
-	rc = fwrite(&globalStats, sizeof(globalStats), 1, fpout);
+	rc = fwrite(shared_globalStats, sizeof(shared_globalStats), 1, fpout);
 	(void) rc;					/* we'll check for error with ferror */
 
 	/*
 	 * Write archiver stats struct
 	 */
-	rc = fwrite(&archiverStats, sizeof(archiverStats), 1, fpout);
+	rc = fwrite(shared_archiverStats, sizeof(shared_archiverStats), 1, fpout);
 	(void) rc;					/* we'll check for error with ferror */
 
 	/*
 	 * Walk through the database table.
 	 */
-	hash_seq_init(&hstat, pgStatDBHash);
-	while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
+	dshash_seq_init(&hstat, db_stats, false);
+	while ((dbentry = (PgStat_StatDBEntry *) dshash_seq_next(&hstat)) != NULL)
 	{
 		/*
 		 * Write out the table and function stats for this DB into the
 		 * appropriate per-DB stat file, if required.
 		 */
-		if (allDbs || pgstat_db_requested(dbentry->databaseid))
-		{
-			/* Make DB's timestamp consistent with the global stats */
-			dbentry->stats_timestamp = globalStats.stats_timestamp;
+		/* Make DB's timestamp consistent with the global stats */
+		dbentry->stats_timestamp = shared_globalStats->stats_timestamp;
 
-			pgstat_write_db_statsfile(dbentry, permanent);
-		}
+		pgstat_write_db_statsfile(dbentry);
 
 		/*
 		 * Write out the DB entry. We don't write the tables or functions
@@ -4802,9 +4856,6 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
 		unlink(tmpfile);
 	}
 
-	if (permanent)
-		unlink(pgstat_stat_filename);
-
 	/*
 	 * Now throw away the list of requests.  Note that requests sent after we
 	 * started the write are still waiting on the network socket.
@@ -4818,15 +4869,14 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
  * of length len.
  */
 static void
-get_dbstat_filename(bool permanent, bool tempname, Oid databaseid,
+get_dbstat_filename(bool tempname, Oid databaseid,
 					char *filename, int len)
 {
 	int			printed;
 
 	/* NB -- pgstat_reset_remove_files knows about the pattern this uses */
 	printed = snprintf(filename, len, "%s/db_%u.%s",
-					   permanent ? PGSTAT_STAT_PERMANENT_DIRECTORY :
-					   pgstat_stat_directory,
+					   PGSTAT_STAT_PERMANENT_DIRECTORY,
 					   databaseid,
 					   tempname ? "tmp" : "stat");
 	if (printed > len)
@@ -4844,10 +4894,10 @@ get_dbstat_filename(bool permanent, bool tempname, Oid databaseid,
  * ----------
  */
 static void
-pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
+pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry)
 {
-	HASH_SEQ_STATUS tstat;
-	HASH_SEQ_STATUS fstat;
+	dshash_seq_status tstat;
+	dshash_seq_status fstat;
 	PgStat_StatTabEntry *tabentry;
 	PgStat_StatFuncEntry *funcentry;
 	FILE	   *fpout;
@@ -4856,9 +4906,10 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
 	int			rc;
 	char		tmpfile[MAXPGPATH];
 	char		statfile[MAXPGPATH];
+	dshash_table *tbl;
 
-	get_dbstat_filename(permanent, true, dbid, tmpfile, MAXPGPATH);
-	get_dbstat_filename(permanent, false, dbid, statfile, MAXPGPATH);
+	get_dbstat_filename(true, dbid, tmpfile, MAXPGPATH);
+	get_dbstat_filename(false, dbid, statfile, MAXPGPATH);
 
 	elog(DEBUG2, "writing stats file \"%s\"", statfile);
 
@@ -4885,24 +4936,28 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
 	/*
 	 * Walk through the database's access stats per table.
 	 */
-	hash_seq_init(&tstat, dbentry->tables);
-	while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&tstat)) != NULL)
+	tbl = dshash_attach(area, &dsh_tblparams, dbentry->tables, 0);
+	dshash_seq_init(&tstat, tbl, false);
+	while ((tabentry = (PgStat_StatTabEntry *) dshash_seq_next(&tstat)) != NULL)
 	{
 		fputc('T', fpout);
 		rc = fwrite(tabentry, sizeof(PgStat_StatTabEntry), 1, fpout);
 		(void) rc;				/* we'll check for error with ferror */
 	}
+	dshash_detach(tbl);
 
 	/*
 	 * Walk through the database's function stats table.
 	 */
-	hash_seq_init(&fstat, dbentry->functions);
-	while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&fstat)) != NULL)
+	tbl = dshash_attach(area, &dsh_funcparams, dbentry->functions, 0);
+	dshash_seq_init(&fstat, tbl, false);
+	while ((funcentry = (PgStat_StatFuncEntry *) dshash_seq_next(&fstat)) != NULL)
 	{
 		fputc('F', fpout);
 		rc = fwrite(funcentry, sizeof(PgStat_StatFuncEntry), 1, fpout);
 		(void) rc;				/* we'll check for error with ferror */
 	}
+	dshash_detach(tbl);
 
 	/*
 	 * No more output to be done. Close the temp file and replace the old
@@ -4936,76 +4991,45 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
 						tmpfile, statfile)));
 		unlink(tmpfile);
 	}
-
-	if (permanent)
-	{
-		get_dbstat_filename(false, false, dbid, statfile, MAXPGPATH);
-
-		elog(DEBUG2, "removing temporary stats file \"%s\"", statfile);
-		unlink(statfile);
-	}
 }
 
 /* ----------
  * pgstat_read_statsfiles() -
  *
- *	Reads in some existing statistics collector files and returns the
- *	databases hash table that is the top level of the data.
+ *	Reads in some existing statistics collector files into the shared stats
+ *	hash.
  *
- *	If 'onlydb' is not InvalidOid, it means we only want data for that DB
- *	plus the shared catalogs ("DB 0").  We'll still populate the DB hash
- *	table for all databases, but we don't bother even creating table/function
- *	hash tables for other databases.
- *
- *	'permanent' specifies reading from the permanent files not temporary ones.
- *	When true (happens only when the collector is starting up), remove the
- *	files after reading; the in-memory status is now authoritative, and the
- *	files would be out of date in case somebody else reads them.
- *
- *	If a 'deep' read is requested, table/function stats are read, otherwise
- *	the table/function hash tables remain empty.
  * ----------
  */
-static HTAB *
-pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
+static void
+pgstat_read_statsfiles(void)
 {
 	PgStat_StatDBEntry *dbentry;
 	PgStat_StatDBEntry dbbuf;
-	HASHCTL		hash_ctl;
-	HTAB	   *dbhash;
 	FILE	   *fpin;
 	int32		format_id;
 	bool		found;
-	const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
+	const char *statfile = PGSTAT_STAT_PERMANENT_FILENAME;
+	dshash_table *tblstats = NULL;
+	dshash_table *funcstats = NULL;
 
+	Assert(pgStatRunningInCollector);
 	/*
 	 * The tables will live in pgStatLocalContext.
 	 */
 	pgstat_setup_memcxt();
 
 	/*
-	 * Create the DB hashtable
+	 * Create the DB hashtable and global stats area
 	 */
-	memset(&hash_ctl, 0, sizeof(hash_ctl));
-	hash_ctl.keysize = sizeof(Oid);
-	hash_ctl.entrysize = sizeof(PgStat_StatDBEntry);
-	hash_ctl.hcxt = pgStatLocalContext;
-	dbhash = hash_create("Databases hash", PGSTAT_DB_HASH_SIZE, &hash_ctl,
-						 HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
-
-	/*
-	 * Clear out global and archiver statistics so they start from zero in
-	 * case we can't load an existing statsfile.
-	 */
-	memset(&globalStats, 0, sizeof(globalStats));
-	memset(&archiverStats, 0, sizeof(archiverStats));
+	pgstat_create_shared_stats();
 
 	/*
 	 * Set the current timestamp (will be kept only in case we can't load an
 	 * existing statsfile).
 	 */
-	globalStats.stat_reset_timestamp = GetCurrentTimestamp();
-	archiverStats.stat_reset_timestamp = globalStats.stat_reset_timestamp;
+	shared_globalStats->stat_reset_timestamp = GetCurrentTimestamp();
+	shared_archiverStats->stat_reset_timestamp = shared_globalStats->stat_reset_timestamp;
 
 	/*
 	 * Try to open the stats file. If it doesn't exist, the backends simply
@@ -5023,7 +5047,7 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
 					(errcode_for_file_access(),
 					 errmsg("could not open statistics file \"%s\": %m",
 							statfile)));
-		return dbhash;
+		return;
 	}
 
 	/*
@@ -5040,11 +5064,11 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
 	/*
 	 * Read global stats struct
 	 */
-	if (fread(&globalStats, 1, sizeof(globalStats), fpin) != sizeof(globalStats))
+	if (fread(shared_globalStats, 1, sizeof(shared_globalStats), fpin) != sizeof(shared_globalStats))
 	{
 		ereport(pgStatRunningInCollector ? LOG : WARNING,
 				(errmsg("corrupted statistics file \"%s\"", statfile)));
-		memset(&globalStats, 0, sizeof(globalStats));
+		memset(shared_globalStats, 0, sizeof(*shared_globalStats));
 		goto done;
 	}
 
@@ -5055,17 +5079,16 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
 	 * file's timestamp is less than PGSTAT_STAT_INTERVAL ago, but that's not
 	 * an unusual scenario.
 	 */
-	if (pgStatRunningInCollector)
-		globalStats.stats_timestamp = 0;
+	shared_globalStats->stats_timestamp = 0;
 
 	/*
 	 * Read archiver stats struct
 	 */
-	if (fread(&archiverStats, 1, sizeof(archiverStats), fpin) != sizeof(archiverStats))
+	if (fread(shared_archiverStats, 1, sizeof(shared_archiverStats), fpin) != sizeof(shared_archiverStats))
 	{
 		ereport(pgStatRunningInCollector ? LOG : WARNING,
 				(errmsg("corrupted statistics file \"%s\"", statfile)));
-		memset(&archiverStats, 0, sizeof(archiverStats));
+		memset(shared_archiverStats, 0, sizeof(*shared_archiverStats));
 		goto done;
 	}
 
@@ -5094,12 +5117,12 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
 				/*
 				 * Add to the DB hash
 				 */
-				dbentry = (PgStat_StatDBEntry *) hash_search(dbhash,
-															 (void *) &dbbuf.databaseid,
-															 HASH_ENTER,
-															 &found);
+				dbentry = (PgStat_StatDBEntry *)
+					dshash_find_or_insert(db_stats, (void *) &dbbuf.databaseid,
+										  &found);
 				if (found)
 				{
+					dshash_release_lock(db_stats, dbentry);
 					ereport(pgStatRunningInCollector ? LOG : WARNING,
 							(errmsg("corrupted statistics file \"%s\"",
 									statfile)));
@@ -5107,8 +5130,8 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
 				}
 
 				memcpy(dbentry, &dbbuf, sizeof(PgStat_StatDBEntry));
-				dbentry->tables = NULL;
-				dbentry->functions = NULL;
+				dbentry->tables = DSM_HANDLE_INVALID;
+				dbentry->functions = DSM_HANDLE_INVALID;
 
 				/*
 				 * In the collector, disregard the timestamp we read from the
@@ -5116,47 +5139,23 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
 				 * stats file immediately upon the first request from any
 				 * backend.
 				 */
-				if (pgStatRunningInCollector)
-					dbentry->stats_timestamp = 0;
-
-				/*
-				 * Don't create tables/functions hashtables for uninteresting
-				 * databases.
-				 */
-				if (onlydb != InvalidOid)
-				{
-					if (dbbuf.databaseid != onlydb &&
-						dbbuf.databaseid != InvalidOid)
-						break;
-				}
-
-				memset(&hash_ctl, 0, sizeof(hash_ctl));
-				hash_ctl.keysize = sizeof(Oid);
-				hash_ctl.entrysize = sizeof(PgStat_StatTabEntry);
-				hash_ctl.hcxt = pgStatLocalContext;
-				dbentry->tables = hash_create("Per-database table",
-											  PGSTAT_TAB_HASH_SIZE,
-											  &hash_ctl,
-											  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
-
-				hash_ctl.keysize = sizeof(Oid);
-				hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry);
-				hash_ctl.hcxt = pgStatLocalContext;
-				dbentry->functions = hash_create("Per-database function",
-												 PGSTAT_FUNCTION_HASH_SIZE,
-												 &hash_ctl,
-												 HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+				Assert(pgStatRunningInCollector);
+				dbentry->stats_timestamp = 0;
 
 				/*
 				 * If requested, read the data from the database-specific
 				 * file.  Otherwise we just leave the hashtables empty.
 				 */
-				if (deep)
-					pgstat_read_db_statsfile(dbentry->databaseid,
-											 dbentry->tables,
-											 dbentry->functions,
-											 permanent);
-
+				tblstats = dshash_create(area, &dsh_tblparams, 0);
+				dbentry->tables = dshash_get_hash_table_handle(tblstats);
+				funcstats = dshash_create(area, &dsh_funcparams, 0);
+				dbentry->functions =
+					dshash_get_hash_table_handle(funcstats);
+				dshash_release_lock(db_stats, dbentry);
+				pgstat_read_db_statsfile(dbentry->databaseid,
+										 tblstats, funcstats);
+				dshash_detach(tblstats);
+				dshash_detach(funcstats);
 				break;
 
 			case 'E':
@@ -5173,34 +5172,47 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
 done:
 	FreeFile(fpin);
 
-	/* If requested to read the permanent file, also get rid of it. */
-	if (permanent)
-	{
-		elog(DEBUG2, "removing permanent stats file \"%s\"", statfile);
-		unlink(statfile);
-	}
+	elog(DEBUG2, "removing permanent stats file \"%s\"", statfile);
+	unlink(statfile);
 
-	return dbhash;
+	return;
 }
 
 
+Size
+StatsShmemSize(void)
+{
+	return sizeof(dsa_handle);
+}
+
+void
+StatsShmemInit(void)
+{
+	bool	found;
+
+	StatsShmem = (StatsShmemStruct *)
+		ShmemInitStruct("Stats area", StatsShmemSize(),
+						&found);
+	if (!IsUnderPostmaster)
+	{
+		Assert(!found);
+
+		StatsShmem->stats_dsa_handle = DSM_HANDLE_INVALID;
+	}
+	else
+		Assert(found);
+}
+
 /* ----------
  * pgstat_read_db_statsfile() -
  *
- *	Reads in the existing statistics collector file for the given database,
- *	filling the passed-in tables and functions hash tables.
- *
- *	As in pgstat_read_statsfiles, if the permanent file is requested, it is
- *	removed after reading.
- *
- *	Note: this code has the ability to skip storing per-table or per-function
- *	data, if NULL is passed for the corresponding hashtable.  That's not used
- *	at the moment though.
+ *	Reads in the permanent statistics collector file and creates shared
+ *	statistics tables. The file is removed after reading.
  * ----------
  */
 static void
-pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
-						 bool permanent)
+pgstat_read_db_statsfile(Oid databaseid,
+						 dshash_table *tabhash, dshash_table *funchash)
 {
 	PgStat_StatTabEntry *tabentry;
 	PgStat_StatTabEntry tabbuf;
@@ -5211,7 +5223,8 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
 	bool		found;
 	char		statfile[MAXPGPATH];
 
-	get_dbstat_filename(permanent, false, databaseid, statfile, MAXPGPATH);
+	Assert(pgStatRunningInCollector);
+	get_dbstat_filename(false, databaseid, statfile, MAXPGPATH);
 
 	/*
 	 * Try to open the stats file. If it doesn't exist, the backends simply
@@ -5270,12 +5283,13 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
 				if (tabhash == NULL)
 					break;
 
-				tabentry = (PgStat_StatTabEntry *) hash_search(tabhash,
-															   (void *) &tabbuf.tableid,
-															   HASH_ENTER, &found);
+				tabentry = (PgStat_StatTabEntry *)
+					dshash_find_or_insert(tabhash,
+										  (void *) &tabbuf.tableid, &found);
 
 				if (found)
 				{
+					dshash_release_lock(tabhash, tabentry);
 					ereport(pgStatRunningInCollector ? LOG : WARNING,
 							(errmsg("corrupted statistics file \"%s\"",
 									statfile)));
@@ -5283,6 +5297,7 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
 				}
 
 				memcpy(tabentry, &tabbuf, sizeof(tabbuf));
+				dshash_release_lock(tabhash, tabentry);
 				break;
 
 				/*
@@ -5304,9 +5319,9 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
 				if (funchash == NULL)
 					break;
 
-				funcentry = (PgStat_StatFuncEntry *) hash_search(funchash,
-																 (void *) &funcbuf.functionid,
-																 HASH_ENTER, &found);
+				funcentry = (PgStat_StatFuncEntry *)
+					dshash_find_or_insert(funchash,
+										  (void *) &funcbuf.functionid, &found);
 
 				if (found)
 				{
@@ -5317,6 +5332,7 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
 				}
 
 				memcpy(funcentry, &funcbuf, sizeof(funcbuf));
+				dshash_release_lock(funchash, funcentry);
 				break;
 
 				/*
@@ -5336,142 +5352,50 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
 done:
 	FreeFile(fpin);
 
-	if (permanent)
-	{
-		elog(DEBUG2, "removing permanent stats file \"%s\"", statfile);
-		unlink(statfile);
-	}
+	elog(DEBUG2, "removing permanent stats file \"%s\"", statfile);
+	unlink(statfile);
 }
 
 /* ----------
- * pgstat_read_db_statsfile_timestamp() -
+ * backend_clean_snapshot_callback() -
  *
- *	Attempt to determine the timestamp of the last db statfile write.
- *	Returns true if successful; the timestamp is stored in *ts.
- *
- *	This needs to be careful about handling databases for which no stats file
- *	exists, such as databases without a stat entry or those not yet written:
- *
- *	- if there's a database entry in the global file, return the corresponding
- *	stats_timestamp value.
- *
- *	- if there's no db stat entry (e.g. for a new or inactive database),
- *	there's no stats_timestamp value, but also nothing to write so we return
- *	the timestamp of the global statfile.
+ *	This is usually called with arg = NULL when the memory context in which
+ *  the current snapshot was taken is being reset. In that case we don't
+ *  bother to release the memory explicitly.
  * ----------
  */
-static bool
-pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
-								   TimestampTz *ts)
+static void
+backend_clean_snapshot_callback(void *arg)
 {
-	PgStat_StatDBEntry dbentry;
-	PgStat_GlobalStats myGlobalStats;
-	PgStat_ArchiverStats myArchiverStats;
-	FILE	   *fpin;
-	int32		format_id;
-	const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
-
-	/*
-	 * Try to open the stats file.  As above, anything but ENOENT is worthy of
-	 * complaining about.
-	 */
-	if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
+	if (arg != NULL)
 	{
-		if (errno != ENOENT)
-			ereport(pgStatRunningInCollector ? LOG : WARNING,
-					(errcode_for_file_access(),
-					 errmsg("could not open statistics file \"%s\": %m",
-							statfile)));
-		return false;
-	}
+		/* explicitly called, so explicitly free resources */
+		if (snapshot_globalStats)
+			pfree(snapshot_globalStats);
 
-	/*
-	 * Verify it's of the expected format.
-	 */
-	if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) ||
-		format_id != PGSTAT_FILE_FORMAT_ID)
-	{
-		ereport(pgStatRunningInCollector ? LOG : WARNING,
-				(errmsg("corrupted statistics file \"%s\"", statfile)));
-		FreeFile(fpin);
-		return false;
-	}
+		if (snapshot_archiverStats)
+			pfree(snapshot_archiverStats);
 
-	/*
-	 * Read global stats struct
-	 */
-	if (fread(&myGlobalStats, 1, sizeof(myGlobalStats),
-			  fpin) != sizeof(myGlobalStats))
-	{
-		ereport(pgStatRunningInCollector ? LOG : WARNING,
-				(errmsg("corrupted statistics file \"%s\"", statfile)));
-		FreeFile(fpin);
-		return false;
-	}
-
-	/*
-	 * Read archiver stats struct
-	 */
-	if (fread(&myArchiverStats, 1, sizeof(myArchiverStats),
-			  fpin) != sizeof(myArchiverStats))
-	{
-		ereport(pgStatRunningInCollector ? LOG : WARNING,
-				(errmsg("corrupted statistics file \"%s\"", statfile)));
-		FreeFile(fpin);
-		return false;
-	}
-
-	/* By default, we're going to return the timestamp of the global file. */
-	*ts = myGlobalStats.stats_timestamp;
-
-	/*
-	 * We found an existing collector stats file.  Read it and look for a
-	 * record for the requested database.  If found, use its timestamp.
-	 */
-	for (;;)
-	{
-		switch (fgetc(fpin))
+		if (snapshot_db_stats)
 		{
-				/*
-				 * 'D'	A PgStat_StatDBEntry struct describing a database
-				 * follows.
-				 */
-			case 'D':
-				if (fread(&dbentry, 1, offsetof(PgStat_StatDBEntry, tables),
-						  fpin) != offsetof(PgStat_StatDBEntry, tables))
-				{
-					ereport(pgStatRunningInCollector ? LOG : WARNING,
-							(errmsg("corrupted statistics file \"%s\"",
-									statfile)));
-					goto done;
-				}
+			HASH_SEQ_STATUS seq;
+			PgStat_StatDBEntry *dbent;
 
-				/*
-				 * If this is the DB we're looking for, save its timestamp and
-				 * we're done.
-				 */
-				if (dbentry.databaseid == databaseid)
-				{
-					*ts = dbentry.stats_timestamp;
-					goto done;
-				}
-
-				break;
-
-			case 'E':
-				goto done;
-
-			default:
-				ereport(pgStatRunningInCollector ? LOG : WARNING,
-						(errmsg("corrupted statistics file \"%s\"",
-								statfile)));
-				goto done;
+			hash_seq_init(&seq, snapshot_db_stats);
+			while ((dbent = hash_seq_search(&seq)) != NULL)
+			{
+				if (dbent->snapshot_tables)
+					hash_destroy(dbent->snapshot_tables);
+				if (dbent->snapshot_functions)
+					hash_destroy(dbent->snapshot_functions);
+			}
+			hash_destroy(snapshot_db_stats);
 		}
 	}
 
-done:
-	FreeFile(fpin);
-	return true;
+	snapshot_globalStats = NULL;
+	snapshot_archiverStats = NULL;
+	snapshot_db_stats = NULL;
 }
 
 /*
@@ -5479,131 +5403,75 @@ done:
  * some hash tables.  The results will be kept until pgstat_clear_snapshot()
  * is called (typically, at end of transaction).
  */
-static void
-backend_read_statsfile(void)
+static bool
+backend_take_stats_snapshot(void)
 {
-	TimestampTz min_ts = 0;
-	TimestampTz ref_ts = 0;
-	Oid			inquiry_db;
-	int			count;
+	PgStat_StatDBEntry  *dbent;
+	HASH_SEQ_STATUS seq;
+	MemoryContext oldcontext;
+	MemoryContextCallback *mcxt_cb;
 
-	/* already read it? */
-	if (pgStatDBHash)
-		return;
 	Assert(!pgStatRunningInCollector);
 
-	/*
-	 * In a normal backend, we check staleness of the data for our own DB, and
-	 * so we send MyDatabaseId in inquiry messages.  In the autovac launcher,
-	 * check staleness of the shared-catalog data, and send InvalidOid in
-	 * inquiry messages so as not to force writing unnecessary data.
-	 */
-	if (IsAutoVacuumLauncherProcess())
-		inquiry_db = InvalidOid;
-	else
-		inquiry_db = MyDatabaseId;
-
-	/*
-	 * Loop until fresh enough stats file is available or we ran out of time.
-	 * The stats inquiry message is sent repeatedly in case collector drops
-	 * it; but not every single time, as that just swamps the collector.
-	 */
-	for (count = 0; count < PGSTAT_POLL_LOOP_COUNT; count++)
+	oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+	if (!pgstat_attach_shared_stats())
 	{
-		bool		ok;
-		TimestampTz file_ts = 0;
-		TimestampTz cur_ts;
+		MemoryContextSwitchTo(oldcontext);
+		return false;
+	}
+	MemoryContextSwitchTo(oldcontext);
 
-		CHECK_FOR_INTERRUPTS();
+	if (snapshot_globalStats)
+		return true;
 
-		ok = pgstat_read_db_statsfile_timestamp(inquiry_db, false, &file_ts);
+	Assert(snapshot_archiverStats == NULL);
+	Assert(snapshot_db_stats == NULL);
 
-		cur_ts = GetCurrentTimestamp();
-		/* Calculate min acceptable timestamp, if we didn't already */
-		if (count == 0 || cur_ts < ref_ts)
-		{
-			/*
-			 * We set the minimum acceptable timestamp to PGSTAT_STAT_INTERVAL
-			 * msec before now.  This indirectly ensures that the collector
-			 * needn't write the file more often than PGSTAT_STAT_INTERVAL. In
-			 * an autovacuum worker, however, we want a lower delay to avoid
-			 * using stale data, so we use PGSTAT_RETRY_DELAY (since the
-			 * number of workers is low, this shouldn't be a problem).
-			 *
-			 * We don't recompute min_ts after sleeping, except in the
-			 * unlikely case that cur_ts went backwards.  So we might end up
-			 * accepting a file a bit older than PGSTAT_STAT_INTERVAL.  In
-			 * practice that shouldn't happen, though, as long as the sleep
-			 * time is less than PGSTAT_STAT_INTERVAL; and we don't want to
-			 * tell the collector that our cutoff time is less than what we'd
-			 * actually accept.
-			 */
-			ref_ts = cur_ts;
-			if (IsAutoVacuumWorkerProcess())
-				min_ts = TimestampTzPlusMilliseconds(ref_ts,
-													 -PGSTAT_RETRY_DELAY);
-			else
-				min_ts = TimestampTzPlusMilliseconds(ref_ts,
-													 -PGSTAT_STAT_INTERVAL);
-		}
+	/*
+	 * The snapshot lives within the current transaction, if any, or for the
+	 * current memory context's lifetime otherwise.
+	 */
+	if (IsTransactionState())
+		MemoryContextSwitchTo(TopTransactionContext);
 
-		/*
-		 * If the file timestamp is actually newer than cur_ts, we must have
-		 * had a clock glitch (system time went backwards) or there is clock
-		 * skew between our processor and the stats collector's processor.
-		 * Accept the file, but send an inquiry message anyway to make
-		 * pgstat_recv_inquiry do a sanity check on the collector's time.
-		 */
-		if (ok && file_ts > cur_ts)
-		{
-			/*
-			 * A small amount of clock skew between processors isn't terribly
-			 * surprising, but a large difference is worth logging.  We
-			 * arbitrarily define "large" as 1000 msec.
-			 */
-			if (file_ts >= TimestampTzPlusMilliseconds(cur_ts, 1000))
-			{
-				char	   *filetime;
-				char	   *mytime;
+	/* Global stats can simply be copied. */
+	snapshot_globalStats = palloc(sizeof(PgStat_GlobalStats));
+	memcpy(snapshot_globalStats, shared_globalStats,
+		   sizeof(PgStat_GlobalStats));
 
-				/* Copy because timestamptz_to_str returns a static buffer */
-				filetime = pstrdup(timestamptz_to_str(file_ts));
-				mytime = pstrdup(timestamptz_to_str(cur_ts));
-				elog(LOG, "stats collector's time %s is later than backend local time %s",
-					 filetime, mytime);
-				pfree(filetime);
-				pfree(mytime);
-			}
+	snapshot_archiverStats = palloc(sizeof(PgStat_ArchiverStats));
+	memcpy(snapshot_archiverStats, shared_archiverStats,
+		   sizeof(PgStat_ArchiverStats));
 
-			pgstat_send_inquiry(cur_ts, min_ts, inquiry_db);
-			break;
-		}
+	/*
+	 * Take a local snapshot of every dshash. It's OK if the snapshots are
+	 * not strictly consistent with each other.
+	 */
+	snapshot_db_stats = dshash_take_snapshot(db_stats);
+	hash_seq_init(&seq, snapshot_db_stats);
+	while ((dbent = (PgStat_StatDBEntry *) hash_seq_search(&seq)) != NULL)
+	{
+		dshash_table *t;
 
-		/* Normal acceptance case: file is not older than cutoff time */
-		if (ok && file_ts >= min_ts)
-			break;
-
-		/* Not there or too old, so kick the collector and wait a bit */
-		if ((count % PGSTAT_INQ_LOOP_COUNT) == 0)
-			pgstat_send_inquiry(cur_ts, min_ts, inquiry_db);
-
-		pg_usleep(PGSTAT_RETRY_DELAY * 1000L);
+		t = dshash_attach(area, &dsh_tblparams, dbent->tables, 0);
+		dbent->snapshot_tables = dshash_take_snapshot(t);
+		dshash_detach(t);
+		t = dshash_attach(area, &dsh_funcparams, dbent->functions, 0);
+		dbent->snapshot_functions = dshash_take_snapshot(t);
+		dshash_detach(t);
 	}
 
-	if (count >= PGSTAT_POLL_LOOP_COUNT)
-		ereport(LOG,
-				(errmsg("using stale statistics instead of current ones "
-						"because stats collector is not responding")));
+	/* set the timestamp of this snapshot */
+	snapshot_globalStats->stats_timestamp = GetCurrentTimestamp();
 
-	/*
-	 * Autovacuum launcher wants stats about all databases, but a shallow read
-	 * is sufficient.  Regular backends want a deep read for just the tables
-	 * they can see (MyDatabaseId + shared catalogs).
-	 */
-	if (IsAutoVacuumLauncherProcess())
-		pgStatDBHash = pgstat_read_statsfiles(InvalidOid, false, false);
-	else
-		pgStatDBHash = pgstat_read_statsfiles(MyDatabaseId, false, true);
+	/* register callback to clear snapshot */
+	mcxt_cb = (MemoryContextCallback *)palloc(sizeof(MemoryContextCallback));
+	mcxt_cb->func = backend_clean_snapshot_callback;
+	mcxt_cb->arg = NULL;
+	MemoryContextRegisterResetCallback(CurrentMemoryContext, mcxt_cb);
+	MemoryContextSwitchTo(oldcontext);
+
+	return true;
 }
 
 
@@ -5636,6 +5504,8 @@ pgstat_setup_memcxt(void)
 void
 pgstat_clear_snapshot(void)
 {
+	int param = 0;	/* only the address is significant */
+
 	/* Release memory, if any was allocated */
 	if (pgStatLocalContext)
 		MemoryContextDelete(pgStatLocalContext);
@@ -5645,99 +5515,12 @@ pgstat_clear_snapshot(void)
 	pgStatDBHash = NULL;
 	localBackendStatusTable = NULL;
 	localNumBackends = 0;
-}
-
-
-/* ----------
- * pgstat_recv_inquiry() -
- *
- *	Process stat inquiry requests.
- * ----------
- */
-static void
-pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len)
-{
-	PgStat_StatDBEntry *dbentry;
-
-	elog(DEBUG2, "received inquiry for database %u", msg->databaseid);
 
 	/*
-	 * If there's already a write request for this DB, there's nothing to do.
-	 *
-	 * Note that if a request is found, we return early and skip the below
-	 * check for clock skew.  This is okay, since the only way for a DB
-	 * request to be present in the list is that we have been here since the
-	 * last write round.  It seems sufficient to check for clock skew once per
-	 * write round.
+	 * The parameter informs the function that it is not being called from
+	 * a MemoryContextCallback.
 	 */
-	if (list_member_oid(pending_write_requests, msg->databaseid))
-		return;
-
-	/*
-	 * Check to see if we last wrote this database at a time >= the requested
-	 * cutoff time.  If so, this is a stale request that was generated before
-	 * we updated the DB file, and we don't need to do so again.
-	 *
-	 * If the requestor's local clock time is older than stats_timestamp, we
-	 * should suspect a clock glitch, ie system time going backwards; though
-	 * the more likely explanation is just delayed message receipt.  It is
-	 * worth expending a GetCurrentTimestamp call to be sure, since a large
-	 * retreat in the system clock reading could otherwise cause us to neglect
-	 * to update the stats file for a long time.
-	 */
-	dbentry = pgstat_get_db_entry(msg->databaseid, false);
-	if (dbentry == NULL)
-	{
-		/*
-		 * We have no data for this DB.  Enter a write request anyway so that
-		 * the global stats will get updated.  This is needed to prevent
-		 * backend_read_statsfile from waiting for data that we cannot supply,
-		 * in the case of a new DB that nobody has yet reported any stats for.
-		 * See the behavior of pgstat_read_db_statsfile_timestamp.
-		 */
-	}
-	else if (msg->clock_time < dbentry->stats_timestamp)
-	{
-		TimestampTz cur_ts = GetCurrentTimestamp();
-
-		if (cur_ts < dbentry->stats_timestamp)
-		{
-			/*
-			 * Sure enough, time went backwards.  Force a new stats file write
-			 * to get back in sync; but first, log a complaint.
-			 */
-			char	   *writetime;
-			char	   *mytime;
-
-			/* Copy because timestamptz_to_str returns a static buffer */
-			writetime = pstrdup(timestamptz_to_str(dbentry->stats_timestamp));
-			mytime = pstrdup(timestamptz_to_str(cur_ts));
-			elog(LOG,
-				 "stats_timestamp %s is later than collector's time %s for database %u",
-				 writetime, mytime, dbentry->databaseid);
-			pfree(writetime);
-			pfree(mytime);
-		}
-		else
-		{
-			/*
-			 * Nope, it's just an old request.  Assuming msg's clock_time is
-			 * >= its cutoff_time, it must be stale, so we can ignore it.
-			 */
-			return;
-		}
-	}
-	else if (msg->cutoff_time <= dbentry->stats_timestamp)
-	{
-		/* Stale request, ignore it */
-		return;
-	}
-
-	/*
-	 * We need to write this DB, so create a request.
-	 */
-	pending_write_requests = lappend_oid(pending_write_requests,
-										 msg->databaseid);
+	backend_clean_snapshot_callback(&param);
 }
 
 
@@ -5750,6 +5533,7 @@ pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len)
 static void
 pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
 {
+	dshash_table *tabhash;
 	PgStat_StatDBEntry *dbentry;
 	PgStat_StatTabEntry *tabentry;
 	int			i;
@@ -5765,6 +5549,7 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
 	dbentry->n_block_read_time += msg->m_block_read_time;
 	dbentry->n_block_write_time += msg->m_block_write_time;
 
+	tabhash = dshash_attach(area, &dsh_tblparams, dbentry->tables, 0);
 	/*
 	 * Process all table entries in the message.
 	 */
@@ -5772,9 +5557,8 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
 	{
 		PgStat_TableEntry *tabmsg = &(msg->m_entry[i]);
 
-		tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
-													   (void *) &(tabmsg->t_id),
-													   HASH_ENTER, &found);
+		tabentry = (PgStat_StatTabEntry *)
+			dshash_find_or_insert(tabhash, (void *) &(tabmsg->t_id), &found);
 
 		if (!found)
 		{
@@ -5833,6 +5617,7 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
 		tabentry->n_live_tuples = Max(tabentry->n_live_tuples, 0);
 		/* Likewise for n_dead_tuples */
 		tabentry->n_dead_tuples = Max(tabentry->n_dead_tuples, 0);
+		dshash_release_lock(tabhash, tabentry);
 
 		/*
 		 * Add per-table stats to the per-database entry, too.
@@ -5845,6 +5630,8 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
 		dbentry->n_blocks_fetched += tabmsg->t_counts.t_blocks_fetched;
 		dbentry->n_blocks_hit += tabmsg->t_counts.t_blocks_hit;
 	}
+
+	dshash_release_lock(db_stats, dbentry);
 }
 
 
@@ -5857,27 +5644,33 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
 static void
 pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len)
 {
+	dshash_table *tbl;
 	PgStat_StatDBEntry *dbentry;
 	int			i;
 
 	dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
-
 	/*
 	 * No need to purge if we don't even know the database.
 	 */
-	if (!dbentry || !dbentry->tables)
+	if (!dbentry || dbentry->tables == DSM_HANDLE_INVALID)
+	{
+		if (dbentry)
+			dshash_release_lock(db_stats, dbentry);
 		return;
+	}
 
+	tbl = dshash_attach(area, &dsh_tblparams, dbentry->tables, 0);
 	/*
 	 * Process all table entries in the message.
 	 */
 	for (i = 0; i < msg->m_nentries; i++)
 	{
 		/* Remove from hashtable if present; we don't care if it's not. */
-		(void) hash_search(dbentry->tables,
-						   (void *) &(msg->m_tableid[i]),
-						   HASH_REMOVE, NULL);
+		(void) dshash_delete_key(tbl, (void *) &(msg->m_tableid[i]));
 	}
+
+	dshash_release_lock(db_stats, dbentry);
+
 }
 
 
@@ -5903,23 +5696,20 @@ pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len)
 	 */
 	if (dbentry)
 	{
-		char		statfile[MAXPGPATH];
+		if (dbentry->tables != DSM_HANDLE_INVALID)
+		{
+			dshash_table *tbl =
+				dshash_attach(area, &dsh_tblparams, dbentry->tables, 0);
+			dshash_destroy(tbl);
+		}
+		if (dbentry->functions != DSM_HANDLE_INVALID)
+		{
+			dshash_table *tbl =
+				dshash_attach(area, &dsh_funcparams, dbentry->functions, 0);
+			dshash_destroy(tbl);
+		}
 
-		get_dbstat_filename(false, false, dbid, statfile, MAXPGPATH);
-
-		elog(DEBUG2, "removing stats file \"%s\"", statfile);
-		unlink(statfile);
-
-		if (dbentry->tables != NULL)
-			hash_destroy(dbentry->tables);
-		if (dbentry->functions != NULL)
-			hash_destroy(dbentry->functions);
-
-		if (hash_search(pgStatDBHash,
-						(void *) &dbid,
-						HASH_REMOVE, NULL) == NULL)
-			ereport(ERROR,
-					(errmsg("database hash table corrupted during cleanup --- abort")));
+		dshash_delete_entry(db_stats, (void *)dbentry);
 	}
 }
 
@@ -5947,19 +5737,28 @@ pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len)
 	 * We simply throw away all the database's table entries by recreating a
 	 * new hash table for them.
 	 */
-	if (dbentry->tables != NULL)
-		hash_destroy(dbentry->tables);
-	if (dbentry->functions != NULL)
-		hash_destroy(dbentry->functions);
-
-	dbentry->tables = NULL;
-	dbentry->functions = NULL;
+	if (dbentry->tables != DSM_HANDLE_INVALID)
+	{
+		dshash_table *t =
+			dshash_attach(area, &dsh_tblparams, dbentry->tables, 0);
+		dshash_destroy(t);
+		dbentry->tables = DSM_HANDLE_INVALID;
+	}
+	if (dbentry->functions != DSM_HANDLE_INVALID)
+	{
+		dshash_table *t =
+			dshash_attach(area, &dsh_funcparams, dbentry->functions, 0);
+		dshash_destroy(t);
+		dbentry->functions = DSM_HANDLE_INVALID;
+	}
 
 	/*
 	 * Reset database-level stats, too.  This creates empty hash tables for
 	 * tables and functions.
 	 */
 	reset_dbentry_counters(dbentry);
+
+	dshash_release_lock(db_stats, dbentry);
 }
 
 /* ----------
@@ -5974,14 +5773,14 @@ pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len)
 	if (msg->m_resettarget == RESET_BGWRITER)
 	{
 		/* Reset the global background writer statistics for the cluster. */
-		memset(&globalStats, 0, sizeof(globalStats));
-		globalStats.stat_reset_timestamp = GetCurrentTimestamp();
+		memset(&shared_globalStats, 0, sizeof(shared_globalStats));
+		shared_globalStats->stat_reset_timestamp = GetCurrentTimestamp();
 	}
 	else if (msg->m_resettarget == RESET_ARCHIVER)
 	{
 		/* Reset the archiver statistics for the cluster. */
-		memset(&archiverStats, 0, sizeof(archiverStats));
-		archiverStats.stat_reset_timestamp = GetCurrentTimestamp();
+		memset(&shared_archiverStats, 0, sizeof(shared_archiverStats));
+		shared_archiverStats->stat_reset_timestamp = GetCurrentTimestamp();
 	}
 
 	/*
@@ -6011,11 +5810,19 @@ pgstat_recv_resetsinglecounter(PgStat_MsgResetsinglecounter *msg, int len)
 
 	/* Remove object if it exists, ignore it if not */
 	if (msg->m_resettype == RESET_TABLE)
-		(void) hash_search(dbentry->tables, (void *) &(msg->m_objectid),
-						   HASH_REMOVE, NULL);
+	{
+		dshash_table *t =
+			dshash_attach(area, &dsh_tblparams, dbentry->tables, 0);
+		dshash_delete_key(t, (void *) &(msg->m_objectid));
+	}
 	else if (msg->m_resettype == RESET_FUNCTION)
-		(void) hash_search(dbentry->functions, (void *) &(msg->m_objectid),
-						   HASH_REMOVE, NULL);
+	{
+		dshash_table *t =
+			dshash_attach(area, &dsh_funcparams, dbentry->functions, 0);
+		dshash_delete_key(t, (void *) &(msg->m_objectid));
+	}
+
+	dshash_release_lock(db_stats, dbentry);
 }
 
 /* ----------
@@ -6035,6 +5842,8 @@ pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len)
 	dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
 
 	dbentry->last_autovac_time = msg->m_start_time;
+
+	dshash_release_lock(db_stats, dbentry);
 }
 
 /* ----------
@@ -6048,13 +5857,13 @@ pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len)
 {
 	PgStat_StatDBEntry *dbentry;
 	PgStat_StatTabEntry *tabentry;
-
+	dshash_table *table;
 	/*
 	 * Store the data in the table's hashtable entry.
 	 */
 	dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
-	tabentry = pgstat_get_tab_entry(dbentry, msg->m_tableoid, true);
+	table = dshash_attach(area, &dsh_tblparams, dbentry->tables, 0);
+	tabentry = pgstat_get_tab_entry(table, msg->m_tableoid, true);
 
 	tabentry->n_live_tuples = msg->m_live_tuples;
 	tabentry->n_dead_tuples = msg->m_dead_tuples;
@@ -6069,6 +5878,9 @@ pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len)
 		tabentry->vacuum_timestamp = msg->m_vacuumtime;
 		tabentry->vacuum_count++;
 	}
+	dshash_release_lock(table, tabentry);
+	dshash_detach(table);
+	dshash_release_lock(db_stats, dbentry);
 }
 
 /* ----------
@@ -6082,13 +5894,15 @@ pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len)
 {
 	PgStat_StatDBEntry *dbentry;
 	PgStat_StatTabEntry *tabentry;
+	dshash_table *table;
 
 	/*
 	 * Store the data in the table's hashtable entry.
 	 */
 	dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
 
-	tabentry = pgstat_get_tab_entry(dbentry, msg->m_tableoid, true);
+	table = dshash_attach(area, &dsh_tblparams, dbentry->tables, 0);
+	tabentry = pgstat_get_tab_entry(table, msg->m_tableoid, true);
 
 	tabentry->n_live_tuples = msg->m_live_tuples;
 	tabentry->n_dead_tuples = msg->m_dead_tuples;
@@ -6111,6 +5925,9 @@ pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len)
 		tabentry->analyze_timestamp = msg->m_analyzetime;
 		tabentry->analyze_count++;
 	}
+	dshash_release_lock(table, tabentry);
+	dshash_detach(table);
+	dshash_release_lock(db_stats, dbentry);
 }
 
 
@@ -6126,18 +5943,18 @@ pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len)
 	if (msg->m_failed)
 	{
 		/* Failed archival attempt */
-		++archiverStats.failed_count;
-		memcpy(archiverStats.last_failed_wal, msg->m_xlog,
-			   sizeof(archiverStats.last_failed_wal));
-		archiverStats.last_failed_timestamp = msg->m_timestamp;
+		++shared_archiverStats->failed_count;
+		memcpy(shared_archiverStats->last_failed_wal, msg->m_xlog,
+			   sizeof(shared_archiverStats->last_failed_wal));
+		shared_archiverStats->last_failed_timestamp = msg->m_timestamp;
 	}
 	else
 	{
 		/* Successful archival operation */
-		++archiverStats.archived_count;
-		memcpy(archiverStats.last_archived_wal, msg->m_xlog,
-			   sizeof(archiverStats.last_archived_wal));
-		archiverStats.last_archived_timestamp = msg->m_timestamp;
+		++shared_archiverStats->archived_count;
+		memcpy(shared_archiverStats->last_archived_wal, msg->m_xlog,
+			   sizeof(shared_archiverStats->last_archived_wal));
+		shared_archiverStats->last_archived_timestamp = msg->m_timestamp;
 	}
 }
 
@@ -6150,16 +5967,16 @@ pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len)
 static void
 pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len)
 {
-	globalStats.timed_checkpoints += msg->m_timed_checkpoints;
-	globalStats.requested_checkpoints += msg->m_requested_checkpoints;
-	globalStats.checkpoint_write_time += msg->m_checkpoint_write_time;
-	globalStats.checkpoint_sync_time += msg->m_checkpoint_sync_time;
-	globalStats.buf_written_checkpoints += msg->m_buf_written_checkpoints;
-	globalStats.buf_written_clean += msg->m_buf_written_clean;
-	globalStats.maxwritten_clean += msg->m_maxwritten_clean;
-	globalStats.buf_written_backend += msg->m_buf_written_backend;
-	globalStats.buf_fsync_backend += msg->m_buf_fsync_backend;
-	globalStats.buf_alloc += msg->m_buf_alloc;
+	shared_globalStats->timed_checkpoints += msg->m_timed_checkpoints;
+	shared_globalStats->requested_checkpoints += msg->m_requested_checkpoints;
+	shared_globalStats->checkpoint_write_time += msg->m_checkpoint_write_time;
+	shared_globalStats->checkpoint_sync_time += msg->m_checkpoint_sync_time;
+	shared_globalStats->buf_written_checkpoints += msg->m_buf_written_checkpoints;
+	shared_globalStats->buf_written_clean += msg->m_buf_written_clean;
+	shared_globalStats->maxwritten_clean += msg->m_maxwritten_clean;
+	shared_globalStats->buf_written_backend += msg->m_buf_written_backend;
+	shared_globalStats->buf_fsync_backend += msg->m_buf_fsync_backend;
+	shared_globalStats->buf_alloc += msg->m_buf_alloc;
 }
 
 /* ----------
@@ -6200,6 +6017,8 @@ pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len)
 			dbentry->n_conflict_startup_deadlock++;
 			break;
 	}
+
+	dshash_release_lock(db_stats, dbentry);
 }
 
 /* ----------
@@ -6216,6 +6035,8 @@ pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len)
 	dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
 
 	dbentry->n_deadlocks++;
+
+	dshash_release_lock(db_stats, dbentry);
 }
 
 /* ----------
@@ -6233,6 +6054,8 @@ pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len)
 
 	dbentry->n_temp_bytes += msg->m_filesize;
 	dbentry->n_temp_files += 1;
+
+	dshash_release_lock(db_stats, dbentry);
 }
 
 /* ----------
@@ -6244,6 +6067,7 @@ pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len)
 static void
 pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len)
 {
+	dshash_table *t;
 	PgStat_FunctionEntry *funcmsg = &(msg->m_entry[0]);
 	PgStat_StatDBEntry *dbentry;
 	PgStat_StatFuncEntry *funcentry;
@@ -6252,14 +6076,14 @@ pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len)
 
 	dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
 
+	t = dshash_attach(area, &dsh_funcparams, dbentry->functions, 0);
 	/*
 	 * Process all function entries in the message.
 	 */
 	for (i = 0; i < msg->m_nentries; i++, funcmsg++)
 	{
-		funcentry = (PgStat_StatFuncEntry *) hash_search(dbentry->functions,
-														 (void *) &(funcmsg->f_id),
-														 HASH_ENTER, &found);
+		funcentry = (PgStat_StatFuncEntry *)
+			dshash_find_or_insert(t, (void *) &(funcmsg->f_id), &found);
 
 		if (!found)
 		{
@@ -6280,7 +6104,11 @@ pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len)
 			funcentry->f_total_time += funcmsg->f_total_time;
 			funcentry->f_self_time += funcmsg->f_self_time;
 		}
+		dshash_release_lock(t, funcentry);
 	}
+
+	dshash_detach(t);
+	dshash_release_lock(db_stats, dbentry);
 }
 
 /* ----------
@@ -6292,6 +6120,7 @@ pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len)
 static void
 pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len)
 {
+	dshash_table *t;
 	PgStat_StatDBEntry *dbentry;
 	int			i;
 
@@ -6300,60 +6129,20 @@ pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len)
 	/*
 	 * No need to purge if we don't even know the database.
 	 */
-	if (!dbentry || !dbentry->functions)
+	if (!dbentry || dbentry->functions == DSM_HANDLE_INVALID)
 		return;
 
+	t = dshash_attach(area, &dsh_funcparams, dbentry->functions, 0);
 	/*
 	 * Process all function entries in the message.
 	 */
 	for (i = 0; i < msg->m_nentries; i++)
 	{
 		/* Remove from hashtable if present; we don't care if it's not. */
-		(void) hash_search(dbentry->functions,
-						   (void *) &(msg->m_functionid[i]),
-						   HASH_REMOVE, NULL);
+		dshash_delete_key(t, (void *) &(msg->m_functionid[i]));
 	}
-}
-
-/* ----------
- * pgstat_write_statsfile_needed() -
- *
- *	Do we need to write out any stats files?
- * ----------
- */
-static bool
-pgstat_write_statsfile_needed(void)
-{
-	if (pending_write_requests != NIL)
-		return true;
-
-	/* Everything was written recently */
-	return false;
-}
-
-/* ----------
- * pgstat_db_requested() -
- *
- *	Checks whether stats for a particular DB need to be written to a file.
- * ----------
- */
-static bool
-pgstat_db_requested(Oid databaseid)
-{
-	/*
-	 * If any requests are outstanding at all, we should write the stats for
-	 * shared catalogs (the "database" with OID 0).  This ensures that
-	 * backends will see up-to-date stats for shared catalogs, even though
-	 * they send inquiry messages mentioning only their own DB.
-	 */
-	if (databaseid == InvalidOid && pending_write_requests != NIL)
-		return true;
-
-	/* Search to see if there's an open request to write this database. */
-	if (list_member_oid(pending_write_requests, databaseid))
-		return true;
-
-	return false;
+	dshash_detach(t);
+	dshash_release_lock(db_stats, dbentry);
 }
 
 /*
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index 3f1eae38a9..a517bf62b6 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -77,9 +77,6 @@ static bool is_checksummed_file(const char *fullpath, const char *filename);
 /* Was the backup currently in-progress initiated in recovery mode? */
 static bool backup_started_in_recovery = false;
 
-/* Relative path of temporary statistics directory */
-static char *statrelpath = NULL;
-
 /*
  * Size of each block sent into the tar stream for larger files.
  */
@@ -121,13 +118,6 @@ static bool noverify_checksums = false;
  */
 static const char *excludeDirContents[] =
 {
-	/*
-	 * Skip temporary statistics files. PG_STAT_TMP_DIR must be skipped even
-	 * when stats_temp_directory is set because PGSS_TEXT_FILE is always
-	 * created there.
-	 */
-	PG_STAT_TMP_DIR,
-
 	/*
 	 * It is generally not useful to backup the contents of this directory
 	 * even if the intention is to restore to another master. See backup.sgml
@@ -223,11 +213,8 @@ perform_base_backup(basebackup_options *opt)
 	TimeLineID	endtli;
 	StringInfo	labelfile;
 	StringInfo	tblspc_map_file = NULL;
-	int			datadirpathlen;
 	List	   *tablespaces = NIL;
 
-	datadirpathlen = strlen(DataDir);
-
 	backup_started_in_recovery = RecoveryInProgress();
 
 	labelfile = makeStringInfo();
@@ -254,18 +241,6 @@ perform_base_backup(basebackup_options *opt)
 
 		SendXlogRecPtrResult(startptr, starttli);
 
-		/*
-		 * Calculate the relative path of temporary statistics directory in
-		 * order to skip the files which are located in that directory later.
-		 */
-		if (is_absolute_path(pgstat_stat_directory) &&
-			strncmp(pgstat_stat_directory, DataDir, datadirpathlen) == 0)
-			statrelpath = psprintf("./%s", pgstat_stat_directory + datadirpathlen + 1);
-		else if (strncmp(pgstat_stat_directory, "./", 2) != 0)
-			statrelpath = psprintf("./%s", pgstat_stat_directory);
-		else
-			statrelpath = pgstat_stat_directory;
-
 		/* Add a node for the base directory at the end */
 		ti = palloc0(sizeof(tablespaceinfo));
 		ti->size = opt->progress ? sendDir(".", 1, true, tablespaces, true) : -1;
@@ -1174,17 +1149,6 @@ sendDir(const char *path, int basepathlen, bool sizeonly, List *tablespaces,
 		if (excludeFound)
 			continue;
 
-		/*
-		 * Exclude contents of directory specified by statrelpath if not set
-		 * to the default (pg_stat_tmp) which is caught in the loop above.
-		 */
-		if (statrelpath != NULL && strcmp(pathbuf, statrelpath) == 0)
-		{
-			elog(DEBUG1, "contents of directory \"%s\" excluded from backup", statrelpath);
-			size += _tarWriteDir(pathbuf, basepathlen, &statbuf, sizeonly);
-			continue;
-		}
-
 		/*
 		 * We can skip pg_wal, the WAL segments need to be fetched from the
 		 * WAL archive anyway. But include it as an empty directory anyway, so
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 0c86a581c0..ee30e8a14f 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -150,6 +150,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
 		size = add_size(size, SyncScanShmemSize());
 		size = add_size(size, AsyncShmemSize());
 		size = add_size(size, BackendRandomShmemSize());
+		size = add_size(size, StatsShmemSize());
 #ifdef EXEC_BACKEND
 		size = add_size(size, ShmemBackendArraySize());
 #endif
@@ -270,6 +271,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
 	SyncScanShmemInit();
 	AsyncShmemInit();
 	BackendRandomShmemInit();
+	StatsShmemInit();
 
 #ifdef EXEC_BACKEND
 
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index a6fda81feb..c46bb8d057 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -521,6 +521,9 @@ RegisterLWLockTranches(void)
 	LWLockRegisterTranche(LWTRANCHE_TBM, "tbm");
 	LWLockRegisterTranche(LWTRANCHE_PARALLEL_APPEND, "parallel_append");
 	LWLockRegisterTranche(LWTRANCHE_PARALLEL_HASH_JOIN, "parallel_hash_join");
+	LWLockRegisterTranche(LWTRANCHE_STATS_DSA, "stats table dsa");
+	LWLockRegisterTranche(LWTRANCHE_STATS_DB, "db stats");
+	LWLockRegisterTranche(LWTRANCHE_STATS_FUNC_TABLE, "table/func stats");
 
 	/* Register named tranches. */
 	for (i = 0; i < NamedLWLockTrancheRequests; i++)
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
index e6025ecedb..798af9f168 100644
--- a/src/backend/storage/lmgr/lwlocknames.txt
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -50,3 +50,4 @@ OldSnapshotTimeMapLock				42
 BackendRandomLock					43
 LogicalRepWorkerLock				44
 CLogTruncationLock					45
+StatsLock							46
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 859ef931e7..50043eb5fc 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -186,7 +186,6 @@ static bool check_autovacuum_max_workers(int *newval, void **extra, GucSource so
 static bool check_autovacuum_work_mem(int *newval, void **extra, GucSource source);
 static bool check_effective_io_concurrency(int *newval, void **extra, GucSource source);
 static void assign_effective_io_concurrency(int newval, void *extra);
-static void assign_pgstat_temp_directory(const char *newval, void *extra);
 static bool check_application_name(char **newval, void **extra, GucSource source);
 static void assign_application_name(const char *newval, void *extra);
 static bool check_cluster_name(char **newval, void **extra, GucSource source);
@@ -3755,17 +3754,6 @@ static struct config_string ConfigureNamesString[] =
 		NULL, NULL, NULL
 	},
 
-	{
-		{"stats_temp_directory", PGC_SIGHUP, STATS_COLLECTOR,
-			gettext_noop("Writes temporary statistics files to the specified directory."),
-			NULL,
-			GUC_SUPERUSER_ONLY
-		},
-		&pgstat_temp_directory,
-		PG_STAT_TMP_DIR,
-		check_canonical_path, assign_pgstat_temp_directory, NULL
-	},
-
 	{
 		{"synchronous_standby_names", PGC_SIGHUP, REPLICATION_MASTER,
 			gettext_noop("Number of synchronous standbys and list of names of potential synchronous ones."),
@@ -10691,35 +10679,6 @@ assign_effective_io_concurrency(int newval, void *extra)
 #endif							/* USE_PREFETCH */
 }
 
-static void
-assign_pgstat_temp_directory(const char *newval, void *extra)
-{
-	/* check_canonical_path already canonicalized newval for us */
-	char	   *dname;
-	char	   *tname;
-	char	   *fname;
-
-	/* directory */
-	dname = guc_malloc(ERROR, strlen(newval) + 1);	/* runtime dir */
-	sprintf(dname, "%s", newval);
-
-	/* global stats */
-	tname = guc_malloc(ERROR, strlen(newval) + 12); /* /global.tmp */
-	sprintf(tname, "%s/global.tmp", newval);
-	fname = guc_malloc(ERROR, strlen(newval) + 13); /* /global.stat */
-	sprintf(fname, "%s/global.stat", newval);
-
-	if (pgstat_stat_directory)
-		free(pgstat_stat_directory);
-	pgstat_stat_directory = dname;
-	if (pgstat_stat_tmpname)
-		free(pgstat_stat_tmpname);
-	pgstat_stat_tmpname = tname;
-	if (pgstat_stat_filename)
-		free(pgstat_stat_filename);
-	pgstat_stat_filename = fname;
-}
-
 static bool
 check_application_name(char **newval, void **extra, GucSource source)
 {
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 9e39baf466..7aa57bc489 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -509,7 +509,6 @@
 #track_io_timing = off
 #track_functions = none			# none, pl, all
 #track_activity_query_size = 1024	# (change requires restart)
-#stats_temp_directory = 'pg_stat_tmp'
 
 
 # - Monitoring -
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index ae22e7d9fb..0c3b82b455 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -216,7 +216,6 @@ static const char *const subdirs[] = {
 	"pg_replslot",
 	"pg_tblspc",
 	"pg_stat",
-	"pg_stat_tmp",
 	"pg_xact",
 	"pg_logical",
 	"pg_logical/snapshots",
diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
index aab2e1eecf..a646d3f972 100644
--- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl
+++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
@@ -123,7 +123,7 @@ is_deeply(
 
 # Contents of these directories should not be copied.
 foreach my $dirname (
-	qw(pg_dynshmem pg_notify pg_replslot pg_serial pg_snapshots pg_stat_tmp pg_subtrans)
+	qw(pg_dynshmem pg_notify pg_replslot pg_serial pg_snapshots pg_subtrans)
   )
 {
 	is_deeply(
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 0b9609f96e..6608c9be93 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -13,6 +13,7 @@
 
 #include "datatype/timestamp.h"
 #include "fmgr.h"
+#include "lib/dshash.h"
 #include "libpq/pqcomm.h"
 #include "port/atomics.h"
 #include "portability/instr_time.h"
@@ -30,9 +31,6 @@
 #define PGSTAT_STAT_PERMANENT_FILENAME		"pg_stat/global.stat"
 #define PGSTAT_STAT_PERMANENT_TMPFILE		"pg_stat/global.tmp"
 
-/* Default directory to store temporary statistics data in */
-#define PG_STAT_TMP_DIR		"pg_stat_tmp"
-
 /* Values for track_functions GUC variable --- order is significant! */
 typedef enum TrackFunctionsLevel
 {
@@ -48,7 +46,6 @@ typedef enum TrackFunctionsLevel
 typedef enum StatMsgType
 {
 	PGSTAT_MTYPE_DUMMY,
-	PGSTAT_MTYPE_INQUIRY,
 	PGSTAT_MTYPE_TABSTAT,
 	PGSTAT_MTYPE_TABPURGE,
 	PGSTAT_MTYPE_DROPDB,
@@ -216,35 +213,6 @@ typedef struct PgStat_MsgDummy
 	PgStat_MsgHdr m_hdr;
 } PgStat_MsgDummy;
 
-
-/* ----------
- * PgStat_MsgInquiry			Sent by a backend to ask the collector
- *								to write the stats file(s).
- *
- * Ordinarily, an inquiry message prompts writing of the global stats file,
- * the stats file for shared catalogs, and the stats file for the specified
- * database.  If databaseid is InvalidOid, only the first two are written.
- *
- * New file(s) will be written only if the existing file has a timestamp
- * older than the specified cutoff_time; this prevents duplicated effort
- * when multiple requests arrive at nearly the same time, assuming that
- * backends send requests with cutoff_times a little bit in the past.
- *
- * clock_time should be the requestor's current local time; the collector
- * uses this to check for the system clock going backward, but it has no
- * effect unless that occurs.  We assume clock_time >= cutoff_time, though.
- * ----------
- */
-
-typedef struct PgStat_MsgInquiry
-{
-	PgStat_MsgHdr m_hdr;
-	TimestampTz clock_time;		/* observed local clock time */
-	TimestampTz cutoff_time;	/* minimum acceptable file timestamp */
-	Oid			databaseid;		/* requested DB (InvalidOid => shared only) */
-} PgStat_MsgInquiry;
-
-
 /* ----------
  * PgStat_TableEntry			Per-table info in a MsgTabstat
  * ----------
@@ -539,7 +507,6 @@ typedef union PgStat_Msg
 {
 	PgStat_MsgHdr msg_hdr;
 	PgStat_MsgDummy msg_dummy;
-	PgStat_MsgInquiry msg_inquiry;
 	PgStat_MsgTabstat msg_tabstat;
 	PgStat_MsgTabpurge msg_tabpurge;
 	PgStat_MsgDropdb msg_dropdb;
@@ -601,10 +568,13 @@ typedef struct PgStat_StatDBEntry
 
 	/*
 	 * tables and functions must be last in the struct, because we don't write
-	 * the pointers out to the stats file.
+	 * the handles and pointers out to the stats file.
 	 */
-	HTAB	   *tables;
-	HTAB	   *functions;
+	dshash_table_handle tables;
+	dshash_table_handle functions;
+	/* for snapshot tables */
+	HTAB *snapshot_tables;
+	HTAB *snapshot_functions;
 } PgStat_StatDBEntry;
 
 
@@ -1217,6 +1187,7 @@ extern PgStat_BackendFunctionEntry *find_funcstat_entry(Oid func_id);
 extern void pgstat_initstats(Relation rel);
 
 extern char *pgstat_clip_activity(const char *raw_activity);
+extern PgStat_StatTabEntry *backend_get_tab_entry(PgStat_StatDBEntry *dbent, Oid relid);
 
 /* ----------
  * pgstat_report_wait_start() -
@@ -1344,6 +1315,9 @@ extern void pgstat_send_bgwriter(void);
  * generate the pgstat* views.
  * ----------
  */
+extern Size StatsShmemSize(void);
+extern void StatsShmemInit(void);
+extern void PgstatCollectorMain(void);
 extern PgStat_StatDBEntry *pgstat_fetch_stat_dbentry(Oid dbid);
 extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry(Oid relid);
 extern PgBackendStatus *pgstat_fetch_stat_beentry(int beid);
@@ -1353,7 +1327,4 @@ extern int	pgstat_fetch_stat_numbackends(void);
 extern PgStat_ArchiverStats *pgstat_fetch_stat_archiver(void);
 extern PgStat_GlobalStats *pgstat_fetch_global(void);
 
-/* Main loop */
-extern void PgstatCollectorMain(void);
-
 #endif							/* PGSTAT_H */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index c21bfe2f66..2cdd10c2fd 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -219,6 +219,9 @@ typedef enum BuiltinTrancheIds
 	LWTRANCHE_SHARED_TUPLESTORE,
 	LWTRANCHE_TBM,
 	LWTRANCHE_PARALLEL_APPEND,
+	LWTRANCHE_STATS_DSA,
+	LWTRANCHE_STATS_DB,
+	LWTRANCHE_STATS_FUNC_TABLE,
 	LWTRANCHE_FIRST_USER_DEFINED
 }			BuiltinTrancheIds;
 
-- 
2.16.3

Reply via email to