Hello. This is intended to provide more statistics items, as discussed in the following thread.
https://www.postgresql.org/message-id/20171010.192616.108347483.horiguchi.kyot...@lab.ntt.co.jp The biggest obstacle to having more items in statistics collector views comes from the mechanism used to share the values among backends. It currently uses a file. The stats collector writes a file when triggered by backends, then backends read the written file. A larger file makes the latency longer, and we don't have spare bandwidth for additional statistics items. Nowadays PostgreSQL has a dynamic shared hash table (dshash), so we can use this as the main storage of statistics. We can share data without stress using this. A PoC previously posted tried to use a "locally copied" dshash, but it didn't look fine, so I steered in a different direction. With this patch, dshash can create a local copy based on dynahash. This patch consists of three files. v1-0001-Give-dshash-ability-to-make-a-local-snapshot.patch adds the ability for dshash to make a local copy backed by dynahash. v1-0002-Change-stats-collector-to-an-axiliary-process.patch changes the stats collector to an auxiliary process so that it can attach dynamic shared memory. v1-0003-dshash-based-stats-collector.patch implements a shared-memory based stats collector. I'll put a more detailed explanation later. regards. -- Kyotaro Horiguchi NTT Open Source Software Center
>From f1e033f308b9bbd2f1b1a3bb4a71f0fe2c538e82 Mon Sep 17 00:00:00 2001 From: Kyotaro Horiguchi <horiguchi.kyot...@lab.ntt.co.jp> Date: Fri, 29 Jun 2018 16:41:04 +0900 Subject: [PATCH 1/3] Give dshash ability to make a local snapshot Add snapshot feature to DSHASH, that makes a dynahash that consists of the same data. --- src/backend/lib/dshash.c | 164 +++++++++++++++++++++++++++++++++++++++++++++++ src/include/lib/dshash.h | 21 +++++- 2 files changed, 184 insertions(+), 1 deletion(-) diff --git a/src/backend/lib/dshash.c b/src/backend/lib/dshash.c index b46f7c4cfd..758435efe7 100644 --- a/src/backend/lib/dshash.c +++ b/src/backend/lib/dshash.c @@ -354,6 +354,48 @@ dshash_destroy(dshash_table *hash_table) pfree(hash_table); } +/* + * take a local snapshot of a dshash table + */ +HTAB * +dshash_take_snapshot(dshash_table *org_table) +{ + HTAB *dest_hash; + HASHCTL ctl; + int num_entries = 0; + dshash_seq_status s; + void *ps; + + ctl.keysize = org_table->params.key_size; + ctl.entrysize = org_table->params.entry_size; + + /* + * num_entries is not a strict parameter. We don't care if new entries + * are added before taking snapshot + */ + num_entries = dshash_get_num_entries(org_table); + + /* + * dshash only supports binary hash and comparator functions, which won't + * stop at intemediate NUL(\x0) bytes. Just specify HASH_BLOBS so that the + * local hash behaves in the same way. + */ + dest_hash = hash_create("local snapshot of dshash", + num_entries, &ctl, + HASH_ELEM | HASH_BLOBS); + + dshash_seq_init(&s, org_table, true); + while ((ps = dshash_seq_next(&s)) != NULL) + { + bool found; + void *pd = hash_search(dest_hash, ps, HASH_ENTER, &found); + Assert(!found); + memcpy(pd, ps, ctl.entrysize); + } + + return dest_hash; +} + /* * Get a handle that can be used by other processes to attach to this hash * table. 
@@ -592,6 +634,128 @@ dshash_memhash(const void *v, size_t size, void *arg) return tag_hash(v, size); } +/* + * Initialize a sequential scan on the hash_table. Allows no modifications + * during a scan if consistent = true. + */ +void +dshash_seq_init(dshash_seq_status *status, dshash_table *hash_table, + bool consistent) +{ + status->hash_table = hash_table; + status->curbucket = 0; + status->nbuckets = ((size_t) 1) << hash_table->control->size_log2; + status->curitem = NULL; + status->curpartition = -1; + status->consistent = consistent; + + /* + * Protect all partitions from modification if the caller wants a + * consistent result. + */ + if (consistent) + { + int i; + + for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i) + { + Assert(!LWLockHeldByMe(PARTITION_LOCK(hash_table, i))); + + LWLockAcquire(PARTITION_LOCK(hash_table, i), LW_SHARED); + } + } + ensure_valid_bucket_pointers(hash_table); +} + +void * +dshash_seq_next(dshash_seq_status *status) +{ + dsa_pointer next_item_pointer; + + if (status->curitem == NULL) + { + Assert (status->curbucket == 0); + Assert(!status->hash_table->find_locked); + + /* first shot. grab the first item. */ + next_item_pointer = status->hash_table->buckets[status->curbucket]; + status->hash_table->find_locked = true; + } + else + next_item_pointer = status->curitem->next; + + /* Move to the next bucket if we finished the current bucket */ + while (!DsaPointerIsValid(next_item_pointer)) + { + if (++status->curbucket >= status->nbuckets) + { + /* all buckets have been scanned. finsih. */ + dshash_seq_release(status); + return NULL; + } + Assert(status->hash_table->find_locked); + + next_item_pointer = status->hash_table->buckets[status->curbucket]; + + /* + * we need a lock on the scanning partition even if the caller don't + * requested a consistent snapshot. 
+ */ + if (!status->consistent && DsaPointerIsValid(next_item_pointer)) + { + dshash_table_item *item = dsa_get_address(status->hash_table->area, + next_item_pointer); + int next_partition = PARTITION_FOR_HASH(item->hash); + if (status->curpartition != next_partition) + { + if (status->curpartition >= 0) + LWLockRelease(PARTITION_LOCK(status->hash_table, + status->curpartition)); + LWLockAcquire(PARTITION_LOCK(status->hash_table, + next_partition), + LW_SHARED); + status->curpartition = next_partition; + } + } + } + + status->curitem = + dsa_get_address(status->hash_table->area, next_item_pointer); + return ENTRY_FROM_ITEM(status->curitem); +} + +void +dshash_seq_release(dshash_seq_status *status) +{ + Assert(status->hash_table->find_locked); + status->hash_table->find_locked = false; + + if (status->consistent) + { + int i; + + for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i) + LWLockRelease(PARTITION_LOCK(status->hash_table, i)); + } + else if (status->curpartition >= 0) + LWLockRelease(PARTITION_LOCK(status->hash_table, status->curpartition)); +} + +int +dshash_get_num_entries(dshash_table *hash_table) +{ + /* a shotcut implement. should be improved */ + dshash_seq_status s; + void *p; + int n = 0; + + dshash_seq_init(&s, hash_table, false); + while ((p = dshash_seq_next(&s)) != NULL) + n++; + + return n; +} + /* * Print debugging information about the internal state of the hash table to * stderr. The caller must hold no partition locks. diff --git a/src/include/lib/dshash.h b/src/include/lib/dshash.h index 8c733bfe25..eb2e45cf66 100644 --- a/src/include/lib/dshash.h +++ b/src/include/lib/dshash.h @@ -15,6 +15,7 @@ #define DSHASH_H #include "utils/dsa.h" +#include "utils/hsearch.h" /* The opaque type representing a hash table. 
*/ struct dshash_table; @@ -59,6 +60,18 @@ typedef struct dshash_parameters struct dshash_table_item; typedef struct dshash_table_item dshash_table_item; +struct dshash_seq_status +{ + dshash_table *hash_table; + int curbucket; + int nbuckets; + dshash_table_item *curitem; + int curpartition; + bool consistent; +}; + +typedef struct dshash_seq_status dshash_seq_status; + /* Creating, sharing and destroying from hash tables. */ extern dshash_table *dshash_create(dsa_area *area, const dshash_parameters *params, @@ -70,7 +83,7 @@ extern dshash_table *dshash_attach(dsa_area *area, extern void dshash_detach(dshash_table *hash_table); extern dshash_table_handle dshash_get_hash_table_handle(dshash_table *hash_table); extern void dshash_destroy(dshash_table *hash_table); - +extern HTAB * dshash_take_snapshot(dshash_table *org_table); /* Finding, creating, deleting entries. */ extern void *dshash_find(dshash_table *hash_table, const void *key, bool exclusive); @@ -80,6 +93,12 @@ extern bool dshash_delete_key(dshash_table *hash_table, const void *key); extern void dshash_delete_entry(dshash_table *hash_table, void *entry); extern void dshash_release_lock(dshash_table *hash_table, void *entry); +/* seq scan support */ +extern void dshash_seq_init(dshash_seq_status *status, dshash_table *hash_table, + bool exclusive); +extern void *dshash_seq_next(dshash_seq_status *status); +extern void dshash_seq_release(dshash_seq_status *status); +extern int dshash_get_num_entries(dshash_table *hash_table); /* Convenience hash and compare functions wrapping memcmp and tag_hash. */ extern int dshash_memcmp(const void *a, const void *b, size_t size, void *arg); extern dshash_hash dshash_memhash(const void *v, size_t size, void *arg); -- 2.16.3
>From 26d1d99e5584cc4868ffc7cb6c20d4303276bdf5 Mon Sep 17 00:00:00 2001 From: Kyotaro Horiguchi <horiguchi.kyot...@lab.ntt.co.jp> Date: Fri, 29 Jun 2018 16:58:32 +0900 Subject: [PATCH 2/3] Change stats collector to an axiliary process. Shared memory and LWLocks are required to let stats collector use dshash. This patch makes stats collector an auxiliary process. --- src/backend/bootstrap/bootstrap.c | 8 +++++ src/backend/postmaster/pgstat.c | 58 +++++++++++++++++++++++++------------ src/backend/postmaster/postmaster.c | 24 +++++++++------ src/include/miscadmin.h | 3 +- src/include/pgstat.h | 6 +++- 5 files changed, 70 insertions(+), 29 deletions(-) diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 7e34bee63e..8f8327495a 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -335,6 +335,9 @@ AuxiliaryProcessMain(int argc, char *argv[]) case WalReceiverProcess: statmsg = pgstat_get_backend_desc(B_WAL_RECEIVER); break; + case StatsCollectorProcess: + statmsg = pgstat_get_backend_desc(B_STATS_COLLECTOR); + break; default: statmsg = "??? process"; break; @@ -462,6 +465,11 @@ AuxiliaryProcessMain(int argc, char *argv[]) WalReceiverMain(); proc_exit(1); /* should never return */ + case StatsCollectorProcess: + /* don't set signals, stats collector has its own agenda */ + PgstatCollectorMain(); + proc_exit(1); /* should never return */ + default: elog(PANIC, "unrecognized process type: %d", (int) MyAuxProcType); proc_exit(1); diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 084573e77c..6d9344fcca 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -267,6 +267,7 @@ static List *pending_write_requests = NIL; /* Signal handler flags */ static volatile bool need_exit = false; static volatile bool got_SIGHUP = false; +static volatile bool got_SIGTERM = false; /* * Total time charged to functions so far in the current backend. 
@@ -284,8 +285,8 @@ static instr_time total_func_time; static pid_t pgstat_forkexec(void); #endif -NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]) pg_attribute_noreturn(); -static void pgstat_exit(SIGNAL_ARGS); +static void pgstat_shutdown_handler(SIGNAL_ARGS); +static void pgstat_quickdie_handler(SIGNAL_ARGS); static void pgstat_beshutdown_hook(int code, Datum arg); static void pgstat_sighup_handler(SIGNAL_ARGS); @@ -770,11 +771,7 @@ pgstat_start(void) /* Close the postmaster's sockets */ ClosePostmasterPorts(false); - /* Drop our connection to postmaster's shared memory, as well */ - dsm_detach_all(); - PGSharedMemoryDetach(); - - PgstatCollectorMain(0, NULL); + PgstatCollectorMain(); break; #endif @@ -2870,6 +2867,9 @@ pgstat_bestart(void) case WalReceiverProcess: beentry->st_backendType = B_WAL_RECEIVER; break; + case StatsCollectorProcess: + beentry->st_backendType = B_STATS_COLLECTOR; + break; default: elog(FATAL, "unrecognized process type: %d", (int) MyAuxProcType); @@ -4132,6 +4132,9 @@ pgstat_get_backend_desc(BackendType backendType) case B_WAL_WRITER: backendDesc = "walwriter"; break; + case B_STATS_COLLECTOR: + backendDesc = "stats collector"; + break; } return backendDesc; @@ -4249,8 +4252,8 @@ pgstat_send_bgwriter(void) * The argc/argv parameters are valid only in EXEC_BACKEND case. * ---------- */ -NON_EXEC_STATIC void -PgstatCollectorMain(int argc, char *argv[]) +void +PgstatCollectorMain(void) { int len; PgStat_Msg msg; @@ -4263,8 +4266,8 @@ PgstatCollectorMain(int argc, char *argv[]) */ pqsignal(SIGHUP, pgstat_sighup_handler); pqsignal(SIGINT, SIG_IGN); - pqsignal(SIGTERM, SIG_IGN); - pqsignal(SIGQUIT, pgstat_exit); + pqsignal(SIGTERM, pgstat_shutdown_handler); + pqsignal(SIGQUIT, pgstat_quickdie_handler); pqsignal(SIGALRM, SIG_IGN); pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGUSR1, SIG_IGN); @@ -4309,14 +4312,14 @@ PgstatCollectorMain(int argc, char *argv[]) /* * Quit if we get SIGQUIT from the postmaster. 
*/ - if (need_exit) + if (got_SIGTERM) break; /* * Inner loop iterates as long as we keep getting messages, or until * need_exit becomes set. */ - while (!need_exit) + while (!got_SIGTERM) { /* * Reload configuration if we got SIGHUP from the postmaster. @@ -4504,14 +4507,21 @@ PgstatCollectorMain(int argc, char *argv[]) /* SIGQUIT signal handler for collector process */ static void -pgstat_exit(SIGNAL_ARGS) +pgstat_quickdie_handler(SIGNAL_ARGS) { - int save_errno = errno; + PG_SETMASK(&BlockSig); - need_exit = true; - SetLatch(MyLatch); + /* + * We DO NOT want to run proc_exit() callbacks -- we're here because + * shared memory may be corrupted, so we don't want to try to clean up our + * transaction. Just nail the windows shut and get out of town. Now that + * there's an atexit callback to prevent third-party code from breaking + * things by calling exit() directly, we have to reset the callbacks + * explicitly to make this work as intended. + */ + on_exit_reset(); - errno = save_errno; + exit(2); } /* SIGHUP handler for collector process */ @@ -4526,6 +4536,18 @@ pgstat_sighup_handler(SIGNAL_ARGS) errno = save_errno; } +static void +pgstat_shutdown_handler(SIGNAL_ARGS) +{ + int save_errno = errno; + + got_SIGTERM = true; + + SetLatch(MyLatch); + + errno = save_errno; +} + /* * Subroutine to clear stats in a database entry * diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index a4b53b33cd..a6209cd749 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -145,7 +145,8 @@ #define BACKEND_TYPE_AUTOVAC 0x0002 /* autovacuum worker process */ #define BACKEND_TYPE_WALSND 0x0004 /* walsender process */ #define BACKEND_TYPE_BGWORKER 0x0008 /* bgworker process */ -#define BACKEND_TYPE_ALL 0x000F /* OR of all the above */ +#define BACKEND_TYPE_STATS 0x0010 /* bgworker process */ +#define BACKEND_TYPE_ALL 0x001F /* OR of all the above */ #define BACKEND_TYPE_WORKER (BACKEND_TYPE_AUTOVAC | 
BACKEND_TYPE_BGWORKER) @@ -551,6 +552,7 @@ static void ShmemBackendArrayRemove(Backend *bn); #define StartCheckpointer() StartChildProcess(CheckpointerProcess) #define StartWalWriter() StartChildProcess(WalWriterProcess) #define StartWalReceiver() StartChildProcess(WalReceiverProcess) +#define StartStatsCollector() StartChildProcess(StatsCollectorProcess) /* Macros to check exit status of a child process */ #define EXIT_STATUS_0(st) ((st) == 0) @@ -1760,7 +1762,7 @@ ServerLoop(void) /* If we have lost the stats collector, try to start a new one */ if (PgStatPID == 0 && (pmState == PM_RUN || pmState == PM_HOT_STANDBY)) - PgStatPID = pgstat_start(); + PgStatPID = StartStatsCollector(); /* If we have lost the archiver, try to start a new one. */ if (PgArchPID == 0 && PgArchStartupAllowed()) @@ -2878,7 +2880,7 @@ reaper(SIGNAL_ARGS) if (PgArchStartupAllowed() && PgArchPID == 0) PgArchPID = pgarch_start(); if (PgStatPID == 0) - PgStatPID = pgstat_start(); + PgStatPID = StartStatsCollector(); /* workers may be scheduled to start now */ maybe_start_bgworkers(); @@ -2951,7 +2953,7 @@ reaper(SIGNAL_ARGS) * nothing left for it to do. */ if (PgStatPID != 0) - signal_child(PgStatPID, SIGQUIT); + signal_child(PgStatPID, SIGTERM); } else { @@ -3037,10 +3039,10 @@ reaper(SIGNAL_ARGS) { PgStatPID = 0; if (!EXIT_STATUS_0(exitstatus)) - LogChildExit(LOG, _("statistics collector process"), - pid, exitstatus); + HandleChildCrash(pid, exitstatus, + _("statistics collector process")); if (pmState == PM_RUN || pmState == PM_HOT_STANDBY) - PgStatPID = pgstat_start(); + PgStatPID = StartStatsCollector(); continue; } @@ -3270,7 +3272,7 @@ CleanupBackend(int pid, /* * HandleChildCrash -- cleanup after failed backend, bgwriter, checkpointer, - * walwriter, autovacuum, or background worker. + * walwriter, autovacuum, stats collector or background worker. * * The objectives here are to clean up our local state about the child * process, and to signal all other remaining children to quickdie. 
@@ -5071,7 +5073,7 @@ sigusr1_handler(SIGNAL_ARGS) * Likewise, start other special children as needed. */ Assert(PgStatPID == 0); - PgStatPID = pgstat_start(); + PgStatPID = StartStatsCollector(); ereport(LOG, (errmsg("database system is ready to accept read only connections"))); @@ -5361,6 +5363,10 @@ StartChildProcess(AuxProcType type) ereport(LOG, (errmsg("could not fork WAL receiver process: %m"))); break; + case StatsCollectorProcess: + ereport(LOG, + (errmsg("could not fork stats collector process: %m"))); + break; default: ereport(LOG, (errmsg("could not fork process: %m"))); diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index e167ee8fcb..53b260cb1f 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -400,7 +400,7 @@ typedef enum CheckpointerProcess, WalWriterProcess, WalReceiverProcess, - + StatsCollectorProcess, NUM_AUXPROCTYPES /* Must be last! */ } AuxProcType; @@ -412,6 +412,7 @@ extern AuxProcType MyAuxProcType; #define AmCheckpointerProcess() (MyAuxProcType == CheckpointerProcess) #define AmWalWriterProcess() (MyAuxProcType == WalWriterProcess) #define AmWalReceiverProcess() (MyAuxProcType == WalReceiverProcess) +#define AmStatsCollectorProcess() (MyAuxProcType == StatsCollectorProcess) /***************************************************************************** diff --git a/src/include/pgstat.h b/src/include/pgstat.h index be2f59239b..0b9609f96e 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -710,7 +710,8 @@ typedef enum BackendType B_STARTUP, B_WAL_RECEIVER, B_WAL_SENDER, - B_WAL_WRITER + B_WAL_WRITER, + B_STATS_COLLECTOR } BackendType; @@ -1352,4 +1353,7 @@ extern int pgstat_fetch_stat_numbackends(void); extern PgStat_ArchiverStats *pgstat_fetch_stat_archiver(void); extern PgStat_GlobalStats *pgstat_fetch_global(void); +/* Main loop */ +extern void PgstatCollectorMain(void); + #endif /* PGSTAT_H */ -- 2.16.3
>From 02891365848ad56494864c8107da436e3f7909d1 Mon Sep 17 00:00:00 2001 From: Kyotaro Horiguchi <horiguchi.kyot...@lab.ntt.co.jp> Date: Fri, 29 Jun 2018 17:05:46 +0900 Subject: [PATCH 3/3] dshash-based stats collector Stats collector no longer uses files to distribute stats numbers. They are now stored in dynamic shared hash. --- src/backend/postmaster/autovacuum.c | 6 +- src/backend/postmaster/pgstat.c | 1259 ++++++++++--------------- src/backend/replication/basebackup.c | 36 - src/backend/storage/ipc/ipci.c | 2 + src/backend/storage/lmgr/lwlock.c | 3 + src/backend/storage/lmgr/lwlocknames.txt | 1 + src/backend/utils/misc/guc.c | 41 - src/backend/utils/misc/postgresql.conf.sample | 1 - src/bin/initdb/initdb.c | 1 - src/bin/pg_basebackup/t/010_pg_basebackup.pl | 2 +- src/include/pgstat.h | 51 +- src/include/storage/lwlock.h | 3 + 12 files changed, 547 insertions(+), 859 deletions(-) diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 02e6d8131e..4ece3f411d 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -2768,12 +2768,10 @@ get_pgstat_tabentry_relid(Oid relid, bool isshared, PgStat_StatDBEntry *shared, if (isshared) { if (PointerIsValid(shared)) - tabentry = hash_search(shared->tables, &relid, - HASH_FIND, NULL); + tabentry = backend_get_tab_entry(shared, relid); } else if (PointerIsValid(dbentry)) - tabentry = hash_search(dbentry->tables, &relid, - HASH_FIND, NULL); + tabentry = backend_get_tab_entry(dbentry, relid); return tabentry; } diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 6d9344fcca..626e3c6867 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -77,22 +77,10 @@ #define PGSTAT_STAT_INTERVAL 500 /* Minimum time between stats file * updates; in milliseconds. */ -#define PGSTAT_RETRY_DELAY 10 /* How long to wait between checks for a - * new file; in milliseconds. 
*/ - -#define PGSTAT_MAX_WAIT_TIME 10000 /* Maximum time to wait for a stats - * file update; in milliseconds. */ - -#define PGSTAT_INQ_INTERVAL 640 /* How often to ping the collector for a - * new file; in milliseconds. */ - #define PGSTAT_RESTART_INTERVAL 60 /* How often to attempt to restart a * failed statistics collector; in * seconds. */ -#define PGSTAT_POLL_LOOP_COUNT (PGSTAT_MAX_WAIT_TIME / PGSTAT_RETRY_DELAY) -#define PGSTAT_INQ_LOOP_COUNT (PGSTAT_INQ_INTERVAL / PGSTAT_RETRY_DELAY) - /* Minimum receive buffer size for the collector's socket. */ #define PGSTAT_MIN_RCVBUF (100 * 1024) @@ -101,7 +89,6 @@ * The initial size hints for the hash tables used in the collector. * ---------- */ -#define PGSTAT_DB_HASH_SIZE 16 #define PGSTAT_TAB_HASH_SIZE 512 #define PGSTAT_FUNCTION_HASH_SIZE 512 @@ -127,14 +114,6 @@ bool pgstat_track_counts = false; int pgstat_track_functions = TRACK_FUNC_OFF; int pgstat_track_activity_query_size = 1024; -/* ---------- - * Built from GUC parameter - * ---------- - */ -char *pgstat_stat_directory = NULL; -char *pgstat_stat_filename = NULL; -char *pgstat_stat_tmpname = NULL; - /* * BgWriter global statistics counters (unused in other processes). 
* Stored directly in a stats message structure so it can be sent @@ -154,6 +133,42 @@ static time_t last_pgstat_start_time; static bool pgStatRunningInCollector = false; +/* Shared stats bootstrap infomation */ +typedef struct StatsShmemStruct { + dsa_handle stats_dsa_handle; + dshash_table_handle db_stats_handle; + dsa_pointer global_stats; + dsa_pointer archiver_stats; +} StatsShmemStruct; + +static StatsShmemStruct * StatsShmem = NULL; +static dsa_area *area = NULL; +static dshash_table *db_stats; +static HTAB *snapshot_db_stats; + +/* dshash parameter for each type of table */ +static const dshash_parameters dsh_dbparams = { + sizeof(Oid), + sizeof(PgStat_StatDBEntry), + dshash_memcmp, + dshash_memhash, + LWTRANCHE_STATS_DB +}; +static const dshash_parameters dsh_tblparams = { + sizeof(Oid), + sizeof(PgStat_StatTabEntry), + dshash_memcmp, + dshash_memhash, + LWTRANCHE_STATS_FUNC_TABLE +}; +static const dshash_parameters dsh_funcparams = { + sizeof(Oid), + sizeof(PgStat_StatFuncEntry), + dshash_memcmp, + dshash_memhash, + LWTRANCHE_STATS_FUNC_TABLE +}; + /* * Structures in which backends store per-table info that's waiting to be * sent to the collector. @@ -250,12 +265,16 @@ static LocalPgBackendStatus *localBackendStatusTable = NULL; static int localNumBackends = 0; /* - * Cluster wide statistics, kept in the stats collector. - * Contains statistics that are not collected per database - * or per table. + * Cluster wide statistics. + * Contains statistics that are not collected per database or per table. + * shared_* are the statistics maintained by pgstats and snapshot_* are the + * snapshots taken on backends. */ -static PgStat_ArchiverStats archiverStats; -static PgStat_GlobalStats globalStats; +static PgStat_ArchiverStats *shared_archiverStats; +static PgStat_ArchiverStats *snapshot_archiverStats; +static PgStat_GlobalStats *shared_globalStats; +static PgStat_GlobalStats *snapshot_globalStats; + /* * List of OIDs of databases we need to write out. 
If an entry is InvalidOid, @@ -285,23 +304,22 @@ static instr_time total_func_time; static pid_t pgstat_forkexec(void); #endif +/* functions used in stats collector */ static void pgstat_shutdown_handler(SIGNAL_ARGS); static void pgstat_quickdie_handler(SIGNAL_ARGS); static void pgstat_beshutdown_hook(int code, Datum arg); static void pgstat_sighup_handler(SIGNAL_ARGS); static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create); -static PgStat_StatTabEntry *pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, - Oid tableoid, bool create); -static void pgstat_write_statsfiles(bool permanent, bool allDbs); -static void pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent); -static HTAB *pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep); -static void pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, bool permanent); -static void backend_read_statsfile(void); -static void pgstat_read_current_status(void); +static PgStat_StatTabEntry *pgstat_get_tab_entry(dshash_table *table, Oid tableoid, bool create); +static void pgstat_write_statsfiles(void); +static void pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry); +static void pgstat_read_statsfiles(void); +static void pgstat_read_db_statsfile(Oid databaseid, dshash_table *tabhash, dshash_table *funchash); -static bool pgstat_write_statsfile_needed(void); -static bool pgstat_db_requested(Oid databaseid); +/* functions used in backends */ +static bool backend_take_stats_snapshot(void); +static void pgstat_read_current_status(void); static void pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg); static void pgstat_send_funcstats(void); @@ -320,7 +338,6 @@ static const char *pgstat_get_wait_io(WaitEventIO w); static void pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype); static void pgstat_send(void *msg, int len); -static void pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len); static void pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len); static void 
pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len); static void pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len); @@ -685,7 +702,6 @@ pgstat_reset_remove_files(const char *directory) void pgstat_reset_all(void) { - pgstat_reset_remove_files(pgstat_stat_directory); pgstat_reset_remove_files(PGSTAT_STAT_PERMANENT_DIRECTORY); } @@ -1009,6 +1025,93 @@ pgstat_send_funcstats(void) } +/* ---------- + * pgstat_attach_shared_stats() - + * + * attach existing shared stats memory + * ---------- + */ +static bool +pgstat_attach_shared_stats(void) +{ + MemoryContext oldcontext; + + LWLockAcquire(StatsLock, LW_EXCLUSIVE); + if (StatsShmem->stats_dsa_handle == DSM_HANDLE_INVALID || area != NULL) + { + LWLockRelease(StatsLock); + return area != NULL; + } + + /* top level varialbles. lives for the lifetime of the process */ + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + area = dsa_attach(StatsShmem->stats_dsa_handle); + dsa_pin_mapping(area); + db_stats = dshash_attach(area, &dsh_dbparams, + StatsShmem->db_stats_handle, 0); + snapshot_db_stats = NULL; + shared_globalStats = (PgStat_GlobalStats *) + dsa_get_address(area, StatsShmem->global_stats); + shared_archiverStats = (PgStat_ArchiverStats *) + dsa_get_address(area, StatsShmem->archiver_stats); + MemoryContextSwitchTo(oldcontext); + LWLockRelease(StatsLock); + + return true; +} + +/* ---------- + * pgstat_create_shared_stats() - + * + * create shared stats memory + * ---------- + */ +static void +pgstat_create_shared_stats(void) +{ + MemoryContext oldcontext; + + LWLockAcquire(StatsLock, LW_EXCLUSIVE); + Assert(StatsShmem->stats_dsa_handle == DSM_HANDLE_INVALID); + + /* lives for the lifetime of the process */ + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + area = dsa_create(LWTRANCHE_STATS_DSA); + dsa_pin_mapping(area); + + db_stats = dshash_create(area, &dsh_dbparams, 0); + + /* create shared area and write bootstrap information */ + StatsShmem->stats_dsa_handle = dsa_get_handle(area); + 
StatsShmem->global_stats = + dsa_allocate0(area, sizeof(PgStat_GlobalStats)); + StatsShmem->archiver_stats = + dsa_allocate0(area, sizeof(PgStat_ArchiverStats)); + StatsShmem->db_stats_handle = + dshash_get_hash_table_handle(db_stats); + + /* connect to the memory */ + snapshot_db_stats = NULL; + shared_globalStats = (PgStat_GlobalStats *) + dsa_get_address(area, StatsShmem->global_stats); + shared_archiverStats = (PgStat_ArchiverStats *) + dsa_get_address(area, StatsShmem->archiver_stats); + MemoryContextSwitchTo(oldcontext); + LWLockRelease(StatsLock); +} + +/* ---------- + * backend_get_tab_entry() - + * + * Find database stats entry on backends. + */ +PgStat_StatTabEntry * +backend_get_tab_entry(PgStat_StatDBEntry *dbent, Oid relid) +{ + Assert(dbent->snapshot_tables); + return hash_search(dbent->snapshot_tables, &relid, HASH_FIND, NULL); +} + /* ---------- * pgstat_vacuum_stat() - * @@ -1030,11 +1133,9 @@ pgstat_vacuum_stat(void) if (pgStatSock == PGINVALID_SOCKET) return; - /* - * If not done for this transaction, read the statistics collector stats - * file into some hash tables. - */ - backend_read_statsfile(); + /* If not done for this transaction, take a snapshot of stats */ + if (!backend_take_stats_snapshot()) + return; /* * Read pg_database and make a list of OIDs of all existing databases @@ -1045,7 +1146,7 @@ pgstat_vacuum_stat(void) * Search the database hash table for dead databases and tell the * collector to drop them. */ - hash_seq_init(&hstat, pgStatDBHash); + hash_seq_init(&hstat, snapshot_db_stats); while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL) { Oid dbid = dbentry->databaseid; @@ -1064,10 +1165,10 @@ pgstat_vacuum_stat(void) /* * Lookup our own database entry; if not found, nothing more to do. 
*/ - dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash, + dbentry = (PgStat_StatDBEntry *) hash_search(snapshot_db_stats, (void *) &MyDatabaseId, HASH_FIND, NULL); - if (dbentry == NULL || dbentry->tables == NULL) + if (dbentry == NULL || dbentry->snapshot_tables == NULL) return; /* @@ -1083,7 +1184,7 @@ pgstat_vacuum_stat(void) /* * Check for all tables listed in stats hashtable if they still exist. */ - hash_seq_init(&hstat, dbentry->tables); + hash_seq_init(&hstat, dbentry->snapshot_tables); while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&hstat)) != NULL) { Oid tabid = tabentry->tableid; @@ -1134,8 +1235,8 @@ pgstat_vacuum_stat(void) * Now repeat the above steps for functions. However, we needn't bother * in the common case where no function stats are being collected. */ - if (dbentry->functions != NULL && - hash_get_num_entries(dbentry->functions) > 0) + if (dbentry->snapshot_functions != NULL && + hash_get_num_entries(dbentry->snapshot_functions) > 0) { htab = pgstat_collect_oids(ProcedureRelationId); @@ -1143,7 +1244,7 @@ pgstat_vacuum_stat(void) f_msg.m_databaseid = MyDatabaseId; f_msg.m_nentries = 0; - hash_seq_init(&hstat, dbentry->functions); + hash_seq_init(&hstat, dbentry->snapshot_functions); while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&hstat)) != NULL) { Oid funcid = funcentry->functionid; @@ -1551,24 +1652,6 @@ pgstat_ping(void) pgstat_send(&msg, sizeof(msg)); } -/* ---------- - * pgstat_send_inquiry() - - * - * Notify collector that we need fresh data. - * ---------- - */ -static void -pgstat_send_inquiry(TimestampTz clock_time, TimestampTz cutoff_time, Oid databaseid) -{ - PgStat_MsgInquiry msg; - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_INQUIRY); - msg.clock_time = clock_time; - msg.cutoff_time = cutoff_time; - msg.databaseid = databaseid; - pgstat_send(&msg, sizeof(msg)); -} - /* * Initialize function call usage data. 
@@ -2383,16 +2466,14 @@ pgstat_twophase_postabort(TransactionId xid, uint16 info, PgStat_StatDBEntry * pgstat_fetch_stat_dbentry(Oid dbid) { - /* - * If not done for this transaction, read the statistics collector stats - * file into some hash tables. - */ - backend_read_statsfile(); + /* If not done for this transaction, take a stats snapshot */ + if (!backend_take_stats_snapshot()) + return NULL; /* * Lookup the requested database; return NULL if not found */ - return (PgStat_StatDBEntry *) hash_search(pgStatDBHash, + return (PgStat_StatDBEntry *) hash_search(snapshot_db_stats, (void *) &dbid, HASH_FIND, NULL); } @@ -2415,23 +2496,22 @@ pgstat_fetch_stat_tabentry(Oid relid) PgStat_StatTabEntry *tabentry; /* - * If not done for this transaction, read the statistics collector stats - * file into some hash tables. + * If not done for this transaction, take a stats snapshot */ - backend_read_statsfile(); + if (!backend_take_stats_snapshot()) + return NULL; - /* - * Lookup our database, then look in its table hash table. - */ + /* Lookup our database, then look in its table hash table. */ dbid = MyDatabaseId; - dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash, - (void *) &dbid, + dbentry = (PgStat_StatDBEntry *) hash_search(snapshot_db_stats, + (void *)&dbid, HASH_FIND, NULL); - if (dbentry != NULL && dbentry->tables != NULL) + if (dbentry != NULL && dbentry->snapshot_tables != NULL) { - tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables, - (void *) &relid, - HASH_FIND, NULL); + tabentry = (PgStat_StatTabEntry *) + hash_search(dbentry->snapshot_tables, (void *)&relid, + HASH_FIND, NULL); + if (tabentry) return tabentry; } @@ -2440,14 +2520,15 @@ pgstat_fetch_stat_tabentry(Oid relid) * If we didn't find it, maybe it's a shared table. 
*/ dbid = InvalidOid; - dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash, + dbentry = (PgStat_StatDBEntry *) hash_search(snapshot_db_stats, (void *) &dbid, HASH_FIND, NULL); - if (dbentry != NULL && dbentry->tables != NULL) + if (dbentry != NULL && dbentry->snapshot_tables != NULL) { - tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables, - (void *) &relid, - HASH_FIND, NULL); + tabentry = (PgStat_StatTabEntry *) + hash_search(dbentry->snapshot_tables, (void *) &relid, + HASH_FIND, NULL); + if (tabentry) return tabentry; } @@ -2469,18 +2550,15 @@ pgstat_fetch_stat_funcentry(Oid func_id) PgStat_StatDBEntry *dbentry; PgStat_StatFuncEntry *funcentry = NULL; - /* load the stats file if needed */ - backend_read_statsfile(); + /* If not done for this transaction, take a stats snapshot */ + if (!backend_take_stats_snapshot()) + return NULL; - /* Lookup our database, then find the requested function. */ + /* Lookup our database, then find the requested function */ dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId); - if (dbentry != NULL && dbentry->functions != NULL) - { - funcentry = (PgStat_StatFuncEntry *) hash_search(dbentry->functions, - (void *) &func_id, - HASH_FIND, NULL); - } - + if (dbentry != NULL && dbentry->snapshot_functions != NULL) + funcentry = hash_search(dbentry->snapshot_functions, + (void *) &func_id, HASH_FIND, NULL); return funcentry; } @@ -2555,9 +2633,11 @@ pgstat_fetch_stat_numbackends(void) PgStat_ArchiverStats * pgstat_fetch_stat_archiver(void) { - backend_read_statsfile(); + /* If not done for this transaction, take a stats snapshot */ + if (!backend_take_stats_snapshot()) + return NULL; - return &archiverStats; + return snapshot_archiverStats; } @@ -2572,9 +2652,11 @@ pgstat_fetch_stat_archiver(void) PgStat_GlobalStats * pgstat_fetch_global(void) { - backend_read_statsfile(); + /* If not done for this transaction, take a stats snapshot */ + if (!backend_take_stats_snapshot()) + return NULL; - return &globalStats; + return 
snapshot_globalStats; } @@ -4277,18 +4359,14 @@ PgstatCollectorMain(void) pqsignal(SIGTTOU, SIG_DFL); pqsignal(SIGCONT, SIG_DFL); pqsignal(SIGWINCH, SIG_DFL); - PG_SETMASK(&UnBlockSig); - /* - * Identify myself via ps - */ - init_ps_display("stats collector", "", "", ""); + PG_SETMASK(&UnBlockSig); /* * Read in existing stats files or initialize the stats to zero. */ pgStatRunningInCollector = true; - pgStatDBHash = pgstat_read_statsfiles(InvalidOid, true, true); + pgstat_read_statsfiles(); /* * Loop to process messages until we get SIGQUIT or detect ungraceful @@ -4330,13 +4408,6 @@ PgstatCollectorMain(void) ProcessConfigFile(PGC_SIGHUP); } - /* - * Write the stats file(s) if a new request has arrived that is - * not satisfied by existing file(s). - */ - if (pgstat_write_statsfile_needed()) - pgstat_write_statsfiles(false, false); - /* * Try to receive and process a message. This will not block, * since the socket is set to non-blocking mode. @@ -4385,10 +4456,6 @@ PgstatCollectorMain(void) case PGSTAT_MTYPE_DUMMY: break; - case PGSTAT_MTYPE_INQUIRY: - pgstat_recv_inquiry((PgStat_MsgInquiry *) &msg, len); - break; - case PGSTAT_MTYPE_TABSTAT: pgstat_recv_tabstat((PgStat_MsgTabstat *) &msg, len); break; @@ -4477,9 +4544,7 @@ PgstatCollectorMain(void) * fixes that, so don't sleep indefinitely. This is a crock of the * first water, but until somebody wants to debug exactly what's * happening there, this is the best we can do. The two-second - * timeout matches our pre-9.2 behavior, and needs to be short enough - * to not provoke "using stale statistics" complaints from - * backend_read_statsfile. + * timeout matches our pre-9.2 behavior. */ wr = WaitLatchOrSocket(MyLatch, WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_SOCKET_READABLE | WL_TIMEOUT, @@ -4499,7 +4564,7 @@ PgstatCollectorMain(void) /* * Save the final stats to reuse at next startup. 
*/ - pgstat_write_statsfiles(true, true); + pgstat_write_statsfiles(); exit(0); } @@ -4549,14 +4614,14 @@ pgstat_shutdown_handler(SIGNAL_ARGS) } /* - * Subroutine to clear stats in a database entry + * Subroutine to reset stats in a shared database entry * * Tables and functions hashes are initialized to empty. */ static void reset_dbentry_counters(PgStat_StatDBEntry *dbentry) { - HASHCTL hash_ctl; + dshash_table *tbl; dbentry->n_xact_commit = 0; dbentry->n_xact_rollback = 0; @@ -4582,20 +4647,14 @@ reset_dbentry_counters(PgStat_StatDBEntry *dbentry) dbentry->stat_reset_timestamp = GetCurrentTimestamp(); dbentry->stats_timestamp = 0; - memset(&hash_ctl, 0, sizeof(hash_ctl)); - hash_ctl.keysize = sizeof(Oid); - hash_ctl.entrysize = sizeof(PgStat_StatTabEntry); - dbentry->tables = hash_create("Per-database table", - PGSTAT_TAB_HASH_SIZE, - &hash_ctl, - HASH_ELEM | HASH_BLOBS); - hash_ctl.keysize = sizeof(Oid); - hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry); - dbentry->functions = hash_create("Per-database function", - PGSTAT_FUNCTION_HASH_SIZE, - &hash_ctl, - HASH_ELEM | HASH_BLOBS); + tbl = dshash_create(area, &dsh_tblparams, 0); + dbentry->tables = dshash_get_hash_table_handle(tbl); + dshash_detach(tbl); + + tbl = dshash_create(area, &dsh_funcparams, 0); + dbentry->functions = dshash_get_hash_table_handle(tbl); + dshash_detach(tbl); } /* @@ -4608,15 +4667,18 @@ pgstat_get_db_entry(Oid databaseid, bool create) { PgStat_StatDBEntry *result; bool found; - HASHACTION action = (create ? 
HASH_ENTER : HASH_FIND); + + Assert(pgStatRunningInCollector); /* Lookup or create the hash table entry for this database */ - result = (PgStat_StatDBEntry *) hash_search(pgStatDBHash, - &databaseid, - action, &found); + if (create) + result = (PgStat_StatDBEntry *) + dshash_find_or_insert(db_stats, &databaseid, &found); + else + result = (PgStat_StatDBEntry *) dshash_find(db_stats, &databaseid, true); - if (!create && !found) - return NULL; + if (!create) + return result; /* * If not found, initialize the new one. This creates empty hash tables @@ -4628,23 +4690,23 @@ pgstat_get_db_entry(Oid databaseid, bool create) return result; } - /* * Lookup the hash table entry for the specified table. If no hash * table entry exists, initialize it, if the create parameter is true. * Else, return NULL. */ static PgStat_StatTabEntry * -pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create) +pgstat_get_tab_entry(dshash_table *table, Oid tableoid, bool create) { PgStat_StatTabEntry *result; bool found; - HASHACTION action = (create ? HASH_ENTER : HASH_FIND); /* Lookup or create the hash table entry for this table */ - result = (PgStat_StatTabEntry *) hash_search(dbentry->tables, - &tableoid, - action, &found); + if (create) + result = (PgStat_StatTabEntry *) + dshash_find_or_insert(table, &tableoid, &found); + else + result = (PgStat_StatTabEntry *) dshash_find(table, &tableoid, false); if (!create && !found) return NULL; @@ -4682,25 +4744,20 @@ pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create) * pgstat_write_statsfiles() - * Write the global statistics file, as well as requested DB files. * - * 'permanent' specifies writing to the permanent files not temporary ones. - * When true (happens only when the collector is shutting down), also remove - * the temporary files so that backends starting up under a new postmaster - * can't read old data before the new collector is ready. 
- * * When 'allDbs' is false, only the requested databases (listed in * pending_write_requests) will be written; otherwise, all databases * will be written. * ---------- */ static void -pgstat_write_statsfiles(bool permanent, bool allDbs) +pgstat_write_statsfiles(void) { - HASH_SEQ_STATUS hstat; + dshash_seq_status hstat; PgStat_StatDBEntry *dbentry; FILE *fpout; int32 format_id; - const char *tmpfile = permanent ? PGSTAT_STAT_PERMANENT_TMPFILE : pgstat_stat_tmpname; - const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename; + const char *tmpfile = PGSTAT_STAT_PERMANENT_TMPFILE; + const char *statfile = PGSTAT_STAT_PERMANENT_FILENAME; int rc; elog(DEBUG2, "writing stats file \"%s\"", statfile); @@ -4721,7 +4778,7 @@ pgstat_write_statsfiles(bool permanent, bool allDbs) /* * Set the timestamp of the stats file. */ - globalStats.stats_timestamp = GetCurrentTimestamp(); + shared_globalStats->stats_timestamp = GetCurrentTimestamp(); /* * Write the file header --- currently just a format ID. @@ -4733,32 +4790,29 @@ pgstat_write_statsfiles(bool permanent, bool allDbs) /* * Write global stats struct */ - rc = fwrite(&globalStats, sizeof(globalStats), 1, fpout); + rc = fwrite(shared_globalStats, sizeof(*shared_globalStats), 1, fpout); (void) rc; /* we'll check for error with ferror */ /* * Write archiver stats struct */ - rc = fwrite(&archiverStats, sizeof(archiverStats), 1, fpout); + rc = fwrite(shared_archiverStats, sizeof(*shared_archiverStats), 1, fpout); (void) rc; /* we'll check for error with ferror */ /* * Walk through the database table. */ - hash_seq_init(&hstat, pgStatDBHash); - while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL) + dshash_seq_init(&hstat, db_stats, false); + while ((dbentry = (PgStat_StatDBEntry *) dshash_seq_next(&hstat)) != NULL) { /* * Write out the table and function stats for this DB into the * appropriate per-DB stat file, if required.
*/ - if (allDbs || pgstat_db_requested(dbentry->databaseid)) - { - /* Make DB's timestamp consistent with the global stats */ - dbentry->stats_timestamp = globalStats.stats_timestamp; + /* Make DB's timestamp consistent with the global stats */ + dbentry->stats_timestamp = shared_globalStats->stats_timestamp; - pgstat_write_db_statsfile(dbentry, permanent); - } + pgstat_write_db_statsfile(dbentry); /* * Write out the DB entry. We don't write the tables or functions @@ -4802,9 +4856,6 @@ pgstat_write_statsfiles(bool permanent, bool allDbs) unlink(tmpfile); } - if (permanent) - unlink(pgstat_stat_filename); - /* * Now throw away the list of requests. Note that requests sent after we * started the write are still waiting on the network socket. @@ -4818,15 +4869,14 @@ pgstat_write_statsfiles(bool permanent, bool allDbs) * of length len. */ static void -get_dbstat_filename(bool permanent, bool tempname, Oid databaseid, +get_dbstat_filename(bool tempname, Oid databaseid, char *filename, int len) { int printed; /* NB -- pgstat_reset_remove_files knows about the pattern this uses */ printed = snprintf(filename, len, "%s/db_%u.%s", - permanent ? PGSTAT_STAT_PERMANENT_DIRECTORY : - pgstat_stat_directory, + PGSTAT_STAT_PERMANENT_DIRECTORY, databaseid, tempname ? 
"tmp" : "stat"); if (printed > len) @@ -4844,10 +4894,10 @@ get_dbstat_filename(bool permanent, bool tempname, Oid databaseid, * ---------- */ static void -pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent) +pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry) { - HASH_SEQ_STATUS tstat; - HASH_SEQ_STATUS fstat; + dshash_seq_status tstat; + dshash_seq_status fstat; PgStat_StatTabEntry *tabentry; PgStat_StatFuncEntry *funcentry; FILE *fpout; @@ -4856,9 +4906,10 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent) int rc; char tmpfile[MAXPGPATH]; char statfile[MAXPGPATH]; + dshash_table *tbl; - get_dbstat_filename(permanent, true, dbid, tmpfile, MAXPGPATH); - get_dbstat_filename(permanent, false, dbid, statfile, MAXPGPATH); + get_dbstat_filename(true, dbid, tmpfile, MAXPGPATH); + get_dbstat_filename(false, dbid, statfile, MAXPGPATH); elog(DEBUG2, "writing stats file \"%s\"", statfile); @@ -4885,24 +4936,28 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent) /* * Walk through the database's access stats per table. */ - hash_seq_init(&tstat, dbentry->tables); - while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&tstat)) != NULL) + tbl = dshash_attach(area, &dsh_tblparams, dbentry->tables, 0); + dshash_seq_init(&tstat, tbl, false); + while ((tabentry = (PgStat_StatTabEntry *) dshash_seq_next(&tstat)) != NULL) { fputc('T', fpout); rc = fwrite(tabentry, sizeof(PgStat_StatTabEntry), 1, fpout); (void) rc; /* we'll check for error with ferror */ } + dshash_detach(tbl); /* * Walk through the database's function stats table. 
*/ - hash_seq_init(&fstat, dbentry->functions); - while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&fstat)) != NULL) + tbl = dshash_attach(area, &dsh_funcparams, dbentry->functions, 0); + dshash_seq_init(&fstat, tbl, false); + while ((funcentry = (PgStat_StatFuncEntry *) dshash_seq_next(&fstat)) != NULL) { fputc('F', fpout); rc = fwrite(funcentry, sizeof(PgStat_StatFuncEntry), 1, fpout); (void) rc; /* we'll check for error with ferror */ } + dshash_detach(tbl); /* * No more output to be done. Close the temp file and replace the old @@ -4936,76 +4991,45 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent) tmpfile, statfile))); unlink(tmpfile); } - - if (permanent) - { - get_dbstat_filename(false, false, dbid, statfile, MAXPGPATH); - - elog(DEBUG2, "removing temporary stats file \"%s\"", statfile); - unlink(statfile); - } } /* ---------- * pgstat_read_statsfiles() - * - * Reads in some existing statistics collector files and returns the - * databases hash table that is the top level of the data. + * Reads in some existing statistics collector files into the shared stats + * hash. * - * If 'onlydb' is not InvalidOid, it means we only want data for that DB - * plus the shared catalogs ("DB 0"). We'll still populate the DB hash - * table for all databases, but we don't bother even creating table/function - * hash tables for other databases. - * - * 'permanent' specifies reading from the permanent files not temporary ones. - * When true (happens only when the collector is starting up), remove the - * files after reading; the in-memory status is now authoritative, and the - * files would be out of date in case somebody else reads them. - * - * If a 'deep' read is requested, table/function stats are read, otherwise - * the table/function hash tables remain empty. 
* ---------- */ -static HTAB * -pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep) +static void +pgstat_read_statsfiles(void) { PgStat_StatDBEntry *dbentry; PgStat_StatDBEntry dbbuf; - HASHCTL hash_ctl; - HTAB *dbhash; FILE *fpin; int32 format_id; bool found; - const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename; + const char *statfile = PGSTAT_STAT_PERMANENT_FILENAME; + dshash_table *tblstats = NULL; + dshash_table *funcstats = NULL; + Assert(pgStatRunningInCollector); /* * The tables will live in pgStatLocalContext. */ pgstat_setup_memcxt(); /* - * Create the DB hashtable + * Create the DB hashtable and global stas area */ - memset(&hash_ctl, 0, sizeof(hash_ctl)); - hash_ctl.keysize = sizeof(Oid); - hash_ctl.entrysize = sizeof(PgStat_StatDBEntry); - hash_ctl.hcxt = pgStatLocalContext; - dbhash = hash_create("Databases hash", PGSTAT_DB_HASH_SIZE, &hash_ctl, - HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); - - /* - * Clear out global and archiver statistics so they start from zero in - * case we can't load an existing statsfile. - */ - memset(&globalStats, 0, sizeof(globalStats)); - memset(&archiverStats, 0, sizeof(archiverStats)); + pgstat_create_shared_stats(); /* * Set the current timestamp (will be kept only in case we can't load an * existing statsfile). */ - globalStats.stat_reset_timestamp = GetCurrentTimestamp(); - archiverStats.stat_reset_timestamp = globalStats.stat_reset_timestamp; + shared_globalStats->stat_reset_timestamp = GetCurrentTimestamp(); + shared_archiverStats->stat_reset_timestamp = shared_globalStats->stat_reset_timestamp; /* * Try to open the stats file. 
If it doesn't exist, the backends simply @@ -5023,7 +5047,7 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep) (errcode_for_file_access(), errmsg("could not open statistics file \"%s\": %m", statfile))); - return dbhash; + return; } /* @@ -5040,11 +5064,11 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep) /* * Read global stats struct */ - if (fread(&globalStats, 1, sizeof(globalStats), fpin) != sizeof(globalStats)) + if (fread(shared_globalStats, 1, sizeof(*shared_globalStats), fpin) != sizeof(*shared_globalStats)) { ereport(pgStatRunningInCollector ? LOG : WARNING, (errmsg("corrupted statistics file \"%s\"", statfile))); - memset(&globalStats, 0, sizeof(globalStats)); + memset(shared_globalStats, 0, sizeof(*shared_globalStats)); goto done; } @@ -5055,17 +5079,16 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep) * file's timestamp is less than PGSTAT_STAT_INTERVAL ago, but that's not * an unusual scenario. */ - if (pgStatRunningInCollector) - globalStats.stats_timestamp = 0; + shared_globalStats->stats_timestamp = 0; /* * Read archiver stats struct */ - if (fread(&archiverStats, 1, sizeof(archiverStats), fpin) != sizeof(archiverStats)) + if (fread(shared_archiverStats, 1, sizeof(*shared_archiverStats), fpin) != sizeof(*shared_archiverStats)) { ereport(pgStatRunningInCollector ? LOG : WARNING, (errmsg("corrupted statistics file \"%s\"", statfile))); - memset(&archiverStats, 0, sizeof(archiverStats)); + memset(shared_archiverStats, 0, sizeof(*shared_archiverStats)); goto done; } @@ -5094,12 +5117,12 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep) /* * Add to the DB hash */ - dbentry = (PgStat_StatDBEntry *) hash_search(dbhash, - (void *) &dbbuf.databaseid, - HASH_ENTER, - &found); + dbentry = (PgStat_StatDBEntry *) + dshash_find_or_insert(db_stats, (void *) &dbbuf.databaseid, + &found); if (found) { + dshash_release_lock(db_stats, dbentry); ereport(pgStatRunningInCollector ?
LOG : WARNING, (errmsg("corrupted statistics file \"%s\"", statfile))); @@ -5107,8 +5130,8 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep) } memcpy(dbentry, &dbbuf, sizeof(PgStat_StatDBEntry)); - dbentry->tables = NULL; - dbentry->functions = NULL; + dbentry->tables = DSM_HANDLE_INVALID; + dbentry->functions = DSM_HANDLE_INVALID; /* * In the collector, disregard the timestamp we read from the @@ -5116,47 +5139,23 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep) * stats file immediately upon the first request from any * backend. */ - if (pgStatRunningInCollector) - dbentry->stats_timestamp = 0; - - /* - * Don't create tables/functions hashtables for uninteresting - * databases. - */ - if (onlydb != InvalidOid) - { - if (dbbuf.databaseid != onlydb && - dbbuf.databaseid != InvalidOid) - break; - } - - memset(&hash_ctl, 0, sizeof(hash_ctl)); - hash_ctl.keysize = sizeof(Oid); - hash_ctl.entrysize = sizeof(PgStat_StatTabEntry); - hash_ctl.hcxt = pgStatLocalContext; - dbentry->tables = hash_create("Per-database table", - PGSTAT_TAB_HASH_SIZE, - &hash_ctl, - HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); - - hash_ctl.keysize = sizeof(Oid); - hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry); - hash_ctl.hcxt = pgStatLocalContext; - dbentry->functions = hash_create("Per-database function", - PGSTAT_FUNCTION_HASH_SIZE, - &hash_ctl, - HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + Assert(pgStatRunningInCollector); + dbentry->stats_timestamp = 0; /* * If requested, read the data from the database-specific * file. Otherwise we just leave the hashtables empty. 
- if (deep) - pgstat_read_db_statsfile(dbentry->databaseid, - dbentry->tables, - dbentry->functions, - permanent); - + tblstats = dshash_create(area, &dsh_tblparams, 0); + dbentry->tables = dshash_get_hash_table_handle(tblstats); + funcstats = dshash_create(area, &dsh_funcparams, 0); + dbentry->functions = + dshash_get_hash_table_handle(funcstats); + dshash_release_lock(db_stats, dbentry); + pgstat_read_db_statsfile(dbentry->databaseid, + tblstats, funcstats); + dshash_detach(tblstats); + dshash_detach(funcstats); break; case 'E': @@ -5173,34 +5172,47 @@ done: FreeFile(fpin); - /* If requested to read the permanent file, also get rid of it. */ - if (permanent) - { - elog(DEBUG2, "removing permanent stats file \"%s\"", statfile); - unlink(statfile); - } + elog(DEBUG2, "removing permanent stats file \"%s\"", statfile); + unlink(statfile); - return dbhash; + return; } +Size +StatsShmemSize(void) +{ + return sizeof(StatsShmemStruct); +} + +void +StatsShmemInit(void) +{ + bool found; + + StatsShmem = (StatsShmemStruct *) + ShmemInitStruct("Stats area", StatsShmemSize(), + &found); + if (!IsUnderPostmaster) + { + Assert(!found); + + StatsShmem->stats_dsa_handle = DSM_HANDLE_INVALID; + } + else + Assert(found); +} + /* ---------- * pgstat_read_db_statsfile() - * - * Reads in the existing statistics collector file for the given database, - * filling the passed-in tables and functions hash tables. - * - * As in pgstat_read_statsfiles, if the permanent file is requested, it is - * removed after reading. - * - * Note: this code has the ability to skip storing per-table or per-function - * data, if NULL is passed for the corresponding hashtable. That's not used - * at the moment though. + * Reads in the permanent statistics collector file and creates shared + * statistics tables. The file is removed after reading.
* ---------- */ static void -pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, - bool permanent) +pgstat_read_db_statsfile(Oid databaseid, + dshash_table *tabhash, dshash_table *funchash) { PgStat_StatTabEntry *tabentry; PgStat_StatTabEntry tabbuf; @@ -5211,7 +5223,8 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, bool found; char statfile[MAXPGPATH]; - get_dbstat_filename(permanent, false, databaseid, statfile, MAXPGPATH); + Assert(pgStatRunningInCollector); + get_dbstat_filename(false, databaseid, statfile, MAXPGPATH); /* * Try to open the stats file. If it doesn't exist, the backends simply @@ -5270,12 +5283,13 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, if (tabhash == NULL) break; - tabentry = (PgStat_StatTabEntry *) hash_search(tabhash, - (void *) &tabbuf.tableid, - HASH_ENTER, &found); + tabentry = (PgStat_StatTabEntry *) + dshash_find_or_insert(tabhash, + (void *) &tabbuf.tableid, &found); if (found) { + dshash_release_lock(tabhash, tabentry); ereport(pgStatRunningInCollector ? 
LOG : WARNING, (errmsg("corrupted statistics file \"%s\"", statfile))); @@ -5283,6 +5297,7 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, } memcpy(tabentry, &tabbuf, sizeof(tabbuf)); + dshash_release_lock(tabhash, tabentry); break; /* @@ -5304,9 +5319,9 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, if (funchash == NULL) break; - funcentry = (PgStat_StatFuncEntry *) hash_search(funchash, - (void *) &funcbuf.functionid, - HASH_ENTER, &found); + funcentry = (PgStat_StatFuncEntry *) + dshash_find_or_insert(funchash, + (void *) &funcbuf.functionid, &found); if (found) { @@ -5317,6 +5332,7 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, } memcpy(funcentry, &funcbuf, sizeof(funcbuf)); + dshash_release_lock(funchash, funcentry); break; /* @@ -5336,142 +5352,50 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, done: FreeFile(fpin); - if (permanent) - { - elog(DEBUG2, "removing permanent stats file \"%s\"", statfile); - unlink(statfile); - } + elog(DEBUG2, "removing permanent stats file \"%s\"", statfile); + unlink(statfile); } /* ---------- - * pgstat_read_db_statsfile_timestamp() - + * backend_clean_snapshot_callback() - * - * Attempt to determine the timestamp of the last db statfile write. - * Returns true if successful; the timestamp is stored in *ts. - * - * This needs to be careful about handling databases for which no stats file - * exists, such as databases without a stat entry or those not yet written: - * - * - if there's a database entry in the global file, return the corresponding - * stats_timestamp value. - * - * - if there's no db stat entry (e.g. for a new or inactive database), - * there's no stats_timestamp value, but also nothing to write so we return - * the timestamp of the global statfile. + * This is usually called with arg = NULL when the memory context where the + * current snapshot has been taken. 
Don't bother to release memory for the + * case. * ---------- */ -static bool -pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent, - TimestampTz *ts) +static void +backend_clean_snapshot_callback(void *arg) { - PgStat_StatDBEntry dbentry; - PgStat_GlobalStats myGlobalStats; - PgStat_ArchiverStats myArchiverStats; - FILE *fpin; - int32 format_id; - const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename; - - /* - * Try to open the stats file. As above, anything but ENOENT is worthy of - * complaining about. - */ - if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL) + if (arg != NULL) { - if (errno != ENOENT) - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errcode_for_file_access(), - errmsg("could not open statistics file \"%s\": %m", - statfile))); - return false; - } + /* explicitly called, so explicitly free resources */ + if (snapshot_globalStats) + pfree(snapshot_globalStats); - /* - * Verify it's of the expected format. - */ - if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) || - format_id != PGSTAT_FILE_FORMAT_ID) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", statfile))); - FreeFile(fpin); - return false; - } + if (snapshot_archiverStats) + pfree(snapshot_archiverStats); - /* - * Read global stats struct - */ - if (fread(&myGlobalStats, 1, sizeof(myGlobalStats), - fpin) != sizeof(myGlobalStats)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", statfile))); - FreeFile(fpin); - return false; - } - - /* - * Read archiver stats struct - */ - if (fread(&myArchiverStats, 1, sizeof(myArchiverStats), - fpin) != sizeof(myArchiverStats)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", statfile))); - FreeFile(fpin); - return false; - } - - /* By default, we're going to return the timestamp of the global file. 
*/ - *ts = myGlobalStats.stats_timestamp; - - /* - * We found an existing collector stats file. Read it and look for a - * record for the requested database. If found, use its timestamp. - */ - for (;;) - { - switch (fgetc(fpin)) + if (snapshot_db_stats) { - /* - * 'D' A PgStat_StatDBEntry struct describing a database - * follows. - */ - case 'D': - if (fread(&dbentry, 1, offsetof(PgStat_StatDBEntry, tables), - fpin) != offsetof(PgStat_StatDBEntry, tables)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - goto done; - } + HASH_SEQ_STATUS seq; + PgStat_StatDBEntry *dbent; - /* - * If this is the DB we're looking for, save its timestamp and - * we're done. - */ - if (dbentry.databaseid == databaseid) - { - *ts = dbentry.stats_timestamp; - goto done; - } - - break; - - case 'E': - goto done; - - default: - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - goto done; + hash_seq_init(&seq, snapshot_db_stats); + while ((dbent = hash_seq_search(&seq)) != NULL) + { + if (dbent->snapshot_tables) + hash_destroy(dbent->snapshot_tables); + if (dbent->snapshot_functions) + hash_destroy(dbent->snapshot_functions); + } + hash_destroy(snapshot_db_stats); } } -done: - FreeFile(fpin); - return true; + snapshot_globalStats = NULL; + snapshot_archiverStats = NULL; + snapshot_db_stats = NULL; } /* @@ -5479,131 +5403,75 @@ done: * some hash tables. The results will be kept until pgstat_clear_snapshot() * is called (typically, at end of transaction). */ -static void -backend_read_statsfile(void) +static bool +backend_take_stats_snapshot(void) { - TimestampTz min_ts = 0; - TimestampTz ref_ts = 0; - Oid inquiry_db; - int count; + PgStat_StatDBEntry *dbent; + HASH_SEQ_STATUS seq; + MemoryContext oldcontext; + MemoryContextCallback *mcxt_cb; - /* already read it? 
*/ - if (pgStatDBHash) - return; Assert(!pgStatRunningInCollector); - /* - * In a normal backend, we check staleness of the data for our own DB, and - * so we send MyDatabaseId in inquiry messages. In the autovac launcher, - * check staleness of the shared-catalog data, and send InvalidOid in - * inquiry messages so as not to force writing unnecessary data. - */ - if (IsAutoVacuumLauncherProcess()) - inquiry_db = InvalidOid; - else - inquiry_db = MyDatabaseId; - - /* - * Loop until fresh enough stats file is available or we ran out of time. - * The stats inquiry message is sent repeatedly in case collector drops - * it; but not every single time, as that just swamps the collector. - */ - for (count = 0; count < PGSTAT_POLL_LOOP_COUNT; count++) + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + if (!pgstat_attach_shared_stats()) { - bool ok; - TimestampTz file_ts = 0; - TimestampTz cur_ts; + MemoryContextSwitchTo(oldcontext); + return false; + } + MemoryContextSwitchTo(oldcontext); - CHECK_FOR_INTERRUPTS(); + if (snapshot_globalStats) + return true; - ok = pgstat_read_db_statsfile_timestamp(inquiry_db, false, &file_ts); + Assert(snapshot_archiverStats == NULL); + Assert(snapshot_db_stats == NULL); - cur_ts = GetCurrentTimestamp(); - /* Calculate min acceptable timestamp, if we didn't already */ - if (count == 0 || cur_ts < ref_ts) - { - /* - * We set the minimum acceptable timestamp to PGSTAT_STAT_INTERVAL - * msec before now. This indirectly ensures that the collector - * needn't write the file more often than PGSTAT_STAT_INTERVAL. In - * an autovacuum worker, however, we want a lower delay to avoid - * using stale data, so we use PGSTAT_RETRY_DELAY (since the - * number of workers is low, this shouldn't be a problem). - * - * We don't recompute min_ts after sleeping, except in the - * unlikely case that cur_ts went backwards. So we might end up - * accepting a file a bit older than PGSTAT_STAT_INTERVAL. 
In - * practice that shouldn't happen, though, as long as the sleep - * time is less than PGSTAT_STAT_INTERVAL; and we don't want to - * tell the collector that our cutoff time is less than what we'd - * actually accept. - */ - ref_ts = cur_ts; - if (IsAutoVacuumWorkerProcess()) - min_ts = TimestampTzPlusMilliseconds(ref_ts, - -PGSTAT_RETRY_DELAY); - else - min_ts = TimestampTzPlusMilliseconds(ref_ts, - -PGSTAT_STAT_INTERVAL); - } + /* + * the snapshot lives within the current transaction if any, the current + * memory context liftime otherwise. + */ + if (IsTransactionState()) + MemoryContextSwitchTo(TopTransactionContext); - /* - * If the file timestamp is actually newer than cur_ts, we must have - * had a clock glitch (system time went backwards) or there is clock - * skew between our processor and the stats collector's processor. - * Accept the file, but send an inquiry message anyway to make - * pgstat_recv_inquiry do a sanity check on the collector's time. - */ - if (ok && file_ts > cur_ts) - { - /* - * A small amount of clock skew between processors isn't terribly - * surprising, but a large difference is worth logging. We - * arbitrarily define "large" as 1000 msec. 
- */ - if (file_ts >= TimestampTzPlusMilliseconds(cur_ts, 1000)) - { - char *filetime; - char *mytime; + /* global stats can be just copied */ + snapshot_globalStats = palloc(sizeof(PgStat_GlobalStats)); + memcpy(snapshot_globalStats, shared_globalStats, + sizeof(PgStat_GlobalStats)); - /* Copy because timestamptz_to_str returns a static buffer */ - filetime = pstrdup(timestamptz_to_str(file_ts)); - mytime = pstrdup(timestamptz_to_str(cur_ts)); - elog(LOG, "stats collector's time %s is later than backend local time %s", - filetime, mytime); - pfree(filetime); - pfree(mytime); - } + snapshot_archiverStats = palloc(sizeof(PgStat_ArchiverStats)); + memcpy(snapshot_archiverStats, shared_archiverStats, + sizeof(PgStat_ArchiverStats)); - pgstat_send_inquiry(cur_ts, min_ts, inquiry_db); - break; - } + /* + * take a local snapshot for every dshash. It's ok if the snapshots are + * not strictly consistent. + */ + snapshot_db_stats = dshash_take_snapshot(db_stats); + hash_seq_init(&seq, snapshot_db_stats); + while ((dbent = (PgStat_StatDBEntry *) hash_seq_search(&seq)) != NULL) + { + dshash_table *t; - /* Normal acceptance case: file is not older than cutoff time */ - if (ok && file_ts >= min_ts) - break; - - /* Not there or too old, so kick the collector and wait a bit */ - if ((count % PGSTAT_INQ_LOOP_COUNT) == 0) - pgstat_send_inquiry(cur_ts, min_ts, inquiry_db); - - pg_usleep(PGSTAT_RETRY_DELAY * 1000L); + t = dshash_attach(area, &dsh_tblparams, dbent->tables, 0); + dbent->snapshot_tables = dshash_take_snapshot(t); + dshash_detach(t); + t = dshash_attach(area, &dsh_funcparams, dbent->functions, 0); + dbent->snapshot_functions = dshash_take_snapshot(t); + dshash_detach(t); } - if (count >= PGSTAT_POLL_LOOP_COUNT) - ereport(LOG, - (errmsg("using stale statistics instead of current ones " - "because stats collector is not responding"))); + /* set the timestamp of this snapshot */ + snapshot_globalStats->stats_timestamp = GetCurrentTimestamp(); - /* - * Autovacuum 
launcher wants stats about all databases, but a shallow read - * is sufficient. Regular backends want a deep read for just the tables - * they can see (MyDatabaseId + shared catalogs). - */ - if (IsAutoVacuumLauncherProcess()) - pgStatDBHash = pgstat_read_statsfiles(InvalidOid, false, false); - else - pgStatDBHash = pgstat_read_statsfiles(MyDatabaseId, false, true); + /* register callback to clear snapshot */ + mcxt_cb = (MemoryContextCallback *)palloc(sizeof(MemoryContextCallback)); + mcxt_cb->func = backend_clean_snapshot_callback; + mcxt_cb->arg = NULL; + MemoryContextRegisterResetCallback(CurrentMemoryContext, mcxt_cb); + MemoryContextSwitchTo(oldcontext); + + return true; } @@ -5636,6 +5504,8 @@ pgstat_setup_memcxt(void) void pgstat_clear_snapshot(void) { + int param = 0; /* only the address is significant */ + /* Release memory, if any was allocated */ if (pgStatLocalContext) MemoryContextDelete(pgStatLocalContext); @@ -5645,99 +5515,12 @@ pgstat_clear_snapshot(void) pgStatDBHash = NULL; localBackendStatusTable = NULL; localNumBackends = 0; -} - - -/* ---------- - * pgstat_recv_inquiry() - - * - * Process stat inquiry requests. - * ---------- - */ -static void -pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len) -{ - PgStat_StatDBEntry *dbentry; - - elog(DEBUG2, "received inquiry for database %u", msg->databaseid); /* - * If there's already a write request for this DB, there's nothing to do. - * - * Note that if a request is found, we return early and skip the below - * check for clock skew. This is okay, since the only way for a DB - * request to be present in the list is that we have been here since the - * last write round. It seems sufficient to check for clock skew once per - * write round. + * the parameter inform the function that it is not called from + * MemoryContextCallback */ - if (list_member_oid(pending_write_requests, msg->databaseid)) - return; - - /* - * Check to see if we last wrote this database at a time >= the requested - * cutoff time. 
If so, this is a stale request that was generated before - * we updated the DB file, and we don't need to do so again. - * - * If the requestor's local clock time is older than stats_timestamp, we - * should suspect a clock glitch, ie system time going backwards; though - * the more likely explanation is just delayed message receipt. It is - * worth expending a GetCurrentTimestamp call to be sure, since a large - * retreat in the system clock reading could otherwise cause us to neglect - * to update the stats file for a long time. - */ - dbentry = pgstat_get_db_entry(msg->databaseid, false); - if (dbentry == NULL) - { - /* - * We have no data for this DB. Enter a write request anyway so that - * the global stats will get updated. This is needed to prevent - * backend_read_statsfile from waiting for data that we cannot supply, - * in the case of a new DB that nobody has yet reported any stats for. - * See the behavior of pgstat_read_db_statsfile_timestamp. - */ - } - else if (msg->clock_time < dbentry->stats_timestamp) - { - TimestampTz cur_ts = GetCurrentTimestamp(); - - if (cur_ts < dbentry->stats_timestamp) - { - /* - * Sure enough, time went backwards. Force a new stats file write - * to get back in sync; but first, log a complaint. - */ - char *writetime; - char *mytime; - - /* Copy because timestamptz_to_str returns a static buffer */ - writetime = pstrdup(timestamptz_to_str(dbentry->stats_timestamp)); - mytime = pstrdup(timestamptz_to_str(cur_ts)); - elog(LOG, - "stats_timestamp %s is later than collector's time %s for database %u", - writetime, mytime, dbentry->databaseid); - pfree(writetime); - pfree(mytime); - } - else - { - /* - * Nope, it's just an old request. Assuming msg's clock_time is - * >= its cutoff_time, it must be stale, so we can ignore it. - */ - return; - } - } - else if (msg->cutoff_time <= dbentry->stats_timestamp) - { - /* Stale request, ignore it */ - return; - } - - /* - * We need to write this DB, so create a request. 
- */ - pending_write_requests = lappend_oid(pending_write_requests, - msg->databaseid); + backend_clean_snapshot_callback(&param); } @@ -5750,6 +5533,7 @@ pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len) static void pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len) { + dshash_table *tabhash; PgStat_StatDBEntry *dbentry; PgStat_StatTabEntry *tabentry; int i; @@ -5765,6 +5549,7 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len) dbentry->n_block_read_time += msg->m_block_read_time; dbentry->n_block_write_time += msg->m_block_write_time; + tabhash = dshash_attach(area, &dsh_tblparams, dbentry->tables, 0); /* * Process all table entries in the message. */ @@ -5772,9 +5557,8 @@ { PgStat_TableEntry *tabmsg = &(msg->m_entry[i]); - tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables, - (void *) &(tabmsg->t_id), - HASH_ENTER, &found); + tabentry = (PgStat_StatTabEntry *) + dshash_find_or_insert(tabhash, (void *) &(tabmsg->t_id), &found); if (!found) { @@ -5833,6 +5617,7 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len) tabentry->n_live_tuples = Max(tabentry->n_live_tuples, 0); /* Likewise for n_dead_tuples */ tabentry->n_dead_tuples = Max(tabentry->n_dead_tuples, 0); + dshash_release_lock(tabhash, tabentry); /* * Add per-table stats to the per-database entry, too. @@ -5845,6 +5630,8 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len) dbentry->n_blocks_fetched += tabmsg->t_counts.t_blocks_fetched; dbentry->n_blocks_hit += tabmsg->t_counts.t_blocks_hit; } + + dshash_release_lock(db_stats, dbentry); } @@ -5857,27 +5644,33 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len) static void pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len) { + dshash_table *tbl; PgStat_StatDBEntry *dbentry; int i; dbentry = pgstat_get_db_entry(msg->m_databaseid, false); - /* * No need to purge if we don't even know the database. 
*/ - if (!dbentry || !dbentry->tables) + if (!dbentry || dbentry->tables == DSM_HANDLE_INVALID) + { + if (dbentry) + dshash_release_lock(db_stats, dbentry); return; + } + tbl = dshash_attach(area, &dsh_tblparams, dbentry->tables, 0); /* * Process all table entries in the message. */ for (i = 0; i < msg->m_nentries; i++) { /* Remove from hashtable if present; we don't care if it's not. */ - (void) hash_search(dbentry->tables, - (void *) &(msg->m_tableid[i]), - HASH_REMOVE, NULL); + (void) dshash_delete_key(tbl, (void *) &(msg->m_tableid[i])); } + + dshash_release_lock(db_stats, dbentry); + } @@ -5903,23 +5696,20 @@ pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len) */ if (dbentry) { - char statfile[MAXPGPATH]; + if (dbentry->tables != DSM_HANDLE_INVALID) + { + dshash_table *tbl = + dshash_attach(area, &dsh_tblparams, dbentry->tables, 0); + dshash_destroy(tbl); + } + if (dbentry->functions != DSM_HANDLE_INVALID) + { + dshash_table *tbl = + dshash_attach(area, &dsh_funcparams, dbentry->functions, 0); + dshash_destroy(tbl); + } - get_dbstat_filename(false, false, dbid, statfile, MAXPGPATH); - - elog(DEBUG2, "removing stats file \"%s\"", statfile); - unlink(statfile); - - if (dbentry->tables != NULL) - hash_destroy(dbentry->tables); - if (dbentry->functions != NULL) - hash_destroy(dbentry->functions); - - if (hash_search(pgStatDBHash, - (void *) &dbid, - HASH_REMOVE, NULL) == NULL) - ereport(ERROR, - (errmsg("database hash table corrupted during cleanup --- abort"))); + dshash_delete_entry(db_stats, (void *)dbentry); } } @@ -5947,19 +5737,28 @@ pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len) * We simply throw away all the database's table entries by recreating a * new hash table for them. 
*/ - if (dbentry->tables != NULL) - hash_destroy(dbentry->tables); - if (dbentry->functions != NULL) - hash_destroy(dbentry->functions); - - dbentry->tables = NULL; - dbentry->functions = NULL; + if (dbentry->tables != DSM_HANDLE_INVALID) + { + dshash_table *t = + dshash_attach(area, &dsh_tblparams, dbentry->tables, 0); + dshash_destroy(t); + dbentry->tables = DSM_HANDLE_INVALID; + } + if (dbentry->functions != DSM_HANDLE_INVALID) + { + dshash_table *t = + dshash_attach(area, &dsh_funcparams, dbentry->functions, 0); + dshash_destroy(t); + dbentry->functions = DSM_HANDLE_INVALID; + } /* * Reset database-level stats, too. This creates empty hash tables for * tables and functions. */ reset_dbentry_counters(dbentry); + + dshash_release_lock(db_stats, dbentry); } /* ---------- @@ -5974,14 +5773,14 @@ pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len) if (msg->m_resettarget == RESET_BGWRITER) { /* Reset the global background writer statistics for the cluster. */ - memset(&globalStats, 0, sizeof(globalStats)); - globalStats.stat_reset_timestamp = GetCurrentTimestamp(); + memset(&shared_globalStats, 0, sizeof(shared_globalStats)); + shared_globalStats->stat_reset_timestamp = GetCurrentTimestamp(); } else if (msg->m_resettarget == RESET_ARCHIVER) { /* Reset the archiver statistics for the cluster. 
*/ - memset(&archiverStats, 0, sizeof(archiverStats)); - archiverStats.stat_reset_timestamp = GetCurrentTimestamp(); + memset(&shared_archiverStats, 0, sizeof(shared_archiverStats)); + shared_archiverStats->stat_reset_timestamp = GetCurrentTimestamp(); } /* @@ -6011,11 +5810,19 @@ pgstat_recv_resetsinglecounter(PgStat_MsgResetsinglecounter *msg, int len) /* Remove object if it exists, ignore it if not */ if (msg->m_resettype == RESET_TABLE) - (void) hash_search(dbentry->tables, (void *) &(msg->m_objectid), - HASH_REMOVE, NULL); + { + dshash_table *t = + dshash_attach(area, &dsh_tblparams, dbentry->tables, 0); + dshash_delete_key(t, (void *) &(msg->m_objectid)); + } else if (msg->m_resettype == RESET_FUNCTION) - (void) hash_search(dbentry->functions, (void *) &(msg->m_objectid), - HASH_REMOVE, NULL); + { + dshash_table *t = + dshash_attach(area, &dsh_funcparams, dbentry->functions, 0); + dshash_delete_key(t, (void *) &(msg->m_objectid)); + } + + dshash_release_lock(db_stats, dbentry); } /* ---------- @@ -6035,6 +5842,8 @@ pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len) dbentry = pgstat_get_db_entry(msg->m_databaseid, true); dbentry->last_autovac_time = msg->m_start_time; + + dshash_release_lock(db_stats, dbentry); } /* ---------- @@ -6048,13 +5857,13 @@ pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len) { PgStat_StatDBEntry *dbentry; PgStat_StatTabEntry *tabentry; - + dshash_table *table; /* * Store the data in the table's hashtable entry. 
*/ dbentry = pgstat_get_db_entry(msg->m_databaseid, true); - - tabentry = pgstat_get_tab_entry(dbentry, msg->m_tableoid, true); + table = dshash_attach(area, &dsh_tblparams, dbentry->tables, 0); + tabentry = pgstat_get_tab_entry(table, msg->m_tableoid, true); tabentry->n_live_tuples = msg->m_live_tuples; tabentry->n_dead_tuples = msg->m_dead_tuples; @@ -6069,6 +5878,9 @@ pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len) tabentry->vacuum_timestamp = msg->m_vacuumtime; tabentry->vacuum_count++; } + dshash_release_lock(table, tabentry); + dshash_detach(table); + dshash_release_lock(db_stats, dbentry); } /* ---------- @@ -6082,13 +5894,15 @@ pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len) { PgStat_StatDBEntry *dbentry; PgStat_StatTabEntry *tabentry; + dshash_table *table; /* * Store the data in the table's hashtable entry. */ dbentry = pgstat_get_db_entry(msg->m_databaseid, true); - tabentry = pgstat_get_tab_entry(dbentry, msg->m_tableoid, true); + table = dshash_attach(area, &dsh_tblparams, dbentry->tables, 0); + tabentry = pgstat_get_tab_entry(table, msg->m_tableoid, true); tabentry->n_live_tuples = msg->m_live_tuples; tabentry->n_dead_tuples = msg->m_dead_tuples; @@ -6111,6 +5925,9 @@ pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len) tabentry->analyze_timestamp = msg->m_analyzetime; tabentry->analyze_count++; } + dshash_release_lock(table, tabentry); + dshash_detach(table); + dshash_release_lock(db_stats, dbentry); } @@ -6126,18 +5943,18 @@ pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len) if (msg->m_failed) { /* Failed archival attempt */ - ++archiverStats.failed_count; - memcpy(archiverStats.last_failed_wal, msg->m_xlog, - sizeof(archiverStats.last_failed_wal)); - archiverStats.last_failed_timestamp = msg->m_timestamp; + ++shared_archiverStats->failed_count; + memcpy(shared_archiverStats->last_failed_wal, msg->m_xlog, + sizeof(shared_archiverStats->last_failed_wal)); + shared_archiverStats->last_failed_timestamp = msg->m_timestamp; } else { /* 
Successful archival operation */ - ++archiverStats.archived_count; - memcpy(archiverStats.last_archived_wal, msg->m_xlog, - sizeof(archiverStats.last_archived_wal)); - archiverStats.last_archived_timestamp = msg->m_timestamp; + ++shared_archiverStats->archived_count; + memcpy(shared_archiverStats->last_archived_wal, msg->m_xlog, + sizeof(shared_archiverStats->last_archived_wal)); + shared_archiverStats->last_archived_timestamp = msg->m_timestamp; } } @@ -6150,16 +5967,16 @@ pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len) static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len) { - globalStats.timed_checkpoints += msg->m_timed_checkpoints; - globalStats.requested_checkpoints += msg->m_requested_checkpoints; - globalStats.checkpoint_write_time += msg->m_checkpoint_write_time; - globalStats.checkpoint_sync_time += msg->m_checkpoint_sync_time; - globalStats.buf_written_checkpoints += msg->m_buf_written_checkpoints; - globalStats.buf_written_clean += msg->m_buf_written_clean; - globalStats.maxwritten_clean += msg->m_maxwritten_clean; - globalStats.buf_written_backend += msg->m_buf_written_backend; - globalStats.buf_fsync_backend += msg->m_buf_fsync_backend; - globalStats.buf_alloc += msg->m_buf_alloc; + shared_globalStats->timed_checkpoints += msg->m_timed_checkpoints; + shared_globalStats->requested_checkpoints += msg->m_requested_checkpoints; + shared_globalStats->checkpoint_write_time += msg->m_checkpoint_write_time; + shared_globalStats->checkpoint_sync_time += msg->m_checkpoint_sync_time; + shared_globalStats->buf_written_checkpoints += msg->m_buf_written_checkpoints; + shared_globalStats->buf_written_clean += msg->m_buf_written_clean; + shared_globalStats->maxwritten_clean += msg->m_maxwritten_clean; + shared_globalStats->buf_written_backend += msg->m_buf_written_backend; + shared_globalStats->buf_fsync_backend += msg->m_buf_fsync_backend; + shared_globalStats->buf_alloc += msg->m_buf_alloc; } /* ---------- @@ -6200,6 +6017,8 @@ 
pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len) dbentry->n_conflict_startup_deadlock++; break; } + + dshash_release_lock(db_stats, dbentry); } /* ---------- @@ -6216,6 +6035,8 @@ pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len) dbentry = pgstat_get_db_entry(msg->m_databaseid, true); dbentry->n_deadlocks++; + + dshash_release_lock(db_stats, dbentry); } /* ---------- @@ -6233,6 +6054,8 @@ pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len) dbentry->n_temp_bytes += msg->m_filesize; dbentry->n_temp_files += 1; + + dshash_release_lock(db_stats, dbentry); } /* ---------- @@ -6244,6 +6067,7 @@ pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len) static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len) { + dshash_table *t; PgStat_FunctionEntry *funcmsg = &(msg->m_entry[0]); PgStat_StatDBEntry *dbentry; PgStat_StatFuncEntry *funcentry; @@ -6252,14 +6076,14 @@ pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len) dbentry = pgstat_get_db_entry(msg->m_databaseid, true); + t = dshash_attach(area, &dsh_funcparams, dbentry->functions, 0); /* * Process all function entries in the message. 
*/ for (i = 0; i < msg->m_nentries; i++, funcmsg++) { - funcentry = (PgStat_StatFuncEntry *) hash_search(dbentry->functions, - (void *) &(funcmsg->f_id), - HASH_ENTER, &found); + funcentry = (PgStat_StatFuncEntry *) + dshash_find_or_insert(t, (void *) &(funcmsg->f_id), &found); if (!found) { @@ -6280,7 +6104,11 @@ pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len) funcentry->f_total_time += funcmsg->f_total_time; funcentry->f_self_time += funcmsg->f_self_time; } + dshash_release_lock(t, funcentry); } + + dshash_detach(t); + dshash_release_lock(db_stats, dbentry); } /* ---------- @@ -6292,6 +6120,7 @@ pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len) static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len) { + dshash_table *t; PgStat_StatDBEntry *dbentry; int i; @@ -6300,60 +6129,20 @@ pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len) /* * No need to purge if we don't even know the database. */ - if (!dbentry || !dbentry->functions) + if (!dbentry || dbentry->functions == DSM_HANDLE_INVALID) return; + t = dshash_attach(area, &dsh_funcparams, dbentry->functions, 0); /* * Process all function entries in the message. */ for (i = 0; i < msg->m_nentries; i++) { /* Remove from hashtable if present; we don't care if it's not. */ - (void) hash_search(dbentry->functions, - (void *) &(msg->m_functionid[i]), - HASH_REMOVE, NULL); + dshash_delete_key(t, (void *) &(msg->m_functionid[i])); } -} - -/* ---------- - * pgstat_write_statsfile_needed() - - * - * Do we need to write out any stats files? - * ---------- - */ -static bool -pgstat_write_statsfile_needed(void) -{ - if (pending_write_requests != NIL) - return true; - - /* Everything was written recently */ - return false; -} - -/* ---------- - * pgstat_db_requested() - - * - * Checks whether stats for a particular DB need to be written to a file. 
- * ---------- - */ -static bool -pgstat_db_requested(Oid databaseid) -{ - /* - * If any requests are outstanding at all, we should write the stats for - * shared catalogs (the "database" with OID 0). This ensures that - * backends will see up-to-date stats for shared catalogs, even though - * they send inquiry messages mentioning only their own DB. - */ - if (databaseid == InvalidOid && pending_write_requests != NIL) - return true; - - /* Search to see if there's an open request to write this database. */ - if (list_member_oid(pending_write_requests, databaseid)) - return true; - - return false; + dshash_detach(t); + dshash_release_lock(db_stats, dbentry); } /* diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c index 3f1eae38a9..a517bf62b6 100644 --- a/src/backend/replication/basebackup.c +++ b/src/backend/replication/basebackup.c @@ -77,9 +77,6 @@ static bool is_checksummed_file(const char *fullpath, const char *filename); /* Was the backup currently in-progress initiated in recovery mode? */ static bool backup_started_in_recovery = false; -/* Relative path of temporary statistics directory */ -static char *statrelpath = NULL; - /* * Size of each block sent into the tar stream for larger files. */ @@ -121,13 +118,6 @@ static bool noverify_checksums = false; */ static const char *excludeDirContents[] = { - /* - * Skip temporary statistics files. PG_STAT_TMP_DIR must be skipped even - * when stats_temp_directory is set because PGSS_TEXT_FILE is always - * created there. - */ - PG_STAT_TMP_DIR, - /* * It is generally not useful to backup the contents of this directory * even if the intention is to restore to another master. 
See backup.sgml @@ -223,11 +213,8 @@ perform_base_backup(basebackup_options *opt) TimeLineID endtli; StringInfo labelfile; StringInfo tblspc_map_file = NULL; - int datadirpathlen; List *tablespaces = NIL; - datadirpathlen = strlen(DataDir); - backup_started_in_recovery = RecoveryInProgress(); labelfile = makeStringInfo(); @@ -254,18 +241,6 @@ perform_base_backup(basebackup_options *opt) SendXlogRecPtrResult(startptr, starttli); - /* - * Calculate the relative path of temporary statistics directory in - * order to skip the files which are located in that directory later. - */ - if (is_absolute_path(pgstat_stat_directory) && - strncmp(pgstat_stat_directory, DataDir, datadirpathlen) == 0) - statrelpath = psprintf("./%s", pgstat_stat_directory + datadirpathlen + 1); - else if (strncmp(pgstat_stat_directory, "./", 2) != 0) - statrelpath = psprintf("./%s", pgstat_stat_directory); - else - statrelpath = pgstat_stat_directory; - /* Add a node for the base directory at the end */ ti = palloc0(sizeof(tablespaceinfo)); ti->size = opt->progress ? sendDir(".", 1, true, tablespaces, true) : -1; @@ -1174,17 +1149,6 @@ sendDir(const char *path, int basepathlen, bool sizeonly, List *tablespaces, if (excludeFound) continue; - /* - * Exclude contents of directory specified by statrelpath if not set - * to the default (pg_stat_tmp) which is caught in the loop above. - */ - if (statrelpath != NULL && strcmp(pathbuf, statrelpath) == 0) - { - elog(DEBUG1, "contents of directory \"%s\" excluded from backup", statrelpath); - size += _tarWriteDir(pathbuf, basepathlen, &statbuf, sizeonly); - continue; - } - /* * We can skip pg_wal, the WAL segments need to be fetched from the * WAL archive anyway. 
But include it as an empty directory anyway, so diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 0c86a581c0..ee30e8a14f 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -150,6 +150,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port) size = add_size(size, SyncScanShmemSize()); size = add_size(size, AsyncShmemSize()); size = add_size(size, BackendRandomShmemSize()); + size = add_size(size, StatsShmemSize()); #ifdef EXEC_BACKEND size = add_size(size, ShmemBackendArraySize()); #endif @@ -270,6 +271,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port) SyncScanShmemInit(); AsyncShmemInit(); BackendRandomShmemInit(); + StatsShmemInit(); #ifdef EXEC_BACKEND diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index a6fda81feb..c46bb8d057 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -521,6 +521,9 @@ RegisterLWLockTranches(void) LWLockRegisterTranche(LWTRANCHE_TBM, "tbm"); LWLockRegisterTranche(LWTRANCHE_PARALLEL_APPEND, "parallel_append"); LWLockRegisterTranche(LWTRANCHE_PARALLEL_HASH_JOIN, "parallel_hash_join"); + LWLockRegisterTranche(LWTRANCHE_STATS_DSA, "stats table dsa"); + LWLockRegisterTranche(LWTRANCHE_STATS_DB, "db stats"); + LWLockRegisterTranche(LWTRANCHE_STATS_FUNC_TABLE, "table/func stats"); /* Register named tranches. 
*/ for (i = 0; i < NamedLWLockTrancheRequests; i++) diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index e6025ecedb..798af9f168 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -50,3 +50,4 @@ OldSnapshotTimeMapLock 42 BackendRandomLock 43 LogicalRepWorkerLock 44 CLogTruncationLock 45 +StatsLock 46 diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 859ef931e7..50043eb5fc 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -186,7 +186,6 @@ static bool check_autovacuum_max_workers(int *newval, void **extra, GucSource so static bool check_autovacuum_work_mem(int *newval, void **extra, GucSource source); static bool check_effective_io_concurrency(int *newval, void **extra, GucSource source); static void assign_effective_io_concurrency(int newval, void *extra); -static void assign_pgstat_temp_directory(const char *newval, void *extra); static bool check_application_name(char **newval, void **extra, GucSource source); static void assign_application_name(const char *newval, void *extra); static bool check_cluster_name(char **newval, void **extra, GucSource source); @@ -3755,17 +3754,6 @@ static struct config_string ConfigureNamesString[] = NULL, NULL, NULL }, - { - {"stats_temp_directory", PGC_SIGHUP, STATS_COLLECTOR, - gettext_noop("Writes temporary statistics files to the specified directory."), - NULL, - GUC_SUPERUSER_ONLY - }, - &pgstat_temp_directory, - PG_STAT_TMP_DIR, - check_canonical_path, assign_pgstat_temp_directory, NULL - }, - { {"synchronous_standby_names", PGC_SIGHUP, REPLICATION_MASTER, gettext_noop("Number of synchronous standbys and list of names of potential synchronous ones."), @@ -10691,35 +10679,6 @@ assign_effective_io_concurrency(int newval, void *extra) #endif /* USE_PREFETCH */ } -static void -assign_pgstat_temp_directory(const char *newval, void *extra) -{ - /* check_canonical_path 
already canonicalized newval for us */ - char *dname; - char *tname; - char *fname; - - /* directory */ - dname = guc_malloc(ERROR, strlen(newval) + 1); /* runtime dir */ - sprintf(dname, "%s", newval); - - /* global stats */ - tname = guc_malloc(ERROR, strlen(newval) + 12); /* /global.tmp */ - sprintf(tname, "%s/global.tmp", newval); - fname = guc_malloc(ERROR, strlen(newval) + 13); /* /global.stat */ - sprintf(fname, "%s/global.stat", newval); - - if (pgstat_stat_directory) - free(pgstat_stat_directory); - pgstat_stat_directory = dname; - if (pgstat_stat_tmpname) - free(pgstat_stat_tmpname); - pgstat_stat_tmpname = tname; - if (pgstat_stat_filename) - free(pgstat_stat_filename); - pgstat_stat_filename = fname; -} - static bool check_application_name(char **newval, void **extra, GucSource source) { diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 9e39baf466..7aa57bc489 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -509,7 +509,6 @@ #track_io_timing = off #track_functions = none # none, pl, all #track_activity_query_size = 1024 # (change requires restart) -#stats_temp_directory = 'pg_stat_tmp' # - Monitoring - diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index ae22e7d9fb..0c3b82b455 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -216,7 +216,6 @@ static const char *const subdirs[] = { "pg_replslot", "pg_tblspc", "pg_stat", - "pg_stat_tmp", "pg_xact", "pg_logical", "pg_logical/snapshots", diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl index aab2e1eecf..a646d3f972 100644 --- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl +++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl @@ -123,7 +123,7 @@ is_deeply( # Contents of these directories should not be copied. 
foreach my $dirname ( - qw(pg_dynshmem pg_notify pg_replslot pg_serial pg_snapshots pg_stat_tmp pg_subtrans) + qw(pg_dynshmem pg_notify pg_replslot pg_serial pg_snapshots pg_subtrans) ) { is_deeply( diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 0b9609f96e..6608c9be93 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -13,6 +13,7 @@ #include "datatype/timestamp.h" #include "fmgr.h" +#include "lib/dshash.h" #include "libpq/pqcomm.h" #include "port/atomics.h" #include "portability/instr_time.h" @@ -30,9 +31,6 @@ #define PGSTAT_STAT_PERMANENT_FILENAME "pg_stat/global.stat" #define PGSTAT_STAT_PERMANENT_TMPFILE "pg_stat/global.tmp" -/* Default directory to store temporary statistics data in */ -#define PG_STAT_TMP_DIR "pg_stat_tmp" - /* Values for track_functions GUC variable --- order is significant! */ typedef enum TrackFunctionsLevel { @@ -48,7 +46,6 @@ typedef enum TrackFunctionsLevel typedef enum StatMsgType { PGSTAT_MTYPE_DUMMY, - PGSTAT_MTYPE_INQUIRY, PGSTAT_MTYPE_TABSTAT, PGSTAT_MTYPE_TABPURGE, PGSTAT_MTYPE_DROPDB, @@ -216,35 +213,6 @@ typedef struct PgStat_MsgDummy PgStat_MsgHdr m_hdr; } PgStat_MsgDummy; - -/* ---------- - * PgStat_MsgInquiry Sent by a backend to ask the collector - * to write the stats file(s). - * - * Ordinarily, an inquiry message prompts writing of the global stats file, - * the stats file for shared catalogs, and the stats file for the specified - * database. If databaseid is InvalidOid, only the first two are written. - * - * New file(s) will be written only if the existing file has a timestamp - * older than the specified cutoff_time; this prevents duplicated effort - * when multiple requests arrive at nearly the same time, assuming that - * backends send requests with cutoff_times a little bit in the past. - * - * clock_time should be the requestor's current local time; the collector - * uses this to check for the system clock going backward, but it has no - * effect unless that occurs. 
We assume clock_time >= cutoff_time, though. - * ---------- - */ - -typedef struct PgStat_MsgInquiry -{ - PgStat_MsgHdr m_hdr; - TimestampTz clock_time; /* observed local clock time */ - TimestampTz cutoff_time; /* minimum acceptable file timestamp */ - Oid databaseid; /* requested DB (InvalidOid => shared only) */ -} PgStat_MsgInquiry; - - /* ---------- * PgStat_TableEntry Per-table info in a MsgTabstat * ---------- @@ -539,7 +507,6 @@ typedef union PgStat_Msg { PgStat_MsgHdr msg_hdr; PgStat_MsgDummy msg_dummy; - PgStat_MsgInquiry msg_inquiry; PgStat_MsgTabstat msg_tabstat; PgStat_MsgTabpurge msg_tabpurge; PgStat_MsgDropdb msg_dropdb; @@ -601,10 +568,13 @@ typedef struct PgStat_StatDBEntry /* * tables and functions must be last in the struct, because we don't write - * the pointers out to the stats file. + * the handles and pointers out to the stats file. */ - HTAB *tables; - HTAB *functions; + dshash_table_handle tables; + dshash_table_handle functions; + /* for snapshot tables */ + HTAB *snapshot_tables; + HTAB *snapshot_functions; } PgStat_StatDBEntry; @@ -1217,6 +1187,7 @@ extern PgStat_BackendFunctionEntry *find_funcstat_entry(Oid func_id); extern void pgstat_initstats(Relation rel); extern char *pgstat_clip_activity(const char *raw_activity); +extern PgStat_StatTabEntry *backend_get_tab_entry(PgStat_StatDBEntry *dbent, Oid relid); /* ---------- * pgstat_report_wait_start() - @@ -1344,6 +1315,9 @@ extern void pgstat_send_bgwriter(void); * generate the pgstat* views. 
* ---------- */ +extern Size StatsShmemSize(void); +extern void StatsShmemInit(void); +extern void PgstatCollectorMain(void); extern PgStat_StatDBEntry *pgstat_fetch_stat_dbentry(Oid dbid); extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry(Oid relid); extern PgBackendStatus *pgstat_fetch_stat_beentry(int beid); @@ -1353,7 +1327,4 @@ extern int pgstat_fetch_stat_numbackends(void); extern PgStat_ArchiverStats *pgstat_fetch_stat_archiver(void); extern PgStat_GlobalStats *pgstat_fetch_global(void); -/* Main loop */ -extern void PgstatCollectorMain(void); - #endif /* PGSTAT_H */ diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index c21bfe2f66..2cdd10c2fd 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -219,6 +219,9 @@ typedef enum BuiltinTrancheIds LWTRANCHE_SHARED_TUPLESTORE, LWTRANCHE_TBM, LWTRANCHE_PARALLEL_APPEND, + LWTRANCHE_STATS_DSA, + LWTRANCHE_STATS_DB, + LWTRANCHE_STATS_FUNC_TABLE, LWTRANCHE_FIRST_USER_DEFINED } BuiltinTrancheIds; -- 2.16.3