Hi,
Recently I've heard people complaining that Postgres doesn't expose any
statistics about how many full page writes happened during some time frame.
Indeed, I couldn't find any easy way to get this number, and judging from my
understanding of xloginsert.c it can actually be collected per database, as the
attached PoC patch does.
I guess it can be implemented in a more efficient and optimized way, but with
what I have right now, the first naive pgbench tests show a slowdown of about 3%.
Before I dig into it more, it would be nice to hear your opinion about this
idea - does it make sense to have something like this?
diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index 5bea073..64e450c 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -31,6 +31,7 @@
#include "storage/proc.h"
#include "utils/memutils.h"
#include "pg_trace.h"
+#include "pgstat.h"
/* Buffer size required to store a compressed version of backup block image */
#define PGLZ_MAX_BLCKSZ PGLZ_MAX_OUTPUT(BLCKSZ)
@@ -103,6 +104,15 @@ static int max_rdatas; /* allocated size */
static bool begininsert_called = false;
+#define FPW_COUNTER_HASH_SIZE 100 /* entry count requested from hash_create() */
+
+typedef struct { /* one hash entry: full page images counted for one database */
+ Oid db; /* database OID; also the hash key */
+ PgStat_Counter counter; /* images counted since the last report */
+} FpwCounterEntry;
+
+static HTAB *fpwCounterStatHash = NULL; /* keyed by database OID; created in InitXLogInsert() */
+
/* Memory context to hold the registered buffer and data references. */
static MemoryContext xloginsert_cxt;
@@ -192,7 +202,9 @@ XLogEnsureRecordSpace(int max_block_id, int ndatas)
void
XLogResetInsertion(void)
{
- int i;
+ int i;
+ HASH_SEQ_STATUS fstat;
+ FpwCounterEntry *fpwEntry;
for (i = 0; i < max_registered_block_id; i++)
registered_buffers[i].in_use = false;
@@ -203,6 +215,14 @@ XLogResetInsertion(void)
mainrdata_last = (XLogRecData *) &mainrdata_head;
curinsert_flags = 0;
begininsert_called = false;
+
+
+ hash_seq_init(&fstat, fpwCounterStatHash);
+ while ((fpwEntry = (FpwCounterEntry *) hash_seq_search(&fstat)) != NULL)
+ {
+ pgstat_report_fpw(fpwEntry->db, fpwEntry->counter);
+ fpwEntry->counter = 0;
+ }
}
/*
@@ -584,7 +604,20 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
if (include_image)
{
Page page = regbuf->page;
+ bool found = false;
uint16 compressed_len = 0;
+ FpwCounterEntry *fpwEntry;
+ Oid dbOid = regbuf->rnode.dbNode;
+
+ fpwEntry = (FpwCounterEntry *) hash_search(fpwCounterStatHash,
+ &dbOid, HASH_ENTER, &found);
+ if (!found)
+ {
+ fpwEntry->counter = 0;
+ fpwEntry->db = dbOid;
+ }
+
+ fpwEntry->counter++;
/*
* The page needs to be backed up, so calculate its hole length
@@ -1055,4 +1088,16 @@ InitXLogInsert(void)
if (hdr_scratch == NULL)
hdr_scratch = MemoryContextAllocZero(xloginsert_cxt,
HEADER_SCRATCH_SIZE);
+
+ if (fpwCounterStatHash == NULL)
+ {
+ HASHCTL hash_ctl;
+ memset(&hash_ctl, 0, sizeof(hash_ctl));
+ hash_ctl.keysize = sizeof(Oid);
+ hash_ctl.entrysize = sizeof(FpwCounterEntry);
+
+ fpwCounterStatHash = hash_create("Full page write counter hask",
+ FPW_COUNTER_HASH_SIZE, &hash_ctl,
+ HASH_ELEM | HASH_BLOBS | HASH_PREALLOC | HASH_FIXED_SIZE);
+ }
}
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 8cd8bf4..6ee5692 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -822,7 +822,8 @@ CREATE VIEW pg_stat_database AS
pg_stat_get_db_deadlocks(D.oid) AS deadlocks,
pg_stat_get_db_blk_read_time(D.oid) AS blk_read_time,
pg_stat_get_db_blk_write_time(D.oid) AS blk_write_time,
- pg_stat_get_db_stat_reset_time(D.oid) AS stats_reset
+ pg_stat_get_db_stat_reset_time(D.oid) AS stats_reset,
+ pg_stat_get_db_fpw(D.oid) AS fpw
FROM pg_database D;
CREATE VIEW pg_stat_database_conflicts AS
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 084573e..9606bb7 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -336,6 +336,7 @@ static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len);
static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len);
static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len);
+static void pgstat_recv_fpw(PgStat_MsgFpw *msg, int len);
/* ------------------------------------------------------------
* Public functions called from postmaster follow
@@ -3210,6 +3211,26 @@ pgstat_report_xact_timestamp(TimestampTz tstamp)
pgstat_increment_changecount_after(beentry);
}
+/* --------
+ * pgstat_report_fpw() -
+ *
+ *	Tell the collector how many full page writes were counted for a
+ *	database.
+ *
+ *	dboid is the database the pages belong to; fpwCounter is the number
+ *	of full page writes to add to that database's total.  Nothing is sent
+ *	when the count is zero, when there is no collector socket, or when
+ *	pgstat_track_counts is off.
+ * --------
+ */
+void
+pgstat_report_fpw(Oid dboid, PgStat_Counter fpwCounter)
+{
+	PgStat_MsgFpw msg;
+
+	/*
+	 * A zero count would not change the collector's total, so don't spend a
+	 * message on it.
+	 */
+	if (fpwCounter == 0)
+		return;
+
+	if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
+		return;
+
+	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_FPW);
+	msg.m_databaseid = dboid;
+	msg.m_fpw_counter = fpwCounter;
+	pgstat_send(&msg, sizeof(msg));
+}
+
/* ----------
* pgstat_read_current_status() -
*
@@ -4455,6 +4476,10 @@ PgstatCollectorMain(int argc, char *argv[])
pgstat_recv_tempfile((PgStat_MsgTempFile *) &msg, len);
break;
+ case PGSTAT_MTYPE_FPW:
+ pgstat_recv_fpw((PgStat_MsgFpw *) &msg, len);
+ break;
+
default:
break;
}
@@ -6197,6 +6222,23 @@ pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len)
}
/* ----------
+ * pgstat_recv_fpw() -
+ *
+ *	Add the full page write count carried by a FPW message to the
+ *	statistics entry of the database it names.
+ * ----------
+ */
+static void
+pgstat_recv_fpw(PgStat_MsgFpw *msg, int len)
+{
+	PgStat_StatDBEntry *db = pgstat_get_db_entry(msg->m_databaseid, true);
+
+	db->n_fpw += msg->m_fpw_counter;
+}
+
+
+/* ----------
* pgstat_recv_tempfile() -
*
* Process a TEMPFILE message.
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 6110e40..7f0dcad 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -1504,6 +1504,21 @@ pg_stat_get_db_blk_write_time(PG_FUNCTION_ARGS)
}
Datum
+pg_stat_get_db_fpw(PG_FUNCTION_ARGS)
+{
+	/* Argument 0 is the OID of the database whose counter is wanted. */
+	PgStat_StatDBEntry *entry = pgstat_fetch_stat_dbentry(PG_GETARG_OID(0));
+
+	/* No statistics entry for this database: report zero. */
+	if (entry == NULL)
+		PG_RETURN_INT64(0);
+
+	PG_RETURN_INT64((int64) entry->n_fpw);
+}
+
+Datum
pg_stat_get_bgwriter_timed_checkpoints(PG_FUNCTION_ARGS)
{
PG_RETURN_INT64(pgstat_fetch_global()->timed_checkpoints);
diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c
index 785e0fa..14c92f8 100644
--- a/src/backend/utils/hash/dynahash.c
+++ b/src/backend/utils/hash/dynahash.c
@@ -517,6 +517,7 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
* space if the caller correctly estimates a small table size.
*/
if ((flags & HASH_SHARED_MEM) ||
+ (flags & HASH_PREALLOC) ||
nelem < hctl->nelem_alloc)
{
int i,
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index f643f56..5c1b9da 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -5344,6 +5344,10 @@
proname => 'pg_stat_get_db_blk_write_time', provolatile => 's',
proparallel => 'r', prorettype => 'float8', proargtypes => 'oid',
prosrc => 'pg_stat_get_db_blk_write_time' },
+{ oid => '3423', descr => 'statistics: number of full page writes for database',
+ proname => 'pg_stat_get_db_fpw', provolatile => 's', proparallel => 'r',
+ prorettype => 'int8', proargtypes => 'oid',
+ prosrc => 'pg_stat_get_db_fpw' },
{ oid => '3195', descr => 'statistics: information about WAL archiver',
proname => 'pg_stat_get_archiver', proisstrict => 'f', provolatile => 's',
proparallel => 'r', prorettype => 'record', proargtypes => '',
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index be2f592..2d67dc3 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -64,7 +64,8 @@ typedef enum StatMsgType
PGSTAT_MTYPE_FUNCPURGE,
PGSTAT_MTYPE_RECOVERYCONFLICT,
PGSTAT_MTYPE_TEMPFILE,
- PGSTAT_MTYPE_DEADLOCK
+ PGSTAT_MTYPE_DEADLOCK,
+ PGSTAT_MTYPE_FPW
} StatMsgType;
/* ----------
@@ -530,6 +531,17 @@ typedef struct PgStat_MsgDeadlock
Oid m_databaseid;
} PgStat_MsgDeadlock;
+/* ----------
+ * PgStat_MsgFpw Sent by the backend to tell the collector how many
+ * full page writes it counted for one database.
+ * ----------
+ */
+typedef struct PgStat_MsgFpw
+{
+ PgStat_MsgHdr m_hdr;
+ Oid m_databaseid; /* database the count belongs to */
+ PgStat_Counter m_fpw_counter; /* added to that database's n_fpw */
+} PgStat_MsgFpw;
/* ----------
* PgStat_Msg Union over all possible messages.
@@ -595,6 +607,7 @@ typedef struct PgStat_StatDBEntry
PgStat_Counter n_deadlocks;
PgStat_Counter n_block_read_time; /* times in microseconds */
PgStat_Counter n_block_write_time;
+ PgStat_Counter n_fpw;
TimestampTz stat_reset_timestamp;
TimestampTz stats_timestamp; /* time of db stats file update */
@@ -1196,6 +1209,7 @@ extern void pgstat_report_activity(BackendState state, const char *cmd_str);
extern void pgstat_report_tempfile(size_t filesize);
extern void pgstat_report_appname(const char *appname);
extern void pgstat_report_xact_timestamp(TimestampTz tstamp);
+extern void pgstat_report_fpw(Oid dboid, PgStat_Counter fpwCounter);
extern const char *pgstat_get_wait_event(uint32 wait_event_info);
extern const char *pgstat_get_wait_event_type(uint32 wait_event_info);
extern const char *pgstat_get_backend_current_activity(int pid, bool checkUser);
diff --git a/src/include/utils/hsearch.h b/src/include/utils/hsearch.h
index 8357faa..5bf04a2 100644
--- a/src/include/utils/hsearch.h
+++ b/src/include/utils/hsearch.h
@@ -94,6 +94,7 @@ typedef struct HASHCTL
#define HASH_SHARED_MEM 0x0800 /* Hashtable is in shared memory */
#define HASH_ATTACH 0x1000 /* Do not initialize hctl */
#define HASH_FIXED_SIZE 0x2000 /* Initial size is a hard limit */
+#define HASH_PREALLOC 0x4000 /* Preallocate hashtable */
/* max_dsize value to indicate expansible directory */
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index ae0cd25..acb5bbe 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1811,7 +1811,8 @@ pg_stat_database| SELECT d.oid AS datid,
pg_stat_get_db_deadlocks(d.oid) AS deadlocks,
pg_stat_get_db_blk_read_time(d.oid) AS blk_read_time,
pg_stat_get_db_blk_write_time(d.oid) AS blk_write_time,
- pg_stat_get_db_stat_reset_time(d.oid) AS stats_reset
+ pg_stat_get_db_stat_reset_time(d.oid) AS stats_reset,
+ pg_stat_get_db_fpw(d.oid) AS fpw
FROM pg_database d;
pg_stat_database_conflicts| SELECT d.oid AS datid,
d.datname,