Hi all,

On HEAD, xlog.c has the following comment, which has been on my own
TODO list for a couple of weeks now:

     * TODO: With a bit of extra work we could just start with a pgstat file
     * associated with the checkpoint redo location we're starting from.

Please find a patch series to implement that, giving the possibility
to keep statistics after a crash rather than discard them.  I have
been looking at the code for a while, before settling down on:
- Forcing the flush of the pgstats file to happen during non-shutdown
checkpoint and restart points, after updating the control file's redo
LSN and the critical sections in the area.
- Leaving the current before_shmem_exit() callback around, as a matter
of delaying the flush of the stats for as long as possible in a
shutdown sequence.  This also makes the single-user mode shutdown
simpler.
- Adding the redo LSN to the pgstats file, with a bump of
PGSTAT_FILE_FORMAT_ID, cross-checked with the redo LSN.  This change
is independently useful on its own when loading stats after a clean
startup, as well.
- The crash recovery case is simplified, as there is no more need for
the "discard" code path.
- Not using a logic where I would need to stick a LSN into the stats
file name, implying that we would need a potential lookup at the
contents of pg_stat/ to clean up past files at crash recovery.  These
should not be costly, but I'd rather not add more of these.

7ff23c6d277d, that has removed the last call of CreateCheckPoint()
from the startup process, is older than 5891c7a8ed8f, still it seems
to me that pgstats relies on some areas of the code that don't make
sense on HEAD (see locking mentioned at the top of the write routine
for example).  The logic gets straight-forward to think about as
restart points and checkpoints always run from the checkpointer,
implying that pgstat_write_statsfile() is already called only from the
postmaster in single-user mode or the checkpointer itself, at
shutdown.

Attached is a patch set, with the one being the actual feature, with
some stuff prior to that:
- 0001 adds the redo LSN to the pgstats file flushed.
- 0002 adds an assertion in pgstat_write_statsfile(), to check from
where it is called.
- 0003 with more debugging.
- 0004 is the meat of the thread.

I am adding that to the next CF.  Thoughts and comments are welcome.
Thanks,
--
Michael
From a39ffddc1c525fa4e39fd08657c6dae95a20043f Mon Sep 17 00:00:00 2001
From: Michael Paquier <mich...@paquier.xyz>
Date: Tue, 18 Jun 2024 11:00:36 +0900
Subject: [PATCH 1/4] Add redo LSN to pgstats file

This is used in the startup process to check that the file we are
loading is the one referring to the latest checkpoint.

Bump PGSTAT_FILE_FORMAT_ID.
---
 src/include/pgstat.h                |  5 +++--
 src/backend/access/transam/xlog.c   |  2 +-
 src/backend/utils/activity/pgstat.c | 26 +++++++++++++++++++-------
 3 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 2136239710..dbc3430b18 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -11,6 +11,7 @@
 #ifndef PGSTAT_H
 #define PGSTAT_H
 
+#include "access/xlogdefs.h"
 #include "datatype/timestamp.h"
 #include "portability/instr_time.h"
 #include "postmaster/pgarch.h"	/* for MAX_XFN_CHARS */
@@ -235,7 +236,7 @@ typedef struct PgStat_TableXactStatus
  * ------------------------------------------------------------
  */
 
-#define PGSTAT_FILE_FORMAT_ID	0x01A5BCAC
+#define PGSTAT_FILE_FORMAT_ID	0x01A5BCAD
 
 typedef struct PgStat_ArchiverStats
 {
@@ -466,7 +467,7 @@ extern Size StatsShmemSize(void);
 extern void StatsShmemInit(void);
 
 /* Functions called during server startup / shutdown */
-extern void pgstat_restore_stats(void);
+extern void pgstat_restore_stats(XLogRecPtr redo);
 extern void pgstat_discard_stats(void);
 extern void pgstat_before_server_shutdown(int code, Datum arg);
 
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 330e058c5f..96b458da42 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -5641,7 +5641,7 @@ StartupXLOG(void)
 	if (didCrash)
 		pgstat_discard_stats();
 	else
-		pgstat_restore_stats();
+		pgstat_restore_stats(checkPoint.redo);
 
 	lastFullPageWrites = checkPoint.fullPageWrites;
 
diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c
index dcc2ad8d95..824f742bde 100644
--- a/src/backend/utils/activity/pgstat.c
+++ b/src/backend/utils/activity/pgstat.c
@@ -94,6 +94,7 @@
 #include <unistd.h>
 
 #include "access/xact.h"
+#include "access/xlog.h"
 #include "lib/dshash.h"
 #include "pgstat.h"
 #include "port/atomics.h"
@@ -162,8 +163,8 @@ typedef struct PgStat_SnapshotEntry
  * ----------
  */
 
-static void pgstat_write_statsfile(void);
-static void pgstat_read_statsfile(void);
+static void pgstat_write_statsfile(XLogRecPtr redo);
+static void pgstat_read_statsfile(XLogRecPtr redo);
 
 static void pgstat_reset_after_failure(void);
 
@@ -404,9 +405,9 @@ static const PgStat_KindInfo pgstat_kind_infos[PGSTAT_NUM_KINDS] = {
  * Should only be called by the startup process or in single user mode.
  */
 void
-pgstat_restore_stats(void)
+pgstat_restore_stats(XLogRecPtr redo)
 {
-	pgstat_read_statsfile();
+	pgstat_read_statsfile(redo);
 }
 
 /*
@@ -482,7 +483,7 @@ pgstat_before_server_shutdown(int code, Datum arg)
 	if (code == 0)
 	{
 		pgStatLocal.shmem->is_shutdown = true;
-		pgstat_write_statsfile();
+		pgstat_write_statsfile(GetRedoRecPtr());
 	}
 }
 
@@ -1305,7 +1306,7 @@ write_chunk(FILE *fpout, void *ptr, size_t len)
  * stats so locking is not required.
  */
 static void
-pgstat_write_statsfile(void)
+pgstat_write_statsfile(XLogRecPtr redo)
 {
 	FILE	   *fpout;
 	int32		format_id;
@@ -1340,6 +1341,9 @@ pgstat_write_statsfile(void)
 	format_id = PGSTAT_FILE_FORMAT_ID;
 	write_chunk_s(fpout, &format_id);
 
+	/* Write the redo LSN, used to cross check the file loaded */
+	write_chunk_s(fpout, &redo);
+
 	/*
 	 * XXX: The following could now be generalized to just iterate over
 	 * pgstat_kind_infos instead of knowing about the different kinds of
@@ -1480,13 +1484,14 @@ read_chunk(FILE *fpin, void *ptr, size_t len)
  * stats so locking is not required.
  */
 static void
-pgstat_read_statsfile(void)
+pgstat_read_statsfile(XLogRecPtr redo)
 {
 	FILE	   *fpin;
 	int32		format_id;
 	bool		found;
 	const char *statfile = PGSTAT_STAT_PERMANENT_FILENAME;
 	PgStat_ShmemControl *shmem = pgStatLocal.shmem;
+	XLogRecPtr	file_redo;
 
 	/* shouldn't be called from postmaster */
 	Assert(IsUnderPostmaster || !IsPostmasterEnvironment);
@@ -1520,6 +1525,13 @@ pgstat_read_statsfile(void)
 		format_id != PGSTAT_FILE_FORMAT_ID)
 		goto error;
 
+	/*
+	 * Read the redo LSN stored in the file.
+	 */
+	if (!read_chunk_s(fpin, &file_redo) ||
+		file_redo != redo)
+		goto error;
+
 	/*
 	 * XXX: The following could now be generalized to just iterate over
 	 * pgstat_kind_infos instead of knowing about the different kinds of
-- 
2.45.1

From 9b252f56d1cad3e12001829bbb61c51cc742e9e0 Mon Sep 17 00:00:00 2001
From: Michael Paquier <mich...@paquier.xyz>
Date: Tue, 18 Jun 2024 10:59:31 +0900
Subject: [PATCH 2/4] Add assertion in pgstat_write_statsfile()

This routine can currently only be called from the postmaster in
single-user mode or the checkpointer, so make sure that this is always
the case.
---
 src/backend/utils/activity/pgstat.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c
index 824f742bde..9cdd986582 100644
--- a/src/backend/utils/activity/pgstat.c
+++ b/src/backend/utils/activity/pgstat.c
@@ -1317,6 +1317,9 @@ pgstat_write_statsfile(XLogRecPtr redo)
 
 	pgstat_assert_is_up();
 
+	/* should be called only by the checkpointer or single user mode */
+	Assert(!IsUnderPostmaster || MyBackendType == B_CHECKPOINTER);
+
 	/* we're shutting down, so it's ok to just override this */
 	pgstat_fetch_consistency = PGSTAT_FETCH_CONSISTENCY_NONE;
 
-- 
2.45.1

From ed7fe49cf147b09def2b78554a989d592e02fe07 Mon Sep 17 00:00:00 2001
From: Michael Paquier <mich...@paquier.xyz>
Date: Tue, 18 Jun 2024 11:10:19 +0900
Subject: [PATCH 3/4] Add some DEBUG2 information about the redo LSN of the
 stats file

This is useful for.. Debugging.  How surprising.
---
 src/backend/utils/activity/pgstat.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c
index 9cdd986582..855dec9e52 100644
--- a/src/backend/utils/activity/pgstat.c
+++ b/src/backend/utils/activity/pgstat.c
@@ -1323,7 +1323,8 @@ pgstat_write_statsfile(XLogRecPtr redo)
 	/* we're shutting down, so it's ok to just override this */
 	pgstat_fetch_consistency = PGSTAT_FETCH_CONSISTENCY_NONE;
 
-	elog(DEBUG2, "writing stats file \"%s\"", statfile);
+	elog(DEBUG2, "writing stats file \"%s\" with redo %X/%X", statfile,
+		 LSN_FORMAT_ARGS(redo));
 
 	/*
 	 * Open the statistics temp file to write out the current values.
@@ -1499,7 +1500,8 @@ pgstat_read_statsfile(XLogRecPtr redo)
 	/* shouldn't be called from postmaster */
 	Assert(IsUnderPostmaster || !IsPostmasterEnvironment);
 
-	elog(DEBUG2, "reading stats file \"%s\"", statfile);
+	elog(DEBUG2, "reading stats file \"%s\" with redo %X/%X", statfile,
+		LSN_FORMAT_ARGS(redo));
 
 	/*
 	 * Try to open the stats file. If it doesn't exist, the backends simply
-- 
2.45.1

From d206073d3cc6a0a5ed9228cdd3089c24d88f51f6 Mon Sep 17 00:00:00 2001
From: Michael Paquier <mich...@paquier.xyz>
Date: Tue, 18 Jun 2024 14:49:33 +0900
Subject: [PATCH 4/4] Flush pgstats file during checkpoints

The flushes are done for non-shutdown checkpoints and restart points, so
as it is possible to keep some statistics around in the event of a
crash.  Keeping the before_shmem_exit() callback to flush the pgstats
file in a shutdown sequence is a bit easier than doing so at checkpoint,
as they may be stats to flush before we are completely done with the
shutdown checkpoint.  Keeping the callback also keeps the shutdown of
single-user mode simpler, where the stats also need to be flushed.

At the beginning of recovery, the stats file is loaded when the redo LSN
it stores matches with the point we are replaying from.
---
 src/include/pgstat.h                     |  4 +-
 src/backend/access/transam/xlog.c        | 27 ++++++++---
 src/backend/utils/activity/pgstat.c      | 62 ++++++------------------
 src/test/recovery/t/029_stats_restart.pl | 22 +++++++--
 4 files changed, 54 insertions(+), 61 deletions(-)

diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index dbc3430b18..9860bb6a99 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -466,9 +466,9 @@ typedef struct PgStat_PendingWalStats
 extern Size StatsShmemSize(void);
 extern void StatsShmemInit(void);
 
-/* Functions called during server startup / shutdown */
+/* Functions called during server startup / checkpoint / shutdown */
 extern void pgstat_restore_stats(XLogRecPtr redo);
-extern void pgstat_discard_stats(void);
+extern void pgstat_flush_stats(XLogRecPtr redo);
 extern void pgstat_before_server_shutdown(int code, Datum arg);
 
 /* Functions for backend initialization */
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 96b458da42..7ce304c227 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -5390,7 +5390,6 @@ StartupXLOG(void)
 	XLogCtlInsert *Insert;
 	CheckPoint	checkPoint;
 	bool		wasShutdown;
-	bool		didCrash;
 	bool		haveTblspcMap;
 	bool		haveBackupLabel;
 	XLogRecPtr	EndOfLog;
@@ -5510,10 +5509,7 @@ StartupXLOG(void)
 	{
 		RemoveTempXlogFiles();
 		SyncDataDirectory();
-		didCrash = true;
 	}
-	else
-		didCrash = false;
 
 	/*
 	 * Prepare for WAL recovery if needed.
@@ -5638,10 +5634,7 @@ StartupXLOG(void)
 	 * TODO: With a bit of extra work we could just start with a pgstat file
 	 * associated with the checkpoint redo location we're starting from.
 	 */
-	if (didCrash)
-		pgstat_discard_stats();
-	else
-		pgstat_restore_stats(checkPoint.redo);
+	pgstat_restore_stats(checkPoint.redo);
 
 	lastFullPageWrites = checkPoint.fullPageWrites;
 
@@ -7203,6 +7196,15 @@ CreateCheckPoint(int flags)
 	 */
 	END_CRIT_SECTION();
 
+	/*
+	 * The control file update is done, hence it is time to write some fresh
+	 * statistics.  Stats are flushed at shutdown by the checkpointer in a
+	 * dedicated before_shmem_exit callback, combined with sanity checks
+	 * related to the shutdown of pgstats.
+	 */
+	if (!shutdown)
+		pgstat_flush_stats(RedoRecPtr);
+
 	/*
 	 * WAL summaries end when the next XLOG_CHECKPOINT_REDO or
 	 * XLOG_CHECKPOINT_SHUTDOWN record is reached. This is the first point
@@ -7671,6 +7673,15 @@ CreateRestartPoint(int flags)
 	}
 	LWLockRelease(ControlFileLock);
 
+	/*
+	 * The control file update is done, hence it is time to write some fresh
+	 * statistics.  Stats are flushed at shutdown by the checkpointer in a
+	 * dedicated before_shmem_exit callback, combined with sanity checks
+	 * related to the shutdown of pgstats.
+	 */
+	if ((flags & CHECKPOINT_IS_SHUTDOWN) == 0)
+		pgstat_flush_stats(RedoRecPtr);
+
 	/*
 	 * Update the average distance between checkpoints/restartpoints if the
 	 * prior checkpoint exists.
diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c
index 855dec9e52..a0d891aefc 100644
--- a/src/backend/utils/activity/pgstat.c
+++ b/src/backend/utils/activity/pgstat.c
@@ -10,9 +10,10 @@
  * statistics.
  *
  * Statistics are loaded from the filesystem during startup (by the startup
- * process), unless preceded by a crash, in which case all stats are
- * discarded. They are written out by the checkpointer process just before
- * shutting down, except when shutting down in immediate mode.
+ * process) if the stats file has a redo LSN that matches with the . They
+ * are written out by the checkpointer during checkpoints and restart points,
+ * as well as before shutting down, except when shutting down in immediate
+ * mode.
  *
  * Fixed-numbered stats are stored in plain (non-dynamic) shared memory.
  *
@@ -411,53 +412,19 @@ pgstat_restore_stats(XLogRecPtr redo)
 }
 
 /*
- * Remove the stats file.  This is currently used only if WAL recovery is
- * needed after a crash.
+ * Write stats in memory to disk.
  *
- * Should only be called by the startup process or in single user mode.
+ * This is called by the checkpointer or in single-user mode.
  */
 void
-pgstat_discard_stats(void)
+pgstat_flush_stats(XLogRecPtr redo)
 {
-	int			ret;
-
-	/* NB: this needs to be done even in single user mode */
-
-	ret = unlink(PGSTAT_STAT_PERMANENT_FILENAME);
-	if (ret != 0)
-	{
-		if (errno == ENOENT)
-			elog(DEBUG2,
-				 "didn't need to unlink permanent stats file \"%s\" - didn't exist",
-				 PGSTAT_STAT_PERMANENT_FILENAME);
-		else
-			ereport(LOG,
-					(errcode_for_file_access(),
-					 errmsg("could not unlink permanent statistics file \"%s\": %m",
-							PGSTAT_STAT_PERMANENT_FILENAME)));
-	}
-	else
-	{
-		ereport(DEBUG2,
-				(errcode_for_file_access(),
-				 errmsg_internal("unlinked permanent statistics file \"%s\"",
-								 PGSTAT_STAT_PERMANENT_FILENAME)));
-	}
-
-	/*
-	 * Reset stats contents. This will set reset timestamps of fixed-numbered
-	 * stats to the current time (no variable stats exist).
-	 */
-	pgstat_reset_after_failure();
+	pgstat_write_statsfile(redo);
 }
 
 /*
  * pgstat_before_server_shutdown() needs to be called by exactly one process
- * during regular server shutdowns. Otherwise all stats will be lost.
- *
- * We currently only write out stats for proc_exit(0). We might want to change
- * that at some point... But right now pgstat_discard_stats() would be called
- * during the start after a disorderly shutdown, anyway.
+ * during regular server shutdowns.
  */
 void
 pgstat_before_server_shutdown(int code, Datum arg)
@@ -482,6 +449,9 @@ pgstat_before_server_shutdown(int code, Datum arg)
 	 */
 	if (code == 0)
 	{
+		/* we're shutting down, so it's ok to just override this */
+		pgstat_fetch_consistency = PGSTAT_FETCH_CONSISTENCY_NONE;
+
 		pgStatLocal.shmem->is_shutdown = true;
 		pgstat_write_statsfile(GetRedoRecPtr());
 	}
@@ -1302,8 +1272,8 @@ write_chunk(FILE *fpout, void *ptr, size_t len)
 #define write_chunk_s(fpout, ptr) write_chunk(fpout, ptr, sizeof(*ptr))
 
 /*
- * This function is called in the last process that is accessing the shared
- * stats so locking is not required.
+ * This function is called in the checkpointer or in single-user mode,
+ * so locking is not required.
  */
 static void
 pgstat_write_statsfile(XLogRecPtr redo)
@@ -1320,9 +1290,6 @@ pgstat_write_statsfile(XLogRecPtr redo)
 	/* should be called only by the checkpointer or single user mode */
 	Assert(!IsUnderPostmaster || MyBackendType == B_CHECKPOINTER);
 
-	/* we're shutting down, so it's ok to just override this */
-	pgstat_fetch_consistency = PGSTAT_FETCH_CONSISTENCY_NONE;
-
 	elog(DEBUG2, "writing stats file \"%s\" with redo %X/%X", statfile,
 		 LSN_FORMAT_ARGS(redo));
 
@@ -1402,7 +1369,6 @@ pgstat_write_statsfile(XLogRecPtr redo)
 		CHECK_FOR_INTERRUPTS();
 
 		/* we may have some "dropped" entries not yet removed, skip them */
-		Assert(!ps->dropped);
 		if (ps->dropped)
 			continue;
 
diff --git a/src/test/recovery/t/029_stats_restart.pl b/src/test/recovery/t/029_stats_restart.pl
index 93a7209f69..9956a6ab82 100644
--- a/src/test/recovery/t/029_stats_restart.pl
+++ b/src/test/recovery/t/029_stats_restart.pl
@@ -59,7 +59,7 @@ ok(-f "$og_stats", "origin stats file must exist");
 copy($og_stats, $statsfile) or die "Copy failed: $!";
 
 
-## test discarding of stats file after crash etc
+## test retention of stats file after crash etc
 
 $node->start;
 
@@ -69,12 +69,28 @@ is(have_stats('function', $dboid, $funcoid),
 	't', "$sect: function stats do exist");
 is(have_stats('relation', $dboid, $tableoid),
 	't', "$sect: relation stats do exist");
+# Flush the stats to disk.
+$node->psql('postgres', 'checkpoint');
 
 $node->stop('immediate');
 
-ok(!-f "$og_stats", "no stats file should exist after immediate shutdown");
+ok(-f "$og_stats", "stats file should exist after immediate shutdown");
 
-# copy the old stats back to test we discard stats after crash restart
+# Start once, there should be stats restored from the previous checkpoint.
+$node->start;
+
+$sect = "crashrecovery";
+is(have_stats('database', $dboid, 0), 't', "$sect: db stats do exist");
+is(have_stats('function', $dboid, $funcoid),
+	't', "$sect: function stats do exist");
+is(have_stats('relation', $dboid, $tableoid),
+	't', "$sect: relation stats do exist");
+
+$node->stop('immediate');
+
+# Copy the old stats back, and test that we discard stats after crash restart
+# because these don't match with the redo LSN stored in the stats file
+# generated by the previous checkpoint.
 copy($statsfile, $og_stats) or die "Copy failed: $!";
 
 $node->start;
-- 
2.45.1

Attachment: signature.asc
Description: PGP signature

Reply via email to