On Mon, Mar 14, 2022 at 1:33 PM Michael Paquier <mich...@paquier.xyz> wrote:
>
> On Mon, Mar 14, 2022 at 10:54:56AM +0530, Bharath Rupireddy wrote:
> > Yes, this is a concern. Also, when there were no logical replication
> > slots on a plain server or the server removed or cleaned up all the
> > snapshot/mappings files, why would anyone want to have these messages
> > with all 0s in the server log?
>
> The default settings don't enable that, so making things conditional
> roughly as you are suggesting with two different LOG messages sounds
> rather fine.

Attaching v3 patch which emits logs only when necessary and doesn't
clutter the existing "checkpoint/restartpoint completed" message, see
some sample logs at [1]. Please review it further.

Just another thought - after looking at sample logs at [1], once can
think why can't the checkpoint logs be like [2] or [3]?

[1]
2022-03-14 11:20:28.381 UTC [3257102] LOG:  checkpoint starting:
immediate force wait
2022-03-14 11:20:28.444 UTC [3257102] LOG:  checkpoint complete: wrote
0 buffers (0.0%); 0 WAL file(s) added, 0 removed, 0 recycled;
write=0.001 s, sync=0.001 s, total=0.064 s; sync files=0,
longest=0.000 s, average=0.000 s; distance=0 kB, estimate=35717 kB
2022-03-14 11:20:28.444 UTC [3257102] LOG:  snapbuild snapshot file(s)
removed=2, size=280 bytes, time=0.001 s, cutoff LSN=0/40D2F40
2022-03-14 11:20:28.444 UTC [3257102] LOG:  logical rewrite mapping
file(s) removed=16, size=1368 bytes, synced=16, size=1368 bytes,
time=0.047 s, cutoff LSN=0/40D2F40

[2]
2022-03-14 11:20:28.381 UTC [3257102] LOG:  checkpoint starting:
immediate force wait
2022-03-14 11:20:28.444 UTC [3257102] LOG:  wrote 0 buffers (0.0%); 0
WAL file(s) added, 0 removed, 0 recycled; write=0.001 s, sync=0.001 s,
total=0.064 s; sync files=0, longest=0.000 s, average=0.000 s;
distance=0 kB, estimate=35717 kB
2022-03-14 11:20:28.444 UTC [3257102] LOG:  snapbuild snapshot file(s)
removed=2, size=280 bytes, time=0.001 s, cutoff LSN=0/40D2F40
2022-03-14 11:20:28.444 UTC [3257102] LOG:  logical rewrite mapping
file(s) removed=16, size=1368 bytes, synced=16, size=1368 bytes,
time=0.047 s, cutoff LSN=0/40D2F40
2022-03-14 11:20:28.444 UTC [3257102] LOG:  checkpoint complete:
immediate force wait

[3]
2022-03-14 11:20:28.381 UTC [3257102] LOG:  checkpoint starting:
immediate force wait
2022-03-14 11:20:28.444 UTC [3257102] LOG:  wrote 0 buffers (0.0%); 0
WAL file(s) added, 0 removed, 0 recycled; write=0.001 s, sync=0.001 s,
total=0.064 s; sync files=0, longest=0.000 s, average=0.000 s;
distance=0 kB, estimate=35717 kB
2022-03-14 11:20:28.444 UTC [3257102] LOG:  snapbuild snapshot file(s)
removed=2, size=280 bytes, time=0.001 s, cutoff LSN=0/40D2F40
2022-03-14 11:20:28.444 UTC [3257102] LOG:  logical rewrite mapping
file(s) removed=16, size=1368 bytes, synced=16, size=1368 bytes,
time=0.047 s, cutoff LSN=0/40D2F40
2022-03-14 11:20:28.444 UTC [3257102] LOG:  checkpoint complete

Regards,
Bharath Rupireddy.
From 08b47249abfeddf7f06191d4bcd223f0c5597016 Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Mon, 14 Mar 2022 11:22:15 +0000
Subject: [PATCH v3] Add checkpoint stats of snapshot and mapping files of
 pg_logical dir

At times, there can be many snapshot and mapping files under
pg_logical dir that the checkpoint might have to delete/fsync
based on the cutoff LSN which can increase the checkpoint time.
Add stats related to these files to better understand the delays
or time spent by the checkpointer processing them.

Add these new log message only when necessary i.e. at least one
snapshot or mapping files is processed during the checkpoint, that
to as a separate log messages to not clutter existing "checkpoint/
restartpoint complete" log message.
---
 src/backend/access/heap/rewriteheap.c       |  8 ++++
 src/backend/access/transam/xlog.c           | 44 +++++++++++++++++++++
 src/backend/replication/logical/snapbuild.c |  5 +++
 src/include/access/xlog.h                   | 16 ++++++++
 4 files changed, 73 insertions(+)

diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 2a53826736..9f3c2f510b 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -1210,6 +1210,8 @@ CheckPointLogicalRewriteHeap(void)
 	if (cutoff != InvalidXLogRecPtr && redo < cutoff)
 		cutoff = redo;
 
+	CheckpointStats.repl_map_cutoff_lsn = cutoff;
+
 	mappings_dir = AllocateDir("pg_logical/mappings");
 	while ((mapping_de = ReadDir(mappings_dir, "pg_logical/mappings")) != NULL)
 	{
@@ -1247,6 +1249,9 @@ CheckPointLogicalRewriteHeap(void)
 				ereport(ERROR,
 						(errcode_for_file_access(),
 						 errmsg("could not remove file \"%s\": %m", path)));
+
+			CheckpointStats.repl_map_files_rmvd_cnt++;
+			CheckpointStats.repl_map_files_rmvd_sz += statbuf.st_size;
 		}
 		else
 		{
@@ -1280,6 +1285,9 @@ CheckPointLogicalRewriteHeap(void)
 						(errcode_for_file_access(),
 						 errmsg("could not close file \"%s\": %m", path)));
 		}
+
+		CheckpointStats.repl_map_files_syncd_cnt++;
+		CheckpointStats.repl_map_files_syncd_sz += statbuf.st_size;
 	}
 	FreeDir(mappings_dir);
 
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 0d2bd7a357..fe92c3f53c 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -6154,6 +6154,43 @@ LogCheckpointEnd(bool restartpoint)
 						average_msecs / 1000, (int) (average_msecs % 1000),
 						(int) (PrevCheckPointDistance / 1024.0),
 						(int) (CheckPointDistanceEstimate / 1024.0))));
+
+	/*
+	 * Let's emit snapshot and mapping files stats only when necessary, that to
+	 * as separate log messages, to not clutter server logs.
+	 */
+	if (CheckpointStats.repl_snap_files_rmvd_cnt > 0)
+	{
+		long t_msecs;
+
+		t_msecs = TimestampDifferenceMilliseconds(CheckpointStats.repl_snap_start_t,
+												  CheckpointStats.repl_snap_end_t);
+
+		ereport(LOG,
+				(errmsg("snapbuild snapshot file(s) removed=" UINT64_FORMAT ", size=%zu bytes, time=%ld.%03d s, cutoff LSN=%X/%X",
+						CheckpointStats.repl_snap_files_rmvd_cnt,
+						CheckpointStats.repl_snap_files_rmvd_sz,
+						t_msecs / 1000, (int) (t_msecs % 1000),
+						LSN_FORMAT_ARGS(CheckpointStats.repl_snap_cutoff_lsn))));
+	}
+
+	if (CheckpointStats.repl_map_files_rmvd_cnt > 0 ||
+		CheckpointStats.repl_map_files_syncd_cnt > 0)
+	{
+		long t_msecs;
+
+		t_msecs = TimestampDifferenceMilliseconds(CheckpointStats.repl_map_start_t,
+												  CheckpointStats.repl_map_end_t);
+
+		ereport(LOG,
+				(errmsg("logical rewrite mapping file(s) removed=" UINT64_FORMAT ", size=%zu bytes, synced=" UINT64_FORMAT ", size=%zu bytes, time=%ld.%03d s, cutoff LSN=%X/%X",
+						CheckpointStats.repl_map_files_rmvd_cnt,
+						CheckpointStats.repl_map_files_rmvd_sz,
+						CheckpointStats.repl_map_files_syncd_cnt,
+						CheckpointStats.repl_map_files_syncd_sz,
+						t_msecs / 1000, (int) (t_msecs % 1000),
+						LSN_FORMAT_ARGS(CheckpointStats.repl_map_cutoff_lsn))));
+	}
 }
 
 /*
@@ -6809,8 +6846,15 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
 {
 	CheckPointRelationMap();
 	CheckPointReplicationSlots();
+
+	CheckpointStats.repl_snap_start_t = GetCurrentTimestamp();
 	CheckPointSnapBuild();
+	CheckpointStats.repl_snap_end_t = GetCurrentTimestamp();
+
+	CheckpointStats.repl_map_start_t = GetCurrentTimestamp();
 	CheckPointLogicalRewriteHeap();
+	CheckpointStats.repl_map_end_t = GetCurrentTimestamp();
+
 	CheckPointReplicationOrigin();
 
 	/* Write out all dirty data in SLRUs and the main buffer pool */
diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c
index 83fca8a77d..5e7d8d9dea 100644
--- a/src/backend/replication/logical/snapbuild.c
+++ b/src/backend/replication/logical/snapbuild.c
@@ -1941,6 +1941,8 @@ CheckPointSnapBuild(void)
 	if (redo < cutoff)
 		cutoff = redo;
 
+	CheckpointStats.repl_snap_cutoff_lsn = cutoff;
+
 	snap_dir = AllocateDir("pg_logical/snapshots");
 	while ((snap_de = ReadDir(snap_dir, "pg_logical/snapshots")) != NULL)
 	{
@@ -1997,6 +1999,9 @@ CheckPointSnapBuild(void)
 								path)));
 				continue;
 			}
+
+			CheckpointStats.repl_snap_files_rmvd_cnt++;
+			CheckpointStats.repl_snap_files_rmvd_sz += statbuf.st_size;
 		}
 	}
 	FreeDir(snap_dir);
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 09f6464331..40cda7f4bb 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -172,6 +172,22 @@ typedef struct CheckpointStatsData
 									 * times, which is not necessarily the
 									 * same as the total elapsed time for the
 									 * entire sync phase. */
+
+	/* Statistics of snapshot files under "pg_logical/snapshots" */
+	Size repl_snap_files_rmvd_sz;
+	uint64 repl_snap_files_rmvd_cnt;
+	XLogRecPtr repl_snap_cutoff_lsn;
+	TimestampTz	repl_snap_start_t;
+	TimestampTz	repl_snap_end_t;
+
+	/* Statistics of map files under "pg_logical/mappings" */
+	Size repl_map_files_syncd_sz;
+	uint64 repl_map_files_syncd_cnt;
+	Size repl_map_files_rmvd_sz;
+	uint64 repl_map_files_rmvd_cnt;
+	XLogRecPtr repl_map_cutoff_lsn;
+	TimestampTz	repl_map_start_t;
+	TimestampTz	repl_map_end_t;
 } CheckpointStatsData;
 
 extern CheckpointStatsData CheckpointStats;
-- 
2.25.1

Reply via email to