On Fri, Jan 21, 2022 at 11:49:56AM -0800, Andres Freund wrote:
> On 2022-01-20 20:41:16 +0000, Bossart, Nathan wrote:
>> Here's this part.
> 
> And pushed to all branches. Thanks.

Thanks!

I spent some time thinking about the right way to proceed here, and I came
up with the attached patches.  The first patch just adds error checking for
various lstat() calls in the replication code.  If lstat() fails, then it
probably doesn't make sense to try to continue processing the file.

The second patch changes some nearby calls to ereport() to ERROR.  If these
failures are truly unexpected, and we don't intend to support use-cases
like concurrent manual deletion, then failing might be the right way to go.
I think it's a shame that such failures could cause checkpointing to
continually fail, but that topic is already being discussed elsewhere [0].

[0] https://postgr.es/m/C1EE64B0-D4DB-40F3-98C8-0CED324D34CB%40amazon.com

-- 
Nathan Bossart
Amazon Web Services: https://aws.amazon.com/
>From df7a07d27c2858ec3ac383335af3e55e5428e3b1 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathandboss...@gmail.com>
Date: Wed, 26 Jan 2022 14:03:11 -0800
Subject: [PATCH v4 1/2] add error checking for calls to lstat() in replication
 code

---
 src/backend/access/heap/rewriteheap.c           | 6 +++++-
 src/backend/replication/logical/reorderbuffer.c | 6 +++++-
 src/backend/replication/logical/snapbuild.c     | 6 +++++-
 src/backend/replication/slot.c                  | 6 +++++-
 4 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 2a53826736..81319e0c78 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -1227,7 +1227,11 @@ CheckPointLogicalRewriteHeap(void)
 			continue;
 
 		snprintf(path, sizeof(path), "pg_logical/mappings/%s", mapping_de->d_name);
-		if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode))
+		if (lstat(path, &statbuf) != 0)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not stat file \"%s\": %m", path)));
+		else if (!S_ISREG(statbuf.st_mode))
 			continue;
 
 		/* Skip over files that cannot be ours. */
diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
index 19b2ba2000..d8d784a42f 100644
--- a/src/backend/replication/logical/reorderbuffer.c
+++ b/src/backend/replication/logical/reorderbuffer.c
@@ -4407,7 +4407,11 @@ ReorderBufferCleanupSerializedTXNs(const char *slotname)
 	sprintf(path, "pg_replslot/%s", slotname);
 
 	/* we're only handling directories here, skip if it's not ours */
-	if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
+	if (lstat(path, &statbuf) != 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not stat file \"%s\": %m", path)));
+	else if (!S_ISDIR(statbuf.st_mode))
 		return;
 
 	spill_dir = AllocateDir(path);
diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c
index 83fca8a77d..57f5a5e81f 100644
--- a/src/backend/replication/logical/snapbuild.c
+++ b/src/backend/replication/logical/snapbuild.c
@@ -1955,7 +1955,11 @@ CheckPointSnapBuild(void)
 
 		snprintf(path, sizeof(path), "pg_logical/snapshots/%s", snap_de->d_name);
 
-		if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode))
+		if (lstat(path, &statbuf) != 0)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not stat file \"%s\": %m", path)));
+		else if (!S_ISREG(statbuf.st_mode))
 		{
 			elog(DEBUG1, "only regular files expected: %s", path);
 			continue;
diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c
index e5e0cf8768..21fb7536e2 100644
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -1413,7 +1413,11 @@ StartupReplicationSlots(void)
 		snprintf(path, sizeof(path), "pg_replslot/%s", replication_de->d_name);
 
 		/* we're only creating directories here, skip if it's not our's */
-		if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
+		if (lstat(path, &statbuf) != 0)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not stat file \"%s\": %m", path)));
+		else if (!S_ISDIR(statbuf.st_mode))
 			continue;
 
 		/* we crashed while a slot was being setup or deleted, clean up */
-- 
2.25.1

>From 28a06a2cb84102a92da3fca0f0055dd67902d7de Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathandboss...@gmail.com>
Date: Wed, 26 Jan 2022 16:24:04 -0800
Subject: [PATCH v4 2/2] minor improvements to replication code

---
 src/backend/replication/logical/snapbuild.c | 19 ++-----------------
 src/backend/replication/slot.c              |  5 +----
 2 files changed, 3 insertions(+), 21 deletions(-)

diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c
index 57f5a5e81f..ce6cb85c1a 100644
--- a/src/backend/replication/logical/snapbuild.c
+++ b/src/backend/replication/logical/snapbuild.c
@@ -1970,16 +1970,10 @@ CheckPointSnapBuild(void)
 		 * everything but are postfixed by .$pid.tmp. We can just remove them
 		 * the same as other files because there can be none that are
 		 * currently being written that are older than cutoff.
-		 *
-		 * We just log a message if a file doesn't fit the pattern, it's
-		 * probably some editors lock/state file or similar...
 		 */
 		if (sscanf(snap_de->d_name, "%X-%X.snap", &hi, &lo) != 2)
-		{
-			ereport(LOG,
+			ereport(ERROR,
 					(errmsg("could not parse file name \"%s\"", path)));
-			continue;
-		}
 
 		lsn = ((uint64) hi) << 32 | lo;
 
@@ -1987,20 +1981,11 @@ CheckPointSnapBuild(void)
 		if (lsn < cutoff || cutoff == InvalidXLogRecPtr)
 		{
 			elog(DEBUG1, "removing snapbuild snapshot %s", path);
-
-			/*
-			 * It's not particularly harmful, though strange, if we can't
-			 * remove the file here. Don't prevent the checkpoint from
-			 * completing, that'd be a cure worse than the disease.
-			 */
 			if (unlink(path) < 0)
-			{
-				ereport(LOG,
+				ereport(ERROR,
 						(errcode_for_file_access(),
 						 errmsg("could not remove file \"%s\": %m",
 								path)));
-				continue;
-			}
 		}
 	}
 	FreeDir(snap_dir);
diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c
index 21fb7536e2..37ec3fa633 100644
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -1424,12 +1424,9 @@ StartupReplicationSlots(void)
 		if (pg_str_endswith(replication_de->d_name, ".tmp"))
 		{
 			if (!rmtree(path, true))
-			{
-				ereport(WARNING,
+				ereport(ERROR,
 						(errmsg("could not remove directory \"%s\"",
 								path)));
-				continue;
-			}
 			fsync_fname("pg_replslot", true);
 			continue;
 		}
-- 
2.25.1

Reply via email to