On Tue, Dec 04, 2018 at 08:40:40PM +0000, Bossart, Nathan wrote:
> Thanks for the updated patch!  The code looks good to me, the patch
> applies cleanly and builds without warnings, and it seems to work well
> in my manual tests.  I just have a few wording suggestions.

How are you testing this?  I just stop the server and manually touch
some fake status files in archive_status :)

> I would phrase this comment this way:
> 
>         Since archive_status files are not durably removed, a system
>         crash could leave behind .ready files for WAL segments that
>         have already been recycled or removed.  In this case, simply
>         remove the orphan status file and move on.

Fine for me.  Thanks!

> +     ereport(WARNING,
> +             (errmsg("removed orphan archive status file %s",
> +                     xlogready)));
> 
> I think we should put quotes around the file name like we do elsewhere
> in pgarch_ArchiverCopyLoop().

Done.

> +                                     ereport(WARNING,
> +                                                     (errmsg("failed removal 
> of \"%s\" too many times, will try again later",
> +                                                                     
> xlogready)));
> 
> I'd suggest mirroring the log statement for failed archiving commands
> and saying something like, "removing orphan archive status file \"%s\"
> failed too many times, will try again later."  IMO that makes it
> clearer what is failing and why we are removing it in the first place.

"removal of" is more consistent here I think, so changed this way with
your wording merged.
--
Michael
diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c
index 844b9d1b0e..90817c8598 100644
--- a/src/backend/postmaster/pgarch.c
+++ b/src/backend/postmaster/pgarch.c
@@ -28,6 +28,7 @@
 #include <fcntl.h>
 #include <signal.h>
 #include <time.h>
+#include <sys/stat.h>
 #include <sys/time.h>
 #include <sys/wait.h>
 #include <unistd.h>
@@ -60,6 +61,7 @@
 									 * failed archiver; in seconds. */
 
 #define NUM_ARCHIVE_RETRIES 3
+#define NUM_STATUS_CLEANUP_RETRIES 3
 
 
 /* ----------
@@ -424,9 +426,13 @@ pgarch_ArchiverCopyLoop(void)
 	while (pgarch_readyXlog(xlog))
 	{
 		int			failures = 0;
+		int			failures_unlink = 0;
 
 		for (;;)
 		{
+			struct stat	stat_buf;
+			char		pathname[MAXPGPATH];
+
 			/*
 			 * Do not initiate any more archive commands after receiving
 			 * SIGTERM, nor after the postmaster has died unexpectedly. The
@@ -456,6 +462,43 @@ pgarch_ArchiverCopyLoop(void)
 				return;
 			}
 
+			/*
+			 * Since archive status files are not durably removed, a system
+			 * crash could leave behind .ready files for WAL segments that
+			 * have already been recycled or removed.  In this case, simply
+			 * remove the orphan status file and move on.
+			 */
+			snprintf(pathname, MAXPGPATH, XLOGDIR "/%s", xlog);
+			if (stat(pathname, &stat_buf) != 0 && errno == ENOENT)
+			{
+				char		xlogready[MAXPGPATH];
+
+				StatusFilePath(xlogready, xlog, ".ready");
+				if (durable_unlink(xlogready, WARNING) == 0)
+				{
+					ereport(WARNING,
+							(errmsg("removed orphan archive status file \"%s\"",
+									xlogready)));
+
+					/* leave loop and move to the next status file */
+					break;
+				}
+
+				if (++failures_unlink >= NUM_STATUS_CLEANUP_RETRIES)
+				{
+					ereport(WARNING,
+							(errmsg("removal of orphan archive status file \"%s\" failed too many times, will try again later",
+									xlogready)));
+
+					/* give up cleanup of orphan status files */
+					return;
+				}
+
+				/* wait a bit before retrying */
+				pg_usleep(1000000L);
+				continue;
+			}
+
 			if (pgarch_archiveXlog(xlog))
 			{
 				/* successful */

Attachment: signature.asc
Description: PGP signature

Reply via email to