Hello hackers, As a recent failure produced by drongo [1] shows, a pg_ctl stop/start sequence may break on Windows due to the transient DELETE_PENDING state of postmaster.pid.
Please look at the excerpt from the failure log:
...
pg_createsubscriber: stopping the subscriber
2024-08-19 18:02:47.608 UTC [6988:4] LOG: received fast shutdown request
2024-08-19 18:02:47.608 UTC [6988:5] LOG: aborting any active transactions
2024-08-19 18:02:47.612 UTC [5884:2] FATAL: terminating walreceiver process due to administrator command
2024-08-19 18:02:47.705 UTC [7036:1] LOG: shutting down
pg_createsubscriber: server was stopped
### the server instance (1) has emitted only "shutting down" so far, but pg_ctl
### considered it stopped and returned 0 to pg_createsubscriber
[18:02:47.900](2.828s) ok 29 - run pg_createsubscriber without --databases
...
pg_createsubscriber: starting the standby with command-line options
pg_createsubscriber: pg_ctl command is: ...
2024-08-19 18:02:48.163 UTC [5284:1] FATAL: could not create lock file "postmaster.pid": File exists
pg_createsubscriber: server was started
pg_createsubscriber: checking settings on subscriber
### pg_createsubscriber attempts to start new server instance (2), but
### it fails due to "postmaster.pid" still found on disk
2024-08-19 18:02:48.484 UTC [6988:6] LOG: database system is shut down
### the server instance (1) is finally stopped and postmaster.pid unlinked

With extra debug logging and the ntries limit decreased to 10 (in CreateLockFile()), I reproduced the failure easily (when running 20 tests in parallel) and got additional information (see attached).

IIUC, the issue is caused by inconsistent checks for postmaster.pid existence:
"pg_ctl stop" ... -> get_pgpid() calls fopen(pid_file, "r"), which fails with ENOENT for the DELETE_PENDING state (see pgwin32_open_handle()).
"pg_ctl start" ... -> CreateLockFile() calls fd = open(filename, O_RDWR | O_CREAT | O_EXCL, pg_file_create_mode); which fails with EEXIST for the same state of postmaster.pid.

[1] https://buildfarm.postgresql.org/cgi-bin/show_log.pl?nm=drongo&dt=2024-08-19%2017%3A32%3A54

Best regards, Alexander
diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c
index 537d92c0cf..570d2d2557 100644
--- a/src/backend/utils/init/miscinit.c
+++ b/src/backend/utils/init/miscinit.c
@@ -1219,11 +1219,13 @@ CreateLockFile(const char *filename, bool amPostmaster,
/*
* Couldn't create the pid file. Probably it already exists.
*/
- if ((errno != EEXIST && errno != EACCES) || ntries > 100)
+ if ((errno != EEXIST && errno != EACCES) || ntries > 10)
+{
ereport(FATAL,
(errcode_for_file_access(),
- errmsg("could not create lock file \"%s\": %m",
- filename)));
+ errmsg("could not create lock file (ntries: %d) \"%s\": %m",
+ ntries, filename)));
+}
/*
* Read the file to get the old owner's PID. Note race condition
diff --git a/src/bin/pg_basebackup/pg_createsubscriber.c b/src/bin/pg_basebackup/pg_createsubscriber.c
index 6295783cde..583ed7d449 100644
--- a/src/bin/pg_basebackup/pg_createsubscriber.c
+++ b/src/bin/pg_basebackup/pg_createsubscriber.c
@@ -1479,7 +1479,7 @@ stop_standby_server(const char *datadir)
char *pg_ctl_cmd;
int rc;
- pg_ctl_cmd = psprintf("\"%s\" stop -D \"%s\" -s", pg_ctl_path,
+ pg_ctl_cmd = psprintf("\"%s\" stop -D \"%s\" ", pg_ctl_path,
datadir);
pg_log_debug("pg_ctl command is: %s", pg_ctl_cmd);
rc = system(pg_ctl_cmd);
diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c
index e7e878c22f..04787c6aec 100644
--- a/src/bin/pg_ctl/pg_ctl.c
+++ b/src/bin/pg_ctl/pg_ctl.c
@@ -276,6 +276,9 @@ get_pgpid(bool is_status_request)
pidf = fopen(pid_file, "r");
if (pidf == NULL)
{
+int en = errno;
+write_stderr("!!!get_pgpid| pid_file: %s, pidf: %p, errno: %d\n", pid_file, pidf, en);
+errno = en;
/* No pid file, not an error on startup */
if (errno == ENOENT)
return 0;
@@ -723,7 +726,10 @@ wait_for_postmaster_stop(void)
pid_t pid;
if ((pid = get_pgpid(false)) == 0)
+{
+write_stderr("!!!wait_for_postmaster_stop| pid: %d\n", pid);
return true; /* pid file is gone */
+}
if (kill(pid, 0) != 0)
{
diff --git a/src/port/open.c b/src/port/open.c
index 13e49af8d4..f7cbc819c0 100644
--- a/src/port/open.c
+++ b/src/port/open.c
@@ -138,8 +138,14 @@ pgwin32_open_handle(const char *fileName, int fileFlags, bool backup_semantics)
* invisible. With O_CREAT, we have no choice but to report that
* there's a file in the way (which wouldn't happen on Unix).
*/
+DWORD ntstat = pg_RtlGetLastNtStatus();
+if (strstr(fileName, "postmaster.pid") != NULL)
+{
+fprintf(stderr, "!!!pgwin32_open_handle| fileFlags: %X, err: %d, ntstatus: %X\n", fileFlags, err, ntstat);
+}
+
if (err == ERROR_ACCESS_DENIED &&
- pg_RtlGetLastNtStatus() == STATUS_DELETE_PENDING)
+ ntstat == STATUS_DELETE_PENDING)
{
if (fileFlags & O_CREAT)
err = ERROR_FILE_EXISTS;
@@ -214,6 +220,12 @@ pgwin32_fopen(const char *fileName, const char *mode)
openmode |= O_TEXT;
fd = pgwin32_open(fileName, openmode);
+if (strstr(fileName, "postmaster.pid") != NULL)
+{
+int en = errno;
+fprintf(stderr, "!!!pgwin32_fopen| fileName: %s, fd: %d, errno: %d\n", fileName, fd, en);
+errno = en;
+}
if (fd == -1)
return NULL;
return _fdopen(fd, mode);
regress_log_040_pg_createsubscriber.tar.bz2
Description: application/bzip
