Re: [HACKERS] Idea for improving buildfarm robustness

Tom Lane Tue, 29 Sep 2015 14:34:01 -0700

Josh Berkus <[email protected]> writes:
> Give me source with the change, and I'll put it on a cheap, low-bandwidth
> AWS instance and hammer the heck out of it.  That should raise any false
> positives we can expect.


Here's a draft patch against HEAD (looks like it will work on 9.5 or
9.4 without modifications, too).

                        regards, tom lane

diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index baa43b2..52c9acd 100644
*** a/src/backend/postmaster/postmaster.c
--- b/src/backend/postmaster/postmaster.c
*************** static void CloseServerPorts(int status,
*** 373,378 ****
--- 373,379 ----
  static void unlink_external_pid_file(int status, Datum arg);
  static void getInstallationPaths(const char *argv0);
  static void checkDataDir(void);
+ static bool recheckDataDir(void);
  static Port *ConnCreate(int serverFd);
  static void ConnFree(Port *port);
  static void reset_shared(int port);
*************** checkDataDir(void)
*** 1490,1495 ****
--- 1491,1539 ----
  }
  
  /*
+  * Revalidate the data directory; return TRUE if OK, FALSE if not
+  *
+  * We don't try to check everything that checkDataDir() does.  Ideally, we'd
+  * return FALSE *only* if the data directory has been deleted.  As a proxy
+  * for that, we verify that global/pg_control can still be opened, which is
+  * also a condition checkDataDir() checked.  Because the postmaster will quit
+  * if we return FALSE, do not do so if there is any doubt or a possibly-
+  * transient failure.  Better to wait till we're sure.
+  *
+  * Unlike checkDataDir(), we assume we've chdir'd into the data directory.
+  */
+ static bool
+ recheckDataDir(void)
+ {
+ 	const char *path = "global/pg_control";	/* relative to cwd, see above */
+ 	FILE	   *fp;
+ 
+ 	fp = AllocateFile(path, PG_BINARY_R);
+ 	if (fp != NULL)
+ 	{
+ 		FreeFile(fp);			/* only openability matters; nothing is read */
+ 		return true;
+ 	}
+ 
+ 	/*
+ 	 * Many failures to open the file are transient or otherwise unrelated to
+ 	 * the directory being gone (for example EINTR or ENFILE), so they must
+ 	 * not make us fail.  Fail only on enumerated clearly-wrong conditions.
+ 	 */
+ 	switch (errno)
+ 	{
+ 		case ENOENT:			/* the file or a path component is missing */
+ 		case ENOTDIR:			/* a path component is not a directory */
+ 			elog(LOG, "could not open file \"%s\": %m", path);
+ 			return false;
+ 		default:				/* inconclusive: log it, but report "OK" */
+ 			elog(LOG, "could not open file \"%s\": %m; continuing anyway",
+ 				 path);
+ 			return true;
+ 	}
+ }
+ 
+ /*
   * Determine how long should we let ServerLoop sleep.
   *
   * In normal conditions we wait at most one minute, to ensure that the other
*************** ServerLoop(void)
*** 1602,1610 ****
  	fd_set		readmask;
  	int			nSockets;
  	time_t		now,
  				last_touch_time;
  
! 	last_touch_time = time(NULL);
  
  	nSockets = initMasks(&readmask);
  
--- 1646,1655 ----
  	fd_set		readmask;
  	int			nSockets;
  	time_t		now,
+ 				last_dir_recheck_time,
  				last_touch_time;
  
! 	last_dir_recheck_time = last_touch_time = time(NULL);
  
  	nSockets = initMasks(&readmask);
  
*************** ServerLoop(void)
*** 1754,1772 ****
  		if (StartWorkerNeeded || HaveCrashedWorker)
  			maybe_start_bgworker();
  
- 		/*
- 		 * Touch Unix socket and lock files every 58 minutes, to ensure that
- 		 * they are not removed by overzealous /tmp-cleaning tasks.  We assume
- 		 * no one runs cleaners with cutoff times of less than an hour ...
- 		 */
- 		now = time(NULL);
- 		if (now - last_touch_time >= 58 * SECS_PER_MINUTE)
- 		{
- 			TouchSocketFiles();
- 			TouchSocketLockFiles();
- 			last_touch_time = now;
- 		}
- 
  #ifdef HAVE_PTHREAD_IS_THREADED_NP
  
  		/*
--- 1799,1804 ----
*************** ServerLoop(void)
*** 1793,1798 ****
--- 1825,1868 ----
  			/* reset flag so we don't SIGKILL again */
  			AbortStartTime = 0;
  		}
+ 
+ 		/*
+ 		 * Lastly, check to see if it's time to do some things that we don't
+ 		 * want to do every single time through the loop, because they're a
+ 		 * bit expensive.  Note that there's up to a minute of slop in when
+ 		 * these tasks will be performed, since DetermineSleepTime() will let
+ 		 * us sleep at most that long.
+ 		 */
+ 		now = time(NULL);
+ 
+ 		/*
+ 		 * Once a minute, verify that $PGDATA hasn't been removed.  If it has,
+ 		 * we want to commit hara-kiri.  This avoids having postmasters and
+ 		 * child processes hanging around after their database is gone, and
+ 		 * maybe causing problems if a new database cluster is created in the
+ 		 * same place.
+ 		 */
+ 		if (now - last_dir_recheck_time >= 1 * SECS_PER_MINUTE)
+ 		{
+ 			if (!recheckDataDir())
+ 			{
+ 				elog(LOG, "shutting down because data directory is gone");
+ 				kill(MyProcPid, SIGQUIT);
+ 			}
+ 			last_dir_recheck_time = now;
+ 		}
+ 
+ 		/*
+ 		 * Touch Unix socket and lock files every 58 minutes, to ensure that
+ 		 * they are not removed by overzealous /tmp-cleaning tasks.  We assume
+ 		 * no one runs cleaners with cutoff times of less than an hour ...
+ 		 */
+ 		if (now - last_touch_time >= 58 * SECS_PER_MINUTE)
+ 		{
+ 			TouchSocketFiles();
+ 			TouchSocketLockFiles();
+ 			last_touch_time = now;
+ 		}
  	}
  }

-- 
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] Idea for improving buildfarm robustness

Reply via email to