Alvaro Herrera <alvhe...@2ndquadrant.com> writes:
> Tom Lane wrote:
>> It also appears to me that do_start_bgworker's treatment of fork
>> failure is completely brain dead.  Did anyone really think about
>> that case?

> Hmm, I probably modelled it on autovacuum without giving that case much
> additional consideration.

Attached is a proposed patch that should make fork failure behave
sanely, ie it works much the same as a worker crash immediately after
launch.  I also refactored things a bit to make do_start_bgworker
fully responsible for updating the RegisteredBgWorker's state,
rather than doing just some of it as before.

I tested this by hot-wiring the fork_process call to fail some of
the time, which showed that the postmaster now seems to recover OK,
but parallel.c's logic is completely innocent of the idea that
worker-startup failure is possible.  The leader backend just freezes,
and nothing short of kill -9 on that backend will get you out of it.
Fixing that seems like material for a separate patch though.

                        regards, tom lane

diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index c382345..8461c24 100644
*** a/src/backend/postmaster/postmaster.c
--- b/src/backend/postmaster/postmaster.c
*************** static void TerminateChildren(int signal
*** 420,425 ****
--- 420,426 ----
  #define SignalChildren(sig)			   SignalSomeChildren(sig, BACKEND_TYPE_ALL)
  
  static int	CountChildren(int target);
+ static bool assign_backendlist_entry(RegisteredBgWorker *rw);
  static void maybe_start_bgworker(void);
  static bool CreateOptsFile(int argc, char *argv[], char *fullprogname);
  static pid_t StartChildProcess(AuxProcType type);
*************** bgworker_forkexec(int shmem_slot)
*** 5531,5543 ****
   * Start a new bgworker.
   * Starting time conditions must have been checked already.
   *
   * This code is heavily based on autovacuum.c, q.v.
   */
! static void
  do_start_bgworker(RegisteredBgWorker *rw)
  {
  	pid_t		worker_pid;
  
  	ereport(DEBUG1,
  			(errmsg("starting background worker process \"%s\"",
  					rw->rw_worker.bgw_name)));
--- 5532,5564 ----
   * Start a new bgworker.
   * Starting time conditions must have been checked already.
   *
+  * Returns true on success, false on failure.
+  * In either case, update the RegisteredBgWorker's state appropriately.
+  *
   * This code is heavily based on autovacuum.c, q.v.
   */
! static bool
  do_start_bgworker(RegisteredBgWorker *rw)
  {
  	pid_t		worker_pid;
  
+ 	Assert(rw->rw_pid == 0);
+ 
+ 	/*
+ 	 * Allocate and assign the Backend element.  Note we must do this before
+ 	 * forking, so that we can handle out of memory properly.
+ 	 *
+ 	 * Treat failure as though the worker had crashed.  That way, the
+ 	 * postmaster will wait a bit before attempting to start it again; if it
+ 	 * tried again right away, most likely it'd find itself repeating the
+ 	 * out-of-memory or fork failure condition.
+ 	 */
+ 	if (!assign_backendlist_entry(rw))
+ 	{
+ 		rw->rw_crashed_at = GetCurrentTimestamp();
+ 		return false;
+ 	}
+ 
  	ereport(DEBUG1,
  			(errmsg("starting background worker process \"%s\"",
  					rw->rw_worker.bgw_name)));
*************** do_start_bgworker(RegisteredBgWorker *rw
*** 5549,5557 ****
  #endif
  	{
  		case -1:
  			ereport(LOG,
  					(errmsg("could not fork worker process: %m")));
! 			return;
  
  #ifndef EXEC_BACKEND
  		case 0:
--- 5570,5586 ----
  #endif
  	{
  		case -1:
+ 			/* in postmaster, fork failed ... */
  			ereport(LOG,
  					(errmsg("could not fork worker process: %m")));
! 			/* undo what assign_backendlist_entry did */
! 			ReleasePostmasterChildSlot(rw->rw_child_slot);
! 			rw->rw_child_slot = 0;
! 			free(rw->rw_backend);
! 			rw->rw_backend = NULL;
! 			/* mark entry as crashed, so we'll try again later */
! 			rw->rw_crashed_at = GetCurrentTimestamp();
! 			break;
  
  #ifndef EXEC_BACKEND
  		case 0:
*************** do_start_bgworker(RegisteredBgWorker *rw
*** 5575,5588 ****
  			PostmasterContext = NULL;
  
  			StartBackgroundWorker();
  			break;
  #endif
  		default:
  			rw->rw_pid = worker_pid;
  			rw->rw_backend->pid = rw->rw_pid;
  			ReportBackgroundWorkerPID(rw);
! 			break;
  	}
  }
  
  /*
--- 5604,5627 ----
  			PostmasterContext = NULL;
  
  			StartBackgroundWorker();
+ 
+ 			exit(1);			/* should not get here */
  			break;
  #endif
  		default:
+ 			/* in postmaster, fork successful ... */
  			rw->rw_pid = worker_pid;
  			rw->rw_backend->pid = rw->rw_pid;
  			ReportBackgroundWorkerPID(rw);
! 			/* add new worker to lists of backends */
! 			dlist_push_head(&BackendList, &rw->rw_backend->elem);
! #ifdef EXEC_BACKEND
! 			ShmemBackendArrayAdd(rw->rw_backend);
! #endif
! 			return true;
  	}
+ 
+ 	return false;
  }
  
  /*
*************** bgworker_should_start_now(BgWorkerStartT
*** 5629,5634 ****
--- 5668,5675 ----
   * Allocate the Backend struct for a connected background worker, but don't
   * add it to the list of backends just yet.
   *
+  * On failure, return false without changing any worker state.
+  *
   * Some info from the Backend is copied into the passed rw.
   */
  static bool
*************** assign_backendlist_entry(RegisteredBgWor
*** 5647,5654 ****
  		ereport(LOG,
  				(errcode(ERRCODE_INTERNAL_ERROR),
  				 errmsg("could not generate random cancel key")));
- 
- 		rw->rw_crashed_at = GetCurrentTimestamp();
  		return false;
  	}
  
--- 5688,5693 ----
*************** assign_backendlist_entry(RegisteredBgWor
*** 5658,5671 ****
  		ereport(LOG,
  				(errcode(ERRCODE_OUT_OF_MEMORY),
  				 errmsg("out of memory")));
- 
- 		/*
- 		 * The worker didn't really crash, but setting this nonzero makes
- 		 * postmaster wait a bit before attempting to start it again; if it
- 		 * tried again right away, most likely it'd find itself under the same
- 		 * memory pressure.
- 		 */
- 		rw->rw_crashed_at = GetCurrentTimestamp();
  		return false;
  	}
  
--- 5697,5702 ----
*************** assign_backendlist_entry(RegisteredBgWor
*** 5687,5692 ****
--- 5718,5728 ----
   * As a side effect, the bgworker control variables are set or reset whenever
   * there are more workers to start after this one, and whenever the overall
   * system state requires it.
+  *
+  * The reason we start at most one worker per call is to avoid consuming the
+  * postmaster's attention for too long when many such requests are pending.
+  * As long as StartWorkerNeeded is true, ServerLoop will not block and will
+  * call this function again after dealing with any other issues.
   */
  static void
  maybe_start_bgworker(void)
*************** maybe_start_bgworker(void)
*** 5694,5706 ****
  	slist_mutable_iter iter;
  	TimestampTz now = 0;
  
  	if (FatalError)
  	{
  		StartWorkerNeeded = false;
  		HaveCrashedWorker = false;
! 		return;					/* not yet */
  	}
  
  	HaveCrashedWorker = false;
  
  	slist_foreach_modify(iter, &BackgroundWorkerList)
--- 5730,5748 ----
  	slist_mutable_iter iter;
  	TimestampTz now = 0;
  
+ 	/*
+ 	 * During crash recovery, we have no need to be called until the state
+ 	 * transition out of recovery.
+ 	 */
  	if (FatalError)
  	{
  		StartWorkerNeeded = false;
  		HaveCrashedWorker = false;
! 		return;
  	}
  
+ 	/* Don't need to be called again unless we find a reason for it below */
+ 	StartWorkerNeeded = false;
  	HaveCrashedWorker = false;
  
  	slist_foreach_modify(iter, &BackgroundWorkerList)
*************** maybe_start_bgworker(void)
*** 5709,5719 ****
  
  		rw = slist_container(RegisteredBgWorker, rw_lnode, iter.cur);
  
! 		/* already running? */
  		if (rw->rw_pid != 0)
  			continue;
  
! 		/* marked for death? */
  		if (rw->rw_terminate)
  		{
  			ForgetBackgroundWorker(&iter);
--- 5751,5761 ----
  
  		rw = slist_container(RegisteredBgWorker, rw_lnode, iter.cur);
  
! 		/* ignore if already running */
  		if (rw->rw_pid != 0)
  			continue;
  
! 		/* if marked for death, clean up and remove from list */
  		if (rw->rw_terminate)
  		{
  			ForgetBackgroundWorker(&iter);
*************** maybe_start_bgworker(void)
*** 5735,5746 ****
--- 5777,5790 ----
  				continue;
  			}
  
+ 			/* read system time only when needed */
  			if (now == 0)
  				now = GetCurrentTimestamp();
  
  			if (!TimestampDifferenceExceeds(rw->rw_crashed_at, now,
  									  rw->rw_worker.bgw_restart_time * 1000))
  			{
+ 				/* Set flag to remember that we have workers to start later */
  				HaveCrashedWorker = true;
  				continue;
  			}
*************** maybe_start_bgworker(void)
*** 5748,5782 ****
  
  		if (bgworker_should_start_now(rw->rw_worker.bgw_start_time))
  		{
! 			/* reset crash time before calling assign_backendlist_entry */
  			rw->rw_crashed_at = 0;
  
  			/*
! 			 * Allocate and assign the Backend element.  Note we must do this
! 			 * before forking, so that we can handle out of memory properly.
  			 */
! 			if (!assign_backendlist_entry(rw))
  				return;
! 
! 			do_start_bgworker(rw);		/* sets rw->rw_pid */
! 
! 			dlist_push_head(&BackendList, &rw->rw_backend->elem);
! #ifdef EXEC_BACKEND
! 			ShmemBackendArrayAdd(rw->rw_backend);
! #endif
  
  			/*
! 			 * Have ServerLoop call us again.  Note that there might not
! 			 * actually *be* another runnable worker, but we don't care all
! 			 * that much; we will find out the next time we run.
  			 */
  			StartWorkerNeeded = true;
  			return;
  		}
  	}
- 
- 	/* no runnable worker found */
- 	StartWorkerNeeded = false;
  }
  
  /*
--- 5792,5826 ----
  
  		if (bgworker_should_start_now(rw->rw_worker.bgw_start_time))
  		{
! 			/* reset crash time before trying to start worker */
  			rw->rw_crashed_at = 0;
  
  			/*
! 			 * Try to start the worker.
! 			 *
! 			 * On failure, give up processing workers for now, but set
! 			 * StartWorkerNeeded so we'll come back here on the next iteration
! 			 * of ServerLoop to try again.  (We don't want to wait, because
! 			 * there might be additional ready-to-run workers.)  We could set
! 			 * HaveCrashedWorker as well, since this worker is now marked
! 			 * crashed, but there's no need because the next run of this
! 			 * function will do that.
  			 */
! 			if (!do_start_bgworker(rw))
! 			{
! 				StartWorkerNeeded = true;
  				return;
! 			}
  
  			/*
! 			 * Quit, but have ServerLoop call us again to look for additional
! 			 * ready-to-run workers.  There might not be any, but we'll find
! 			 * out the next time we run.
  			 */
  			StartWorkerNeeded = true;
  			return;
  		}
  	}
  }
  
  /*
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to