As part of the dynamic background worker mechanism, I introduced a flag set_latch_on_sigusr1. When set, a process sets its own latch any time it receives SIGUSR1. The original motivation for this was that the postmaster sends SIGUSR1 to a process to indicate the death of a background worker, but cannot directly set the process latch since we want to minimize the postmaster's contact surface with the main shared memory segment. The reason I introduced a new flag instead of just *always* setting the latch was that I thought somebody might complain about the cost of setting the latch every time. But now I think that's exactly what we should do: forget the flag, always set the latch. If you get a SIGUSR1, somebody is trying to get your attention. At worst, setting the latch will cause whatever latch-wait loop got interrupted to re-execute without making forward progress. Most of the time, though, you'll do something like CHECK_FOR_INTERRUPTS() in that loop, and you'll probably find that you've got one.
The code we're talking about here is in procsignal_sigusr1_handler. That function can potentially call HandleCatchupInterrupt, HandleNotifyInterrupt, HandleParallelMessageInterrupt, or RecoveryConflictInterrupt. And all of those functions set the process latch. So the only thing we're gaining by having set_latch_on_sigusr1 is the possibility that we might avoid setting the latch in response to either (1) a postmaster signal related to a background worker that we happen not to care about at the moment or (2) a totally spurious SIGUSR1. But neither of those events should happen very often, so what's the point? The cost of setting the latch when we don't need to is typically small, but NOT setting it when we should have done can lead to an indefinite hang. As it happens, the TupleQueueFunnelNext function I recently committed has such a hazard, which I failed to spot during review and testing. If people don't like this, I can instead cause that function to set the flag. But every place that sets the flag has to use a PG_TRY()/PG_CATCH() block to make sure the old value of the flag gets restored. I'm pretty sure that's going to burn more cycles than the flag can ever hope to save, not to mention the risk of bugs due to people forgetting to add necessary volatile qualifiers. We've already got four PG_TRY() blocks in the code to cater to this stupid flag, and if we keep it around I'm sure we'll accumulate at least a few more. Patch attached. Objections? Suggestions? Comments? -- Robert Haas EnterpriseDB: http://www.enterprisedb.com The Enterprise PostgreSQL Company
diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index 68c9505..c38d486 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -954,45 +954,31 @@ WaitForBackgroundWorkerStartup(BackgroundWorkerHandle *handle, pid_t *pidp) { BgwHandleStatus status; int rc; - bool save_set_latch_on_sigusr1; - save_set_latch_on_sigusr1 = set_latch_on_sigusr1; - set_latch_on_sigusr1 = true; - - PG_TRY(); + for (;;) { - for (;;) - { - pid_t pid; + pid_t pid; - CHECK_FOR_INTERRUPTS(); + CHECK_FOR_INTERRUPTS(); - status = GetBackgroundWorkerPid(handle, &pid); - if (status == BGWH_STARTED) - *pidp = pid; - if (status != BGWH_NOT_YET_STARTED) - break; - - rc = WaitLatch(MyLatch, - WL_LATCH_SET | WL_POSTMASTER_DEATH, 0); + status = GetBackgroundWorkerPid(handle, &pid); + if (status == BGWH_STARTED) + *pidp = pid; + if (status != BGWH_NOT_YET_STARTED) + break; - if (rc & WL_POSTMASTER_DEATH) - { - status = BGWH_POSTMASTER_DIED; - break; - } + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_POSTMASTER_DEATH, 0); - ResetLatch(MyLatch); + if (rc & WL_POSTMASTER_DEATH) + { + status = BGWH_POSTMASTER_DIED; + break; } + + ResetLatch(MyLatch); } - PG_CATCH(); - { - set_latch_on_sigusr1 = save_set_latch_on_sigusr1; - PG_RE_THROW(); - } - PG_END_TRY(); - set_latch_on_sigusr1 = save_set_latch_on_sigusr1; return status; } @@ -1009,40 +995,26 @@ WaitForBackgroundWorkerShutdown(BackgroundWorkerHandle *handle) { BgwHandleStatus status; int rc; - bool save_set_latch_on_sigusr1; - - save_set_latch_on_sigusr1 = set_latch_on_sigusr1; - set_latch_on_sigusr1 = true; - PG_TRY(); + for (;;) { - for (;;) - { - pid_t pid; + pid_t pid; - CHECK_FOR_INTERRUPTS(); + CHECK_FOR_INTERRUPTS(); - status = GetBackgroundWorkerPid(handle, &pid); - if (status == BGWH_STOPPED) - return status; + status = GetBackgroundWorkerPid(handle, &pid); + if (status == BGWH_STOPPED) + return status; - rc = WaitLatch(&MyProc->procLatch, - WL_LATCH_SET | WL_POSTMASTER_DEATH, 0); 
+ rc = WaitLatch(&MyProc->procLatch, + WL_LATCH_SET | WL_POSTMASTER_DEATH, 0); - if (rc & WL_POSTMASTER_DEATH) - return BGWH_POSTMASTER_DIED; + if (rc & WL_POSTMASTER_DEATH) + return BGWH_POSTMASTER_DIED; - ResetLatch(&MyProc->procLatch); - } - } - PG_CATCH(); - { - set_latch_on_sigusr1 = save_set_latch_on_sigusr1; - PG_RE_THROW(); + ResetLatch(&MyProc->procLatch); } - PG_END_TRY(); - set_latch_on_sigusr1 = save_set_latch_on_sigusr1; return status; } diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index 0abde43..acfb4e9 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -59,14 +59,6 @@ typedef struct */ #define NumProcSignalSlots (MaxBackends + NUM_AUXPROCTYPES) -/* - * If this flag is set, the process latch will be set whenever SIGUSR1 - * is received. This is useful when waiting for a signal from the postmaster. - * Spurious wakeups must be expected. Make sure that the flag is cleared - * in the error path. 
- */ -bool set_latch_on_sigusr1; - static ProcSignalSlot *ProcSignalSlots = NULL; static volatile ProcSignalSlot *MyProcSignalSlot = NULL; @@ -296,8 +288,7 @@ procsignal_sigusr1_handler(SIGNAL_ARGS) if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN)) RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN); - if (set_latch_on_sigusr1) - SetLatch(MyLatch); + SetLatch(MyLatch); latch_sigusr1_handler(); diff --git a/src/backend/storage/ipc/shm_mq.c b/src/backend/storage/ipc/shm_mq.c index c78f165..80956ce 100644 --- a/src/backend/storage/ipc/shm_mq.c +++ b/src/backend/storage/ipc/shm_mq.c @@ -962,63 +962,49 @@ static bool shm_mq_wait_internal(volatile shm_mq *mq, PGPROC *volatile * ptr, BackgroundWorkerHandle *handle) { - bool save_set_latch_on_sigusr1; bool result = false; - save_set_latch_on_sigusr1 = set_latch_on_sigusr1; - if (handle != NULL) - set_latch_on_sigusr1 = true; - - PG_TRY(); + for (;;) { - for (;;) + BgwHandleStatus status; + pid_t pid; + bool detached; + + /* Acquire the lock just long enough to check the pointer. */ + SpinLockAcquire(&mq->mq_mutex); + detached = mq->mq_detached; + result = (*ptr != NULL); + SpinLockRelease(&mq->mq_mutex); + + /* Fail if detached; else succeed if initialized. */ + if (detached) + { + result = false; + break; + } + if (result) + break; + + if (handle != NULL) { - BgwHandleStatus status; - pid_t pid; - bool detached; - - /* Acquire the lock just long enough to check the pointer. */ - SpinLockAcquire(&mq->mq_mutex); - detached = mq->mq_detached; - result = (*ptr != NULL); - SpinLockRelease(&mq->mq_mutex); - - /* Fail if detached; else succeed if initialized. */ - if (detached) + /* Check for unexpected worker death. */ + status = GetBackgroundWorkerPid(handle, &pid); + if (status != BGWH_STARTED && status != BGWH_NOT_YET_STARTED) { result = false; break; } - if (result) - break; - - if (handle != NULL) - { - /* Check for unexpected worker death. 
*/ - status = GetBackgroundWorkerPid(handle, &pid); - if (status != BGWH_STARTED && status != BGWH_NOT_YET_STARTED) - { - result = false; - break; - } - } + } - /* Wait to be signalled. */ - WaitLatch(MyLatch, WL_LATCH_SET, 0); + /* Wait to be signalled. */ + WaitLatch(MyLatch, WL_LATCH_SET, 0); - /* An interrupt may have occurred while we were waiting. */ - CHECK_FOR_INTERRUPTS(); + /* An interrupt may have occurred while we were waiting. */ + CHECK_FOR_INTERRUPTS(); - /* Reset the latch so we don't spin. */ - ResetLatch(MyLatch); - } - } - PG_CATCH(); - { - set_latch_on_sigusr1 = save_set_latch_on_sigusr1; - PG_RE_THROW(); + /* Reset the latch so we don't spin. */ + ResetLatch(MyLatch); } - PG_END_TRY(); return result; } diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index aee13ae..d30fe35 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -2825,9 +2825,7 @@ RecoveryConflictInterrupt(ProcSignalReason reason) /* * Set the process latch. This function essentially emulates signal * handlers like die() and StatementCancelHandler() and it seems prudent - * to behave similarly as they do. Alternatively all plain backend code - * waiting on that latch, expecting to get interrupted by query cancels et - * al., would also need to set set_latch_on_sigusr1. + * to behave similarly as they do. 
*/ SetLatch(MyLatch); diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h index af1a0cd..50ffb13 100644 --- a/src/include/storage/procsignal.h +++ b/src/include/storage/procsignal.h @@ -55,6 +55,5 @@ extern int SendProcSignal(pid_t pid, ProcSignalReason reason, BackendId backendId); extern void procsignal_sigusr1_handler(SIGNAL_ARGS); -extern PGDLLIMPORT bool set_latch_on_sigusr1; #endif /* PROCSIGNAL_H */ diff --git a/src/test/modules/test_shm_mq/setup.c b/src/test/modules/test_shm_mq/setup.c index 7f2f5fd..dedd72f 100644 --- a/src/test/modules/test_shm_mq/setup.c +++ b/src/test/modules/test_shm_mq/setup.c @@ -255,51 +255,38 @@ static void wait_for_workers_to_become_ready(worker_state *wstate, volatile test_shm_mq_header *hdr) { - bool save_set_latch_on_sigusr1; bool result = false; - save_set_latch_on_sigusr1 = set_latch_on_sigusr1; - set_latch_on_sigusr1 = true; - - PG_TRY(); + for (;;) { - for (;;) + int workers_ready; + + /* If all the workers are ready, we have succeeded. */ + SpinLockAcquire(&hdr->mutex); + workers_ready = hdr->workers_ready; + SpinLockRelease(&hdr->mutex); + if (workers_ready >= wstate->nworkers) { - int workers_ready; - - /* If all the workers are ready, we have succeeded. */ - SpinLockAcquire(&hdr->mutex); - workers_ready = hdr->workers_ready; - SpinLockRelease(&hdr->mutex); - if (workers_ready >= wstate->nworkers) - { - result = true; - break; - } - - /* If any workers (or the postmaster) have died, we have failed. */ - if (!check_worker_status(wstate)) - { - result = false; - break; - } - - /* Wait to be signalled. */ - WaitLatch(MyLatch, WL_LATCH_SET, 0); - - /* An interrupt may have occurred while we were waiting. */ - CHECK_FOR_INTERRUPTS(); - - /* Reset the latch so we don't spin. */ - ResetLatch(MyLatch); + result = true; + break; } + + /* If any workers (or the postmaster) have died, we have failed. */ + if (!check_worker_status(wstate)) + { + result = false; + break; + } + + /* Wait to be signalled. 
*/ + WaitLatch(MyLatch, WL_LATCH_SET, 0); + + /* An interrupt may have occurred while we were waiting. */ + CHECK_FOR_INTERRUPTS(); + + /* Reset the latch so we don't spin. */ + ResetLatch(MyLatch); } - PG_CATCH(); - { - set_latch_on_sigusr1 = save_set_latch_on_sigusr1; - PG_RE_THROW(); - } - PG_END_TRY(); if (!result) ereport(ERROR,
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers