Re: [HACKERS] Hot Standby and deadlock detection

Simon Riggs Mon, 01 Feb 2010 09:06:36 -0800

On Mon, 2010-02-01 at 17:50 +0200, Heikki Linnakangas wrote:

> Umm, so why not run the deadlock check immediately in
> max_standby_delay=-1 case as well? Why is that case handled differently
> from max_standby_delay>0 case?


Done, tested, working.

Will commit tomorrow if no further questions or comments.

-- 
 Simon Riggs           www.2ndQuadrant.com

*** a/src/backend/storage/ipc/procsignal.c
--- b/src/backend/storage/ipc/procsignal.c
***************
*** 272,277 **** procsignal_sigusr1_handler(SIGNAL_ARGS)
--- 272,280 ----
  	if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_SNAPSHOT))
  		RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_SNAPSHOT);
  
+ 	if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK))
+ 		RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
+ 
  	if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN))
  		RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
  
*** a/src/backend/storage/ipc/standby.c
--- b/src/backend/storage/ipc/standby.c
***************
*** 127,132 **** WaitExceedsMaxStandbyDelay(void)
--- 127,135 ----
  	long	delay_secs;
  	int		delay_usecs;
  
+ 	if (MaxStandbyDelay == -1)
+ 		return false;
+ 
  	/* Are we past max_standby_delay? */
  	TimestampDifference(GetLatestXLogTime(), GetCurrentTimestamp(),
  						&delay_secs, &delay_usecs);
***************
*** 351,365 **** ResolveRecoveryConflictWithLock(Oid dbOid, Oid relOid)
   * they hold one of the buffer pins that is blocking Startup process. If so,
   * backends will take an appropriate error action, ERROR or FATAL.
   *
!  * A secondary purpose of this is to avoid deadlocks that might occur between
!  * the Startup process and lock waiters. Deadlocks occur because if queries
   * wait on a lock, that must be behind an AccessExclusiveLock, which can only
!  * be clared if the Startup process replays a transaction completion record.
!  * If Startup process is waiting then that is a deadlock. If we allowed a
!  * setting of max_standby_delay that meant "wait forever" we would then need
!  * special code to protect against deadlock. Such deadlocks are rare, so the
!  * code would be almost certainly buggy, so we avoid both long waits and
!  * deadlocks using the same mechanism.
   */
  void
  ResolveRecoveryConflictWithBufferPin(void)
--- 354,368 ----
   * they hold one of the buffer pins that is blocking Startup process. If so,
   * backends will take an appropriate error action, ERROR or FATAL.
   *
!  * We also check for deadlocks before we wait, though applications that cause
!  * these will be extremely rare.  Deadlocks occur because if queries
   * wait on a lock, that must be behind an AccessExclusiveLock, which can only
!  * be cleared if the Startup process replays a transaction completion record.
!  * If Startup process is also waiting then that is a deadlock. The deadlock
!  * can occur if the query is waiting and then the Startup sleeps, or if
!  * Startup is sleeping and the the query waits on a lock. We protect against
!  * only the former sequence here, the latter sequence is checked prior to
!  * the query sleeping, in CheckRecoveryConflictDeadlock().
   */
  void
  ResolveRecoveryConflictWithBufferPin(void)
***************
*** 368,378 **** ResolveRecoveryConflictWithBufferPin(void)
  
  	Assert(InHotStandby);
  
- 	/*
- 	 * Signal immediately or set alarm for later.
- 	 */
  	if (MaxStandbyDelay == 0)
! 		SendRecoveryConflictWithBufferPin();
  	else
  	{
  		TimestampTz now;
--- 371,393 ----
  
  	Assert(InHotStandby);
  
  	if (MaxStandbyDelay == 0)
! 	{
! 		/*
! 		 * We don't want to wait, so just tell everybody holding the pin to 
! 		 * get out of town.
! 		 */
! 		SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
! 	}
! 	else if (MaxStandbyDelay == -1)
! 	{
! 		/*
! 		 * Send out a request to check for buffer pin deadlocks before we wait.
! 		 * This is fairly cheap, so no need to wait for deadlock timeout before
! 		 * trying to send it out.
! 		 */
! 		SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
! 	}
  	else
  	{
  		TimestampTz now;
***************
*** 386,392 **** ResolveRecoveryConflictWithBufferPin(void)
  							&standby_delay_secs, &standby_delay_usecs);
  
  		if (standby_delay_secs >= MaxStandbyDelay)
! 			SendRecoveryConflictWithBufferPin();
  		else
  		{
  			TimestampTz fin_time;			/* Expected wake-up time by timer */
--- 401,412 ----
  							&standby_delay_secs, &standby_delay_usecs);
  
  		if (standby_delay_secs >= MaxStandbyDelay)
! 		{
! 			/*
! 			 * We're already behind, so clear a path as quickly as possible.
! 			 */
! 			SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
! 		}
  		else
  		{
  			TimestampTz fin_time;			/* Expected wake-up time by timer */
***************
*** 394,399 **** ResolveRecoveryConflictWithBufferPin(void)
--- 414,426 ----
  			int		timer_delay_usecs = 0;
  
  			/*
+ 			 * Send out a request to check for buffer pin deadlocks before we wait.
+ 			 * This is fairly cheap, so no need to wait for deadlock timeout before
+ 			 * trying to send it out.
+ 			 */
+ 			SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
+ 
+ 			/*
  			 * How much longer we should wait?
  			 */
  			timer_delay_secs = MaxStandbyDelay - standby_delay_secs;
***************
*** 435,449 **** ResolveRecoveryConflictWithBufferPin(void)
  }
  
  void
! SendRecoveryConflictWithBufferPin(void)
  {
  	/*
  	 * We send signal to all backends to ask them if they are holding
  	 * the buffer pin which is delaying the Startup process. We must
  	 * not set the conflict flag yet, since most backends will be innocent.
  	 * Let the SIGUSR1 handling in each backend decide their own fate.
  	 */
! 	CancelDBBackends(InvalidOid, PROCSIG_RECOVERY_CONFLICT_BUFFERPIN, false);
  }
  
  /*
--- 462,479 ----
  }
  
  void
! SendRecoveryConflictWithBufferPin(ProcSignalReason reason)
  {
+ 	Assert(reason == PROCSIG_RECOVERY_CONFLICT_BUFFERPIN ||
+ 		   reason == PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
+ 
  	/*
  	 * We send signal to all backends to ask them if they are holding
  	 * the buffer pin which is delaying the Startup process. We must
  	 * not set the conflict flag yet, since most backends will be innocent.
  	 * Let the SIGUSR1 handling in each backend decide their own fate.
  	 */
! 	CancelDBBackends(InvalidOid, reason, false);
  }
  
  /*
*** a/src/backend/storage/lmgr/proc.c
--- b/src/backend/storage/lmgr/proc.c
***************
*** 45,50 ****
--- 45,51 ----
  #include "storage/pmsignal.h"
  #include "storage/proc.h"
  #include "storage/procarray.h"
+ #include "storage/procsignal.h"
  #include "storage/spin.h"
  
  
***************
*** 556,561 **** HaveNFreeProcs(int n)
--- 557,571 ----
  	return (n <= 0);
  }
  
+ bool
+ IsWaitingForLock(void)
+ {
+ 	if (lockAwaited == NULL)
+ 		return false;
+ 
+ 	return true;
+ }
+ 
  /*
   * Cancel any pending wait for lock, when aborting a transaction.
   *
***************
*** 1671,1677 **** CheckStandbyTimeout(void)
  	now = GetCurrentTimestamp();
  
  	if (now >= statement_fin_time)
! 		SendRecoveryConflictWithBufferPin();
  	else
  	{
  		/* Not time yet, so (re)schedule the interrupt */
--- 1681,1687 ----
  	now = GetCurrentTimestamp();
  
  	if (now >= statement_fin_time)
! 		SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
  	else
  	{
  		/* Not time yet, so (re)schedule the interrupt */
*** a/src/backend/tcop/postgres.c
--- b/src/backend/tcop/postgres.c
***************
*** 2278,2283 **** errdetail_recovery_conflict(void)
--- 2278,2286 ----
  		case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT:
  				errdetail("User query might have needed to see row versions that must be removed.");
  				break;
+ 		case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK:
+ 				errdetail("User transaction caused buffer deadlock with recovery.");
+ 				break;
  		case PROCSIG_RECOVERY_CONFLICT_DATABASE:
  				errdetail("User was connected to a database that must be dropped.");
  				break;
***************
*** 2754,2759 **** RecoveryConflictInterrupt(ProcSignalReason reason)
--- 2757,2771 ----
  		RecoveryConflictReason = reason;
  		switch (reason)
  		{
+ 			case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK:
+ 					/*
+ 					 * If we aren't waiting for a lock we can never deadlock.
+ 					 */
+ 					if (!IsWaitingForLock())
+ 						return;
+ 
+ 					/* Intentional drop through to check wait for pin */
+ 
  			case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN:
  					/*
  					 * If we aren't blocking the Startup process there is
***************
*** 2819,2824 **** RecoveryConflictInterrupt(ProcSignalReason reason)
--- 2831,2838 ----
  					elog(FATAL, "Unknown conflict mode");
  		}
  
+ 		Assert(RecoveryConflictPending && (QueryCancelPending || ProcDiePending));
+ 
  		/*
  		 * If it's safe to interrupt, and we're waiting for input or a lock,
  		 * service the interrupt immediately
*** a/src/backend/utils/misc/guc.c
--- b/src/backend/utils/misc/guc.c
***************
*** 1381,1387 **** static struct config_int ConfigureNamesInt[] =
  			NULL
  		},
  		&MaxStandbyDelay,
! 		30, 0, INT_MAX, NULL, NULL
  	},
  
  	{
--- 1381,1387 ----
  			NULL
  		},
  		&MaxStandbyDelay,
! 		30, -1, INT_MAX, NULL, NULL
  	},
  
  	{
*** a/src/include/storage/proc.h
--- b/src/include/storage/proc.h
***************
*** 189,194 **** extern void ProcQueueInit(PROC_QUEUE *queue);
--- 189,195 ----
  extern int	ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable);
  extern PGPROC *ProcWakeup(PGPROC *proc, int waitStatus);
  extern void ProcLockWakeup(LockMethod lockMethodTable, LOCK *lock);
+ extern bool IsWaitingForLock(void);
  extern void LockWaitCancel(void);
  
  extern void ProcWaitForSignal(void);
*** a/src/include/storage/procsignal.h
--- b/src/include/storage/procsignal.h
***************
*** 38,43 **** typedef enum
--- 38,44 ----
  	PROCSIG_RECOVERY_CONFLICT_LOCK,
  	PROCSIG_RECOVERY_CONFLICT_SNAPSHOT,
  	PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
+ 	PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK,
  
  	NUM_PROCSIGNALS				/* Must be last! */
  } ProcSignalReason;
*** a/src/include/storage/standby.h
--- b/src/include/storage/standby.h
***************
*** 16,21 ****
--- 16,22 ----
  
  #include "access/xlog.h"
  #include "storage/lock.h"
+ #include "storage/procsignal.h"
  #include "storage/relfilenode.h"
  
  extern int	vacuum_defer_cleanup_age;
***************
*** 30,36 **** extern void ResolveRecoveryConflictWithTablespace(Oid tsid);
  extern void ResolveRecoveryConflictWithDatabase(Oid dbid);
  
  extern void ResolveRecoveryConflictWithBufferPin(void);
! extern void SendRecoveryConflictWithBufferPin(void);
  extern void CheckRecoveryConflictDeadlock(LWLockId partitionLock);
  
  /*
--- 31,37 ----
  extern void ResolveRecoveryConflictWithDatabase(Oid dbid);
  
  extern void ResolveRecoveryConflictWithBufferPin(void);
! extern void SendRecoveryConflictWithBufferPin(ProcSignalReason reason);
  extern void CheckRecoveryConflictDeadlock(LWLockId partitionLock);
  
  /*

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] Hot Standby and deadlock detection

Reply via email to