Hello,

we found this issue a year ago:
http://www.postgresql.org/message-id/cafj8prahvzupfbx+8ey-xhfwbo8bxvu_ynmbapsdj8w-ara...@mail.gmail.com

I tried to simulate this error, but without success, so I prepared a patch
that should help with identifying this issue. The important part is the
backport of the process startup from 9.2. After applying it we never
detected this issue again.

Regards

Pavel



2013/11/29 Tom Lane <t...@sss.pgh.pa.us>

> Daniel Wood <dw...@salesforce.com> writes:
> > ... Presuming your fix is putting PG_SETMASK(&UnBlockSig)
> > immediately before each of the 6 calls to ereport(ERROR,...) I've been
> > running the stress test with both this fix and the lock already held fix.
>
> I'm now planning to put it in error cleanup instead, but that's good
> enough for proving that the problem is what I thought it was.
>
> > I get plenty of lock timeout errors as expected.  However, once in a
> > great while I get:  sqlcode = -400, sqlstate = 57014, sqlerrmc = canceling
> > statement due to user request
> > My stress test certainly doesn't do a user cancel.  Should this be
> > expected?
>
> I think I see what must be happening there: the lock timeout interrupt is
> happening at some point after the lock has been granted, but before
> ProcSleep reaches its disable_timeouts call.  QueryCancelPending gets set,
> and will be honored next time something does CHECK_FOR_INTERRUPTS.
> But because ProcSleep told disable_timeouts to clear the LOCK_TIMEOUT
> indicator bit, ProcessInterrupts thinks the cancel must've been a plain
> user SIGINT, and reports it that way.
>
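
For context, this is roughly the reporting branch in ProcessInterrupts()
(src/backend/tcop/postgres.c) - a simplified sketch of the 9.3 code from
memory, not an exact quote. Once the LOCK_TIMEOUT indicator bit has been
cleared, only the generic "user request" branch is left:

	if (QueryCancelPending)
	{
		QueryCancelPending = false;

		/* a lock-timeout cancel is recognized only by its indicator bit */
		if (get_timeout_indicator(LOCK_TIMEOUT, true))
			ereport(ERROR,
					(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
					 errmsg("canceling statement due to lock timeout")));

		/*
		 * If ProcSleep's disable_timeouts() call already cleared the
		 * indicator, a timeout-initiated cancel falls through to here
		 * and is reported as a plain user cancel.
		 */
		ereport(ERROR,
				(errcode(ERRCODE_QUERY_CANCELED),
				 errmsg("canceling statement due to user request")));
	}
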
> What we should probably do about this is change ProcSleep to not clear the
> LOCK_TIMEOUT indicator bit, same as we already do in LockErrorCleanup,
> which is the less-race-condition-y path out of a lock timeout.
>
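
If I read this correctly, the ProcSleep() side would just mean flipping the
keep_indicator flag in its disable_timeouts() call, mirroring what
LockErrorCleanup already does - a sketch against 9.3, untested:

	/* in ProcSleep(), after the lock has been granted */
	if (LockTimeout > 0)
	{
		DisableTimeoutParams timeouts[2];

		timeouts[0].id = DEADLOCK_TIMEOUT;
		timeouts[0].keep_indicator = false;
		timeouts[1].id = LOCK_TIMEOUT;

		/*
		 * Keep the indicator bit, same as LockErrorCleanup, so that a
		 * cancel raised by a lock timeout firing after the lock was
		 * already granted is still reported as a lock timeout.
		 */
		timeouts[1].keep_indicator = true;
		disable_timeouts(timeouts, 2);
	}
	else
		disable_timeout(DEADLOCK_TIMEOUT, false);
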
> (It would be cooler if the timeout handler had a way to realize that the
> lock is already granted, and not issue a query cancel in the first place.
> But having a signal handler poking at shared memory state is a little too
> scary for my taste.)
>
> It strikes me that this also means that places where we throw away pending
> cancels by clearing QueryCancelPending, such as the sigsetjmp stanza in
> postgres.c, had better reset the LOCK_TIMEOUT indicator bit.  Otherwise,
> a thrown-away lock timeout cancel might cause a later SIGINT cancel to be
> misreported.
>
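
And for the sigsetjmp stanza in postgres.c, maybe something along these
lines (again just a sketch; disable_all_timeouts(false) also resets the
indicator bits):

	/*
	 * When we throw away a pending cancel, also reset the timeout
	 * indicator bits, so that a forgotten lock-timeout cancel cannot
	 * make a later SIGINT cancel be reported as a lock timeout.
	 */
	QueryCancelPending = false;
	disable_all_timeouts(false);	/* false = do not keep indicator bits */
	QueryCancelPending = false;		/* again, in case a timeout just fired */
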
>                         regards, tom lane
>
>
*** ./src/backend/storage/lmgr/proc.c.orig	2013-06-27 20:57:34.357713067 +0200
--- ./src/backend/storage/lmgr/proc.c	2013-06-27 21:19:04.917502553 +0200
***************
*** 179,185 ****
  InitProcGlobal(void)
  {
  	PGPROC	   *procs;
! 	int			i;
  	bool		found;
  
  	/* Create the ProcGlobal shared structure */
--- 179,186 ----
  InitProcGlobal(void)
  {
  	PGPROC	   *procs;
! 	int			i,
! 				 j;
  	bool		found;
  
  	/* Create the ProcGlobal shared structure */
***************
*** 223,228 ****
--- 224,233 ----
  		InitSharedLatch(&procs[i].procLatch);
  		procs[i].links.next = (SHM_QUEUE *) ProcGlobal->freeProcs;
  		ProcGlobal->freeProcs = &procs[i];
+ 
+ 		/* Initialize the myProcLocks[] shared memory queues. */
+ 		for (j = 0; j < NUM_LOCK_PARTITIONS; j++)
+ 			SHMQueueInit(&(procs[i].myProcLocks[j]));
  	}
  
  	/*
***************
*** 242,247 ****
--- 247,255 ----
  		InitSharedLatch(&procs[i].procLatch);
  		procs[i].links.next = (SHM_QUEUE *) ProcGlobal->autovacFreeProcs;
  		ProcGlobal->autovacFreeProcs = &procs[i];
+ 
+ 		for (j = 0; j < NUM_LOCK_PARTITIONS; j++)
+ 			SHMQueueInit(&(procs[i].myProcLocks[j]));
  	}
  
  	/*
***************
*** 253,258 ****
--- 261,269 ----
  		AuxiliaryProcs[i].pid = 0;		/* marks auxiliary proc as not in use */
  		PGSemaphoreCreate(&(AuxiliaryProcs[i].sem));
  		InitSharedLatch(&AuxiliaryProcs[i].procLatch);
+ 
+ 		for (j = 0; j < NUM_LOCK_PARTITIONS; j++)
+ 			SHMQueueInit(&(AuxiliaryProcs[i].myProcLocks[j]));
  	}
  
  	/* Create ProcStructLock spinlock, too */
***************
*** 268,274 ****
  {
  	/* use volatile pointer to prevent code rearrangement */
  	volatile PROC_HDR *procglobal = ProcGlobal;
- 	int			i;
  
  	/*
  	 * ProcGlobal should be set up already (if we are a backend, we inherit
--- 279,284 ----
***************
*** 358,365 ****
  	MyProc->lwWaitLink = NULL;
  	MyProc->waitLock = NULL;
  	MyProc->waitProcLock = NULL;
! 	for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
! 		SHMQueueInit(&(MyProc->myProcLocks[i]));
  	MyProc->recoveryConflictPending = false;
  
  	/* Initialize fields for sync rep */
--- 368,393 ----
  	MyProc->lwWaitLink = NULL;
  	MyProc->waitLock = NULL;
  	MyProc->waitProcLock = NULL;
! 
! 	/*
! 	 * Diagnostic patch
! 	 *
! 	 * We want to diagnose how often process startup finds orphaned
! 	 * PROCLOCKs.  Don't use ereport() here, because the process is not
! 	 * fully initialized at this point.
! 	 */
! 	{
! 		int		i;
! 
! 		for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
! 		{
! 			if (!SHMQueueEmpty(&(MyProc->myProcLocks[i])))
! 			{
! 				fprintf(stderr, "InitProcess: found orphaned PGPROCLOCK proc:%p, pid:%d\n", MyProc, MyProcPid);
! 			}
! 		}
! 	}
! 
  	MyProc->recoveryConflictPending = false;
  
  	/* Initialize fields for sync rep */
***************
*** 729,734 ****
--- 757,807 ----
  	SyncRepCleanupAtProcExit();
  
  	/*
+ 	 * Diagnostic patch
+ 	 *
+ 	 * We want to diagnose how often process exit (ProcKill) finds
+ 	 * orphaned PROCLOCKs.  Don't use ereport() here, because the
+ 	 * process is already on its way out at this point.
+ 	 */
+ 	{
+ 		int		i;
+ 		bool			found_orphaned_locks = false;
+ 
+ 		for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ 		{
+ 			if (!SHMQueueEmpty(&(MyProc->myProcLocks[i])))
+ 			{
+ 				fprintf(stderr, "ProcKill: found orphaned PGPROCLOCK proc:%p, pid:%d\n", MyProc, MyProcPid);
+ 				found_orphaned_locks = true;
+ 				break;
+ 			}
+ 		}
+ 
+ 		if (found_orphaned_locks)
+ 		{
+ 			LockReleaseAll(DEFAULT_LOCKMETHOD, true);
+ 			/* Release transaction-level advisory locks */
+ 			LockReleaseAll(USER_LOCKMETHOD, false);
+ 			found_orphaned_locks = false;
+ 
+ 			for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ 			{
+ 				if (!SHMQueueEmpty(&(MyProc->myProcLocks[i])))
+ 				{
+ 					fprintf(stderr, "ProcKill: found orphaned PGPROCLOCK proc:%p, pid:%d again\n", MyProc, MyProcPid);
+ 					found_orphaned_locks = true;
+ 					break;
+ 				}
+ 			}
+ 
+ 			if (!found_orphaned_locks)
+ 				fprintf(stderr, "ProcKill: orphaned PGPROCLOCK proc:%p, pid:%d was removed\n", MyProc, MyProcPid);
+ 			else
+ 				fprintf(stderr, "ProcKill: orphaned PGPROCLOCK proc:%p, pid:%d was not removed\n", MyProc, MyProcPid);
+ 		}
+ 	}
+ 
+ 	/*
  	 * Release any LW locks I am holding.  There really shouldn't be any, but
  	 * it's cheap to check again before we cut the knees off the LWLock
  	 * facility by releasing our PGPROC ...
*** ./src/include/miscadmin.h.orig	2013-06-27 21:15:39.477104870 +0200
--- ./src/include/miscadmin.h	2013-06-27 21:15:49.679124666 +0200
***************
*** 83,89 ****
--- 83,92 ----
  #define CHECK_FOR_INTERRUPTS() \
  do { \
  	if (InterruptPending) \
+ 	{ \
+ 		fprintf(stderr, "CHECK_FOR_INTERRUPTS: func:%s file:%s line:%d, pid:%d\n", __func__, __FILE__, __LINE__, MyProcPid); \
  		ProcessInterrupts(); \
+ 	} \
  } while(0)
  #else							/* WIN32 */
  