On Sun, Aug 14, 2011 at 12:16:39AM -0400, Robert Haas wrote:
> On Fri, Aug 12, 2011 at 7:19 PM, daveg <[email protected]> wrote:
> > This seems to be bug month for my client. Now there are seeing periods
> > where all new connections fail immediately with the error:
> >
> > FATAL: lock AccessShareLock on object 0/1260/0 is already held
...
> > What can I do to help track this down?
>
> I've seen that error (though not that exact fact pattern) caused by
> bad RAM. It's unclear to me what else could cause it.
>
> In terms of debugging, it seems like it might be sensible to start by
> injecting some debugging code that dumps out the contents of the LOCK
> and LOCALLOCK structures at the point the error occurs.
I've made up the attached patch to print this, please suggest any additions.
I'll deploy this on a couple of the production hosts that have had the
issue this evening, but there is no telling when or if it will strike next.
-dg
--
David Gould [email protected] 510 536 1443 510 282 0869
If simplicity worked, the world would be overrun with insects.
*** postgresql-9.0.4/src/backend/storage/lmgr/lock.c.orig 2011-08-22
13:46:29.386428943 -0700
--- postgresql-9.0.4/src/backend/storage/lmgr/lock.c 2011-08-23
00:12:43.456422966 -0700
***************
*** 469,474 ****
--- 469,534 ----
return LockAcquireExtended(locktag, lockmode, sessionLock, dontWait,
true);
}
+ int DG_FORCE_TRAP = 0;
+
+ inline static void
+ DG_LOCK_PRINT(const char *where, const LOCK *lock, LOCKMODE type)
+ {
+ elog(LOG,
+ "%s: lock(%p) id(%u,%u,%u,%u,%u,%u) grantMask(%x) "
+ "req(%d,%d,%d,%d,%d,%d,%d)=%d "
+ "grant(%d,%d,%d,%d,%d,%d,%d)=%d wait(%d) type(%s)",
+ where, lock,
+ lock->tag.locktag_field1, lock->tag.locktag_field2,
+ lock->tag.locktag_field3, lock->tag.locktag_field4,
+ lock->tag.locktag_type, lock->tag.locktag_lockmethodid,
+ lock->grantMask,
+ lock->requested[1], lock->requested[2], lock->requested[3],
+ lock->requested[4], lock->requested[5], lock->requested[6],
+ lock->requested[7], lock->nRequested,
+ lock->granted[1], lock->granted[2], lock->granted[3],
+ lock->granted[4], lock->granted[5], lock->granted[6],
+ lock->granted[7], lock->nGranted,
+ lock->waitProcs.size,
+ LockMethods[LOCK_LOCKMETHOD(*lock)]->lockModeNames[type]);
+ }
+
+
+ inline static void
+ DG_PROCLOCK_PRINT(const char *where, const PROCLOCK *proclockP)
+ {
+ elog(LOG,
+ "%s: proclock(%p) lock(%p) method(%u) proc(%p) "
+ "hold(%x) release(%x) "
+ "links(p,n): lock=(%p,%p), proc=(%p,%p)",
+ where, proclockP, proclockP->tag.myLock,
+ PROCLOCK_LOCKMETHOD(*(proclockP)),
+ proclockP->tag.myProc,
+ (int) proclockP->holdMask, (int) proclockP->releaseMask,
+ proclockP->lockLink.prev, proclockP->lockLink.next,
+ proclockP->procLink.prev, proclockP->procLink.next
+ );
+ }
+
+ inline static void
+ DG_LOCALLOCK_PRINT(const char *where, const LOCALLOCK *localP)
+ {
+ elog(LOG,
+ "%s: locallock(%p) id(%u,%u,%u,%u,%u,%u mode %x) "
+ "lock(%p), proclock(%p) "
+ "hashcode %x nlocks %ld numLockOwners %d maxLockOwners %d ",
+ where, localP,
+ localP->tag.lock.locktag_field1,
localP->tag.lock.locktag_field2,
+ localP->tag.lock.locktag_field3,
localP->tag.lock.locktag_field4,
+ localP->tag.lock.locktag_type,
localP->tag.lock.locktag_lockmethodid,
+ localP->tag.mode,
+ localP->lock, localP->proclock,
+ localP->hashcode, localP->nLocks,
+ localP->numLockOwners, localP->maxLockOwners
+ /* localP->lockOwners[0].owner, localP->lockOwners.nlocks "%p
%d" */
+ );
+ }
+
/*
* LockAcquireExtended - allows us to specify additional options
*
***************
*** 500,505 ****
--- 560,568 ----
LWLockId partitionLock;
int status;
bool log_lock = false;
+ int DG_found_local = -1;
+ int DG_found_lock = -1;
+ int DG_found_proc = -1;
if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
elog(ERROR, "unrecognized lock method: %d", lockmethodid);
***************
*** 540,546 ****
locallock = (LOCALLOCK *) hash_search(LockMethodLocalHash,
(void *) &localtag,
HASH_ENTER, &found);
!
/*
* if it's a new locallock object, initialize it
*/
--- 603,609 ----
locallock = (LOCALLOCK *) hash_search(LockMethodLocalHash,
(void *) &localtag,
HASH_ENTER, &found);
! DG_found_local = found ? 1 : 0;
/*
* if it's a new locallock object, initialize it
*/
***************
*** 620,625 ****
--- 683,689 ----
hashcode,
HASH_ENTER_NULL,
&found);
+ DG_found_lock = found ? 1 : 0;
if (!lock)
{
LWLockRelease(partitionLock);
***************
*** 672,677 ****
--- 736,742 ----
proclock_hashcode,
HASH_ENTER_NULL,
&found);
+ DG_found_proc = found ? 1 : 0;
if (!proclock)
{
/* Ooops, not enough shmem for the proclock */
***************
*** 771,782 ****
* We shouldn't already hold the desired lock; else locallock table is
* broken.
*/
! if (proclock->holdMask & LOCKBIT_ON(lockmode))
elog(ERROR, "lock %s on object %u/%u/%u is already held",
lockMethodTable->lockModeNames[lockmode],
lock->tag.locktag_field1, lock->tag.locktag_field2,
lock->tag.locktag_field3);
!
/*
* If lock requested conflicts with locks requested by waiters, must
join
* wait queue. Otherwise, check for conflict with already-held locks.
--- 836,858 ----
* We shouldn't already hold the desired lock; else locallock table is
* broken.
*/
! if (DG_FORCE_TRAP || proclock->holdMask & LOCKBIT_ON(lockmode))
! {
! DG_FORCE_TRAP = 0;
! elog(LOG,
! "LockAcquire: already held: holdMask %x LOCKBIT %x "
! "found_local %d found_lock %d found_proc %d ",
! proclock->holdMask, LOCKBIT_ON(lockmode),
! DG_found_local, DG_found_lock, DG_found_proc);
! DG_LOCALLOCK_PRINT("LockAcquire LOCALLOCK", locallock);
! DG_LOCK_PRINT("LockAcquire LOCK", lock, lockmode);
! DG_PROCLOCK_PRINT("LockAcquire PROCLOCK", proclock);
!
elog(ERROR, "lock %s on object %u/%u/%u is already held",
lockMethodTable->lockModeNames[lockmode],
lock->tag.locktag_field1, lock->tag.locktag_field2,
lock->tag.locktag_field3);
! }
/*
* If lock requested conflicts with locks requested by waiters, must
join
* wait queue. Otherwise, check for conflict with already-held locks.
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers