Here are the actual stacks when the original test hangs (read this and ignore my 1st email)...
Found one Java-level deadlock: ============================= "pool-1-thread-2": waiting to lock monitor 0x00007fa5ae150b28 (object 0x0000000773645bd8, a java.util.HashMap), which is held by "pool-1-thread-1" "pool-1-thread-1": waiting for ownable synchronizer 0x0000000773696b48, (a java.util.concurrent.locks.ReentrantReadWriteLock$NonfairSync), which is held by "pool-1-thread-2" Java stack information for the threads listed above: =================================================== "pool-1-thread-2": at org.apache.geode.internal.cache.GemFireCacheImpl.removeRoot(GemFireCacheImpl.java:3577) - waiting to lock <0x0000000773645bd8> (a java.util.HashMap) at org.apache.geode.internal.cache.LocalRegion.basicDestroyRegion(LocalRegion.java:6333) at org.apache.geode.internal.cache.DistributedRegion.basicDestroyRegion(DistributedRegion.java:1755) at org.apache.geode.internal.cache.LocalRegion.basicDestroyRegion(LocalRegion.java:6255) at org.apache.geode.internal.cache.LocalRegion.localDestroyRegion(LocalRegion.java:2242) at org.apache.geode.internal.cache.AbstractRegion.localDestroyRegion(AbstractRegion.java:430) at org.apache.geode.management.internal.ManagementResourceRepo.destroyLocalMonitoringRegion(ManagementResourceRepo.java:73) at org.apache.geode.management.internal.LocalManager.cleanUpResources(LocalManager.java:260) at org.apache.geode.management.internal.LocalManager.stopManager(LocalManager.java:388) at org.apache.geode.management.internal.SystemManagementService.close(SystemManagementService.java:239) - locked <0x00000007736cef28> (a java.util.HashMap) at org.apache.geode.management.internal.beans.ManagementAdapter.handleCacheRemoval(ManagementAdapter.java:737) at org.apache.geode.management.internal.beans.ManagementListener.handleEvent(ManagementListener.java:119) at org.apache.geode.distributed.internal.InternalDistributedSystem.notifyResourceEventListeners(InternalDistributedSystem.java:2201) at 
org.apache.geode.distributed.internal.InternalDistributedSystem.handleResourceEvent(InternalDistributedSystem.java:606) at org.apache.geode.internal.cache.GemFireCacheImpl.close(GemFireCacheImpl.java:2127) - locked <0x00000006c01f69b0> (a java.lang.Class for org.apache.geode.internal.cache.GemFireCacheImpl) at org.apache.geode.internal.cache.GemFireCacheImpl.close(GemFireCacheImpl.java:1966) at org.apache.geode.internal.cache.GemFireCacheImpl.close(GemFireCacheImpl.java:1956) at org.apache.geode.internal.cache.persistence.CreateDestroyRegionRegressionTest.closeCache(CreateDestroyRegionRegressionTest.java:107) at org.apache.geode.internal.cache.persistence.CreateDestroyRegionRegressionTest.lambda$hang$1(CreateDestroyRegionRegressionTest.java:81) at org.apache.geode.internal.cache.persistence.CreateDestroyRegionRegressionTest$$Lambda$3/453211571.run(Unknown Source) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) "pool-1-thread-1": at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x0000000773696b48> (a java.util.concurrent.locks.ReentrantReadWriteLock$NonfairSync) at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175) at java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:836) at java.util.concurrent.locks.AbstractQueuedSynchronizer.doAcquireShared(AbstractQueuedSynchronizer.java:967) at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireShared(AbstractQueuedSynchronizer.java:1283) at java.util.concurrent.locks.ReentrantReadWriteLock$ReadLock.lock(ReentrantReadWriteLock.java:727) at org.apache.geode.management.internal.beans.ManagementListener.handleEvent(ManagementListener.java:110) at org.apache.geode.distributed.internal.InternalDistributedSystem.notifyResourceEventListeners(InternalDistributedSystem.java:2201) at 
org.apache.geode.distributed.internal.InternalDistributedSystem.handleResourceEvent(InternalDistributedSystem.java:606) at org.apache.geode.internal.cache.DiskStoreFactoryImpl.create(DiskStoreFactoryImpl.java:144) - locked <0x0000000773645a78> (a org.apache.geode.internal.cache.GemFireCacheImpl) at org.apache.geode.internal.cache.GemFireCacheImpl.getOrCreateDefaultDiskStore(GemFireCacheImpl.java:2566) - locked <0x0000000773645a78> (a org.apache.geode.internal.cache.GemFireCacheImpl) at org.apache.geode.internal.cache.LocalRegion.findDiskStore(LocalRegion.java:7600) at org.apache.geode.internal.cache.LocalRegion.<init>(LocalRegion.java:647) at org.apache.geode.internal.cache.GemFireCacheImpl.createVMRegion(GemFireCacheImpl.java:3023) - locked <0x0000000773645bd8> (a java.util.HashMap) at org.apache.geode.internal.cache.GemFireCacheImpl.basicCreateRegion(GemFireCacheImpl.java:2956) at org.apache.geode.internal.cache.GemFireCacheImpl.createRegion(GemFireCacheImpl.java:2944) at org.apache.geode.cache.RegionFactory.create(RegionFactory.java:755) at org.apache.geode.internal.cache.persistence.CreateDestroyRegionRegressionTest.createRegionWithDefaultDiskStore(CreateDestroyRegionRegressionTest.java:92) at org.apache.geode.internal.cache.persistence.CreateDestroyRegionRegressionTest.lambda$hang$0(CreateDestroyRegionRegressionTest.java:80) at org.apache.geode.internal.cache.persistence.CreateDestroyRegionRegressionTest$$Lambda$2/13648335.run(Unknown Source) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) On Fri, Jan 18, 2019 at 2:24 PM Kirk Lund <kl...@apache.org> wrote: > And my description doesn't match the threads again... Just ignore the > message. > > Look at the threads if you care. 
> > On Fri, Jan 18, 2019 at 2:18 PM Kirk Lund <kl...@apache.org> wrote: > >> Have there been any changes within the last year involving the following? >> Is anyone familiar with getOrCreateDefaultDiskStore and when it's invoked? >> >> When Region creation for a persistent Region does not specify a disk >> store, the code calls getOrCreateDefaultDiskStore. It then proceeds to >> create the default disk store. >> >> In December, this began to cause a deadlock ( >> https://issues.apache.org/jira/browse/GEODE-6255) in a persistent region >> test that has two threads: >> >> * Thread-1 is invoking Cache.close() >> * Thread-2 is invoking Region creation for a persistent Region that will >> use the default Disk Store which has not yet been created >> >> Thread-1 (close) is acquiring locks in this order: synchronization on >> GemFireCacheImpl.rootRegions, ManagementListener.writeLock >> >> Thread-2 (create region) is acquiring locks in this order: >> a) (for Region) synchronization on GemFireCacheImpl.rootRegions, >> ManagementListener.readLock >> b) (for Disk Store) ManagementListener.readLock, synchronization on >> GemFireCacheImpl.rootRegions >> >> Step (b) is what causes the deadlock and this only occurs if the default >> disk store needs to be created for the newly created region. >> >> ManagementListener is creating JMX MBeans for whatever component was >> just created. >> >> I filed a ticket for the deadlock: >> https://issues.apache.org/jira/browse/GEODE-6255 (not sure I used >> "thread-1" and "thread-2" consistently between this email and the ticket -- >> I may have flipped them around). >> >> I think creating the default disk store before creating the region might >> be the only easy way to fix the bug. My pair already tried changing >> ManagementListener to use a dedicated thread (or thread pool). 
We also >> tried removing the ReadWriteLock to see what it's actually protecting and >> the failures are more complicated than creating the default disk store >> before creating the region. >> >> "thread-1": >> at >> org.apache.geode.internal.cache.GemFireCacheImpl.removeRoot(GemFireCacheImpl.java:3577) >> - waiting to lock <0x0000000773583c28> (a java.util.HashMap) >> at >> org.apache.geode.internal.cache.LocalRegion.basicDestroyRegion(LocalRegion.java:6333) >> at >> org.apache.geode.internal.cache.DistributedRegion.basicDestroyRegion(DistributedRegion.java:1755) >> at >> org.apache.geode.internal.cache.LocalRegion.basicDestroyRegion(LocalRegion.java:6255) >> at >> org.apache.geode.internal.cache.LocalRegion.localDestroyRegion(LocalRegion.java:2242) >> at >> org.apache.geode.internal.cache.AbstractRegion.localDestroyRegion(AbstractRegion.java:430) >> at >> org.apache.geode.management.internal.ManagementResourceRepo.destroyLocalMonitoringRegion(ManagementResourceRepo.java:73) >> at >> org.apache.geode.management.internal.LocalManager.cleanUpResources(LocalManager.java:260) >> at >> org.apache.geode.management.internal.LocalManager.stopManager(LocalManager.java:388) >> at >> org.apache.geode.management.internal.SystemManagementService.close(SystemManagementService.java:239) >> - locked <0x000000077361b900> (a java.util.HashMap) >> at >> org.apache.geode.management.internal.beans.ManagementAdapter.handleCacheRemoval(ManagementAdapter.java:737) >> at >> org.apache.geode.management.internal.beans.ManagementListener.handleEvent(ManagementListener.java:119) >> at >> org.apache.geode.distributed.internal.InternalDistributedSystem.notifyResourceEventListeners(InternalDistributedSystem.java:2201) >> at >> org.apache.geode.distributed.internal.InternalDistributedSystem.handleResourceEvent(InternalDistributedSystem.java:606) >> at >> org.apache.geode.internal.cache.GemFireCacheImpl.close(GemFireCacheImpl.java:2127) >> - locked <0x00000006c010d508> (a java.lang.Class for >> 
org.apache.geode.internal.cache.GemFireCacheImpl) >> at >> org.apache.geode.internal.cache.GemFireCacheImpl.close(GemFireCacheImpl.java:1966) >> at >> org.apache.geode.internal.cache.GemFireCacheImpl.close(GemFireCacheImpl.java:1956) >> at >> org.apache.geode.internal.cache.persistence.CreateDestroyRegionRegressionTest.closeCache(CreateDestroyRegionRegressionTest.java:119) >> at >> org.apache.geode.internal.cache.persistence.CreateDestroyRegionRegressionTest.lambda$hang$1(CreateDestroyRegionRegressionTest.java:93) >> at >> org.apache.geode.internal.cache.persistence.CreateDestroyRegionRegressionTest$$Lambda$3/1456208737.run(Unknown >> Source) >> at >> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) >> at >> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) >> at java.lang.Thread.run(Thread.java:748) >> >> "thread-2": at sun.misc.Unsafe.park(Native Method) >> - parking to wait for <0x00000007735ff8e0> (a >> java.util.concurrent.locks.ReentrantReadWriteLock$NonfairSync) >> at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175) >> at >> java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:836) >> at >> java.util.concurrent.locks.AbstractQueuedSynchronizer.doAcquireShared(AbstractQueuedSynchronizer.java:967) >> at >> java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireShared(AbstractQueuedSynchronizer.java:1283) >> at >> java.util.concurrent.locks.ReentrantReadWriteLock$ReadLock.lock(ReentrantReadWriteLock.java:727) >> at >> org.apache.geode.management.internal.beans.ManagementListener.handleEvent(ManagementListener.java:110) >> at >> org.apache.geode.distributed.internal.InternalDistributedSystem.notifyResourceEventListeners(InternalDistributedSystem.java:2201) >> at >> org.apache.geode.distributed.internal.InternalDistributedSystem.handleResourceEvent(InternalDistributedSystem.java:606) >> at >> 
org.apache.geode.internal.cache.DiskStoreFactoryImpl.create(DiskStoreFactoryImpl.java:144) >> - locked <0x0000000773583ac8> (a >> org.apache.geode.internal.cache.GemFireCacheImpl) >> at >> org.apache.geode.internal.cache.GemFireCacheImpl.getOrCreateDefaultDiskStore(GemFireCacheImpl.java:2566) >> - locked <0x0000000773583ac8> (a >> org.apache.geode.internal.cache.GemFireCacheImpl) >> at >> org.apache.geode.internal.cache.LocalRegion.findDiskStore(LocalRegion.java:7600) >> at >> org.apache.geode.internal.cache.LocalRegion.<init>(LocalRegion.java:647) >> at >> org.apache.geode.internal.cache.GemFireCacheImpl.createVMRegion(GemFireCacheImpl.java:3023) >> - locked <0x0000000773583c28> (a java.util.HashMap) >> at >> org.apache.geode.internal.cache.GemFireCacheImpl.basicCreateRegion(GemFireCacheImpl.java:2956) >> at >> org.apache.geode.internal.cache.GemFireCacheImpl.createRegion(GemFireCacheImpl.java:2944) >> at org.apache.geode.cache.RegionFactory.create(RegionFactory.java:755) >> at >> org.apache.geode.internal.cache.persistence.CreateDestroyRegionRegressionTest.createRegionWithDefaultDiskStore(CreateDestroyRegionRegressionTest.java:105) >> at >> org.apache.geode.internal.cache.persistence.CreateDestroyRegionRegressionTest.lambda$hang$0(CreateDestroyRegionRegressionTest.java:92) >> at >> org.apache.geode.internal.cache.persistence.CreateDestroyRegionRegressionTest$$Lambda$2/901506536.run(Unknown >> Source) >> at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) >> at java.util.concurrent.FutureTask.run(FutureTask.java:266) >> at >> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) >> at >> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) >> at java.lang.Thread.run(Thread.java:748) >> >> Found 1 deadlock. >> >