[ https://issues.apache.org/jira/browse/HBASE-25905?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17451100#comment-17451100 ]
Duo Zhang commented on HBASE-25905: ----------------------------------- OK, good, the UT failed as expected, inside waitForSafePoint. Let me prepare the fix. https://ci-hadoop.apache.org/job/HBase/job/HBase-PreCommit-GitHub-PR/job/PR-3898/1/testReport/org.apache.hadoop.hbase.regionserver.wal/TestAsyncFSWALRollStuck/___/ {noformat} org.junit.runners.model.TestTimedOutException: test timed out after 780 seconds at java.base@11.0.10/jdk.internal.misc.Unsafe.park(Native Method) at java.base@11.0.10/java.util.concurrent.locks.LockSupport.park(LockSupport.java:194) at java.base@11.0.10/java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitUninterruptibly(AbstractQueuedSynchronizer.java:2018) at app//org.apache.hadoop.hbase.regionserver.wal.AsyncFSWAL.waitForSafePoint(AsyncFSWAL.java:715) at app//org.apache.hadoop.hbase.regionserver.wal.AsyncFSWAL.doReplaceWriter(AsyncFSWAL.java:745) at app//org.apache.hadoop.hbase.regionserver.wal.AsyncFSWAL.doReplaceWriter(AsyncFSWAL.java:128) at app//org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.lambda$replaceWriter$6(AbstractFSWAL.java:865) at app//org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL$$Lambda$169/0x0000000800344440.call(Unknown Source) at app//org.apache.hadoop.hbase.trace.TraceUtil.trace(TraceUtil.java:218) at app//org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.replaceWriter(AbstractFSWAL.java:864) at app//org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.rollWriterInternal(AbstractFSWAL.java:919) at app//org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.lambda$rollWriter$8(AbstractFSWAL.java:948) at app//org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL$$Lambda$162/0x0000000800347440.call(Unknown Source) at app//org.apache.hadoop.hbase.trace.TraceUtil.trace(TraceUtil.java:218) at app//org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.rollWriter(AbstractFSWAL.java:948) at 
app//org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.rollWriter(AbstractFSWAL.java:567) at app//org.apache.hadoop.hbase.regionserver.wal.TestAsyncFSWALRollStuck.testRoll(TestAsyncFSWALRollStuck.java:201) at java.base@11.0.10/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at java.base@11.0.10/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at java.base@11.0.10/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.base@11.0.10/java.lang.reflect.Method.invoke(Method.java:566) at app//org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:59) at app//org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12) at app//org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:56) at app//org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17) at app//org.junit.runners.ParentRunner$3.evaluate(ParentRunner.java:306) at app//org.junit.runners.BlockJUnit4ClassRunner$1.evaluate(BlockJUnit4ClassRunner.java:100) at app//org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:366) at app//org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:103) at app//org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:63) at app//org.junit.runners.ParentRunner$4.run(ParentRunner.java:331) at app//org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:79) at app//org.junit.runners.ParentRunner.runChildren(ParentRunner.java:329) at app//org.junit.runners.ParentRunner.access$100(ParentRunner.java:66) at app//org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:293) at app//org.junit.internal.runners.statements.RunBefores.evaluate(RunBefores.java:26) at app//org.junit.internal.runners.statements.RunAfters.evaluate(RunAfters.java:27) at app//org.apache.hadoop.hbase.SystemExitRule$1.evaluate(SystemExitRule.java:38) at 
app//org.junit.internal.runners.statements.FailOnTimeout$CallableStatement.call(FailOnTimeout.java:288) at app//org.junit.internal.runners.statements.FailOnTimeout$CallableStatement.call(FailOnTimeout.java:282) at java.base@11.0.10/java.util.concurrent.FutureTask.run(FutureTask.java:264) at java.base@11.0.10/java.lang.Thread.run(Thread.java:834) {noformat} > Shutdown of WAL stuck at waitForSafePoint > ----------------------------------------- > > Key: HBASE-25905 > URL: https://issues.apache.org/jira/browse/HBASE-25905 > Project: HBase > Issue Type: Bug > Components: regionserver, wal > Affects Versions: 3.0.0-alpha-1, 2.0.0 > Reporter: Xiaolin Ha > Assignee: Duo Zhang > Priority: Blocker > Fix For: 2.5.0, 3.0.0-alpha-2, 2.4.9 > > Attachments: rs-jstack1, rs-jstack2, wal-stuck-error-logs.png > > > We use the fan-out HDFS OutputStream and AsyncFSWAL on our clusters, but met > the problem that the RS can not exit completely for several hours until manual > intervention. > The two jstacks below show that the regionserver thread can wait > indefinitely in both > AsyncFSWAL#waitForSafePoint() > {code:java} > "regionserver/gh-data-hbase-finance08.mt/10.22.179.24:16020" #29 prio=5 > os_prio=0 tid=0x00007fb2feb5c000 nid=0xa92b waiting on condition > [0x00007f9ccb992000] > java.lang.Thread.State: WAITING (parking) > at sun.misc.Unsafe.park(Native Method) > - parking to wait for <0x00007faea229a9d0> (a > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) > at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitUninterruptibly(AbstractQueuedSynchronizer.java:1976) > at > org.apache.hadoop.hbase.regionserver.wal.AsyncFSWAL.waitForSafePoint(AsyncFSWAL.java:687) > at > org.apache.hadoop.hbase.regionserver.wal.AsyncFSWAL.doShutdown(AsyncFSWAL.java:743) > at > org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.shutdown(AbstractFSWAL.java:900) > at > 
org.apache.hadoop.hbase.wal.AbstractFSWALProvider.shutdown(AbstractFSWALProvider.java:182) > at > org.apache.hadoop.hbase.wal.RegionGroupingProvider.shutdown(RegionGroupingProvider.java:232) > at > org.apache.hadoop.hbase.wal.WALFactory.shutdown(WALFactory.java:271) > at > org.apache.hadoop.hbase.regionserver.HRegionServer.shutdownWAL(HRegionServer.java:1405) > at > org.apache.hadoop.hbase.regionserver.HRegionServer.run(HRegionServer.java:1147) > at java.lang.Thread.run(Thread.java:745) > {code} > and the log roller stuck at waiting for lock > {code:java} > "regionserver/gh-data-hbase-finance08.mt/10.22.179.24:16020.logRoller" #322 > daemon prio=5 os_prio=0 tid=0x00007fb2e11a4000 nid=0xa953 waiting on > condition [0x00007f9cbd9f1000] > java.lang.Thread.State: WAITING (parking) > at sun.misc.Unsafe.park(Native Method) > - parking to wait for <0x00007faea1217048> (a > java.util.concurrent.locks.ReentrantLock$FairSync) > at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:836) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireQueued(AbstractQueuedSynchronizer.java:870) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(AbstractQueuedSynchronizer.java:1199) > at > java.util.concurrent.locks.ReentrantLock$FairSync.lock(ReentrantLock.java:224) > at > java.util.concurrent.locks.ReentrantLock.lock(ReentrantLock.java:285) > at > org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.rollWriter(AbstractFSWAL.java:822) > at > org.apache.hadoop.hbase.wal.AbstractWALRoller$RollController.rollWal(AbstractWALRoller.java:269) > at > org.apache.hadoop.hbase.wal.AbstractWALRoller.run(AbstractWALRoller.java:186){code} > -- This message was sent by Atlassian Jira (v8.20.1#820001)